feat: made tag extraction more robust

2026-02-10 17:22:02 -06:00 · 2026-02-10 17:22:02 -06:00 · 7b5f3adee9
commit 7b5f3adee9
parent 9517dc4d7a
1 changed file with 25 additions and 16 deletions
--- a/eleventy.config.js
+++ b/eleventy.config.js
@@ -6,22 +6,25 @@ const syntaxHighlight = require("@11ty/eleventy-plugin-syntaxhighlight");
 const fs = require("fs");
 const { DateTime } = require("luxon");

-const tagPattern = /(?<=^|\s)#([a-zA-Z][a-zA-Z0-9_]*)(?![a-zA-Z0-9_-])/g;
-
-// TODO: is there any reasonable way to make this use real markdown parsing because right now this is sketchy
-const extractTags = (content) => {
+const extractTags = (content, mdInstance) => {
  if (!content) return [];
-  const matches = content.match(tagPattern);
-  if (!matches) return [];
-  const tags = [...new Set(matches.map(m => m.slice(1)))];
-  return tags;
+  
+  const collectHashtags = (tokens) =>
+    tokens.flatMap(token => [
+      ...(token.type === 'hashtag' ? [token.content] : []),
+      ...(token.children ? collectHashtags(token.children) : [])
+    ]);
+  
+  const tokens = mdInstance.parse(content, {});
+  const tags = collectHashtags(tokens);
+  return [...new Set(tags)];
 }

-const getPostTags = (post) => {
+const getPostTags = (post, mdInstance) => {
  const filePath = post.inputPath;
  try {
    const content = fs.readFileSync(filePath, 'utf-8');
-    const tags = extractTags(content);
+    const tags = extractTags(content, mdInstance);
    return tags.map(tag => {
      const normalizedTag = tag.toLowerCase();
      return normalizedTag
@@ -39,15 +42,21 @@ const isReleased = (post) => {
 const markdownItHashtag = (md) => {
  const hashtagRegex = /^#([a-zA-Z][a-zA-Z0-9_]*)(?![a-zA-Z0-9_-])/;
  
+  const HASH_CODE = '#'.charCodeAt(0);
+  const SPACE_CODE = ' '.charCodeAt(0);
+  const TAB_CODE = '\t'.charCodeAt(0);
+  const NEWLINE_CODE = '\n'.charCodeAt(0);
+  const CARRIAGE_RETURN_CODE = '\r'.charCodeAt(0);
+
  md.inline.ruler.push('hashtag', function(state, silent) {
    const pos = state.pos;
    const ch = state.src.charCodeAt(pos);

-    if (ch !== '#') return false;
+    if (ch !== HASH_CODE) return false;

    if (pos > 0) {
      const prevCh = state.src.charCodeAt(pos - 1);
-      if (prevCh !== ' ' && prevCh !== '\t' && prevCh !== '\n' && prevCh !== '\r') {
+      if (prevCh !== SPACE_CODE && prevCh !== TAB_CODE && prevCh !== NEWLINE_CODE && prevCh !== CARRIAGE_RETURN_CODE) {
        return false;
      }
    }
@@ -99,11 +108,11 @@ md.use(markdownItMermaid);
 module.exports = (eleventyConfig) => {
  eleventyConfig.addPlugin(syntaxHighlight);

-  eleventyConfig.addFilter("extractTags", extractTags);
+  eleventyConfig.addFilter("extractTags", (content) => extractTags(content, md));
  eleventyConfig.addFilter("extractTagsFromFile", (filePath) => {
    try {
      const content = fs.readFileSync(filePath, 'utf-8');
-      return extractTags(content);
+      return extractTags(content, md);
    } catch (e) {
      return [];
    }
@@ -183,7 +192,7 @@ module.exports = (eleventyConfig) => {
  eleventyConfig.addCollection("contentTags", (collectionApi) => {
    const posts = collectionApi.getFilteredByGlob("posts/**/*.md").filter(isReleased);
    
-    return [...new Set(posts.flatMap(getPostTags))].sort();
+    return [...new Set(posts.flatMap(post => getPostTags(post, md)))].sort();
  });

  eleventyConfig.addCollection("postsByTag", (collectionApi) => {
@@ -191,7 +200,7 @@ module.exports = (eleventyConfig) => {
    const tagMap = {};
    
    posts.forEach(post => {
-      const tags = getPostTags(post)
+      const tags = getPostTags(post, md)
      tags.forEach((tag) => {
        tagMap[tag] = {
          name: tag,