feat: made tag extraction more robust

This commit is contained in:
Leyla Becker 2026-02-10 17:22:02 -06:00
parent 9517dc4d7a
commit 7b5f3adee9

View file

@ -6,22 +6,25 @@ const syntaxHighlight = require("@11ty/eleventy-plugin-syntaxhighlight");
const fs = require("fs");
const { DateTime } = require("luxon");
const tagPattern = /(?<=^|\s)#([a-zA-Z][a-zA-Z0-9_]*)(?![a-zA-Z0-9_-])/g;
// TODO: is there any reasonable way to make this use real markdown parsing because right now this is sketchy
/**
 * Collect the unique hashtags found in a markdown document.
 *
 * The document is tokenized with the supplied parser and the resulting token
 * tree is searched for tokens whose `type` is 'hashtag'. This replaces the
 * earlier regex scan over the raw text, so only what the parser itself emits
 * as a hashtag token is reported.
 *
 * NOTE(review): assumes a 'hashtag' token stores the tag name (without the
 * leading '#') in `token.content` — confirm against the inline rule that
 * produces these tokens.
 *
 * @param {string} content - Raw markdown source. Any falsy value yields [].
 * @param {{ parse: (src: string, env: object) => object[] }} mdInstance -
 *   Parser exposing `parse(src, env)` and returning an array of tokens, each
 *   with a `type`, a `content`, and optionally a `children` array.
 * @returns {string[]} Distinct hashtag token contents, in first-seen order.
 */
const extractTags = (content, mdInstance) => {
  if (!content) return [];

  // Depth-first walk: a token contributes its own content when it is a
  // hashtag, followed by whatever its nested children contribute.
  const collectHashtags = (tokens) =>
    tokens.flatMap((token) => [
      ...(token.type === 'hashtag' ? [token.content] : []),
      ...(token.children ? collectHashtags(token.children) : []),
    ]);

  const tokens = mdInstance.parse(content, {});

  // A Set drops repeats while keeping insertion order.
  return [...new Set(collectHashtags(tokens))];
};
const getPostTags = (post) => {
const getPostTags = (post, mdInstance) => {
const filePath = post.inputPath;
try {
const content = fs.readFileSync(filePath, 'utf-8');
const tags = extractTags(content);
const tags = extractTags(content, mdInstance);
return tags.map(tag => {
const normalizedTag = tag.toLowerCase();
return normalizedTag
@ -39,15 +42,21 @@ const isReleased = (post) => {
const markdownItHashtag = (md) => {
const hashtagRegex = /^#([a-zA-Z][a-zA-Z0-9_]*)(?![a-zA-Z0-9_-])/;
const HASH_CODE = '#'.charCodeAt(0);
const SPACE_CODE = ' '.charCodeAt(0);
const TAB_CODE = '\t'.charCodeAt(0);
const NEWLINE_CODE = '\n'.charCodeAt(0);
const CARRIAGE_RETURN_CODE = '\r'.charCodeAt(0);
md.inline.ruler.push('hashtag', function(state, silent) {
const pos = state.pos;
const ch = state.src.charCodeAt(pos);
if (ch !== '#') return false;
if (ch !== HASH_CODE) return false;
if (pos > 0) {
const prevCh = state.src.charCodeAt(pos - 1);
if (prevCh !== ' ' && prevCh !== '\t' && prevCh !== '\n' && prevCh !== '\r') {
if (prevCh !== SPACE_CODE && prevCh !== TAB_CODE && prevCh !== NEWLINE_CODE && prevCh !== CARRIAGE_RETURN_CODE) {
return false;
}
}
@ -99,11 +108,11 @@ md.use(markdownItMermaid);
module.exports = (eleventyConfig) => {
eleventyConfig.addPlugin(syntaxHighlight);
eleventyConfig.addFilter("extractTags", extractTags);
eleventyConfig.addFilter("extractTags", (content) => extractTags(content, md));
eleventyConfig.addFilter("extractTagsFromFile", (filePath) => {
try {
const content = fs.readFileSync(filePath, 'utf-8');
return extractTags(content);
return extractTags(content, md);
} catch (e) {
return [];
}
@ -183,7 +192,7 @@ module.exports = (eleventyConfig) => {
eleventyConfig.addCollection("contentTags", (collectionApi) => {
const posts = collectionApi.getFilteredByGlob("posts/**/*.md").filter(isReleased);
return [...new Set(posts.flatMap(getPostTags))].sort();
return [...new Set(posts.flatMap(post => getPostTags(post, md)))].sort();
});
eleventyConfig.addCollection("postsByTag", (collectionApi) => {
@ -191,7 +200,7 @@ module.exports = (eleventyConfig) => {
const tagMap = {};
posts.forEach(post => {
const tags = getPostTags(post)
const tags = getPostTags(post, md)
tags.forEach((tag) => {
tagMap[tag] = {
name: tag,