// volpe/lib/measurements/matcher.js
// (file-viewer metadata: 605 lines, no EOL, 18 KiB, JavaScript)

/**
* Measurement matcher for recipe markdown text.
*
* Finds and parses measurement strings including weights, volumes,
* temperatures, dimensions, and times. Handles tricky patterns like:
* - Dual units: "200°C (400°F)", "227g (8 oz)"
* - Ranges: "28 to 32 minutes", "180-240g"
* - Fractions: "1/2 cup", "1 1/2 teaspoon"
* - Dimensions: "9x13", "8 x 8 inch", "8x6x2 inch"
* - Approximate: "~250g", "(~220 °C)"
* - Bare temp units: "170C", "100-110C"
*/
// ─── Amount Building Blocks ───────────────────────────────
// Regex *source fragments* (strings, not RegExp objects) that the matchers
// below splice into larger patterns. None of them contains a capturing group.
const NUM = String.raw`\d+(?:\.\d+)?`; // integer or decimal: "200", "1.5"
const FRAC = String.raw`\d+\s*/\s*\d+`; // ASCII fraction: "1/2", "3 / 4"
const MIXED = String.raw`\d+\s+\d+\s*/\s*\d+`; // mixed number: "1 1/2"
const UNICODE_FRAC = '[\u00BC-\u00BE\u2150-\u215E]'; // single fraction glyph: "½", "⅓"
const MIXED_UNI = String.raw`\d+\s*` + UNICODE_FRAC; // "1½", "1 ½"
const APPROX_PREFIX = String.raw`~\s*`; // leading "~" marks an approximate amount
// Alternation order matters: MIXED is tried before FRAC, and both before NUM,
// so the regex engine prefers the longest reading of "1 1/2".
const SINGLE_AMOUNT =
  `(?:${APPROX_PREFIX})?(?:${[MIXED, FRAC, MIXED_UNI, UNICODE_FRAC, NUM].join('|')})`;
const RANGE_SEP = String.raw`\s*(?:-|to)\s*`; // "180-240", "28 to 32"
// A range is tried first; a lone amount is the fallback.
const AMOUNT =
  `(?:${[SINGLE_AMOUNT + RANGE_SEP + SINGLE_AMOUNT, SINGLE_AMOUNT].join('|')})`;
// ─── Unit Patterns ────────────────────────────────────────
// Regex source fragments for unit tokens. Each *_UNIT string is a single
// non-capturing group so callers can wrap it in their own capture.
const TEMP_UNIT_DEG = String.raw`°\s*[FfCc]`; // "°F", "° c"
const TEMP_UNIT_BARE = '[FfCc]'; // "170C" — no degree sign
const WEIGHT_UNIT = `(?:${['kg', 'g', 'oz', 'lbs?', 'ounces?', 'pounds?'].join('|')})`;
const VOLUME_UNIT = `(?:${[
  'cups?',
  'tablespoons?',
  String.raw`table\s+spoons?`,
  'tbsp',
  'teaspoons?',
  'tsp',
  'ml',
  'mL',
  'L',
  'liters?',
  'litres?',
  'quarts?',
  'gallons?',
  'pints?',
  String.raw`fl\.?\s*oz`,
  String.raw`fluid\s+ounces?`,
  String.raw`parts\s+by\s+(?:volume|weight)`,
].join('|')})`;
const TIME_UNIT = `(?:${[
  'minutes?',
  'mins?',
  'hours?',
  'hrs?',
  'days?',
  'weeks?',
  'months?',
  'seconds?',
  'secs?',
].join('|')})`;
const DIM_UNIT = `(?:${['inch(?:es)?', String.raw`in\.?`, 'cm', 'mm'].join('|')})`;
// ─── Amount Parsing ───────────────────────────────────────
/**
 * Numeric value of each single-character Unicode fraction in the ranges
 * U+00BC–U+00BE and U+2150–U+215E (the same ranges the regexes in
 * parseSingleAmount accept).
 */
const UNICODE_FRAC_MAP = Object.freeze({
  '\u00BC': 0.25, // ¼
  '\u00BD': 0.5, // ½
  '\u00BE': 0.75, // ¾
  '\u2150': 1 / 7, // ⅐
  '\u2151': 1 / 9, // ⅑
  '\u2152': 1 / 10, // ⅒
  '\u2153': 1 / 3, // ⅓
  '\u2154': 2 / 3, // ⅔
  '\u2155': 0.2, // ⅕
  '\u2156': 0.4, // ⅖
  '\u2157': 0.6, // ⅗
  '\u2158': 0.8, // ⅘
  '\u2159': 1 / 6, // ⅙
  '\u215A': 5 / 6, // ⅚
  '\u215B': 0.125, // ⅛
  '\u215C': 0.375, // ⅜
  '\u215D': 0.625, // ⅝
  '\u215E': 0.875, // ⅞
});

/**
 * Parse a single numeric amount string (not a range).
 * Handles integers, decimals, ASCII fractions, mixed numbers, Unicode
 * fraction characters (alone or after a whole number) and the "~"
 * approximate marker.
 *
 * Anything that is not one of the fraction forms is handed to parseFloat, so
 * unparseable input yields `value: NaN` rather than throwing.
 *
 * @param {string} str — e.g. "200", "1.5", "1/2", "1 1/2", "1½", "~250"
 * @returns {{ value: number, approximate: boolean }}
 */
function parseSingleAmount(str) {
  let text = str.trim();

  const approximate = text.startsWith('~');
  if (approximate) {
    text = text.replace(/^~\s*/, '');
  }

  // Unicode fraction with an optional whole part: "½", "1½", "1 ½"
  const unicode = /^(?:(\d+)\s*)?([\u00BC-\u00BE\u2150-\u215E])$/.exec(text);
  if (unicode) {
    const whole = unicode[1] ? parseInt(unicode[1], 10) : 0;
    return { value: whole + (UNICODE_FRAC_MAP[unicode[2]] || 0), approximate };
  }

  // ASCII fraction with an optional whole part: "1/2", "3 / 4", "1 1/2".
  // The whole part must be separated by whitespace, otherwise "11/2" would be
  // ambiguous.
  const ascii = /^(?:(\d+)\s+)?(\d+)\s*\/\s*(\d+)$/.exec(text);
  if (ascii) {
    const whole = ascii[1] ? parseInt(ascii[1], 10) : 0;
    return {
      value: whole + parseInt(ascii[2], 10) / parseInt(ascii[3], 10),
      approximate,
    };
  }

  // Plain integer or decimal
  return { value: parseFloat(text), approximate };
}
/**
 * Parse an amount string that may be a single value or a range.
 *
 * A range is two single amounts joined by RANGE_SEP — a dash or the word
 * "to", with optional whitespace on either side — matched case-insensitively.
 * This is deliberately the same separator the AMOUNT pattern uses, so every
 * range the matchers (which run with the `i` flag) capture is also parsed as
 * a range here: "180-240", "180 - 240", "28 to 32", "28 TO 32".
 *
 * SINGLE_AMOUNT cannot contain "-" or "to", so a fraction or mixed number
 * such as "1 1/2" is never mistaken for a range.
 *
 * @param {string} str — e.g. "200", "180-240", "28 to 32", "1 1/2"
 * @returns {
 *   { value: number, approximate: boolean } |
 *   { min: { value: number, approximate: boolean }, max: { value: number, approximate: boolean } }
 * }
 */
function parseAmount(str) {
  const trimmed = str.trim();

  const range = new RegExp(
    `^(${SINGLE_AMOUNT})${RANGE_SEP}(${SINGLE_AMOUNT})$`,
    'i'
  ).exec(trimmed);

  if (range) {
    return {
      min: parseSingleAmount(range[1]),
      max: parseSingleAmount(range[2]),
    };
  }

  return parseSingleAmount(trimmed);
}
// ─── Unit Normalization ───────────────────────────────────
/**
 * Canonicalise a raw unit token so that spelling variants compare equal.
 *
 * The token is trimmed, lower-cased, has every whitespace run collapsed to a
 * single space and one trailing "." removed. Known aliases are then mapped to
 * one canonical spelling; an unrecognised token is returned in its cleaned-up
 * form.
 *
 * @param {string|null|undefined} unit — raw unit text, e.g. "Lbs", "fl. oz", "° c"
 * @returns {string|null} canonical unit, or null when `unit` is falsy
 */
function normalizeUnit(unit) {
  if (!unit) return null;

  const key = unit
    .trim()
    .toLowerCase()
    .replace(/\s+/g, ' ')
    .replace(/\.$/, '');

  switch (key) {
    // Temperature
    case '°f':
    case '° f':
    case 'f':
      return '°F';
    case '°c':
    case '° c':
    case 'c':
      return '°C';

    // Weight
    case 'g':
      return 'g';
    case 'kg':
      return 'kg';
    case 'oz':
    case 'ounce':
    case 'ounces':
      return 'oz';
    case 'lb':
    case 'lbs':
    case 'pound':
    case 'pounds':
      return 'lb';

    // Volume
    case 'cup':
    case 'cups':
      return 'cup';
    case 'tablespoon':
    case 'tablespoons':
    case 'table spoon':
    case 'table spoons':
    case 'tbsp':
      return 'tablespoon';
    case 'teaspoon':
    case 'teaspoons':
    case 'tea spoon':
    case 'tea spoons':
    case 'tsp':
      return 'teaspoon';
    case 'ml':
      return 'ml';
    case 'l':
    case 'liter':
    case 'liters':
    case 'litre':
    case 'litres':
      return 'L';
    case 'quart':
    case 'quarts':
      return 'quart';
    case 'gallon':
    case 'gallons':
      return 'gallon';
    case 'pint':
    case 'pints':
      return 'pint';
    case 'floz':
    case 'fl oz':
    case 'fl.oz':
    case 'fl. oz':
    case 'fluid ounce':
    case 'fluid ounces':
      return 'fl oz';
    case 'parts by volume':
      return 'parts by volume';
    case 'parts by weight':
      return 'parts by weight';

    // Time
    case 'minute':
    case 'minutes':
    case 'min':
    case 'mins':
      return 'minute';
    case 'hour':
    case 'hours':
    case 'hr':
    case 'hrs':
      return 'hour';
    case 'day':
    case 'days':
      return 'day';
    case 'week':
    case 'weeks':
      return 'week';
    case 'month':
    case 'months':
      return 'month';
    case 'second':
    case 'seconds':
    case 'sec':
    case 'secs':
      return 'second';

    // Dimension
    case 'inch':
    case 'inches':
    case 'in':
      return 'inch';
    case 'cm':
      return 'cm';
    case 'mm':
      return 'mm';
    case 'foot':
    case 'feet':
    case 'ft':
      return 'ft';

    // Unknown unit: hand back the cleaned-up token unchanged.
    default:
      return key;
  }
}
/**
 * Classify a normalized unit (as produced by normalizeUnit) into a
 * measurement category.
 *
 * Note that both "parts by volume" and "parts by weight" are reported as
 * 'volume'.
 *
 * @param {string|null} normalizedUnit
 * @returns {'temperature'|'weight'|'volume'|'time'|'dimension'|null}
 *   null for a falsy or unrecognised unit
 */
function unitType(normalizedUnit) {
  if (!normalizedUnit) return null;

  const categories = [
    ['temperature', ['°F', '°C']],
    ['weight', ['g', 'kg', 'oz', 'lb']],
    [
      'volume',
      [
        'cup',
        'tablespoon',
        'teaspoon',
        'ml',
        'L',
        'quart',
        'gallon',
        'pint',
        'fl oz',
        'parts by volume',
        'parts by weight',
      ],
    ],
    ['time', ['minute', 'hour', 'day', 'week', 'month', 'second']],
    ['dimension', ['inch', 'cm', 'mm', 'ft']],
  ];

  const entry = categories.find(([, units]) => units.includes(normalizedUnit));
  return entry ? entry[0] : null;
}
// ─── Matchers ─────────────────────────────────────────────
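/**
* @typedef {Object} Measurement
* @property {string} match — full matched string from source
* @property {number} index — start position in source text
* @property {string} type — "temperature"|"weight"|"volume"|"time"|"dimension"|"count"
* @property {{value:number,approximate:boolean}|number[]|{min:object,max:object}} amount — parsed amount: a single value, a list of NxN dimension sides, or a min/max range
* @property {string|null} unit — normalized unit; null for counts and for NxN dimensions written without a unit
* @property {boolean} approximate — had ~ prefix (always false for ranges and NxN dimensions; see amount.min / amount.max for ranges)
* @property {object|null} alt — alternative measurement in parentheses
* @property {object|null} [intermediate] — weights only: the outer parenthesised value when alternatives are nested, e.g. the "800mL" in "860g (800mL (3 1/3 cups))"; null otherwise
*/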
/**
 * Find temperature measurements in text.
 *
 * Handles: 350°F, 200°C (400°F), 165-175 °F, 170C, 100-110C,
 * 32°c (90°f), ~220 °C
 *
 * Two passes are made:
 *  1. amounts followed by a degree sign and F/C;
 *  2. amounts immediately followed by a bare F/C (no space, no degree sign),
 *     where the letter is followed by whitespace, ")", ",", "/" or end of
 *     text. A bare match is dropped when its start lies inside a match
 *     already collected.
 *
 * Either form may carry a parenthesised alternative, returned as `alt`.
 *
 * Results are in discovery order (all degree-sign matches first), not
 * position order. A bare match can still contain an earlier degree-sign match
 * inside its parenthesised alternative — "170C (350°F)" yields both
 * "170C (350°F)" and "350°F" — so callers needing disjoint results must
 * resolve overlaps themselves, as findAllMeasurements does.
 *
 * @param {string} text
 * @returns {Measurement[]}
 */
function findTemperatures(text) {
  // Optional "(AMOUNT unit)" suffix shared by both patterns.
  const altSuffix =
    `(?:\\s*\\(\\s*(${AMOUNT})\\s*(${TEMP_UNIT_DEG}|${TEMP_UNIT_BARE})\\s*\\))?`;

  // Pattern with degree symbol: AMOUNT °F/C (optional alt)
  const degRe = new RegExp(`(${AMOUNT})\\s*(${TEMP_UNIT_DEG})${altSuffix}`, 'gi');

  // Pattern with bare C/F: the unit letter must directly follow the amount.
  const bareRe = new RegExp(
    `(${AMOUNT})(${TEMP_UNIT_BARE})(?=\\s|\\)|$|,|\\/)${altSuffix}`,
    'gi'
  );

  // Turn a regex match (groups: amount, unit, alt amount, alt unit) into a
  // Measurement record.
  const toMeasurement = (m) => {
    const amount = parseAmount(m[1]);
    const unit = normalizeUnit(m[2]);
    const alt = m[3] && m[4]
      ? { amount: parseAmount(m[3]), unit: normalizeUnit(m[4]) }
      : null;
    return {
      match: m[0],
      index: m.index,
      type: 'temperature',
      amount,
      unit,
      // Range amounts carry no top-level flag, so they report false here.
      approximate: typeof amount.approximate === 'boolean' ? amount.approximate : false,
      alt,
    };
  };

  const results = [];
  let m;

  while ((m = degRe.exec(text)) !== null) {
    results.push(toMeasurement(m));
  }

  while ((m = bareRe.exec(text)) !== null) {
    const start = m.index;
    // Skip if this position was already matched by an earlier pattern.
    const alreadyMatched = results.some(
      (r) => start >= r.index && start < r.index + r.match.length
    );
    if (!alreadyMatched) {
      results.push(toMeasurement(m));
    }
  }

  return results;
}
/**
 * Find dimension measurements in text.
 *
 * Handles: 9x13, 9 x 13, 8x6x2, 9 x 13 inch, 8x6x2 inch, 5-8mm (as dimension when mm)
 *
 * Two passes are made:
 *  1. "NxN" / "NxNxN" with an optional unit after whitespace; `amount` is the
 *     array of sides and `unit` is null when no unit follows;
 *  2. a standalone AMOUNT followed by a dimension unit ("1 inch", "0.5-1cm"),
 *     skipped when its start lies inside a pass-1 match.
 *
 * In pass 1 the unit must not be followed by a word character, so the "in" of
 * "9x13 insulated" is not read as inches (pass 2 has the equivalent `\b`).
 *
 * @param {string} text
 * @returns {Measurement[]}
 */
function findDimensions(text) {
  const results = [];

  // NxN or NxNxN with optional unit. `(?!\w)` rather than `\b` so that a unit
  // written with a trailing dot ("in.") is still consumed whole.
  const dimRe = new RegExp(
    `(${NUM})\\s*x\\s*(${NUM})(?:\\s*x\\s*(${NUM}))?(?:\\s+(${DIM_UNIT})(?!\\w))?`,
    'gi'
  );

  let m;
  while ((m = dimRe.exec(text)) !== null) {
    const dims = [parseFloat(m[1]), parseFloat(m[2])];
    if (m[3]) dims.push(parseFloat(m[3]));

    results.push({
      match: m[0],
      index: m.index,
      type: 'dimension',
      amount: dims,
      unit: normalizeUnit(m[4] || null),
      approximate: false,
      alt: null,
    });
  }

  // Standalone AMOUNT + dimension unit (e.g. "1 inch", "0.5-1cm")
  const standaloneDimRe = new RegExp(`(${AMOUNT})\\s*(${DIM_UNIT})\\b`, 'gi');

  while ((m = standaloneDimRe.exec(text)) !== null) {
    const start = m.index;
    // Skip if the match starts inside an NxN match already found
    const alreadyMatched = results.some(
      (r) => start >= r.index && start < r.index + r.match.length
    );
    if (alreadyMatched) continue;

    const amount = parseAmount(m[1]);
    results.push({
      match: m[0],
      index: m.index,
      type: 'dimension',
      amount,
      unit: normalizeUnit(m[2]),
      // Range amounts carry no top-level flag, so they report false here.
      approximate: typeof amount.approximate === 'boolean' ? amount.approximate : false,
      alt: null,
    });
  }

  return results;
}
/**
 * Find weight measurements in text.
 *
 * Handles: 200g, 550 g, ~250g, 180-240g, 1kg, 227g (8 oz),
 * 80g (1/3 cups), 860g (800mL (3 1/3 cups))
 *
 * A weight may be followed by a parenthesised alternative in a weight or
 * volume unit, which may itself contain one further nested alternative:
 *   227g (8 oz)                → alt = 8 oz
 *   80g (1/3 cups)             → alt = 1/3 cups
 *   860g (800mL (3 1/3 cups))  → alt = 3 1/3 cups, intermediate = 800mL
 *
 * @param {string} text
 * @returns {Array<Measurement & { intermediate: object|null }>}
 */
function findWeights(text) {
  const anyUnit = `${WEIGHT_UNIT}|${VOLUME_UNIT}`;
  // Innermost "(AMOUNT unit)" — groups 5 and 6.
  const innerAlt = `(?:\\s*\\(\\s*(${AMOUNT})\\s*(${anyUnit})\\s*\\))?`;
  // Outer "(AMOUNT unit [inner])" — groups 3 and 4.
  const outerAlt = `(?:\\s*\\(\\s*(${AMOUNT})\\s*(${anyUnit})\\b${innerAlt}\\s*\\))?`;
  const pattern = new RegExp(`(${AMOUNT})\\s*(${WEIGHT_UNIT})\\b${outerAlt}`, 'gi');

  const quantity = (rawAmount, rawUnit) => ({
    amount: parseAmount(rawAmount),
    unit: normalizeUnit(rawUnit),
  });

  const found = [];
  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    const [whole, rawAmount, rawUnit, outerAmount, outerUnit, innerAmount, innerUnit] = hit;
    const primary = quantity(rawAmount, rawUnit);

    let alt = null;
    let intermediate = null;
    if (innerAmount && innerUnit) {
      // Nested: the inner value is the display alternative, the outer one is
      // kept as the intermediate for scaling.
      alt = quantity(innerAmount, innerUnit);
      intermediate = quantity(outerAmount, outerUnit);
    } else if (outerAmount && outerUnit) {
      alt = quantity(outerAmount, outerUnit);
    }

    found.push({
      match: whole,
      index: hit.index,
      type: 'weight',
      amount: primary.amount,
      unit: primary.unit,
      // Only a literal boolean `true` on the amount counts; ranges report false.
      approximate: primary.amount.approximate === true,
      alt,
      intermediate,
    });
  }
  return found;
}
/**
 * Find volume measurements in text.
 *
 * Handles: 2 quarts, 1/2 cups, 1 cup, 6 tablespoons, 6 table spoons,
 * 1 1/2 tablespoon, 3/4 teaspoon, 6 parts by volume,
 * 800mL (3 1/3 cups)
 *
 * A volume may be followed by one parenthesised alternative in a volume or
 * weight unit, returned as `alt`.
 *
 * @param {string} text
 * @returns {Measurement[]}
 */
function findVolumes(text) {
  // Optional "(AMOUNT unit)" suffix — groups 3 and 4.
  const altGroup = `(?:\\s*\\(\\s*(${AMOUNT})\\s*(${VOLUME_UNIT}|${WEIGHT_UNIT})\\s*\\))?`;
  const pattern = new RegExp(`(${AMOUNT})\\s*(${VOLUME_UNIT})\\b${altGroup}`, 'gi');

  const found = [];
  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    const [whole, rawAmount, rawUnit, altAmount, altUnit] = hit;
    const amount = parseAmount(rawAmount);
    const unit = normalizeUnit(rawUnit);
    const hasAlt = Boolean(altAmount && altUnit);

    found.push({
      match: whole,
      index: hit.index,
      type: 'volume',
      amount,
      unit,
      // Only a literal boolean `true` on the amount counts; ranges report false.
      approximate: amount.approximate === true,
      alt: hasAlt
        ? { amount: parseAmount(altAmount), unit: normalizeUnit(altUnit) }
        : null,
    });
  }
  return found;
}
/**
 * Find time measurements in text.
 *
 * Handles: 10 minutes, 28 to 32 minutes, 8 to 10 minutes, 1 1/2 hours
 *
 * The amount must be numeric (anything AMOUNT accepts) and separated from the
 * unit by at least one whitespace character. Spelled-out or implicit amounts
 * ("an hour", "five days") and unspaced forms ("10min") are NOT matched.
 *
 * @param {string} text
 * @returns {Measurement[]}
 */
function findTimes(text) {
  const pattern = new RegExp(`(${AMOUNT})\\s+(${TIME_UNIT})\\b`, 'gi');

  const found = [];
  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    const [whole, rawAmount, rawUnit] = hit;
    const amount = parseAmount(rawAmount);
    const unit = normalizeUnit(rawUnit);

    found.push({
      match: whole,
      index: hit.index,
      type: 'time',
      amount,
      unit,
      // Only a literal boolean `true` on the amount counts; ranges report false.
      approximate: amount.approximate === true,
      alt: null,
    });
  }
  return found;
}
// ─── Count Matcher ────────────────────────────────────────
/**
 * Find bare numeric counts in text (no unit attached).
 * These represent ingredient quantities like "eggs 3" or "biscuits 20-24".
 *
 * An amount qualifies when it is followed (after optional whitespace) by end
 * of text, ",", ")", "]" or ";", or by whitespace and then any character
 * other than a digit, "x"/"X" or "~". Because a following word satisfies that
 * last rule, amounts that belong to a unit ("2 cups") are reported here too;
 * that overlap is resolved by deduplication in findAllMeasurements (longer
 * match wins).
 *
 * @param {string} text
 * @returns {Measurement[]} records with type "count" and unit null
 */
function findCounts(text) {
  // What is allowed to come right after a bare count (checked by lookahead,
  // so none of it is part of the match).
  const followers = [
    '\\s*$',
    '\\s*,',
    '\\s*\\)',
    '\\s*\\]',
    '\\s*;',
    '\\s+[^\\dxX~]',
  ];
  const pattern = new RegExp(`(${AMOUNT})(?=${followers.join('|')})`, 'g');

  const found = [];
  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    const rawAmount = hit[1];
    const amount = parseAmount(rawAmount);

    found.push({
      match: rawAmount,
      index: hit.index,
      type: 'count',
      amount,
      unit: null,
      // Only a literal boolean `true` on the amount counts; ranges report false.
      approximate: amount.approximate === true,
      alt: null,
    });
  }
  return found;
}
// ─── Main Matcher ─────────────────────────────────────────
/**
 * Find all measurement strings in the given text.
 *
 * Runs every type-specific matcher, orders the combined results by start
 * position and then resolves overlaps so that, of two overlapping matches,
 * the longer one survives. On a tie in length the match seen first is kept;
 * for matches starting at the same index that is the one from the earlier
 * matcher (temperatures > dimensions > weights > volumes > times > counts),
 * which relies on Array.prototype.sort being stable.
 *
 * @param {string} text — markdown or plain text to scan
 * @returns {Measurement[]} matches sorted by position
 */
function findAllMeasurements(text) {
  const finders = [
    findTemperatures,
    findDimensions,
    findWeights,
    findVolumes,
    findTimes,
    findCounts,
  ];

  const candidates = finders.reduce((acc, find) => acc.concat(find(text)), []);
  candidates.sort((left, right) => left.index - right.index);

  const spanEnd = (m) => m.index + m.match.length;

  const kept = [];
  for (const candidate of candidates) {
    const clash = kept.findIndex(
      (prior) => candidate.index < spanEnd(prior) && spanEnd(candidate) > prior.index
    );

    if (clash < 0) {
      kept.push(candidate);
      continue;
    }

    // Overlap: the longer match takes the slot, otherwise the candidate is dropped.
    if (candidate.match.length > kept[clash].match.length) {
      kept[clash] = candidate;
    }
  }
  return kept;
}
// ─── Exports ──────────────────────────────────────────────
// CommonJS export surface of this module.
module.exports = {
// Main API
// findAllMeasurements runs every matcher below and resolves overlapping matches.
findAllMeasurements,
// Individual matchers (exported for testing)
// Each returns its own matches without cross-type overlap resolution.
findTemperatures,
findDimensions,
findWeights,
findVolumes,
findTimes,
findCounts,
// Parsing utilities (exported for testing)
parseAmount,
parseSingleAmount,
normalizeUnit,
unitType,
};