import {Pattern, pattern} from './pattern.js'; import {Context, forEachUnescaped, replaceUnescaped} from 'regex-utilities'; const RegexContext = { DEFAULT: 'DEFAULT', CHAR_CLASS: 'CHAR_CLASS', ENCLOSED_P: 'ENCLOSED_P', ENCLOSED_U: 'ENCLOSED_U', GROUP_NAME: 'GROUP_NAME', INTERVAL_QUANTIFIER: 'INTERVAL_QUANTIFIER', INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN', }; const CharClassContext = { DEFAULT: 'DEFAULT', ENCLOSED_P: 'ENCLOSED_P', ENCLOSED_Q: 'ENCLOSED_Q', ENCLOSED_U: 'ENCLOSED_U', INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN', RANGE: 'RANGE', }; const enclosedTokenRegexContexts = new Set([ RegexContext.ENCLOSED_P, RegexContext.ENCLOSED_U, ]); const enclosedTokenCharClassContexts = new Set([ CharClassContext.ENCLOSED_P, CharClassContext.ENCLOSED_Q, CharClassContext.ENCLOSED_U, ]); const envSupportsFlagGroups = (() => { try { new RegExp('(?i:)'); } catch { return false; } return true; })(); const envSupportsFlagV = (() => { try { new RegExp('', 'v'); } catch { return false; } return true; })(); const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~'; const namedCapturingDelim = String.raw`$\?<(?![=!])(?[^>]+)>`; const capturingDelim = String.raw`\((?!\?)(?!(?<=\(\?\()DEFINE$)|${namedCapturingDelim}`; /** @param {string} expression @param {number} precedingCaptures @returns {string} */ function adjustNumberedBackrefs(expression, precedingCaptures) { return replaceUnescaped( expression, String.raw`\\(?[1-9]\d*)`, ({groups: {num}}) => `\\${+num + precedingCaptures}`, Context.DEFAULT ); } // Properties of strings as of ES2024 const stringPropertyNames = [ 'Basic_Emoji', 'Emoji_Keycap_Sequence', 'RGI_Emoji_Modifier_Sequence', 'RGI_Emoji_Flag_Sequence', 'RGI_Emoji_Tag_Sequence', 'RGI_Emoji_ZWJ_Sequence', 'RGI_Emoji', ].join('|'); const charClassUnionToken = new RegExp(String.raw` \\(?: c[A-Za-z] | p\{(?${stringPropertyNames})\} | [pP]\{[^\}]+\} | (?q) | u(?:[A-Fa-f\d]{4}|\{[A-Fa-f\d]+\}) | x[A-Fa-f\d]{2} | . ) | -- | && | . `.replace(/\s+/g, ''), 'gsu'); // Assumes flag v and doesn't worry about syntax errors that are caught by it function containsCharClassUnion(charClassPattern) { // Return `true` if it contains: // - `\p` (lowercase only) and the name is a property of strings (case sensitive). // - `\q`. // - Two single-char-matching tokens in sequence. // - One single-char-matching token followed immediately by unescaped `[`. // - One single-char-matching token preceded immediately by unescaped `]`. // Else, `false`. // Ranges with `-` create a single token. // Subtraction and intersection with `--` and `&&` create a single token. // Supports any number of nested classes let hasFirst = false; let lastM; for (const {0: m, groups} of charClassPattern.matchAll(charClassUnionToken)) { if (groups.pStrProp || groups.qStrProp) { return true; } if (m === '[' && hasFirst) { return true; } if (['-', '--', '&&'].includes(m)) { hasFirst = false; } else if (m !== '[' && m !== ']') { if (hasFirst || lastM === ']') { return true; } hasFirst = true; } lastM = m; } return false; } /** @param {string} expression @returns {number} */ function countCaptures(expression) { let num = 0; forEachUnescaped(expression, capturingDelim, () => num++, Context.DEFAULT); return num; } /** Escape special characters for the given context, assuming flag v. @param {string} str String to escape @param {'DEFAULT' | 'CHAR_CLASS'} context `Context` option from lib `regex-utilities` @returns {string} Escaped string */ function escapeV(str, context) { if (context === Context.CHAR_CLASS) { // Escape all double punctuators (including ^, which is special on its own in the first // position) in case they're bordered by the same character in or outside of the escaped string return str.replace(new RegExp(String.raw`[()\[\]{}|\\/\-${doublePunctuatorChars}]`, 'g'), '\\$&'); } return str.replace(/[()\[\]{}|\\^$*+?.]/g, '\\$&'); } // Look for characters that would change the meaning of subsequent tokens outside an interpolated value function getBreakoutChar(expression, regexContext, charClassContext) { const escapesRemoved = expression.replace(/\\./gsu, ''); // Trailing unescaped `\`; checking `.includes('\\')` would also work if (escapesRemoved.endsWith('\\')) { return '\\'; } if (regexContext === RegexContext.DEFAULT) { // Unbalanced `[` or `]` are also errors but don't breakout; they're caught by the wrapper return getUnbalancedChar(escapesRemoved, '(', ')'); } else if ( regexContext === RegexContext.CHAR_CLASS && !enclosedTokenCharClassContexts.has(charClassContext) ) { return getUnbalancedChar(escapesRemoved, '[', ']'); } else if ( regexContext === RegexContext.INTERVAL_QUANTIFIER || enclosedTokenRegexContexts.has(regexContext) || enclosedTokenCharClassContexts.has(charClassContext) ) { if (escapesRemoved.includes('}')) { return '}'; } } else if (regexContext === RegexContext.GROUP_NAME) { if (escapesRemoved.includes('>')) { return '>'; } } return ''; } const contextToken = new RegExp(String.raw` (?\(\?<(?![=!])|\\[gk]<) | (?\\[pPu]\{) | (?\\q\{) | (?\{) | (?\\(?: $ | c(?![A-Za-z]) | u(?![A-Fa-f\d]{4})[A-Fa-f\d]{0,3} | x(?![A-Fa-f\d]{2})[A-Fa-f\d]? ) ) | -- | \\?. `.replace(/\s+/g, ''), 'gsu'); /** @typedef {{ regexContext: string; charClassContext: string; charClassDepth: number; lastPos: number; }} RunningContext */ /** Accepts and returns its full state so it doesn't have to reprocess parts that have already been seen. Assumes flag v and doesn't worry about syntax errors that are caught by it. @param {string} incompleteExpression @param {Partial} [runningContext] @returns {RunningContext} */ function getEndContextForIncompleteExpression(incompleteExpression, runningContext) { let {regexContext, charClassContext, charClassDepth, lastPos} = { regexContext: RegexContext.DEFAULT, charClassContext: CharClassContext.DEFAULT, charClassDepth: 0, lastPos: 0, ...runningContext, }; contextToken.lastIndex = lastPos; let match; while (match = contextToken.exec(incompleteExpression)) { const {0: m, groups: {groupN, enclosedPU, enclosedQ, intervalQ, incompleteT}} = match; if (m === '[') { charClassDepth++; regexContext = RegexContext.CHAR_CLASS; charClassContext = CharClassContext.DEFAULT; } else if (m === ']' && regexContext === RegexContext.CHAR_CLASS) { if (charClassDepth) { charClassDepth--; } if (!charClassDepth) { regexContext = RegexContext.DEFAULT; } charClassContext = CharClassContext.DEFAULT; } else if (regexContext === RegexContext.CHAR_CLASS) { if (incompleteT) { charClassContext = CharClassContext.INVALID_INCOMPLETE_TOKEN; } else if (m === '-') { charClassContext = CharClassContext.RANGE; } else if (enclosedPU) { charClassContext = m[1] === 'u' ? CharClassContext.ENCLOSED_U : CharClassContext.ENCLOSED_P; } else if (enclosedQ) { charClassContext = CharClassContext.ENCLOSED_Q; } else if ( (m === '}' && enclosedTokenCharClassContexts.has(charClassContext)) || // Don't continue in these contexts since we've advanced another token charClassContext === CharClassContext.INVALID_INCOMPLETE_TOKEN || charClassContext === CharClassContext.RANGE ) { charClassContext = CharClassContext.DEFAULT; } } else { if (incompleteT) { regexContext = RegexContext.INVALID_INCOMPLETE_TOKEN; } else if (groupN) { regexContext = RegexContext.GROUP_NAME; } else if (enclosedPU) { regexContext = m[1] === 'u' ? RegexContext.ENCLOSED_U : RegexContext.ENCLOSED_P; } else if (intervalQ) { regexContext = RegexContext.INTERVAL_QUANTIFIER; } else if ( (m === '>' && regexContext === RegexContext.GROUP_NAME) || (m === '}' && (regexContext === RegexContext.INTERVAL_QUANTIFIER || enclosedTokenRegexContexts.has(regexContext))) || // Don't continue in this context since we've advanced another token regexContext === RegexContext.INVALID_INCOMPLETE_TOKEN ) { regexContext = RegexContext.DEFAULT; } } } return { regexContext, charClassContext, charClassDepth, lastPos: incompleteExpression.length, }; } // No special handling for escaped versions of the characters function getUnbalancedChar(expression, leftChar, rightChar) { let numOpen = 0; for (const [m] of expression.matchAll(new RegExp(`[${escapeV(leftChar + rightChar, Context.CHAR_CLASS)}]`, 'g'))) { numOpen += m === leftChar ? 1 : -1; if (numOpen < 0) { return rightChar; } } if (numOpen > 0) { return leftChar; } return ''; } /** @typedef {import('./regex.js').InterpolatedValue} InterpolatedValue @typedef {import('./regex.js').RawTemplate} RawTemplate @typedef {import('./regex.js').RegexTagOptions} RegexTagOptions @typedef {( value: InterpolatedValue, runningContext: RunningContext, options: Required ) => { transformed: string; runningContext: RunningContext; }} Preprocessor */ /** Returns transformed versions of a template and substitutions, using the given preprocessor. Only processes substitutions that are instanceof `Pattern`. @param {RawTemplate} template @param {ReadonlyArray} substitutions @param {Preprocessor} preprocessor @param {Required} options @returns {{template: RawTemplate; substitutions: ReadonlyArray;}} */ function preprocess(template, substitutions, preprocessor, options) { let /** @type {RawTemplate} */ newTemplate = {raw: []}; let newSubstitutions = []; let runningContext; template.raw.forEach((raw, i) => { const result = preprocessor(raw, {...runningContext, lastPos: 0}, options); newTemplate.raw.push(result.transformed); runningContext = result.runningContext; if (i < template.raw.length - 1) { const substitution = substitutions[i]; if (substitution instanceof Pattern) { const result = preprocessor(substitution, {...runningContext, lastPos: 0}, options); newSubstitutions.push(pattern(result.transformed)); runningContext = result.runningContext; } else { newSubstitutions.push(substitution); } } }); return { template: newTemplate, substitutions: newSubstitutions, }; } // Sandbox `^` if relevant, done so it can't change the meaning of the surrounding character class // if we happen to be at the first position. See `sandboxLoneDoublePunctuatorChar` for more details function sandboxLoneCharClassCaret(str) { return str.replace(/^\^/, '\\^^'); } // Sandbox without escaping by repeating the character and escaping only the first one. The second // one is so that, if followed by the same symbol, the resulting double punctuator will still throw // as expected. Details: // - Only need to check the first position because, if it's part of an implicit union, // interpolation handling will wrap it in nested `[…]`. // - Can't just wrap in nested `[…]` here, since the value might be used in a range. // - Can't add a second unescaped symbol if a lone symbol is the entire string because it might be // followed by the same unescaped symbol outside an interpolation, and since it won't be wrapped, // the second symbol wouldn't be sandboxed from the one following it. function sandboxLoneDoublePunctuatorChar(str) { return str.replace(new RegExp(`^([${doublePunctuatorChars}])(?!\\1)`), (m, _, pos) => { return `\\${m}${pos + 1 === str.length ? '' : m}`; }); } /** Converts `\0` tokens to `\x00` in the given context. @param {string} str @param {'DEFAULT' | 'CHAR_CLASS'} [context] `Context` option from lib `regex-utilities` @returns {string} */ function sandboxUnsafeNulls(str, context) { // regex`[\0${0}]` and regex`[${pattern`\0`}0]` can't be guarded against via nested `[…]` // sandboxing in character classes if the interpolated value doesn't contain union (since it // might be placed on a range boundary). So escape `\0` in character classes as `\x00` return replaceUnescaped(str, String.raw`\\0(?!\d)`, '\\x00', context); } export { adjustNumberedBackrefs, capturingDelim, CharClassContext, containsCharClassUnion, countCaptures, doublePunctuatorChars, enclosedTokenCharClassContexts, enclosedTokenRegexContexts, envSupportsFlagGroups, envSupportsFlagV, escapeV, getBreakoutChar, getEndContextForIncompleteExpression, namedCapturingDelim, preprocess, RegexContext, sandboxLoneCharClassCaret, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls, };