import {atomic, possessive} from './atomic.js';
import {backcompatPlugin} from './backcompat.js';
import {flagNPreprocessor} from './flag-n.js';
import {clean, flagXPreprocessor} from './flag-x.js';
import {Pattern, pattern} from './pattern.js';
import {RegExpSubclass} from './subclass.js';
import {subroutines} from './subroutines.js';
import {adjustNumberedBackrefs, CharClassContext, containsCharClassUnion, countCaptures, enclosedTokenCharClassContexts, enclosedTokenRegexContexts, envSupportsFlagGroups, envSupportsFlagV, escapeV, getBreakoutChar, getEndContextForIncompleteExpression, preprocess, RegexContext, sandboxLoneCharClassCaret, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js';
import {Context, hasUnescaped, replaceUnescaped} from 'regex-utilities';

// NOTE(review): the generic type arguments in the JSDoc blocks of this file (e.g. `Map<…>`,
// `Array<…>`, `ReadonlyArray<…>`, `Required<…>`, `RegexTag<…>`) were restored after having been
// stripped from the text; confirm them against version control

/**
@typedef {string | RegExp | Pattern | number} InterpolatedValue
@typedef {{
  flags?: string;
  captureTransfers?: Map<number, Array<number>>;
  hiddenCaptures?: Array<number>;
}} PluginData
@typedef {{
  pattern: string;
  captureTransfers?: Map<number, Array<number>>;
  hiddenCaptures?: Array<number>;
}} PluginResult
@typedef {TemplateStringsArray | {raw: Array<string>}} RawTemplate
@typedef {{
  flags?: string;
  subclass?: boolean;
  plugins?: Array<(expression: string, data: PluginData) => PluginResult>;
  unicodeSetsPlugin?: ((expression: string, data: PluginData) => PluginResult) | null;
  disable?: {
    x?: boolean;
    n?: boolean;
    v?: boolean;
    atomic?: boolean;
    subroutines?: boolean;
  };
  force?: {
    v?: boolean;
  };
}} RegexTagOptions
*/

/**
@template T
@typedef RegexTag
@type {{
  (template: RawTemplate, ...substitutions: ReadonlyArray<InterpolatedValue>): T;
  (flags?: string): RegexTag<T>;
  (options: RegexTagOptions & {subclass?: false}): RegexTag<T>;
  (options: RegexTagOptions & {subclass: true}): RegexTag<RegExpSubclass>;
}}
*/
/**
Template tag for constructing a regex with extended syntax and context-aware interpolation of
regexes, strings, and patterns.

Can be called in several ways:
1. `` regex`…` `` - Regex pattern as a raw string.
2. `` regex('gi')`…` `` - To specify flags.
3. `` regex({flags: 'gi'})`…` `` - With options.

When called with flags or an options object (and no further arguments), a tag function bound to
those options is returned. Any other argument shape throws.
@type {RegexTag<RegExp>}
*/
const regex = (first, ...substitutions) => {
  // Given a template
  if (Array.isArray(first?.raw)) {
    return regexFromTemplate({}, first, ...substitutions);
  // Given flags
  } else if ((typeof first === 'string' || first === undefined) && !substitutions.length) {
    return regexFromTemplate.bind(null, {flags: first ?? ''});
  // Given an options object
  } else if ({}.toString.call(first) === '[object Object]' && !substitutions.length) {
    return regexFromTemplate.bind(null, first);
  }
  throw new Error(`Unexpected arguments: ${JSON.stringify([first, ...substitutions])}`);
};

/**
@template T
@typedef RegexFromTemplate
@type {{
  (options: RegexTagOptions, template: RawTemplate, ...substitutions: ReadonlyArray<InterpolatedValue>) : T;
}}
*/
/**
Returns a RegExp from a template and substitutions to fill the template holes.

Preprocessors run on the template first, then raw strings and interpolated values are joined
into a single expression, then plugins run on that expression. If constructing the regex throws,
the error message is rewritten to include the generated source and flags before rethrowing.
@type {RegexFromTemplate<RegExp>}
*/
const regexFromTemplate = (options, template, ...substitutions) => {
  const opts = getOptions(options);
  const prepped = runPreprocessors(template, substitutions, opts);

  // Number of capturing groups seen so far; used to renumber backrefs of interpolated regexes
  let precedingCaptures = 0;
  let expression = '';
  let runningContext;
  // Intersperse raw template strings and substitutions
  prepped.template.raw.forEach((raw, i) => {
    // Escaped strings only need wrapping when there is raw template text on either side
    const wrapEscapedStr = !!(prepped.template.raw[i] || prepped.template.raw[i + 1]);
    // Even with flag n enabled, we might have named captures
    precedingCaptures += countCaptures(raw);
    // Sandbox `\0` in character classes. Not needed outside character classes because in other
    // cases a following interpolated value would always be atomized
    expression += sandboxUnsafeNulls(raw, Context.CHAR_CLASS);
    runningContext = getEndContextForIncompleteExpression(expression, runningContext);
    const {regexContext, charClassContext} = runningContext;
    // There is one fewer substitution than raw strings
    if (i < prepped.template.raw.length - 1) {
      const substitution = prepped.substitutions[i];
      expression += interpolate(substitution, opts.flags, regexContext, charClassContext, wrapEscapedStr, precedingCaptures);
      if (substitution instanceof RegExp) {
        precedingCaptures += countCaptures(substitution.source);
      } else if (substitution instanceof Pattern) {
        precedingCaptures += countCaptures(String(substitution));
      }
    }
  });

  const plugged = runPlugins(expression, opts);
  expression = plugged.pattern;
  try {
    return opts.subclass ?
      new RegExpSubclass(expression, opts.flags, {hiddenCaptures: plugged.hiddenCaptures}) :
      new RegExp(expression, opts.flags);
  } catch (err) {
    // Improve DX by always including the generated source in the error message. Some browsers
    // include it automatically, but not Firefox or Safari
    const stripped = err.message.replace(/ \/.+\/[a-z]*:/, '');
    err.message = `${stripped}: /${expression}/${opts.flags}`;
    throw err;
  }
};

/**
Returns the processed expression and flags as strings.

Runs the same preprocessors and plugins as the `regex` tag on a plain string (no interpolation)
without constructing a RegExp. Throws if option `subclass` is enabled.
@param {string} expression
@param {RegexTagOptions} [options]
@returns {{
  pattern: string;
  flags: string;
}}
*/
function rewrite(expression = '', options) {
  const opts = getOptions(options);
  if (opts.subclass) {
    throw new Error('Cannot use option subclass');
  }
  return {
    // NOTE: Since `pattern` is a Regex+ export with special meaning, the term `expression` is used
    // in code to refer to regex source strings, except in the public API
    pattern: runPlugins(
      runPreprocessors({raw: [expression]}, [], opts).template.raw[0],
      opts
    ).pattern,
    flags: opts.flags,
  };
}

/**
Returns a complete set of options, with default values set for options that weren't provided, and
some options augmented for use.

Flag `v` or `u` is appended to `flags`, and `unicodeSetsPlugin` is set to `null` when flag `v` is
used. Throws if `flags` explicitly contains any of `n`, `u`, `v`, or `x`.
@param {RegexTagOptions} [options]
@returns {Required<RegexTagOptions>}
*/
function getOptions(options) {
  const opts = {
    flags: '',
    subclass: false,
    plugins: [],
    unicodeSetsPlugin: backcompatPlugin,
    disable: {/* n, v, x, atomic, subroutines */},
    force: {/* v */},
    ...options,
  };
  // The spread above copies keys that are explicitly `undefined`/`null` over the defaults. Restore
  // the defaults for keys that are dereferenced or tested below and by callers; otherwise e.g.
  // `{flags: undefined}` would be tested as the string "undefined" and rejected for containing
  // `n` and `u`, and `{disable: undefined}` would cause a TypeError. `unicodeSetsPlugin` is
  // intentionally left as provided since `null` is a meaningful value for it
  opts.flags = opts.flags ?? '';
  opts.plugins = opts.plugins ?? [];
  opts.disable = opts.disable ?? {};
  opts.force = opts.force ?? {};
  if (/[nuvx]/.test(opts.flags)) {
    throw new Error('Implicit flags v/u/x/n cannot be explicitly added');
  }
  // Forcing flag v wins over disabling it; otherwise use it only when the environment supports it
  const useFlagV = opts.force.v || (opts.disable.v ? false : envSupportsFlagV);
  opts.flags += useFlagV ? 'v' : 'u';
  if (useFlagV) {
    opts.unicodeSetsPlugin = null;
  }
  return opts;
}

/**
Applies the enabled preprocessors (implicit flags x and n) in order to the template and
substitutions, feeding the output of each one into the next.
@param {RawTemplate} template
@param {ReadonlyArray<InterpolatedValue>} substitutions
@param {Required<RegexTagOptions>} options
@returns {{
  template: RawTemplate;
  substitutions: ReadonlyArray<InterpolatedValue>;
}}
*/
function runPreprocessors(template, substitutions, options) {
  const preprocessors = [];
  // Implicit flag x is handled first because otherwise some regex syntax (if unescaped) within
  // comments could cause problems when parsing
  if (!options.disable.x) {
    preprocessors.push(flagXPreprocessor);
  }
  // Implicit flag n is a preprocessor because capturing groups affect backreference rewriting in
  // both interpolation and plugins
  if (!options.disable.n) {
    preprocessors.push(flagNPreprocessor);
  }
  for (const pp of preprocessors) {
    ({template, substitutions} = preprocess(template, substitutions, pp, options));
  }
  return {
    template,
    substitutions,
  };
}

/**
Runs the provided plugins followed by the enabled built-in plugins over the expression, passing
each plugin the output of the previous one along with the flags and the current list of hidden
captures. Throws if a plugin doesn't return an object with a string `pattern`.
@param {string} expression
@param {Required<RegexTagOptions>} options
@returns {{
  pattern: string;
  hiddenCaptures: Array<number>;
}}
*/
function runPlugins(expression, {flags, plugins, unicodeSetsPlugin, disable}) {
  let hiddenCaptures = [];
  [ ...plugins, // Run first, so provided plugins can output extended syntax
    ...(disable.subroutines ? [] : [subroutines]),
    ...(disable.atomic ? [] : [possessive, atomic]),
    ...(disable.x ? [] : [clean]),
    // Run last, so it doesn't have to worry about parsing extended syntax
    ...(!unicodeSetsPlugin ? [] : [unicodeSetsPlugin]),
  ].forEach(plugin => {
    const result = plugin(expression, {flags, hiddenCaptures});
    if (typeof result?.pattern !== 'string') {
      throw new Error('Plugin must return an object with a string property "pattern"');
    }
    expression = result.pattern;
    // Plugins that don't report hidden captures leave the current list untouched
    if (result.hiddenCaptures) {
      hiddenCaptures = result.hiddenCaptures;
    }
  });
  return {
    pattern: expression,
    hiddenCaptures,
  };
}

/**
Returns the string to insert into the expression for an interpolated value, given the syntax
context at the point of interpolation. Regexes may only be interpolated in the default context.
Strings and numbers are escaped; patterns are inserted as-is. Depending on the context, the result
is wrapped in `(?:…)` or `[…]` so that it is treated as a unit by the surrounding syntax.

Throws if a regex is interpolated outside the default context, if the interpolation is preceded
by an incomplete token, if the value contains a character that would break out of the current
context, or if a pattern in a character class starts or ends with a range or set operator.
@param {InterpolatedValue} value
@param {string} flags
@param {string} regexContext
@param {string} charClassContext
@param {boolean} wrapEscapedStr
@param {number} precedingCaptures
@returns {string}
*/
function interpolate(value, flags, regexContext, charClassContext, wrapEscapedStr, precedingCaptures) {
  if (value instanceof RegExp && regexContext !== RegexContext.DEFAULT) {
    throw new Error('Cannot interpolate a RegExp at this position because the syntax context does not match');
  }
  if (regexContext === RegexContext.INVALID_INCOMPLETE_TOKEN || charClassContext === CharClassContext.INVALID_INCOMPLETE_TOKEN) {
    // Throw in all cases, but only *need* to handle a preceding unescaped backslash (which would
    // break sandboxing) since other errors would be handled by the invalid generated regex syntax
    throw new Error('Interpolation preceded by invalid incomplete token');
  }
  // Numbers in an enclosed `\u{…}` context are inserted as hexadecimal
  if (
    typeof value === 'number' &&
    (regexContext === RegexContext.ENCLOSED_U || charClassContext === CharClassContext.ENCLOSED_U)
  ) {
    return value.toString(16);
  }
  const isPattern = value instanceof Pattern;
  let escapedValue = '';
  if (!(value instanceof RegExp)) {
    value = String(value);
    if (!isPattern) {
      escapedValue = escapeV(
        value,
        regexContext === RegexContext.CHAR_CLASS ? Context.CHAR_CLASS : Context.DEFAULT
      );
    }
    // Check `escapedValue` (not just patterns) since possible breakout char `>` isn't escaped
    const breakoutChar = getBreakoutChar(escapedValue || value, regexContext, charClassContext);
    if (breakoutChar) {
      throw new Error(`Unescaped stray "${breakoutChar}" in the interpolated value would have side effects outside it`);
    }
  }

  if (
    regexContext === RegexContext.INTERVAL_QUANTIFIER ||
    regexContext === RegexContext.GROUP_NAME ||
    enclosedTokenRegexContexts.has(regexContext) ||
    enclosedTokenCharClassContexts.has(charClassContext)
  ) {
    // Inside these tokens the value is inserted without any wrapping
    return isPattern ? String(value) : escapedValue;
  } else if (regexContext === RegexContext.CHAR_CLASS) {
    if (isPattern) {
      if (hasUnescaped(String(value), '^-|^&&|-$|&&$')) {
        // Sandboxing so we don't change the chars outside the pattern into being part of an
        // operation they didn't initiate. Same problem as starting a pattern with a quantifier
        throw new Error('Cannot use range or set operator at boundary of interpolated pattern; move the operation into the pattern or the operator outside of it');
      }
      const sandboxedValue = sandboxLoneCharClassCaret(sandboxLoneDoublePunctuatorChar(value));
      // Atomize via nested character class `[…]` if it contains implicit or explicit union (check
      // the unadjusted value)
      return containsCharClassUnion(value) ? `[${sandboxedValue}]` : sandboxUnsafeNulls(sandboxedValue);
    }
    // Atomize via nested character class `[…]` if more than one node
    return containsCharClassUnion(escapedValue) ? `[${escapedValue}]` : escapedValue;
  }
  // `RegexContext.DEFAULT`
  if (value instanceof RegExp) {
    const transformed = transformForLocalFlags(value, flags);
    const backrefsAdjusted = adjustNumberedBackrefs(transformed.value, precedingCaptures);
    // Sandbox and atomize; if we used a pattern modifier it has the same effect
    return transformed.usedModifier ? backrefsAdjusted : `(?:${backrefsAdjusted})`;
  }
  if (isPattern) {
    // Sandbox and atomize
    return `(?:${value})`;
  }
  // Sandbox and atomize
  return wrapEscapedStr ? `(?:${escapedValue})` : escapedValue;
}

/**
Returns the source of an interpolated regex, adjusted so that differences between its own flags
i, m, and s and those of the outer regex are preserved locally.

When `envSupportsFlagGroups` is truthy, differences are expressed by wrapping the source in a
modifier group such as `(?i-s:…)`, and `usedModifier` is `true` in the result. Otherwise, `.`, `^`,
and `$` are rewritten to emulate differing flags s and m, and a differing flag i throws.
@param {RegExp} re
@param {string} outerFlags
@returns {{value: string; usedModifier?: boolean;}}
*/
function transformForLocalFlags(re, outerFlags) {
  // For each flag: `null` if it matches the outer regex, else whether it must be on (`true`) or
  // off (`false`) within the interpolated regex
  /** @type {{i: boolean | null; m: boolean | null; s: boolean | null;}} */
  const modFlagsObj = {
    i: null,
    m: null,
    s: null,
  };
  const newlines = '\\n\\r\\u2028\\u2029';
  let value = re.source;
  if (re.ignoreCase !== outerFlags.includes('i')) {
    if (envSupportsFlagGroups) {
      modFlagsObj.i = re.ignoreCase;
    } else {
      throw new Error('Pattern modifiers not supported, so flag i on the outer and interpolated regex must match');
    }
  }
  if (re.dotAll !== outerFlags.includes('s')) {
    if (envSupportsFlagGroups) {
      modFlagsObj.s = re.dotAll;
    } else {
      value = replaceUnescaped(value, '\\.', (re.dotAll ? '[^]' : `[^${newlines}]`), Context.DEFAULT);
    }
  }
  if (re.multiline !== outerFlags.includes('m')) {
    if (envSupportsFlagGroups) {
      modFlagsObj.m = re.multiline;
    } else {
      // NOTE(review): the text from the negative lookbehind literal below through the start of the
      // `modifier` declaration was missing and has been reconstructed to mirror the handling of
      // `^` and to declare the `keys` and `modifier` names used afterward; confirm against
      // version control
      value = replaceUnescaped(value, '\\^', (re.multiline ? `(?<=^|[${newlines}])` : '(?<![^])'), Context.DEFAULT);
      value = replaceUnescaped(value, '\\$', (re.multiline ? `(?=$|[${newlines}])` : '(?![^])'), Context.DEFAULT);
    }
  }
  if (envSupportsFlagGroups) {
    const keys = Object.keys(modFlagsObj);
    // Build e.g. `i-ms` from the flags to turn on and off
    let modifier = keys.filter(k => modFlagsObj[k] === true).join('');
    const modOff = keys.filter(k => modFlagsObj[k] === false).join('');
    if (modOff) {
      modifier += `-${modOff}`;
    }
    if (modifier) {
      return {
        value: `(?${modifier}:${value})`,
        usedModifier: true,
      };
    }
  }
  return {value};
}

export {
  pattern,
  regex,
  rewrite,
};