// Constant properties for tracking regex syntax context export const Context = Object.freeze({ DEFAULT: 'DEFAULT', CHAR_CLASS: 'CHAR_CLASS', }); /** Replaces all unescaped instances of a regex pattern in the given context, using a replacement string or callback. Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax. @param {string} expression Search target @param {string} needle Search as a regex pattern, with flags `su` applied @param {string | (match: RegExpExecArray, details: { context: 'DEFAULT' | 'CHAR_CLASS'; negated: boolean; }) => string} replacement @param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified @returns {string} Updated expression @example const str = '.\\.\\\\.[[\\.].].'; replaceUnescaped(str, '\\.', '@'); // → '@\\.\\\\@[[\\.]@]@' replaceUnescaped(str, '\\.', '@', Context.DEFAULT); // → '@\\.\\\\@[[\\.].]@' replaceUnescaped(str, '\\.', '@', Context.CHAR_CLASS); // → '.\\.\\\\.[[\\.]@].' */ export function replaceUnescaped(expression, needle, replacement, context) { const re = new RegExp(String.raw`${needle}|(?<$skip>\[\^?|\\?.)`, 'gsu'); const negated = [false]; let numCharClassesOpen = 0; let result = ''; for (const match of expression.matchAll(re)) { const {0: m, groups: {$skip}} = match; if (!$skip && (!context || (context === Context.DEFAULT) === !numCharClassesOpen)) { if (replacement instanceof Function) { result += replacement(match, { context: numCharClassesOpen ? Context.CHAR_CLASS : Context.DEFAULT, negated: negated[negated.length - 1], }); } else { result += replacement; } continue; } if (m[0] === '[') { numCharClassesOpen++; negated.push(m[1] === '^'); } else if (m === ']' && numCharClassesOpen) { numCharClassesOpen--; negated.pop(); } result += m; } return result; } /** Runs a callback for each unescaped instance of a regex pattern in the given context. Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax. @param {string} expression Search target @param {string} needle Search as a regex pattern, with flags `su` applied @param {(match: RegExpExecArray, details: { context: 'DEFAULT' | 'CHAR_CLASS'; negated: boolean; }) => void} callback @param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified */ export function forEachUnescaped(expression, needle, callback, context) { // Do this the easy way replaceUnescaped(expression, needle, callback, context); } /** Returns a match object for the first unescaped instance of a regex pattern in the given context, or `null`. Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax. @param {string} expression Search target @param {string} needle Search as a regex pattern, with flags `su` applied @param {number} [pos] Offset to start the search @param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified @returns {RegExpExecArray | null} */ export function execUnescaped(expression, needle, pos = 0, context) { // Quick partial test; avoid the loop if not needed if (!(new RegExp(needle, 'su').test(expression))) { return null; } const re = new RegExp(`${needle}|(?<$skip>\\\\?.)`, 'gsu'); re.lastIndex = pos; let numCharClassesOpen = 0; let match; while (match = re.exec(expression)) { const {0: m, groups: {$skip}} = match; if (!$skip && (!context || (context === Context.DEFAULT) === !numCharClassesOpen)) { return match; } if (m === '[') { numCharClassesOpen++; } else if (m === ']' && numCharClassesOpen) { numCharClassesOpen--; } // Avoid an infinite loop on zero-length matches if (re.lastIndex == match.index) { re.lastIndex++; } } return null; } /** Checks whether an unescaped instance of a regex pattern appears in the given context. Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax. @param {string} expression Search target @param {string} needle Search as a regex pattern, with flags `su` applied @param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified @returns {boolean} Whether the pattern was found */ export function hasUnescaped(expression, needle, context) { // Do this the easy way return !!execUnescaped(expression, needle, 0, context); } /** Extracts the full contents of a group (subpattern) from the given expression, accounting for escaped characters, nested groups, and character classes. The group is identified by the position where its contents start (the string index just after the group's opening delimiter). Returns the rest of the string if the group is unclosed. Assumes UnicodeSets-mode syntax. @param {string} expression Search target @param {number} contentsStartPos @returns {string} */ export function getGroupContents(expression, contentsStartPos) { const token = /\\?./gsu; token.lastIndex = contentsStartPos; let contentsEndPos = expression.length; let numCharClassesOpen = 0; // Starting search within an open group, after the group's opening let numGroupsOpen = 1; let match; while (match = token.exec(expression)) { const [m] = match; if (m === '[') { numCharClassesOpen++; } else if (!numCharClassesOpen) { if (m === '(') { numGroupsOpen++; } else if (m === ')') { numGroupsOpen--; if (!numGroupsOpen) { contentsEndPos = match.index; break; } } } else if (m === ']') { numCharClassesOpen--; } } return expression.slice(contentsStartPos, contentsEndPos); }