-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
170 lines (160 loc) · 5.95 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/**
Constant properties for tracking regex syntax context.
The functions in this module report `DEFAULT` for positions outside of any character class and
`CHAR_CLASS` for positions inside one, and accept either value to restrict a search to that context.
The object is frozen, so its properties can't be reassigned.
*/
export const Context = Object.freeze({
DEFAULT: 'DEFAULT',
CHAR_CLASS: 'CHAR_CLASS',
});
/**
Replaces all unescaped instances of a regex pattern in the given context, using a replacement
string or callback.

Doesn't skip over complete multicharacter tokens (only `\` plus its following char) so must be used
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.

Zero-length needle matches (e.g. from a lookaround, `\b`, or `$`) are handled like any other match:
the replacement is inserted at that position and the token that follows is copied to the output
unchanged (and still counted when tracking character classes and escapes).

@param {string} expression Search target
@param {string} needle Search as a regex pattern, with flags `su` applied
@param {string | (match: RegExpExecArray, details: {
  context: 'DEFAULT' | 'CHAR_CLASS';
  negated: boolean;
}) => string} replacement
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
@returns {string} Updated expression
@example
const str = '.\\.\\\\.[[\\.].].';
replaceUnescaped(str, '\\.', '@');
// → '@\\.\\\\@[[\\.]@]@'
replaceUnescaped(str, '\\.', '@', Context.DEFAULT);
// → '@\\.\\\\@[[\\.].]@'
replaceUnescaped(str, '\\.', '@', Context.CHAR_CLASS);
// → '.\\.\\\\.[[\\.]@].'
*/
export function replaceUnescaped(expression, needle, replacement, context) {
  // The needle gets first chance at every position; otherwise `$skip` consumes one token: a
  // character class opener (with optional `^`), an escaped char, or any single char
  const re = new RegExp(String.raw`${needle}|(?<$skip>\[\^?|\\?.)`, 'gsu');
  // Sticky copy of the `$skip` alternative, used to consume the token after a zero-length match
  const nextToken = /\[\^?|\\?./ysu;
  // Stack of negation states for the open character classes; the base entry is the default context
  const negated = [false];
  let numCharClassesOpen = 0;
  let result = '';
  let match;
  while ((match = re.exec(expression)) !== null) {
    const {groups: {$skip}} = match;
    let token = match[0];
    // `$skip` is only unset when the needle alternative matched
    if (!$skip) {
      if (!context || (context === Context.DEFAULT) === !numCharClassesOpen) {
        result += typeof replacement === 'function' ?
          replacement(match, {
            context: numCharClassesOpen ? Context.CHAR_CLASS : Context.DEFAULT,
            negated: negated[negated.length - 1],
          }) :
          replacement;
        if (token !== '') {
          continue;
        }
      }
      if (token === '') {
        // A zero-length match consumed nothing, so the needle would match here again. Consume the
        // following token explicitly so it is neither dropped from the output nor left untracked
        nextToken.lastIndex = re.lastIndex;
        const following = nextToken.exec(expression);
        if (following === null) {
          // End of the expression
          break;
        }
        token = following[0];
        re.lastIndex = nextToken.lastIndex;
      }
    }
    if (token[0] === '[') {
      numCharClassesOpen++;
      negated.push(token[1] === '^');
    } else if (token === ']' && numCharClassesOpen) {
      numCharClassesOpen--;
      negated.pop();
    }
    result += token;
  }
  return result;
}
/**
Invokes a callback once for every unescaped instance of a regex pattern found in the given context.

Doesn't skip over complete multicharacter tokens (only `\` plus its following char) so must be used
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.

@param {string} expression Search target
@param {string} needle Search as a regex pattern, with flags `su` applied
@param {(match: RegExpExecArray, details: {
  context: 'DEFAULT' | 'CHAR_CLASS';
  negated: boolean;
}) => void} callback
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
*/
export function forEachUnescaped(expression, needle, callback, context) {
  // Piggyback on the replacer's traversal, passing the callback as the replacement; the string
  // it assembles is intentionally discarded
  void replaceUnescaped(expression, needle, callback, context);
}
/**
Returns a match object for the first unescaped instance of a regex pattern in the given context, or
`null`.

Doesn't skip over complete multicharacter tokens (only `\` plus its following char) so must be used
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.

Character class tracking starts fresh at `pos`, so `pos` is treated as being outside of any
character class.

@param {string} expression Search target
@param {string} needle Search as a regex pattern, with flags `su` applied
@param {number} [pos] Offset to start the search
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
@returns {RegExpExecArray | null}
*/
export function execUnescaped(expression, needle, pos = 0, context) {
  // Quick partial test; avoid the loop if not needed
  if (!(new RegExp(needle, 'su').test(expression))) {
    return null;
  }
  // The needle gets first chance at every position; otherwise `$skip` consumes one token (an
  // escaped char or any single char)
  const re = new RegExp(`${needle}|(?<$skip>\\\\?.)`, 'gsu');
  // Sticky copy of the `$skip` alternative, used to step past a rejected zero-length match
  const nextToken = /\\?./ysu;
  re.lastIndex = pos;
  let numCharClassesOpen = 0;
  let match;
  while ((match = re.exec(expression)) !== null) {
    const {groups: {$skip}} = match;
    let token = match[0];
    // `$skip` is only unset when the needle alternative matched
    if (!$skip) {
      if (!context || (context === Context.DEFAULT) === !numCharClassesOpen) {
        return match;
      }
      if (token === '') {
        // A rejected zero-length match consumed nothing. Rather than bumping `lastIndex` by one
        // code unit (which would pass over the next char without tokenizing it), consume the
        // following token so brackets and escapes are still tracked
        nextToken.lastIndex = re.lastIndex;
        const following = nextToken.exec(expression);
        if (following === null) {
          // End of the expression
          return null;
        }
        token = following[0];
        re.lastIndex = nextToken.lastIndex;
      }
    }
    if (token === '[') {
      numCharClassesOpen++;
    } else if (token === ']' && numCharClassesOpen) {
      numCharClassesOpen--;
    }
  }
  return null;
}
/**
Reports whether at least one unescaped instance of a regex pattern occurs in the given context.

Doesn't skip over complete multicharacter tokens (only `\` plus its following char) so must be used
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.

@param {string} expression Search target
@param {string} needle Search as a regex pattern, with flags `su` applied
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
@returns {boolean} Whether the pattern was found
*/
export function hasUnescaped(expression, needle, context) {
  // Search from the start of the expression; only the existence of a match matters here
  const firstMatch = execUnescaped(expression, needle, 0, context);
  return Boolean(firstMatch);
}
/**
Extracts the full contents of a group (subpattern) from the given expression, accounting for
escaped characters, nested groups, and character classes. The group is identified by the position
where its contents start (the string index just after the group's opening delimiter). Returns the
rest of the string if the group is unclosed.

Assumes UnicodeSets-mode syntax.

@param {string} expression Search target
@param {number} contentsStartPos
@returns {string}
*/
export function getGroupContents(expression, contentsStartPos) {
  // Each token is either an escaped char (`\` plus the char after it) or a single char
  const tokenRe = /\\?./gsu;
  tokenRe.lastIndex = contentsStartPos;
  let classDepth = 0;
  // The scan begins inside the target group, so that group already counts as open
  let groupDepth = 1;
  for (let hit = tokenRe.exec(expression); hit !== null; hit = tokenRe.exec(expression)) {
    const tok = hit[0];
    if (tok === '[') {
      // Opens a character class (possibly nested within another class)
      classDepth += 1;
      continue;
    }
    if (classDepth > 0) {
      // Parentheses are literals within a class; only a closing bracket matters here
      if (tok === ']') {
        classDepth -= 1;
      }
      continue;
    }
    if (tok === '(') {
      groupDepth += 1;
    } else if (tok === ')') {
      groupDepth -= 1;
      if (groupDepth === 0) {
        // Found the paren that closes the target group; its contents end just before it
        return expression.slice(contentsStartPos, hit.index);
      }
    }
  }
  // Unclosed group: everything to the end of the expression
  return expression.slice(contentsStartPos, expression.length);
}