summaryrefslogtreecommitdiff
path: root/src/backend
diff options
context:
space:
mode:
authorTom Lane2021-08-09 15:26:34 +0000
committerTom Lane2021-08-09 15:26:34 +0000
commit0e6aa8747d439bb7f08f95e358f0509c50396785 (patch)
tree64977d63a54b4f5c266692966282c6d7aab3aba7 /src/backend
parent76ad24400d73fa10d527844d50bedf7dacb1e87b (diff)
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of backref references.) Users could have obtained the same results before this by being careful to write "non capturing" parentheses wherever possible, but few people bother with that. Discussion: https://postgr.es/m/[email protected]
Diffstat (limited to 'src/backend')
-rw-r--r--src/backend/regex/regc_lex.c5
-rw-r--r--src/backend/regex/regcomp.c68
-rw-r--r--src/backend/regex/regexec.c36
-rw-r--r--src/backend/utils/adt/jsonpath_gram.y8
-rw-r--r--src/backend/utils/adt/regexp.c12
5 files changed, 98 insertions, 31 deletions
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 7673dab76f..45727ffa01 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -528,10 +528,7 @@ next(struct vars *v)
}
assert(NOTREACHED);
}
- if (v->cflags & REG_NOSUB)
- RETV('(', 0); /* all parens non-capturing */
- else
- RETV('(', 1);
+ RETV('(', 1);
break;
case CHR(')'):
if (LASTTYPE('('))
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 60a220c57a..ae3a7b6a38 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
static void freesubre(struct vars *, struct subre *);
static void freesubreandsiblings(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
-static void optst(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
static int numst(struct subre *, int);
static void markst(struct subre *);
static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
dumpst(v->tree, debug, 1);
}
#endif
- optst(v, v->tree);
+ if (v->cflags & REG_NOSUB)
+ removecaptures(v, v->tree);
v->ntree = numst(v->tree, 1);
markst(v->tree);
cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
break;
case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG);
- INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
- INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+ subno = v->nextvalue;
+ assert(subno > 0);
+ INSIST(subno < v->nsubs, REG_ESUBREG);
+ NOERRN();
+ INSIST(v->subs[subno] != NULL, REG_ESUBREG);
NOERRN();
- assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp);
NOERRN();
- subno = v->nextvalue;
atom->backno = subno;
+ v->subs[subno]->flags |= BRUSE;
EMPTYARC(lp, rp); /* temporarily, so there's something */
NEXT();
break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v, /* might be NULL */
}
/*
- * optst - optimize a subRE subtree
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * we may delete the "capture" markers on subREs that are not referenced
+ * by any backrefs, and then simplify anything that's become non-messy.
+ * Call this only if REG_NOSUB flag is set.
*/
static void
-optst(struct vars *v,
- struct subre *t)
+removecaptures(struct vars *v,
+ struct subre *t)
{
+ struct subre *t2;
+
+ assert(t != NULL);
+
+ /*
+ * If this isn't itself a backref target, clear capno and tentatively
+ * clear CAP flag.
+ */
+ if (!(t->flags & BRUSE))
+ {
+ t->capno = 0;
+ t->flags &= ~CAP;
+ }
+
+ /* Now recurse to children */
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ {
+ removecaptures(v, t2);
+ /* Propagate child CAP flag back up, if it's still set */
+ if (t2->flags & CAP)
+ t->flags |= CAP;
+ }
+
/*
- * DGP (2007-11-13): I assume it was the programmer's intent to eventually
- * come back and add code to optimize subRE trees, but the routine coded
- * just spends effort traversing the tree and doing nothing. We can do
- * nothing with less effort.
+ * If t now contains neither captures nor backrefs, there's no longer any
+ * need to care where its sub-match boundaries are, so we can reduce it to
+ * a simple DFA node. (Note in particular that MIXED child greediness is
+ * not a hindrance here, so we don't use the MESSY() macro.)
*/
- return;
+ if ((t->flags & (CAP | BACKR)) == 0)
+ {
+ if (t->child)
+ freesubreandsiblings(v, t->child);
+ t->child = NULL;
+ t->op = '=';
+ t->flags &= ~MIXED;
+ }
}
/*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
fprintf(f, " hascapture");
if (t->flags & BACKR)
fprintf(f, " hasbackref");
+ if (t->flags & BRUSE)
+ fprintf(f, " isreferenced");
if (!(t->flags & INUSE))
fprintf(f, " UNUSED");
if (t->latype != (char) -1)
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 5b9a087820..db54ebfba4 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -213,12 +213,9 @@ pg_regexec(regex_t *re,
return REG_NOMATCH;
backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
v->eflags = flags;
- if (v->g->cflags & REG_NOSUB)
- nmatch = 0; /* override client */
- v->nmatch = nmatch;
- if (backref)
+ if (backref && nmatch <= v->g->nsub)
{
- /* need work area */
+ /* need larger work area */
if (v->g->nsub + 1 <= LOCALMAT)
v->pmatch = mat;
else
@@ -229,7 +226,17 @@ pg_regexec(regex_t *re,
v->nmatch = v->g->nsub + 1;
}
else
+ {
+ /* we can store results directly in caller's array */
v->pmatch = pmatch;
+ /* ensure any extra entries in caller's array are filled with -1 */
+ if (nmatch > 0)
+ zapallsubs(pmatch, nmatch);
+ /* then forget about extra entries, to avoid useless work in find() */
+ if (nmatch > v->g->nsub + 1)
+ nmatch = v->g->nsub + 1;
+ v->nmatch = nmatch;
+ }
v->details = details;
v->start = (chr *) string;
v->search_start = (chr *) string + search_start;
@@ -290,12 +297,20 @@ pg_regexec(regex_t *re,
else
st = find(v, &v->g->tree->cnfa, &v->g->cmap);
- /* copy (portion of) match vector over if necessary */
- if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+ /* on success, ensure caller's match vector is filled correctly */
+ if (st == REG_OKAY && nmatch > 0)
{
- zapallsubs(pmatch, nmatch);
- n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
- memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+ if (v->pmatch != pmatch)
+ {
+ /* copy portion of match vector over from (larger) work area */
+ assert(nmatch <= v->nmatch);
+ memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+ }
+ if (v->g->cflags & REG_NOSUB)
+ {
+ /* don't expose possibly-partial sub-match results to caller */
+ zapallsubs(pmatch, nmatch);
+ }
}
/* clean up */
@@ -752,7 +767,6 @@ cdissect(struct vars *v,
break;
case '(': /* no-op capture node */
assert(t->child != NULL);
- assert(t->capno > 0);
er = cdissect(v, t->child, begin, end);
break;
default:
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index de3d97931e..bd5d4488a0 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
}
+ /*
+ * We'll never need sub-match details at execution. While
+ * RE_compile_and_execute would set this flag anyway, force it on here to
+ * ensure that the regex cache entries created by makeItemLikeRegex are
+ * useful.
+ */
+ cflags |= REG_NOSUB;
+
return cflags;
}
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 484d4265fd..268cee1cbe 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
{
regex_t *re;
+ /* Use REG_NOSUB if caller does not want sub-match details */
+ if (nmatch < 2)
+ cflags |= REG_NOSUB;
+
/* Compile RE */
re = RE_compile_and_cache(text_re, cflags, collation);
@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
int orig_len;
pg_wchar *wide_str;
int wide_len;
+ int cflags;
regex_t *cpattern;
regmatch_t *pmatch;
int pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
/* set up the compiled pattern */
- cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+ cflags = re_flags->cflags;
+ if (!use_subpatterns)
+ cflags |= REG_NOSUB;
+ cpattern = RE_compile_and_cache(pattern, cflags, collation);
/* do we want to remember subpatterns? */
if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
if (case_insensitive)
cflags |= REG_ICASE;
- re = RE_compile_and_cache(text_re, cflags, collation);
+ re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
/* Examine it to see if there's a fixed prefix */
re_result = pg_regprefix(re, &str, &slen);