From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 3 Jan 2007 22:40:04 +0000 (+0000)
Subject: Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the
X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=de7f4251cd80244ee48c5ab3a169a68f783d083f;p=users%2Fbernd%2Fpostgres.git

Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the
form '^(foo)$'.  Before, these could never be optimized into indexscans.
The recent changes to make psql and pg_dump generate such patterns (for \d
commands and -t and related switches, respectively) therefore represented
a big performance hit for people with large pg_class catalogs, as seen in
a recent gripe from Erik Jones.  While at it, be more paranoid about
case-sensitivity checking in multibyte encodings, and fix some other
corner cases in which a regex might be interpreted too liberally.
---

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 2db0ff31ee..2959135861 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2735,7 +2735,10 @@ get_join_vars(List *args, Var **var1, Var **var2)
  * These routines support analysis of LIKE and regular-expression patterns
  * by the planner/optimizer.  It's important that they agree with the
  * regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c.
+ * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
+ * must be conservative: if we report a string longer than the true fixed
+ * prefix, the query may produce actually wrong answers, rather than just
+ * getting a bad selectivity estimate!
  *
  * Note that the prefix-analysis functions are called from
  * backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -2764,6 +2767,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 	Oid			typeid = patt_const->consttype;
 	int			pos,
 				match_pos;
+	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 
 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -2811,11 +2815,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 		}
 
 		/*
-		 * XXX I suspect isalpha() is not an adequately locale-sensitive
-		 * test for characters that can vary under case folding?
+		 * XXX In multibyte character sets, we can't trust isalpha, so assume
+		 * any multibyte char is potentially case-varying.
 		 */
-		if (case_insensitive && isalpha((unsigned char) patt[pos]))
-			break;
+		if (case_insensitive)
+		{
+			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+				break;
+			if (isalpha((unsigned char) patt[pos]))
+				break;
+		}
 
 		/*
 		 * NOTE: this code used to think that %% meant a literal %, but
@@ -2861,11 +2870,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	char	   *match;
 	int			pos,
 				match_pos,
-				paren_depth;
+				prev_pos,
+				prev_match_pos;
+	bool		have_leading_paren;
 	char	   *patt;
-	char	   *prefix;
 	char	   *rest;
 	Oid			typeid = patt_const->consttype;
+	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 
 	/*
 	 * Should be unnecessary, there are no bytea regex operators defined.
@@ -2879,7 +2890,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
 
 	/* Pattern must be anchored left */
-	if (patt[0] != '^')
+	pos = 0;
+	if (patt[pos] != '^')
 	{
 		rest = patt;
 
@@ -2888,104 +2900,130 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 
 		return Pattern_Prefix_None;
 	}
+	pos++;
 
 	/*
-	 * If unquoted | is present at paren level 0 in pattern, then there
-	 * are multiple alternatives for the start of the string.
+	 * If '|' is present in pattern, then there may be multiple alternatives
+	 * for the start of the string.  (There are cases where this isn't so,
+	 * for instance if the '|' is inside parens, but detecting that reliably
+	 * is too hard.)
 	 */
-	paren_depth = 0;
-	for (pos = 1; patt[pos]; pos++)
+	if (strchr(patt + pos, '|') != NULL)
 	{
-		if (patt[pos] == '|' && paren_depth == 0)
-		{
-			rest = patt;
+		rest = patt;
 
-			*prefix_const = NULL;
-			*rest_const = string_to_const(rest, typeid);
+		*prefix_const = NULL;
+		*rest_const = string_to_const(rest, typeid);
 
-			return Pattern_Prefix_None;
-		}
-		else if (patt[pos] == '(')
-			paren_depth++;
-		else if (patt[pos] == ')' && paren_depth > 0)
-			paren_depth--;
-		else if (patt[pos] == '\\')
-		{
-			/* backslash quotes the next character */
-			pos++;
-			if (patt[pos] == '\0')
-				break;
-		}
+		return Pattern_Prefix_None;
 	}
 
 	/* OK, allocate space for pattern */
-	prefix = match = palloc(strlen(patt) + 1);
-	match_pos = 0;
+	match = palloc(strlen(patt) + 1);
+	prev_match_pos = match_pos = 0;
 
-	/* note start at pos 1 to skip leading ^ */
-	for (pos = 1; patt[pos]; pos++)
+	/*
+	 * We special-case the syntax '^(...)$' because psql uses it.  But beware:
+	 * sequences beginning "(?" are not what they seem.
+	 */
+	have_leading_paren = false;
+	if (patt[pos] == '(' && patt[pos + 1] != '?')
+	{
+		have_leading_paren = true;
+		pos++;
+	}
+
+	/* Scan remainder of pattern */
+	prev_pos = pos;
+	while (patt[pos])
 	{
+		int			len;
+
 		/*
-		 * Check for characters that indicate multiple possible matches
-		 * here. XXX I suspect isalpha() is not an adequately
-		 * locale-sensitive test for characters that can vary under case
-		 * folding?
+		 * Check for characters that indicate multiple possible matches here.
+		 * Also, drop out at ')' or '$' so the termination test works right.
 		 */
 		if (patt[pos] == '.' ||
 			patt[pos] == '(' ||
+			patt[pos] == ')' ||
 			patt[pos] == '[' ||
-			patt[pos] == '$' ||
-			(case_insensitive && isalpha((unsigned char) patt[pos])))
+			patt[pos] == '^' ||
+			patt[pos] == '$')
 			break;
 
+		/*
+		 * XXX In multibyte character sets, we can't trust isalpha, so assume
+		 * any multibyte char is potentially case-varying.
+		 */
+		if (case_insensitive)
+		{
+			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+				break;
+			if (isalpha((unsigned char) patt[pos]))
+				break;
+		}
+
 		/*
 		 * Check for quantifiers.  Except for +, this means the preceding
-		 * character is optional, so we must remove it from the prefix
-		 * too!
+		 * character is optional, so we must remove it from the prefix too!
 		 */
 		if (patt[pos] == '*' ||
 			patt[pos] == '?' ||
 			patt[pos] == '{')
 		{
-			if (match_pos > 0)
-				match_pos--;
-			pos--;
+			match_pos = prev_match_pos;
+			pos = prev_pos;
 			break;
 		}
 		if (patt[pos] == '+')
 		{
-			pos--;
+			pos = prev_pos;
 			break;
 		}
+
+		/*
+		 * backslash quotes the next character.
+		 */
 		if (patt[pos] == '\\')
 		{
-			/* backslash quotes the next character */
 			pos++;
 			if (patt[pos] == '\0')
 				break;
 		}
-		match[match_pos++] = patt[pos];
+		/* save position in case we need to back up on next loop cycle */
+		prev_match_pos = match_pos;
+		prev_pos = pos;
+		/* must use encoding-aware processing here */
+		len = pg_mblen(&patt[pos]);
+		memcpy(&match[match_pos], &patt[pos], len);
+		match_pos += len;
+		pos += len;
 	}
 
 	match[match_pos] = '\0';
 	rest = &patt[pos];
 
+	if (have_leading_paren && patt[pos] == ')')
+		pos++;
+
 	if (patt[pos] == '$' && patt[pos + 1] == '\0')
 	{
 		rest = &patt[pos + 1];
 
-		*prefix_const = string_to_const(prefix, typeid);
+		*prefix_const = string_to_const(match, typeid);
 		*rest_const = string_to_const(rest, typeid);
 
+		pfree(patt);
+		pfree(match);
+
 		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
 	}
 
-	*prefix_const = string_to_const(prefix, typeid);
+	*prefix_const = string_to_const(match, typeid);
 	*rest_const = string_to_const(rest, typeid);
 
 	pfree(patt);
 	pfree(match);
-	prefix = NULL;
 
 	if (match_pos > 0)
 		return Pattern_Prefix_Partial;