diff options
author | Bruce Momjian | 2005-07-10 04:54:33 +0000 |
---|---|---|
committer | Bruce Momjian | 2005-07-10 04:54:33 +0000 |
commit | ba78293516eb521ff282aeb2aad8f18eee9f7add (patch) | |
tree | 727a764c7b1e6fc019d4343a8ae795a9384cf913 | |
parent | 1c2577907fdf8ed98afae2b19411ad1b25044914 (diff) |
I have redone the patch that implements regexp_replace.
The specification of the function is as follows.
regexp_replace(source text, pattern text, replacement text [, flags text])
returns text
Replaces the substring of the source text that matches the regular
expression with the replacement text.
- pattern is a regular expression pattern.
- replacement is the replacement string, which can use '\1'-'\9' and '\&'.
    '\1'-'\9': back reference to the n'th subexpression.
    '\&'     : the entire matched string.
- flags can use the following values:
    g: global (replace all)
    i: ignore case
  When flags is not specified, matching is case sensitive and only the
  first instance is replaced.
Atsushi Ogawa
-rw-r--r-- | doc/src/sgml/func.sgml | 20 | ||||
-rw-r--r-- | src/backend/regex/regexec.c | 8 | ||||
-rw-r--r-- | src/backend/utils/adt/regexp.c | 163 | ||||
-rw-r--r-- | src/backend/utils/adt/varlena.c | 220 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.h | 4 | ||||
-rw-r--r-- | src/include/regex/regex.h | 2 | ||||
-rw-r--r-- | src/include/utils/builtins.h | 3 | ||||
-rw-r--r-- | src/test/regress/expected/strings.out | 28 | ||||
-rw-r--r-- | src/test/regress/sql/strings.sql | 7 |
10 files changed, 410 insertions, 47 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 403f744043..e8365a273a 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1257,6 +1257,26 @@ PostgreSQL documentation </row> <row> + <entry><literal><function>regexp_replace</function>(<parameter>source</parameter> <type>text</type>, + <parameter>pattern</parameter> <type>text</type>, + <parameter>replacement</parameter> <type>text</type> + <optional>, <parameter>flags</parameter> <type>text</type></optional>)</literal></entry> + <entry><type>text</type></entry> + <entry>Replace string that matches the regular expression + <parameter>pattern</parameter> in <parameter>source</parameter> to + <parameter>replacement</parameter>. + <parameter>replacement</parameter> can use <literal>\1</>-<literal>\9</> and <literal>\&</>. + <literal>\1</>-<literal>\9</> is a back reference to the n'th subexpression, and + <literal>\&</> is the entire matched string. + <parameter>flags</parameter> can use <literal>g</>(global) and <literal>i</>(ignore case). + When flags is not specified, case sensitive matching is used, and it replaces + only the instance. 
+ </entry> + <entry><literal>regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3')</literal></entry> + <entry><literal>(111) 222-3333</literal></entry> + </row> + + <row> <entry><literal><function>repeat</function>(<parameter>string</parameter> <type>text</type>, <parameter>number</parameter> <type>integer</type>)</literal></entry> <entry><type>text</type></entry> <entry>Repeat <parameter>string</parameter> the specified diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 808f894711..eb5807545e 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -110,6 +110,7 @@ struct vars regmatch_t *pmatch; rm_detail_t *details; chr *start; /* start of string */ + chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ regoff_t *mem; /* memory vector for backtracking */ @@ -168,6 +169,7 @@ int pg_regexec(regex_t *re, const chr *string, size_t len, + size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], @@ -219,6 +221,7 @@ pg_regexec(regex_t *re, v->pmatch = pmatch; v->details = details; v->start = (chr *) string; + v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; if (backref) @@ -288,7 +291,8 @@ find(struct vars * v, NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; - close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); + close = shortest(v, s, v->search_start, v->search_start, v->stop, + &cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) @@ -415,7 +419,7 @@ cfindloop(struct vars * v, assert(d != NULL && s != NULL); cold = NULL; - close = v->start; + close = v->search_start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index e1460b6373..d85a1b538c 100644 --- a/src/backend/utils/adt/regexp.c +++ 
b/src/backend/utils/adt/regexp.c @@ -81,38 +81,27 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ /* - * RE_compile_and_execute - compile and execute a RE, caching if possible + * RE_compile_and_cache - compile a RE, caching if possible * - * Returns TRUE on match, FALSE on no match + * Returns regex_t * - * text_re --- the pattern, expressed as an *untoasted* TEXT object - * dat --- the data to match against (need not be null-terminated) - * dat_len --- the length of the data string - * cflags --- compile options for the pattern - * nmatch, pmatch --- optional return area for match details + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * cflags --- compile options for the pattern * - * Both pattern and data are given in the database encoding. We internally - * convert to array of pg_wchar which is what Spencer's regex package wants. + * Pattern is given in the database encoding. We internally convert to + * array of pg_wchar which is what Spencer's regex package wants. */ -static bool -RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, - int cflags, int nmatch, regmatch_t *pmatch) +static regex_t +RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); - pg_wchar *data; - size_t data_len; pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; - int regexec_result; cached_re_str re_temp; char errMsg[100]; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - /* * Look for a match among previously compiled REs. 
Since the data * structure is self-organizing with most-used entries at the front, @@ -134,28 +123,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, re_array[0] = re_temp; } - /* Perform RE match and return result */ - regexec_result = pg_regexec(&re_array[0].cre_re, - data, - data_len, - NULL, /* no details */ - nmatch, - pmatch, - 0); - - pfree(data); - - if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) - { - /* re failed??? */ - pg_regerror(regexec_result, &re_array[0].cre_re, - errMsg, sizeof(errMsg)); - ereport(ERROR, - (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), - errmsg("regular expression failed: %s", errMsg))); - } - - return (regexec_result == REG_OKAY); + return re_array[0].cre_re; } } @@ -220,10 +188,45 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, re_array[0] = re_temp; num_res++; + return re_array[0].cre_re; +} + +/* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. 
+ */ +static bool +RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) +{ + pg_wchar *data; + size_t data_len; + int regexec_result; + regex_t re; + char errMsg[100]; + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + /* Perform RE match and return result */ regexec_result = pg_regexec(&re_array[0].cre_re, data, data_len, + 0, NULL, /* no details */ nmatch, pmatch, @@ -428,15 +431,89 @@ textregexsubstr(PG_FUNCTION_ARGS) eo = pmatch[0].rm_eo; } - return (DirectFunctionCall3(text_substr, + return DirectFunctionCall3(text_substr, PointerGetDatum(s), Int32GetDatum(so + 1), - Int32GetDatum(eo - so))); + Int32GetDatum(eo - so)); } PG_RETURN_NULL(); } +/* + * textregexreplace_noopt() + * Return a replace string matched by a regular expression. + * This function is a version that doesn't specify the option of + * textregexreplace. This is case sensitive, replace the first + * instance only. + */ +Datum +textregexreplace_noopt(PG_FUNCTION_ARGS) +{ + text *s = PG_GETARG_TEXT_P(0); + text *p = PG_GETARG_TEXT_P(1); + text *r = PG_GETARG_TEXT_P(2); + regex_t re; + + re = RE_compile_and_cache(p, regex_flavor); + + return DirectFunctionCall4(replace_text_regexp, + PointerGetDatum(s), + PointerGetDatum(&re), + PointerGetDatum(r), + BoolGetDatum(false)); +} + +/* + * textregexreplace() + * Return a replace string matched by a regular expression. 
+ */ +Datum +textregexreplace(PG_FUNCTION_ARGS) +{ + text *s = PG_GETARG_TEXT_P(0); + text *p = PG_GETARG_TEXT_P(1); + text *r = PG_GETARG_TEXT_P(2); + text *opt = PG_GETARG_TEXT_P(3); + char *opt_p = VARDATA(opt); + int opt_len = (VARSIZE(opt) - VARHDRSZ); + int i; + bool global = false; + bool ignorecase = false; + regex_t re; + + /* parse options */ + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case 'i': + ignorecase = true; + break; + case 'g': + global = true; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option of regexp_replace: %c", + opt_p[i]))); + break; + } + } + + if (ignorecase) + re = RE_compile_and_cache(p, regex_flavor | REG_ICASE); + else + re = RE_compile_and_cache(p, regex_flavor); + + return DirectFunctionCall4(replace_text_regexp, + PointerGetDatum(s), + PointerGetDatum(&re), + PointerGetDatum(r), + BoolGetDatum(global)); +} + /* similar_escape() * Convert a SQL99 regexp pattern to POSIX style, so it can be used by * our regexp engine. diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 873ce894e2..bf9f8435f5 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -28,6 +28,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" +#include "regex/regex.h" typedef struct varlena unknown; @@ -1994,6 +1995,225 @@ replace_text(PG_FUNCTION_ARGS) } /* + * check_replace_text_has_escape_char + * check whether replace_text has escape char. 
+ */ +static bool +check_replace_text_has_escape_char(const text *replace_text) +{ + const char *p = VARDATA(replace_text); + const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ); + + if (pg_database_encoding_max_length() == 1) + { + for (; p < p_end; p++) + if (*p == '\\') return true; + } + else + { + for (; p < p_end; p += pg_mblen(p)) + if (*p == '\\') return true; + } + + return false; +} + +/* + * appendStringInfoRegexpSubstr + * append string by using back references of regexp. + */ +static void +appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, + regmatch_t *pmatch, text *src_text) +{ + const char *p = VARDATA(replace_text); + const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ); + + int eml = pg_database_encoding_max_length(); + + int substr_start = 1; + int ch_cnt; + + int so; + int eo; + + while (1) + { + /* Find escape char. */ + ch_cnt = 0; + if (eml == 1) + { + for (; p < p_end && *p != '\\'; p++) + ch_cnt++; + } + else + { + for (; p < p_end && *p != '\\'; p += pg_mblen(p)) + ch_cnt++; + } + + /* + * Copy the text when there is a text in the left of escape char + * or escape char is not found. + */ + if (ch_cnt) + { + text *append_text = text_substring(PointerGetDatum(replace_text), + substr_start, ch_cnt, false); + appendStringInfoString(str, PG_TEXT_GET_STR(append_text)); + pfree(append_text); + } + substr_start += ch_cnt + 1; + + if (p >= p_end) /* When escape char is not found. */ + break; + + /* See the next character of escape char. */ + p++; + so = eo = -1; + + if (*p >= '1' && *p <= '9') + { + /* Use the back reference of regexp. */ + int idx = *p - '0'; + so = pmatch[idx].rm_so; + eo = pmatch[idx].rm_eo; + p++; + substr_start++; + } + else if (*p == '&') + { + /* Use the entire matched string. */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + p++; + substr_start++; + } + + if (so != -1 && eo != -1) + { + /* Copy the text that is back reference of regexp. 
*/ + text *append_text = text_substring(PointerGetDatum(src_text), + so + 1, (eo - so), false); + appendStringInfoString(str, PG_TEXT_GET_STR(append_text)); + pfree(append_text); + } + } +} + +#define REGEXP_REPLACE_BACKREF_CNT 10 + +/* + * replace_text_regexp + * replace text that matches to regexp in src_text to replace_text. + */ +Datum +replace_text_regexp(PG_FUNCTION_ARGS) +{ + text *ret_text; + text *src_text = PG_GETARG_TEXT_P(0); + int src_text_len = VARSIZE(src_text) - VARHDRSZ; + regex_t *re = (regex_t *)PG_GETARG_POINTER(1); + text *replace_text = PG_GETARG_TEXT_P(2); + bool global = PG_GETARG_BOOL(3); + StringInfo str = makeStringInfo(); + int regexec_result; + regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; + pg_wchar *data; + size_t data_len; + int search_start; + int data_pos; + bool have_escape; + + /* Convert data string to wide characters. */ + data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(VARDATA(src_text), data, src_text_len); + + /* Check whether replace_text has escape char. */ + have_escape = check_replace_text_has_escape_char(replace_text); + + for (search_start = data_pos = 0; search_start <= data_len;) + { + regexec_result = pg_regexec(re, + data, + data_len, + search_start, + NULL, /* no details */ + REGEXP_REPLACE_BACKREF_CNT, + pmatch, + 0); + + if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) + { + char errMsg[100]; + + /* re failed??? */ + pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + } + + if (regexec_result == REG_NOMATCH) + break; + + /* + * Copy the text when there is a text in the left of matched position. 
+ */ + if (pmatch[0].rm_so - data_pos > 0) + { + text *left_text = text_substring(PointerGetDatum(src_text), + data_pos + 1, + pmatch[0].rm_so - data_pos, false); + appendStringInfoString(str, PG_TEXT_GET_STR(left_text)); + pfree(left_text); + } + + /* + * Copy the replace_text. Process back references when the + * replace_text has escape characters. + */ + if (have_escape) + appendStringInfoRegexpSubstr(str, replace_text, pmatch, src_text); + else + appendStringInfoString(str, PG_TEXT_GET_STR(replace_text)); + + search_start = data_pos = pmatch[0].rm_eo; + + /* + * When global option is off, replace the first instance only. + */ + if (!global) + break; + + /* + * Search from next character when the matching text is zero width. + */ + if (pmatch[0].rm_so == pmatch[0].rm_eo) + search_start++; + } + + /* + * Copy the text when there is a text at the right of last matched + * or regexp is not matched. + */ + if (data_pos < data_len) + { + text *right_text = text_substring(PointerGetDatum(src_text), + data_pos + 1, -1, true); + appendStringInfoString(str, PG_TEXT_GET_STR(right_text)); + pfree(right_text); + } + + ret_text = PG_STR_GET_TEXT(str->data); + pfree(str->data); + pfree(str); + pfree(data); + + PG_RETURN_TEXT_P(ret_text); +} + +/* * split_text * parse input string * return ord item (1 based) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 29e6d81487..974ec91d06 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200507081 +#define CATALOG_VERSION_NO 200507101 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 4ec80b470a..7dbf663627 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2207,6 +2207,10 @@ DATA(insert OID = 937 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 23" _nu DESCR("return portion of string"); DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f 
f t f i 3 25 "25 25 25" _null_ _null_ _null_ replace_text - _null_ )); DESCR("replace all occurrences of old_substr with new_substr in string"); +DATA(insert OID = 2284 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25 "25 25 25" _null_ _null_ _null_ textregexreplace_noopt - _null_ )); +DESCR("replace text using regexp"); +DATA(insert OID = 2285 ( regexp_replace PGNSP PGUID 12 f f t f i 4 25 "25 25 25 25" _null_ _null_ _null_ textregexreplace - _null_ )); +DESCR("replace text using regexp"); DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25 25 23" _null_ _null_ _null_ split_text - _null_ )); DESCR("split string by field_sep and return field_num"); DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23" _null_ _null_ _null_ to_hex32 - _null_ )); diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index ca48cf1892..f684c13858 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -163,7 +163,7 @@ typedef struct * the prototypes for exported functions */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); -extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *, size_t, regmatch_t[], int); +extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index b23f6eaaa5..010c7c0d77 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -429,6 +429,8 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS); extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); +extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS); +extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); /* regproc.c */ @@ -566,6 +568,7 @@ extern List 
*textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist); extern Datum replace_text(PG_FUNCTION_ARGS); +extern Datum replace_text_regexp(PG_FUNCTION_ARGS); extern Datum split_text(PG_FUNCTION_ARGS); extern Datum text_to_array(PG_FUNCTION_ARGS); extern Datum array_to_text(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index ab4cd6a797..d05ce56299 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -192,6 +192,34 @@ SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde (1 row) +-- PostgreSQL extension to allow using back reference in replace string; +SELECT regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3'); + regexp_replace +---------------- + (111) 222-3333 +(1 row) + +SELECT regexp_replace('AAA BBB CCC ', '\\s+', ' ', 'g'); + regexp_replace +---------------- + AAA BBB CCC +(1 row) + +SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); + regexp_replace +---------------- + ZAAAZ +(1 row) + +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); + regexp_replace +---------------- + Z Z +(1 row) + +-- invalid option of REGEXP_REPLACE +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); +ERROR: invalid option of regexp_replace: z -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; 4 diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index a59b39cf99..620aabe8ae 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -80,6 +80,13 @@ SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; -- With a parenthesized subexpression, return only what matches the subexpr SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; +-- PostgreSQL extension to allow using back reference in replace string; +SELECT regexp_replace('1112223333', '(\\d{3})(\\d{3})(\\d{4})', '(\\1) \\2-\\3'); +SELECT regexp_replace('AAA BBB 
CCC ', '\\s+', ' ', 'g'); +SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); +-- invalid option of REGEXP_REPLACE +SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; |