diff options
author | Marc G. Fournier | 1998-03-15 07:39:04 +0000 |
---|---|---|
committer | Marc G. Fournier | 1998-03-15 07:39:04 +0000 |
commit | 661ecf3c48e16a9add216287eb969d7615e47968 (patch) | |
tree | 91b54d5905aa2e22bd0ae9ea8c6b0f3cab75d3f4 /src/include/regex | |
parent | 31a925c4d07675bc098a742ee9ca642ec79a40ee (diff) |
From: [email protected]
Included are patches intended for allowing PostgreSQL to handle
multi-byte character sets such as EUC (Extended Unix Code), Unicode and
Mule internal code. With the MB patch you can use multi-byte character
sets in regexp and LIKE. The encoding system is chosen at
compile time.
To enable the MB extension, you need to define a variable "MB" in
Makefile.global or in Makefile.custom. For further information please
take a look at README.mb under the doc directory.
(Note that, unlike the "jp patch", I no longer use a modified GNU regexp.
Instead, I changed Henry Spencer's regexp that comes with PostgreSQL.)
Diffstat (limited to 'src/include/regex')
-rw-r--r-- | src/include/regex/pg_wchar.h | 44 | ||||
-rw-r--r-- | src/include/regex/regex.h | 7 | ||||
-rw-r--r-- | src/include/regex/regex2.h | 42 | ||||
-rw-r--r-- | src/include/regex/utils.h | 5 |
4 files changed, 93 insertions, 5 deletions
diff --git a/src/include/regex/pg_wchar.h b/src/include/regex/pg_wchar.h new file mode 100644 index 0000000000..616f76cfec --- /dev/null +++ b/src/include/regex/pg_wchar.h @@ -0,0 +1,44 @@ +/* $Id: pg_wchar.h,v 1.1 1998/03/15 07:38:47 scrappy Exp $ */ + +#ifndef PG_WCHAR_H +#define PG_WCHAR_H + +#include <sys/types.h> + +#define EUC_JP 0 /* EUC for Japanese */ +#define EUC_CN 1 /* EUC for Chinese */ +#define EUC_KR 2 /* EUC for Korean */ +#define EUC_TW 3 /* EUC for Taiwan */ +#define UNICODE 4 /* Unicode UTF-8 */ +#define MULE_INTERNAL 5 /* Mule internal code */ + +#ifdef MB +typedef unsigned int pg_wchar; +#else +#define pg_wchar char +#endif + +/* + * various definitions for EUC + */ +#define SS2 0x8e /* single shift 2 */ +#define SS3 0x8f /* single shift 3 */ + +/* + * various definitions for mule internal code + */ +#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8f) +#define IS_LCPRV1(c) ((unsigned char)(c) == 0x9a || (unsigned char)(c) == 0x9b) +#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) +#define IS_LCPRV2(c) ((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d) + +#ifdef MB +extern void pg_mb2wchar(const unsigned char *, pg_wchar *); +extern void pg_mb2wchar_with_len(const unsigned char *, pg_wchar *, int); +extern int pg_char_and_wchar_strcmp(const char *, const pg_wchar *); +extern int pg_wchar_strncmp(const pg_wchar *, const pg_wchar *, size_t); +extern int pg_char_and_wchar_strncmp(const char *, const pg_wchar *, size_t); +extern size_t pg_wchar_strlen(const pg_wchar *); +#endif + +#endif diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index cd9efbceb7..f0c9876fe0 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -41,6 +41,7 @@ #define _REGEX_H_ #include <sys/types.h> +#include <regex/pg_wchar.h> /* types */ typedef off_t regoff_t; @@ -49,8 +50,12 @@ typedef struct { int re_magic; size_t re_nsub; /* number of parenthesized subexpressions */ - const 
char *re_endp; /* end pointer for REG_PEND */ + const pg_wchar *re_endp; /* end pointer for REG_PEND */ struct re_guts *re_g; /* none of your business :-) */ +#ifdef MB + pg_wchar *patsave; /* mee too :-) */ +#endif + } regex_t; typedef struct diff --git a/src/include/regex/regex2.h b/src/include/regex/regex2.h index 564c626c5b..01cdadff45 100644 --- a/src/include/regex/regex2.h +++ b/src/include/regex/regex2.h @@ -127,12 +127,29 @@ typedef struct { uch *ptr; /* -> uch [csetsize] */ uch mask; /* bit within array */ - uch hash; /* hash code */ +#ifdef MB + pg_wchar hash; /* hash code */ + unsigned int lc; /* leading character (character-set) */ +#else + uch hash; /* hash code */ +#endif size_t smultis; char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ } cset; /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ +#ifdef MB +#define CHlc(c) (((unsigned)(c)&0xff0000)>>16) +#define CHadd(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] |= (cs)->mask, (cs)->hash += (unsigned)(c)&0xffff,\ + (cs)->lc = CHlc(c)) +#define CHsub(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] &= ~(cs)->mask, (cs)->hash -= (unsigned)(c)&0xffff) +#define CHIN(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] & (cs)->mask && \ + ((cs)->lc == CHlc(c))) +#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal + * fns */ +#define MCsub(p, cs, cp) mcsub(p, cs, cp) +#define MCin(p, cs, cp) mcin(p, cs, cp) +#else #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) @@ -140,6 +157,7 @@ typedef struct * fns */ #define MCsub(p, cs, cp) mcsub(p, cs, cp) #define MCin(p, cs, cp) mcin(p, cs, cp) +#endif /* stuff for character categories */ typedef unsigned char cat_t; @@ -168,7 +186,7 @@ struct re_guts int neol; /* number of $ used */ int ncategories; /* how many character categories */ cat_t *categories; /* ->catspace[-CHAR_MIN] */ - char *must; /* match 
must contain this string */ + pg_wchar *must; /* match must contain this string */ int mlen; /* length of must */ size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ @@ -178,5 +196,21 @@ struct re_guts }; /* misc utilities */ -#define OUT (CHAR_MAX+1) /* a non-character value */ -#define ISWORD(c) (isalnum(c) || (c) == '_') +#ifdef MB +# if MB == MULE_INTERNAL +# define OUT (16777216+1) /* 16777216 == 2^24 == 3 bytes */ +# elif MB == EUC_JP || MB == EUC_CN || MB == EUC_KR || MB == EUC_TW +# define OUT (USHRT_MAX+1) /* 2 bytes */ +# elif MB == UNICODE +# define OUT (USHRT_MAX+1) /* 2 bytes. assuming UCS-2 */ +# endif +#else +# define OUT (CHAR_MAX+1) /* a non-character value */ +#endif + +#ifdef MB +#define ISWORD(c) ((c >= 0 && c <= UCHAR_MAX) && \ + (isalnum(c) || (c) == '_')) +#else +#define ISWORD(c) (isalnum(c) || (c) == '_') +#endif diff --git a/src/include/regex/utils.h b/src/include/regex/utils.h index a7cae06919..6f02759aa1 100644 --- a/src/include/regex/utils.h +++ b/src/include/regex/utils.h @@ -42,7 +42,12 @@ /* utility definitions */ #define DUPMAX 100000000 /* xxx is this right? */ #define INFINITY (DUPMAX + 1) + +#ifdef MB +#define NC (SHRT_MAX - SHRT_MIN + 1) +#else #define NC (CHAR_MAX - CHAR_MIN + 1) +#endif typedef unsigned char uch; /* switch off assertions (if not already off) if no REDEBUG */ |