summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane2009-08-05 18:06:49 +0000
committerTom Lane2009-08-05 18:06:49 +0000
commit059bc14fc7b18e8dfced6bf799709d08b46c1e39 (patch)
tree6b7f092678591afee036036dbdeb6a9572970492
parent89e1d6641d142659679869079bcbeeb6df63f089 (diff)
Add matchorig, matchsynonyms, and keepsynonyms options to contrib/dict_xsyn.
Sergey Karpov
-rw-r--r--contrib/dict_xsyn/dict_xsyn.c110
-rw-r--r--contrib/dict_xsyn/expected/dict_xsyn.out130
-rw-r--r--contrib/dict_xsyn/sql/dict_xsyn.sql41
-rw-r--r--doc/src/sgml/dict-xsyn.sgml49
4 files changed, 280 insertions, 50 deletions
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 17b6ba6606..db23a784e3 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -33,7 +33,10 @@ typedef struct
int len;
Syn *syn;
+ bool matchorig;
bool keeporig;
+ bool matchsynonyms;
+ bool keepsynonyms;
} DictSyn;
@@ -88,7 +91,8 @@ read_dictionary(DictSyn *d, char *filename)
{
char *value;
char *key;
- char *end = NULL;
+ char *pos;
+ char *end;
if (*line == '\0')
continue;
@@ -96,26 +100,36 @@ read_dictionary(DictSyn *d, char *filename)
value = lowerstr(line);
pfree(line);
- key = find_word(value, &end);
- if (!key)
+ pos = value;
+ while ((key = find_word(pos, &end)) != NULL)
{
- pfree(value);
- continue;
- }
+ /* Enlarge syn structure if full */
+ if (cur == d->len)
+ {
+ d->len = (d->len > 0) ? 2 * d->len : 16;
+ if (d->syn)
+ d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+ else
+ d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+ }
- if (cur == d->len)
- {
- d->len = (d->len > 0) ? 2 * d->len : 16;
- if (d->syn)
- d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
- else
- d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
- }
+ /* Save first word only if we will match it */
+ if (pos != value || d->matchorig)
+ {
+ d->syn[cur].key = pnstrdup(key, end - key);
+ d->syn[cur].value = pstrdup(value);
- d->syn[cur].key = pnstrdup(key, end - key);
- d->syn[cur].value = value;
+ cur++;
+ }
+
+ pos = end;
- cur++;
+ /* Don't bother scanning synonyms if we will not match them */
+ if (!d->matchsynonyms)
+ break;
+ }
+
+ pfree(value);
}
tsearch_readline_end(&trst);
@@ -133,23 +147,40 @@ dxsyn_init(PG_FUNCTION_ARGS)
List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictSyn *d;
ListCell *l;
+ char *filename = NULL;
d = (DictSyn *) palloc0(sizeof(DictSyn));
d->len = 0;
d->syn = NULL;
+ d->matchorig = true;
d->keeporig = true;
+ d->matchsynonyms = false;
+ d->keepsynonyms = true;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
- if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
+ if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
+ {
+ d->matchorig = defGetBoolean(defel);
+ }
+ else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
{
d->keeporig = defGetBoolean(defel);
}
+ else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
+ {
+ d->matchsynonyms = defGetBoolean(defel);
+ }
+ else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
+ {
+ d->keepsynonyms = defGetBoolean(defel);
+ }
else if (pg_strcasecmp(defel->defname, "RULES") == 0)
{
- read_dictionary(d, defGetString(defel));
+ /* we can't read the rules before parsing all options! */
+ filename = defGetString(defel);
}
else
{
@@ -160,6 +191,9 @@ dxsyn_init(PG_FUNCTION_ARGS)
}
}
+ if (filename)
+ read_dictionary(d, filename);
+
PG_RETURN_POINTER(d);
}
@@ -194,41 +228,33 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
/* Parse string of synonyms and return array of words */
{
- char *value = pstrdup(found->value);
- int value_length = strlen(value);
- char *pos = value;
+ char *value = found->value;
+ char *syn;
+ char *pos;
+ char *end;
int nsyns = 0;
- bool is_first = true;
- res = palloc(0);
+ res = palloc(sizeof(TSLexeme));
- while (pos < value + value_length)
+ pos = value;
+ while ((syn = find_word(pos, &end)) != NULL)
{
- char *end;
- char *syn = find_word(pos, &end);
-
- if (!syn)
- break;
- *end = '\0';
-
res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
- res[nsyns].lexeme = NULL;
- /* first word is added to result only if KEEPORIG flag is set */
- if (d->keeporig || !is_first)
+ /* The first word is output only if keeporig=true */
+ if (pos != value || d->keeporig)
{
- res[nsyns].lexeme = pstrdup(syn);
- res[nsyns + 1].lexeme = NULL;
-
+ res[nsyns].lexeme = pnstrdup(syn, end - syn);
nsyns++;
}
- is_first = false;
+ pos = end;
- pos = end + 1;
+ /* Stop if we are not to output the synonyms */
+ if (!d->keepsynonyms)
+ break;
}
-
- pfree(value);
+ res[nsyns].lexeme = NULL;
}
PG_RETURN_POINTER(res);
diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out
index 99071ea8c7..d91697a97e 100644
--- a/contrib/dict_xsyn/expected/dict_xsyn.out
+++ b/contrib/dict_xsyn/expected/dict_xsyn.out
@@ -5,10 +5,76 @@
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
ts_lexize
----------------
{sn,sne,1987a}
@@ -20,3 +86,63 @@ SELECT ts_lexize('xsyn', 'grb');
(1 row)
+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-------------
+ {supernova}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql
index 17f6df9cf3..9db0851700 100644
--- a/contrib/dict_xsyn/sql/dict_xsyn.sql
+++ b/contrib/dict_xsyn/sql/dict_xsyn.sql
@@ -8,9 +8,46 @@ SET client_min_messages = warning;
\set ECHO all
RESET client_min_messages;
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
diff --git a/doc/src/sgml/dict-xsyn.sgml b/doc/src/sgml/dict-xsyn.sgml
index 481c0e074c..ff116b1fbf 100644
--- a/doc/src/sgml/dict-xsyn.sgml
+++ b/doc/src/sgml/dict-xsyn.sgml
@@ -23,9 +23,26 @@
<itemizedlist>
<listitem>
<para>
- <literal>keeporig</> controls whether the original word is included (if
- <literal>true</>), or only its synonyms (if <literal>false</>). Default
- is <literal>true</>.
+ <literal>matchorig</> controls whether the original word is accepted by
+ the dictionary. Default is <literal>true</>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>matchsynonyms</> controls whether the synonyms are
+ accepted by the dictionary. Default is <literal>false</>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>keeporig</> controls whether the original word is included in
+ the dictionary's output. Default is <literal>true</>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>keepsynonyms</> controls whether the synonyms are included in
+ the dictionary's output. Default is <literal>true</>.
</para>
</listitem>
<listitem>
@@ -90,10 +107,34 @@ ALTER TEXT SEARCH DICTIONARY
mydb=# SELECT ts_lexize('xsyn', 'word');
ts_lexize
-----------------------
+ {syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'word');
+ ts_lexize
+-----------------------
{word,syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+ ts_lexize
+-----------------------
+ {syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+ ts_lexize
+-----------------------
+ {word}
</programlisting>
- but real-world usage will involve including it in a text search
+ Real-world usage will involve including it in a text search
configuration as described in <xref linkend="textsearch">.
That might look like this: