diff options
author | Tom Lane | 2023-04-07 21:31:51 +0000 |
---|---|---|
committer | Tom Lane | 2023-04-07 21:32:26 +0000 |
commit | a290378a3752f787cdc2252889ba513f6fb393db (patch) | |
tree | e3cf6aa204d45baa958b9ab95fc0a01f6044bf4b /doc/src | |
parent | 728015a47016dcd734c516e43f326ae491b6a3d2 (diff) |
Add support for Daitch-Mokotoff Soundex in contrib/fuzzystrmatch.
This modernized version of Soundex works significantly better than
the original, particularly for non-English names.
Dag Lem, reviewed by quite a few people along the way
Discussion: https://postgr.es/m/[email protected]
Diffstat (limited to 'doc/src')
-rw-r--r-- | doc/src/sgml/fuzzystrmatch.sgml | 169 |
1 files changed, 162 insertions, 7 deletions
diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml index 1a5ebbb22f..bcadc440e3 100644 --- a/doc/src/sgml/fuzzystrmatch.sgml +++ b/doc/src/sgml/fuzzystrmatch.sgml @@ -17,6 +17,8 @@ At present, the <function>soundex</function>, <function>metaphone</function>, <function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do not work well with multibyte encodings (such as UTF-8). + Use <function>daitch_mokotoff</function> + or <function>levenshtein</function> with such data. </para> </caution> @@ -88,6 +90,159 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2; </programlisting> </sect2> + <sect2 id="fuzzystrmatch-daitch-mokotoff"> + <title>Daitch-Mokotoff Soundex</title> + + <para> + Like the original Soundex system, Daitch-Mokotoff Soundex matches + similar-sounding names by converting them to the same code. + However, Daitch-Mokotoff Soundex is significantly more useful for + non-English names than the original system. + Major improvements over the original system include: + + <itemizedlist spacing="compact" mark="bullet"> + <listitem> + <para> + The code is based on the first six meaningful letters rather than four. + </para> + </listitem> + <listitem> + <para> + A letter or combination of letters maps into ten possible codes rather + than seven. + </para> + </listitem> + <listitem> + <para> + Where two consecutive letters have a single sound, they are coded as a + single number. + </para> + </listitem> + <listitem> + <para> + When a letter or combination of letters may have different sounds, + multiple codes are emitted to cover all possibilities. + </para> + </listitem> + </itemizedlist> + </para> + + <indexterm> + <primary>daitch_mokotoff</primary> + </indexterm> + + <para> + This function generates the Daitch-Mokotoff soundex codes for its input: + </para> + +<synopsis> +daitch_mokotoff(<parameter>source</parameter> text) returns text[] +</synopsis> + + <para> + The result may contain one or more codes depending on how many plausible + pronunciations there are, so it is represented as an array. + </para> + + <para> + Since a Daitch-Mokotoff soundex code consists of only 6 digits, + <parameter>source</parameter> should be preferably a single word or name. + </para> + + <para> + Here are some examples: + </para> + +<programlisting> +SELECT daitch_mokotoff('George'); + daitch_mokotoff +----------------- + {595000} + +SELECT daitch_mokotoff('John'); + daitch_mokotoff +----------------- + {160000,460000} + +SELECT daitch_mokotoff('Bierschbach'); + daitch_mokotoff +----------------------------------------------------------- + {794575,794574,794750,794740,745750,745740,747500,747400} + +SELECT daitch_mokotoff('Schwartzenegger'); + daitch_mokotoff +----------------- + {479465} +</programlisting> + + <para> + For matching of single names, returned text arrays can be matched + directly using the <literal>&&</literal> operator: any overlap + can be considered a match. A GIN index may + be used for efficiency, see <xref linkend="gin"/> and this example: + </para> + +<programlisting> +CREATE TABLE s (nm text); +CREATE INDEX ix_s_dm ON s USING gin (daitch_mokotoff(nm)) WITH (fastupdate = off); + +INSERT INTO s (nm) VALUES + ('Schwartzenegger'), + ('John'), + ('James'), + ('Steinman'), + ('Steinmetz'); + +SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Swartzenegger'); +SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jane'); +SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jens'); +</programlisting> + + <para> + For indexing and matching of any number of names in any order, Full Text + Search features can be used. See <xref linkend="textsearch"/> and this + example: + </para> + +<programlisting> +CREATE FUNCTION soundex_tsvector(v_name text) RETURNS tsvector +BEGIN ATOMIC + SELECT to_tsvector('simple', + string_agg(array_to_string(daitch_mokotoff(n), ' '), ' ')) + FROM regexp_split_to_table(v_name, '\s+') AS n; +END; + +CREATE FUNCTION soundex_tsquery(v_name text) RETURNS tsquery +BEGIN ATOMIC + SELECT string_agg('(' || array_to_string(daitch_mokotoff(n), '|') || ')', '&')::tsquery + FROM regexp_split_to_table(v_name, '\s+') AS n; +END; + +CREATE TABLE s (nm text); +CREATE INDEX ix_s_txt ON s USING gin (soundex_tsvector(nm)) WITH (fastupdate = off); + +INSERT INTO s (nm) VALUES + ('John Doe'), + ('Jane Roe'), + ('Public John Q.'), + ('George Best'), + ('John Yamson'); + +SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john'); +SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('jane doe'); +SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john public'); +SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('besst, giorgio'); +SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('Jameson John'); +</programlisting> + + <para> + If it is desired to avoid recalculation of soundex codes during index + rechecks, an index on a separate column can be used instead of an index on + an expression. A stored generated column can be used for this; see + <xref linkend="ddl-generated-columns"/>. + </para> + </sect2> + <sect2 id="fuzzystrmatch-levenshtein"> <title>Levenshtein</title> @@ -104,10 +259,10 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2; </indexterm> <synopsis> -levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int -levenshtein(text source, text target) returns int -levenshtein_less_equal(text source, text target, int ins_cost, int del_cost, int sub_cost, int max_d) returns int -levenshtein_less_equal(text source, text target, int max_d) returns int +levenshtein(source text, target text, ins_cost int, del_cost int, sub_cost int) returns int +levenshtein(source text, target text) returns int +levenshtein_less_equal(source text, target text, ins_cost int, del_cost int, sub_cost int, max_d int) returns int +levenshtein_less_equal(source text, target text, max_d int) returns int </synopsis> <para> @@ -177,7 +332,7 @@ test=# SELECT levenshtein_less_equal('extensive', 'exhaustive', 4); </indexterm> <synopsis> -metaphone(text source, int max_output_length) returns text +metaphone(source text, max_output_length int) returns text </synopsis> <para> @@ -220,8 +375,8 @@ test=# SELECT metaphone('GUMBO', 4); </indexterm> <synopsis> -dmetaphone(text source) returns text -dmetaphone_alt(text source) returns text +dmetaphone(source text) returns text +dmetaphone_alt(source text) returns text </synopsis> <para> |