Skip to content

Commit c0828b7

Browse files
committed
Move the guts of our Levenshtein implementation into core.
The hope is that we can use this to produce better diagnostics in some cases. Peter Geoghegan, reviewed by Michael Paquier, with some further changes by me.
1 parent 1d69ae4 commit c0828b7

File tree

6 files changed

+122
-83
lines changed

6 files changed

+122
-83
lines changed

contrib/fuzzystrmatch/Makefile

-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,3 @@ top_builddir = ../..
1717
include $(top_builddir)/src/Makefile.global
1818
include $(top_srcdir)/contrib/contrib-global.mk
1919
endif
20-
21-
# levenshtein.c is #included by fuzzystrmatch.c
22-
fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c

contrib/fuzzystrmatch/fuzzystrmatch.c

+57-25
Original file line numberDiff line numberDiff line change
@@ -154,23 +154,6 @@ getcode(char c)
154154
/* These prevent GH from becoming F */
155155
#define NOGHTOF(c) (getcode(c) & 16) /* BDH */
156156

157-
/* Faster than memcmp(), for this use case. */
158-
static inline bool
159-
rest_of_char_same(const char *s1, const char *s2, int len)
160-
{
161-
while (len > 0)
162-
{
163-
len--;
164-
if (s1[len] != s2[len])
165-
return false;
166-
}
167-
return true;
168-
}
169-
170-
#include "levenshtein.c"
171-
#define LEVENSHTEIN_LESS_EQUAL
172-
#include "levenshtein.c"
173-
174157
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
175158
Datum
176159
levenshtein_with_costs(PG_FUNCTION_ARGS)
@@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
180163
int ins_c = PG_GETARG_INT32(2);
181164
int del_c = PG_GETARG_INT32(3);
182165
int sub_c = PG_GETARG_INT32(4);
183-
184-
PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
166+
const char *s_data;
167+
const char *t_data;
168+
int s_bytes,
169+
t_bytes;
170+
171+
/* Extract a pointer to the actual character data */
172+
s_data = VARDATA_ANY(src);
173+
t_data = VARDATA_ANY(dst);
174+
/* Determine length of each string in bytes */
175+
s_bytes = VARSIZE_ANY_EXHDR(src);
176+
t_bytes = VARSIZE_ANY_EXHDR(dst);
177+
178+
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
179+
del_c, sub_c));
185180
}
186181

187182

@@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS)
191186
{
192187
text *src = PG_GETARG_TEXT_PP(0);
193188
text *dst = PG_GETARG_TEXT_PP(1);
194-
195-
PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
189+
const char *s_data;
190+
const char *t_data;
191+
int s_bytes,
192+
t_bytes;
193+
194+
/* Extract a pointer to the actual character data */
195+
s_data = VARDATA_ANY(src);
196+
t_data = VARDATA_ANY(dst);
197+
/* Determine length of each string in bytes */
198+
s_bytes = VARSIZE_ANY_EXHDR(src);
199+
t_bytes = VARSIZE_ANY_EXHDR(dst);
200+
201+
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
202+
1));
196203
}
197204

198205

@@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
206213
int del_c = PG_GETARG_INT32(3);
207214
int sub_c = PG_GETARG_INT32(4);
208215
int max_d = PG_GETARG_INT32(5);
209-
210-
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
216+
const char *s_data;
217+
const char *t_data;
218+
int s_bytes,
219+
t_bytes;
220+
221+
/* Extract a pointer to the actual character data */
222+
s_data = VARDATA_ANY(src);
223+
t_data = VARDATA_ANY(dst);
224+
/* Determine length of each string in bytes */
225+
s_bytes = VARSIZE_ANY_EXHDR(src);
226+
t_bytes = VARSIZE_ANY_EXHDR(dst);
227+
228+
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
229+
t_bytes, ins_c, del_c,
230+
sub_c, max_d));
211231
}
212232

213233

@@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
218238
text *src = PG_GETARG_TEXT_PP(0);
219239
text *dst = PG_GETARG_TEXT_PP(1);
220240
int max_d = PG_GETARG_INT32(2);
221-
222-
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
241+
const char *s_data;
242+
const char *t_data;
243+
int s_bytes,
244+
t_bytes;
245+
246+
/* Extract a pointer to the actual character data */
247+
s_data = VARDATA_ANY(src);
248+
t_data = VARDATA_ANY(dst);
249+
/* Determine length of each string in bytes */
250+
s_bytes = VARSIZE_ANY_EXHDR(src);
251+
t_bytes = VARSIZE_ANY_EXHDR(dst);
252+
253+
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
254+
t_bytes, 1, 1, 1, max_d));
223255
}
224256

225257

src/backend/utils/adt/Makefile

+2
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
3838

3939
like.o: like.c like_match.c
4040

41+
varlena.o: varlena.c levenshtein.c
42+
4143
include $(top_srcdir)/src/backend/common.mk

contrib/fuzzystrmatch/levenshtein.c renamed to src/backend/utils/adt/levenshtein.c

+37-54
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,34 @@
1-
/*
1+
/*-------------------------------------------------------------------------
2+
*
23
* levenshtein.c
4+
* Levenshtein distance implementation.
35
*
4-
* Functions for "fuzzy" comparison of strings
6+
* Original author: Joe Conway <[email protected]>
57
*
6-
* Joe Conway <[email protected]>
8+
* This file is included by varlena.c twice, to provide matching code for (1)
9+
* Levenshtein distance with custom costings, and (2) Levenshtein distance with
10+
* custom costings and a "max" value above which exact distances are not
11+
* interesting. Before the inclusion, we rely on the presence of the inline
12+
* function rest_of_char_same().
13+
*
14+
* Written based on a description of the algorithm by Michael Gilleland found
15+
* at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the
16+
* PHP 4.0.6 distribution for inspiration. Configurable penalty costs
17+
* extension is introduced by Volkan YAZICI <[email protected]>.
718
*
819
* Copyright (c) 2001-2014, PostgreSQL Global Development Group
9-
* ALL RIGHTS RESERVED;
1020
*
11-
* levenshtein()
12-
* -------------
13-
* Written based on a description of the algorithm by Michael Gilleland
14-
* found at http://www.merriampark.com/ld.htm
15-
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
16-
* inspiration.
17-
* Configurable penalty costs extension is introduced by Volkan
18-
* YAZICI <[email protected]>.
19-
*/
20-
21-
/*
22-
* External declarations for exported functions
21+
* IDENTIFICATION
22+
* src/backend/utils/adt/levenshtein.c
23+
*
24+
*-------------------------------------------------------------------------
2325
*/
24-
#ifdef LEVENSHTEIN_LESS_EQUAL
25-
static int levenshtein_less_equal_internal(text *s, text *t,
26-
int ins_c, int del_c, int sub_c, int max_d);
27-
#else
28-
static int levenshtein_internal(text *s, text *t,
29-
int ins_c, int del_c, int sub_c);
30-
#endif
31-
3226
#define MAX_LEVENSHTEIN_STRLEN 255
3327

34-
3528
/*
36-
* Calculates Levenshtein distance metric between supplied strings. Generally
37-
* (1, 1, 1) penalty costs suffices for common cases, but your mileage may
38-
* vary.
29+
* Calculates Levenshtein distance metric between supplied strings, which are
30+
* not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffice
31+
* for common cases, but your mileage may vary.
3932
*
4033
* One way to compute Levenshtein distance is to incrementally construct
4134
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t,
6356
* identify the portion of the matrix close to the diagonal which can still
6457
* affect the final answer.
6558
*/
66-
static int
59+
int
6760
#ifdef LEVENSHTEIN_LESS_EQUAL
68-
levenshtein_less_equal_internal(text *s, text *t,
69-
int ins_c, int del_c, int sub_c, int max_d)
61+
varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
62+
int tlen, int ins_c, int del_c, int sub_c,
63+
int max_d)
7064
#else
71-
levenshtein_internal(text *s, text *t,
72-
int ins_c, int del_c, int sub_c)
65+
varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
66+
int ins_c, int del_c, int sub_c)
7367
#endif
7468
{
7569
int m,
76-
n,
77-
s_bytes,
78-
t_bytes;
70+
n;
7971
int *prev;
8072
int *curr;
8173
int *s_char_len = NULL;
8274
int i,
8375
j;
84-
const char *s_data;
85-
const char *t_data;
8676
const char *y;
8777

8878
/*
89-
* For levenshtein_less_equal_internal, we have real variables called
79+
* For varstr_levenshtein_less_equal, we have real variables called
9080
* start_column and stop_column; otherwise it's just short-hand for 0 and
9181
* m.
9282
*/
@@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t,
10595
#define STOP_COLUMN m
10696
#endif
10797

108-
/* Extract a pointer to the actual character data. */
109-
s_data = VARDATA_ANY(s);
110-
t_data = VARDATA_ANY(t);
111-
112-
/* Determine length of each string in bytes and characters. */
113-
s_bytes = VARSIZE_ANY_EXHDR(s);
114-
t_bytes = VARSIZE_ANY_EXHDR(t);
115-
m = pg_mbstrlen_with_len(s_data, s_bytes);
116-
n = pg_mbstrlen_with_len(t_data, t_bytes);
98+
m = pg_mbstrlen_with_len(source, slen);
99+
n = pg_mbstrlen_with_len(target, tlen);
117100

118101
/*
119102
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t,
193176
* multi-byte characters, we still build the array, so that the fast-path
194177
* needn't deal with the case where the array hasn't been initialized.
195178
*/
196-
if (m != s_bytes || n != t_bytes)
179+
if (m != slen || n != tlen)
197180
{
198181
int i;
199-
const char *cp = s_data;
182+
const char *cp = source;
200183

201184
s_char_len = (int *) palloc((m + 1) * sizeof(int));
202185
for (i = 0; i < m; ++i)
@@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t,
223206
prev[i] = i * del_c;
224207

225208
/* Loop through rows of the notional array */
226-
for (y = t_data, j = 1; j < n; j++)
209+
for (y = target, j = 1; j < n; j++)
227210
{
228211
int *temp;
229-
const char *x = s_data;
230-
int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
212+
const char *x = source;
213+
int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
231214

232215
#ifdef LEVENSHTEIN_LESS_EQUAL
233216

@@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t,
384367
prev[start_column] = max_d + 1;
385368
curr[start_column] = max_d + 1;
386369
if (start_column != 0)
387-
s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
370+
source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
388371
start_column++;
389372
}
390373

src/backend/utils/adt/varlena.c

+21-1
Original file line numberDiff line numberDiff line change
@@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
15461546
return result;
15471547
}
15481548

1549-
15501549
/* text_cmp()
15511550
* Internal comparison function for text strings.
15521551
* Returns -1, 0 or 1
@@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS)
47474746
{
47484747
return text_format(fcinfo);
47494748
}
4749+
4750+
/*
4751+
* Helper function for Levenshtein distance functions. Faster than memcmp(),
4752+
* for this use case.
4753+
*/
4754+
static inline bool
4755+
rest_of_char_same(const char *s1, const char *s2, int len)
4756+
{
4757+
while (len > 0)
4758+
{
4759+
len--;
4760+
if (s1[len] != s2[len])
4761+
return false;
4762+
}
4763+
return true;
4764+
}
4765+
4766+
/* Expand each Levenshtein distance variant */
4767+
#include "levenshtein.c"
4768+
#define LEVENSHTEIN_LESS_EQUAL
4769+
#include "levenshtein.c"

src/include/utils/builtins.h

+5
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
786786
extern Datum name_text(PG_FUNCTION_ARGS);
787787
extern Datum text_name(PG_FUNCTION_ARGS);
788788
extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
789+
extern int varstr_levenshtein(const char *source, int slen, const char *target,
790+
int tlen, int ins_c, int del_c, int sub_c);
791+
extern int varstr_levenshtein_less_equal(const char *source, int slen,
792+
const char *target, int tlen, int ins_c,
793+
int del_c, int sub_c, int max_d);
789794
extern List *textToQualifiedNameList(text *textval);
790795
extern bool SplitIdentifierString(char *rawstring, char separator,
791796
List **namelist);

0 commit comments

Comments
 (0)