1
- /*
1
+ /*-------------------------------------------------------------------------
2
+ *
2
3
* levenshtein.c
4
+ * Levenshtein distance implementation.
3
5
*
4
- * Functions for "fuzzy" comparison of strings
6
+ * Original author: Joe Conway <[email protected] >
5
7
*
6
-
8
+ * This file is included by varlena.c twice, to provide matching code for (1)
9
+ * Levenshtein distance with custom costings, and (2) Levenshtein distance with
10
+ * custom costings and a "max" value above which exact distances are not
11
+ * interesting. Before the inclusion, we rely on the presence of the inline
12
+ * function rest_of_char_same().
13
+ *
14
+ * Written based on a description of the algorithm by Michael Gilleland found
15
+ * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the
16
+ * PHP 4.0.6 distribution for inspiration. Configurable penalty costs
17
+ * extension is introduced by Volkan YAZICI <[email protected] .
7
18
*
8
19
* Copyright (c) 2001-2014, PostgreSQL Global Development Group
9
- * ALL RIGHTS RESERVED;
10
20
*
11
- * levenshtein()
12
- * -------------
13
- * Written based on a description of the algorithm by Michael Gilleland
14
- * found at http://www.merriampark.com/ld.htm
15
- * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
16
- * inspiration.
17
- * Configurable penalty costs extension is introduced by Volkan
18
-
19
- */
20
-
21
- /*
22
- * External declarations for exported functions
21
+ * IDENTIFICATION
22
+ * src/backend/utils/adt/levenshtein.c
23
+ *
24
+ *-------------------------------------------------------------------------
23
25
*/
24
- #ifdef LEVENSHTEIN_LESS_EQUAL
25
- static int levenshtein_less_equal_internal (text * s , text * t ,
26
- int ins_c , int del_c , int sub_c , int max_d );
27
- #else
28
- static int levenshtein_internal (text * s , text * t ,
29
- int ins_c , int del_c , int sub_c );
30
- #endif
31
-
32
26
/*
 * Presumably the maximum input string length accepted by the Levenshtein
 * routines in this file.
 * NOTE(review): the code that checks against this limit is not visible in
 * this excerpt; confirm whether it is applied to characters or bytes.
 */
#define MAX_LEVENSHTEIN_STRLEN 255
33
27
34
-
35
28
/*
36
- * Calculates Levenshtein distance metric between supplied strings. Generally
37
- * (1, 1, 1) penalty costs suffices for common cases, but your mileage may
38
- * vary.
29
+ * Calculates Levenshtein distance metric between supplied strings, which are
30
+ * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffice
31
+ * for common cases, but your mileage may vary.
39
32
*
40
33
* One way to compute Levenshtein distance is to incrementally construct
41
34
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t,
63
56
* identify the portion of the matrix close to the diagonal which can still
64
57
* affect the final answer.
65
58
*/
66
- static int
59
+ int
67
60
#ifdef LEVENSHTEIN_LESS_EQUAL
68
- levenshtein_less_equal_internal (text * s , text * t ,
69
- int ins_c , int del_c , int sub_c , int max_d )
61
+ varstr_levenshtein_less_equal (const char * source , int slen , const char * target ,
62
+ int tlen , int ins_c , int del_c , int sub_c ,
63
+ int max_d )
70
64
#else
71
- levenshtein_internal ( text * s , text * t ,
72
- int ins_c , int del_c , int sub_c )
65
+ varstr_levenshtein ( const char * source , int slen , const char * target , int tlen ,
66
+ int ins_c , int del_c , int sub_c )
73
67
#endif
74
68
{
75
69
int m ,
76
- n ,
77
- s_bytes ,
78
- t_bytes ;
70
+ n ;
79
71
int * prev ;
80
72
int * curr ;
81
73
int * s_char_len = NULL ;
82
74
int i ,
83
75
j ;
84
- const char * s_data ;
85
- const char * t_data ;
86
76
const char * y ;
87
77
88
78
/*
89
- * For levenshtein_less_equal_internal , we have real variables called
79
+ * For varstr_levenshtein_less_equal , we have real variables called
90
80
* start_column and stop_column; otherwise it's just short-hand for 0 and
91
81
* m.
92
82
*/
@@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t,
105
95
#define STOP_COLUMN m
106
96
#endif
107
97
108
- /* Extract a pointer to the actual character data. */
109
- s_data = VARDATA_ANY (s );
110
- t_data = VARDATA_ANY (t );
111
-
112
- /* Determine length of each string in bytes and characters. */
113
- s_bytes = VARSIZE_ANY_EXHDR (s );
114
- t_bytes = VARSIZE_ANY_EXHDR (t );
115
- m = pg_mbstrlen_with_len (s_data , s_bytes );
116
- n = pg_mbstrlen_with_len (t_data , t_bytes );
98
+ m = pg_mbstrlen_with_len (source , slen );
99
+ n = pg_mbstrlen_with_len (target , tlen );
117
100
118
101
/*
119
102
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t,
193
176
* multi-byte characters, we still build the array, so that the fast-path
194
177
* needn't deal with the case where the array hasn't been initialized.
195
178
*/
196
- if (m != s_bytes || n != t_bytes )
179
+ if (m != slen || n != tlen )
197
180
{
198
181
int i ;
199
- const char * cp = s_data ;
182
+ const char * cp = source ;
200
183
201
184
s_char_len = (int * ) palloc ((m + 1 ) * sizeof (int ));
202
185
for (i = 0 ; i < m ; ++ i )
@@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t,
223
206
prev [i ] = i * del_c ;
224
207
225
208
/* Loop through rows of the notional array */
226
- for (y = t_data , j = 1 ; j < n ; j ++ )
209
+ for (y = target , j = 1 ; j < n ; j ++ )
227
210
{
228
211
int * temp ;
229
- const char * x = s_data ;
230
- int y_char_len = n != t_bytes + 1 ? pg_mblen (y ) : 1 ;
212
+ const char * x = source ;
213
+ int y_char_len = n != tlen + 1 ? pg_mblen (y ) : 1 ;
231
214
232
215
#ifdef LEVENSHTEIN_LESS_EQUAL
233
216
@@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t,
384
367
prev [start_column ] = max_d + 1 ;
385
368
curr [start_column ] = max_d + 1 ;
386
369
if (start_column != 0 )
387
- s_data += (s_char_len != NULL ) ? s_char_len [start_column - 1 ] : 1 ;
370
+ source += (s_char_len != NULL ) ? s_char_len [start_column - 1 ] : 1 ;
388
371
start_column ++ ;
389
372
}
390
373
0 commit comments