Skip to content

Commit c9cbe52

Browse files
authored
Metaphone performance improvement (php#10501)
* Don't do toupper() redundantly in encoding the data for metaphone

  All inputs for ENCODE() are already uppercase, so there's no need to spend time uppercasing them again.

* Don't compute uppercase letter redundantly in checks

  If it's a zero-terminator check, or an isalpha() check, there's no need to convert it to uppercase first.

* Clean-up LookAhead helper

* Add some letter caching to metaphone to increase performance

  We don't have to re-read letters, and re-uppercase them if we already did it once. By caching these results, we gain performance. Furthermore, we can avoid fetching and uppercasing in some conditions by first checking what we already had: e.g. if a condition depends on both Prev_Letter and After_Next_Letter, but we already have Prev_Letter cached, we can place that first to avoid a fetch+toupper of the "after next letter".
1 parent cd01639 commit c9cbe52

File tree

1 file changed

+83
-61
lines changed

1 file changed

+83
-61
lines changed

ext/standard/metaphone.c

+83-61
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ static const char _codes[26] =
7878
};
7979

8080

81-
#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
81+
/* Note: these macros require an uppercase letter input! */
82+
#define ENCODE(c) (isalpha(c) ? _codes[((c) - 'A')] : 0)
8283

8384
#define isvowel(c) (ENCODE(c) & 1) /* AEIOU */
8485

@@ -101,16 +102,19 @@ static const char _codes[26] =
101102
/* I suppose I could have been using a character pointer instead of
102103
* accessing the array directly... */
103104

105+
#define Convert_Raw(c) toupper(c)
104106
/* Look at the next letter in the word */
105-
#define Next_Letter (toupper(word[w_idx+1]))
107+
#define Read_Raw_Next_Letter (word[w_idx+1])
108+
#define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter))
106109
/* Look at the current letter in the word */
107-
#define Curr_Letter (toupper(word[w_idx]))
110+
#define Read_Raw_Curr_Letter (word[w_idx])
111+
#define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter))
108112
/* Go N letters back. */
109-
#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
113+
#define Look_Back_Letter(n) (w_idx >= n ? Convert_Raw(word[w_idx-n]) : '\0')
110114
/* Previous letter. I dunno, should this return null on failure? */
111-
#define Prev_Letter (Look_Back_Letter(1))
115+
#define Read_Prev_Letter (Look_Back_Letter(1))
112116
/* Look two letters down. It makes sure you don't walk off the string. */
113-
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
117+
#define Read_After_Next_Letter (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \
114118
: '\0')
115119
#define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n)))
116120

@@ -119,15 +123,13 @@ static const char _codes[26] =
119123
/* I probably could have just used strlen... */
120124
static char Lookahead(char *word, int how_far)
121125
{
122-
char letter_ahead = '\0'; /* null by default */
123126
int idx;
124127
for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
125128
/* Edge forward in the string... */
126129

127-
letter_ahead = word[idx]; /* idx will be either == to how_far or
128-
* at the end of the string
130+
return word[idx]; /* idx will be either == to how_far or
131+
* at the end of the string where it will be null
129132
*/
130-
return letter_ahead;
131133
}
132134

133135

@@ -164,6 +166,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
164166
int w_idx = 0; /* point in the phonization we're at. */
165167
size_t p_idx = 0; /* end of the phoned phrase */
166168
size_t max_buffer_len = 0; /* maximum length of the destination buffer */
169+
char curr_letter;
167170
ZEND_ASSERT(word != NULL);
168171
ZEND_ASSERT(max_phonemes >= 0);
169172

@@ -179,18 +182,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
179182

180183
/*-- The first phoneme has to be processed specially. --*/
181184
/* Find our first letter */
182-
for (; !isalpha(Curr_Letter); w_idx++) {
185+
for (; !isalpha(curr_letter = Read_Raw_Curr_Letter); w_idx++) {
183186
/* On the off chance we were given nothing but crap... */
184-
if (Curr_Letter == '\0') {
187+
if (curr_letter == '\0') {
185188
End_Phoned_Word();
186189
return;
187190
}
188191
}
189192

190-
switch (Curr_Letter) {
193+
curr_letter = Convert_Raw(curr_letter);
194+
195+
switch (curr_letter) {
191196
/* AE becomes E */
192197
case 'A':
193-
if (Next_Letter == 'E') {
198+
if (Read_Next_Letter == 'E') {
194199
Phonize('E');
195200
w_idx += 2;
196201
}
@@ -204,24 +209,26 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
204209
case 'G':
205210
case 'K':
206211
case 'P':
207-
if (Next_Letter == 'N') {
212+
if (Read_Next_Letter == 'N') {
208213
Phonize('N');
209214
w_idx += 2;
210215
}
211216
break;
212217
/* WH becomes W,
213218
WR becomes R
214219
W if followed by a vowel */
215-
case 'W':
216-
if (Next_Letter == 'R') {
217-
Phonize(Next_Letter);
220+
case 'W': {
221+
char next_letter = Read_Next_Letter;
222+
if (next_letter == 'R') {
223+
Phonize('R');
218224
w_idx += 2;
219-
} else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
225+
} else if (next_letter == 'H' || isvowel(next_letter)) {
220226
Phonize('W');
221227
w_idx += 2;
222228
}
223229
/* else ignore */
224230
break;
231+
}
225232
/* X becomes S */
226233
case 'X':
227234
Phonize('S');
@@ -236,7 +243,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
236243
case 'I':
237244
case 'O':
238245
case 'U':
239-
Phonize(Curr_Letter);
246+
Phonize(curr_letter);
240247
w_idx++;
241248
break;
242249
default:
@@ -247,7 +254,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
247254

248255

249256
/* On to the metaphoning */
250-
for (; Curr_Letter != '\0' &&
257+
for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' &&
251258
(max_phonemes == 0 || Phone_Len < (size_t)max_phonemes);
252259
w_idx++) {
253260
/* How many letters to skip because an earlier encoding handled
@@ -263,18 +270,23 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
263270
*/
264271

265272
/* Ignore non-alphas */
266-
if (!isalpha(Curr_Letter))
273+
if (!isalpha(curr_letter))
267274
continue;
268275

276+
curr_letter = Convert_Raw(curr_letter);
277+
/* Note: we can't cache curr_letter from the previous loop
278+
* because of the skip_letter variable. */
279+
char prev_letter = Read_Prev_Letter;
280+
269281
/* Drop duplicates, except CC */
270-
if (Curr_Letter == Prev_Letter &&
271-
Curr_Letter != 'C')
282+
if (curr_letter == prev_letter &&
283+
curr_letter != 'C')
272284
continue;
273285

274-
switch (Curr_Letter) {
286+
switch (curr_letter) {
275287
/* B -> B unless in MB */
276288
case 'B':
277-
if (Prev_Letter != 'M')
289+
if (prev_letter != 'M')
278290
Phonize('B');
279291
break;
280292
/* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
@@ -283,20 +295,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
283295
* dropped if -SCI-, -SCE-, -SCY- (handled in S)
284296
* else K
285297
*/
286-
case 'C':
287-
if (MAKESOFT(Next_Letter)) { /* C[IEY] */
288-
if (After_Next_Letter == 'A' &&
289-
Next_Letter == 'I') { /* CIA */
298+
case 'C': {
299+
char next_letter = Read_Next_Letter;
300+
if (MAKESOFT(next_letter)) { /* C[IEY] */
301+
if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */
290302
Phonize(SH);
291303
}
292304
/* SC[IEY] */
293-
else if (Prev_Letter == 'S') {
305+
else if (prev_letter == 'S') {
294306
/* Dropped */
295307
} else {
296308
Phonize('S');
297309
}
298-
} else if (Next_Letter == 'H') {
299-
if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
310+
} else if (next_letter == 'H') {
311+
if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */
300312
Phonize('K');
301313
} else {
302314
Phonize(SH);
@@ -306,12 +318,13 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
306318
Phonize('K');
307319
}
308320
break;
321+
}
309322
/* J if in -DGE-, -DGI- or -DGY-
310323
* else T
311324
*/
312325
case 'D':
313-
if (Next_Letter == 'G' &&
314-
MAKESOFT(After_Next_Letter)) {
326+
if (Read_Next_Letter == 'G' &&
327+
MAKESOFT(Read_After_Next_Letter)) {
315328
Phonize('J');
316329
skip_letter++;
317330
} else
@@ -323,47 +336,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
323336
* else J if in -GE-, -GI, -GY and not GG
324337
* else K
325338
*/
326-
case 'G':
327-
if (Next_Letter == 'H') {
339+
case 'G': {
340+
char next_letter = Read_Next_Letter;
341+
if (next_letter == 'H') {
328342
if (!(NOGHTOF(Look_Back_Letter(3)) ||
329343
Look_Back_Letter(4) == 'H')) {
330344
Phonize('F');
331345
skip_letter++;
332346
} else {
333347
/* silent */
334348
}
335-
} else if (Next_Letter == 'N') {
336-
if (Isbreak(After_Next_Letter) ||
337-
(After_Next_Letter == 'E' &&
349+
} else if (next_letter == 'N') {
350+
char after_next_letter = Read_After_Next_Letter;
351+
if (Isbreak(after_next_letter) ||
352+
(after_next_letter == 'E' &&
338353
Look_Ahead_Letter(3) == 'D')) {
339354
/* dropped */
340355
} else
341356
Phonize('K');
342-
} else if (MAKESOFT(Next_Letter) &&
343-
Prev_Letter != 'G') {
357+
} else if (MAKESOFT(next_letter) &&
358+
prev_letter != 'G') {
344359
Phonize('J');
345360
} else {
346361
Phonize('K');
347362
}
348363
break;
364+
}
349365
/* H if before a vowel and not after C,G,P,S,T */
350366
case 'H':
351-
if (isvowel(Next_Letter) &&
352-
!AFFECTH(Prev_Letter))
367+
if (isvowel(Read_Next_Letter) &&
368+
!AFFECTH(prev_letter))
353369
Phonize('H');
354370
break;
355371
/* dropped if after C
356372
* else K
357373
*/
358374
case 'K':
359-
if (Prev_Letter != 'C')
375+
if (prev_letter != 'C')
360376
Phonize('K');
361377
break;
362378
/* F if before H
363379
* else P
364380
*/
365381
case 'P':
366-
if (Next_Letter == 'H') {
382+
if (Read_Next_Letter == 'H') {
367383
Phonize('F');
368384
} else {
369385
Phonize('P');
@@ -377,44 +393,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
377393
/* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
378394
* else S
379395
*/
380-
case 'S':
381-
if (Next_Letter == 'I' &&
382-
(After_Next_Letter == 'O' ||
383-
After_Next_Letter == 'A')) {
396+
case 'S': {
397+
char next_letter = Read_Next_Letter;
398+
char after_next_letter;
399+
if (next_letter == 'I' &&
400+
((after_next_letter = Read_After_Next_Letter) == 'O' ||
401+
after_next_letter == 'A')) {
384402
Phonize(SH);
385-
} else if (Next_Letter == 'H') {
403+
} else if (next_letter == 'H') {
386404
Phonize(SH);
387405
skip_letter++;
388-
} else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
406+
} else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
389407
Phonize(SH);
390408
skip_letter += 2;
391409
} else {
392410
Phonize('S');
393411
}
394412
break;
413+
}
395414
/* 'sh' in -TIA- or -TIO-
396415
* else 'th' before H
397416
* else T
398417
*/
399-
case 'T':
400-
if (Next_Letter == 'I' &&
401-
(After_Next_Letter == 'O' ||
402-
After_Next_Letter == 'A')) {
418+
case 'T': {
419+
char next_letter = Read_Next_Letter;
420+
char after_next_letter;
421+
if (next_letter == 'I' &&
422+
((after_next_letter = Read_After_Next_Letter) == 'O' ||
423+
after_next_letter == 'A')) {
403424
Phonize(SH);
404-
} else if (Next_Letter == 'H') {
425+
} else if (next_letter == 'H') {
405426
Phonize(TH);
406427
skip_letter++;
407-
} else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
428+
} else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) {
408429
Phonize('T');
409430
}
410431
break;
432+
}
411433
/* F */
412434
case 'V':
413435
Phonize('F');
414436
break;
415437
/* W before a vowel, else dropped */
416438
case 'W':
417-
if (isvowel(Next_Letter))
439+
if (isvowel(Read_Next_Letter))
418440
Phonize('W');
419441
break;
420442
/* KS */
@@ -424,7 +446,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
424446
break;
425447
/* Y if followed by a vowel */
426448
case 'Y':
427-
if (isvowel(Next_Letter))
449+
if (isvowel(Read_Next_Letter))
428450
Phonize('Y');
429451
break;
430452
/* S */
@@ -438,7 +460,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
438460
case 'M':
439461
case 'N':
440462
case 'R':
441-
Phonize(Curr_Letter);
463+
Phonize(curr_letter);
442464
break;
443465
default:
444466
/* nothing */

0 commit comments

Comments
 (0)