Skip to content

Commit d3b2e5e

Browse files
committed
Refactor convert_case() to prepare for optimizations.
Upcoming optimizations will add complexity to convert_case(). This patch reorganizes the code slightly so that the complexity can be contained within the logic that converts the case of a single character, rather than being mixed in with the logic that iterates through the string. Reviewed-by: Alexander Borisov <[email protected]> Discussion: https://postgr.es/m/[email protected]
1 parent 3abe9dc commit d3b2e5e

File tree

1 file changed

+101
-52
lines changed

1 file changed

+101
-52
lines changed

src/common/unicode_case.c

+101-52
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,20 @@
2020
#include "common/unicode_category.h"
2121
#include "mb/pg_wchar.h"
2222

23+
enum CaseMapResult
24+
{
25+
CASEMAP_SELF,
26+
CASEMAP_SIMPLE,
27+
CASEMAP_SPECIAL,
28+
};
29+
2330
static const pg_case_map *find_case_map(pg_wchar ucs);
2431
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
2532
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
2633
void *wbstate);
27-
static bool check_special_conditions(int conditions, const char *str,
28-
size_t len, size_t offset);
34+
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
35+
const char *src, size_t srclen, size_t srcoff,
36+
pg_wchar *u2, const pg_wchar **special);
2937

3038
pg_wchar
3139
unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214222
{
215223
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
216224
int u1len = unicode_utf8len(u1);
217-
const pg_case_map *casemap = find_case_map(u1);
218-
const pg_special_case *special = NULL;
225+
pg_wchar simple = 0;
226+
const pg_wchar *special = NULL;
227+
enum CaseMapResult casemap_result;
219228

220229
if (str_casekind == CaseTitle)
221230
{
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
228237
chr_casekind = CaseLower;
229238
}
230239

231-
/*
232-
* Find special case that matches the conditions, if any.
233-
*
234-
* Note: only a single special mapping per codepoint is currently
235-
* supported, though Unicode allows for multiple special mappings for
236-
* a single codepoint.
237-
*/
238-
if (full && casemap && casemap->special_case)
239-
{
240-
int16 conditions = casemap->special_case->conditions;
241-
242-
Assert(casemap->special_case->codepoint == u1);
243-
if (check_special_conditions(conditions, src, srclen, srcoff))
244-
special = casemap->special_case;
245-
}
240+
casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
241+
&simple, &special);
246242

247-
/* perform mapping, update result_len, and write to dst */
248-
if (special)
243+
switch (casemap_result)
249244
{
250-
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
251-
{
252-
pg_wchar u2 = special->map[chr_casekind][i];
253-
size_t u2len = unicode_utf8len(u2);
254-
255-
if (u2 == '\0')
256-
break;
257-
258-
if (result_len + u2len <= dstsize)
259-
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
260-
261-
result_len += u2len;
262-
}
263-
}
264-
else if (casemap)
265-
{
266-
pg_wchar u2 = casemap->simplemap[chr_casekind];
267-
pg_wchar u2len = unicode_utf8len(u2);
268-
269-
if (result_len + u2len <= dstsize)
270-
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
271-
272-
result_len += u2len;
273-
}
274-
else
275-
{
276-
/* no mapping; copy bytes from src */
277-
if (result_len + u1len <= dstsize)
278-
memcpy(dst + result_len, src + srcoff, u1len);
279-
280-
result_len += u1len;
245+
case CASEMAP_SELF:
246+
/* no mapping; copy bytes from src */
247+
Assert(simple == 0);
248+
Assert(special == NULL);
249+
if (result_len + u1len <= dstsize)
250+
memcpy(dst + result_len, src + srcoff, u1len);
251+
252+
result_len += u1len;
253+
break;
254+
case CASEMAP_SIMPLE:
255+
{
256+
/* replace with single character */
257+
pg_wchar u2 = simple;
258+
pg_wchar u2len = unicode_utf8len(u2);
259+
260+
Assert(special == NULL);
261+
if (result_len + u2len <= dstsize)
262+
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
263+
264+
result_len += u2len;
265+
}
266+
break;
267+
case CASEMAP_SPECIAL:
268+
/* replace with up to MAX_CASE_EXPANSION characters */
269+
Assert(simple == 0);
270+
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
271+
{
272+
pg_wchar u2 = special[i];
273+
size_t u2len = unicode_utf8len(u2);
274+
275+
if (result_len + u2len <= dstsize)
276+
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
277+
278+
result_len += u2len;
279+
}
280+
break;
281281
}
282282

283283
srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
351351
return true;
352352
}
353353

354+
/*
355+
* Unicode allows for special casing to be applied only under certain
356+
* circumstances. The only currently-supported condition is Final_Sigma.
357+
*/
354358
static bool
355359
check_special_conditions(int conditions, const char *str, size_t len,
356360
size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
365369
return false;
366370
}
367371

372+
/*
373+
* Map the given character to the requested case.
374+
*
375+
* If full is true, and a special case mapping is found and the conditions are
376+
* met, 'special' is set to the mapping result (which is an array of up to
377+
* MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
378+
*
379+
* Otherwise, search for a simple mapping, and if found, set 'simple' to the
380+
* result and return CASEMAP_SIMPLE.
381+
*
382+
* If no mapping is found, return CASEMAP_SELF, and the caller should copy the
383+
* character without modification.
384+
*/
385+
static enum CaseMapResult
386+
casemap(pg_wchar u1, CaseKind casekind, bool full,
387+
const char *src, size_t srclen, size_t srcoff,
388+
pg_wchar *simple, const pg_wchar **special)
389+
{
390+
const pg_case_map *map;
391+
392+
if (u1 < 0x80)
393+
{
394+
*simple = case_map[u1].simplemap[casekind];
395+
396+
return CASEMAP_SIMPLE;
397+
}
398+
399+
map = find_case_map(u1);
400+
401+
if (map == NULL)
402+
return CASEMAP_SELF;
403+
404+
if (full && map->special_case != NULL &&
405+
check_special_conditions(map->special_case->conditions,
406+
src, srclen, srcoff))
407+
{
408+
*special = map->special_case->map[casekind];
409+
return CASEMAP_SPECIAL;
410+
}
411+
412+
*simple = map->simplemap[casekind];
413+
414+
return CASEMAP_SIMPLE;
415+
}
416+
368417
/* find entry in simple case map, if any */
369418
static const pg_case_map *
370419
find_case_map(pg_wchar ucs)

0 commit comments

Comments
 (0)