diff --git a/docs/design/features/globalization-hybrid-mode.md b/docs/design/features/globalization-hybrid-mode.md index 920f808b4e6aec..7f7b3bba2c03b7 100644 --- a/docs/design/features/globalization-hybrid-mode.md +++ b/docs/design/features/globalization-hybrid-mode.md @@ -423,19 +423,3 @@ Below function are used from apple native functions: - [uppercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1413316-uppercasestringwithlocale?language=objc) - [lowercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1417298-lowercasestringwithlocale?language=objc) -Behavioural changes compared to ICU - - - Final sigma behavior correction: - - ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ". - - - Below cases will throw exception because of insufficiently sized destination buffer - - - Capitalizing the German letter ß (sharp S) gives SS when using Apple native functions. - - - Capitalizing ligatures gives different result on Apple platforms, eg. "\uFB00" (ff) uppercase (FF) - - - Capitalizing "\u0149" (ʼn) on Apple platforms returns combination of "\u02BC" (ʼ) and N -> (ʼN) - - - diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index 6426a2a427baf8..466be98c442d9f 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -274,9 +274,9 @@ public static IEnumerable ToLower_TestData() // we also don't preform. // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule). 
yield return new object[] { cultureName, "\u03A3", "\u03C3" }; - if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX) + if (PlatformDetection.IsHybridGlobalizationOnBrowser) { - // JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior + // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior yield return new object[] { cultureName, "O\u03A3", "o\u03C2" }; } else @@ -396,29 +396,24 @@ public static IEnumerable ToUpper_TestData() // RAINBOW (outside the BMP and does not case) yield return new object[] { cultureName, "\U0001F308", "\U0001F308" }; - if (!PlatformDetection.IsHybridGlobalizationOnOSX) - { - // Unicode defines some codepoints which expand into multiple codepoints - // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done - // these sorts of expansions, since it would cause string lengths to change when cased, - // which is non-intuitive. In addition, there are some context sensitive mappings which - // we also don't preform. - // es-zed does not case to SS when uppercased. - // on OSX, capitalizing the German letter ß (sharp S) gives SS - yield return new object[] { cultureName, "\u00DF", "\u00DF" }; - yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; - if (!PlatformDetection.IsNlsGlobalization) - yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; - - // Ligatures do not expand when cased. - // on OSX, this is uppercase to "FF" - yield return new object[] { cultureName, "\uFB00", "\uFB00" }; - - // Precomposed character with no uppercase variant, we don't want to "decompose" this - // as part of casing. 
- // on OSX, this is uppercased to "ʼN" - yield return new object[] { cultureName, "\u0149", "\u0149" }; - } + + // Unicode defines some codepoints which expand into multiple codepoints + // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done + // these sorts of expansions, since it would cause string lengths to change when cased, + // which is non-intuitive. In addition, there are some context sensitive mappings which + // we also don't perform. + // es-zed does not case to SS when uppercased. + yield return new object[] { cultureName, "\u00DF", "\u00DF" }; + yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; + if (!PlatformDetection.IsNlsGlobalization) + yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; + + // Ligatures do not expand when cased. + yield return new object[] { cultureName, "\uFB00", "\uFB00" }; + + // Precomposed character with no uppercase variant, we don't want to "decompose" this + // as part of casing. + yield return new object[] { cultureName, "\u0149", "\u0149" }; } // Turkish i diff --git a/src/native/libs/System.Globalization.Native/pal_casing.m b/src/native/libs/System.Globalization.Native/pal_casing.m index 7aeacb54cf4a65..e0bd0f80805c96 100644 --- a/src/native/libs/System.Globalization.Native/pal_casing.m +++ b/src/native/libs/System.Globalization.Native/pal_casing.m @@ -9,6 +9,47 @@ #if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS) +/** + * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return true or false + */ +#define IS_LEAD(c) (((c)&0xfffffc00) == 0xd800) + +/** + * Is this code unit a trail surrogate (U+dc00..U+dfff)? 
+ * @param c 16-bit code unit + * @return true or false + */ +#define IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00) + +/** + * Get a code point index from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the following trail surrogate + * is read for casing as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then only that unpaired surrogate is read for casing. + * + * @param s const uint16_t* string + * @param i output string offset, must be i srcLength is to prevent code point expansions + dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index]; + Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); + index++; + } if (isError) return isError; } @@ -81,19 +137,33 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t ChangeCaseInvariantNative Performs upper or lower casing of a string into a new buffer. +Two things we are considering here: +1. Prohibiting code point expansions. Some characters expand into more UTF-16 code units when uppercased or lowercased, which may lead to an insufficient destination buffer. + Instead, we prohibit these expansions and iterate through the string character by character, opting for the original character if it would have been expanded. +2. Properly handling surrogate pairs. Characters can be composed of more than one UTF-16 code unit + (e.g. surrogate pairs like \uD801\uDC37). All code units of a character are needed to change its case properly. Returns 0 for success, non-zero on failure see ErrorCodes. 
*/ int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper) { - NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength]; - NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString; - int32_t srcIdx = 0, dstIdx = 0, isError = 0; uint16_t dstCodepoint; - while (srcIdx < result.length) + while (srcIdx < cwSrcLength) { - dstCodepoint = [result characterAtIndex:srcIdx++]; - Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); + int32_t startIndex = srcIdx; + NEXTOFFSET(lpSrc, srcIdx, cwSrcLength); + int32_t srcLength = srcIdx - startIndex; + NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength]; + NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString; + int32_t index = 0; + // iterate over every UTF-16 code unit of the character (two for a surrogate pair) + while (index < srcLength) + { + // dst.length > srcLength means casing expanded the character; keep the original code units instead + dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index]; + Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); + index++; + } if (isError) return isError; }