1 /* 2 * Copyright 2009 Red Hat, Inc. 3 * Copyright 2009 Keith Stribley 4 * Copyright 2011 Google, Inc. 5 * 6 * This is part of HarfBuzz, a text shaping library. 7 * 8 * Permission is hereby granted, without written agreement and without 9 * license or royalty fees, to use, copy, modify, and distribute this 10 * software and its documentation for any purpose, provided that the 11 * above copyright notice and the following two paragraphs appear in 12 * all copies of this software. 13 * 14 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 15 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 16 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 17 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 18 * DAMAGE. 19 * 20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 25 * 26 * Red Hat Author(s): Behdad Esfahbod 27 * Google Author(s): Behdad Esfahbod 28 */ 29 30 #include "hb-private.hh" 31 32 #include "hb-icu.h" 33 34 #include "hb-unicode-private.hh" 35 36 #include <unicode/uchar.h> 37 #include <unicode/unorm2.h> 38 #include <unicode/ustring.h> 39 #include <unicode/utf16.h> 40 #include <unicode/uversion.h> 41 42 43 hb_script_t 44 hb_icu_script_to_script (UScriptCode script) 45 { 46 if (unlikely (script == USCRIPT_INVALID_CODE)) 47 return HB_SCRIPT_INVALID; 48 49 return hb_script_from_string (uscript_getShortName (script), -1); 50 } 51 52 UScriptCode 53 hb_icu_script_from_script (hb_script_t script) 54 { 55 if (unlikely (script == HB_SCRIPT_INVALID)) 56 return USCRIPT_INVALID_CODE; 57 58 for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) 59 if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) 60 return (UScriptCode) i; 61 62 return USCRIPT_UNKNOWN; 63 } 64 65 66 static hb_unicode_combining_class_t 67 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, 68 hb_codepoint_t unicode, 69 void *user_data HB_UNUSED) 70 71 { 72 return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); 73 } 74 75 static unsigned int 76 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, 77 hb_codepoint_t unicode, 78 void *user_data HB_UNUSED) 79 { 80 switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) 81 { 82 case U_EA_WIDE: 83 case U_EA_FULLWIDTH: 84 return 2; 85 case U_EA_NEUTRAL: 86 case U_EA_AMBIGUOUS: 87 case U_EA_HALFWIDTH: 88 case U_EA_NARROW: 89 return 1; 90 } 91 return 1; 92 } 93 94 static hb_unicode_general_category_t 95 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, 96 hb_codepoint_t unicode, 97 void *user_data HB_UNUSED) 98 { 99 switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) 100 { 101 case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 102 103 case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; 104 case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; 105 case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; 106 case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; 107 case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; 108 109 case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; 110 case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; 111 case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; 112 113 case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; 114 case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; 115 case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; 116 117 case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; 118 case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; 119 case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; 120 121 case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; 122 case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; 123 case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; 124 case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; 125 126 127 case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; 128 case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; 129 case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; 130 case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; 131 case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; 132 133 case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; 134 case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; 135 case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; 136 case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; 137 138 case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; 139 case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; 140 } 141 142 return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 143 } 144 145 static hb_codepoint_t 146 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, 147 hb_codepoint_t unicode, 148 void *user_data HB_UNUSED) 149 { 150 return u_charMirror(unicode); 151 } 152 153 static hb_script_t 154 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, 155 hb_codepoint_t unicode, 156 void *user_data HB_UNUSED) 157 { 158 UErrorCode status = U_ZERO_ERROR; 159 UScriptCode scriptCode = uscript_getScript(unicode, &status); 160 161 if (unlikely (U_FAILURE (status))) 162 return HB_SCRIPT_UNKNOWN; 163 164 return hb_icu_script_to_script (scriptCode); 165 } 166 167 #if U_ICU_VERSION_MAJOR_NUM >= 49 168 static const UNormalizer2 *normalizer; 169 #endif 170 171 static hb_bool_t 172 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 173 hb_codepoint_t a, 174 hb_codepoint_t b, 175 hb_codepoint_t *ab, 176 void *user_data HB_UNUSED) 177 { 178 #if U_ICU_VERSION_MAJOR_NUM >= 49 179 { 180 UChar32 ret = unorm2_composePair (normalizer, a, b); 181 if (ret < 0) return false; 182 *ab = ret; 183 return true; 184 } 185 #endif 186 187 /* We don't ifdef-out the fallback code such that compiler always 188 * sees it and makes sure it's compilable. */ 189 190 UChar utf16[4], normalized[5]; 191 unsigned int len; 192 hb_bool_t ret, err; 193 UErrorCode icu_err; 194 195 len = 0; 196 err = false; 197 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); 198 if (err) return false; 199 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); 200 if (err) return false; 201 202 icu_err = U_ZERO_ERROR; 203 len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err); 204 if (U_FAILURE (icu_err)) 205 return false; 206 if (u_countChar32 (normalized, len) == 1) { 207 U16_GET_UNSAFE (normalized, 0, *ab); 208 ret = true; 209 } else { 210 ret = false; 211 } 212 213 return ret; 214 } 215 216 static hb_bool_t 217 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 218 hb_codepoint_t ab, 219 hb_codepoint_t *a, 220 hb_codepoint_t *b, 221 void *user_data HB_UNUSED) 222 { 223 #if U_ICU_VERSION_MAJOR_NUM >= 49 224 { 225 UChar decomposed[4]; 226 int len; 227 UErrorCode icu_err = U_ZERO_ERROR; 228 len = unorm2_getRawDecomposition (normalizer, ab, decomposed, 229 ARRAY_LENGTH (decomposed), &icu_err); 230 if (U_FAILURE (icu_err) || len < 0) return false; 231 232 len = u_countChar32 (decomposed, len); 233 if (len == 1) { 234 U16_GET_UNSAFE (decomposed, 0, *a); 235 *b = 0; 236 return *a != ab; 237 } else if (len == 2) { 238 len =0; 239 U16_NEXT_UNSAFE (decomposed, len, *a); 240 U16_NEXT_UNSAFE (decomposed, len, *b); 241 } 242 return true; 243 } 244 #endif 245 246 /* We don't ifdef-out the fallback code such that compiler always 247 * sees it and makes sure it's compilable. */ 248 249 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 250 unsigned int len; 251 hb_bool_t ret, err; 252 UErrorCode icu_err; 253 254 /* This function is a monster! Maybe it wasn't a good idea adding a 255 * pairwise decompose API... */ 256 /* Watchout for the dragons. Err, watchout for macros changing len. */ 257 258 len = 0; 259 err = false; 260 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); 261 if (err) return false; 262 263 icu_err = U_ZERO_ERROR; 264 len = unorm2_normalize (unorm2_getNFDInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err); 265 if (U_FAILURE (icu_err)) 266 return false; 267 268 len = u_countChar32 (normalized, len); 269 270 if (len == 1) { 271 U16_GET_UNSAFE (normalized, 0, *a); 272 *b = 0; 273 ret = *a != ab; 274 } else if (len == 2) { 275 len =0; 276 U16_NEXT_UNSAFE (normalized, len, *a); 277 U16_NEXT_UNSAFE (normalized, len, *b); 278 279 /* Here's the ugly part: if ab decomposes to a single character and 280 * that character decomposes again, we have to detect that and undo 281 * the second part :-(. */ 282 UChar recomposed[20]; 283 icu_err = U_ZERO_ERROR; 284 unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 285 if (U_FAILURE (icu_err)) 286 return false; 287 hb_codepoint_t c; 288 U16_GET_UNSAFE (recomposed, 0, c); 289 if (c != *a && c != ab) { 290 *a = c; 291 *b = 0; 292 } 293 ret = true; 294 } else { 295 /* If decomposed to more than two characters, take the last one, 296 * and recompose the rest to get the first component. */ 297 U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ 298 UChar recomposed[18 * 2]; 299 icu_err = U_ZERO_ERROR; 300 len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 301 if (U_FAILURE (icu_err)) 302 return false; 303 /* We expect that recomposed has exactly one character now. */ 304 if (unlikely (u_countChar32 (recomposed, len) != 1)) 305 return false; 306 U16_GET_UNSAFE (recomposed, 0, *a); 307 ret = true; 308 } 309 310 return ret; 311 } 312 313 static unsigned int 314 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, 315 hb_codepoint_t u, 316 hb_codepoint_t *decomposed, 317 void *user_data HB_UNUSED) 318 { 319 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 320 unsigned int len; 321 int32_t utf32_len; 322 hb_bool_t err; 323 UErrorCode icu_err; 324 325 /* Copy @u into a UTF-16 array to be passed to ICU. */ 326 len = 0; 327 err = false; 328 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); 329 if (err) 330 return 0; 331 332 /* Normalise the codepoint using NFKD mode. */ 333 icu_err = U_ZERO_ERROR; 334 len = unorm2_normalize (unorm2_getNFKDInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err); 335 if (U_FAILURE (icu_err)) 336 return 0; 337 338 /* Convert the decomposed form from UTF-16 to UTF-32. */ 339 icu_err = U_ZERO_ERROR; 340 u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); 341 if (U_FAILURE (icu_err)) 342 return 0; 343 344 return utf32_len; 345 } 346 347 348 static hb_unicode_funcs_t *static_icu_funcs = nullptr; 349 350 #ifdef HB_USE_ATEXIT 351 static 352 void free_static_icu_funcs (void) 353 { 354 hb_unicode_funcs_destroy (static_icu_funcs); 355 } 356 #endif 357 358 hb_unicode_funcs_t * 359 hb_icu_get_unicode_funcs (void) 360 { 361 retry: 362 hb_unicode_funcs_t *funcs = (hb_unicode_funcs_t *) hb_atomic_ptr_get (&static_icu_funcs); 363 364 if (unlikely (!funcs)) 365 { 366 #if U_ICU_VERSION_MAJOR_NUM >= 49 367 if (!hb_atomic_ptr_get (&normalizer)) { 368 UErrorCode icu_err = U_ZERO_ERROR; 369 /* We ignore failure in getNFCInstace(). */ 370 (void) hb_atomic_ptr_cmpexch (&normalizer, nullptr, unorm2_getNFCInstance (&icu_err)); 371 } 372 #endif 373 374 funcs = hb_unicode_funcs_create (nullptr); 375 376 #define HB_UNICODE_FUNC_IMPLEMENT(name) \ 377 hb_unicode_funcs_set_##name##_func (funcs, hb_icu_unicode_##name, nullptr, nullptr); 378 HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS 379 #undef HB_UNICODE_FUNC_IMPLEMENT 380 381 hb_unicode_funcs_make_immutable (funcs); 382 383 if (!hb_atomic_ptr_cmpexch (&static_icu_funcs, nullptr, funcs)) { 384 hb_unicode_funcs_destroy (funcs); 385 goto retry; 386 } 387 388 #ifdef HB_USE_ATEXIT 389 atexit (free_static_icu_funcs); /* First person registers atexit() callback. */ 390 #endif 391 }; 392 393 return hb_unicode_funcs_reference (funcs); 394 } 395