1 /* 2 * Copyright 2009 Red Hat, Inc. 3 * Copyright 2009 Keith Stribley 4 * Copyright 2011 Google, Inc. 5 * 6 * This is part of HarfBuzz, a text shaping library. 7 * 8 * Permission is hereby granted, without written agreement and without 9 * license or royalty fees, to use, copy, modify, and distribute this 10 * software and its documentation for any purpose, provided that the 11 * above copyright notice and the following two paragraphs appear in 12 * all copies of this software. 13 * 14 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 15 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 16 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 17 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 18 * DAMAGE. 19 * 20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 25 * 26 * Red Hat Author(s): Behdad Esfahbod 27 * Google Author(s): Behdad Esfahbod 28 */ 29 30 #include "hb-private.hh" 31 32 #include "hb-icu.h" 33 34 #include "hb-unicode-private.hh" 35 36 #include <unicode/uchar.h> 37 #include <unicode/unorm.h> 38 #include <unicode/ustring.h> 39 #include <unicode/uversion.h> 40 41 42 hb_script_t 43 hb_icu_script_to_script (UScriptCode script) 44 { 45 if (unlikely (script == USCRIPT_INVALID_CODE)) 46 return HB_SCRIPT_INVALID; 47 48 return hb_script_from_string (uscript_getShortName (script), -1); 49 } 50 51 UScriptCode 52 hb_icu_script_from_script (hb_script_t script) 53 { 54 if (unlikely (script == HB_SCRIPT_INVALID)) 55 return USCRIPT_INVALID_CODE; 56 57 for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) 58 if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) 59 return (UScriptCode) i; 60 61 return USCRIPT_UNKNOWN; 62 } 63 64 65 static hb_unicode_combining_class_t 66 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, 67 hb_codepoint_t unicode, 68 void *user_data HB_UNUSED) 69 70 { 71 return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); 72 } 73 74 static unsigned int 75 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, 76 hb_codepoint_t unicode, 77 void *user_data HB_UNUSED) 78 { 79 switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) 80 { 81 case U_EA_WIDE: 82 case U_EA_FULLWIDTH: 83 return 2; 84 case U_EA_NEUTRAL: 85 case U_EA_AMBIGUOUS: 86 case U_EA_HALFWIDTH: 87 case U_EA_NARROW: 88 return 1; 89 } 90 return 1; 91 } 92 93 static hb_unicode_general_category_t 94 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, 95 hb_codepoint_t unicode, 96 void *user_data HB_UNUSED) 97 { 98 switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) 99 { 100 case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 101 102 case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; 103 case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; 104 case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; 105 case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; 106 case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; 107 108 case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; 109 case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; 110 case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; 111 112 case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; 113 case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; 114 case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; 115 116 case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; 117 case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; 118 case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; 119 120 case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; 121 case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; 122 case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; 123 case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; 124 125 126 case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; 127 case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; 128 case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; 129 case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; 130 case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; 131 132 case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; 133 case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; 134 case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; 135 case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; 136 137 case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; 138 case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; 139 } 140 141 return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 142 } 143 144 static hb_codepoint_t 145 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, 146 hb_codepoint_t unicode, 147 void *user_data HB_UNUSED) 148 { 149 return u_charMirror(unicode); 150 } 151 152 static hb_script_t 153 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, 154 hb_codepoint_t unicode, 155 void *user_data HB_UNUSED) 156 { 157 UErrorCode status = U_ZERO_ERROR; 158 UScriptCode scriptCode = uscript_getScript(unicode, &status); 159 160 if (unlikely (U_FAILURE (status))) 161 return HB_SCRIPT_UNKNOWN; 162 163 return hb_icu_script_to_script (scriptCode); 164 } 165 166 #if U_ICU_VERSION_MAJOR_NUM >= 49 167 static const UNormalizer2 *normalizer; 168 #endif 169 170 static hb_bool_t 171 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 172 hb_codepoint_t a, 173 hb_codepoint_t b, 174 hb_codepoint_t *ab, 175 void *user_data HB_UNUSED) 176 { 177 #if U_ICU_VERSION_MAJOR_NUM >= 49 178 { 179 UChar32 ret = unorm2_composePair (normalizer, a, b); 180 if (ret < 0) return false; 181 *ab = ret; 182 return true; 183 } 184 #endif 185 186 /* We don't ifdef-out the fallback code such that compiler always 187 * sees it and makes sure it's compilable. */ 188 189 UChar utf16[4], normalized[5]; 190 unsigned int len; 191 hb_bool_t ret, err; 192 UErrorCode icu_err; 193 194 len = 0; 195 err = false; 196 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); 197 if (err) return false; 198 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); 199 if (err) return false; 200 201 icu_err = U_ZERO_ERROR; 202 len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 203 if (U_FAILURE (icu_err)) 204 return false; 205 if (u_countChar32 (normalized, len) == 1) { 206 U16_GET_UNSAFE (normalized, 0, *ab); 207 ret = true; 208 } else { 209 ret = false; 210 } 211 212 return ret; 213 } 214 215 static hb_bool_t 216 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 217 hb_codepoint_t ab, 218 hb_codepoint_t *a, 219 hb_codepoint_t *b, 220 void *user_data HB_UNUSED) 221 { 222 #if U_ICU_VERSION_MAJOR_NUM >= 49 223 { 224 UChar decomposed[4]; 225 int len; 226 UErrorCode icu_err = U_ZERO_ERROR; 227 len = unorm2_getRawDecomposition (normalizer, ab, decomposed, 228 ARRAY_LENGTH (decomposed), &icu_err); 229 if (U_FAILURE (icu_err) || len < 0) return false; 230 231 len = u_countChar32 (decomposed, len); 232 if (len == 1) { 233 U16_GET_UNSAFE (decomposed, 0, *a); 234 *b = 0; 235 return *a != ab; 236 } else if (len == 2) { 237 len =0; 238 U16_NEXT_UNSAFE (decomposed, len, *a); 239 U16_NEXT_UNSAFE (decomposed, len, *b); 240 } 241 return true; 242 } 243 #endif 244 245 /* We don't ifdef-out the fallback code such that compiler always 246 * sees it and makes sure it's compilable. */ 247 248 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 249 unsigned int len; 250 hb_bool_t ret, err; 251 UErrorCode icu_err; 252 253 /* This function is a monster! Maybe it wasn't a good idea adding a 254 * pairwise decompose API... */ 255 /* Watchout for the dragons. Err, watchout for macros changing len. */ 256 257 len = 0; 258 err = false; 259 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); 260 if (err) return false; 261 262 icu_err = U_ZERO_ERROR; 263 len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 264 if (U_FAILURE (icu_err)) 265 return false; 266 267 len = u_countChar32 (normalized, len); 268 269 if (len == 1) { 270 U16_GET_UNSAFE (normalized, 0, *a); 271 *b = 0; 272 ret = *a != ab; 273 } else if (len == 2) { 274 len =0; 275 U16_NEXT_UNSAFE (normalized, len, *a); 276 U16_NEXT_UNSAFE (normalized, len, *b); 277 278 /* Here's the ugly part: if ab decomposes to a single character and 279 * that character decomposes again, we have to detect that and undo 280 * the second part :-(. */ 281 UChar recomposed[20]; 282 icu_err = U_ZERO_ERROR; 283 unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 284 if (U_FAILURE (icu_err)) 285 return false; 286 hb_codepoint_t c; 287 U16_GET_UNSAFE (recomposed, 0, c); 288 if (c != *a && c != ab) { 289 *a = c; 290 *b = 0; 291 } 292 ret = true; 293 } else { 294 /* If decomposed to more than two characters, take the last one, 295 * and recompose the rest to get the first component. */ 296 U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ 297 UChar recomposed[18 * 2]; 298 icu_err = U_ZERO_ERROR; 299 len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 300 if (U_FAILURE (icu_err)) 301 return false; 302 /* We expect that recomposed has exactly one character now. */ 303 if (unlikely (u_countChar32 (recomposed, len) != 1)) 304 return false; 305 U16_GET_UNSAFE (recomposed, 0, *a); 306 ret = true; 307 } 308 309 return ret; 310 } 311 312 static unsigned int 313 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, 314 hb_codepoint_t u, 315 hb_codepoint_t *decomposed, 316 void *user_data HB_UNUSED) 317 { 318 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 319 unsigned int len; 320 int32_t utf32_len; 321 hb_bool_t err; 322 UErrorCode icu_err; 323 324 /* Copy @u into a UTF-16 array to be passed to ICU. */ 325 len = 0; 326 err = FALSE; 327 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); 328 if (err) 329 return 0; 330 331 /* Normalise the codepoint using NFKD mode. */ 332 icu_err = U_ZERO_ERROR; 333 len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 334 if (icu_err) 335 return 0; 336 337 /* Convert the decomposed form from UTF-16 to UTF-32. */ 338 icu_err = U_ZERO_ERROR; 339 u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); 340 if (icu_err) 341 return 0; 342 343 return utf32_len; 344 } 345 346 347 hb_unicode_funcs_t * 348 hb_icu_get_unicode_funcs (void) 349 { 350 static const hb_unicode_funcs_t _hb_icu_unicode_funcs = { 351 HB_OBJECT_HEADER_STATIC, 352 353 NULL, /* parent */ 354 true, /* immutable */ 355 { 356 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name, 357 HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS 358 #undef HB_UNICODE_FUNC_IMPLEMENT 359 } 360 }; 361 362 #if U_ICU_VERSION_MAJOR_NUM >= 49 363 if (!hb_atomic_ptr_get (&normalizer)) { 364 UErrorCode icu_err = U_ZERO_ERROR; 365 /* We ignore failure in getNFCInstace(). */ 366 hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err)); 367 } 368 #endif 369 return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs); 370 } 371 372 373