1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * File USC_IMPL.C 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 07/08/2002 Eric Mader Creation. 13 ****************************************************************************** 14 */ 15 16 #include "unicode/uscript.h" 17 #include "usc_impl.h" 18 #include "cmemory.h" 19 20 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 21 22 #define PAREN_STACK_DEPTH 32 23 24 #define MOD(sp) ((sp) % PAREN_STACK_DEPTH) 25 #define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH) 26 #define INC(sp,count) (MOD((sp) + (count))) 27 #define INC1(sp) (INC(sp, 1)) 28 #define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count))) 29 #define DEC1(sp) (DEC(sp, 1)) 30 #define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0) 31 #define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun)) 32 #define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP]) 33 #define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0) 34 35 struct ParenStackEntry 36 { 37 int32_t pairIndex; 38 UScriptCode scriptCode; 39 }; 40 41 struct UScriptRun 42 { 43 int32_t textLength; 44 const UChar *textArray; 45 46 int32_t scriptStart; 47 int32_t scriptLimit; 48 UScriptCode scriptCode; 49 50 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH]; 51 int32_t parenSP; 52 int32_t pushCount; 53 int32_t fixupCount; 54 }; 55 56 static int8_t highBit(int32_t value); 57 58 static const UChar32 pairedChars[] = { 59 0x0028, 0x0029, /* ascii paired punctuation */ 60 0x003c, 0x003e, 61 0x005b, 0x005d, 62 0x007b, 0x007d, 63 0x00ab, 0x00bb, /* guillemets */ 64 0x2018, 0x2019, /* general punctuation */ 65 0x201c, 0x201d, 66 0x2039, 0x203a, 67 0x3008, 0x3009, /* chinese paired punctuation */ 68 0x300a, 0x300b, 69 0x300c, 0x300d, 70 0x300e, 0x300f, 71 0x3010, 0x3011, 72 0x3014, 0x3015, 73 0x3016, 0x3017, 74 0x3018, 0x3019, 75 0x301a, 0x301b 76 }; 77 78 static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode) 79 { 80 scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount); 81 scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount); 82 83 scriptRun->parenSP = INC1(scriptRun->parenSP); 84 scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex; 85 scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode; 86 } 87 88 static void pop(UScriptRun *scriptRun) 89 { 90 if (STACK_IS_EMPTY(scriptRun)) { 91 return; 92 } 93 94 if (scriptRun->fixupCount > 0) { 95 scriptRun->fixupCount -= 1; 96 } 97 98 scriptRun->pushCount -= 1; 99 scriptRun->parenSP = DEC1(scriptRun->parenSP); 100 101 /* If the stack is now empty, reset the stack 102 pointers to their initial values. 103 */ 104 if (STACK_IS_EMPTY(scriptRun)) { 105 scriptRun->parenSP = -1; 106 } 107 } 108 109 static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode) 110 { 111 int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount); 112 113 while (scriptRun->fixupCount-- > 0) { 114 fixupSP = INC1(fixupSP); 115 scriptRun->parenStack[fixupSP].scriptCode = scriptCode; 116 } 117 } 118 119 static int8_t 120 highBit(int32_t value) 121 { 122 int8_t bit = 0; 123 124 if (value <= 0) { 125 return -32; 126 } 127 128 if (value >= 1 << 16) { 129 value >>= 16; 130 bit += 16; 131 } 132 133 if (value >= 1 << 8) { 134 value >>= 8; 135 bit += 8; 136 } 137 138 if (value >= 1 << 4) { 139 value >>= 4; 140 bit += 4; 141 } 142 143 if (value >= 1 << 2) { 144 value >>= 2; 145 bit += 2; 146 } 147 148 if (value >= 1 << 1) { 149 value >>= 1; 150 bit += 1; 151 } 152 153 return bit; 154 } 155 156 static int32_t 157 getPairIndex(UChar32 ch) 158 { 159 int32_t pairedCharCount = ARRAY_SIZE(pairedChars); 160 int32_t pairedCharPower = 1 << highBit(pairedCharCount); 161 int32_t pairedCharExtra = pairedCharCount - pairedCharPower; 162 163 int32_t probe = pairedCharPower; 164 int32_t pairIndex = 0; 165 166 if (ch >= pairedChars[pairedCharExtra]) { 167 pairIndex = pairedCharExtra; 168 } 169 170 while (probe > (1 << 0)) { 171 probe >>= 1; 172 173 if (ch >= pairedChars[pairIndex + probe]) { 174 pairIndex += probe; 175 } 176 } 177 178 if (pairedChars[pairIndex] != ch) { 179 pairIndex = -1; 180 } 181 182 return pairIndex; 183 } 184 185 static UBool 186 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo) 187 { 188 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; 189 } 190 191 U_CAPI UScriptRun * U_EXPORT2 192 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode) 193 { 194 UScriptRun *result = NULL; 195 196 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { 197 return NULL; 198 } 199 200 result = uprv_malloc(sizeof (UScriptRun)); 201 202 if (result == NULL) { 203 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 204 return NULL; 205 } 206 207 uscript_setRunText(result, src, length, pErrorCode); 208 209 /* Release the UScriptRun if uscript_setRunText() returns an error */ 210 if (U_FAILURE(*pErrorCode)) { 211 uprv_free(result); 212 result = NULL; 213 } 214 215 return result; 216 } 217 218 U_CAPI void U_EXPORT2 219 uscript_closeRun(UScriptRun *scriptRun) 220 { 221 if (scriptRun != NULL) { 222 uprv_free(scriptRun); 223 } 224 } 225 226 U_CAPI void U_EXPORT2 227 uscript_resetRun(UScriptRun *scriptRun) 228 { 229 if (scriptRun != NULL) { 230 scriptRun->scriptStart = 0; 231 scriptRun->scriptLimit = 0; 232 scriptRun->scriptCode = USCRIPT_INVALID_CODE; 233 scriptRun->parenSP = -1; 234 scriptRun->pushCount = 0; 235 scriptRun->fixupCount = 0; 236 } 237 } 238 239 U_CAPI void U_EXPORT2 240 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode) 241 { 242 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { 243 return; 244 } 245 246 if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) { 247 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 248 return; 249 } 250 251 scriptRun->textArray = src; 252 scriptRun->textLength = length; 253 254 uscript_resetRun(scriptRun); 255 } 256 257 U_CAPI UBool U_EXPORT2 258 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript) 259 { 260 UErrorCode error = U_ZERO_ERROR; 261 262 /* if we've fallen off the end of the text, we're done */ 263 if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) { 264 return FALSE; 265 } 266 267 SYNC_FIXUP(scriptRun); 268 scriptRun->scriptCode = USCRIPT_COMMON; 269 270 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) { 271 UChar high = scriptRun->textArray[scriptRun->scriptLimit]; 272 UChar32 ch = high; 273 UScriptCode sc; 274 int32_t pairIndex; 275 276 /* 277 * if the character is a high surrogate and it's not the last one 278 * in the text, see if it's followed by a low surrogate 279 */ 280 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) { 281 UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1]; 282 283 /* 284 * if it is followed by a low surrogate, 285 * consume it and form the full character 286 */ 287 if (low >= 0xDC00 && low <= 0xDFFF) { 288 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; 289 scriptRun->scriptLimit += 1; 290 } 291 } 292 293 sc = uscript_getScript(ch, &error); 294 pairIndex = getPairIndex(ch); 295 296 /* 297 * Paired character handling: 298 * 299 * if it's an open character, push it onto the stack. 300 * if it's a close character, find the matching open on the 301 * stack, and use that script code. Any non-matching open 302 * characters above it on the stack will be poped. 303 */ 304 if (pairIndex >= 0) { 305 if ((pairIndex & 1) == 0) { 306 push(scriptRun, pairIndex, scriptRun->scriptCode); 307 } else { 308 int32_t pi = pairIndex & ~1; 309 310 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) { 311 pop(scriptRun); 312 } 313 314 if (STACK_IS_NOT_EMPTY(scriptRun)) { 315 sc = TOP(scriptRun).scriptCode; 316 } 317 } 318 } 319 320 if (sameScript(scriptRun->scriptCode, sc)) { 321 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { 322 scriptRun->scriptCode = sc; 323 324 fixup(scriptRun, scriptRun->scriptCode); 325 } 326 327 /* 328 * if this character is a close paired character, 329 * pop the matching open character from the stack 330 */ 331 if (pairIndex >= 0 && (pairIndex & 1) != 0) { 332 pop(scriptRun); 333 } 334 } else { 335 /* 336 * if the run broke on a surrogate pair, 337 * end it before the high surrogate 338 */ 339 if (ch >= 0x10000) { 340 scriptRun->scriptLimit -= 1; 341 } 342 343 break; 344 } 345 } 346 347 348 if (pRunStart != NULL) { 349 *pRunStart = scriptRun->scriptStart; 350 } 351 352 if (pRunLimit != NULL) { 353 *pRunLimit = scriptRun->scriptLimit; 354 } 355 356 if (pRunScript != NULL) { 357 *pRunScript = scriptRun->scriptCode; 358 } 359 360 return TRUE; 361 } 362