1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: scrptrun.cpp 11 * 12 * created on: 10/17/2001 13 * created by: Eric R. Mader 14 */ 15 16 #include "unicode/utypes.h" 17 #include "unicode/uscript.h" 18 19 #include "cmemory.h" 20 #include "scrptrun.h" 21 22 const char ScriptRun::fgClassID=0; 23 24 UChar32 ScriptRun::pairedChars[] = { 25 0x0028, 0x0029, // ascii paired punctuation 26 0x003c, 0x003e, 27 0x005b, 0x005d, 28 0x007b, 0x007d, 29 0x00ab, 0x00bb, // guillemets 30 0x2018, 0x2019, // general punctuation 31 0x201c, 0x201d, 32 0x2039, 0x203a, 33 0x3008, 0x3009, // chinese paired punctuation 34 0x300a, 0x300b, 35 0x300c, 0x300d, 36 0x300e, 0x300f, 37 0x3010, 0x3011, 38 0x3014, 0x3015, 39 0x3016, 0x3017, 40 0x3018, 0x3019, 41 0x301a, 0x301b 42 }; 43 44 const int32_t ScriptRun::pairedCharCount = UPRV_LENGTHOF(pairedChars); 45 const int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount); 46 const int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower; 47 48 int8_t ScriptRun::highBit(int32_t value) 49 { 50 if (value <= 0) { 51 return -32; 52 } 53 54 int8_t bit = 0; 55 56 if (value >= 1 << 16) { 57 value >>= 16; 58 bit += 16; 59 } 60 61 if (value >= 1 << 8) { 62 value >>= 8; 63 bit += 8; 64 } 65 66 if (value >= 1 << 4) { 67 value >>= 4; 68 bit += 4; 69 } 70 71 if (value >= 1 << 2) { 72 value >>= 2; 73 bit += 2; 74 } 75 76 if (value >= 1 << 1) { 77 value >>= 1; 78 bit += 1; 79 } 80 81 return bit; 82 } 83 84 int32_t ScriptRun::getPairIndex(UChar32 ch) 85 { 86 int32_t probe = pairedCharPower; 87 int32_t index = 0; 88 89 if (ch >= pairedChars[pairedCharExtra]) { 90 index = pairedCharExtra; 91 } 92 93 while (probe > (1 << 0)) { 94 probe >>= 1; 95 96 if (ch >= pairedChars[index + probe]) { 97 index += probe; 98 } 99 } 100 101 if (pairedChars[index] != ch) { 102 index = -1; 103 } 104 105 return index; 106 } 107 108 UBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo) 109 { 110 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; 111 } 112 113 UBool ScriptRun::next() 114 { 115 int32_t startSP = parenSP; // used to find the first new open character 116 UErrorCode error = U_ZERO_ERROR; 117 118 // if we've fallen off the end of the text, we're done 119 if (scriptEnd >= charLimit) { 120 return false; 121 } 122 123 scriptCode = USCRIPT_COMMON; 124 125 for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) { 126 UChar high = charArray[scriptEnd]; 127 UChar32 ch = high; 128 129 // if the character is a high surrogate and it's not the last one 130 // in the text, see if it's followed by a low surrogate 131 if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1) 132 { 133 UChar low = charArray[scriptEnd + 1]; 134 135 // if it is followed by a low surrogate, 136 // consume it and form the full character 137 if (low >= 0xDC00 && low <= 0xDFFF) { 138 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; 139 scriptEnd += 1; 140 } 141 } 142 143 UScriptCode sc = uscript_getScript(ch, &error); 144 int32_t pairIndex = getPairIndex(ch); 145 146 // Paired character handling: 147 // 148 // if it's an open character, push it onto the stack. 149 // if it's a close character, find the matching open on the 150 // stack, and use that script code. Any non-matching open 151 // characters above it on the stack will be poped. 152 if (pairIndex >= 0) { 153 if ((pairIndex & 1) == 0) { 154 parenStack[++parenSP].pairIndex = pairIndex; 155 parenStack[parenSP].scriptCode = scriptCode; 156 } else if (parenSP >= 0) { 157 int32_t pi = pairIndex & ~1; 158 159 while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { 160 parenSP -= 1; 161 } 162 163 if (parenSP < startSP) { 164 startSP = parenSP; 165 } 166 167 if (parenSP >= 0) { 168 sc = parenStack[parenSP].scriptCode; 169 } 170 } 171 } 172 173 if (sameScript(scriptCode, sc)) { 174 if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { 175 scriptCode = sc; 176 177 // now that we have a final script code, fix any open 178 // characters we pushed before we knew the script code. 179 while (startSP < parenSP) { 180 parenStack[++startSP].scriptCode = scriptCode; 181 } 182 } 183 184 // if this character is a close paired character, 185 // pop it from the stack 186 if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { 187 parenSP -= 1; 188 startSP -= 1; 189 } 190 } else { 191 // if the run broke on a surrogate pair, 192 // end it before the high surrogate 193 if (ch >= 0x10000) { 194 scriptEnd -= 1; 195 } 196 197 break; 198 } 199 } 200 201 return true; 202 } 203 204