1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "encodings/compact_lang_det/getonescriptspan.h" 6 #include <stdio.h> 7 #include <string.h> 8 9 #include "base/basictypes.h" 10 #include "encodings/lang_enc.h" 11 #include "encodings/compact_lang_det/utf8propjustletter.h" 12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h" 13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h" 14 15 #include "encodings/compact_lang_det/win/cld_basictypes.h" 16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h" 17 #include "encodings/compact_lang_det/win/cld_google.h" 18 #include "encodings/compact_lang_det/win/cld_htmlutils.h" 19 #include "encodings/compact_lang_det/win/cld_unilib.h" 20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h" 21 #include "encodings/compact_lang_det/win/cld_utf8utils.h" 22 23 static const Language GRAY_LANG = (Language)254; 24 25 static const int kMaxUpToWordBoundary = 50; // span < this make longer, 26 // else make shorter 27 static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes 28 // to round to word boundary, 29 // direction above 30 31 static const char kSpecialSymbol[256] = { // true for < > & 32 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 33 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, 34 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 35 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 36 37 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 38 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 39 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 40 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 41 }; 42 43 44 45 #define LT 0 // < 46 #define GT 1 // > 47 #define EX 2 // ! 48 #define HY 3 // - 49 #define QU 4 // " 50 #define AP 5 // ' 51 #define SL 6 // / 52 #define S_ 7 53 #define C_ 8 54 #define R_ 9 55 #define I_ 10 56 #define P_ 11 57 #define T_ 12 58 #define Y_ 13 59 #define L_ 14 60 #define E_ 15 61 #define CR 16 // <cr> or <lf> 62 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation 63 #define PL 18 // possible letter, incl. & 64 #define xx 19 // <unused> 65 66 // Map byte to one of ~20 interesting categories for cheap tag parsing 67 static const uint8 kCharToSub[256] = { 68 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, 69 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 70 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, 71 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, 72 73 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 74 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 75 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 76 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 77 78 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 79 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 80 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 81 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 82 83 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 84 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 85 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 86 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 87 }; 88 89 #undef LT 90 #undef GT 91 #undef EX 92 #undef HY 93 #undef QU 94 #undef AP 95 #undef SL 96 #undef S_ 97 #undef C_ 98 #undef R_ 99 #undef I_ 100 #undef P_ 101 #undef T_ 102 #undef Y_ 103 #undef L_ 104 #undef E_ 105 #undef CR 106 #undef NL 107 #undef PL 108 #undef xx 109 110 111 #define OK 0 112 #define X_ 1 113 114 // State machine to do cheap parse of non-letter strings incl. tags 115 // advances <tag> 116 // | | 117 // advances <tag> ... </tag> for <script> <style> 118 // | | 119 // advances <!-- ... <tag> ... --> 120 // | | 121 // advances <tag 122 // || (0) 123 // advances <tag <tag2> 124 // || (0) 125 static const uint8 kTagParseTbl_0[] = { 126 // < > ! - " ' / S C R I P T Y L E CR NL PL xx 127 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK 128 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error 129 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* 130 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < 131 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! 132 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- 133 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* 134 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- 135 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- 136 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* 137 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" 138 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' 139 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' 140 141 // < > ! - " ' / S C R I P T Y L E CR NL PL xx 142 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S 143 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC 144 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR 145 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI 146 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP 147 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT 148 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* 149 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< 150 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</ 151 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S 152 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC 153 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR 154 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI 155 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP 156 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT 157 158 // < > ! - " ' / S C R I P T Y L E CR NL PL xx 159 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST 160 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY 161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL 162 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE 163 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* 164 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< 165 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</ 166 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S 167 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST 168 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY 169 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL 170 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE 171 }; 172 173 #undef OK 174 #undef X_ 175 176 177 /* 178 // Convert GetTimeOfDay output to 64-bit usec 179 static inline uint64 Microseconds(const struct timeval& t) { 180 // The SumReducer uses uint64, so convert to (uint64) microseconds, 181 // not (double) seconds. 182 return t.tv_sec * 1000000ULL + t.tv_usec; 183 } 184 */ 185 186 187 // Returns true if character is < > or & 188 bool inline IsSpecial(char c) { 189 if ((c & 0xe0) == 0x20) { 190 return kSpecialSymbol[static_cast<uint8>(c)]; 191 } 192 return false; 193 } 194 195 // Quick Skip to next letter or < > & or to end of string (eos) 196 // Always return is_letter for eos 197 int ScanToLetterOrSpecial(const char* src, int len) { 198 int bytes_consumed; 199 cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, 200 &bytes_consumed); 201 return bytes_consumed; 202 } 203 204 205 206 // src points to non-letter, such as tag-opening '<' 207 // Return length from here to next possible letter 208 // On eos or another < before >, return 1 209 // advances <tag> 210 // | | 211 // advances <tag> ... </tag> for <script> <style> 212 // | | 213 // advances <!-- ... <tag> ... --> 214 // | | 215 // advances <tag 216 // || (1) 217 // advances <tag <tag2> 218 // || (1) 219 int ScanToPossibleLetter(const char* isrc, int len) { 220 const uint8* src = reinterpret_cast<const uint8*>(isrc); 221 const uint8* srclimit = src + len; 222 const uint8* tagParseTbl = kTagParseTbl_0; 223 int e = 0; 224 while (src < srclimit) { 225 e = tagParseTbl[kCharToSub[*src++]]; 226 if ((e & ~1) == 0) { 227 // We overshot by one byte 228 --src; 229 break; 230 } 231 tagParseTbl = &kTagParseTbl_0[e * 20]; 232 } 233 234 if (src >= srclimit) { 235 // We fell off the end of the text. 236 // It looks like the most common case for this is a truncated file, not 237 // mismatched angle brackets. So we pretend that the last char was '>' 238 return len; 239 } 240 241 // OK to be in state 0 or state 2 at exit 242 if ((e != 0) && (e != 2)) { 243 // Error, '<' followed by '<' 244 // We want to back up to first <, then advance by one byte past it 245 int offset = src - reinterpret_cast<const uint8*>(isrc); 246 // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc); 247 248 // Backscan to first '<' and return enough length to just get past it 249 --offset; // back up over the second '<', which caused us to stop 250 while ((0 < offset) && (isrc[offset] != '<')) { 251 // Find the first '<', which is unmatched 252 --offset; 253 } 254 // skip to just beyond first '<' 255 // printf(" returning %d\n", offset + 1); 256 return offset + 1; 257 } 258 259 return src - reinterpret_cast<const uint8*>(isrc); 260 } 261 262 263 264 ScriptScanner::ScriptScanner(const char* buffer, 265 int buffer_length, 266 bool is_plain_text) 267 : start_byte_(buffer), 268 next_byte_(buffer), 269 next_byte_limit_(buffer + buffer_length), 270 byte_length_(buffer_length), 271 is_plain_text_(is_plain_text) { 272 script_buffer_ = new char[getone::kMaxScriptBuffer]; 273 script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; 274 } 275 276 ScriptScanner::~ScriptScanner() { 277 delete[] script_buffer_; 278 delete[] script_buffer_lower_; 279 } 280 281 282 283 284 // Get to the first real non-tag letter or entity that is a letter 285 // Sets script of that letter 286 // Return len if no more letters 287 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { 288 int sc = UNKNOWN_LSCRIPT; 289 int skip = 0; 290 int tlen, plen; 291 292 // Do run of non-letters (tag | &NL | NL)* 293 while (skip < len) { 294 // Do fast scan to next interesting byte 295 // int oldskip = skip; 296 skip += ScanToLetterOrSpecial(src + skip, len - skip); 297 // TEMP 298 // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n", 299 // oldskip, src[oldskip], skip, src[skip]); 300 301 // Check for no more letters/specials 302 if (skip >= len) { 303 // All done 304 return len; 305 } 306 307 // We are at a letter, nonletter, tag, or entity 308 if (IsSpecial(src[skip]) && !is_plain_text_) { 309 if (src[skip] == '<') { 310 // Begining of tag; skip to end and go around again 311 tlen = ScanToPossibleLetter(src + skip, len - skip); 312 sc = 0; 313 // printf("<...> "); 314 } else if (src[skip] == '>') { 315 // Unexpected end of tag; skip it and go around again 316 tlen = 1; // Over the > 317 sc = 0; 318 // printf("..> "); 319 } else if (src[skip] == '&') { 320 // Expand entity, no advance 321 char temp[4]; 322 EntityToBuffer(src + skip, len - skip, 323 temp, &tlen, &plen); 324 sc = getone::GetUTF8LetterScriptNum(temp); 325 // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc); 326 } 327 } else { 328 // Update 1..4 bytes 329 tlen = cld_UniLib::OneCharLen(src + skip); 330 sc = getone::GetUTF8LetterScriptNum(src + skip); 331 // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc); 332 } 333 // TEMP 334 // printf("sc=%d ", sc); 335 if (sc != 0) {break;} // Letter found 336 skip += tlen; // Advance 337 } 338 339 *script = sc; 340 return skip; 341 } 342 343 #ifdef NEED_ALIGNED_LOADS 344 static const bool kNeedsAlignedLoads = true; 345 #else 346 static const bool kNeedsAlignedLoads = false; 347 #endif 348 349 350 // Copy next run of same-script non-tag letters to buffer [NUL terminated] 351 // Buffer has leading space and all text is lowercased 352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) { 353 span->text = script_buffer_; 354 span->text_bytes = 0; 355 span->offset = next_byte_ - start_byte_; 356 span->script = UNKNOWN_LSCRIPT; 357 span->lang = UNKNOWN_LANGUAGE; 358 span->truncated = false; 359 360 // printf("GetOneScriptSpan[[ "); 361 // struct timeval script_start, script_mid, script_end; 362 363 int spanscript; // The script of this span 364 int sc = UNKNOWN_LSCRIPT; // The script of next character 365 int tlen, plen; 366 367 368 script_buffer_[0] = ' '; // Always a space at front of output 369 script_buffer_[1] = '\0'; 370 int take = 0; 371 int put = 1; // Start after the initial space 372 373 // gettimeofday(&script_start, NULL); 374 // Get to the first real non-tag letter or entity that is a letter 375 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); 376 next_byte_ += skip; 377 byte_length_ -= skip; 378 if (byte_length_ <= 0) { 379 // printf("]]\n"); 380 return false; // No more letters to be found 381 } 382 383 // gettimeofday(&script_mid, NULL); 384 385 // There is at least one letter, so we know the script for this span 386 // printf("{%d} ", spanscript); 387 span->script = (UnicodeLScript)spanscript; 388 389 390 // Go over alternating spans of same-script letters and non-letters, 391 // copying letters to buffer with single spaces for each run of non-letters 392 while (take < byte_length_) { 393 // Copy run of letters in same script (&LS | LS)* 394 int letter_count = 0; // Keep track of word length 395 bool need_break = false; 396 while (take < byte_length_) { 397 // We are at a letter, nonletter, tag, or entity 398 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 399 // printf("\"%c\" ", next_byte_[take]); 400 if (next_byte_[take] == '<') { 401 // Begining of tag 402 sc = 0; 403 break; 404 } else if (next_byte_[take] == '>') { 405 // Unexpected end of tag 406 sc = 0; 407 break; 408 } else if (next_byte_[take] == '&') { 409 // Copy entity, no advance 410 EntityToBuffer(next_byte_ + take, byte_length_ - take, 411 script_buffer_ + put, &tlen, &plen); 412 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 413 } 414 } else { 415 // Real letter, safely copy up to 4 bytes, increment by 1..4 416 // Will update by 1..4 bytes at Advance, below 417 tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take); 418 if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) { 419 // Fast case 420 *reinterpret_cast<uint32*>(script_buffer_ + put) = 421 *reinterpret_cast<const uint32*>(next_byte_ + take); 422 } else { 423 // Slow case, happens 1-3 times per input document 424 memcpy(script_buffer_ + put, next_byte_ + take, plen); 425 } 426 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 427 } 428 // printf("sc(%c)=%d ", next_byte_[take], sc); 429 // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen); 430 // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc); 431 432 // Allow continue across a single letter in a different script: 433 // A B D = three scripts, c = common script, i = inherited script, 434 // - = don't care, ( = take position before the += below 435 // AAA(A- continue 436 // 437 // AAA(BA continue 438 // AAA(BB break 439 // AAA(Bc continue (breaks after B) 440 // AAA(BD break 441 // AAA(Bi break 442 // 443 // AAA(c- break 444 // 445 // AAA(i- continue 446 // 447 448 if ((sc != spanscript) && (sc != ULScript_Inherited)) { 449 // Might need to break this script span 450 if (sc == ULScript_Common) { 451 need_break = true; 452 } else { 453 // Look at next following character, ignoring entity as Common 454 int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen); 455 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { 456 need_break = true; 457 } 458 } 459 } 460 if (need_break) {break;} // Non-letter or letter in wrong script 461 462 take += tlen; // Advance 463 put += plen; // Advance 464 ++letter_count; 465 if (put >= getone::kMaxScriptBytes) { 466 // Buffer is full 467 span->truncated = true; 468 break; 469 } 470 } // End while letters 471 472 // Do run of non-letters (tag | &NL | NL)* 473 while (take < byte_length_) { 474 // Do fast scan to next interesting byte 475 take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); 476 477 // Check for no more letters/specials 478 if (take >= byte_length_) { 479 take = byte_length_; 480 break; 481 } 482 483 // We are at a letter, nonletter, tag, or entity 484 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 485 // printf("\"%c\" ", next_byte_[take]); 486 if (next_byte_[take] == '<') { 487 // Begining of tag; skip to end and go around again 488 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take); 489 sc = 0; 490 // printf("<...> "); 491 } else if (next_byte_[take] == '>') { 492 // Unexpected end of tag; skip it and go around again 493 tlen = 1; // Over the > 494 sc = 0; 495 // printf("..> "); 496 } else if (next_byte_[take] == '&') { 497 // Expand entity, no advance 498 EntityToBuffer(next_byte_ + take, byte_length_ - take, 499 script_buffer_ + put, &tlen, &plen); 500 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 501 } 502 } else { 503 // Update 1..4 504 tlen = cld_UniLib::OneCharLen(next_byte_ + take); 505 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 506 } 507 // printf("sc[%c]=%d ", next_byte_[take], sc); 508 if (sc != 0) {break;} // Letter found 509 take += tlen; // Advance 510 } // End while not-letters 511 512 script_buffer_[put++] = ' '; 513 514 // We are at a letter again (or eos), after letter* not-letter* 515 if (sc != spanscript) {break;} // Letter in wrong script 516 if (put >= getone::kMaxScriptBytes - 8) { 517 // Buffer is almost full 518 span->truncated = true; 519 break; 520 } 521 } 522 523 // Update input position 524 next_byte_ += take; 525 byte_length_ -= take; 526 527 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 528 // kMaxScriptBytes | | put 529 script_buffer_[put + 0] = ' '; 530 script_buffer_[put + 1] = ' '; 531 script_buffer_[put + 2] = ' '; 532 script_buffer_[put + 3] = '\0'; 533 534 span->text_bytes = put; // Does not include the last four chars above 535 536 // printf(" %d]]\n\n", put); 537 return true; 538 } 539 540 // Force Latin, Cyrillic, Greek scripts to be lowercase 541 void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) { 542 // On Windows, text is lowercased beforehand, so no need to do anything here. 543 #if !defined(CLD_WINDOWS) 544 // If needed, lowercase all the text. If we do it sooner, might miss 545 // lowercasing an entity such as Á 546 // We only need to do this for Latn and Cyrl scripts 547 if ((span->script == ULScript_Latin) || 548 (span->script == ULScript_Cyrillic) || 549 (span->script == ULScript_Greek)) { 550 // Full Unicode lowercase of the entire buffer, including 551 // four pad bytes off the end 552 int consumed, filled; 553 UniLib::ToLower(span->text, span->text_bytes + 4, 554 script_buffer_lower_, getone::kMaxScriptLowerBuffer, 555 &consumed, &filled); 556 span->text = script_buffer_lower_; 557 span->text_bytes = filled - 4; 558 } 559 #endif 560 } 561 562 // Copy next run of same-script non-tag letters to buffer [NUL terminated] 563 // Force Latin and Cyrillic scripts to be lowercase 564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) { 565 bool ok = GetOneScriptSpan(span); 566 LowerScriptSpan(span); 567 return ok; 568 } 569 570 // Gets lscript number for letters; always returns 571 // 0 (common script) for non-letters 572 int getone::GetUTF8LetterScriptNum(const char* src) { 573 int srclen = cld_UniLib::OneCharLen(src); 574 const uint8* usrc = reinterpret_cast<const uint8*>(src); 575 return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen); 576 } 577