Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "encodings/compact_lang_det/getonescriptspan.h"
      6 #include <stdio.h>
      7 #include <string.h>
      8 
      9 #include "base/basictypes.h"
     10 #include "encodings/lang_enc.h"
     11 #include "encodings/compact_lang_det/utf8propjustletter.h"
     12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
     13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
     14 
     15 #include "encodings/compact_lang_det/win/cld_basictypes.h"
     16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
     17 #include "encodings/compact_lang_det/win/cld_google.h"
     18 #include "encodings/compact_lang_det/win/cld_htmlutils.h"
     19 #include "encodings/compact_lang_det/win/cld_unilib.h"
     20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
     21 #include "encodings/compact_lang_det/win/cld_utf8utils.h"
     22 
     23 static const Language GRAY_LANG = (Language)254;
     24 
     25 static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
     26                                                   // else make shorter
     27 static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
     28                                                   // to round to word boundary,
     29                                                   // direction above
     30 
     31 static const char kSpecialSymbol[256] = {       // true for < > &
     32   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     33   0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
     34   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     35   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     36 
     37   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     38   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     39   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     40   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
     41 };
     42 
     43 
     44 
// Column numbers used by the tag-parsing state table. The short names are
// macros only for the duration of the kCharToSub initializer below and are
// #undef'd immediately afterwards.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Indexed by the raw byte value; the result is a column number into a row of
// the tag-parsing state table. Upper- and lower-case ASCII letters map to the
// same category.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,   // 0x00
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x10
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,   // 0x20
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,   // 0x30

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x40
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x50
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x60
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x70

  // 0x80..0xbf: all treated as non-letters
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  // 0xc0..0xff: all treated as possible letters
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
    109 
    110 
// Terminal table entries: both stop the scan in ScanToPossibleLetter.
#define OK 0      // stop: current byte may be a letter
#define X_ 1      // stop: error

// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// Layout: one row of 20 entries per state; the column is the kCharToSub
// category of the current byte and the entry is the next state number.
// The trailing [n] comment on each row is that row's state number and the
// input prefix that leads to it.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
    175 
    176 
    177 /*
    178 // Convert GetTimeOfDay output to 64-bit usec
    179 static inline uint64 Microseconds(const struct timeval& t) {
    180   // The SumReducer uses uint64, so convert to (uint64) microseconds,
    181   // not (double) seconds.
    182   return t.tv_sec * 1000000ULL + t.tv_usec;
    183 }
    184 */
    185 
    186 
    187 // Returns true if character is < > or &
    188 bool inline IsSpecial(char c) {
    189   if ((c & 0xe0) == 0x20) {
    190     return kSpecialSymbol[static_cast<uint8>(c)];
    191   }
    192   return false;
    193 }
    194 
    195 // Quick Skip to next letter or < > & or to end of string (eos)
    196 // Always return is_letter for eos
    197 int ScanToLetterOrSpecial(const char* src, int len) {
    198   int bytes_consumed;
    199   cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
    200                        &bytes_consumed);
    201   return bytes_consumed;
    202 }
    203 
    204 
    205 
    206 // src points to non-letter, such as tag-opening '<'
    207 // Return length from here to next possible letter
    208 // On eos or another < before >, return 1
    209 // advances <tag>
    210 //          |    |
    211 // advances <tag> ... </tag>  for <script> <style>
    212 //          |               |
    213 // advances <!-- ... <tag> ... -->
    214 //          |                     |
    215 // advances <tag
    216 //          ||  (1)
    217 // advances <tag <tag2>
    218 //          ||  (1)
    219 int ScanToPossibleLetter(const char* isrc, int len) {
    220   const uint8* src = reinterpret_cast<const uint8*>(isrc);
    221   const uint8* srclimit = src + len;
    222   const uint8* tagParseTbl = kTagParseTbl_0;
    223   int e = 0;
    224   while (src < srclimit) {
    225     e = tagParseTbl[kCharToSub[*src++]];
    226     if ((e & ~1) == 0) {
    227       // We overshot by one byte
    228       --src;
    229       break;
    230     }
    231     tagParseTbl = &kTagParseTbl_0[e * 20];
    232   }
    233 
    234   if (src >= srclimit) {
    235     // We fell off the end of the text.
    236     // It looks like the most common case for this is a truncated file, not
    237     // mismatched angle brackets. So we pretend that the last char was '>'
    238     return len;
    239   }
    240 
    241   // OK to be in state 0 or state 2 at exit
    242   if ((e != 0) && (e != 2)) {
    243     // Error, '<' followed by '<'
    244     // We want to back up to first <, then advance by one byte past it
    245     int offset = src - reinterpret_cast<const uint8*>(isrc);
    246     // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
    247 
    248     // Backscan to first '<' and return enough length to just get past it
    249     --offset;   // back up over the second '<', which caused us to stop
    250     while ((0 < offset) && (isrc[offset] != '<')) {
    251       // Find the first '<', which is unmatched
    252       --offset;
    253     }
    254     // skip to just beyond first '<'
    255     // printf("  returning %d\n", offset + 1);
    256     return offset + 1;
    257   }
    258 
    259   return src - reinterpret_cast<const uint8*>(isrc);
    260 }
    261 
    262 
    263 
    264 ScriptScanner::ScriptScanner(const char* buffer,
    265                              int buffer_length,
    266                              bool is_plain_text)
    267   : start_byte_(buffer),
    268   next_byte_(buffer),
    269   next_byte_limit_(buffer + buffer_length),
    270   byte_length_(buffer_length),
    271   is_plain_text_(is_plain_text) {
    272     script_buffer_ = new char[getone::kMaxScriptBuffer];
    273     script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
    274 }
    275 
    276 ScriptScanner::~ScriptScanner() {
    277   delete[] script_buffer_;
    278   delete[] script_buffer_lower_;
    279 }
    280 
    281 
    282 
    283 
    284 // Get to the first real non-tag letter or entity that is a letter
    285 // Sets script of that letter
    286 // Return len if no more letters
    287 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
    288   int sc = UNKNOWN_LSCRIPT;
    289   int skip = 0;
    290   int tlen, plen;
    291 
    292   // Do run of non-letters (tag | &NL | NL)*
    293   while (skip < len) {
    294     // Do fast scan to next interesting byte
    295     // int oldskip = skip;
    296     skip += ScanToLetterOrSpecial(src + skip, len - skip);
    297     // TEMP
    298     // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
    299     //       oldskip, src[oldskip], skip, src[skip]);
    300 
    301     // Check for no more letters/specials
    302     if (skip >= len) {
    303       // All done
    304       return len;
    305     }
    306 
    307     // We are at a letter, nonletter, tag, or entity
    308     if (IsSpecial(src[skip]) && !is_plain_text_) {
    309       if (src[skip] == '<') {
    310         // Begining of tag; skip to end and go around again
    311         tlen = ScanToPossibleLetter(src + skip, len - skip);
    312         sc = 0;
    313         // printf("<...> ");
    314       } else if (src[skip] == '>') {
    315         // Unexpected end of tag; skip it and go around again
    316         tlen = 1;         // Over the >
    317         sc = 0;
    318         // printf("..> ");
    319       } else if (src[skip] == '&') {
    320         // Expand entity, no advance
    321         char temp[4];
    322         EntityToBuffer(src + skip, len - skip,
    323                        temp, &tlen, &plen);
    324         sc = getone::GetUTF8LetterScriptNum(temp);
    325         // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
    326       }
    327     } else {
    328       // Update 1..4 bytes
    329       tlen = cld_UniLib::OneCharLen(src + skip);
    330       sc = getone::GetUTF8LetterScriptNum(src + skip);
    331       // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
    332     }
    333     // TEMP
    334     // printf("sc=%d ", sc);
    335     if (sc != 0) {break;}           // Letter found
    336     skip += tlen;                   // Advance
    337   }
    338 
    339   *script = sc;
    340   return skip;
    341 }
    342 
    343 #ifdef NEED_ALIGNED_LOADS
    344 static const bool kNeedsAlignedLoads = true;
    345 #else
    346 static const bool kNeedsAlignedLoads = false;
    347 #endif
    348 
    349 
    350 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    351 // Buffer has leading space and all text is lowercased
    352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
    353   span->text = script_buffer_;
    354   span->text_bytes = 0;
    355   span->offset = next_byte_ - start_byte_;
    356   span->script = UNKNOWN_LSCRIPT;
    357   span->lang = UNKNOWN_LANGUAGE;
    358   span->truncated = false;
    359 
    360   // printf("GetOneScriptSpan[[ ");
    361   // struct timeval script_start, script_mid, script_end;
    362 
    363   int spanscript;           // The script of this span
    364   int sc = UNKNOWN_LSCRIPT;  // The script of next character
    365   int tlen, plen;
    366 
    367 
    368   script_buffer_[0] = ' ';  // Always a space at front of output
    369   script_buffer_[1] = '\0';
    370   int take = 0;
    371   int put = 1;              // Start after the initial space
    372 
    373   // gettimeofday(&script_start, NULL);
    374   // Get to the first real non-tag letter or entity that is a letter
    375   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
    376   next_byte_ += skip;
    377   byte_length_ -= skip;
    378   if (byte_length_ <= 0) {
    379     // printf("]]\n");
    380     return false;               // No more letters to be found
    381   }
    382 
    383   // gettimeofday(&script_mid, NULL);
    384 
    385   // There is at least one letter, so we know the script for this span
    386   // printf("{%d} ", spanscript);
    387   span->script = (UnicodeLScript)spanscript;
    388 
    389 
    390   // Go over alternating spans of same-script letters and non-letters,
    391   // copying letters to buffer with single spaces for each run of non-letters
    392   while (take < byte_length_) {
    393     // Copy run of letters in same script (&LS | LS)*
    394     int letter_count = 0;              // Keep track of word length
    395     bool need_break = false;
    396     while (take < byte_length_) {
    397       // We are at a letter, nonletter, tag, or entity
    398       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
    399         // printf("\"%c\" ", next_byte_[take]);
    400         if (next_byte_[take] == '<') {
    401           // Begining of tag
    402           sc = 0;
    403           break;
    404         } else if (next_byte_[take] == '>') {
    405           // Unexpected end of tag
    406           sc = 0;
    407           break;
    408         } else if (next_byte_[take] == '&') {
    409           // Copy entity, no advance
    410           EntityToBuffer(next_byte_ + take, byte_length_ - take,
    411                          script_buffer_ + put, &tlen, &plen);
    412           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
    413         }
    414       } else {
    415         // Real letter, safely copy up to 4 bytes, increment by 1..4
    416         // Will update by 1..4 bytes at Advance, below
    417         tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
    418         if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
    419           // Fast case
    420           *reinterpret_cast<uint32*>(script_buffer_ + put) =
    421             *reinterpret_cast<const uint32*>(next_byte_ + take);
    422         } else {
    423           // Slow case, happens 1-3 times per input document
    424           memcpy(script_buffer_ + put, next_byte_ + take, plen);
    425         }
    426         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
    427       }
    428       // printf("sc(%c)=%d ", next_byte_[take], sc);
    429       // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
    430       // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
    431 
    432       // Allow continue across a single letter in a different script:
    433       // A B D = three scripts, c = common script, i = inherited script,
    434       // - = don't care, ( = take position before the += below
    435       //  AAA(A-    continue
    436       //
    437       //  AAA(BA    continue
    438       //  AAA(BB    break
    439       //  AAA(Bc    continue (breaks after B)
    440       //  AAA(BD    break
    441       //  AAA(Bi    break
    442       //
    443       //  AAA(c-    break
    444       //
    445       //  AAA(i-    continue
    446       //
    447 
    448       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
    449         // Might need to break this script span
    450         if (sc == ULScript_Common) {
    451           need_break = true;
    452         } else {
    453           // Look at next following character, ignoring entity as Common
    454           int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
    455           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
    456             need_break = true;
    457           }
    458         }
    459       }
    460       if (need_break) {break;}  // Non-letter or letter in wrong script
    461 
    462       take += tlen;                   // Advance
    463       put += plen;                    // Advance
    464       ++letter_count;
    465       if (put >= getone::kMaxScriptBytes) {
    466         // Buffer is full
    467         span->truncated = true;
    468         break;
    469       }
    470     }     // End while letters
    471 
    472     // Do run of non-letters (tag | &NL | NL)*
    473     while (take < byte_length_) {
    474       // Do fast scan to next interesting byte
    475       take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
    476 
    477       // Check for no more letters/specials
    478       if (take >= byte_length_) {
    479         take = byte_length_;
    480         break;
    481       }
    482 
    483       // We are at a letter, nonletter, tag, or entity
    484       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
    485         // printf("\"%c\" ", next_byte_[take]);
    486         if (next_byte_[take] == '<') {
    487           // Begining of tag; skip to end and go around again
    488           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
    489           sc = 0;
    490           // printf("<...> ");
    491         } else if (next_byte_[take] == '>') {
    492           // Unexpected end of tag; skip it and go around again
    493           tlen = 1;         // Over the >
    494           sc = 0;
    495           // printf("..> ");
    496         } else if (next_byte_[take] == '&') {
    497           // Expand entity, no advance
    498           EntityToBuffer(next_byte_ + take, byte_length_ - take,
    499                          script_buffer_ + put, &tlen, &plen);
    500           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
    501         }
    502       } else {
    503         // Update 1..4
    504         tlen = cld_UniLib::OneCharLen(next_byte_ + take);
    505         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
    506       }
    507       // printf("sc[%c]=%d ", next_byte_[take], sc);
    508       if (sc != 0) {break;}           // Letter found
    509       take += tlen;                   // Advance
    510     }     // End while not-letters
    511 
    512     script_buffer_[put++] = ' ';
    513 
    514     // We are at a letter again (or eos), after letter* not-letter*
    515     if (sc != spanscript) {break;}            // Letter in wrong script
    516     if (put >= getone::kMaxScriptBytes - 8) {
    517       // Buffer is almost full
    518       span->truncated = true;
    519       break;
    520     }
    521   }
    522 
    523   // Update input position
    524   next_byte_ += take;
    525   byte_length_ -= take;
    526 
    527   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
    528   //                          kMaxScriptBytes |   | put
    529   script_buffer_[put + 0] = ' ';
    530   script_buffer_[put + 1] = ' ';
    531   script_buffer_[put + 2] = ' ';
    532   script_buffer_[put + 3] = '\0';
    533 
    534   span->text_bytes = put;       // Does not include the last four chars above
    535 
    536   // printf(" %d]]\n\n", put);
    537   return true;
    538 }
    539 
    540 // Force Latin, Cyrillic, Greek scripts to be lowercase
    541 void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
    542   // On Windows, text is lowercased beforehand, so no need to do anything here.
    543 #if !defined(CLD_WINDOWS)
    544   // If needed, lowercase all the text. If we do it sooner, might miss
    545   // lowercasing an entity such as &Aacute;
    546   // We only need to do this for Latn and Cyrl scripts
    547   if ((span->script == ULScript_Latin) ||
    548       (span->script == ULScript_Cyrillic) ||
    549       (span->script == ULScript_Greek)) {
    550     // Full Unicode lowercase of the entire buffer, including
    551     // four pad bytes off the end
    552     int consumed, filled;
    553     UniLib::ToLower(span->text, span->text_bytes + 4,
    554                     script_buffer_lower_, getone::kMaxScriptLowerBuffer,
    555                     &consumed, &filled);
    556     span->text = script_buffer_lower_;
    557     span->text_bytes = filled - 4;
    558   }
    559 #endif
    560 }
    561 
    562 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
    563 // Force Latin and Cyrillic scripts to be lowercase
    564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
    565   bool ok = GetOneScriptSpan(span);
    566   LowerScriptSpan(span);
    567   return ok;
    568 }
    569 
    570 // Gets lscript number for letters; always returns
    571 //   0 (common script) for non-letters
    572 int getone::GetUTF8LetterScriptNum(const char* src) {
    573   int srclen = cld_UniLib::OneCharLen(src);
    574   const uint8* usrc = reinterpret_cast<const uint8*>(src);
    575   return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
    576 }
    577