Home | History | Annotate | Download | only in win
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
      6 
      7 #include "base/basictypes.h"
      8 
      9 // Return true if current Tbl pointer is within state0 range
     10 // Note that unsigned compare checks both ends of range simultaneously
     11 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
     12   const uint8* Tbl0 = &st->state_table[st->state0];
     13   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
     14 }
     15 
     16 
     17 // Look up property of one UTF-8 character and advance over it
     18 // Return 0 if input length is zero
     19 // Return 0 and advance one byte if input is ill-formed
     20 uint8 UTF8GenericProperty(const UTF8PropObj* st,
     21                           const uint8** src,
     22                           int* srclen) {
     23   if (*srclen <= 0) {
     24     return 0;
     25   }
     26 
     27   const uint8* lsrc = *src;
     28   const uint8* Tbl_0 = &st->state_table[st->state0];
     29   const uint8* Tbl = Tbl_0;
     30   int e;
     31   int eshift = st->entry_shift;
     32 
     33   // Short series of tests faster than switch, optimizes 7-bit ASCII
     34   unsigned char c = lsrc[0];
     35   if (static_cast<signed char>(c) >= 0) {           // one byte
     36     e = Tbl[c];
     37     *src += 1;
     38     *srclen -= 1;
     39   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
     40     e = Tbl[c];
     41     Tbl = &Tbl_0[e << eshift];
     42     e = Tbl[lsrc[1]];
     43     *src += 2;
     44     *srclen -= 2;
     45   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
     46     e = Tbl[c];
     47     Tbl = &Tbl_0[e << eshift];
     48     e = Tbl[lsrc[1]];
     49     Tbl = &Tbl_0[e << eshift];
     50     e = Tbl[lsrc[2]];
     51     *src += 3;
     52     *srclen -= 3;
     53   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
     54     e = Tbl[c];
     55     Tbl = &Tbl_0[e << eshift];
     56     e = Tbl[lsrc[1]];
     57     Tbl = &Tbl_0[e << eshift];
     58     e = Tbl[lsrc[2]];
     59     Tbl = &Tbl_0[e << eshift];
     60     e = Tbl[lsrc[3]];
     61     *src += 4;
     62     *srclen -= 4;
     63   } else {                                                // Ill-formed
     64     e = 0;
     65     *src += 1;
     66     *srclen -= 1;
     67   }
     68   return e;
     69 }
     70 
     71 // BigOneByte versions are needed for tables > 240 states, but most
     72 // won't need the TwoByte versions.
     73 // Internally, to next-to-last offset is multiplied by 16 and the last
     74 // offset is relative instead of absolute.
     75 // Look up property of one UTF-8 character and advance over it
     76 // Return 0 if input length is zero
     77 // Return 0 and advance one byte if input is ill-formed
     78 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
     79                           const uint8** src,
     80                           int* srclen) {
     81   if (*srclen <= 0) {
     82     return 0;
     83   }
     84 
     85   const uint8* lsrc = *src;
     86   const uint8* Tbl_0 = &st->state_table[st->state0];
     87   const uint8* Tbl = Tbl_0;
     88   int e;
     89   int eshift = st->entry_shift;
     90 
     91   // Short series of tests faster than switch, optimizes 7-bit ASCII
     92   unsigned char c = lsrc[0];
     93   if (static_cast<signed char>(c) >= 0) {           // one byte
     94     e = Tbl[c];
     95     *src += 1;
     96     *srclen -= 1;
     97   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
     98     e = Tbl[c];
     99     Tbl = &Tbl_0[e << eshift];
    100     e = Tbl[lsrc[1]];
    101     *src += 2;
    102     *srclen -= 2;
    103   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
    104     e = Tbl[c];
    105     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
    106     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
    107     Tbl = &Tbl[e << eshift];          // Relative +/-
    108     e = Tbl[lsrc[2]];
    109     *src += 3;
    110     *srclen -= 3;
    111   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
    112     e = Tbl[c];
    113     Tbl = &Tbl_0[e << eshift];
    114     e = Tbl[lsrc[1]];
    115     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
    116     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
    117     Tbl = &Tbl[e << eshift];          // Relative +/-
    118     e = Tbl[lsrc[3]];
    119     *src += 4;
    120     *srclen -= 4;
    121   } else {                                                // Ill-formed
    122     e = 0;
    123     *src += 1;
    124     *srclen -= 1;
    125   }
    126   return e;
    127 }
    128 
    129 // Scan a UTF-8 stringpiece based on a state table.
    130 // Always scan complete UTF-8 characters
    131 // Set number of bytes scanned. Return reason for exiting
    132 int UTF8GenericScan(const UTF8ScanObj* st,
    133                     const uint8* str,
    134                     const int len,
    135                     int* bytes_consumed) {
    136   int eshift = st->entry_shift;        // 6 (space optimized) or 8
    137   // int nEntries = (1 << eshift);       // 64 or 256 entries per state
    138 
    139   const uint8* isrc = str;
    140     //reinterpret_cast<const uint8*>(str.data());
    141   const uint8* src = isrc;
    142   //const int len = str.length();
    143   const uint8* srclimit = isrc + len;
    144   const uint8* srclimit8 = srclimit - 7;
    145   *bytes_consumed = 0;
    146   if (len == 0) return kExitOK;
    147 
    148   const uint8* Tbl_0 = &st->state_table[st->state0];
    149 
    150 DoAgain:
    151   // Do state-table scan
    152   int e = 0;
    153   uint8 c;
    154 
    155   // Do fast for groups of 8 identity bytes.
    156   // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
    157   // including slowing slightly on cr/lf/ht
    158   //----------------------------
    159   const uint8* Tbl2 = &st->fast_state[0];
    160   uint32 losub = st->losub;
    161   uint32 hiadd = st->hiadd;
    162   while (src < srclimit8) {
    163     uint32 s0123 = UnalignedLoad32(src);
    164     uint32 s4567 = UnalignedLoad32(src + 4);
    165     src += 8;
    166     // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
    167     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
    168                   (s4567 - losub) | (s4567 + hiadd);
    169     if ((temp & 0x80808080) != 0) {
    170       // We typically end up here on cr/lf/ht; src was incremented
    171       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
    172                   (Tbl2[src[-6]] | Tbl2[src[-5]]);
    173       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
    174       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
    175               (Tbl2[src[-2]] | Tbl2[src[-1]]);
    176       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
    177       // Else OK, go around again
    178     }
    179   }
    180   //----------------------------
    181 
    182   // Byte-at-a-time scan
    183   //----------------------------
    184   const uint8* Tbl = Tbl_0;
    185   while (src < srclimit) {
    186     c = *src;
    187     e = Tbl[c];
    188     src++;
    189     if (e >= kExitIllegalStructure) {break;}
    190     Tbl = &Tbl_0[e << eshift];
    191   }
    192   //----------------------------
    193 
    194 
    195   // Exit posibilities:
    196   //  Some exit code, !state0, back up over last char
    197   //  Some exit code, state0, back up one byte exactly
    198   //  source consumed, !state0, back up over partial char
    199   //  source consumed, state0, exit OK
    200   // For illegal byte in state0, avoid backup up over PREVIOUS char
    201   // For truncated last char, back up to beginning of it
    202 
    203   if (e >= kExitIllegalStructure) {
    204     // Back up over exactly one byte of rejected/illegal UTF-8 character
    205     src--;
    206     // Back up more if needed
    207     if (!InStateZero(st, Tbl)) {
    208       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
    209     }
    210   } else if (!InStateZero(st, Tbl)) {
    211     // Back up over truncated UTF-8 character
    212     e = kExitIllegalStructure;
    213     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
    214   } else {
    215     // Normal termination, source fully consumed
    216     e = kExitOK;
    217   }
    218 
    219   if (e == kExitDoAgain) {
    220     // Loop back up to the fast scan
    221     goto DoAgain;
    222   }
    223 
    224   *bytes_consumed = src - isrc;
    225   return e;
    226 }
    227