1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h" 6 7 #include "base/basictypes.h" 8 9 // Return true if current Tbl pointer is within state0 range 10 // Note that unsigned compare checks both ends of range simultaneously 11 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { 12 const uint8* Tbl0 = &st->state_table[st->state0]; 13 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 14 } 15 16 17 // Look up property of one UTF-8 character and advance over it 18 // Return 0 if input length is zero 19 // Return 0 and advance one byte if input is ill-formed 20 uint8 UTF8GenericProperty(const UTF8PropObj* st, 21 const uint8** src, 22 int* srclen) { 23 if (*srclen <= 0) { 24 return 0; 25 } 26 27 const uint8* lsrc = *src; 28 const uint8* Tbl_0 = &st->state_table[st->state0]; 29 const uint8* Tbl = Tbl_0; 30 int e; 31 int eshift = st->entry_shift; 32 33 // Short series of tests faster than switch, optimizes 7-bit ASCII 34 unsigned char c = lsrc[0]; 35 if (static_cast<signed char>(c) >= 0) { // one byte 36 e = Tbl[c]; 37 *src += 1; 38 *srclen -= 1; 39 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes 40 e = Tbl[c]; 41 Tbl = &Tbl_0[e << eshift]; 42 e = Tbl[lsrc[1]]; 43 *src += 2; 44 *srclen -= 2; 45 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes 46 e = Tbl[c]; 47 Tbl = &Tbl_0[e << eshift]; 48 e = Tbl[lsrc[1]]; 49 Tbl = &Tbl_0[e << eshift]; 50 e = Tbl[lsrc[2]]; 51 *src += 3; 52 *srclen -= 3; 53 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes 54 e = Tbl[c]; 55 Tbl = &Tbl_0[e << eshift]; 56 e = Tbl[lsrc[1]]; 57 Tbl = &Tbl_0[e << eshift]; 58 e = Tbl[lsrc[2]]; 59 Tbl = &Tbl_0[e << eshift]; 60 e = Tbl[lsrc[3]]; 61 *src += 4; 62 *srclen -= 4; 63 } else { // Ill-formed 64 e = 0; 65 *src += 1; 66 *srclen -= 1; 67 } 68 return e; 69 } 70 71 // BigOneByte versions are needed for tables > 240 states, but most 72 // won't need the TwoByte versions. 73 // Internally, to next-to-last offset is multiplied by 16 and the last 74 // offset is relative instead of absolute. 75 // Look up property of one UTF-8 character and advance over it 76 // Return 0 if input length is zero 77 // Return 0 and advance one byte if input is ill-formed 78 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, 79 const uint8** src, 80 int* srclen) { 81 if (*srclen <= 0) { 82 return 0; 83 } 84 85 const uint8* lsrc = *src; 86 const uint8* Tbl_0 = &st->state_table[st->state0]; 87 const uint8* Tbl = Tbl_0; 88 int e; 89 int eshift = st->entry_shift; 90 91 // Short series of tests faster than switch, optimizes 7-bit ASCII 92 unsigned char c = lsrc[0]; 93 if (static_cast<signed char>(c) >= 0) { // one byte 94 e = Tbl[c]; 95 *src += 1; 96 *srclen -= 1; 97 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes 98 e = Tbl[c]; 99 Tbl = &Tbl_0[e << eshift]; 100 e = Tbl[lsrc[1]]; 101 *src += 2; 102 *srclen -= 2; 103 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes 104 e = Tbl[c]; 105 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 106 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; 107 Tbl = &Tbl[e << eshift]; // Relative +/- 108 e = Tbl[lsrc[2]]; 109 *src += 3; 110 *srclen -= 3; 111 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes 112 e = Tbl[c]; 113 Tbl = &Tbl_0[e << eshift]; 114 e = Tbl[lsrc[1]]; 115 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 116 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; 117 Tbl = &Tbl[e << eshift]; // Relative +/- 118 e = Tbl[lsrc[3]]; 119 *src += 4; 120 *srclen -= 4; 121 } else { // Ill-formed 122 e = 0; 123 *src += 1; 124 *srclen -= 1; 125 } 126 return e; 127 } 128 129 // Scan a UTF-8 stringpiece based on a state table. 130 // Always scan complete UTF-8 characters 131 // Set number of bytes scanned. Return reason for exiting 132 int UTF8GenericScan(const UTF8ScanObj* st, 133 const uint8* str, 134 const int len, 135 int* bytes_consumed) { 136 int eshift = st->entry_shift; // 6 (space optimized) or 8 137 // int nEntries = (1 << eshift); // 64 or 256 entries per state 138 139 const uint8* isrc = str; 140 //reinterpret_cast<const uint8*>(str.data()); 141 const uint8* src = isrc; 142 //const int len = str.length(); 143 const uint8* srclimit = isrc + len; 144 const uint8* srclimit8 = srclimit - 7; 145 *bytes_consumed = 0; 146 if (len == 0) return kExitOK; 147 148 const uint8* Tbl_0 = &st->state_table[st->state0]; 149 150 DoAgain: 151 // Do state-table scan 152 int e = 0; 153 uint8 c; 154 155 // Do fast for groups of 8 identity bytes. 156 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, 157 // including slowing slightly on cr/lf/ht 158 //---------------------------- 159 const uint8* Tbl2 = &st->fast_state[0]; 160 uint32 losub = st->losub; 161 uint32 hiadd = st->hiadd; 162 while (src < srclimit8) { 163 uint32 s0123 = UnalignedLoad32(src); 164 uint32 s4567 = UnalignedLoad32(src + 4); 165 src += 8; 166 // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 167 uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 168 (s4567 - losub) | (s4567 + hiadd); 169 if ((temp & 0x80808080) != 0) { 170 // We typically end up here on cr/lf/ht; src was incremented 171 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 172 (Tbl2[src[-6]] | Tbl2[src[-5]]); 173 if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange 174 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 175 (Tbl2[src[-2]] | Tbl2[src[-1]]); 176 if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange 177 // Else OK, go around again 178 } 179 } 180 //---------------------------- 181 182 // Byte-at-a-time scan 183 //---------------------------- 184 const uint8* Tbl = Tbl_0; 185 while (src < srclimit) { 186 c = *src; 187 e = Tbl[c]; 188 src++; 189 if (e >= kExitIllegalStructure) {break;} 190 Tbl = &Tbl_0[e << eshift]; 191 } 192 //---------------------------- 193 194 195 // Exit posibilities: 196 // Some exit code, !state0, back up over last char 197 // Some exit code, state0, back up one byte exactly 198 // source consumed, !state0, back up over partial char 199 // source consumed, state0, exit OK 200 // For illegal byte in state0, avoid backup up over PREVIOUS char 201 // For truncated last char, back up to beginning of it 202 203 if (e >= kExitIllegalStructure) { 204 // Back up over exactly one byte of rejected/illegal UTF-8 character 205 src--; 206 // Back up more if needed 207 if (!InStateZero(st, Tbl)) { 208 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 209 } 210 } else if (!InStateZero(st, Tbl)) { 211 // Back up over truncated UTF-8 character 212 e = kExitIllegalStructure; 213 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 214 } else { 215 // Normal termination, source fully consumed 216 e = kExitOK; 217 } 218 219 if (e == kExitDoAgain) { 220 // Loop back up to the fast scan 221 goto DoAgain; 222 } 223 224 *bytes_consumed = src - isrc; 225 return e; 226 } 227