1 /* 2 * Copyright (C) 2007 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "UTF8.h" 28 29 namespace WTF { 30 namespace Unicode { 31 32 inline int inlineUTF8SequenceLengthNonASCII(char b0) 33 { 34 if ((b0 & 0xC0) != 0xC0) 35 return 0; 36 if ((b0 & 0xE0) == 0xC0) 37 return 2; 38 if ((b0 & 0xF0) == 0xE0) 39 return 3; 40 if ((b0 & 0xF8) == 0xF0) 41 return 4; 42 return 0; 43 } 44 45 inline int inlineUTF8SequenceLength(char b0) 46 { 47 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 48 } 49 50 int UTF8SequenceLength(char b0) 51 { 52 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 53 } 54 55 int decodeUTF8Sequence(const char* sequence) 56 { 57 // Handle 0-byte sequences (never valid). 58 const unsigned char b0 = sequence[0]; 59 const int length = inlineUTF8SequenceLength(b0); 60 if (length == 0) 61 return -1; 62 63 // Handle 1-byte sequences (plain ASCII). 64 const unsigned char b1 = sequence[1]; 65 if (length == 1) { 66 if (b1) 67 return -1; 68 return b0; 69 } 70 71 // Handle 2-byte sequences. 72 if ((b1 & 0xC0) != 0x80) 73 return -1; 74 const unsigned char b2 = sequence[2]; 75 if (length == 2) { 76 if (b2) 77 return -1; 78 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); 79 if (c < 0x80) 80 return -1; 81 return c; 82 } 83 84 // Handle 3-byte sequences. 85 if ((b2 & 0xC0) != 0x80) 86 return -1; 87 const unsigned char b3 = sequence[3]; 88 if (length == 3) { 89 if (b3) 90 return -1; 91 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); 92 if (c < 0x800) 93 return -1; 94 // UTF-16 surrogates should never appear in UTF-8 data. 95 if (c >= 0xD800 && c <= 0xDFFF) 96 return -1; 97 return c; 98 } 99 100 // Handle 4-byte sequences. 101 if ((b3 & 0xC0) != 0x80) 102 return -1; 103 const unsigned char b4 = sequence[4]; 104 if (length == 4) { 105 if (b4) 106 return -1; 107 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); 108 if (c < 0x10000 || c > 0x10FFFF) 109 return -1; 110 return c; 111 } 112 113 return -1; 114 } 115 116 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 117 // into the first byte, depending on how many bytes follow. There are 118 // as many entries in this table as there are UTF-8 sequence types. 119 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs 120 // for *legal* UTF-8 will be 4 or fewer bytes total. 121 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 122 123 ConversionResult convertUTF16ToUTF8( 124 const UChar** sourceStart, const UChar* sourceEnd, 125 char** targetStart, char* targetEnd, bool strict) 126 { 127 ConversionResult result = conversionOK; 128 const UChar* source = *sourceStart; 129 char* target = *targetStart; 130 while (source < sourceEnd) { 131 UChar32 ch; 132 unsigned short bytesToWrite = 0; 133 const UChar32 byteMask = 0xBF; 134 const UChar32 byteMark = 0x80; 135 const UChar* oldSource = source; // In case we have to back up because of target overflow. 136 ch = static_cast<unsigned short>(*source++); 137 // If we have a surrogate pair, convert to UChar32 first. 138 if (ch >= 0xD800 && ch <= 0xDBFF) { 139 // If the 16 bits following the high surrogate are in the source buffer... 140 if (source < sourceEnd) { 141 UChar32 ch2 = static_cast<unsigned short>(*source); 142 // If it's a low surrogate, convert to UChar32. 143 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 144 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 145 ++source; 146 } else if (strict) { // it's an unpaired high surrogate 147 --source; // return to the illegal value itself 148 result = sourceIllegal; 149 break; 150 } 151 } else { // We don't have the 16 bits following the high surrogate. 152 --source; // return to the high surrogate 153 result = sourceExhausted; 154 break; 155 } 156 } else if (strict) { 157 // UTF-16 surrogate values are illegal in UTF-32 158 if (ch >= 0xDC00 && ch <= 0xDFFF) { 159 --source; // return to the illegal value itself 160 result = sourceIllegal; 161 break; 162 } 163 } 164 // Figure out how many bytes the result will require 165 if (ch < (UChar32)0x80) { 166 bytesToWrite = 1; 167 } else if (ch < (UChar32)0x800) { 168 bytesToWrite = 2; 169 } else if (ch < (UChar32)0x10000) { 170 bytesToWrite = 3; 171 } else if (ch < (UChar32)0x110000) { 172 bytesToWrite = 4; 173 } else { 174 bytesToWrite = 3; 175 ch = 0xFFFD; 176 } 177 178 target += bytesToWrite; 179 if (target > targetEnd) { 180 source = oldSource; // Back up source pointer! 181 target -= bytesToWrite; 182 result = targetExhausted; 183 break; 184 } 185 switch (bytesToWrite) { // note: everything falls through. 186 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 187 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 188 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 189 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); 190 } 191 target += bytesToWrite; 192 } 193 *sourceStart = source; 194 *targetStart = target; 195 return result; 196 } 197 198 // This must be called with the length pre-determined by the first byte. 199 // If presented with a length > 4, this returns false. The Unicode 200 // definition of UTF-8 goes up to 4-byte sequences. 201 static bool isLegalUTF8(const unsigned char* source, int length) 202 { 203 unsigned char a; 204 const unsigned char* srcptr = source + length; 205 switch (length) { 206 default: return false; 207 // Everything else falls through when "true"... 208 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 209 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 210 case 2: if ((a = (*--srcptr)) > 0xBF) return false; 211 212 switch (*source) { 213 // no fall-through in this inner switch 214 case 0xE0: if (a < 0xA0) return false; break; 215 case 0xED: if (a > 0x9F) return false; break; 216 case 0xF0: if (a < 0x90) return false; break; 217 case 0xF4: if (a > 0x8F) return false; break; 218 default: if (a < 0x80) return false; 219 } 220 221 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 222 } 223 if (*source > 0xF4) 224 return false; 225 return true; 226 } 227 228 // Magic values subtracted from a buffer value during UTF8 conversion. 229 // This table contains as many values as there might be trailing bytes 230 // in a UTF-8 sequence. 231 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 232 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 233 234 ConversionResult convertUTF8ToUTF16( 235 const char** sourceStart, const char* sourceEnd, 236 UChar** targetStart, UChar* targetEnd, bool strict) 237 { 238 ConversionResult result = conversionOK; 239 const char* source = *sourceStart; 240 UChar* target = *targetStart; 241 while (source < sourceEnd) { 242 UChar32 ch = 0; 243 int extraBytesToRead = UTF8SequenceLength(*source) - 1; 244 if (source + extraBytesToRead >= sourceEnd) { 245 result = sourceExhausted; 246 break; 247 } 248 // Do this check whether lenient or strict 249 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { 250 result = sourceIllegal; 251 break; 252 } 253 // The cases all fall through. 254 switch (extraBytesToRead) { 255 case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 256 case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 257 case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; 258 case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; 259 case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; 260 case 0: ch += static_cast<unsigned char>(*source++); 261 } 262 ch -= offsetsFromUTF8[extraBytesToRead]; 263 264 if (target >= targetEnd) { 265 source -= (extraBytesToRead + 1); // Back up source pointer! 266 result = targetExhausted; break; 267 } 268 if (ch <= 0xFFFF) { 269 // UTF-16 surrogate values are illegal in UTF-32 270 if (ch >= 0xD800 && ch <= 0xDFFF) { 271 if (strict) { 272 source -= (extraBytesToRead + 1); // return to the illegal value itself 273 result = sourceIllegal; 274 break; 275 } else 276 *target++ = 0xFFFD; 277 } else 278 *target++ = (UChar)ch; // normal case 279 } else if (ch > 0x10FFFF) { 280 if (strict) { 281 result = sourceIllegal; 282 source -= (extraBytesToRead + 1); // return to the start 283 break; // Bail out; shouldn't continue 284 } else 285 *target++ = 0xFFFD; 286 } else { 287 // target is a character in range 0xFFFF - 0x10FFFF 288 if (target + 1 >= targetEnd) { 289 source -= (extraBytesToRead + 1); // Back up source pointer! 290 result = targetExhausted; 291 break; 292 } 293 ch -= 0x0010000UL; 294 *target++ = (UChar)((ch >> 10) + 0xD800); 295 *target++ = (UChar)((ch & 0x03FF) + 0xDC00); 296 } 297 } 298 *sourceStart = source; 299 *targetStart = target; 300 return result; 301 } 302 303 } 304 } 305