1 /* 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "wtf/text/TextCodecUTF8.h" 28 29 #include "wtf/text/TextCodecASCIIFastPath.h" 30 #include "wtf/text/CString.h" 31 #include "wtf/text/StringBuffer.h" 32 #include "wtf/unicode/CharacterNames.h" 33 34 using namespace WTF; 35 using namespace WTF::Unicode; 36 using namespace std; 37 38 namespace WTF { 39 40 const int nonCharacter = -1; 41 42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 43 { 44 return adoptPtr(new TextCodecUTF8); 45 } 46 47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 48 { 49 registrar("UTF-8", "UTF-8"); 50 51 // Additional aliases that originally were present in the encoding 52 // table in WebKit on Macintosh, and subsequently added by 53 // TextCodecICU. Perhaps we can prove some are not used on the web 54 // and remove them. 55 registrar("unicode11utf8", "UTF-8"); 56 registrar("unicode20utf8", "UTF-8"); 57 registrar("utf8", "UTF-8"); 58 registrar("x-unicode20utf8", "UTF-8"); 59 } 60 61 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 62 { 63 registrar("UTF-8", create, 0); 64 } 65 66 static inline int nonASCIISequenceLength(uint8_t firstByte) 67 { 68 static const uint8_t lengths[256] = { 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 84 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 85 }; 86 return lengths[firstByte]; 87 } 88 89 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) 90 { 91 ASSERT(!isASCII(sequence[0])); 92 if (length == 2) { 93 ASSERT(sequence[0] <= 0xDF); 94 if (sequence[0] < 0xC2) 95 return nonCharacter; 96 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 97 return nonCharacter; 98 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 99 } 100 if (length == 3) { 101 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 102 switch (sequence[0]) { 103 case 0xE0: 104 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 105 return nonCharacter; 106 break; 107 case 0xED: 108 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 109 return nonCharacter; 110 break; 111 default: 112 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 113 return nonCharacter; 114 } 115 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 116 return nonCharacter; 117 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; 118 } 119 ASSERT(length == 4); 120 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); 121 switch (sequence[0]) { 122 case 0xF0: 123 if (sequence[1] < 0x90 || sequence[1] > 0xBF) 124 return nonCharacter; 125 break; 126 case 0xF4: 127 if (sequence[1] < 0x80 || sequence[1] > 0x8F) 128 return nonCharacter; 129 break; 130 default: 131 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 132 return nonCharacter; 133 } 134 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 135 return nonCharacter; 136 if (sequence[3] < 0x80 || sequence[3] > 0xBF) 137 return nonCharacter; 138 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; 139 } 140 141 static inline UChar* appendCharacter(UChar* destination, int character) 142 { 143 ASSERT(character != nonCharacter); 144 ASSERT(!U_IS_SURROGATE(character)); 145 if (U_IS_BMP(character)) 146 *destination++ = character; 147 else { 148 *destination++ = U16_LEAD(character); 149 *destination++ = U16_TRAIL(character); 150 } 151 return destination; 152 } 153 154 void TextCodecUTF8::consumePartialSequenceByte() 155 { 156 --m_partialSequenceSize; 157 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); 158 } 159 160 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) 161 { 162 sawError = true; 163 if (stopOnError) 164 return; 165 // Each error generates a replacement character and consumes one byte. 166 *destination++ = replacementCharacter; 167 consumePartialSequenceByte(); 168 } 169 170 template <> 171 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&) 172 { 173 ASSERT(m_partialSequenceSize); 174 do { 175 if (isASCII(m_partialSequence[0])) { 176 *destination++ = m_partialSequence[0]; 177 consumePartialSequenceByte(); 178 continue; 179 } 180 int count = nonASCIISequenceLength(m_partialSequence[0]); 181 if (!count) 182 return true; 183 184 if (count > m_partialSequenceSize) { 185 if (count - m_partialSequenceSize > end - source) { 186 if (!flush) { 187 // The new data is not enough to complete the sequence, so 188 // add it to the existing partial sequence. 189 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 190 m_partialSequenceSize += end - source; 191 return false; 192 } 193 // An incomplete partial sequence at the end is an error, but it will create 194 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 195 // the error. 196 return true; 197 } 198 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 199 source += count - m_partialSequenceSize; 200 m_partialSequenceSize = count; 201 } 202 int character = decodeNonASCIISequence(m_partialSequence, count); 203 if ((character == nonCharacter) || (character > 0xff)) 204 return true; 205 206 m_partialSequenceSize -= count; 207 *destination++ = character; 208 } while (m_partialSequenceSize); 209 210 return false; 211 } 212 213 template <> 214 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 215 { 216 ASSERT(m_partialSequenceSize); 217 do { 218 if (isASCII(m_partialSequence[0])) { 219 *destination++ = m_partialSequence[0]; 220 consumePartialSequenceByte(); 221 continue; 222 } 223 int count = nonASCIISequenceLength(m_partialSequence[0]); 224 if (!count) { 225 handleError(destination, stopOnError, sawError); 226 if (stopOnError) 227 return false; 228 continue; 229 } 230 if (count > m_partialSequenceSize) { 231 if (count - m_partialSequenceSize > end - source) { 232 if (!flush) { 233 // The new data is not enough to complete the sequence, so 234 // add it to the existing partial sequence. 235 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 236 m_partialSequenceSize += end - source; 237 return false; 238 } 239 // An incomplete partial sequence at the end is an error. 240 handleError(destination, stopOnError, sawError); 241 if (stopOnError) 242 return false; 243 continue; 244 } 245 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 246 source += count - m_partialSequenceSize; 247 m_partialSequenceSize = count; 248 } 249 int character = decodeNonASCIISequence(m_partialSequence, count); 250 if (character == nonCharacter) { 251 handleError(destination, stopOnError, sawError); 252 if (stopOnError) 253 return false; 254 continue; 255 } 256 257 m_partialSequenceSize -= count; 258 destination = appendCharacter(destination, character); 259 } while (m_partialSequenceSize); 260 261 return false; 262 } 263 264 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 265 { 266 // Each input byte might turn into a character. 267 // That includes all bytes in the partial-sequence buffer because 268 // each byte in an invalid sequence will turn into a replacement character. 269 StringBuffer<LChar> buffer(m_partialSequenceSize + length); 270 271 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); 272 const uint8_t* end = source + length; 273 const uint8_t* alignedEnd = alignToMachineWord(end); 274 LChar* destination = buffer.characters(); 275 276 do { 277 if (m_partialSequenceSize) { 278 // Explicitly copy destination and source pointers to avoid taking pointers to the 279 // local variables, which may harm code generation by disabling some optimizations 280 // in some compilers. 281 LChar* destinationForHandlePartialSequence = destination; 282 const uint8_t* sourceForHandlePartialSequence = source; 283 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) { 284 source = sourceForHandlePartialSequence; 285 goto upConvertTo16Bit; 286 } 287 destination = destinationForHandlePartialSequence; 288 source = sourceForHandlePartialSequence; 289 if (m_partialSequenceSize) 290 break; 291 } 292 293 while (source < end) { 294 if (isASCII(*source)) { 295 // Fast path for ASCII. Most UTF-8 text will be ASCII. 296 if (isAlignedToMachineWord(source)) { 297 while (source < alignedEnd) { 298 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 299 if (!isAllASCII<LChar>(chunk)) 300 break; 301 copyASCIIMachineWord(destination, source); 302 source += sizeof(MachineWord); 303 destination += sizeof(MachineWord); 304 } 305 if (source == end) 306 break; 307 if (!isASCII(*source)) 308 continue; 309 } 310 *destination++ = *source++; 311 continue; 312 } 313 int count = nonASCIISequenceLength(*source); 314 int character; 315 if (!count) 316 character = nonCharacter; 317 else { 318 if (count > end - source) { 319 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 320 ASSERT(!m_partialSequenceSize); 321 m_partialSequenceSize = end - source; 322 memcpy(m_partialSequence, source, m_partialSequenceSize); 323 source = end; 324 break; 325 } 326 character = decodeNonASCIISequence(source, count); 327 } 328 if (character == nonCharacter) { 329 sawError = true; 330 if (stopOnError) 331 break; 332 333 goto upConvertTo16Bit; 334 } 335 if (character > 0xff) 336 goto upConvertTo16Bit; 337 338 source += count; 339 *destination++ = character; 340 } 341 } while (flush && m_partialSequenceSize); 342 343 buffer.shrink(destination - buffer.characters()); 344 345 return String::adopt(buffer); 346 347 upConvertTo16Bit: 348 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); 349 350 UChar* destination16 = buffer16.characters(); 351 352 // Copy the already converted characters 353 for (LChar* converted8 = buffer.characters(); converted8 < destination;) 354 *destination16++ = *converted8++; 355 356 do { 357 if (m_partialSequenceSize) { 358 // Explicitly copy destination and source pointers to avoid taking pointers to the 359 // local variables, which may harm code generation by disabling some optimizations 360 // in some compilers. 361 UChar* destinationForHandlePartialSequence = destination16; 362 const uint8_t* sourceForHandlePartialSequence = source; 363 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError); 364 destination16 = destinationForHandlePartialSequence; 365 source = sourceForHandlePartialSequence; 366 if (m_partialSequenceSize) 367 break; 368 } 369 370 while (source < end) { 371 if (isASCII(*source)) { 372 // Fast path for ASCII. Most UTF-8 text will be ASCII. 373 if (isAlignedToMachineWord(source)) { 374 while (source < alignedEnd) { 375 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 376 if (!isAllASCII<LChar>(chunk)) 377 break; 378 copyASCIIMachineWord(destination16, source); 379 source += sizeof(MachineWord); 380 destination16 += sizeof(MachineWord); 381 } 382 if (source == end) 383 break; 384 if (!isASCII(*source)) 385 continue; 386 } 387 *destination16++ = *source++; 388 continue; 389 } 390 int count = nonASCIISequenceLength(*source); 391 int character; 392 if (!count) 393 character = nonCharacter; 394 else { 395 if (count > end - source) { 396 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 397 ASSERT(!m_partialSequenceSize); 398 m_partialSequenceSize = end - source; 399 memcpy(m_partialSequence, source, m_partialSequenceSize); 400 source = end; 401 break; 402 } 403 character = decodeNonASCIISequence(source, count); 404 } 405 if (character == nonCharacter) { 406 sawError = true; 407 if (stopOnError) 408 break; 409 // Each error generates a replacement character and consumes one byte. 410 *destination16++ = replacementCharacter; 411 ++source; 412 continue; 413 } 414 source += count; 415 destination16 = appendCharacter(destination16, character); 416 } 417 } while (flush && m_partialSequenceSize); 418 419 buffer16.shrink(destination16 - buffer16.characters()); 420 421 return String::adopt(buffer16); 422 } 423 424 template<typename CharType> 425 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) 426 { 427 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. 428 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x). 429 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x). 430 if (length > numeric_limits<size_t>::max() / 3) 431 CRASH(); 432 Vector<uint8_t> bytes(length * 3); 433 434 size_t i = 0; 435 size_t bytesWritten = 0; 436 while (i < length) { 437 UChar32 character; 438 U16_NEXT(characters, i, length, character); 439 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); 440 } 441 442 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); 443 } 444 445 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) 446 { 447 return encodeCommon(characters, length); 448 } 449 450 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling) 451 { 452 return encodeCommon(characters, length); 453 } 454 455 } // namespace WTF 456