1 /* 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "wtf/text/TextCodecUTF8.h" 28 29 #include "wtf/text/TextCodecASCIIFastPath.h" 30 #include "wtf/text/CString.h" 31 #include "wtf/text/StringBuffer.h" 32 #include "wtf/unicode/CharacterNames.h" 33 34 using namespace WTF; 35 using namespace WTF::Unicode; 36 using namespace std; 37 38 namespace WTF { 39 40 const int nonCharacter = -1; 41 42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 43 { 44 return adoptPtr(new TextCodecUTF8); 45 } 46 47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 48 { 49 registrar("UTF-8", "UTF-8"); 50 51 // Additional aliases that originally were present in the encoding 52 // table in WebKit on Macintosh, and subsequently added by 53 // TextCodecICU. Perhaps we can prove some are not used on the web 54 // and remove them. 55 registrar("unicode11utf8", "UTF-8"); 56 registrar("unicode20utf8", "UTF-8"); 57 registrar("utf8", "UTF-8"); 58 registrar("x-unicode20utf8", "UTF-8"); 59 60 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/) 61 // and Firefox (24), but not in ICU 4.6. 62 registrar("unicode-1-1-utf-8", "UTF-8"); 63 } 64 65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 66 { 67 registrar("UTF-8", create, 0); 68 } 69 70 static inline int nonASCIISequenceLength(uint8_t firstByte) 71 { 72 static const uint8_t lengths[256] = { 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 85 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 86 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 87 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 89 }; 90 return lengths[firstByte]; 91 } 92 93 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) 94 { 95 ASSERT(!isASCII(sequence[0])); 96 if (length == 2) { 97 ASSERT(sequence[0] <= 0xDF); 98 if (sequence[0] < 0xC2) 99 return nonCharacter; 100 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 101 return nonCharacter; 102 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 103 } 104 if (length == 3) { 105 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 106 switch (sequence[0]) { 107 case 0xE0: 108 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 109 return nonCharacter; 110 break; 111 case 0xED: 112 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 113 return nonCharacter; 114 break; 115 default: 116 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 117 return nonCharacter; 118 } 119 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 120 return nonCharacter; 121 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; 122 } 123 ASSERT(length == 4); 124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); 125 switch (sequence[0]) { 126 case 0xF0: 127 if (sequence[1] < 0x90 || sequence[1] > 0xBF) 128 return nonCharacter; 129 break; 130 case 0xF4: 131 if (sequence[1] < 0x80 || sequence[1] > 0x8F) 132 return nonCharacter; 133 break; 134 default: 135 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 136 return nonCharacter; 137 } 138 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 139 return nonCharacter; 140 if (sequence[3] < 0x80 || sequence[3] > 0xBF) 141 return nonCharacter; 142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; 143 } 144 145 static inline UChar* appendCharacter(UChar* destination, int character) 146 { 147 ASSERT(character != nonCharacter); 148 ASSERT(!U_IS_SURROGATE(character)); 149 if (U_IS_BMP(character)) 150 *destination++ = character; 151 else { 152 *destination++ = U16_LEAD(character); 153 *destination++ = U16_TRAIL(character); 154 } 155 return destination; 156 } 157 158 void TextCodecUTF8::consumePartialSequenceByte() 159 { 160 --m_partialSequenceSize; 161 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); 162 } 163 164 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) 165 { 166 sawError = true; 167 if (stopOnError) 168 return; 169 // Each error generates a replacement character and consumes one byte. 170 *destination++ = replacementCharacter; 171 consumePartialSequenceByte(); 172 } 173 174 template <> 175 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&) 176 { 177 ASSERT(m_partialSequenceSize); 178 do { 179 if (isASCII(m_partialSequence[0])) { 180 *destination++ = m_partialSequence[0]; 181 consumePartialSequenceByte(); 182 continue; 183 } 184 int count = nonASCIISequenceLength(m_partialSequence[0]); 185 if (!count) 186 return true; 187 188 if (count > m_partialSequenceSize) { 189 if (count - m_partialSequenceSize > end - source) { 190 if (!flush) { 191 // The new data is not enough to complete the sequence, so 192 // add it to the existing partial sequence. 193 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 194 m_partialSequenceSize += end - source; 195 return false; 196 } 197 // An incomplete partial sequence at the end is an error, but it will create 198 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 199 // the error. 200 return true; 201 } 202 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 203 source += count - m_partialSequenceSize; 204 m_partialSequenceSize = count; 205 } 206 int character = decodeNonASCIISequence(m_partialSequence, count); 207 if ((character == nonCharacter) || (character > 0xff)) 208 return true; 209 210 m_partialSequenceSize -= count; 211 *destination++ = character; 212 } while (m_partialSequenceSize); 213 214 return false; 215 } 216 217 template <> 218 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 219 { 220 ASSERT(m_partialSequenceSize); 221 do { 222 if (isASCII(m_partialSequence[0])) { 223 *destination++ = m_partialSequence[0]; 224 consumePartialSequenceByte(); 225 continue; 226 } 227 int count = nonASCIISequenceLength(m_partialSequence[0]); 228 if (!count) { 229 handleError(destination, stopOnError, sawError); 230 if (stopOnError) 231 return false; 232 continue; 233 } 234 if (count > m_partialSequenceSize) { 235 if (count - m_partialSequenceSize > end - source) { 236 if (!flush) { 237 // The new data is not enough to complete the sequence, so 238 // add it to the existing partial sequence. 239 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 240 m_partialSequenceSize += end - source; 241 return false; 242 } 243 // An incomplete partial sequence at the end is an error. 244 handleError(destination, stopOnError, sawError); 245 if (stopOnError) 246 return false; 247 continue; 248 } 249 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 250 source += count - m_partialSequenceSize; 251 m_partialSequenceSize = count; 252 } 253 int character = decodeNonASCIISequence(m_partialSequence, count); 254 if (character == nonCharacter) { 255 handleError(destination, stopOnError, sawError); 256 if (stopOnError) 257 return false; 258 continue; 259 } 260 261 m_partialSequenceSize -= count; 262 destination = appendCharacter(destination, character); 263 } while (m_partialSequenceSize); 264 265 return false; 266 } 267 268 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 269 { 270 // Each input byte might turn into a character. 271 // That includes all bytes in the partial-sequence buffer because 272 // each byte in an invalid sequence will turn into a replacement character. 273 StringBuffer<LChar> buffer(m_partialSequenceSize + length); 274 275 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); 276 const uint8_t* end = source + length; 277 const uint8_t* alignedEnd = alignToMachineWord(end); 278 LChar* destination = buffer.characters(); 279 280 do { 281 if (m_partialSequenceSize) { 282 // Explicitly copy destination and source pointers to avoid taking pointers to the 283 // local variables, which may harm code generation by disabling some optimizations 284 // in some compilers. 285 LChar* destinationForHandlePartialSequence = destination; 286 const uint8_t* sourceForHandlePartialSequence = source; 287 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) { 288 source = sourceForHandlePartialSequence; 289 goto upConvertTo16Bit; 290 } 291 destination = destinationForHandlePartialSequence; 292 source = sourceForHandlePartialSequence; 293 if (m_partialSequenceSize) 294 break; 295 } 296 297 while (source < end) { 298 if (isASCII(*source)) { 299 // Fast path for ASCII. Most UTF-8 text will be ASCII. 300 if (isAlignedToMachineWord(source)) { 301 while (source < alignedEnd) { 302 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 303 if (!isAllASCII<LChar>(chunk)) 304 break; 305 copyASCIIMachineWord(destination, source); 306 source += sizeof(MachineWord); 307 destination += sizeof(MachineWord); 308 } 309 if (source == end) 310 break; 311 if (!isASCII(*source)) 312 continue; 313 } 314 *destination++ = *source++; 315 continue; 316 } 317 int count = nonASCIISequenceLength(*source); 318 int character; 319 if (!count) 320 character = nonCharacter; 321 else { 322 if (count > end - source) { 323 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 324 ASSERT(!m_partialSequenceSize); 325 m_partialSequenceSize = end - source; 326 memcpy(m_partialSequence, source, m_partialSequenceSize); 327 source = end; 328 break; 329 } 330 character = decodeNonASCIISequence(source, count); 331 } 332 if (character == nonCharacter) { 333 sawError = true; 334 if (stopOnError) 335 break; 336 337 goto upConvertTo16Bit; 338 } 339 if (character > 0xff) 340 goto upConvertTo16Bit; 341 342 source += count; 343 *destination++ = character; 344 } 345 } while (flush && m_partialSequenceSize); 346 347 buffer.shrink(destination - buffer.characters()); 348 349 return String::adopt(buffer); 350 351 upConvertTo16Bit: 352 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); 353 354 UChar* destination16 = buffer16.characters(); 355 356 // Copy the already converted characters 357 for (LChar* converted8 = buffer.characters(); converted8 < destination;) 358 *destination16++ = *converted8++; 359 360 do { 361 if (m_partialSequenceSize) { 362 // Explicitly copy destination and source pointers to avoid taking pointers to the 363 // local variables, which may harm code generation by disabling some optimizations 364 // in some compilers. 365 UChar* destinationForHandlePartialSequence = destination16; 366 const uint8_t* sourceForHandlePartialSequence = source; 367 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError); 368 destination16 = destinationForHandlePartialSequence; 369 source = sourceForHandlePartialSequence; 370 if (m_partialSequenceSize) 371 break; 372 } 373 374 while (source < end) { 375 if (isASCII(*source)) { 376 // Fast path for ASCII. Most UTF-8 text will be ASCII. 377 if (isAlignedToMachineWord(source)) { 378 while (source < alignedEnd) { 379 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 380 if (!isAllASCII<LChar>(chunk)) 381 break; 382 copyASCIIMachineWord(destination16, source); 383 source += sizeof(MachineWord); 384 destination16 += sizeof(MachineWord); 385 } 386 if (source == end) 387 break; 388 if (!isASCII(*source)) 389 continue; 390 } 391 *destination16++ = *source++; 392 continue; 393 } 394 int count = nonASCIISequenceLength(*source); 395 int character; 396 if (!count) 397 character = nonCharacter; 398 else { 399 if (count > end - source) { 400 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 401 ASSERT(!m_partialSequenceSize); 402 m_partialSequenceSize = end - source; 403 memcpy(m_partialSequence, source, m_partialSequenceSize); 404 source = end; 405 break; 406 } 407 character = decodeNonASCIISequence(source, count); 408 } 409 if (character == nonCharacter) { 410 sawError = true; 411 if (stopOnError) 412 break; 413 // Each error generates a replacement character and consumes one byte. 414 *destination16++ = replacementCharacter; 415 ++source; 416 continue; 417 } 418 source += count; 419 destination16 = appendCharacter(destination16, character); 420 } 421 } while (flush && m_partialSequenceSize); 422 423 buffer16.shrink(destination16 - buffer16.characters()); 424 425 return String::adopt(buffer16); 426 } 427 428 template<typename CharType> 429 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) 430 { 431 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. 432 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x). 433 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x). 434 if (length > numeric_limits<size_t>::max() / 3) 435 CRASH(); 436 Vector<uint8_t> bytes(length * 3); 437 438 size_t i = 0; 439 size_t bytesWritten = 0; 440 while (i < length) { 441 UChar32 character; 442 U16_NEXT(characters, i, length, character); 443 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate 444 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here. 445 if (0xD800 <= character && character <= 0xDFFF) 446 character = replacementCharacter; 447 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); 448 } 449 450 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); 451 } 452 453 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) 454 { 455 return encodeCommon(characters, length); 456 } 457 458 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling) 459 { 460 return encodeCommon(characters, length); 461 } 462 463 } // namespace WTF 464