1 // Copyright 2014 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_UNICODE_DECODER_H_ 6 #define V8_UNICODE_DECODER_H_ 7 8 #include <sys/types.h> 9 #include "src/globals.h" 10 11 namespace unibrow { 12 13 class Utf8DecoderBase { 14 public: 15 // Initialization done in subclass. 16 inline Utf8DecoderBase(); 17 inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, 18 const uint8_t* stream, size_t stream_length); 19 inline size_t Utf16Length() const { return utf16_length_; } 20 21 protected: 22 // This reads all characters and sets the utf16_length_. 23 // The first buffer_length utf16 chars are cached in the buffer. 24 void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream, 25 size_t stream_length); 26 static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length, 27 uint16_t* data, size_t length); 28 const uint8_t* unbuffered_start_; 29 size_t unbuffered_length_; 30 size_t utf16_length_; 31 bool last_byte_of_buffer_unused_; 32 33 private: 34 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); 35 }; 36 37 template <size_t kBufferSize> 38 class Utf8Decoder : public Utf8DecoderBase { 39 public: 40 inline Utf8Decoder() {} 41 inline Utf8Decoder(const char* stream, size_t length); 42 inline void Reset(const char* stream, size_t length); 43 inline size_t WriteUtf16(uint16_t* data, size_t length) const; 44 45 private: 46 uint16_t buffer_[kBufferSize]; 47 }; 48 49 50 Utf8DecoderBase::Utf8DecoderBase() 51 : unbuffered_start_(NULL), 52 unbuffered_length_(0), 53 utf16_length_(0), 54 last_byte_of_buffer_unused_(false) {} 55 56 57 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, 58 const uint8_t* stream, size_t stream_length) { 59 Reset(buffer, buffer_length, stream, stream_length); 60 } 61 62 63 template <size_t kBufferSize> 64 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length) 65 : Utf8DecoderBase(buffer_, kBufferSize, 66 reinterpret_cast<const uint8_t*>(stream), length) {} 67 68 69 template <size_t kBufferSize> 70 void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) { 71 Utf8DecoderBase::Reset(buffer_, kBufferSize, 72 reinterpret_cast<const uint8_t*>(stream), length); 73 } 74 75 76 template <size_t kBufferSize> 77 size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 78 size_t length) const { 79 DCHECK(length > 0); 80 if (length > utf16_length_) length = utf16_length_; 81 // memcpy everything in buffer. 82 size_t buffer_length = 83 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 84 size_t memcpy_length = length <= buffer_length ? length : buffer_length; 85 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); 86 if (length <= buffer_length) return length; 87 DCHECK(unbuffered_start_ != NULL); 88 // Copy the rest the slow way. 89 WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length, 90 length - buffer_length); 91 return length; 92 } 93 94 class Latin1 { 95 public: 96 static const unsigned kMaxChar = 0xff; 97 // Returns 0 if character does not convert to single latin-1 character 98 // or if the character doesn't not convert back to latin-1 via inverse 99 // operation (upper to lower, etc). 100 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); 101 }; 102 103 104 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 105 DCHECK(c > Latin1::kMaxChar); 106 switch (c) { 107 // This are equivalent characters in unicode. 108 case 0x39c: 109 case 0x3bc: 110 return 0xb5; 111 // This is an uppercase of a Latin-1 character 112 // outside of Latin-1. 113 case 0x178: 114 return 0xff; 115 } 116 return 0; 117 } 118 119 120 } // namespace unibrow 121 122 #endif // V8_UNICODE_DECODER_H_ 123