1 // Copyright 2014 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_UNICODE_DECODER_H_ 6 #define V8_UNICODE_DECODER_H_ 7 8 #include <sys/types.h> 9 #include "src/globals.h" 10 #include "src/utils.h" 11 12 namespace unibrow { 13 14 class V8_EXPORT_PRIVATE Utf8DecoderBase { 15 public: 16 // Initialization done in subclass. 17 inline Utf8DecoderBase(); 18 inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, 19 const uint8_t* stream, size_t stream_length); 20 inline size_t Utf16Length() const { return utf16_length_; } 21 22 protected: 23 // This reads all characters and sets the utf16_length_. 24 // The first buffer_length utf16 chars are cached in the buffer. 25 void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream, 26 size_t stream_length); 27 static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length, 28 uint16_t* data, size_t length); 29 const uint8_t* unbuffered_start_; 30 size_t unbuffered_length_; 31 size_t utf16_length_; 32 bool last_byte_of_buffer_unused_; 33 34 private: 35 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); 36 }; 37 38 template <size_t kBufferSize> 39 class Utf8Decoder : public Utf8DecoderBase { 40 public: 41 inline Utf8Decoder() {} 42 inline Utf8Decoder(const char* stream, size_t length); 43 inline void Reset(const char* stream, size_t length); 44 inline size_t WriteUtf16(uint16_t* data, size_t length) const; 45 46 private: 47 uint16_t buffer_[kBufferSize]; 48 }; 49 50 51 Utf8DecoderBase::Utf8DecoderBase() 52 : unbuffered_start_(NULL), 53 unbuffered_length_(0), 54 utf16_length_(0), 55 last_byte_of_buffer_unused_(false) {} 56 57 58 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, 59 const uint8_t* stream, size_t stream_length) { 60 Reset(buffer, buffer_length, stream, stream_length); 61 } 62 63 64 template <size_t kBufferSize> 65 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length) 66 : Utf8DecoderBase(buffer_, kBufferSize, 67 reinterpret_cast<const uint8_t*>(stream), length) {} 68 69 70 template <size_t kBufferSize> 71 void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) { 72 Utf8DecoderBase::Reset(buffer_, kBufferSize, 73 reinterpret_cast<const uint8_t*>(stream), length); 74 } 75 76 77 template <size_t kBufferSize> 78 size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 79 size_t length) const { 80 DCHECK(length > 0); 81 if (length > utf16_length_) length = utf16_length_; 82 // memcpy everything in buffer. 83 size_t buffer_length = 84 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 85 size_t memcpy_length = length <= buffer_length ? length : buffer_length; 86 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); 87 if (length <= buffer_length) return length; 88 DCHECK(unbuffered_start_ != NULL); 89 // Copy the rest the slow way. 90 WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length, 91 length - buffer_length); 92 return length; 93 } 94 95 class Latin1 { 96 public: 97 static const unsigned kMaxChar = 0xff; 98 // Returns 0 if character does not convert to single latin-1 character 99 // or if the character doesn't not convert back to latin-1 via inverse 100 // operation (upper to lower, etc). 101 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); 102 }; 103 104 105 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 106 DCHECK(c > Latin1::kMaxChar); 107 switch (c) { 108 // This are equivalent characters in unicode. 109 case 0x39c: 110 case 0x3bc: 111 return 0xb5; 112 // This is an uppercase of a Latin-1 character 113 // outside of Latin-1. 114 case 0x178: 115 return 0xff; 116 } 117 return 0; 118 } 119 120 121 } // namespace unibrow 122 123 #endif // V8_UNICODE_DECODER_H_ 124