// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28 #ifndef V8_UNICODE_INL_H_ 29 #define V8_UNICODE_INL_H_ 30 31 #include "unicode.h" 32 #include "checks.h" 33 #include "platform.h" 34 35 namespace unibrow { 36 37 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 38 CacheEntry entry = entries_[code_point & kMask]; 39 if (entry.code_point_ == code_point) return entry.value_; 40 return CalculateValue(code_point); 41 } 42 43 template <class T, int s> bool Predicate<T, s>::CalculateValue( 44 uchar code_point) { 45 bool result = T::Is(code_point); 46 entries_[code_point & kMask] = CacheEntry(code_point, result); 47 return result; 48 } 49 50 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 51 uchar* result) { 52 CacheEntry entry = entries_[c & kMask]; 53 if (entry.code_point_ == c) { 54 if (entry.offset_ == 0) { 55 return 0; 56 } else { 57 result[0] = c + entry.offset_; 58 return 1; 59 } 60 } else { 61 return CalculateValue(c, n, result); 62 } 63 } 64 65 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 66 uchar* result) { 67 bool allow_caching = true; 68 int length = T::Convert(c, n, result, &allow_caching); 69 if (allow_caching) { 70 if (length == 1) { 71 entries_[c & kMask] = CacheEntry(c, result[0] - c); 72 return 1; 73 } else { 74 entries_[c & kMask] = CacheEntry(c, 0); 75 return 0; 76 } 77 } else { 78 return length; 79 } 80 } 81 82 83 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 84 ASSERT(c > Latin1::kMaxChar); 85 switch (c) { 86 // This are equivalent characters in unicode. 87 case 0x39c: 88 case 0x3bc: 89 return 0xb5; 90 // This is an uppercase of a Latin-1 character 91 // outside of Latin-1. 
92 case 0x178: 93 return 0xff; 94 } 95 return 0; 96 } 97 98 99 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 100 static const int kMask = ~(1 << 6); 101 if (c <= kMaxOneByteChar) { 102 str[0] = c; 103 return 1; 104 } 105 str[0] = 0xC0 | (c >> 6); 106 str[1] = 0x80 | (c & kMask); 107 return 2; 108 } 109 110 111 unsigned Utf8::Encode(char* str, uchar c, int previous) { 112 static const int kMask = ~(1 << 6); 113 if (c <= kMaxOneByteChar) { 114 str[0] = c; 115 return 1; 116 } else if (c <= kMaxTwoByteChar) { 117 str[0] = 0xC0 | (c >> 6); 118 str[1] = 0x80 | (c & kMask); 119 return 2; 120 } else if (c <= kMaxThreeByteChar) { 121 if (Utf16::IsTrailSurrogate(c) && 122 Utf16::IsLeadSurrogate(previous)) { 123 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 124 return Encode(str - kUnmatchedSize, 125 Utf16::CombineSurrogatePair(previous, c), 126 Utf16::kNoPreviousCharacter) - kUnmatchedSize; 127 } 128 str[0] = 0xE0 | (c >> 12); 129 str[1] = 0x80 | ((c >> 6) & kMask); 130 str[2] = 0x80 | (c & kMask); 131 return 3; 132 } else { 133 str[0] = 0xF0 | (c >> 18); 134 str[1] = 0x80 | ((c >> 12) & kMask); 135 str[2] = 0x80 | ((c >> 6) & kMask); 136 str[3] = 0x80 | (c & kMask); 137 return 4; 138 } 139 } 140 141 142 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 143 if (length <= 0) return kBadChar; 144 byte first = bytes[0]; 145 // Characters between 0000 and 0007F are encoded as a single character 146 if (first <= kMaxOneByteChar) { 147 *cursor += 1; 148 return first; 149 } 150 return CalculateValue(bytes, length, cursor); 151 } 152 153 unsigned Utf8::Length(uchar c, int previous) { 154 if (c <= kMaxOneByteChar) { 155 return 1; 156 } else if (c <= kMaxTwoByteChar) { 157 return 2; 158 } else if (c <= kMaxThreeByteChar) { 159 if (Utf16::IsTrailSurrogate(c) && 160 Utf16::IsLeadSurrogate(previous)) { 161 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 162 } 163 return 3; 164 } else { 165 return 4; 166 } 167 } 168 169 
Utf8DecoderBase::Utf8DecoderBase() 170 : unbuffered_start_(NULL), 171 utf16_length_(0), 172 last_byte_of_buffer_unused_(false) {} 173 174 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, 175 unsigned buffer_length, 176 const uint8_t* stream, 177 unsigned stream_length) { 178 Reset(buffer, buffer_length, stream, stream_length); 179 } 180 181 template<unsigned kBufferSize> 182 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) 183 : Utf8DecoderBase(buffer_, 184 kBufferSize, 185 reinterpret_cast<const uint8_t*>(stream), 186 length) { 187 } 188 189 template<unsigned kBufferSize> 190 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { 191 Utf8DecoderBase::Reset(buffer_, 192 kBufferSize, 193 reinterpret_cast<const uint8_t*>(stream), 194 length); 195 } 196 197 template <unsigned kBufferSize> 198 unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 199 unsigned length) const { 200 ASSERT(length > 0); 201 if (length > utf16_length_) length = utf16_length_; 202 // memcpy everything in buffer. 203 unsigned buffer_length = 204 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 205 unsigned memcpy_length = length <= buffer_length ? length : buffer_length; 206 v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t)); 207 if (length <= buffer_length) return length; 208 ASSERT(unbuffered_start_ != NULL); 209 // Copy the rest the slow way. 210 WriteUtf16Slow(unbuffered_start_, 211 data + buffer_length, 212 length - buffer_length); 213 return length; 214 } 215 216 } // namespace unibrow 217 218 #endif // V8_UNICODE_INL_H_ 219