1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #ifndef V8_UNICODE_H_ 29 #define V8_UNICODE_H_ 30 31 #include <sys/types.h> 32 #include "globals.h" 33 /** 34 * \file 35 * Definitions and convenience functions for working with unicode. 36 */ 37 38 namespace unibrow { 39 40 typedef unsigned int uchar; 41 typedef unsigned char byte; 42 43 /** 44 * The max length of the result of converting the case of a single 45 * character. 46 */ 47 const int kMaxMappingSize = 4; 48 49 template <class T, int size = 256> 50 class Predicate { 51 public: 52 inline Predicate() { } 53 inline bool get(uchar c); 54 private: 55 friend class Test; 56 bool CalculateValue(uchar c); 57 struct CacheEntry { 58 inline CacheEntry() : code_point_(0), value_(0) { } 59 inline CacheEntry(uchar code_point, bool value) 60 : code_point_(code_point), 61 value_(value) { } 62 uchar code_point_ : 21; 63 bool value_ : 1; 64 }; 65 static const int kSize = size; 66 static const int kMask = kSize - 1; 67 CacheEntry entries_[kSize]; 68 }; 69 70 // A cache used in case conversion. It caches the value for characters 71 // that either have no mapping or map to a single character independent 72 // of context. Characters that map to more than one character or that 73 // map differently depending on context are always looked up. 74 template <class T, int size = 256> 75 class Mapping { 76 public: 77 inline Mapping() { } 78 inline int get(uchar c, uchar n, uchar* result); 79 private: 80 friend class Test; 81 int CalculateValue(uchar c, uchar n, uchar* result); 82 struct CacheEntry { 83 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } 84 inline CacheEntry(uchar code_point, signed offset) 85 : code_point_(code_point), 86 offset_(offset) { } 87 uchar code_point_; 88 signed offset_; 89 static const int kNoChar = (1 << 21) - 1; 90 }; 91 static const int kSize = size; 92 static const int kMask = kSize - 1; 93 CacheEntry entries_[kSize]; 94 }; 95 96 class UnicodeData { 97 private: 98 friend class Test; 99 static int GetByteCount(); 100 static const uchar kMaxCodePoint; 101 }; 102 103 class Utf16 { 104 public: 105 static inline bool IsLeadSurrogate(int code) { 106 if (code == kNoPreviousCharacter) return false; 107 return (code & 0xfc00) == 0xd800; 108 } 109 static inline bool IsTrailSurrogate(int code) { 110 if (code == kNoPreviousCharacter) return false; 111 return (code & 0xfc00) == 0xdc00; 112 } 113 114 static inline int CombineSurrogatePair(uchar lead, uchar trail) { 115 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); 116 } 117 static const int kNoPreviousCharacter = -1; 118 static const uchar kMaxNonSurrogateCharCode = 0xffff; 119 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes 120 // of UTF-8 data. The special case where the unit is a surrogate 121 // trail produces 1 byte net, because the encoding of the pair is 122 // 4 bytes and the 3 bytes that were used to encode the lead surrogate 123 // can be reclaimed. 124 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; 125 // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes. 126 // The illegality stems from the surrogate not being part of a pair. 127 static const int kUtf8BytesToCodeASurrogate = 3; 128 static inline uint16_t LeadSurrogate(uint32_t char_code) { 129 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); 130 } 131 static inline uint16_t TrailSurrogate(uint32_t char_code) { 132 return 0xdc00 + (char_code & 0x3ff); 133 } 134 }; 135 136 class Latin1 { 137 public: 138 static const unsigned kMaxChar = 0xff; 139 // Returns 0 if character does not convert to single latin-1 character 140 // or if the character doesn't not convert back to latin-1 via inverse 141 // operation (upper to lower, etc). 142 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); 143 }; 144 145 class Utf8 { 146 public: 147 static inline uchar Length(uchar chr, int previous); 148 static inline unsigned EncodeOneByte(char* out, uint8_t c); 149 static inline unsigned Encode( 150 char* out, uchar c, int previous); 151 static uchar CalculateValue(const byte* str, 152 unsigned length, 153 unsigned* cursor); 154 static const uchar kBadChar = 0xFFFD; 155 static const unsigned kMaxEncodedSize = 4; 156 static const unsigned kMaxOneByteChar = 0x7f; 157 static const unsigned kMaxTwoByteChar = 0x7ff; 158 static const unsigned kMaxThreeByteChar = 0xffff; 159 static const unsigned kMaxFourByteChar = 0x1fffff; 160 161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together 162 // that match are coded as a 4 byte UTF-8 sequence. 163 static const unsigned kBytesSavedByCombiningSurrogates = 2; 164 static const unsigned kSizeOfUnmatchedSurrogate = 3; 165 static inline uchar ValueOf(const byte* str, 166 unsigned length, 167 unsigned* cursor); 168 }; 169 170 171 class Utf8DecoderBase { 172 public: 173 // Initialization done in subclass. 174 inline Utf8DecoderBase(); 175 inline Utf8DecoderBase(uint16_t* buffer, 176 unsigned buffer_length, 177 const uint8_t* stream, 178 unsigned stream_length); 179 inline unsigned Utf16Length() const { return utf16_length_; } 180 protected: 181 // This reads all characters and sets the utf16_length_. 182 // The first buffer_length utf16 chars are cached in the buffer. 183 void Reset(uint16_t* buffer, 184 unsigned buffer_length, 185 const uint8_t* stream, 186 unsigned stream_length); 187 static void WriteUtf16Slow(const uint8_t* stream, 188 uint16_t* data, 189 unsigned length); 190 const uint8_t* unbuffered_start_; 191 unsigned utf16_length_; 192 bool last_byte_of_buffer_unused_; 193 private: 194 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); 195 }; 196 197 template <unsigned kBufferSize> 198 class Utf8Decoder : public Utf8DecoderBase { 199 public: 200 inline Utf8Decoder() {} 201 inline Utf8Decoder(const char* stream, unsigned length); 202 inline void Reset(const char* stream, unsigned length); 203 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; 204 private: 205 uint16_t buffer_[kBufferSize]; 206 }; 207 208 209 struct Uppercase { 210 static bool Is(uchar c); 211 }; 212 struct Lowercase { 213 static bool Is(uchar c); 214 }; 215 struct Letter { 216 static bool Is(uchar c); 217 }; 218 struct Space { 219 static bool Is(uchar c); 220 }; 221 struct Number { 222 static bool Is(uchar c); 223 }; 224 struct WhiteSpace { 225 static bool Is(uchar c); 226 }; 227 struct LineTerminator { 228 static bool Is(uchar c); 229 }; 230 struct CombiningMark { 231 static bool Is(uchar c); 232 }; 233 struct ConnectorPunctuation { 234 static bool Is(uchar c); 235 }; 236 struct ToLowercase { 237 static const int kMaxWidth = 3; 238 static const bool kIsToLower = true; 239 static int Convert(uchar c, 240 uchar n, 241 uchar* result, 242 bool* allow_caching_ptr); 243 }; 244 struct ToUppercase { 245 static const int kMaxWidth = 3; 246 static const bool kIsToLower = false; 247 static int Convert(uchar c, 248 uchar n, 249 uchar* result, 250 bool* allow_caching_ptr); 251 }; 252 struct Ecma262Canonicalize { 253 static const int kMaxWidth = 1; 254 static int Convert(uchar c, 255 uchar n, 256 uchar* result, 257 bool* allow_caching_ptr); 258 }; 259 struct Ecma262UnCanonicalize { 260 static const int kMaxWidth = 4; 261 static int Convert(uchar c, 262 uchar n, 263 uchar* result, 264 bool* allow_caching_ptr); 265 }; 266 struct CanonicalizationRange { 267 static const int kMaxWidth = 1; 268 static int Convert(uchar c, 269 uchar n, 270 uchar* result, 271 bool* allow_caching_ptr); 272 }; 273 274 } // namespace unibrow 275 276 #endif // V8_UNICODE_H_ 277