// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef V8_UNICODE_H_
#define V8_UNICODE_H_

#include <sys/types.h>

/**
 * \file
 * Definitions and convenience functions for working with unicode.
 */

namespace unibrow {

// A code point or UTF-16 code unit.
typedef unsigned int uchar;
// A single byte of encoded data.
typedef unsigned char byte;

/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;

// A fixed-size table of (code point, bool) entries in front of a predicate
// type T. get() and CalculateValue() are defined outside this header;
// presumably get() consults entries_ before falling back to T::Is() --
// confirm against the out-of-line definition.
template <class T, int size = 256>
class Predicate {
 public:
  inline Predicate() { }
  inline bool get(uchar c);

 private:
  friend class Test;
  bool CalculateValue(uchar c);
  // One cached answer. The code point is stored in 21 bits and the
  // predicate result in a single bit.
  struct CacheEntry {
    inline CacheEntry() : code_point_(0), value_(false) { }
    inline CacheEntry(uchar code_point, bool value)
        : code_point_(code_point),
          value_(value) { }
    uchar code_point_ : 21;
    bool value_ : 1;
  };
  static const int kSize = size;
  // NOTE(review): kSize - 1 is only usable as an index mask when size is a
  // power of two -- confirm how the out-of-line get() uses it.
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// A cache used in case conversion.  It caches the value for characters
// that either have no mapping or map to a single character independent
// of context.  Characters that map to more than one character or that
// map differently depending on context are always looked up.
template <class T, int size = 256>
class Mapping {
 public:
  inline Mapping() { }
  inline int get(uchar c, uchar n, uchar* result);

 private:
  friend class Test;
  int CalculateValue(uchar c, uchar n, uchar* result);
  // One cached mapping: a code point paired with a signed offset. A
  // default-constructed entry holds kNoChar, the largest 21-bit value.
  struct CacheEntry {
    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
    inline CacheEntry(uchar code_point, signed offset)
        : code_point_(code_point),
          offset_(offset) { }
    uchar code_point_;
    signed offset_;
    static const int kNoChar = (1 << 21) - 1;
  };
  static const int kSize = size;
  // NOTE(review): see Predicate::kMask; the same power-of-two assumption
  // appears to apply here.
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// Private hooks into the Unicode data tables; only reachable through the
// Test friend. Both members are defined outside this header.
class UnicodeData {
 private:
  friend class Test;
  static int GetByteCount();
  static const uchar kMaxCodePoint;
};

// --- U t f 8   a n d   1 6 ---

// A (data, length) pair. Data is typically a pointer type; the default
// constructor stores 0 in both fields.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  inline Buffer() : data_(0), length_(0) { }
  // The accessors do not modify the buffer, so they are const and can be
  // called on const Buffer objects.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};


// Helpers for UTF-16 surrogate arithmetic.
class Utf16 {
 public:
  // Returns true iff code lies in the lead (high) surrogate range
  // 0xD800..0xDBFF. The range check also rejects kNoPreviousCharacter (-1)
  // and values above 0xFFFF such as 0x1D800, which a 16-bit mask
  // (code & 0xfc00) would wrongly accept.
  static inline bool IsLeadSurrogate(int code) {
    return code >= 0xd800 && code <= 0xdbff;
  }
  // Returns true iff code lies in the trail (low) surrogate range
  // 0xDC00..0xDFFF. Like IsLeadSurrogate, this is false for
  // kNoPreviousCharacter and for anything outside the 16-bit range.
  static inline bool IsTrailSurrogate(int code) {
    return code >= 0xdc00 && code <= 0xdfff;
  }

  // Combines the low 10 bits of each surrogate into a code point at or
  // above 0x10000. The arguments are not validated.
  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
  }
  // Sentinel meaning "there is no previous code unit".
  static const int kNoPreviousCharacter = -1;
  static const uchar kMaxNonSurrogateCharCode = 0xffff;
  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
  // of UTF-8 data.  The special case where the unit is a surrogate
  // trail produces 1 byte net, because the encoding of the pair is
  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
  // can be reclaimed.
  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
  // The illegality stems from the surrogate not being part of a pair.
  static const int kUtf8BytesToCodeASurrogate = 3;
  // Returns the lead surrogate for char_code: 0xD800 plus the top 10 bits
  // of (char_code - 0x10000).
  static inline uchar LeadSurrogate(int char_code) {
    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
  }
  // Returns the trail surrogate for char_code: 0xDC00 plus its low 10 bits.
  static inline uchar TrailSurrogate(int char_code) {
    return 0xdc00 + (char_code & 0x3ff);
  }
};


// UTF-8 encoding and decoding entry points. Every function here is only
// declared; the definitions live outside this header.
class Utf8 {
 public:
  // NOTE(review): the `previous` argument of Length and Encode is
  // presumably the preceding UTF-16 code unit, or
  // Utf16::kNoPreviousCharacter -- confirm against the definitions.
  static inline uchar Length(uchar chr, int previous);
  static inline unsigned Encode(
      char* out, uchar c, int previous);
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
                              unsigned length,
                              unsigned* cursor);
  // U+FFFD, the Unicode replacement character.
  static const uchar kBadChar = 0xFFFD;
  static const unsigned kMaxEncodedSize   = 4;
  // Largest value representable in a 1-, 2-, 3- and 4-byte sequence.
  static const unsigned kMaxOneByteChar   = 0x7f;
  static const unsigned kMaxTwoByteChar   = 0x7ff;
  static const unsigned kMaxThreeByteChar = 0xffff;
  static const unsigned kMaxFourByteChar  = 0x1fffff;

  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
  // that match are coded as a 4 byte UTF-8 sequence.
  static const unsigned kBytesSavedByCombiningSurrogates = 2;
  static const unsigned kSizeOfUnmatchedSurrogate = 3;

 private:
  template <unsigned s> friend class Utf8InputBuffer;
  friend class Test;
  static inline uchar ValueOf(const byte* str,
                              unsigned length,
                              unsigned* cursor);
};

// --- C h a r a c t e r   S t r e a m ---

// Abstract base for a stream of characters read through a buffer.
// Subclasses must implement Rewind() and FillBuffer().
class CharacterStream {
 public:
  // Starts with no buffered characters so that has_more() is well defined
  // before a subclass has supplied a buffer.
  CharacterStream() : remaining_(0), cursor_(0), buffer_(0) { }
  inline uchar GetNext();
  // True while the current buffer still has characters left.
  inline bool has_more() const { return remaining_ != 0; }
  // Note that default implementation is not efficient.
  virtual void Seek(unsigned);
  unsigned Length();
  unsigned Utf16Length();
  virtual ~CharacterStream() { }
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
      unsigned& offset);
  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
  virtual void Rewind() = 0;

 protected:
  virtual void FillBuffer() = 0;
  // The number of characters left in the current buffer
  unsigned remaining_;
  // The current offset within the buffer
  unsigned cursor_;
  // The buffer containing the decoded characters.
  const byte* buffer_;
};

// --- I n p u t B u f f e r ---

/**
 * Provides efficient access to encoded characters in strings.  It
 * does so by reading characters one block at a time, rather than one
 * character at a time, which gives string implementations an
 * opportunity to optimize the decoding.
 */
template <class Reader, class Input = Reader*, unsigned kSize = 256>
class InputBuffer : public CharacterStream {
 public:
  virtual void Rewind();
  inline void Reset(Input input);
  void Seek(unsigned position);
  inline void Reset(unsigned position, Input input);

 protected:
  // Leaves the buffer without input; offset_ and input_ are given defined
  // values so they are never read uninitialized before the first Reset().
  InputBuffer() : offset_(0), input_() { }
  explicit InputBuffer(Input input) { Reset(input); }
  virtual void FillBuffer();

  // A custom offset that can be used by the string implementation to
  // mark progress within the encoded string.
  unsigned offset_;
  // The input string
  Input input_;
  // To avoid heap allocation, we keep an internal buffer to which
  // the encoded string can write its characters.  The string
  // implementation is free to decide whether it wants to use this
  // buffer or not.
  byte util_buffer_[kSize];
};

// --- U t f 8 I n p u t B u f f e r ---

// An InputBuffer that reads through Utf8 from a (const char*, length) pair.
template <unsigned s = 256>
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
 public:
  inline Utf8InputBuffer() { }
  inline Utf8InputBuffer(const char* data, unsigned length);
  // Wraps data and length in a Buffer and forwards it to InputBuffer::Reset.
  inline void Reset(const char* data, unsigned length) {
    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
        Buffer<const char*>(data, length));
  }
};


// Character-class predicates. Each Is(c) is defined outside this header.
struct Uppercase {
  static bool Is(uchar c);
};
struct Lowercase {
  static bool Is(uchar c);
};
struct Letter {
  static bool Is(uchar c);
};
struct Space {
  static bool Is(uchar c);
};
struct Number {
  static bool Is(uchar c);
};
struct WhiteSpace {
  static bool Is(uchar c);
};
struct LineTerminator {
  static bool Is(uchar c);
};
struct CombiningMark {
  static bool Is(uchar c);
};
struct ConnectorPunctuation {
  static bool Is(uchar c);
};

// Character mappings. Each Convert() is defined outside this header.
// NOTE(review): presumably Convert() writes up to kMaxWidth characters to
// `result` and returns how many it wrote, with `n` being the following
// character and `allow_caching_ptr` reporting whether the result may be
// cached (see the Mapping class) -- confirm against the definitions.
struct ToLowercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct ToUppercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262Canonicalize {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262UnCanonicalize {
  static const int kMaxWidth = 4;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct CanonicalizationRange {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};

}  // namespace unibrow

#endif  // V8_UNICODE_H_