Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_UNICODE_H_
      6 #define V8_UNICODE_H_
      7 
      8 #include <sys/types.h>
      9 #include "src/globals.h"
     10 /**
     11  * \file
     12  * Definitions and convenience functions for working with unicode.
     13  */
     14 
     15 namespace unibrow {
     16 
     // Unsigned integer type used throughout this header for a single character
     // value (the Predicate cache below stores such values in 21 bits).
     typedef unsigned int uchar;
     // One raw octet, e.g. a single unit of the byte strings handled by Utf8.
     typedef unsigned char byte;

     /**
      * Upper bound on how many characters may be produced when the case of one
      * character is converted.
      */
     const int kMaxMappingSize = 4;
     25 
     26 template <class T, int size = 256>
     27 class Predicate {
     28  public:
     29   inline Predicate() { }
     30   inline bool get(uchar c);
     31  private:
     32   friend class Test;
     33   bool CalculateValue(uchar c);
     34   struct CacheEntry {
     35     inline CacheEntry() : code_point_(0), value_(0) { }
     36     inline CacheEntry(uchar code_point, bool value)
     37       : code_point_(code_point),
     38         value_(value) { }
     39     uchar code_point_ : 21;
     40     bool value_ : 1;
     41   };
     42   static const int kSize = size;
     43   static const int kMask = kSize - 1;
     44   CacheEntry entries_[kSize];
     45 };
     46 
     47 // A cache used in case conversion.  It caches the value for characters
     48 // that either have no mapping or map to a single character independent
     49 // of context.  Characters that map to more than one character or that
     50 // map differently depending on context are always looked up.
     51 template <class T, int size = 256>
     52 class Mapping {
     53  public:
     54   inline Mapping() { }
     55   inline int get(uchar c, uchar n, uchar* result);
     56  private:
     57   friend class Test;
     58   int CalculateValue(uchar c, uchar n, uchar* result);
     59   struct CacheEntry {
     60     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
     61     inline CacheEntry(uchar code_point, signed offset)
     62       : code_point_(code_point),
     63         offset_(offset) { }
     64     uchar code_point_;
     65     signed offset_;
     66     static const int kNoChar = (1 << 21) - 1;
     67   };
     68   static const int kSize = size;
     69   static const int kMask = kSize - 1;
     70   CacheEntry entries_[kSize];
     71 };
     72 
     73 class UnicodeData {
     74  private:
     75   friend class Test;
     76   static int GetByteCount();
     77   static const uchar kMaxCodePoint;
     78 };
     79 
     80 class Utf16 {
     81  public:
     82   static inline bool IsSurrogatePair(int lead, int trail) {
     83     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
     84   }
     85   static inline bool IsLeadSurrogate(int code) {
     86     if (code == kNoPreviousCharacter) return false;
     87     return (code & 0xfc00) == 0xd800;
     88   }
     89   static inline bool IsTrailSurrogate(int code) {
     90     if (code == kNoPreviousCharacter) return false;
     91     return (code & 0xfc00) == 0xdc00;
     92   }
     93 
     94   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
     95     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
     96   }
     97   static const int kNoPreviousCharacter = -1;
     98   static const uchar kMaxNonSurrogateCharCode = 0xffff;
     99   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
    100   // of UTF-8 data.  The special case where the unit is a surrogate
    101   // trail produces 1 byte net, because the encoding of the pair is
    102   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
    103   // can be reclaimed.
    104   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
    105   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
    106   // The illegality stems from the surrogate not being part of a pair.
    107   static const int kUtf8BytesToCodeASurrogate = 3;
    108   static inline uint16_t LeadSurrogate(uint32_t char_code) {
    109     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
    110   }
    111   static inline uint16_t TrailSurrogate(uint32_t char_code) {
    112     return 0xdc00 + (char_code & 0x3ff);
    113   }
    114 };
    115 
    // Constants and helpers for the Latin-1 character range.
    class Latin1 {
     public:
      // Highest character code covered by this class (0xff).
      static const unsigned kMaxChar = 0xff;
      // Yields 0 when the character does not convert to a single Latin-1
      // character, or when it does not convert back to Latin-1 via the inverse
      // operation (upper to lower, etc).
      static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
    };
    124 
    125 class Utf8 {
    126  public:
    127   static inline uchar Length(uchar chr, int previous);
    128   static inline unsigned EncodeOneByte(char* out, uint8_t c);
    129   static inline unsigned Encode(char* out,
    130                                 uchar c,
    131                                 int previous,
    132                                 bool replace_invalid = false);
    133   static uchar CalculateValue(const byte* str,
    134                               unsigned length,
    135                               unsigned* cursor);
    136 
    137   // The unicode replacement character, used to signal invalid unicode
    138   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
    139   static const uchar kBadChar = 0xFFFD;
    140   static const unsigned kMaxEncodedSize   = 4;
    141   static const unsigned kMaxOneByteChar   = 0x7f;
    142   static const unsigned kMaxTwoByteChar   = 0x7ff;
    143   static const unsigned kMaxThreeByteChar = 0xffff;
    144   static const unsigned kMaxFourByteChar  = 0x1fffff;
    145 
    146   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
    147   // that match are coded as a 4 byte UTF-8 sequence.
    148   static const unsigned kBytesSavedByCombiningSurrogates = 2;
    149   static const unsigned kSizeOfUnmatchedSurrogate = 3;
    150   // The maximum size a single UTF-16 code unit may take up when encoded as
    151   // UTF-8.
    152   static const unsigned kMax16BitCodeUnitSize  = 3;
    153   static inline uchar ValueOf(const byte* str,
    154                               unsigned length,
    155                               unsigned* cursor);
    156 };
    157 
    158 
    159 class Utf8DecoderBase {
    160  public:
    161   // Initialization done in subclass.
    162   inline Utf8DecoderBase();
    163   inline Utf8DecoderBase(uint16_t* buffer,
    164                          unsigned buffer_length,
    165                          const uint8_t* stream,
    166                          unsigned stream_length);
    167   inline unsigned Utf16Length() const { return utf16_length_; }
    168  protected:
    169   // This reads all characters and sets the utf16_length_.
    170   // The first buffer_length utf16 chars are cached in the buffer.
    171   void Reset(uint16_t* buffer,
    172              unsigned buffer_length,
    173              const uint8_t* stream,
    174              unsigned stream_length);
    175   static void WriteUtf16Slow(const uint8_t* stream,
    176                              uint16_t* data,
    177                              unsigned length);
    178   const uint8_t* unbuffered_start_;
    179   unsigned utf16_length_;
    180   bool last_byte_of_buffer_unused_;
    181  private:
    182   DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
    183 };
    184 
    185 template <unsigned kBufferSize>
    186 class Utf8Decoder : public Utf8DecoderBase {
    187  public:
    188   inline Utf8Decoder() {}
    189   inline Utf8Decoder(const char* stream, unsigned length);
    190   inline void Reset(const char* stream, unsigned length);
    191   inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
    192  private:
    193   uint16_t buffer_[kBufferSize];
    194 };
    195 
    196 
    197 struct Uppercase {
    198   static bool Is(uchar c);
    199 };
    200 struct Lowercase {
    201   static bool Is(uchar c);
    202 };
    203 struct Letter {
    204   static bool Is(uchar c);
    205 };
    206 struct Number {
    207   static bool Is(uchar c);
    208 };
    209 struct WhiteSpace {
    210   static bool Is(uchar c);
    211 };
    212 struct LineTerminator {
    213   static bool Is(uchar c);
    214 };
    215 struct CombiningMark {
    216   static bool Is(uchar c);
    217 };
    218 struct ConnectorPunctuation {
    219   static bool Is(uchar c);
    220 };
    221 struct ToLowercase {
    222   static const int kMaxWidth = 3;
    223   static const bool kIsToLower = true;
    224   static int Convert(uchar c,
    225                      uchar n,
    226                      uchar* result,
    227                      bool* allow_caching_ptr);
    228 };
    229 struct ToUppercase {
    230   static const int kMaxWidth = 3;
    231   static const bool kIsToLower = false;
    232   static int Convert(uchar c,
    233                      uchar n,
    234                      uchar* result,
    235                      bool* allow_caching_ptr);
    236 };
    237 struct Ecma262Canonicalize {
    238   static const int kMaxWidth = 1;
    239   static int Convert(uchar c,
    240                      uchar n,
    241                      uchar* result,
    242                      bool* allow_caching_ptr);
    243 };
    244 struct Ecma262UnCanonicalize {
    245   static const int kMaxWidth = 4;
    246   static int Convert(uchar c,
    247                      uchar n,
    248                      uchar* result,
    249                      bool* allow_caching_ptr);
    250 };
    251 struct CanonicalizationRange {
    252   static const int kMaxWidth = 1;
    253   static int Convert(uchar c,
    254                      uchar n,
    255                      uchar* result,
    256                      bool* allow_caching_ptr);
    257 };
    258 
    259 }  // namespace unibrow
    260 
    261 #endif  // V8_UNICODE_H_
    262