Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_UNICODE_H_
     29 #define V8_UNICODE_H_
     30 
     31 #include <sys/types.h>
     32 
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
     37 
     38 namespace unibrow {
     39 
     40 typedef unsigned int uchar;
     41 typedef unsigned char byte;
     42 
     43 /**
     44  * The max length of the result of converting the case of a single
     45  * character.
     46  */
     47 const int kMaxMappingSize = 4;
     48 
     49 template <class T, int size = 256>
     50 class Predicate {
     51  public:
     52   inline Predicate() { }
     53   inline bool get(uchar c);
     54  private:
     55   friend class Test;
     56   bool CalculateValue(uchar c);
     57   struct CacheEntry {
     58     inline CacheEntry() : code_point_(0), value_(0) { }
     59     inline CacheEntry(uchar code_point, bool value)
     60       : code_point_(code_point),
     61         value_(value) { }
     62     uchar code_point_ : 21;
     63     bool value_ : 1;
     64   };
     65   static const int kSize = size;
     66   static const int kMask = kSize - 1;
     67   CacheEntry entries_[kSize];
     68 };
     69 
     70 // A cache used in case conversion.  It caches the value for characters
     71 // that either have no mapping or map to a single character independent
     72 // of context.  Characters that map to more than one character or that
     73 // map differently depending on context are always looked up.
     74 template <class T, int size = 256>
     75 class Mapping {
     76  public:
     77   inline Mapping() { }
     78   inline int get(uchar c, uchar n, uchar* result);
     79  private:
     80   friend class Test;
     81   int CalculateValue(uchar c, uchar n, uchar* result);
     82   struct CacheEntry {
     83     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
     84     inline CacheEntry(uchar code_point, signed offset)
     85       : code_point_(code_point),
     86         offset_(offset) { }
     87     uchar code_point_;
     88     signed offset_;
     89     static const int kNoChar = (1 << 21) - 1;
     90   };
     91   static const int kSize = size;
     92   static const int kMask = kSize - 1;
     93   CacheEntry entries_[kSize];
     94 };
     95 
     96 class UnicodeData {
     97  private:
     98   friend class Test;
     99   static int GetByteCount();
    100   static const uchar kMaxCodePoint;
    101 };
    102 
    103 // --- U t f   8   a n d   16 ---
    104 
    105 template <typename Data>
    106 class Buffer {
    107  public:
    108   inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
    109   inline Buffer() : data_(0), length_(0) { }
    110   Data data() { return data_; }
    111   unsigned length() { return length_; }
    112  private:
    113   Data data_;
    114   unsigned length_;
    115 };
    116 
    117 
    118 class Utf16 {
    119  public:
    120   static inline bool IsLeadSurrogate(int code) {
    121     if (code == kNoPreviousCharacter) return false;
    122     return (code & 0xfc00) == 0xd800;
    123   }
    124   static inline bool IsTrailSurrogate(int code) {
    125     if (code == kNoPreviousCharacter) return false;
    126     return (code & 0xfc00) == 0xdc00;
    127   }
    128 
    129   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
    130     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
    131   }
    132   static const int kNoPreviousCharacter = -1;
    133   static const uchar kMaxNonSurrogateCharCode = 0xffff;
    134   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
    135   // of UTF-8 data.  The special case where the unit is a surrogate
    136   // trail produces 1 byte net, because the encoding of the pair is
    137   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
    138   // can be reclaimed.
    139   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
    140   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
    141   // The illegality stems from the surrogate not being part of a pair.
    142   static const int kUtf8BytesToCodeASurrogate = 3;
    143   static inline uchar LeadSurrogate(int char_code) {
    144     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
    145   }
    146   static inline uchar TrailSurrogate(int char_code) {
    147     return 0xdc00 + (char_code & 0x3ff);
    148   }
    149 };
    150 
    151 
    152 class Utf8 {
    153  public:
    154   static inline uchar Length(uchar chr, int previous);
    155   static inline unsigned Encode(
    156       char* out, uchar c, int previous);
    157   static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
    158       unsigned capacity, unsigned* chars_read, unsigned* offset);
    159   static uchar CalculateValue(const byte* str,
    160                               unsigned length,
    161                               unsigned* cursor);
    162   static const uchar kBadChar = 0xFFFD;
    163   static const unsigned kMaxEncodedSize   = 4;
    164   static const unsigned kMaxOneByteChar   = 0x7f;
    165   static const unsigned kMaxTwoByteChar   = 0x7ff;
    166   static const unsigned kMaxThreeByteChar = 0xffff;
    167   static const unsigned kMaxFourByteChar  = 0x1fffff;
    168 
    169   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
    170   // that match are coded as a 4 byte UTF-8 sequence.
    171   static const unsigned kBytesSavedByCombiningSurrogates = 2;
    172   static const unsigned kSizeOfUnmatchedSurrogate = 3;
    173 
    174  private:
    175   template <unsigned s> friend class Utf8InputBuffer;
    176   friend class Test;
    177   static inline uchar ValueOf(const byte* str,
    178                               unsigned length,
    179                               unsigned* cursor);
    180 };
    181 
    182 // --- C h a r a c t e r   S t r e a m ---
    183 
    184 class CharacterStream {
    185  public:
    186   inline uchar GetNext();
    187   inline bool has_more() { return remaining_ != 0; }
    188   // Note that default implementation is not efficient.
    189   virtual void Seek(unsigned);
    190   unsigned Length();
    191   unsigned Utf16Length();
    192   virtual ~CharacterStream() { }
    193   static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
    194       unsigned& offset);
    195   static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
    196       unsigned capacity, unsigned& offset);
    197   static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
    198       unsigned capacity, unsigned& offset);
    199   static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
    200   virtual void Rewind() = 0;
    201 
    202  protected:
    203   virtual void FillBuffer() = 0;
    204   // The number of characters left in the current buffer
    205   unsigned remaining_;
    206   // The current offset within the buffer
    207   unsigned cursor_;
    208   // The buffer containing the decoded characters.
    209   const byte* buffer_;
    210 };
    211 
    212 // --- I n p u t   B u f f e r ---
    213 
    214 /**
    215  * Provides efficient access to encoded characters in strings.  It
    216  * does so by reading characters one block at a time, rather than one
    217  * character at a time, which gives string implementations an
    218  * opportunity to optimize the decoding.
    219  */
    220 template <class Reader, class Input = Reader*, unsigned kSize = 256>
    221 class InputBuffer : public CharacterStream {
    222  public:
    223   virtual void Rewind();
    224   inline void Reset(Input input);
    225   void Seek(unsigned position);
    226   inline void Reset(unsigned position, Input input);
    227  protected:
    228   InputBuffer() { }
    229   explicit InputBuffer(Input input) { Reset(input); }
    230   virtual void FillBuffer();
    231 
    232   // A custom offset that can be used by the string implementation to
    233   // mark progress within the encoded string.
    234   unsigned offset_;
    235   // The input string
    236   Input input_;
    237   // To avoid heap allocation, we keep an internal buffer to which
    238   // the encoded string can write its characters.  The string
    239   // implementation is free to decide whether it wants to use this
    240   // buffer or not.
    241   byte util_buffer_[kSize];
    242 };
    243 
    244 // --- U t f 8   I n p u t   B u f f e r ---
    245 
    246 template <unsigned s = 256>
    247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
    248  public:
    249   inline Utf8InputBuffer() { }
    250   inline Utf8InputBuffer(const char* data, unsigned length);
    251   inline void Reset(const char* data, unsigned length) {
    252     InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
    253         Buffer<const char*>(data, length));
    254   }
    255 };
    256 
    257 
    258 struct Uppercase {
    259   static bool Is(uchar c);
    260 };
    261 struct Lowercase {
    262   static bool Is(uchar c);
    263 };
    264 struct Letter {
    265   static bool Is(uchar c);
    266 };
    267 struct Space {
    268   static bool Is(uchar c);
    269 };
    270 struct Number {
    271   static bool Is(uchar c);
    272 };
    273 struct WhiteSpace {
    274   static bool Is(uchar c);
    275 };
    276 struct LineTerminator {
    277   static bool Is(uchar c);
    278 };
    279 struct CombiningMark {
    280   static bool Is(uchar c);
    281 };
    282 struct ConnectorPunctuation {
    283   static bool Is(uchar c);
    284 };
    285 struct ToLowercase {
    286   static const int kMaxWidth = 3;
    287   static int Convert(uchar c,
    288                      uchar n,
    289                      uchar* result,
    290                      bool* allow_caching_ptr);
    291 };
    292 struct ToUppercase {
    293   static const int kMaxWidth = 3;
    294   static int Convert(uchar c,
    295                      uchar n,
    296                      uchar* result,
    297                      bool* allow_caching_ptr);
    298 };
    299 struct Ecma262Canonicalize {
    300   static const int kMaxWidth = 1;
    301   static int Convert(uchar c,
    302                      uchar n,
    303                      uchar* result,
    304                      bool* allow_caching_ptr);
    305 };
    306 struct Ecma262UnCanonicalize {
    307   static const int kMaxWidth = 4;
    308   static int Convert(uchar c,
    309                      uchar n,
    310                      uchar* result,
    311                      bool* allow_caching_ptr);
    312 };
    313 struct CanonicalizationRange {
    314   static const int kMaxWidth = 1;
    315   static int Convert(uchar c,
    316                      uchar n,
    317                      uchar* result,
    318                      bool* allow_caching_ptr);
    319 };
    320 
    321 }  // namespace unibrow
    322 
    323 #endif  // V8_UNICODE_H_
    324