Home | History | Annotate | Download | only in src
      1 // Copyright 2007-2008 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_UNICODE_H_
     29 #define V8_UNICODE_H_
     30 
     31 #include <sys/types.h>
     32 
     33 /**
     34  * \file
     35  * Definitions and convenience functions for working with unicode.
     36  */
     37 
     38 namespace unibrow {
     39 
     40 typedef unsigned int uchar;
     41 typedef unsigned char byte;
     42 
     43 /**
     44  * The max length of the result of converting the case of a single
     45  * character.
     46  */
     47 static const int kMaxMappingSize = 4;
     48 
     49 template <class T, int size = 256>
     50 class Predicate {
     51  public:
     52   inline Predicate() { }
     53   inline bool get(uchar c);
     54  private:
     55   friend class Test;
     56   bool CalculateValue(uchar c);
     57   struct CacheEntry {
     58     inline CacheEntry() : code_point_(0), value_(0) { }
     59     inline CacheEntry(uchar code_point, bool value)
     60       : code_point_(code_point),
     61         value_(value) { }
     62     uchar code_point_ : 21;
     63     bool value_ : 1;
     64   };
     65   static const int kSize = size;
     66   static const int kMask = kSize - 1;
     67   CacheEntry entries_[kSize];
     68 };
     69 
     70 // A cache used in case conversion.  It caches the value for characters
     71 // that either have no mapping or map to a single character independent
     72 // of context.  Characters that map to more than one character or that
     73 // map differently depending on context are always looked up.
     74 template <class T, int size = 256>
     75 class Mapping {
     76  public:
     77   inline Mapping() { }
     78   inline int get(uchar c, uchar n, uchar* result);
     79  private:
     80   friend class Test;
     81   int CalculateValue(uchar c, uchar n, uchar* result);
     82   struct CacheEntry {
     83     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
     84     inline CacheEntry(uchar code_point, signed offset)
     85       : code_point_(code_point),
     86         offset_(offset) { }
     87     uchar code_point_;
     88     signed offset_;
     89     static const int kNoChar = (1 << 21) - 1;
     90   };
     91   static const int kSize = size;
     92   static const int kMask = kSize - 1;
     93   CacheEntry entries_[kSize];
     94 };
     95 
     96 class UnicodeData {
     97  private:
     98   friend class Test;
     99   static int GetByteCount();
    100   static const uchar kMaxCodePoint;
    101 };
    102 
    103 // --- U t f   8 ---
    104 
    105 template <typename Data>
    106 class Buffer {
    107  public:
    108   inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
    109   inline Buffer() : data_(0), length_(0) { }
    110   Data data() { return data_; }
    111   unsigned length() { return length_; }
    112  private:
    113   Data data_;
    114   unsigned length_;
    115 };
    116 
    117 class Utf8 {
    118  public:
    119   static inline uchar Length(uchar chr);
    120   static inline unsigned Encode(char* out, uchar c);
    121   static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
    122       unsigned capacity, unsigned* chars_read, unsigned* offset);
    123   static uchar CalculateValue(const byte* str,
    124                               unsigned length,
    125                               unsigned* cursor);
    126   static const uchar kBadChar = 0xFFFD;
    127   static const unsigned kMaxEncodedSize   = 4;
    128   static const unsigned kMaxOneByteChar   = 0x7f;
    129   static const unsigned kMaxTwoByteChar   = 0x7ff;
    130   static const unsigned kMaxThreeByteChar = 0xffff;
    131   static const unsigned kMaxFourByteChar  = 0x1fffff;
    132 
    133  private:
    134   template <unsigned s> friend class Utf8InputBuffer;
    135   friend class Test;
    136   static inline uchar ValueOf(const byte* str,
    137                               unsigned length,
    138                               unsigned* cursor);
    139 };
    140 
    141 // --- C h a r a c t e r   S t r e a m ---
    142 
    143 class CharacterStream {
    144  public:
    145   inline uchar GetNext();
    146   inline bool has_more() { return remaining_ != 0; }
    147   // Note that default implementation is not efficient.
    148   virtual void Seek(unsigned);
    149   unsigned Length();
    150   virtual ~CharacterStream() { }
    151   static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
    152       unsigned& offset);
    153   static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
    154       unsigned capacity, unsigned& offset);
    155   static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
    156       unsigned capacity, unsigned& offset);
    157   static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
    158   virtual void Rewind() = 0;
    159  protected:
    160   virtual void FillBuffer() = 0;
    161   // The number of characters left in the current buffer
    162   unsigned remaining_;
    163   // The current offset within the buffer
    164   unsigned cursor_;
    165   // The buffer containing the decoded characters.
    166   const byte* buffer_;
    167 };
    168 
    169 // --- I n p u t   B u f f e r ---
    170 
    171 /**
    172  * Provides efficient access to encoded characters in strings.  It
    173  * does so by reading characters one block at a time, rather than one
    174  * character at a time, which gives string implementations an
    175  * opportunity to optimize the decoding.
    176  */
    177 template <class Reader, class Input = Reader*, unsigned kSize = 256>
    178 class InputBuffer : public CharacterStream {
    179  public:
    180   virtual void Rewind();
    181   inline void Reset(Input input);
    182   void Seek(unsigned position);
    183   inline void Reset(unsigned position, Input input);
    184  protected:
    185   InputBuffer() { }
    186   explicit InputBuffer(Input input) { Reset(input); }
    187   virtual void FillBuffer();
    188 
    189   // A custom offset that can be used by the string implementation to
    190   // mark progress within the encoded string.
    191   unsigned offset_;
    192   // The input string
    193   Input input_;
    194   // To avoid heap allocation, we keep an internal buffer to which
    195   // the encoded string can write its characters.  The string
    196   // implementation is free to decide whether it wants to use this
    197   // buffer or not.
    198   byte util_buffer_[kSize];
    199 };
    200 
    201 // --- U t f 8   I n p u t   B u f f e r ---
    202 
    203 template <unsigned s = 256>
    204 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
    205  public:
    206   inline Utf8InputBuffer() { }
    207   inline Utf8InputBuffer(const char* data, unsigned length);
    208   inline void Reset(const char* data, unsigned length) {
    209     InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
    210         Buffer<const char*>(data, length));
    211   }
    212 };
    213 
    214 
    215 struct Uppercase {
    216   static bool Is(uchar c);
    217 };
    218 struct Lowercase {
    219   static bool Is(uchar c);
    220 };
    221 struct Letter {
    222   static bool Is(uchar c);
    223 };
    224 struct Space {
    225   static bool Is(uchar c);
    226 };
    227 struct Number {
    228   static bool Is(uchar c);
    229 };
    230 struct WhiteSpace {
    231   static bool Is(uchar c);
    232 };
    233 struct LineTerminator {
    234   static bool Is(uchar c);
    235 };
    236 struct CombiningMark {
    237   static bool Is(uchar c);
    238 };
    239 struct ConnectorPunctuation {
    240   static bool Is(uchar c);
    241 };
    242 struct ToLowercase {
    243   static const int kMaxWidth = 3;
    244   static int Convert(uchar c,
    245                      uchar n,
    246                      uchar* result,
    247                      bool* allow_caching_ptr);
    248 };
    249 struct ToUppercase {
    250   static const int kMaxWidth = 3;
    251   static int Convert(uchar c,
    252                      uchar n,
    253                      uchar* result,
    254                      bool* allow_caching_ptr);
    255 };
    256 struct Ecma262Canonicalize {
    257   static const int kMaxWidth = 1;
    258   static int Convert(uchar c,
    259                      uchar n,
    260                      uchar* result,
    261                      bool* allow_caching_ptr);
    262 };
    263 struct Ecma262UnCanonicalize {
    264   static const int kMaxWidth = 4;
    265   static int Convert(uchar c,
    266                      uchar n,
    267                      uchar* result,
    268                      bool* allow_caching_ptr);
    269 };
    270 struct CanonicalizationRange {
    271   static const int kMaxWidth = 1;
    272   static int Convert(uchar c,
    273                      uchar n,
    274                      uchar* result,
    275                      bool* allow_caching_ptr);
    276 };
    277 
    278 }  // namespace unibrow
    279 
    280 #endif  // V8_UNICODE_H_
    281