Home | History | Annotate | Download | only in src
      1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_UNICODE_INL_H_
     29 #define V8_UNICODE_INL_H_
     30 
     31 #include "unicode.h"
     32 #include "checks.h"
     33 #include "platform.h"
     34 
     35 namespace unibrow {
     36 
     37 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
     38   CacheEntry entry = entries_[code_point & kMask];
     39   if (entry.code_point_ == code_point) return entry.value_;
     40   return CalculateValue(code_point);
     41 }
     42 
     43 template <class T, int s> bool Predicate<T, s>::CalculateValue(
     44     uchar code_point) {
     45   bool result = T::Is(code_point);
     46   entries_[code_point & kMask] = CacheEntry(code_point, result);
     47   return result;
     48 }
     49 
     50 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
     51     uchar* result) {
     52   CacheEntry entry = entries_[c & kMask];
     53   if (entry.code_point_ == c) {
     54     if (entry.offset_ == 0) {
     55       return 0;
     56     } else {
     57       result[0] = c + entry.offset_;
     58       return 1;
     59     }
     60   } else {
     61     return CalculateValue(c, n, result);
     62   }
     63 }
     64 
     65 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
     66     uchar* result) {
     67   bool allow_caching = true;
     68   int length = T::Convert(c, n, result, &allow_caching);
     69   if (allow_caching) {
     70     if (length == 1) {
     71       entries_[c & kMask] = CacheEntry(c, result[0] - c);
     72       return 1;
     73     } else {
     74       entries_[c & kMask] = CacheEntry(c, 0);
     75       return 0;
     76     }
     77   } else {
     78     return length;
     79   }
     80 }
     81 
     82 
     83 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
     84   ASSERT(c > Latin1::kMaxChar);
     85   switch (c) {
     86     // This are equivalent characters in unicode.
     87     case 0x39c:
     88     case 0x3bc:
     89       return 0xb5;
     90     // This is an uppercase of a Latin-1 character
     91     // outside of Latin-1.
     92     case 0x178:
     93       return 0xff;
     94   }
     95   return 0;
     96 }
     97 
     98 
     99 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
    100   static const int kMask = ~(1 << 6);
    101   if (c <= kMaxOneByteChar) {
    102     str[0] = c;
    103     return 1;
    104   }
    105   str[0] = 0xC0 | (c >> 6);
    106   str[1] = 0x80 | (c & kMask);
    107   return 2;
    108 }
    109 
    110 
    111 unsigned Utf8::Encode(char* str, uchar c, int previous) {
    112   static const int kMask = ~(1 << 6);
    113   if (c <= kMaxOneByteChar) {
    114     str[0] = c;
    115     return 1;
    116   } else if (c <= kMaxTwoByteChar) {
    117     str[0] = 0xC0 | (c >> 6);
    118     str[1] = 0x80 | (c & kMask);
    119     return 2;
    120   } else if (c <= kMaxThreeByteChar) {
    121     if (Utf16::IsTrailSurrogate(c) &&
    122         Utf16::IsLeadSurrogate(previous)) {
    123       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
    124       return Encode(str - kUnmatchedSize,
    125                     Utf16::CombineSurrogatePair(previous, c),
    126                     Utf16::kNoPreviousCharacter) - kUnmatchedSize;
    127     }
    128     str[0] = 0xE0 | (c >> 12);
    129     str[1] = 0x80 | ((c >> 6) & kMask);
    130     str[2] = 0x80 | (c & kMask);
    131     return 3;
    132   } else {
    133     str[0] = 0xF0 | (c >> 18);
    134     str[1] = 0x80 | ((c >> 12) & kMask);
    135     str[2] = 0x80 | ((c >> 6) & kMask);
    136     str[3] = 0x80 | (c & kMask);
    137     return 4;
    138   }
    139 }
    140 
    141 
    142 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
    143   if (length <= 0) return kBadChar;
    144   byte first = bytes[0];
    145   // Characters between 0000 and 0007F are encoded as a single character
    146   if (first <= kMaxOneByteChar) {
    147     *cursor += 1;
    148     return first;
    149   }
    150   return CalculateValue(bytes, length, cursor);
    151 }
    152 
    153 unsigned Utf8::Length(uchar c, int previous) {
    154   if (c <= kMaxOneByteChar) {
    155     return 1;
    156   } else if (c <= kMaxTwoByteChar) {
    157     return 2;
    158   } else if (c <= kMaxThreeByteChar) {
    159     if (Utf16::IsTrailSurrogate(c) &&
    160         Utf16::IsLeadSurrogate(previous)) {
    161       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
    162     }
    163     return 3;
    164   } else {
    165     return 4;
    166   }
    167 }
    168 
    169 Utf8DecoderBase::Utf8DecoderBase()
    170   : unbuffered_start_(NULL),
    171     utf16_length_(0),
    172     last_byte_of_buffer_unused_(false) {}
    173 
    174 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
    175                                  unsigned buffer_length,
    176                                  const uint8_t* stream,
    177                                  unsigned stream_length) {
    178   Reset(buffer, buffer_length, stream, stream_length);
    179 }
    180 
    181 template<unsigned kBufferSize>
    182 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
    183   : Utf8DecoderBase(buffer_,
    184                     kBufferSize,
    185                     reinterpret_cast<const uint8_t*>(stream),
    186                     length) {
    187 }
    188 
    189 template<unsigned kBufferSize>
    190 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
    191   Utf8DecoderBase::Reset(buffer_,
    192                          kBufferSize,
    193                          reinterpret_cast<const uint8_t*>(stream),
    194                          length);
    195 }
    196 
    197 template <unsigned kBufferSize>
    198 unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
    199                                               unsigned length) const {
    200   ASSERT(length > 0);
    201   if (length > utf16_length_) length = utf16_length_;
    202   // memcpy everything in buffer.
    203   unsigned buffer_length =
    204       last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
    205   unsigned memcpy_length = length <= buffer_length  ? length : buffer_length;
    206   v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t));
    207   if (length <= buffer_length) return length;
    208   ASSERT(unbuffered_start_ != NULL);
    209   // Copy the rest the slow way.
    210   WriteUtf16Slow(unbuffered_start_,
    211                  data + buffer_length,
    212                  length - buffer_length);
    213   return length;
    214 }
    215 
    216 }  // namespace unibrow
    217 
    218 #endif  // V8_UNICODE_INL_H_
    219