Home | History | Annotate | Download | only in src
      1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_UNICODE_INL_H_
      6 #define V8_UNICODE_INL_H_
      7 
      8 #include "src/unicode.h"
      9 #include "src/base/logging.h"
     10 #include "src/utils.h"
     11 
     12 namespace unibrow {
     13 
     14 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
     15   CacheEntry entry = entries_[code_point & kMask];
     16   if (entry.code_point_ == code_point) return entry.value_;
     17   return CalculateValue(code_point);
     18 }
     19 
     20 template <class T, int s> bool Predicate<T, s>::CalculateValue(
     21     uchar code_point) {
     22   bool result = T::Is(code_point);
     23   entries_[code_point & kMask] = CacheEntry(code_point, result);
     24   return result;
     25 }
     26 
     27 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
     28     uchar* result) {
     29   CacheEntry entry = entries_[c & kMask];
     30   if (entry.code_point_ == c) {
     31     if (entry.offset_ == 0) {
     32       return 0;
     33     } else {
     34       result[0] = c + entry.offset_;
     35       return 1;
     36     }
     37   } else {
     38     return CalculateValue(c, n, result);
     39   }
     40 }
     41 
     42 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
     43     uchar* result) {
     44   bool allow_caching = true;
     45   int length = T::Convert(c, n, result, &allow_caching);
     46   if (allow_caching) {
     47     if (length == 1) {
     48       entries_[c & kMask] = CacheEntry(c, result[0] - c);
     49       return 1;
     50     } else {
     51       entries_[c & kMask] = CacheEntry(c, 0);
     52       return 0;
     53     }
     54   } else {
     55     return length;
     56   }
     57 }
     58 
     59 
     60 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
     61   DCHECK(c > Latin1::kMaxChar);
     62   switch (c) {
     63     // This are equivalent characters in unicode.
     64     case 0x39c:
     65     case 0x3bc:
     66       return 0xb5;
     67     // This is an uppercase of a Latin-1 character
     68     // outside of Latin-1.
     69     case 0x178:
     70       return 0xff;
     71   }
     72   return 0;
     73 }
     74 
     75 
     76 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
     77   static const int kMask = ~(1 << 6);
     78   if (c <= kMaxOneByteChar) {
     79     str[0] = c;
     80     return 1;
     81   }
     82   str[0] = 0xC0 | (c >> 6);
     83   str[1] = 0x80 | (c & kMask);
     84   return 2;
     85 }
     86 
     87 // Encode encodes the UTF-16 code units c and previous into the given str
     88 // buffer, and combines surrogate code units into single code points. If
     89 // replace_invalid is set to true, orphan surrogate code units will be replaced
     90 // with kBadChar.
     91 unsigned Utf8::Encode(char* str,
     92                       uchar c,
     93                       int previous,
     94                       bool replace_invalid) {
     95   static const int kMask = ~(1 << 6);
     96   if (c <= kMaxOneByteChar) {
     97     str[0] = c;
     98     return 1;
     99   } else if (c <= kMaxTwoByteChar) {
    100     str[0] = 0xC0 | (c >> 6);
    101     str[1] = 0x80 | (c & kMask);
    102     return 2;
    103   } else if (c <= kMaxThreeByteChar) {
    104     if (Utf16::IsSurrogatePair(previous, c)) {
    105       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
    106       return Encode(str - kUnmatchedSize,
    107                     Utf16::CombineSurrogatePair(previous, c),
    108                     Utf16::kNoPreviousCharacter,
    109                     replace_invalid) - kUnmatchedSize;
    110     } else if (replace_invalid &&
    111                (Utf16::IsLeadSurrogate(c) ||
    112                Utf16::IsTrailSurrogate(c))) {
    113       c = kBadChar;
    114     }
    115     str[0] = 0xE0 | (c >> 12);
    116     str[1] = 0x80 | ((c >> 6) & kMask);
    117     str[2] = 0x80 | (c & kMask);
    118     return 3;
    119   } else {
    120     str[0] = 0xF0 | (c >> 18);
    121     str[1] = 0x80 | ((c >> 12) & kMask);
    122     str[2] = 0x80 | ((c >> 6) & kMask);
    123     str[3] = 0x80 | (c & kMask);
    124     return 4;
    125   }
    126 }
    127 
    128 
    129 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
    130   if (length <= 0) return kBadChar;
    131   byte first = bytes[0];
    132   // Characters between 0000 and 0007F are encoded as a single character
    133   if (first <= kMaxOneByteChar) {
    134     *cursor += 1;
    135     return first;
    136   }
    137   return CalculateValue(bytes, length, cursor);
    138 }
    139 
    140 unsigned Utf8::Length(uchar c, int previous) {
    141   if (c <= kMaxOneByteChar) {
    142     return 1;
    143   } else if (c <= kMaxTwoByteChar) {
    144     return 2;
    145   } else if (c <= kMaxThreeByteChar) {
    146     if (Utf16::IsTrailSurrogate(c) &&
    147         Utf16::IsLeadSurrogate(previous)) {
    148       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
    149     }
    150     return 3;
    151   } else {
    152     return 4;
    153   }
    154 }
    155 
    156 Utf8DecoderBase::Utf8DecoderBase()
    157   : unbuffered_start_(NULL),
    158     utf16_length_(0),
    159     last_byte_of_buffer_unused_(false) {}
    160 
    161 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
    162                                  unsigned buffer_length,
    163                                  const uint8_t* stream,
    164                                  unsigned stream_length) {
    165   Reset(buffer, buffer_length, stream, stream_length);
    166 }
    167 
    168 template<unsigned kBufferSize>
    169 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
    170   : Utf8DecoderBase(buffer_,
    171                     kBufferSize,
    172                     reinterpret_cast<const uint8_t*>(stream),
    173                     length) {
    174 }
    175 
    176 template<unsigned kBufferSize>
    177 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
    178   Utf8DecoderBase::Reset(buffer_,
    179                          kBufferSize,
    180                          reinterpret_cast<const uint8_t*>(stream),
    181                          length);
    182 }
    183 
    184 template <unsigned kBufferSize>
    185 unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
    186                                               unsigned length) const {
    187   DCHECK(length > 0);
    188   if (length > utf16_length_) length = utf16_length_;
    189   // memcpy everything in buffer.
    190   unsigned buffer_length =
    191       last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
    192   unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
    193   v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
    194   if (length <= buffer_length) return length;
    195   DCHECK(unbuffered_start_ != NULL);
    196   // Copy the rest the slow way.
    197   WriteUtf16Slow(unbuffered_start_,
    198                  data + buffer_length,
    199                  length - buffer_length);
    200   return length;
    201 }
    202 
    203 }  // namespace unibrow
    204 
    205 #endif  // V8_UNICODE_INL_H_
    206