Home | History | Annotate | Download | only in src
      1 // Copyright 2014 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_UNICODE_DECODER_H_
      6 #define V8_UNICODE_DECODER_H_
      7 
      8 #include <sys/types.h>
      9 #include "src/globals.h"
     10 #include "src/utils.h"
     11 
     12 namespace unibrow {
     13 
     14 class V8_EXPORT_PRIVATE Utf8DecoderBase {
     15  public:
     16   // Initialization done in subclass.
     17   inline Utf8DecoderBase();
     18   inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
     19                          const uint8_t* stream, size_t stream_length);
     20   inline size_t Utf16Length() const { return utf16_length_; }
     21 
     22  protected:
     23   // This reads all characters and sets the utf16_length_.
     24   // The first buffer_length utf16 chars are cached in the buffer.
     25   void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
     26              size_t stream_length);
     27   static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
     28                              uint16_t* data, size_t length);
     29   const uint8_t* unbuffered_start_;
     30   size_t unbuffered_length_;
     31   size_t utf16_length_;
     32   bool last_byte_of_buffer_unused_;
     33 
     34  private:
     35   DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
     36 };
     37 
     38 template <size_t kBufferSize>
     39 class Utf8Decoder : public Utf8DecoderBase {
     40  public:
     41   inline Utf8Decoder() {}
     42   inline Utf8Decoder(const char* stream, size_t length);
     43   inline void Reset(const char* stream, size_t length);
     44   inline size_t WriteUtf16(uint16_t* data, size_t length) const;
     45 
     46  private:
     47   uint16_t buffer_[kBufferSize];
     48 };
     49 
     50 
     51 Utf8DecoderBase::Utf8DecoderBase()
     52     : unbuffered_start_(NULL),
     53       unbuffered_length_(0),
     54       utf16_length_(0),
     55       last_byte_of_buffer_unused_(false) {}
     56 
     57 
     58 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
     59                                  const uint8_t* stream, size_t stream_length) {
     60   Reset(buffer, buffer_length, stream, stream_length);
     61 }
     62 
     63 
     64 template <size_t kBufferSize>
     65 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
     66     : Utf8DecoderBase(buffer_, kBufferSize,
     67                       reinterpret_cast<const uint8_t*>(stream), length) {}
     68 
     69 
     70 template <size_t kBufferSize>
     71 void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
     72   Utf8DecoderBase::Reset(buffer_, kBufferSize,
     73                          reinterpret_cast<const uint8_t*>(stream), length);
     74 }
     75 
     76 
     77 template <size_t kBufferSize>
     78 size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
     79                                             size_t length) const {
     80   DCHECK(length > 0);
     81   if (length > utf16_length_) length = utf16_length_;
     82   // memcpy everything in buffer.
     83   size_t buffer_length =
     84       last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
     85   size_t memcpy_length = length <= buffer_length ? length : buffer_length;
     86   v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
     87   if (length <= buffer_length) return length;
     88   DCHECK(unbuffered_start_ != NULL);
     89   // Copy the rest the slow way.
     90   WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
     91                  length - buffer_length);
     92   return length;
     93 }
     94 
     95 class Latin1 {
     96  public:
     97   static const unsigned kMaxChar = 0xff;
     98   // Returns 0 if character does not convert to single latin-1 character
     99   // or if the character doesn't not convert back to latin-1 via inverse
    100   // operation (upper to lower, etc).
    101   static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
    102 };
    103 
    104 
    105 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
    106   DCHECK(c > Latin1::kMaxChar);
    107   switch (c) {
    108     // This are equivalent characters in unicode.
    109     case 0x39c:
    110     case 0x3bc:
    111       return 0xb5;
    112     // This is an uppercase of a Latin-1 character
    113     // outside of Latin-1.
    114     case 0x178:
    115       return 0xff;
    116   }
    117   return 0;
    118 }
    119 
    120 
    121 }  // namespace unibrow
    122 
    123 #endif  // V8_UNICODE_DECODER_H_
    124