Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "util/utf8/unicodetext.h"
     18 
     19 #include "base.h"
     20 #include "util/strings/utf8.h"
     21 
     22 namespace libtextclassifier {
     23 
     24 // *************** Data representation **********
     25 // Note: the copy constructor is undefined.
     26 
     27 void UnicodeText::Repr::PointTo(const char* data, int size) {
     28   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
     29   data_ = const_cast<char*>(data);
     30   size_ = size;
     31   capacity_ = size;
     32   ours_ = false;
     33 }
     34 
     35 void UnicodeText::Repr::Copy(const char* data, int size) {
     36   resize(size);
     37   memcpy(data_, data, size);
     38 }
     39 
     40 void UnicodeText::Repr::resize(int new_size) {
     41   if (new_size == 0) {
     42     clear();
     43   } else {
     44     if (!ours_ || new_size > capacity_) reserve(new_size);
     45     // Clear the memory in the expanded part.
     46     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
     47     size_ = new_size;
     48     ours_ = true;
     49   }
     50 }
     51 
     52 void UnicodeText::Repr::reserve(int new_capacity) {
     53   // If there's already enough capacity, and we're an owner, do nothing.
     54   if (capacity_ >= new_capacity && ours_) return;
     55 
     56   // Otherwise, allocate a new buffer.
     57   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
     58   char* new_data = new char[capacity_];
     59 
     60   // If there is an old buffer, copy it into the new buffer.
     61   if (data_) {
     62     memcpy(new_data, data_, size_);
     63     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
     64   }
     65   data_ = new_data;
     66   ours_ = true;  // We own the new buffer.
     67   // size_ is unchanged.
     68 }
     69 
     70 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
     71   reserve(size_ + byte_length);
     72   memcpy(data_ + size_, bytes, byte_length);
     73   size_ += byte_length;
     74 }
     75 
     76 void UnicodeText::Repr::clear() {
     77   if (ours_) delete[] data_;
     78   data_ = nullptr;
     79   size_ = capacity_ = 0;
     80   ours_ = true;
     81 }
     82 
     83 // *************** UnicodeText ******************
     84 
     85 UnicodeText::UnicodeText() {}
     86 
     87 UnicodeText::UnicodeText(const UnicodeText& src) { Copy(src); }
     88 
     89 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
     90   repr_.Copy(src.repr_.data_, src.repr_.size_);
     91   return *this;
     92 }
     93 
     94 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
     95   repr_.PointTo(buffer, byte_length);
     96   return *this;
     97 }
     98 
     99 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
    100   repr_.Copy(buffer, byte_length);
    101   return *this;
    102 }
    103 
    104 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
    105   repr_.append(utf8, len);
    106   return *this;
    107 }
    108 
    109 void UnicodeText::clear() { repr_.clear(); }
    110 
    111 std::string UnicodeText::UTF8Substring(const const_iterator& first,
    112                                        const const_iterator& last) {
    113   return std::string(first.it_, last.it_ - first.it_);
    114 }
    115 
    116 UnicodeText::~UnicodeText() {}
    117 
    118 // ******************* UnicodeText::const_iterator *********************
    119 
    120 // The implementation of const_iterator would be nicer if it
    121 // inherited from boost::iterator_facade
    122 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
    123 
    124 UnicodeText::const_iterator::const_iterator() : it_(0) {}
    125 
    126 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
    127     const const_iterator& other) {
    128   if (&other != this) it_ = other.it_;
    129   return *this;
    130 }
    131 
    132 UnicodeText::const_iterator UnicodeText::begin() const {
    133   return const_iterator(repr_.data_);
    134 }
    135 
    136 UnicodeText::const_iterator UnicodeText::end() const {
    137   return const_iterator(repr_.data_ + repr_.size_);
    138 }
    139 
    140 bool operator<(const UnicodeText::const_iterator& lhs,
    141                const UnicodeText::const_iterator& rhs) {
    142   return lhs.it_ < rhs.it_;
    143 }
    144 
    145 char32 UnicodeText::const_iterator::operator*() const {
    146   // (We could call chartorune here, but that does some
    147   // error-checking, and we're guaranteed that our data is valid
    148   // UTF-8. Also, we expect this routine to be called very often. So
    149   // for speed, we do the calculation ourselves.)
    150 
    151   // Convert from UTF-8
    152   unsigned char byte1 = static_cast<unsigned char>(it_[0]);
    153   if (byte1 < 0x80) return byte1;
    154 
    155   unsigned char byte2 = static_cast<unsigned char>(it_[1]);
    156   if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
    157 
    158   unsigned char byte3 = static_cast<unsigned char>(it_[2]);
    159   if (byte1 < 0xF0) {
    160     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
    161   }
    162 
    163   unsigned char byte4 = static_cast<unsigned char>(it_[3]);
    164   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
    165          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
    166 }
    167 
    168 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
    169   it_ += GetNumBytesForNonZeroUTF8Char(it_);
    170   return *this;
    171 }
    172 
    173 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
    174   while (IsTrailByte(*--it_)) {
    175   }
    176   return *this;
    177 }
    178 
    179 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
    180   UnicodeText t;
    181   if (do_copy) {
    182     t.CopyUTF8(utf8_buf, len);
    183   } else {
    184     t.PointToUTF8(utf8_buf, len);
    185   }
    186   return t;
    187 }
    188 
    189 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
    190   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
    191 }
    192 
    193 }  // namespace libtextclassifier
    194