Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
#include "util/utf8/unicodetext.h"

#include <string.h>

#include <algorithm>
#include <limits>

#include "util/strings/utf8.h"
     24 
     25 namespace libtextclassifier2 {
     26 
     27 // *************** Data representation **********
     28 // Note: the copy constructor is undefined.
     29 
     30 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
     31   if (ours_ && data_) delete[] data_;
     32   data_ = src.data_;
     33   size_ = src.size_;
     34   capacity_ = src.capacity_;
     35   ours_ = src.ours_;
     36   src.ours_ = false;
     37   return *this;
     38 }
     39 
     40 void UnicodeText::Repr::PointTo(const char* data, int size) {
     41   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
     42   data_ = const_cast<char*>(data);
     43   size_ = size;
     44   capacity_ = size;
     45   ours_ = false;
     46 }
     47 
     48 void UnicodeText::Repr::Copy(const char* data, int size) {
     49   resize(size);
     50   memcpy(data_, data, size);
     51 }
     52 
     53 void UnicodeText::Repr::resize(int new_size) {
     54   if (new_size == 0) {
     55     clear();
     56   } else {
     57     if (!ours_ || new_size > capacity_) reserve(new_size);
     58     // Clear the memory in the expanded part.
     59     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
     60     size_ = new_size;
     61     ours_ = true;
     62   }
     63 }
     64 
     65 void UnicodeText::Repr::reserve(int new_capacity) {
     66   // If there's already enough capacity, and we're an owner, do nothing.
     67   if (capacity_ >= new_capacity && ours_) return;
     68 
     69   // Otherwise, allocate a new buffer.
     70   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
     71   char* new_data = new char[capacity_];
     72 
     73   // If there is an old buffer, copy it into the new buffer.
     74   if (data_) {
     75     memcpy(new_data, data_, size_);
     76     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
     77   }
     78   data_ = new_data;
     79   ours_ = true;  // We own the new buffer.
     80   // size_ is unchanged.
     81 }
     82 
     83 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
     84   reserve(size_ + byte_length);
     85   memcpy(data_ + size_, bytes, byte_length);
     86   size_ += byte_length;
     87 }
     88 
     89 void UnicodeText::Repr::clear() {
     90   if (ours_) delete[] data_;
     91   data_ = nullptr;
     92   size_ = capacity_ = 0;
     93   ours_ = true;
     94 }
     95 
     96 // *************** UnicodeText ******************
     97 
     98 UnicodeText::UnicodeText() {}
     99 
    100 UnicodeText::UnicodeText(const UnicodeText& src) { Copy(src); }
    101 
    102 UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
    103   this->repr_ = std::move(src.repr_);
    104   return *this;
    105 }
    106 
    107 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
    108   repr_.Copy(src.repr_.data_, src.repr_.size_);
    109   return *this;
    110 }
    111 
    112 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
    113   repr_.PointTo(buffer, byte_length);
    114   return *this;
    115 }
    116 
    117 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
    118   repr_.Copy(buffer, byte_length);
    119   return *this;
    120 }
    121 
    122 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
    123   repr_.append(utf8, len);
    124   return *this;
    125 }
    126 
    127 const char* UnicodeText::data() const { return repr_.data_; }
    128 
    129 int UnicodeText::size_bytes() const { return repr_.size_; }
    130 
    131 namespace {
    132 
    133 enum {
    134   RuneError = 0xFFFD,  // Decoding error in UTF.
    135   RuneMax = 0x10FFFF,  // Maximum rune value.
    136 };
    137 
    138 int runetochar(const char32 rune, char* dest) {
    139   // Convert to unsigned for range check.
    140   uint32 c;
    141 
    142   // 1 char 00-7F
    143   c = rune;
    144   if (c <= 0x7F) {
    145     dest[0] = static_cast<char>(c);
    146     return 1;
    147   }
    148 
    149   // 2 char 0080-07FF
    150   if (c <= 0x07FF) {
    151     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
    152     dest[1] = 0x80 | (c & 0x3F);
    153     return 2;
    154   }
    155 
    156   // Range check
    157   if (c > RuneMax) {
    158     c = RuneError;
    159   }
    160 
    161   // 3 char 0800-FFFF
    162   if (c <= 0xFFFF) {
    163     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
    164     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
    165     dest[2] = 0x80 | (c & 0x3F);
    166     return 3;
    167   }
    168 
    169   // 4 char 10000-1FFFFF
    170   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
    171   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
    172   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
    173   dest[3] = 0x80 | (c & 0x3F);
    174   return 4;
    175 }
    176 
    177 }  // namespace
    178 
    179 UnicodeText& UnicodeText::AppendCodepoint(char32 ch) {
    180   char str[4];
    181   int char_len = runetochar(ch, str);
    182   repr_.append(str, char_len);
    183   return *this;
    184 }
    185 
    186 void UnicodeText::clear() { repr_.clear(); }
    187 
    188 int UnicodeText::size_codepoints() const {
    189   return std::distance(begin(), end());
    190 }
    191 
    192 bool UnicodeText::empty() const { return size_bytes() == 0; }
    193 
    194 bool UnicodeText::is_valid() const {
    195   return IsValidUTF8(repr_.data_, repr_.size_);
    196 }
    197 
    198 bool UnicodeText::operator==(const UnicodeText& other) const {
    199   if (repr_.size_ != other.repr_.size_) {
    200     return false;
    201   }
    202   return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
    203 }
    204 
    205 std::string UnicodeText::ToUTF8String() const {
    206   return UTF8Substring(begin(), end());
    207 }
    208 
    209 std::string UnicodeText::UTF8Substring(const const_iterator& first,
    210                                        const const_iterator& last) {
    211   return std::string(first.it_, last.it_ - first.it_);
    212 }
    213 
    214 UnicodeText::~UnicodeText() {}
    215 
    216 // ******************* UnicodeText::const_iterator *********************
    217 
    218 // The implementation of const_iterator would be nicer if it
    219 // inherited from boost::iterator_facade
    220 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
    221 
    222 UnicodeText::const_iterator::const_iterator() : it_(0) {}
    223 
    224 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
    225     const const_iterator& other) {
    226   if (&other != this) it_ = other.it_;
    227   return *this;
    228 }
    229 
    230 UnicodeText::const_iterator UnicodeText::begin() const {
    231   return const_iterator(repr_.data_);
    232 }
    233 
    234 UnicodeText::const_iterator UnicodeText::end() const {
    235   return const_iterator(repr_.data_ + repr_.size_);
    236 }
    237 
    238 bool operator<(const UnicodeText::const_iterator& lhs,
    239                const UnicodeText::const_iterator& rhs) {
    240   return lhs.it_ < rhs.it_;
    241 }
    242 
    243 char32 UnicodeText::const_iterator::operator*() const {
    244   // (We could call chartorune here, but that does some
    245   // error-checking, and we're guaranteed that our data is valid
    246   // UTF-8. Also, we expect this routine to be called very often. So
    247   // for speed, we do the calculation ourselves.)
    248 
    249   // Convert from UTF-8
    250   unsigned char byte1 = static_cast<unsigned char>(it_[0]);
    251   if (byte1 < 0x80) return byte1;
    252 
    253   unsigned char byte2 = static_cast<unsigned char>(it_[1]);
    254   if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
    255 
    256   unsigned char byte3 = static_cast<unsigned char>(it_[2]);
    257   if (byte1 < 0xF0) {
    258     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
    259   }
    260 
    261   unsigned char byte4 = static_cast<unsigned char>(it_[3]);
    262   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
    263          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
    264 }
    265 
    266 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
    267   it_ += GetNumBytesForNonZeroUTF8Char(it_);
    268   return *this;
    269 }
    270 
    271 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
    272   while (IsTrailByte(*--it_)) {
    273   }
    274   return *this;
    275 }
    276 
    277 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
    278   UnicodeText t;
    279   if (do_copy) {
    280     t.CopyUTF8(utf8_buf, len);
    281   } else {
    282     t.PointToUTF8(utf8_buf, len);
    283   }
    284   return t;
    285 }
    286 
    287 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
    288   return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
    289 }
    290 
    291 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
    292   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
    293 }
    294 
    295 UnicodeText UTF8ToUnicodeText(const std::string& str) {
    296   return UTF8ToUnicodeText(str, /*do_copy=*/true);
    297 }
    298 
    299 }  // namespace libtextclassifier2
    300