1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/utf8/unicodetext.h" 18 19 #include "base.h" 20 #include "util/strings/utf8.h" 21 22 namespace libtextclassifier { 23 24 // *************** Data representation ********** 25 // Note: the copy constructor is undefined. 26 27 void UnicodeText::Repr::PointTo(const char* data, int size) { 28 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 29 data_ = const_cast<char*>(data); 30 size_ = size; 31 capacity_ = size; 32 ours_ = false; 33 } 34 35 void UnicodeText::Repr::Copy(const char* data, int size) { 36 resize(size); 37 memcpy(data_, data, size); 38 } 39 40 void UnicodeText::Repr::resize(int new_size) { 41 if (new_size == 0) { 42 clear(); 43 } else { 44 if (!ours_ || new_size > capacity_) reserve(new_size); 45 // Clear the memory in the expanded part. 46 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); 47 size_ = new_size; 48 ours_ = true; 49 } 50 } 51 52 void UnicodeText::Repr::reserve(int new_capacity) { 53 // If there's already enough capacity, and we're an owner, do nothing. 54 if (capacity_ >= new_capacity && ours_) return; 55 56 // Otherwise, allocate a new buffer. 57 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); 58 char* new_data = new char[capacity_]; 59 60 // If there is an old buffer, copy it into the new buffer. 61 if (data_) { 62 memcpy(new_data, data_, size_); 63 if (ours_) delete[] data_; // If we owned the old buffer, free it. 64 } 65 data_ = new_data; 66 ours_ = true; // We own the new buffer. 67 // size_ is unchanged. 68 } 69 70 void UnicodeText::Repr::append(const char* bytes, int byte_length) { 71 reserve(size_ + byte_length); 72 memcpy(data_ + size_, bytes, byte_length); 73 size_ += byte_length; 74 } 75 76 void UnicodeText::Repr::clear() { 77 if (ours_) delete[] data_; 78 data_ = nullptr; 79 size_ = capacity_ = 0; 80 ours_ = true; 81 } 82 83 // *************** UnicodeText ****************** 84 85 UnicodeText::UnicodeText() {} 86 87 UnicodeText::UnicodeText(const UnicodeText& src) { Copy(src); } 88 89 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { 90 repr_.Copy(src.repr_.data_, src.repr_.size_); 91 return *this; 92 } 93 94 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { 95 repr_.PointTo(buffer, byte_length); 96 return *this; 97 } 98 99 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { 100 repr_.Copy(buffer, byte_length); 101 return *this; 102 } 103 104 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) { 105 repr_.append(utf8, len); 106 return *this; 107 } 108 109 void UnicodeText::clear() { repr_.clear(); } 110 111 std::string UnicodeText::UTF8Substring(const const_iterator& first, 112 const const_iterator& last) { 113 return std::string(first.it_, last.it_ - first.it_); 114 } 115 116 UnicodeText::~UnicodeText() {} 117 118 // ******************* UnicodeText::const_iterator ********************* 119 120 // The implementation of const_iterator would be nicer if it 121 // inherited from boost::iterator_facade 122 // (http://boost.org/libs/iterator/doc/iterator_facade.html). 123 124 UnicodeText::const_iterator::const_iterator() : it_(0) {} 125 126 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=( 127 const const_iterator& other) { 128 if (&other != this) it_ = other.it_; 129 return *this; 130 } 131 132 UnicodeText::const_iterator UnicodeText::begin() const { 133 return const_iterator(repr_.data_); 134 } 135 136 UnicodeText::const_iterator UnicodeText::end() const { 137 return const_iterator(repr_.data_ + repr_.size_); 138 } 139 140 bool operator<(const UnicodeText::const_iterator& lhs, 141 const UnicodeText::const_iterator& rhs) { 142 return lhs.it_ < rhs.it_; 143 } 144 145 char32 UnicodeText::const_iterator::operator*() const { 146 // (We could call chartorune here, but that does some 147 // error-checking, and we're guaranteed that our data is valid 148 // UTF-8. Also, we expect this routine to be called very often. So 149 // for speed, we do the calculation ourselves.) 150 151 // Convert from UTF-8 152 unsigned char byte1 = static_cast<unsigned char>(it_[0]); 153 if (byte1 < 0x80) return byte1; 154 155 unsigned char byte2 = static_cast<unsigned char>(it_[1]); 156 if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F); 157 158 unsigned char byte3 = static_cast<unsigned char>(it_[2]); 159 if (byte1 < 0xF0) { 160 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F); 161 } 162 163 unsigned char byte4 = static_cast<unsigned char>(it_[3]); 164 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | 165 ((byte3 & 0x3F) << 6) | (byte4 & 0x3F); 166 } 167 168 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { 169 it_ += GetNumBytesForNonZeroUTF8Char(it_); 170 return *this; 171 } 172 173 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { 174 while (IsTrailByte(*--it_)) { 175 } 176 return *this; 177 } 178 179 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) { 180 UnicodeText t; 181 if (do_copy) { 182 t.CopyUTF8(utf8_buf, len); 183 } else { 184 t.PointToUTF8(utf8_buf, len); 185 } 186 return t; 187 } 188 189 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) { 190 return UTF8ToUnicodeText(str.data(), str.size(), do_copy); 191 } 192 193 } // namespace libtextclassifier 194