1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/utf8/unicodetext.h" 18 19 #include <string.h> 20 21 #include <algorithm> 22 23 #include "util/strings/utf8.h" 24 25 namespace libtextclassifier2 { 26 27 // *************** Data representation ********** 28 // Note: the copy constructor is undefined. 29 30 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) { 31 if (ours_ && data_) delete[] data_; 32 data_ = src.data_; 33 size_ = src.size_; 34 capacity_ = src.capacity_; 35 ours_ = src.ours_; 36 src.ours_ = false; 37 return *this; 38 } 39 40 void UnicodeText::Repr::PointTo(const char* data, int size) { 41 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 42 data_ = const_cast<char*>(data); 43 size_ = size; 44 capacity_ = size; 45 ours_ = false; 46 } 47 48 void UnicodeText::Repr::Copy(const char* data, int size) { 49 resize(size); 50 memcpy(data_, data, size); 51 } 52 53 void UnicodeText::Repr::resize(int new_size) { 54 if (new_size == 0) { 55 clear(); 56 } else { 57 if (!ours_ || new_size > capacity_) reserve(new_size); 58 // Clear the memory in the expanded part. 59 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); 60 size_ = new_size; 61 ours_ = true; 62 } 63 } 64 65 void UnicodeText::Repr::reserve(int new_capacity) { 66 // If there's already enough capacity, and we're an owner, do nothing. 67 if (capacity_ >= new_capacity && ours_) return; 68 69 // Otherwise, allocate a new buffer. 70 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); 71 char* new_data = new char[capacity_]; 72 73 // If there is an old buffer, copy it into the new buffer. 74 if (data_) { 75 memcpy(new_data, data_, size_); 76 if (ours_) delete[] data_; // If we owned the old buffer, free it. 77 } 78 data_ = new_data; 79 ours_ = true; // We own the new buffer. 80 // size_ is unchanged. 81 } 82 83 void UnicodeText::Repr::append(const char* bytes, int byte_length) { 84 reserve(size_ + byte_length); 85 memcpy(data_ + size_, bytes, byte_length); 86 size_ += byte_length; 87 } 88 89 void UnicodeText::Repr::clear() { 90 if (ours_) delete[] data_; 91 data_ = nullptr; 92 size_ = capacity_ = 0; 93 ours_ = true; 94 } 95 96 // *************** UnicodeText ****************** 97 98 UnicodeText::UnicodeText() {} 99 100 UnicodeText::UnicodeText(const UnicodeText& src) { Copy(src); } 101 102 UnicodeText& UnicodeText::operator=(UnicodeText&& src) { 103 this->repr_ = std::move(src.repr_); 104 return *this; 105 } 106 107 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { 108 repr_.Copy(src.repr_.data_, src.repr_.size_); 109 return *this; 110 } 111 112 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { 113 repr_.PointTo(buffer, byte_length); 114 return *this; 115 } 116 117 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { 118 repr_.Copy(buffer, byte_length); 119 return *this; 120 } 121 122 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) { 123 repr_.append(utf8, len); 124 return *this; 125 } 126 127 const char* UnicodeText::data() const { return repr_.data_; } 128 129 int UnicodeText::size_bytes() const { return repr_.size_; } 130 131 namespace { 132 133 enum { 134 RuneError = 0xFFFD, // Decoding error in UTF. 135 RuneMax = 0x10FFFF, // Maximum rune value. 136 }; 137 138 int runetochar(const char32 rune, char* dest) { 139 // Convert to unsigned for range check. 140 uint32 c; 141 142 // 1 char 00-7F 143 c = rune; 144 if (c <= 0x7F) { 145 dest[0] = static_cast<char>(c); 146 return 1; 147 } 148 149 // 2 char 0080-07FF 150 if (c <= 0x07FF) { 151 dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6); 152 dest[1] = 0x80 | (c & 0x3F); 153 return 2; 154 } 155 156 // Range check 157 if (c > RuneMax) { 158 c = RuneError; 159 } 160 161 // 3 char 0800-FFFF 162 if (c <= 0xFFFF) { 163 dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6); 164 dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F); 165 dest[2] = 0x80 | (c & 0x3F); 166 return 3; 167 } 168 169 // 4 char 10000-1FFFFF 170 dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6); 171 dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F); 172 dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F); 173 dest[3] = 0x80 | (c & 0x3F); 174 return 4; 175 } 176 177 } // namespace 178 179 UnicodeText& UnicodeText::AppendCodepoint(char32 ch) { 180 char str[4]; 181 int char_len = runetochar(ch, str); 182 repr_.append(str, char_len); 183 return *this; 184 } 185 186 void UnicodeText::clear() { repr_.clear(); } 187 188 int UnicodeText::size_codepoints() const { 189 return std::distance(begin(), end()); 190 } 191 192 bool UnicodeText::empty() const { return size_bytes() == 0; } 193 194 bool UnicodeText::is_valid() const { 195 return IsValidUTF8(repr_.data_, repr_.size_); 196 } 197 198 bool UnicodeText::operator==(const UnicodeText& other) const { 199 if (repr_.size_ != other.repr_.size_) { 200 return false; 201 } 202 return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0; 203 } 204 205 std::string UnicodeText::ToUTF8String() const { 206 return UTF8Substring(begin(), end()); 207 } 208 209 std::string UnicodeText::UTF8Substring(const const_iterator& first, 210 const const_iterator& last) { 211 return std::string(first.it_, last.it_ - first.it_); 212 } 213 214 UnicodeText::~UnicodeText() {} 215 216 // ******************* UnicodeText::const_iterator ********************* 217 218 // The implementation of const_iterator would be nicer if it 219 // inherited from boost::iterator_facade 220 // (http://boost.org/libs/iterator/doc/iterator_facade.html). 221 222 UnicodeText::const_iterator::const_iterator() : it_(0) {} 223 224 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=( 225 const const_iterator& other) { 226 if (&other != this) it_ = other.it_; 227 return *this; 228 } 229 230 UnicodeText::const_iterator UnicodeText::begin() const { 231 return const_iterator(repr_.data_); 232 } 233 234 UnicodeText::const_iterator UnicodeText::end() const { 235 return const_iterator(repr_.data_ + repr_.size_); 236 } 237 238 bool operator<(const UnicodeText::const_iterator& lhs, 239 const UnicodeText::const_iterator& rhs) { 240 return lhs.it_ < rhs.it_; 241 } 242 243 char32 UnicodeText::const_iterator::operator*() const { 244 // (We could call chartorune here, but that does some 245 // error-checking, and we're guaranteed that our data is valid 246 // UTF-8. Also, we expect this routine to be called very often. So 247 // for speed, we do the calculation ourselves.) 248 249 // Convert from UTF-8 250 unsigned char byte1 = static_cast<unsigned char>(it_[0]); 251 if (byte1 < 0x80) return byte1; 252 253 unsigned char byte2 = static_cast<unsigned char>(it_[1]); 254 if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F); 255 256 unsigned char byte3 = static_cast<unsigned char>(it_[2]); 257 if (byte1 < 0xF0) { 258 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F); 259 } 260 261 unsigned char byte4 = static_cast<unsigned char>(it_[3]); 262 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | 263 ((byte3 & 0x3F) << 6) | (byte4 & 0x3F); 264 } 265 266 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { 267 it_ += GetNumBytesForNonZeroUTF8Char(it_); 268 return *this; 269 } 270 271 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { 272 while (IsTrailByte(*--it_)) { 273 } 274 return *this; 275 } 276 277 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) { 278 UnicodeText t; 279 if (do_copy) { 280 t.CopyUTF8(utf8_buf, len); 281 } else { 282 t.PointToUTF8(utf8_buf, len); 283 } 284 return t; 285 } 286 287 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) { 288 return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy); 289 } 290 291 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) { 292 return UTF8ToUnicodeText(str.data(), str.size(), do_copy); 293 } 294 295 UnicodeText UTF8ToUnicodeText(const std::string& str) { 296 return UTF8ToUnicodeText(str, /*do_copy=*/true); 297 } 298 299 } // namespace libtextclassifier2 300