1 // Copyright (C) 2006 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Author: Jim Meehan 16 17 #include <iostream> 18 #include <sstream> 19 #include <cassert> 20 21 #include "phonenumbers/utf/unicodetext.h" 22 #include "phonenumbers/utf/stringpiece.h" 23 //#include "utf/stringprintf.h" 24 #include "phonenumbers/utf/utf.h" 25 #include "phonenumbers/utf/unilib.h" 26 27 namespace i18n { 28 namespace phonenumbers { 29 30 using std::stringstream; 31 using std::max; 32 using std::hex; 33 using std::dec; 34 using std::cerr; 35 using std::endl; 36 37 static int CodepointDistance(const char* start, const char* end) { 38 int n = 0; 39 // Increment n on every non-trail-byte. 40 for (const char* p = start; p < end; ++p) { 41 n += (*reinterpret_cast<const signed char*>(p) >= -0x40); 42 } 43 return n; 44 } 45 46 static int CodepointCount(const char* utf8, int len) { 47 return CodepointDistance(utf8, utf8 + len); 48 } 49 50 UnicodeText::const_iterator::difference_type 51 distance(const UnicodeText::const_iterator& first, 52 const UnicodeText::const_iterator& last) { 53 return CodepointDistance(first.it_, last.it_); 54 } 55 56 // ---------- Utility ---------- 57 58 static int ConvertToInterchangeValid(char* start, int len) { 59 // This routine is called only when we've discovered that a UTF-8 buffer 60 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 61 // was not interchange valid. This indicates a bug in the caller, and 62 // a LOG(WARNING) is done in that case. 63 // This is similar to CoerceToInterchangeValid, but it replaces each 64 // structurally valid byte with a space, and each non-interchange 65 // character with a space, even when that character requires more 66 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is 67 // structurally valid UTF8, but U+FDD0 is not an interchange-valid 68 // code point. The result should contain one space, not three. 69 // 70 // Since the conversion never needs to write more data than it 71 // reads, it is safe to change the buffer in place. It returns the 72 // number of bytes written. 73 char* const in = start; 74 char* out = start; 75 char* const end = start + len; 76 while (start < end) { 77 int good = UniLib::SpanInterchangeValid(start, end - start); 78 if (good > 0) { 79 if (out != start) { 80 memmove(out, start, good); 81 } 82 out += good; 83 start += good; 84 if (start == end) { 85 break; 86 } 87 } 88 // Is the current string invalid UTF8 or just non-interchange UTF8? 89 char32 rune; 90 int n; 91 if (isvalidcharntorune(start, end - start, &rune, &n)) { 92 // structurally valid UTF8, but not interchange valid 93 start += n; // Skip over the whole character. 94 } else { // bad UTF8 95 start += 1; // Skip over just one byte 96 } 97 *out++ = ' '; 98 } 99 return out - in; 100 } 101 102 103 // *************** Data representation ********** 104 105 // Note: the copy constructor is undefined. 106 107 // After reserve(), resize(), or clear(), we're an owner, not an alias. 108 109 void UnicodeText::Repr::reserve(int new_capacity) { 110 // If there's already enough capacity, and we're an owner, do nothing. 111 if (capacity_ >= new_capacity && ours_) return; 112 113 // Otherwise, allocate a new buffer. 114 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); 115 char* new_data = new char[capacity_]; 116 117 // If there is an old buffer, copy it into the new buffer. 118 if (data_) { 119 memcpy(new_data, data_, size_); 120 if (ours_) delete[] data_; // If we owned the old buffer, free it. 121 } 122 data_ = new_data; 123 ours_ = true; // We own the new buffer. 124 // size_ is unchanged. 125 } 126 127 void UnicodeText::Repr::resize(int new_size) { 128 if (new_size == 0) { 129 clear(); 130 } else { 131 if (!ours_ || new_size > capacity_) reserve(new_size); 132 // Clear the memory in the expanded part. 133 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); 134 size_ = new_size; 135 ours_ = true; 136 } 137 } 138 139 // This implementation of clear() deallocates the buffer if we're an owner. 140 // That's not strictly necessary; we could just set size_ to 0. 141 void UnicodeText::Repr::clear() { 142 if (ours_) delete[] data_; 143 data_ = NULL; 144 size_ = capacity_ = 0; 145 ours_ = true; 146 } 147 148 void UnicodeText::Repr::Copy(const char* data, int size) { 149 resize(size); 150 memcpy(data_, data, size); 151 } 152 153 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { 154 if (data == data_) return; // We already own this memory. (Weird case.) 155 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 156 data_ = data; 157 size_ = size; 158 capacity_ = capacity; 159 ours_ = true; 160 } 161 162 void UnicodeText::Repr::PointTo(const char* data, int size) { 163 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 164 data_ = const_cast<char*>(data); 165 size_ = size; 166 capacity_ = size; 167 ours_ = false; 168 } 169 170 void UnicodeText::Repr::append(const char* bytes, int byte_length) { 171 reserve(size_ + byte_length); 172 memcpy(data_ + size_, bytes, byte_length); 173 size_ += byte_length; 174 } 175 176 string UnicodeText::Repr::DebugString() const { 177 stringstream ss; 178 179 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec 180 << size_ << " capacity=" << capacity_ << " " 181 << (ours_ ? "Owned" : "Alias") << "}"; 182 183 string result; 184 ss >> result; 185 186 return result; 187 } 188 189 190 191 // *************** UnicodeText ****************** 192 193 // ----- Constructors ----- 194 195 // Default constructor 196 UnicodeText::UnicodeText() { 197 } 198 199 // Copy constructor 200 UnicodeText::UnicodeText(const UnicodeText& src) { 201 Copy(src); 202 } 203 204 // Substring constructor 205 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, 206 const UnicodeText::const_iterator& last) { 207 assert(first <= last && "Incompatible iterators"); 208 repr_.append(first.it_, last.it_ - first.it_); 209 } 210 211 string UnicodeText::UTF8Substring(const const_iterator& first, 212 const const_iterator& last) { 213 assert(first <= last && "Incompatible iterators"); 214 return string(first.it_, last.it_ - first.it_); 215 } 216 217 218 // ----- Copy ----- 219 220 UnicodeText& UnicodeText::operator=(const UnicodeText& src) { 221 if (this != &src) { 222 Copy(src); 223 } 224 return *this; 225 } 226 227 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { 228 repr_.Copy(src.repr_.data_, src.repr_.size_); 229 return *this; 230 } 231 232 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { 233 repr_.Copy(buffer, byte_length); 234 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 235 cerr << "UTF-8 buffer is not interchange-valid." << endl; 236 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 237 } 238 return *this; 239 } 240 241 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, 242 int byte_length) { 243 repr_.Copy(buffer, byte_length); 244 return *this; 245 } 246 247 // ----- TakeOwnershipOf ----- 248 249 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, 250 int byte_length, 251 int byte_capacity) { 252 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 253 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 254 cerr << "UTF-8 buffer is not interchange-valid." << endl; 255 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 256 } 257 return *this; 258 } 259 260 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, 261 int byte_length, 262 int byte_capacity) { 263 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 264 return *this; 265 } 266 267 // ----- PointTo ----- 268 269 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { 270 if (UniLib:: IsInterchangeValid(buffer, byte_length)) { 271 repr_.PointTo(buffer, byte_length); 272 } else { 273 cerr << "UTF-8 buffer is not interchange-valid." << endl; 274 repr_.Copy(buffer, byte_length); 275 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 276 } 277 return *this; 278 } 279 280 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, 281 int byte_length) { 282 repr_.PointTo(buffer, byte_length); 283 return *this; 284 } 285 286 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { 287 repr_.PointTo(src.repr_.data_, src.repr_.size_); 288 return *this; 289 } 290 291 UnicodeText& UnicodeText::PointTo(const const_iterator &first, 292 const const_iterator &last) { 293 assert(first <= last && " Incompatible iterators"); 294 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); 295 return *this; 296 } 297 298 // ----- Append ----- 299 300 UnicodeText& UnicodeText::append(const UnicodeText& u) { 301 repr_.append(u.repr_.data_, u.repr_.size_); 302 return *this; 303 } 304 305 UnicodeText& UnicodeText::append(const const_iterator& first, 306 const const_iterator& last) { 307 assert(first <= last && "Incompatible iterators"); 308 repr_.append(first.it_, last.it_ - first.it_); 309 return *this; 310 } 311 312 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { 313 repr_.append(utf8, len); 314 return *this; 315 } 316 317 // ----- substring searching ----- 318 319 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, 320 const_iterator start_pos) const { 321 assert(start_pos.utf8_data() >= utf8_data()); 322 assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); 323 return UnsafeFind(look, start_pos); 324 } 325 326 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { 327 return UnsafeFind(look, begin()); 328 } 329 330 UnicodeText::const_iterator UnicodeText::UnsafeFind( 331 const UnicodeText& look, const_iterator start_pos) const { 332 // Due to the magic of the UTF8 encoding, searching for a sequence of 333 // letters is equivalent to substring search. 334 StringPiece searching(utf8_data(), utf8_length()); 335 StringPiece look_piece(look.utf8_data(), look.utf8_length()); 336 StringPiece::size_type found = 337 searching.find(look_piece, start_pos.utf8_data() - utf8_data()); 338 if (found == StringPiece::npos) return end(); 339 return const_iterator(utf8_data() + found); 340 } 341 342 bool UnicodeText::HasReplacementChar() const { 343 // Equivalent to: 344 // UnicodeText replacement_char; 345 // replacement_char.push_back(0xFFFD); 346 // return find(replacement_char) != end(); 347 StringPiece searching(utf8_data(), utf8_length()); 348 StringPiece looking_for("\xEF\xBF\xBD", 3); 349 return searching.find(looking_for) != StringPiece::npos; 350 } 351 352 // ----- other methods ----- 353 354 // Clear operator 355 void UnicodeText::clear() { 356 repr_.clear(); 357 } 358 359 // Destructor 360 UnicodeText::~UnicodeText() {} 361 362 363 void UnicodeText::push_back(char32 c) { 364 if (UniLib::IsValidCodepoint(c)) { 365 char buf[UTFmax]; 366 int len = runetochar(buf, &c); 367 if (UniLib::IsInterchangeValid(buf, len)) { 368 repr_.append(buf, len); 369 } else { 370 cerr << "Unicode value 0x" << hex << c 371 << " is not valid for interchange" << endl; 372 repr_.append(" ", 1); 373 } 374 } else { 375 cerr << "Illegal Unicode value: 0x" << hex << c << endl; 376 repr_.append(" ", 1); 377 } 378 } 379 380 int UnicodeText::size() const { 381 return CodepointCount(repr_.data_, repr_.size_); 382 } 383 384 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { 385 if (&lhs == &rhs) return true; 386 if (lhs.repr_.size_ != rhs.repr_.size_) return false; 387 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; 388 } 389 390 string UnicodeText::DebugString() const { 391 stringstream ss; 392 393 ss << "{UnicodeText " << hex << this << dec << " chars=" 394 << size() << " repr=" << repr_.DebugString() << "}"; 395 #if 0 396 return StringPrintf("{UnicodeText %p chars=%d repr=%s}", 397 this, 398 size(), 399 repr_.DebugString().c_str()); 400 #endif 401 string result; 402 ss >> result; 403 404 return result; 405 } 406 407 408 // ******************* UnicodeText::const_iterator ********************* 409 410 // The implementation of const_iterator would be nicer if it 411 // inherited from boost::iterator_facade 412 // (http://boost.org/libs/iterator/doc/iterator_facade.html). 413 414 UnicodeText::const_iterator::const_iterator() : it_(0) {} 415 416 UnicodeText::const_iterator::const_iterator(const const_iterator& other) 417 : it_(other.it_) { 418 } 419 420 UnicodeText::const_iterator& 421 UnicodeText::const_iterator::operator=(const const_iterator& other) { 422 if (&other != this) 423 it_ = other.it_; 424 return *this; 425 } 426 427 UnicodeText::const_iterator UnicodeText::begin() const { 428 return const_iterator(repr_.data_); 429 } 430 431 UnicodeText::const_iterator UnicodeText::end() const { 432 return const_iterator(repr_.data_ + repr_.size_); 433 } 434 435 bool operator<(const UnicodeText::const_iterator& lhs, 436 const UnicodeText::const_iterator& rhs) { 437 return lhs.it_ < rhs.it_; 438 } 439 440 char32 UnicodeText::const_iterator::operator*() const { 441 // (We could call chartorune here, but that does some 442 // error-checking, and we're guaranteed that our data is valid 443 // UTF-8. Also, we expect this routine to be called very often. So 444 // for speed, we do the calculation ourselves.) 445 446 // Convert from UTF-8 447 uint8 byte1 = static_cast<uint8>(it_[0]); 448 if (byte1 < 0x80) 449 return byte1; 450 451 uint8 byte2 = static_cast<uint8>(it_[1]); 452 if (byte1 < 0xE0) 453 return ((byte1 & 0x1F) << 6) 454 | (byte2 & 0x3F); 455 456 uint8 byte3 = static_cast<uint8>(it_[2]); 457 if (byte1 < 0xF0) 458 return ((byte1 & 0x0F) << 12) 459 | ((byte2 & 0x3F) << 6) 460 | (byte3 & 0x3F); 461 462 uint8 byte4 = static_cast<uint8>(it_[3]); 463 return ((byte1 & 0x07) << 18) 464 | ((byte2 & 0x3F) << 12) 465 | ((byte3 & 0x3F) << 6) 466 | (byte4 & 0x3F); 467 } 468 469 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { 470 it_ += UniLib::OneCharLen(it_); 471 return *this; 472 } 473 474 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { 475 while (UniLib::IsTrailByte(*--it_)) { } 476 return *this; 477 } 478 479 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { 480 utf8_output[0] = it_[0]; 481 if (static_cast<unsigned char>(it_[0]) < 0x80) 482 return 1; 483 484 utf8_output[1] = it_[1]; 485 if (static_cast<unsigned char>(it_[0]) < 0xE0) 486 return 2; 487 488 utf8_output[2] = it_[2]; 489 if (static_cast<unsigned char>(it_[0]) < 0xF0) 490 return 3; 491 492 utf8_output[3] = it_[3]; 493 return 4; 494 } 495 496 497 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { 498 assert(p != NULL); 499 const char* start = utf8_data(); 500 int len = utf8_length(); 501 const char* end = start + len; 502 assert(p >= start); 503 assert(p <= end); 504 assert(p == end || !UniLib::IsTrailByte(*p)); 505 return const_iterator(p); 506 } 507 508 string UnicodeText::const_iterator::DebugString() const { 509 stringstream ss; 510 511 ss << "{iter " << hex << it_ << "}"; 512 string result; 513 ss >> result; 514 515 return result; 516 } 517 518 } // namespace phonenumbers 519 } // namespace i18n 520