1 // Copyright (C) 2006 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Author: Jim Meehan 16 17 #include <algorithm> 18 #include <sstream> 19 #include <cassert> 20 #include <cstdio> 21 22 #include "phonenumbers/utf/unicodetext.h" 23 #include "phonenumbers/utf/stringpiece.h" 24 //#include "utf/stringprintf.h" 25 #include "phonenumbers/utf/utf.h" 26 #include "phonenumbers/utf/unilib.h" 27 28 namespace i18n { 29 namespace phonenumbers { 30 31 using std::stringstream; 32 using std::max; 33 using std::hex; 34 using std::dec; 35 36 static int CodepointDistance(const char* start, const char* end) { 37 int n = 0; 38 // Increment n on every non-trail-byte. 39 for (const char* p = start; p < end; ++p) { 40 n += (*reinterpret_cast<const signed char*>(p) >= -0x40); 41 } 42 return n; 43 } 44 45 static int CodepointCount(const char* utf8, int len) { 46 return CodepointDistance(utf8, utf8 + len); 47 } 48 49 UnicodeText::const_iterator::difference_type 50 distance(const UnicodeText::const_iterator& first, 51 const UnicodeText::const_iterator& last) { 52 return CodepointDistance(first.it_, last.it_); 53 } 54 55 // ---------- Utility ---------- 56 57 static int ConvertToInterchangeValid(char* start, int len) { 58 // This routine is called only when we've discovered that a UTF-8 buffer 59 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 60 // was not interchange valid. This indicates a bug in the caller, and 61 // a LOG(WARNING) is done in that case. 62 // This is similar to CoerceToInterchangeValid, but it replaces each 63 // structurally valid byte with a space, and each non-interchange 64 // character with a space, even when that character requires more 65 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is 66 // structurally valid UTF8, but U+FDD0 is not an interchange-valid 67 // code point. The result should contain one space, not three. 68 // 69 // Since the conversion never needs to write more data than it 70 // reads, it is safe to change the buffer in place. It returns the 71 // number of bytes written. 72 char* const in = start; 73 char* out = start; 74 char* const end = start + len; 75 while (start < end) { 76 int good = UniLib::SpanInterchangeValid(start, end - start); 77 if (good > 0) { 78 if (out != start) { 79 memmove(out, start, good); 80 } 81 out += good; 82 start += good; 83 if (start == end) { 84 break; 85 } 86 } 87 // Is the current string invalid UTF8 or just non-interchange UTF8? 88 char32 rune; 89 int n; 90 if (isvalidcharntorune(start, end - start, &rune, &n)) { 91 // structurally valid UTF8, but not interchange valid 92 start += n; // Skip over the whole character. 93 } else { // bad UTF8 94 start += 1; // Skip over just one byte 95 } 96 *out++ = ' '; 97 } 98 return out - in; 99 } 100 101 102 // *************** Data representation ********** 103 104 // Note: the copy constructor is undefined. 105 106 // After reserve(), resize(), or clear(), we're an owner, not an alias. 107 108 void UnicodeText::Repr::reserve(int new_capacity) { 109 // If there's already enough capacity, and we're an owner, do nothing. 110 if (capacity_ >= new_capacity && ours_) return; 111 112 // Otherwise, allocate a new buffer. 113 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); 114 char* new_data = new char[capacity_]; 115 116 // If there is an old buffer, copy it into the new buffer. 117 if (data_) { 118 memcpy(new_data, data_, size_); 119 if (ours_) delete[] data_; // If we owned the old buffer, free it. 120 } 121 data_ = new_data; 122 ours_ = true; // We own the new buffer. 123 // size_ is unchanged. 124 } 125 126 void UnicodeText::Repr::resize(int new_size) { 127 if (new_size == 0) { 128 clear(); 129 } else { 130 if (!ours_ || new_size > capacity_) reserve(new_size); 131 // Clear the memory in the expanded part. 132 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); 133 size_ = new_size; 134 ours_ = true; 135 } 136 } 137 138 // This implementation of clear() deallocates the buffer if we're an owner. 139 // That's not strictly necessary; we could just set size_ to 0. 140 void UnicodeText::Repr::clear() { 141 if (ours_) delete[] data_; 142 data_ = NULL; 143 size_ = capacity_ = 0; 144 ours_ = true; 145 } 146 147 void UnicodeText::Repr::Copy(const char* data, int size) { 148 resize(size); 149 memcpy(data_, data, size); 150 } 151 152 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { 153 if (data == data_) return; // We already own this memory. (Weird case.) 154 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 155 data_ = data; 156 size_ = size; 157 capacity_ = capacity; 158 ours_ = true; 159 } 160 161 void UnicodeText::Repr::PointTo(const char* data, int size) { 162 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 163 data_ = const_cast<char*>(data); 164 size_ = size; 165 capacity_ = size; 166 ours_ = false; 167 } 168 169 void UnicodeText::Repr::append(const char* bytes, int byte_length) { 170 reserve(size_ + byte_length); 171 memcpy(data_ + size_, bytes, byte_length); 172 size_ += byte_length; 173 } 174 175 string UnicodeText::Repr::DebugString() const { 176 stringstream ss; 177 178 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec 179 << size_ << " capacity=" << capacity_ << " " 180 << (ours_ ? "Owned" : "Alias") << "}"; 181 182 string result; 183 ss >> result; 184 185 return result; 186 } 187 188 189 190 // *************** UnicodeText ****************** 191 192 // ----- Constructors ----- 193 194 // Default constructor 195 UnicodeText::UnicodeText() { 196 } 197 198 // Copy constructor 199 UnicodeText::UnicodeText(const UnicodeText& src) { 200 Copy(src); 201 } 202 203 // Substring constructor 204 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, 205 const UnicodeText::const_iterator& last) { 206 assert(first <= last && "Incompatible iterators"); 207 repr_.append(first.it_, last.it_ - first.it_); 208 } 209 210 string UnicodeText::UTF8Substring(const const_iterator& first, 211 const const_iterator& last) { 212 assert(first <= last && "Incompatible iterators"); 213 return string(first.it_, last.it_ - first.it_); 214 } 215 216 217 // ----- Copy ----- 218 219 UnicodeText& UnicodeText::operator=(const UnicodeText& src) { 220 if (this != &src) { 221 Copy(src); 222 } 223 return *this; 224 } 225 226 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { 227 repr_.Copy(src.repr_.data_, src.repr_.size_); 228 return *this; 229 } 230 231 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { 232 repr_.Copy(buffer, byte_length); 233 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 234 fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n"); 235 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 236 } 237 return *this; 238 } 239 240 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, 241 int byte_length) { 242 repr_.Copy(buffer, byte_length); 243 return *this; 244 } 245 246 // ----- TakeOwnershipOf ----- 247 248 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, 249 int byte_length, 250 int byte_capacity) { 251 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 252 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 253 fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n"); 254 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 255 } 256 return *this; 257 } 258 259 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, 260 int byte_length, 261 int byte_capacity) { 262 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 263 return *this; 264 } 265 266 // ----- PointTo ----- 267 268 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { 269 if (UniLib:: IsInterchangeValid(buffer, byte_length)) { 270 repr_.PointTo(buffer, byte_length); 271 } else { 272 fprintf(stderr, "UTF-8 buffer is not interchange-valid."); 273 repr_.Copy(buffer, byte_length); 274 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 275 } 276 return *this; 277 } 278 279 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, 280 int byte_length) { 281 repr_.PointTo(buffer, byte_length); 282 return *this; 283 } 284 285 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { 286 repr_.PointTo(src.repr_.data_, src.repr_.size_); 287 return *this; 288 } 289 290 UnicodeText& UnicodeText::PointTo(const const_iterator &first, 291 const const_iterator &last) { 292 assert(first <= last && " Incompatible iterators"); 293 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); 294 return *this; 295 } 296 297 // ----- Append ----- 298 299 UnicodeText& UnicodeText::append(const UnicodeText& u) { 300 repr_.append(u.repr_.data_, u.repr_.size_); 301 return *this; 302 } 303 304 UnicodeText& UnicodeText::append(const const_iterator& first, 305 const const_iterator& last) { 306 assert(first <= last && "Incompatible iterators"); 307 repr_.append(first.it_, last.it_ - first.it_); 308 return *this; 309 } 310 311 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { 312 repr_.append(utf8, len); 313 return *this; 314 } 315 316 // ----- substring searching ----- 317 318 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, 319 const_iterator start_pos) const { 320 assert(start_pos.utf8_data() >= utf8_data()); 321 assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); 322 return UnsafeFind(look, start_pos); 323 } 324 325 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { 326 return UnsafeFind(look, begin()); 327 } 328 329 UnicodeText::const_iterator UnicodeText::UnsafeFind( 330 const UnicodeText& look, const_iterator start_pos) const { 331 // Due to the magic of the UTF8 encoding, searching for a sequence of 332 // letters is equivalent to substring search. 333 StringPiece searching(utf8_data(), utf8_length()); 334 StringPiece look_piece(look.utf8_data(), look.utf8_length()); 335 StringPiece::size_type found = 336 searching.find(look_piece, start_pos.utf8_data() - utf8_data()); 337 if (found == StringPiece::npos) return end(); 338 return const_iterator(utf8_data() + found); 339 } 340 341 bool UnicodeText::HasReplacementChar() const { 342 // Equivalent to: 343 // UnicodeText replacement_char; 344 // replacement_char.push_back(0xFFFD); 345 // return find(replacement_char) != end(); 346 StringPiece searching(utf8_data(), utf8_length()); 347 StringPiece looking_for("\xEF\xBF\xBD", 3); 348 return searching.find(looking_for) != StringPiece::npos; 349 } 350 351 // ----- other methods ----- 352 353 // Clear operator 354 void UnicodeText::clear() { 355 repr_.clear(); 356 } 357 358 // Destructor 359 UnicodeText::~UnicodeText() {} 360 361 362 void UnicodeText::push_back(char32 c) { 363 if (UniLib::IsValidCodepoint(c)) { 364 char buf[UTFmax]; 365 int len = runetochar(buf, &c); 366 if (UniLib::IsInterchangeValid(buf, len)) { 367 repr_.append(buf, len); 368 } else { 369 fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c); 370 repr_.append(" ", 1); 371 } 372 } else { 373 fprintf(stderr, "Illegal Unicode value: 0x%x\n", c); 374 repr_.append(" ", 1); 375 } 376 } 377 378 int UnicodeText::size() const { 379 return CodepointCount(repr_.data_, repr_.size_); 380 } 381 382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { 383 if (&lhs == &rhs) return true; 384 if (lhs.repr_.size_ != rhs.repr_.size_) return false; 385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; 386 } 387 388 string UnicodeText::DebugString() const { 389 stringstream ss; 390 391 ss << "{UnicodeText " << hex << this << dec << " chars=" 392 << size() << " repr=" << repr_.DebugString() << "}"; 393 #if 0 394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}", 395 this, 396 size(), 397 repr_.DebugString().c_str()); 398 #endif 399 string result; 400 ss >> result; 401 402 return result; 403 } 404 405 406 // ******************* UnicodeText::const_iterator ********************* 407 408 // The implementation of const_iterator would be nicer if it 409 // inherited from boost::iterator_facade 410 // (http://boost.org/libs/iterator/doc/iterator_facade.html). 411 412 UnicodeText::const_iterator::const_iterator() : it_(0) {} 413 414 UnicodeText::const_iterator::const_iterator(const const_iterator& other) 415 : it_(other.it_) { 416 } 417 418 UnicodeText::const_iterator& 419 UnicodeText::const_iterator::operator=(const const_iterator& other) { 420 if (&other != this) 421 it_ = other.it_; 422 return *this; 423 } 424 425 UnicodeText::const_iterator UnicodeText::begin() const { 426 return const_iterator(repr_.data_); 427 } 428 429 UnicodeText::const_iterator UnicodeText::end() const { 430 return const_iterator(repr_.data_ + repr_.size_); 431 } 432 433 bool operator<(const UnicodeText::const_iterator& lhs, 434 const UnicodeText::const_iterator& rhs) { 435 return lhs.it_ < rhs.it_; 436 } 437 438 char32 UnicodeText::const_iterator::operator*() const { 439 // (We could call chartorune here, but that does some 440 // error-checking, and we're guaranteed that our data is valid 441 // UTF-8. Also, we expect this routine to be called very often. So 442 // for speed, we do the calculation ourselves.) 443 444 // Convert from UTF-8 445 uint8 byte1 = static_cast<uint8>(it_[0]); 446 if (byte1 < 0x80) 447 return byte1; 448 449 uint8 byte2 = static_cast<uint8>(it_[1]); 450 if (byte1 < 0xE0) 451 return ((byte1 & 0x1F) << 6) 452 | (byte2 & 0x3F); 453 454 uint8 byte3 = static_cast<uint8>(it_[2]); 455 if (byte1 < 0xF0) 456 return ((byte1 & 0x0F) << 12) 457 | ((byte2 & 0x3F) << 6) 458 | (byte3 & 0x3F); 459 460 uint8 byte4 = static_cast<uint8>(it_[3]); 461 return ((byte1 & 0x07) << 18) 462 | ((byte2 & 0x3F) << 12) 463 | ((byte3 & 0x3F) << 6) 464 | (byte4 & 0x3F); 465 } 466 467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { 468 it_ += UniLib::OneCharLen(it_); 469 return *this; 470 } 471 472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { 473 while (UniLib::IsTrailByte(*--it_)) { } 474 return *this; 475 } 476 477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { 478 utf8_output[0] = it_[0]; 479 if (static_cast<unsigned char>(it_[0]) < 0x80) 480 return 1; 481 482 utf8_output[1] = it_[1]; 483 if (static_cast<unsigned char>(it_[0]) < 0xE0) 484 return 2; 485 486 utf8_output[2] = it_[2]; 487 if (static_cast<unsigned char>(it_[0]) < 0xF0) 488 return 3; 489 490 utf8_output[3] = it_[3]; 491 return 4; 492 } 493 494 495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { 496 assert(p != NULL); 497 const char* start = utf8_data(); 498 int len = utf8_length(); 499 const char* end = start + len; 500 assert(p >= start); 501 assert(p <= end); 502 assert(p == end || !UniLib::IsTrailByte(*p)); 503 return const_iterator(p); 504 } 505 506 string UnicodeText::const_iterator::DebugString() const { 507 stringstream ss; 508 509 ss << "{iter " << hex << it_ << "}"; 510 string result; 511 ss >> result; 512 513 return result; 514 } 515 516 } // namespace phonenumbers 517 } // namespace i18n 518