Home | History | Annotate | Download | only in utf
      1 // Copyright (C) 2006 Google Inc.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // Author: Jim Meehan
     16 
     17 #include <iostream>
     18 #include <sstream>
     19 #include <cassert>
     20 
     21 #include "phonenumbers/utf/unicodetext.h"
     22 #include "phonenumbers/utf/stringpiece.h"
     23 //#include "utf/stringprintf.h"
     24 #include "phonenumbers/utf/utf.h"
     25 #include "phonenumbers/utf/unilib.h"
     26 
     27 namespace i18n {
     28 namespace phonenumbers {
     29 
     30 using std::stringstream;
     31 using std::max;
     32 using std::hex;
     33 using std::dec;
     34 using std::cerr;
     35 using std::endl;
     36 
     37 static int CodepointDistance(const char* start, const char* end) {
     38   int n = 0;
     39   // Increment n on every non-trail-byte.
     40   for (const char* p = start; p < end; ++p) {
     41     n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
     42   }
     43   return n;
     44 }
     45 
     46 static int CodepointCount(const char* utf8, int len) {
     47   return CodepointDistance(utf8, utf8 + len);
     48 }
     49 
     50 UnicodeText::const_iterator::difference_type
     51 distance(const UnicodeText::const_iterator& first,
     52          const UnicodeText::const_iterator& last) {
     53   return CodepointDistance(first.it_, last.it_);
     54 }
     55 
     56 // ---------- Utility ----------
     57 
     58 static int ConvertToInterchangeValid(char* start, int len) {
     59   // This routine is called only when we've discovered that a UTF-8 buffer
     60   // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
     61   // was not interchange valid. This indicates a bug in the caller, and
     62   // a LOG(WARNING) is done in that case.
     63   // This is similar to CoerceToInterchangeValid, but it replaces each
     64   // structurally valid byte with a space, and each non-interchange
     65   // character with a space, even when that character requires more
     66   // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
     67   // structurally valid UTF8, but U+FDD0 is not an interchange-valid
     68   // code point. The result should contain one space, not three.
     69   //
     70   // Since the conversion never needs to write more data than it
     71   // reads, it is safe to change the buffer in place. It returns the
     72   // number of bytes written.
     73   char* const in = start;
     74   char* out = start;
     75   char* const end = start + len;
     76   while (start < end) {
     77     int good = UniLib::SpanInterchangeValid(start, end - start);
     78     if (good > 0) {
     79       if (out != start) {
     80         memmove(out, start, good);
     81       }
     82       out += good;
     83       start += good;
     84       if (start == end) {
     85         break;
     86       }
     87     }
     88     // Is the current string invalid UTF8 or just non-interchange UTF8?
     89     char32 rune;
     90     int n;
     91     if (isvalidcharntorune(start, end - start, &rune, &n)) {
     92       // structurally valid UTF8, but not interchange valid
     93       start += n;  // Skip over the whole character.
     94     } else {  // bad UTF8
     95       start += 1;  // Skip over just one byte
     96     }
     97     *out++ = ' ';
     98   }
     99   return out - in;
    100 }
    101 
    102 
    103 // *************** Data representation **********
    104 
    105 // Note: the copy constructor is undefined.
    106 
    107 // After reserve(), resize(), or clear(), we're an owner, not an alias.
    108 
    109 void UnicodeText::Repr::reserve(int new_capacity) {
    110   // If there's already enough capacity, and we're an owner, do nothing.
    111   if (capacity_ >= new_capacity && ours_) return;
    112 
    113   // Otherwise, allocate a new buffer.
    114   capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
    115   char* new_data = new char[capacity_];
    116 
    117   // If there is an old buffer, copy it into the new buffer.
    118   if (data_) {
    119     memcpy(new_data, data_, size_);
    120     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
    121   }
    122   data_ = new_data;
    123   ours_ = true;  // We own the new buffer.
    124   // size_ is unchanged.
    125 }
    126 
    127 void UnicodeText::Repr::resize(int new_size) {
    128   if (new_size == 0) {
    129     clear();
    130   } else {
    131     if (!ours_ || new_size > capacity_) reserve(new_size);
    132     // Clear the memory in the expanded part.
    133     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
    134     size_ = new_size;
    135     ours_ = true;
    136   }
    137 }
    138 
    139 // This implementation of clear() deallocates the buffer if we're an owner.
    140 // That's not strictly necessary; we could just set size_ to 0.
    141 void UnicodeText::Repr::clear() {
    142   if (ours_) delete[] data_;
    143   data_ = NULL;
    144   size_ = capacity_ = 0;
    145   ours_ = true;
    146 }
    147 
    148 void UnicodeText::Repr::Copy(const char* data, int size) {
    149   resize(size);
    150   memcpy(data_, data, size);
    151 }
    152 
    153 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
    154   if (data == data_) return;  // We already own this memory. (Weird case.)
    155   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
    156   data_ = data;
    157   size_ = size;
    158   capacity_ = capacity;
    159   ours_ = true;
    160 }
    161 
    162 void UnicodeText::Repr::PointTo(const char* data, int size) {
    163   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
    164   data_ = const_cast<char*>(data);
    165   size_ = size;
    166   capacity_ = size;
    167   ours_ = false;
    168 }
    169 
    170 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
    171   reserve(size_ + byte_length);
    172   memcpy(data_ + size_, bytes, byte_length);
    173   size_ += byte_length;
    174 }
    175 
    176 string UnicodeText::Repr::DebugString() const {
    177   stringstream ss;
    178 
    179   ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
    180      << size_ << " capacity=" << capacity_ << " "
    181      << (ours_ ? "Owned" : "Alias") << "}";
    182 
    183   string result;
    184   ss >> result;
    185 
    186   return result;
    187 }
    188 
    189 
    190 
    191 // *************** UnicodeText ******************
    192 
    193 // ----- Constructors -----
    194 
    195 // Default constructor
    196 UnicodeText::UnicodeText() {
    197 }
    198 
    199 // Copy constructor
    200 UnicodeText::UnicodeText(const UnicodeText& src) {
    201   Copy(src);
    202 }
    203 
    204 // Substring constructor
    205 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
    206                          const UnicodeText::const_iterator& last) {
    207   assert(first <= last && "Incompatible iterators");
    208   repr_.append(first.it_, last.it_ - first.it_);
    209 }
    210 
    211 string UnicodeText::UTF8Substring(const const_iterator& first,
    212                                   const const_iterator& last) {
    213   assert(first <= last && "Incompatible iterators");
    214   return string(first.it_, last.it_ - first.it_);
    215 }
    216 
    217 
    218 // ----- Copy -----
    219 
    220 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
    221   if (this != &src) {
    222     Copy(src);
    223   }
    224   return *this;
    225 }
    226 
    227 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
    228   repr_.Copy(src.repr_.data_, src.repr_.size_);
    229   return *this;
    230 }
    231 
    232 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
    233   repr_.Copy(buffer, byte_length);
    234   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
    235     cerr << "UTF-8 buffer is not interchange-valid." << endl;
    236     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    237   }
    238   return *this;
    239 }
    240 
    241 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
    242                                            int byte_length) {
    243   repr_.Copy(buffer, byte_length);
    244   return *this;
    245 }
    246 
    247 // ----- TakeOwnershipOf  -----
    248 
    249 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
    250                                               int byte_length,
    251                                               int byte_capacity) {
    252   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
    253   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
    254     cerr << "UTF-8 buffer is not interchange-valid." << endl;
    255     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    256   }
    257   return *this;
    258 }
    259 
    260 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
    261                                                     int byte_length,
    262                                                     int byte_capacity) {
    263   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
    264   return *this;
    265 }
    266 
    267 // ----- PointTo -----
    268 
    269 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
    270   if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
    271     repr_.PointTo(buffer, byte_length);
    272   } else {
    273     cerr << "UTF-8 buffer is not interchange-valid." << endl;
    274     repr_.Copy(buffer, byte_length);
    275     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    276   }
    277   return *this;
    278 }
    279 
    280 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
    281                                           int byte_length) {
    282   repr_.PointTo(buffer, byte_length);
    283   return *this;
    284 }
    285 
    286 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
    287   repr_.PointTo(src.repr_.data_, src.repr_.size_);
    288   return *this;
    289 }
    290 
    291 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
    292                                   const const_iterator &last) {
    293   assert(first <= last && " Incompatible iterators");
    294   repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
    295   return *this;
    296 }
    297 
    298 // ----- Append -----
    299 
    300 UnicodeText& UnicodeText::append(const UnicodeText& u) {
    301   repr_.append(u.repr_.data_, u.repr_.size_);
    302   return *this;
    303 }
    304 
    305 UnicodeText& UnicodeText::append(const const_iterator& first,
    306                                  const const_iterator& last) {
    307   assert(first <= last && "Incompatible iterators");
    308   repr_.append(first.it_, last.it_ - first.it_);
    309   return *this;
    310 }
    311 
    312 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
    313   repr_.append(utf8, len);
    314   return *this;
    315 }
    316 
    317 // ----- substring searching -----
    318 
    319 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
    320                                               const_iterator start_pos) const {
    321   assert(start_pos.utf8_data() >= utf8_data());
    322   assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
    323   return UnsafeFind(look, start_pos);
    324 }
    325 
    326 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
    327   return UnsafeFind(look, begin());
    328 }
    329 
    330 UnicodeText::const_iterator UnicodeText::UnsafeFind(
    331     const UnicodeText& look, const_iterator start_pos) const {
    332   // Due to the magic of the UTF8 encoding, searching for a sequence of
    333   // letters is equivalent to substring search.
    334   StringPiece searching(utf8_data(), utf8_length());
    335   StringPiece look_piece(look.utf8_data(), look.utf8_length());
    336   StringPiece::size_type found =
    337       searching.find(look_piece, start_pos.utf8_data() - utf8_data());
    338   if (found == StringPiece::npos) return end();
    339   return const_iterator(utf8_data() + found);
    340 }
    341 
    342 bool UnicodeText::HasReplacementChar() const {
    343   // Equivalent to:
    344   //   UnicodeText replacement_char;
    345   //   replacement_char.push_back(0xFFFD);
    346   //   return find(replacement_char) != end();
    347   StringPiece searching(utf8_data(), utf8_length());
    348   StringPiece looking_for("\xEF\xBF\xBD", 3);
    349   return searching.find(looking_for) != StringPiece::npos;
    350 }
    351 
    352 // ----- other methods -----
    353 
    354 // Clear operator
    355 void UnicodeText::clear() {
    356   repr_.clear();
    357 }
    358 
    359 // Destructor
    360 UnicodeText::~UnicodeText() {}
    361 
    362 
    363 void UnicodeText::push_back(char32 c) {
    364   if (UniLib::IsValidCodepoint(c)) {
    365     char buf[UTFmax];
    366     int len = runetochar(buf, &c);
    367     if (UniLib::IsInterchangeValid(buf, len)) {
    368       repr_.append(buf, len);
    369     } else {
    370       cerr << "Unicode value 0x" << hex << c
    371            << " is not valid for interchange" << endl;
    372       repr_.append(" ", 1);
    373     }
    374   } else {
    375     cerr << "Illegal Unicode value: 0x" << hex << c << endl;
    376     repr_.append(" ", 1);
    377   }
    378 }
    379 
    380 int UnicodeText::size() const {
    381   return CodepointCount(repr_.data_, repr_.size_);
    382 }
    383 
    384 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
    385   if (&lhs == &rhs) return true;
    386   if (lhs.repr_.size_ != rhs.repr_.size_) return false;
    387   return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
    388 }
    389 
    390 string UnicodeText::DebugString() const {
    391   stringstream ss;
    392 
    393   ss << "{UnicodeText " << hex << this << dec << " chars="
    394      << size() << " repr=" << repr_.DebugString() << "}";
    395 #if 0
    396   return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
    397                       this,
    398                       size(),
    399                       repr_.DebugString().c_str());
    400 #endif
    401   string result;
    402   ss >> result;
    403 
    404   return result;
    405 }
    406 
    407 
    408 // ******************* UnicodeText::const_iterator *********************
    409 
    410 // The implementation of const_iterator would be nicer if it
    411 // inherited from boost::iterator_facade
    412 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
    413 
    414 UnicodeText::const_iterator::const_iterator() : it_(0) {}
    415 
    416 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
    417     : it_(other.it_) {
    418 }
    419 
    420 UnicodeText::const_iterator&
    421 UnicodeText::const_iterator::operator=(const const_iterator& other) {
    422   if (&other != this)
    423     it_ = other.it_;
    424   return *this;
    425 }
    426 
    427 UnicodeText::const_iterator UnicodeText::begin() const {
    428   return const_iterator(repr_.data_);
    429 }
    430 
    431 UnicodeText::const_iterator UnicodeText::end() const {
    432   return const_iterator(repr_.data_ + repr_.size_);
    433 }
    434 
    435 bool operator<(const UnicodeText::const_iterator& lhs,
    436                const UnicodeText::const_iterator& rhs) {
    437   return lhs.it_ < rhs.it_;
    438 }
    439 
    440 char32 UnicodeText::const_iterator::operator*() const {
    441   // (We could call chartorune here, but that does some
    442   // error-checking, and we're guaranteed that our data is valid
    443   // UTF-8. Also, we expect this routine to be called very often. So
    444   // for speed, we do the calculation ourselves.)
    445 
    446   // Convert from UTF-8
    447   uint8 byte1 = static_cast<uint8>(it_[0]);
    448   if (byte1 < 0x80)
    449     return byte1;
    450 
    451   uint8 byte2 = static_cast<uint8>(it_[1]);
    452   if (byte1 < 0xE0)
    453     return ((byte1 & 0x1F) << 6)
    454           | (byte2 & 0x3F);
    455 
    456   uint8 byte3 = static_cast<uint8>(it_[2]);
    457   if (byte1 < 0xF0)
    458     return ((byte1 & 0x0F) << 12)
    459          | ((byte2 & 0x3F) << 6)
    460          |  (byte3 & 0x3F);
    461 
    462   uint8 byte4 = static_cast<uint8>(it_[3]);
    463   return ((byte1 & 0x07) << 18)
    464        | ((byte2 & 0x3F) << 12)
    465        | ((byte3 & 0x3F) << 6)
    466        |  (byte4 & 0x3F);
    467 }
    468 
    469 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
    470   it_ += UniLib::OneCharLen(it_);
    471   return *this;
    472 }
    473 
    474 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
    475   while (UniLib::IsTrailByte(*--it_)) { }
    476   return *this;
    477 }
    478 
    479 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
    480   utf8_output[0] = it_[0];
    481   if (static_cast<unsigned char>(it_[0]) < 0x80)
    482     return 1;
    483 
    484   utf8_output[1] = it_[1];
    485   if (static_cast<unsigned char>(it_[0]) < 0xE0)
    486     return 2;
    487 
    488   utf8_output[2] = it_[2];
    489   if (static_cast<unsigned char>(it_[0]) < 0xF0)
    490     return 3;
    491 
    492   utf8_output[3] = it_[3];
    493   return 4;
    494 }
    495 
    496 
    497 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
    498   assert(p != NULL);
    499   const char* start = utf8_data();
    500   int len = utf8_length();
    501   const char* end = start + len;
    502   assert(p >= start);
    503   assert(p <= end);
    504   assert(p == end || !UniLib::IsTrailByte(*p));
    505   return const_iterator(p);
    506 }
    507 
    508 string UnicodeText::const_iterator::DebugString() const {
    509   stringstream ss;
    510 
    511   ss << "{iter " << hex << it_ << "}";
    512   string result;
    513   ss >> result;
    514 
    515   return result;
    516 }
    517 
    518 }  // namespace phonenumbers
    519 }  // namespace i18n
    520