Home | History | Annotate | Download | only in utf
      1 // Copyright (C) 2006 Google Inc.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // Author: Jim Meehan
     16 
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <functional>
#include <sstream>

#include "phonenumbers/utf/unicodetext.h"
#include "phonenumbers/utf/stringpiece.h"
//#include "utf/stringprintf.h"
#include "phonenumbers/utf/utf.h"
#include "phonenumbers/utf/unilib.h"
     27 
     28 namespace i18n {
     29 namespace phonenumbers {
     30 
     31 using std::stringstream;
     32 using std::max;
     33 using std::hex;
     34 using std::dec;
     35 
     36 static int CodepointDistance(const char* start, const char* end) {
     37   int n = 0;
     38   // Increment n on every non-trail-byte.
     39   for (const char* p = start; p < end; ++p) {
     40     n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
     41   }
     42   return n;
     43 }
     44 
     45 static int CodepointCount(const char* utf8, int len) {
     46   return CodepointDistance(utf8, utf8 + len);
     47 }
     48 
     49 UnicodeText::const_iterator::difference_type
     50 distance(const UnicodeText::const_iterator& first,
     51          const UnicodeText::const_iterator& last) {
     52   return CodepointDistance(first.it_, last.it_);
     53 }
     54 
     55 // ---------- Utility ----------
     56 
     57 static int ConvertToInterchangeValid(char* start, int len) {
     58   // This routine is called only when we've discovered that a UTF-8 buffer
     59   // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
     60   // was not interchange valid. This indicates a bug in the caller, and
     61   // a LOG(WARNING) is done in that case.
     62   // This is similar to CoerceToInterchangeValid, but it replaces each
     63   // structurally valid byte with a space, and each non-interchange
     64   // character with a space, even when that character requires more
     65   // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
     66   // structurally valid UTF8, but U+FDD0 is not an interchange-valid
     67   // code point. The result should contain one space, not three.
     68   //
     69   // Since the conversion never needs to write more data than it
     70   // reads, it is safe to change the buffer in place. It returns the
     71   // number of bytes written.
     72   char* const in = start;
     73   char* out = start;
     74   char* const end = start + len;
     75   while (start < end) {
     76     int good = UniLib::SpanInterchangeValid(start, end - start);
     77     if (good > 0) {
     78       if (out != start) {
     79         memmove(out, start, good);
     80       }
     81       out += good;
     82       start += good;
     83       if (start == end) {
     84         break;
     85       }
     86     }
     87     // Is the current string invalid UTF8 or just non-interchange UTF8?
     88     char32 rune;
     89     int n;
     90     if (isvalidcharntorune(start, end - start, &rune, &n)) {
     91       // structurally valid UTF8, but not interchange valid
     92       start += n;  // Skip over the whole character.
     93     } else {  // bad UTF8
     94       start += 1;  // Skip over just one byte
     95     }
     96     *out++ = ' ';
     97   }
     98   return out - in;
     99 }
    100 
    101 
    102 // *************** Data representation **********
    103 
    104 // Note: the copy constructor is undefined.
    105 
    106 // After reserve(), resize(), or clear(), we're an owner, not an alias.
    107 
    108 void UnicodeText::Repr::reserve(int new_capacity) {
    109   // If there's already enough capacity, and we're an owner, do nothing.
    110   if (capacity_ >= new_capacity && ours_) return;
    111 
    112   // Otherwise, allocate a new buffer.
    113   capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
    114   char* new_data = new char[capacity_];
    115 
    116   // If there is an old buffer, copy it into the new buffer.
    117   if (data_) {
    118     memcpy(new_data, data_, size_);
    119     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
    120   }
    121   data_ = new_data;
    122   ours_ = true;  // We own the new buffer.
    123   // size_ is unchanged.
    124 }
    125 
    126 void UnicodeText::Repr::resize(int new_size) {
    127   if (new_size == 0) {
    128     clear();
    129   } else {
    130     if (!ours_ || new_size > capacity_) reserve(new_size);
    131     // Clear the memory in the expanded part.
    132     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
    133     size_ = new_size;
    134     ours_ = true;
    135   }
    136 }
    137 
    138 // This implementation of clear() deallocates the buffer if we're an owner.
    139 // That's not strictly necessary; we could just set size_ to 0.
    140 void UnicodeText::Repr::clear() {
    141   if (ours_) delete[] data_;
    142   data_ = NULL;
    143   size_ = capacity_ = 0;
    144   ours_ = true;
    145 }
    146 
    147 void UnicodeText::Repr::Copy(const char* data, int size) {
    148   resize(size);
    149   memcpy(data_, data, size);
    150 }
    151 
    152 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
    153   if (data == data_) return;  // We already own this memory. (Weird case.)
    154   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
    155   data_ = data;
    156   size_ = size;
    157   capacity_ = capacity;
    158   ours_ = true;
    159 }
    160 
    161 void UnicodeText::Repr::PointTo(const char* data, int size) {
    162   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
    163   data_ = const_cast<char*>(data);
    164   size_ = size;
    165   capacity_ = size;
    166   ours_ = false;
    167 }
    168 
    169 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
    170   reserve(size_ + byte_length);
    171   memcpy(data_ + size_, bytes, byte_length);
    172   size_ += byte_length;
    173 }
    174 
    175 string UnicodeText::Repr::DebugString() const {
    176   stringstream ss;
    177 
    178   ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
    179      << size_ << " capacity=" << capacity_ << " "
    180      << (ours_ ? "Owned" : "Alias") << "}";
    181 
    182   string result;
    183   ss >> result;
    184 
    185   return result;
    186 }
    187 
    188 
    189 
    190 // *************** UnicodeText ******************
    191 
    192 // ----- Constructors -----
    193 
    194 // Default constructor
    195 UnicodeText::UnicodeText() {
    196 }
    197 
    198 // Copy constructor
    199 UnicodeText::UnicodeText(const UnicodeText& src) {
    200   Copy(src);
    201 }
    202 
    203 // Substring constructor
    204 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
    205                          const UnicodeText::const_iterator& last) {
    206   assert(first <= last && "Incompatible iterators");
    207   repr_.append(first.it_, last.it_ - first.it_);
    208 }
    209 
    210 string UnicodeText::UTF8Substring(const const_iterator& first,
    211                                   const const_iterator& last) {
    212   assert(first <= last && "Incompatible iterators");
    213   return string(first.it_, last.it_ - first.it_);
    214 }
    215 
    216 
    217 // ----- Copy -----
    218 
    219 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
    220   if (this != &src) {
    221     Copy(src);
    222   }
    223   return *this;
    224 }
    225 
    226 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
    227   repr_.Copy(src.repr_.data_, src.repr_.size_);
    228   return *this;
    229 }
    230 
    231 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
    232   repr_.Copy(buffer, byte_length);
    233   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
    234     fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
    235     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    236   }
    237   return *this;
    238 }
    239 
    240 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
    241                                            int byte_length) {
    242   repr_.Copy(buffer, byte_length);
    243   return *this;
    244 }
    245 
    246 // ----- TakeOwnershipOf  -----
    247 
    248 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
    249                                               int byte_length,
    250                                               int byte_capacity) {
    251   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
    252   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
    253     fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
    254     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    255   }
    256   return *this;
    257 }
    258 
    259 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
    260                                                     int byte_length,
    261                                                     int byte_capacity) {
    262   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
    263   return *this;
    264 }
    265 
    266 // ----- PointTo -----
    267 
    268 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
    269   if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
    270     repr_.PointTo(buffer, byte_length);
    271   } else {
    272     fprintf(stderr, "UTF-8 buffer is not interchange-valid.");
    273     repr_.Copy(buffer, byte_length);
    274     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    275   }
    276   return *this;
    277 }
    278 
    279 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
    280                                           int byte_length) {
    281   repr_.PointTo(buffer, byte_length);
    282   return *this;
    283 }
    284 
    285 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
    286   repr_.PointTo(src.repr_.data_, src.repr_.size_);
    287   return *this;
    288 }
    289 
    290 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
    291                                   const const_iterator &last) {
    292   assert(first <= last && " Incompatible iterators");
    293   repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
    294   return *this;
    295 }
    296 
    297 // ----- Append -----
    298 
    299 UnicodeText& UnicodeText::append(const UnicodeText& u) {
    300   repr_.append(u.repr_.data_, u.repr_.size_);
    301   return *this;
    302 }
    303 
    304 UnicodeText& UnicodeText::append(const const_iterator& first,
    305                                  const const_iterator& last) {
    306   assert(first <= last && "Incompatible iterators");
    307   repr_.append(first.it_, last.it_ - first.it_);
    308   return *this;
    309 }
    310 
    311 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
    312   repr_.append(utf8, len);
    313   return *this;
    314 }
    315 
    316 // ----- substring searching -----
    317 
    318 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
    319                                               const_iterator start_pos) const {
    320   assert(start_pos.utf8_data() >= utf8_data());
    321   assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
    322   return UnsafeFind(look, start_pos);
    323 }
    324 
    325 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
    326   return UnsafeFind(look, begin());
    327 }
    328 
    329 UnicodeText::const_iterator UnicodeText::UnsafeFind(
    330     const UnicodeText& look, const_iterator start_pos) const {
    331   // Due to the magic of the UTF8 encoding, searching for a sequence of
    332   // letters is equivalent to substring search.
    333   StringPiece searching(utf8_data(), utf8_length());
    334   StringPiece look_piece(look.utf8_data(), look.utf8_length());
    335   StringPiece::size_type found =
    336       searching.find(look_piece, start_pos.utf8_data() - utf8_data());
    337   if (found == StringPiece::npos) return end();
    338   return const_iterator(utf8_data() + found);
    339 }
    340 
    341 bool UnicodeText::HasReplacementChar() const {
    342   // Equivalent to:
    343   //   UnicodeText replacement_char;
    344   //   replacement_char.push_back(0xFFFD);
    345   //   return find(replacement_char) != end();
    346   StringPiece searching(utf8_data(), utf8_length());
    347   StringPiece looking_for("\xEF\xBF\xBD", 3);
    348   return searching.find(looking_for) != StringPiece::npos;
    349 }
    350 
    351 // ----- other methods -----
    352 
    353 // Clear operator
    354 void UnicodeText::clear() {
    355   repr_.clear();
    356 }
    357 
    358 // Destructor
    359 UnicodeText::~UnicodeText() {}
    360 
    361 
    362 void UnicodeText::push_back(char32 c) {
    363   if (UniLib::IsValidCodepoint(c)) {
    364     char buf[UTFmax];
    365     int len = runetochar(buf, &c);
    366     if (UniLib::IsInterchangeValid(buf, len)) {
    367       repr_.append(buf, len);
    368     } else {
    369       fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c);
    370       repr_.append(" ", 1);
    371     }
    372   } else {
    373     fprintf(stderr, "Illegal Unicode value: 0x%x\n", c);
    374     repr_.append(" ", 1);
    375   }
    376 }
    377 
    378 int UnicodeText::size() const {
    379   return CodepointCount(repr_.data_, repr_.size_);
    380 }
    381 
    382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
    383   if (&lhs == &rhs) return true;
    384   if (lhs.repr_.size_ != rhs.repr_.size_) return false;
    385   return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
    386 }
    387 
    388 string UnicodeText::DebugString() const {
    389   stringstream ss;
    390 
    391   ss << "{UnicodeText " << hex << this << dec << " chars="
    392      << size() << " repr=" << repr_.DebugString() << "}";
    393 #if 0
    394   return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
    395                       this,
    396                       size(),
    397                       repr_.DebugString().c_str());
    398 #endif
    399   string result;
    400   ss >> result;
    401 
    402   return result;
    403 }
    404 
    405 
    406 // ******************* UnicodeText::const_iterator *********************
    407 
    408 // The implementation of const_iterator would be nicer if it
    409 // inherited from boost::iterator_facade
    410 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
    411 
    412 UnicodeText::const_iterator::const_iterator() : it_(0) {}
    413 
    414 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
    415     : it_(other.it_) {
    416 }
    417 
    418 UnicodeText::const_iterator&
    419 UnicodeText::const_iterator::operator=(const const_iterator& other) {
    420   if (&other != this)
    421     it_ = other.it_;
    422   return *this;
    423 }
    424 
    425 UnicodeText::const_iterator UnicodeText::begin() const {
    426   return const_iterator(repr_.data_);
    427 }
    428 
    429 UnicodeText::const_iterator UnicodeText::end() const {
    430   return const_iterator(repr_.data_ + repr_.size_);
    431 }
    432 
    433 bool operator<(const UnicodeText::const_iterator& lhs,
    434                const UnicodeText::const_iterator& rhs) {
    435   return lhs.it_ < rhs.it_;
    436 }
    437 
    438 char32 UnicodeText::const_iterator::operator*() const {
    439   // (We could call chartorune here, but that does some
    440   // error-checking, and we're guaranteed that our data is valid
    441   // UTF-8. Also, we expect this routine to be called very often. So
    442   // for speed, we do the calculation ourselves.)
    443 
    444   // Convert from UTF-8
    445   uint8 byte1 = static_cast<uint8>(it_[0]);
    446   if (byte1 < 0x80)
    447     return byte1;
    448 
    449   uint8 byte2 = static_cast<uint8>(it_[1]);
    450   if (byte1 < 0xE0)
    451     return ((byte1 & 0x1F) << 6)
    452           | (byte2 & 0x3F);
    453 
    454   uint8 byte3 = static_cast<uint8>(it_[2]);
    455   if (byte1 < 0xF0)
    456     return ((byte1 & 0x0F) << 12)
    457          | ((byte2 & 0x3F) << 6)
    458          |  (byte3 & 0x3F);
    459 
    460   uint8 byte4 = static_cast<uint8>(it_[3]);
    461   return ((byte1 & 0x07) << 18)
    462        | ((byte2 & 0x3F) << 12)
    463        | ((byte3 & 0x3F) << 6)
    464        |  (byte4 & 0x3F);
    465 }
    466 
    467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
    468   it_ += UniLib::OneCharLen(it_);
    469   return *this;
    470 }
    471 
    472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
    473   while (UniLib::IsTrailByte(*--it_)) { }
    474   return *this;
    475 }
    476 
    477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
    478   utf8_output[0] = it_[0];
    479   if (static_cast<unsigned char>(it_[0]) < 0x80)
    480     return 1;
    481 
    482   utf8_output[1] = it_[1];
    483   if (static_cast<unsigned char>(it_[0]) < 0xE0)
    484     return 2;
    485 
    486   utf8_output[2] = it_[2];
    487   if (static_cast<unsigned char>(it_[0]) < 0xF0)
    488     return 3;
    489 
    490   utf8_output[3] = it_[3];
    491   return 4;
    492 }
    493 
    494 
    495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
    496   assert(p != NULL);
    497   const char* start = utf8_data();
    498   int len = utf8_length();
    499   const char* end = start + len;
    500   assert(p >= start);
    501   assert(p <= end);
    502   assert(p == end || !UniLib::IsTrailByte(*p));
    503   return const_iterator(p);
    504 }
    505 
    506 string UnicodeText::const_iterator::DebugString() const {
    507   stringstream ss;
    508 
    509   ss << "{iter " << hex << it_ << "}";
    510   string result;
    511   ss >> result;
    512 
    513   return result;
    514 }
    515 
    516 }  // namespace phonenumbers
    517 }  // namespace i18n
    518