Home | History | Annotate | Download | only in utf
      1 // Copyright (C) 2006 Google Inc.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // Author: Jim Meehan
     16 
     17 #ifndef UTIL_UTF8_UNICODETEXT_H__
     18 #define UTIL_UTF8_UNICODETEXT_H__
     19 
     20 #include <iterator>
     21 #include <string>
     22 #include <utility>
     23 #include "phonenumbers/base/basictypes.h"
     24 
     25 namespace i18n {
     26 namespace phonenumbers {
     27 
     28 using std::string;
     29 using std::bidirectional_iterator_tag;
     30 using std::pair;
     31 
     32 // ***************************** UnicodeText **************************
     33 //
// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has a default constructor, a copy constructor,
// and a copy-assignment operator. Data can be appended to it from
// another UnicodeText, from iterators, or from a single codepoint.
     38 //
     39 // The internal representation of the text is UTF-8. Since UTF-8 is a
     40 // variable-width format, UnicodeText does not provide random access
     41 // to the text, and changes to the text are permitted only at the end.
     42 //
     43 // The UnicodeText class defines a const_iterator. The dereferencing
     44 // operator (*) returns a codepoint (char32). The iterator is a
     45 // bidirectional, read-only iterator. It becomes invalid if the text
     46 // is changed.
     47 //
// There are methods for appending and retrieving UTF-8 data directly.
// The 'utf8_data' method returns a const char* that contains the
// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores
// up to 4 bytes of UTF-8 data in a char array and returns the number
// of bytes that it stored.
     54 //
     55 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
     56 // 0x10FFFF], but UnicodeText has the additional restriction that it
     57 // can contain only those characters that are valid for interchange on
     58 // the Web. This excludes all of the control codes except for carriage
     59 // return, line feed, and horizontal tab.  It also excludes
     60 // non-characters, but codepoints that are in the Private Use regions
     61 // are allowed, as are codepoints that are unassigned. (See the
     62 // Unicode reference for details.) The function UniLib::IsInterchangeValid
     63 // can be used as a test for this property.
     64 //
     65 // UnicodeTexts are safe. Every method that constructs or modifies a
     66 // UnicodeText tests for interchange-validity, and will substitute a
     67 // space for the invalid data. Such cases are reported via
     68 // LOG(WARNING).
     69 //
     70 // MEMORY MANAGEMENT: copy, take ownership, or point to
     71 //
     72 // A UnicodeText is either an "owner", meaning that it owns the memory
     73 // for the data buffer and will free it when the UnicodeText is
     74 // destroyed, or it is an "alias", meaning that it does not.
     75 //
     76 // There are three methods for storing UTF-8 data in a UnicodeText:
     77 //
     78 // CopyUTF8(buffer, len) copies buffer.
     79 //
     80 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
     81 //
     82 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
     83 //
     84 // All three methods perform a validity check on the buffer. There are
     85 // private, "unsafe" versions of these functions that bypass the
     86 // validity check. They are used internally and by friend-functions
     87 // that are handling UTF-8 data that has already been validated.
     88 //
     89 // The purpose of an alias is to avoid making an unnecessary copy of a
     90 // UTF-8 buffer while still providing access to the Unicode values
     91 // within that text through iterators or the fast scanners that are
     92 // based on UTF-8 state tables. The lifetime of an alias must not
     93 // exceed the lifetime of the buffer from which it was constructed.
     94 //
     95 // The semantics of an alias might be described as "copy on write or
     96 // repair." The source data is never modified. If push_back() or
     97 // append() is called on an alias, a copy of the data will be created,
     98 // and the UnicodeText will become an owner. If clear() is called on
     99 // an alias, it becomes an (empty) owner.
    100 //
    101 // The copy constructor and the assignment operator produce an owner.
    102 // That is, after direct initialization ("UnicodeText x(y);") or copy
    103 // initialization ("UnicodeText x = y;") x will be an owner, even if y
    104 // was an alias. The assignment operator ("x = y;") also produces an
    105 // owner unless x and y are the same object and y is an alias.
    106 //
    107 // Aliases should be used with care. If the source from which an alias
    108 // was created is freed, or if the contents are changed, while the
    109 // alias is still in use, fatal errors could result. But it can be
    110 // quite useful to have a UnicodeText "window" through which to see a
    111 // UTF-8 buffer without having to pay the price of making a copy.
    112 //
    113 // UTILITIES
    114 //
    115 // The interfaces in util/utf8/public/textutils.h provide higher-level
    116 // utilities for dealing with UnicodeTexts, including routines for
    117 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
    118 // strings, creating strings from UnicodeTexts, normalizing text for
    119 // efficient matching or display, and others.
    120 
    121 class UnicodeText {
    122  public:
    123   class const_iterator;
    124 
    125   typedef char32 value_type;
    126 
    127   // Constructors. These always produce owners.
    128   UnicodeText();  // Create an empty text.
    129   UnicodeText(const UnicodeText& src);  // copy constructor
    130   // Construct a substring (copies the data).
    131   UnicodeText(const const_iterator& first, const const_iterator& last);
    132 
    133   // Assignment operator. This copies the data and produces an owner
    134   // unless this == &src, e.g., "x = x;", which is a no-op.
    135   UnicodeText& operator=(const UnicodeText& src);
    136 
    137   // x.Copy(y) copies the data from y into x.
    138   UnicodeText& Copy(const UnicodeText& src);
    139   inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
    140 
    141   // x.PointTo(y) changes x so that it points to y's data.
    142   // It does not copy y or take ownership of y's data.
    143   UnicodeText& PointTo(const UnicodeText& src);
    144   UnicodeText& PointTo(const const_iterator& first,
    145                        const const_iterator& last);
    146 
    147   ~UnicodeText();
    148 
    149   void clear();  // Clear text.
    150   bool empty() { return repr_.size_ == 0; }  // Test if text is empty.
    151 
    152   // Add a codepoint to the end of the text.
    153   // If the codepoint is not interchange-valid, add a space instead
    154   // and log a warning.
    155   void push_back(char32 codepoint);
    156 
    157   // Generic appending operation.
    158   // iterator_traits<ForwardIterator>::value_type must be implicitly
    159   // convertible to char32. Typical uses of this method might include:
    160   //     char32 chars[] = {0x1, 0x2, ...};
    161   //     vector<char32> more_chars = ...;
    162   //     utext.append(chars, chars+arraysize(chars));
    163   //     utext.append(more_chars.begin(), more_chars.end());
    164   template<typename ForwardIterator>
    165   UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
    166     while (first != last) { push_back(*first++); }
    167     return *this;
    168   }
    169 
    170   // A specialization of the generic append() method.
    171   UnicodeText& append(const const_iterator& first, const const_iterator& last);
    172 
    173   // An optimization of append(source.begin(), source.end()).
    174   UnicodeText& append(const UnicodeText& source);
    175 
    176   int size() const;  // the number of Unicode characters (codepoints)
    177 
    178   friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
    179   friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
    180 
    181   class const_iterator {
    182     typedef const_iterator CI;
    183    public:
    184     typedef bidirectional_iterator_tag iterator_category;
    185     typedef char32 value_type;
    186     typedef ptrdiff_t difference_type;
    187     typedef void pointer;  // (Not needed.)
    188     typedef const char32 reference;  // (Needed for const_reverse_iterator)
    189 
    190     // Iterators are default-constructible.
    191     const_iterator();
    192 
    193     // It's safe to make multiple passes over a UnicodeText.
    194     const_iterator(const const_iterator& other);
    195     const_iterator& operator=(const const_iterator& other);
    196 
    197     char32 operator*() const;  // Dereference
    198 
    199     const_iterator& operator++();  // Advance (++iter)
    200     const_iterator operator++(int) {  // (iter++)
    201       const_iterator result(*this);
    202       ++*this;
    203       return result;
    204     }
    205 
    206     const_iterator& operator--();  // Retreat (--iter)
    207     const_iterator operator--(int) {  // (iter--)
    208       const_iterator result(*this);
    209       --*this;
    210       return result;
    211     }
    212 
    213     // We love relational operators.
    214     friend bool operator==(const CI& lhs, const CI& rhs) {
    215       return lhs.it_ == rhs.it_; }
    216     friend bool operator!=(const CI& lhs, const CI& rhs) {
    217       return !(lhs == rhs); }
    218     friend bool operator<(const CI& lhs, const CI& rhs);
    219     friend bool operator>(const CI& lhs, const CI& rhs) {
    220       return rhs < lhs; }
    221     friend bool operator<=(const CI& lhs, const CI& rhs) {
    222       return !(rhs < lhs); }
    223     friend bool operator>=(const CI& lhs, const CI& rhs) {
    224       return !(lhs < rhs); }
    225 
    226     friend difference_type distance(const CI& first, const CI& last);
    227 
    228     // UTF-8-specific methods
    229     // Store the UTF-8 encoding of the current codepoint into buf,
    230     // which must be at least 4 bytes long. Return the number of
    231     // bytes written.
    232     int get_utf8(char* buf) const;
    233     // Return the iterator's pointer into the UTF-8 data.
    234     const char* utf8_data() const { return it_; }
    235 
    236     string DebugString() const;
    237 
    238    private:
    239     friend class UnicodeText;
    240     friend class UnicodeTextUtils;
    241     friend class UTF8StateTableProperty;
    242     explicit const_iterator(const char* it) : it_(it) {}
    243 
    244     const char* it_;
    245   };
    246 
    247   const_iterator begin() const;
    248   const_iterator end() const;
    249 
    250   class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
    251    public:
    252     const_reverse_iterator(const_iterator it) :
    253         std::reverse_iterator<const_iterator>(it) {}
    254     const char* utf8_data() const {
    255       const_iterator tmp_it = base();
    256       return (--tmp_it).utf8_data();
    257     }
    258     int get_utf8(char* buf) const {
    259       const_iterator tmp_it = base();
    260       return (--tmp_it).get_utf8(buf);
    261     }
    262   };
    263   const_reverse_iterator rbegin() const {
    264     return const_reverse_iterator(end());
    265   }
    266   const_reverse_iterator rend() const {
    267     return const_reverse_iterator(begin());
    268   }
    269 
    270   // Substring searching.  Returns the beginning of the first
    271   // occurrence of "look", or end() if not found.
    272   const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
    273   // Equivalent to find(look, begin())
    274   const_iterator find(const UnicodeText& look) const;
    275 
    276   // Returns whether this contains the character U+FFFD.  This can
    277   // occur, for example, if the input to Encodings::Decode() had byte
    278   // sequences that were invalid in the source encoding.
    279   bool HasReplacementChar() const;
    280 
    281   // UTF-8-specific methods
    282   //
    283   // Return the data, length, and capacity of UTF-8-encoded version of
    284   // the text. Length and capacity are measured in bytes.
    285   const char* utf8_data() const { return repr_.data_; }
    286   int utf8_length() const { return repr_.size_; }
    287   int utf8_capacity() const { return repr_.capacity_; }
    288 
    289   // Return the UTF-8 data as a string.
    290   static string UTF8Substring(const const_iterator& first,
    291                               const const_iterator& last);
    292 
    293   // There are three methods for initializing a UnicodeText from UTF-8
    294   // data. They vary in details of memory management. In all cases,
    295   // the data is tested for interchange-validity. If it is not
    296   // interchange-valid, a LOG(WARNING) is issued, and each
    297   // structurally invalid byte and each interchange-invalid codepoint
    298   // is replaced with a space.
    299 
    300   // x.CopyUTF8(buf, len) copies buf into x.
    301   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
    302 
    303   // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
    304   // buf. buf is not copied.
    305   UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
    306                                    int byte_length,
    307                                    int byte_capacity);
    308 
    309   // x.PointToUTF8(buf,len) changes x so that it points to buf
    310   // ("becomes an alias"). It does not take ownership or copy buf.
    311   // If the buffer is not valid, this has the same effect as
    312   // CopyUTF8(utf8_buffer, byte_length).
    313   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
    314 
    315   // Occasionally it is necessary to use functions that operate on the
    316   // pointer returned by utf8_data(). MakeIterator(p) provides a way
    317   // to get back to the UnicodeText level. It uses CHECK to ensure
    318   // that p is a pointer within this object's UTF-8 data, and that it
    319   // points to the beginning of a character.
    320   const_iterator MakeIterator(const char* p) const;
    321 
    322   string DebugString() const;
    323 
    324  private:
    325   friend class const_iterator;
    326   friend class UnicodeTextUtils;
    327 
    328   class Repr {  // A byte-string.
    329    public:
    330     char* data_;
    331     int size_;
    332     int capacity_;
    333     bool ours_;  // Do we own data_?
    334 
    335     Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
    336     ~Repr() { if (ours_) delete[] data_; }
    337 
    338     void clear();
    339     void reserve(int capacity);
    340     void resize(int size);
    341 
    342     void append(const char* bytes, int byte_length);
    343     void Copy(const char* data, int size);
    344     void TakeOwnershipOf(char* data, int size, int capacity);
    345     void PointTo(const char* data, int size);
    346 
    347     string DebugString() const;
    348 
    349    private:
    350     Repr& operator=(const Repr&);
    351     Repr(const Repr& other);
    352   };
    353 
    354   Repr repr_;
    355 
    356   // UTF-8-specific private methods.
    357   // These routines do not perform a validity check when compiled
    358   // in opt mode.
    359   // It is an error to call these methods with UTF-8 data that
    360   // is not interchange-valid.
    361   //
    362   UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
    363   UnicodeText& UnsafeTakeOwnershipOfUTF8(
    364       char* utf8_buffer, int byte_length, int byte_capacity);
    365   UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
    366   UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
    367   const_iterator UnsafeFind(const UnicodeText& look,
    368                             const_iterator start_pos) const;
    369 };
    370 
    371 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
    372 
    373 inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
    374   return !(lhs == rhs);
    375 }
    376 
    377 // UnicodeTextRange is a pair of iterators, useful for specifying text
    378 // segments. If the iterators are ==, the segment is empty.
    379 typedef pair<UnicodeText::const_iterator,
    380              UnicodeText::const_iterator> UnicodeTextRange;
    381 
    382 inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
    383   return r.first == r.second;
    384 }
    385 
    386 
    387 // *************************** Utilities *************************
    388 
    389 // A factory function for creating a UnicodeText from a buffer of
    390 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
    391 // is an "owner.")
    392 //
    393 // Each byte that is structurally invalid will be replaced with a
    394 // space. Each codepoint that is interchange-invalid will also be
    395 // replaced with a space, even if the codepoint was represented with a
    396 // multibyte sequence in the UTF-8 data.
    397 //
    398 inline UnicodeText MakeUnicodeTextAcceptingOwnership(
    399     char* utf8_buffer, int byte_length, int byte_capacity) {
    400   return UnicodeText().TakeOwnershipOfUTF8(
    401       utf8_buffer, byte_length, byte_capacity);
    402 }
    403 
    404 // A factory function for creating a UnicodeText from a buffer of
    405 // UTF-8 data. The new UnicodeText does not take ownership of the
    406 // buffer. (It is an "alias.")
    407 //
    408 inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
    409     const char* utf8_buffer, int byte_length) {
    410   return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
    411 }
    412 
    413 // Create a UnicodeText from a UTF-8 string or buffer.
    414 //
    415 // If do_copy is true, then a copy of the string is made. The copy is
    416 // owned by the resulting UnicodeText object and will be freed when
    417 // the object is destroyed. This UnicodeText object is referred to
    418 // as an "owner."
    419 //
    420 // If do_copy is false, then no copy is made. The resulting
    421 // UnicodeText object does NOT take ownership of the string; in this
    422 // case, the lifetime of the UnicodeText object must not exceed the
    423 // lifetime of the string. This Unicodetext object is referred to as
    424 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
    425 //
    426 // If the input string does not contain valid UTF-8, then a copy is
    427 // made (as if do_copy were true) and coerced to valid UTF-8 by
    428 // replacing each invalid byte with a space.
    429 //
    430 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
    431                                      bool do_copy) {
    432   UnicodeText t;
    433   if (do_copy) {
    434     t.CopyUTF8(utf8_buf, len);
    435   } else {
    436     t.PointToUTF8(utf8_buf, len);
    437   }
    438   return t;
    439 }
    440 
    441 inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
    442   return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
    443 }
    444 
    445 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
    446   return UTF8ToUnicodeText(utf8_buf, len, true);
    447 }
    448 inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
    449   return UTF8ToUnicodeText(utf8_string, true);
    450 }
    451 
    452 // Return a string containing the UTF-8 encoded version of all the
    453 // Unicode characters in t.
    454 inline string UnicodeTextToUTF8(const UnicodeText& t) {
    455   return string(t.utf8_data(), t.utf8_length());
    456 }
    457 
    458 }  // namespace phonenumbers
    459 }  // namespace i18n
    460 
    461 #endif  // UTIL_UTF8_UNICODETEXT_H__
    462