1 // Copyright (C) 2006 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Author: Jim Meehan 16 17 #ifndef UTIL_UTF8_UNICODETEXT_H__ 18 #define UTIL_UTF8_UNICODETEXT_H__ 19 20 #include <iterator> 21 #include <string> 22 #include <utility> 23 #include "phonenumbers/base/basictypes.h" 24 25 namespace i18n { 26 namespace phonenumbers { 27 28 using std::string; 29 using std::bidirectional_iterator_tag; 30 using std::pair; 31 32 // ***************************** UnicodeText ************************** 33 // 34 // A UnicodeText object is a container for a sequence of Unicode 35 // codepoint values. It has default, copy, and assignment constructors. 36 // Data can be appended to it from another UnicodeText, from 37 // iterators, or from a single codepoint. 38 // 39 // The internal representation of the text is UTF-8. Since UTF-8 is a 40 // variable-width format, UnicodeText does not provide random access 41 // to the text, and changes to the text are permitted only at the end. 42 // 43 // The UnicodeText class defines a const_iterator. The dereferencing 44 // operator (*) returns a codepoint (char32). The iterator is a 45 // bidirectional, read-only iterator. It becomes invalid if the text 46 // is changed. 47 // 48 // There are methods for appending and retrieving UTF-8 data directly. 49 // The 'utf8_data' method returns a const char* that contains the 50 // UTF-8-encoded version of the text; 'utf8_length' returns the number 51 // of bytes in the UTF-8 data. An iterator's 'get' method stores up to 52 // 4 bytes of UTF-8 data in a char array and returns the number of 53 // bytes that it stored. 54 // 55 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, 56 // 0x10FFFF], but UnicodeText has the additional restriction that it 57 // can contain only those characters that are valid for interchange on 58 // the Web. This excludes all of the control codes except for carriage 59 // return, line feed, and horizontal tab. It also excludes 60 // non-characters, but codepoints that are in the Private Use regions 61 // are allowed, as are codepoints that are unassigned. (See the 62 // Unicode reference for details.) The function UniLib::IsInterchangeValid 63 // can be used as a test for this property. 64 // 65 // UnicodeTexts are safe. Every method that constructs or modifies a 66 // UnicodeText tests for interchange-validity, and will substitute a 67 // space for the invalid data. Such cases are reported via 68 // LOG(WARNING). 69 // 70 // MEMORY MANAGEMENT: copy, take ownership, or point to 71 // 72 // A UnicodeText is either an "owner", meaning that it owns the memory 73 // for the data buffer and will free it when the UnicodeText is 74 // destroyed, or it is an "alias", meaning that it does not. 75 // 76 // There are three methods for storing UTF-8 data in a UnicodeText: 77 // 78 // CopyUTF8(buffer, len) copies buffer. 79 // 80 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. 81 // 82 // PointToUTF8(buffer, size) creates an alias pointing to buffer. 83 // 84 // All three methods perform a validity check on the buffer. There are 85 // private, "unsafe" versions of these functions that bypass the 86 // validity check. They are used internally and by friend-functions 87 // that are handling UTF-8 data that has already been validated. 88 // 89 // The purpose of an alias is to avoid making an unnecessary copy of a 90 // UTF-8 buffer while still providing access to the Unicode values 91 // within that text through iterators or the fast scanners that are 92 // based on UTF-8 state tables. The lifetime of an alias must not 93 // exceed the lifetime of the buffer from which it was constructed. 94 // 95 // The semantics of an alias might be described as "copy on write or 96 // repair." The source data is never modified. If push_back() or 97 // append() is called on an alias, a copy of the data will be created, 98 // and the UnicodeText will become an owner. If clear() is called on 99 // an alias, it becomes an (empty) owner. 100 // 101 // The copy constructor and the assignment operator produce an owner. 102 // That is, after direct initialization ("UnicodeText x(y);") or copy 103 // initialization ("UnicodeText x = y;") x will be an owner, even if y 104 // was an alias. The assignment operator ("x = y;") also produces an 105 // owner unless x and y are the same object and y is an alias. 106 // 107 // Aliases should be used with care. If the source from which an alias 108 // was created is freed, or if the contents are changed, while the 109 // alias is still in use, fatal errors could result. But it can be 110 // quite useful to have a UnicodeText "window" through which to see a 111 // UTF-8 buffer without having to pay the price of making a copy. 112 // 113 // UTILITIES 114 // 115 // The interfaces in util/utf8/public/textutils.h provide higher-level 116 // utilities for dealing with UnicodeTexts, including routines for 117 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or 118 // strings, creating strings from UnicodeTexts, normalizing text for 119 // efficient matching or display, and others. 120 121 class UnicodeText { 122 public: 123 class const_iterator; 124 125 typedef char32 value_type; 126 127 // Constructors. These always produce owners. 128 UnicodeText(); // Create an empty text. 129 UnicodeText(const UnicodeText& src); // copy constructor 130 // Construct a substring (copies the data). 131 UnicodeText(const const_iterator& first, const const_iterator& last); 132 133 // Assignment operator. This copies the data and produces an owner 134 // unless this == &src, e.g., "x = x;", which is a no-op. 135 UnicodeText& operator=(const UnicodeText& src); 136 137 // x.Copy(y) copies the data from y into x. 138 UnicodeText& Copy(const UnicodeText& src); 139 inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } 140 141 // x.PointTo(y) changes x so that it points to y's data. 142 // It does not copy y or take ownership of y's data. 143 UnicodeText& PointTo(const UnicodeText& src); 144 UnicodeText& PointTo(const const_iterator& first, 145 const const_iterator& last); 146 147 ~UnicodeText(); 148 149 void clear(); // Clear text. 150 bool empty() { return repr_.size_ == 0; } // Test if text is empty. 151 152 // Add a codepoint to the end of the text. 153 // If the codepoint is not interchange-valid, add a space instead 154 // and log a warning. 155 void push_back(char32 codepoint); 156 157 // Generic appending operation. 158 // iterator_traits<ForwardIterator>::value_type must be implicitly 159 // convertible to char32. Typical uses of this method might include: 160 // char32 chars[] = {0x1, 0x2, ...}; 161 // vector<char32> more_chars = ...; 162 // utext.append(chars, chars+arraysize(chars)); 163 // utext.append(more_chars.begin(), more_chars.end()); 164 template<typename ForwardIterator> 165 UnicodeText& append(ForwardIterator first, const ForwardIterator last) { 166 while (first != last) { push_back(*first++); } 167 return *this; 168 } 169 170 // A specialization of the generic append() method. 171 UnicodeText& append(const const_iterator& first, const const_iterator& last); 172 173 // An optimization of append(source.begin(), source.end()). 174 UnicodeText& append(const UnicodeText& source); 175 176 int size() const; // the number of Unicode characters (codepoints) 177 178 friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); 179 friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); 180 181 class const_iterator { 182 typedef const_iterator CI; 183 public: 184 typedef bidirectional_iterator_tag iterator_category; 185 typedef char32 value_type; 186 typedef ptrdiff_t difference_type; 187 typedef void pointer; // (Not needed.) 188 typedef const char32 reference; // (Needed for const_reverse_iterator) 189 190 // Iterators are default-constructible. 191 const_iterator(); 192 193 // It's safe to make multiple passes over a UnicodeText. 194 const_iterator(const const_iterator& other); 195 const_iterator& operator=(const const_iterator& other); 196 197 char32 operator*() const; // Dereference 198 199 const_iterator& operator++(); // Advance (++iter) 200 const_iterator operator++(int) { // (iter++) 201 const_iterator result(*this); 202 ++*this; 203 return result; 204 } 205 206 const_iterator& operator--(); // Retreat (--iter) 207 const_iterator operator--(int) { // (iter--) 208 const_iterator result(*this); 209 --*this; 210 return result; 211 } 212 213 // We love relational operators. 214 friend bool operator==(const CI& lhs, const CI& rhs) { 215 return lhs.it_ == rhs.it_; } 216 friend bool operator!=(const CI& lhs, const CI& rhs) { 217 return !(lhs == rhs); } 218 friend bool operator<(const CI& lhs, const CI& rhs); 219 friend bool operator>(const CI& lhs, const CI& rhs) { 220 return rhs < lhs; } 221 friend bool operator<=(const CI& lhs, const CI& rhs) { 222 return !(rhs < lhs); } 223 friend bool operator>=(const CI& lhs, const CI& rhs) { 224 return !(lhs < rhs); } 225 226 friend difference_type distance(const CI& first, const CI& last); 227 228 // UTF-8-specific methods 229 // Store the UTF-8 encoding of the current codepoint into buf, 230 // which must be at least 4 bytes long. Return the number of 231 // bytes written. 232 int get_utf8(char* buf) const; 233 // Return the iterator's pointer into the UTF-8 data. 234 const char* utf8_data() const { return it_; } 235 236 string DebugString() const; 237 238 private: 239 friend class UnicodeText; 240 friend class UnicodeTextUtils; 241 friend class UTF8StateTableProperty; 242 explicit const_iterator(const char* it) : it_(it) {} 243 244 const char* it_; 245 }; 246 247 const_iterator begin() const; 248 const_iterator end() const; 249 250 class const_reverse_iterator : public std::reverse_iterator<const_iterator> { 251 public: 252 const_reverse_iterator(const_iterator it) : 253 std::reverse_iterator<const_iterator>(it) {} 254 const char* utf8_data() const { 255 const_iterator tmp_it = base(); 256 return (--tmp_it).utf8_data(); 257 } 258 int get_utf8(char* buf) const { 259 const_iterator tmp_it = base(); 260 return (--tmp_it).get_utf8(buf); 261 } 262 }; 263 const_reverse_iterator rbegin() const { 264 return const_reverse_iterator(end()); 265 } 266 const_reverse_iterator rend() const { 267 return const_reverse_iterator(begin()); 268 } 269 270 // Substring searching. Returns the beginning of the first 271 // occurrence of "look", or end() if not found. 272 const_iterator find(const UnicodeText& look, const_iterator start_pos) const; 273 // Equivalent to find(look, begin()) 274 const_iterator find(const UnicodeText& look) const; 275 276 // Returns whether this contains the character U+FFFD. This can 277 // occur, for example, if the input to Encodings::Decode() had byte 278 // sequences that were invalid in the source encoding. 279 bool HasReplacementChar() const; 280 281 // UTF-8-specific methods 282 // 283 // Return the data, length, and capacity of UTF-8-encoded version of 284 // the text. Length and capacity are measured in bytes. 285 const char* utf8_data() const { return repr_.data_; } 286 int utf8_length() const { return repr_.size_; } 287 int utf8_capacity() const { return repr_.capacity_; } 288 289 // Return the UTF-8 data as a string. 290 static string UTF8Substring(const const_iterator& first, 291 const const_iterator& last); 292 293 // There are three methods for initializing a UnicodeText from UTF-8 294 // data. They vary in details of memory management. In all cases, 295 // the data is tested for interchange-validity. If it is not 296 // interchange-valid, a LOG(WARNING) is issued, and each 297 // structurally invalid byte and each interchange-invalid codepoint 298 // is replaced with a space. 299 300 // x.CopyUTF8(buf, len) copies buf into x. 301 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 302 303 // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of 304 // buf. buf is not copied. 305 UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, 306 int byte_length, 307 int byte_capacity); 308 309 // x.PointToUTF8(buf,len) changes x so that it points to buf 310 // ("becomes an alias"). It does not take ownership or copy buf. 311 // If the buffer is not valid, this has the same effect as 312 // CopyUTF8(utf8_buffer, byte_length). 313 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 314 315 // Occasionally it is necessary to use functions that operate on the 316 // pointer returned by utf8_data(). MakeIterator(p) provides a way 317 // to get back to the UnicodeText level. It uses CHECK to ensure 318 // that p is a pointer within this object's UTF-8 data, and that it 319 // points to the beginning of a character. 320 const_iterator MakeIterator(const char* p) const; 321 322 string DebugString() const; 323 324 private: 325 friend class const_iterator; 326 friend class UnicodeTextUtils; 327 328 class Repr { // A byte-string. 329 public: 330 char* data_; 331 int size_; 332 int capacity_; 333 bool ours_; // Do we own data_? 334 335 Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} 336 ~Repr() { if (ours_) delete[] data_; } 337 338 void clear(); 339 void reserve(int capacity); 340 void resize(int size); 341 342 void append(const char* bytes, int byte_length); 343 void Copy(const char* data, int size); 344 void TakeOwnershipOf(char* data, int size, int capacity); 345 void PointTo(const char* data, int size); 346 347 string DebugString() const; 348 349 private: 350 Repr& operator=(const Repr&); 351 Repr(const Repr& other); 352 }; 353 354 Repr repr_; 355 356 // UTF-8-specific private methods. 357 // These routines do not perform a validity check when compiled 358 // in opt mode. 359 // It is an error to call these methods with UTF-8 data that 360 // is not interchange-valid. 361 // 362 UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); 363 UnicodeText& UnsafeTakeOwnershipOfUTF8( 364 char* utf8_buffer, int byte_length, int byte_capacity); 365 UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); 366 UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); 367 const_iterator UnsafeFind(const UnicodeText& look, 368 const_iterator start_pos) const; 369 }; 370 371 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); 372 373 inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { 374 return !(lhs == rhs); 375 } 376 377 // UnicodeTextRange is a pair of iterators, useful for specifying text 378 // segments. If the iterators are ==, the segment is empty. 379 typedef pair<UnicodeText::const_iterator, 380 UnicodeText::const_iterator> UnicodeTextRange; 381 382 inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { 383 return r.first == r.second; 384 } 385 386 387 // *************************** Utilities ************************* 388 389 // A factory function for creating a UnicodeText from a buffer of 390 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It 391 // is an "owner.") 392 // 393 // Each byte that is structurally invalid will be replaced with a 394 // space. Each codepoint that is interchange-invalid will also be 395 // replaced with a space, even if the codepoint was represented with a 396 // multibyte sequence in the UTF-8 data. 397 // 398 inline UnicodeText MakeUnicodeTextAcceptingOwnership( 399 char* utf8_buffer, int byte_length, int byte_capacity) { 400 return UnicodeText().TakeOwnershipOfUTF8( 401 utf8_buffer, byte_length, byte_capacity); 402 } 403 404 // A factory function for creating a UnicodeText from a buffer of 405 // UTF-8 data. The new UnicodeText does not take ownership of the 406 // buffer. (It is an "alias.") 407 // 408 inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( 409 const char* utf8_buffer, int byte_length) { 410 return UnicodeText().PointToUTF8(utf8_buffer, byte_length); 411 } 412 413 // Create a UnicodeText from a UTF-8 string or buffer. 414 // 415 // If do_copy is true, then a copy of the string is made. The copy is 416 // owned by the resulting UnicodeText object and will be freed when 417 // the object is destroyed. This UnicodeText object is referred to 418 // as an "owner." 419 // 420 // If do_copy is false, then no copy is made. The resulting 421 // UnicodeText object does NOT take ownership of the string; in this 422 // case, the lifetime of the UnicodeText object must not exceed the 423 // lifetime of the string. This Unicodetext object is referred to as 424 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. 425 // 426 // If the input string does not contain valid UTF-8, then a copy is 427 // made (as if do_copy were true) and coerced to valid UTF-8 by 428 // replacing each invalid byte with a space. 429 // 430 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, 431 bool do_copy) { 432 UnicodeText t; 433 if (do_copy) { 434 t.CopyUTF8(utf8_buf, len); 435 } else { 436 t.PointToUTF8(utf8_buf, len); 437 } 438 return t; 439 } 440 441 inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { 442 return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); 443 } 444 445 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { 446 return UTF8ToUnicodeText(utf8_buf, len, true); 447 } 448 inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { 449 return UTF8ToUnicodeText(utf8_string, true); 450 } 451 452 // Return a string containing the UTF-8 encoded version of all the 453 // Unicode characters in t. 454 inline string UnicodeTextToUTF8(const UnicodeText& t) { 455 return string(t.utf8_data(), t.utf8_length()); 456 } 457 458 } // namespace phonenumbers 459 } // namespace i18n 460 461 #endif // UTIL_UTF8_UNICODETEXT_H__ 462