Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
     18 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
     19 
     20 #include <iterator>
     21 #include <string>
     22 #include <utility>
     23 
     24 #include "util/base/integral_types.h"
     25 
     26 namespace libtextclassifier2 {
     27 
     28 // ***************************** UnicodeText **************************
     29 //
     30 // A UnicodeText object is a wrapper around a sequence of Unicode
     31 // codepoint values that allows iteration over these values.
     32 //
     33 // The internal representation of the text is UTF-8. Since UTF-8 is a
     34 // variable-width format, UnicodeText does not provide random access
     35 // to the text, and changes to the text are permitted only at the end.
     36 //
     37 // The UnicodeText class defines a const_iterator. The dereferencing
     38 // operator (*) returns a codepoint (int32). The iterator is a
     39 // read-only iterator. It becomes invalid if the text is changed.
     40 //
     41 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
     42 // 0x10FFFF], but UnicodeText has the additional restriction that it
     43 // can contain only those characters that are valid for interchange on
     44 // the Web. This excludes all of the control codes except for carriage
     45 // return, line feed, and horizontal tab.  It also excludes
     46 // non-characters, but codepoints that are in the Private Use regions
     47 // are allowed, as are codepoints that are unassigned. (See the
     48 // Unicode reference for details.)
     49 //
     50 // MEMORY MANAGEMENT:
     51 //
     52 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
     53 //
     54 // The purpose of an alias is to avoid making an unnecessary copy of a
     55 // UTF-8 buffer while still providing access to the Unicode values
     56 // within that text through iterators. The lifetime of an alias must not
     57 // exceed the lifetime of the buffer from which it was constructed.
     58 //
     59 // Aliases should be used with care. If the source from which an alias
     60 // was created is freed, or if the contents are changed, while the
     61 // alias is still in use, fatal errors could result. But it can be
     62 // quite useful to have a UnicodeText "window" through which to see a
     63 // UTF-8 buffer without having to pay the price of making a copy.
     64 
     65 class UnicodeText {
     66  public:
     67   class const_iterator;
     68 
     69   UnicodeText();  // Create an empty text.
     70   UnicodeText(const UnicodeText& src);
     71   UnicodeText& operator=(UnicodeText&& src);
     72   ~UnicodeText();
     73 
     74   class const_iterator {
     75     typedef const_iterator CI;
     76 
     77    public:
     78     typedef std::input_iterator_tag iterator_category;
     79     typedef char32 value_type;
     80     typedef int difference_type;
     81     typedef void pointer;            // (Not needed.)
     82     typedef const char32 reference;  // (Needed for const_reverse_iterator)
     83 
     84     // Iterators are default-constructible.
     85     const_iterator();
     86 
     87     // It's safe to make multiple passes over a UnicodeText.
     88     const_iterator& operator=(const const_iterator& other);
     89 
     90     char32 operator*() const;  // Dereference
     91 
     92     const_iterator& operator++();     // Advance (++iter)
     93     const_iterator operator++(int) {  // (iter++)
     94       const_iterator result(*this);
     95       ++*this;
     96       return result;
     97     }
     98 
     99     const_iterator& operator--();     // Retreat (--iter)
    100     const_iterator operator--(int) {  // (iter--)
    101       const_iterator result(*this);
    102       --*this;
    103       return result;
    104     }
    105 
    106     friend bool operator==(const CI& lhs, const CI& rhs) {
    107       return lhs.it_ == rhs.it_;
    108     }
    109     friend bool operator!=(const CI& lhs, const CI& rhs) {
    110       return !(lhs == rhs);
    111     }
    112     friend bool operator<(const CI& lhs, const CI& rhs);
    113     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
    114     friend bool operator<=(const CI& lhs, const CI& rhs) {
    115       return !(rhs < lhs);
    116     }
    117     friend bool operator>=(const CI& lhs, const CI& rhs) {
    118       return !(lhs < rhs);
    119     }
    120 
    121     int utf8_length() const {
    122       if (it_[0] < 0x80) {
    123         return 1;
    124       } else if (it_[0] < 0xE0) {
    125         return 2;
    126       } else if (it_[0] < 0xF0) {
    127         return 3;
    128       } else {
    129         return 4;
    130       }
    131     }
    132     const char* utf8_data() const { return it_; }
    133 
    134    private:
    135     friend class UnicodeText;
    136     explicit const_iterator(const char* it) : it_(it) {}
    137 
    138     const char* it_;
    139   };
    140 
    141   const_iterator begin() const;
    142   const_iterator end() const;
    143 
    144   // Gets pointer to the underlying utf8 data.
    145   const char* data() const;
    146 
    147   // Gets length (in bytes) of the underlying utf8 data.
    148   int size_bytes() const;
    149 
    150   // Computes length (in number of Unicode codepoints) of the underlying utf8
    151   // data.
    152   // NOTE: Complexity O(n).
    153   int size_codepoints() const;
    154 
    155   bool empty() const;
    156 
    157   // Checks whether the underlying data is valid utf8 data.
    158   bool is_valid() const;
    159 
    160   bool operator==(const UnicodeText& other) const;
    161 
    162   // x.PointToUTF8(buf,len) changes x so that it points to buf
    163   // ("becomes an alias"). It does not take ownership or copy buf.
    164   // This function assumes that the input is interchange valid UTF8.
    165   UnicodeText& Copy(const UnicodeText& src);
    166   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
    167   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
    168 
    169   // Calling this may invalidate pointers to underlying data.
    170   UnicodeText& AppendUTF8(const char* utf8, int len);
    171   UnicodeText& AppendCodepoint(char32 ch);
    172   void clear();
    173 
    174   std::string ToUTF8String() const;
    175   static std::string UTF8Substring(const const_iterator& first,
    176                                    const const_iterator& last);
    177 
    178  private:
    179   friend class const_iterator;
    180 
    181   class Repr {  // A byte-string.
    182    public:
    183     char* data_;
    184     int size_;
    185     int capacity_;
    186     bool ours_;  // Do we own data_?
    187 
    188     Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
    189     Repr& operator=(Repr&& src);
    190     ~Repr() {
    191       if (ours_) delete[] data_;
    192     }
    193 
    194     void clear();
    195     void reserve(int capacity);
    196     void resize(int size);
    197 
    198     void append(const char* bytes, int byte_length);
    199     void Copy(const char* data, int size);
    200     void PointTo(const char* data, int size);
    201 
    202    private:
    203     Repr& operator=(const Repr&);
    204     Repr(const Repr& other);
    205   };
    206 
    207   Repr repr_;
    208 };
    209 
    210 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
    211     UnicodeTextRange;
    212 
    213 // NOTE: The following are needed to avoid implicit conversion from char* to
    214 // std::string, or from ::string to std::string, because if this happens it
    215 // often results in invalid memory access to a temporary object created during
    216 // such conversion (if do_copy == false).
    217 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
    218 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy);
    219 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
    220 UnicodeText UTF8ToUnicodeText(const std::string& str);
    221 
    222 }  // namespace libtextclassifier2
    223 
    224 #endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
    225