Home | History | Annotate | Download | only in utf
      1 /**
      2  * Copyright 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 // Routines to do manipulation of Unicode characters or text
     18 //
     19 // The StructurallyValid routines accept buffers of arbitrary bytes.
     20 // For CoerceToStructurallyValid(), the input buffer and output buffers may
     21 // point to exactly the same memory.
     22 //
     23 // In all other cases, the UTF-8 string must be structurally valid and
     24 // have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.
     25 // Debug builds take a fatal error for invalid UTF-8 input.
     26 // The input and output buffers may not overlap at all.
     27 //
     28 // The char32 routines are here only for convenience; they convert to UTF-8
     29 // internally and use the UTF-8 routines.
     30 
     31 #ifndef UTIL_UTF8_UNILIB_H__
     32 #define UTIL_UTF8_UNILIB_H__
     33 
     34 #include <string>
     35 #include "phonenumbers/base/basictypes.h"
     36 
     37 namespace i18n {
     38 namespace phonenumbers {
     39 namespace UniLib {
     40 
     41 // Returns true unless a surrogate code point
     42 inline bool IsValidCodepoint(char32 c) {
     43   // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
     44   return (static_cast<uint32>(c) < 0xD800)
     45     || (c >= 0xE000 && c <= 0x10FFFF);
     46 }
     47 
     48 // Table of UTF-8 character lengths, based on first byte
     49 static const unsigned char kUTF8LenTbl[256] = {
     50   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     51   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     52   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     53   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     54 
     55   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     56   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
     57   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
     58   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
     59 };
     60 
     61 // Return length of a single UTF-8 source character
     62 inline int OneCharLen(const char* src) {
     63   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
     64 }
     65 
     66 // Return length of a single UTF-8 source character
     67 inline int OneCharLen(const uint8* src) {
     68   return kUTF8LenTbl[*src];
     69 }
     70 
     71 // Return true if this byte is a trailing UTF-8 byte (10xx xxxx)
     72 inline bool IsTrailByte(char x) {
     73   // return (x & 0xC0) == 0x80;
     74   // Since trail bytes are always in [0x80, 0xBF], we can optimize:
     75   return static_cast<signed char>(x) < -0x40;
     76 }
     77 
     78 // Returns the length in bytes of the prefix of src that is all
     79 //  interchange valid UTF-8
     80 int SpanInterchangeValid(const char* src, int byte_length);
     81 inline int SpanInterchangeValid(const std::string& src) {
     82   return SpanInterchangeValid(src.data(), src.size());
     83 }
     84 
     85 // Returns true if the source is all interchange valid UTF-8
     86 // "Interchange valid" is a stronger than structurally valid --
     87 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
     88 inline bool IsInterchangeValid(const char* src, int byte_length) {
     89   return (byte_length == SpanInterchangeValid(src, byte_length));
     90 }
     91 inline bool IsInterchangeValid(const std::string& src) {
     92   return IsInterchangeValid(src.data(), src.size());
     93 }
     94 
     95 }  // namespace UniLib
     96 }  // namespace phonenumbers
     97 }  // namespace i18n
     98 
#endif  // UTIL_UTF8_UNILIB_H__
    100