/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
30 31 #ifndef UTIL_UTF8_UNILIB_H__ 32 #define UTIL_UTF8_UNILIB_H__ 33 34 #include <string> 35 #include "phonenumbers/base/basictypes.h" 36 37 namespace i18n { 38 namespace phonenumbers { 39 namespace UniLib { 40 41 // Returns true unless a surrogate code point 42 inline bool IsValidCodepoint(char32 c) { 43 // In the range [0, 0xD800) or [0xE000, 0x10FFFF] 44 return (static_cast<uint32>(c) < 0xD800) 45 || (c >= 0xE000 && c <= 0x10FFFF); 46 } 47 48 // Table of UTF-8 character lengths, based on first byte 49 static const unsigned char kUTF8LenTbl[256] = { 50 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 51 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 52 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 53 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 54 55 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 56 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 57 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 58 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 59 }; 60 61 // Return length of a single UTF-8 source character 62 inline int OneCharLen(const char* src) { 63 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; 64 } 65 66 // Return length of a single UTF-8 source character 67 inline int OneCharLen(const uint8* src) { 68 return kUTF8LenTbl[*src]; 69 } 70 71 // Return true if this byte is a trailing UTF-8 byte (10xx xxxx) 72 inline bool IsTrailByte(char x) { 73 // return (x & 0xC0) == 0x80; 74 // Since trail bytes are always in [0x80, 0xBF], we can optimize: 75 return static_cast<signed char>(x) < -0x40; 76 } 77 78 // Returns the length in bytes of the prefix of src that is all 79 // interchange valid UTF-8 80 int SpanInterchangeValid(const char* src, int byte_length); 81 inline int SpanInterchangeValid(const std::string& src) { 82 return SpanInterchangeValid(src.data(), src.size()); 83 } 84 85 // 
Returns true if the source is all interchange valid UTF-8 86 // "Interchange valid" is a stronger than structurally valid -- 87 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. 88 inline bool IsInterchangeValid(const char* src, int byte_length) { 89 return (byte_length == SpanInterchangeValid(src, byte_length)); 90 } 91 inline bool IsInterchangeValid(const std::string& src) { 92 return IsInterchangeValid(src.data(), src.size()); 93 } 94 95 } // namespace UniLib 96 } // namespace phonenumbers 97 } // namespace i18n 98 99 #endif // UTIL_UTF8_PUBLIC_UNILIB_H_ 100