1 // icu.h 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Copyright 2005-2010 Google, Inc. 16 // Author: sorenj (at) google.com (Jeffrey Sorensen) 17 // roubert (at) google.com (Fredrik Roubert) 18 // 19 // This library implements an unrestricted Thompson/Pike UTF-8 parser and 20 // serializer. UTF-8 is a restricted subset of this byte stream encoding. See 21 // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding 22 // details. 23 24 #ifndef FST_LIB_ICU_H_ 25 #define FST_LIB_ICU_H_ 26 27 #include <iostream> 28 #include <fstream> 29 #include <sstream> 30 31 namespace fst { 32 33 template <class Label> 34 bool UTF8StringToLabels(const string &str, vector<Label> *labels) { 35 const char *data = str.data(); 36 size_t length = str.size(); 37 for (int i = 0; i < length; /* no update */) { 38 int c = data[i++] & 0xff; 39 if ((c & 0x80) == 0) { 40 labels->push_back(c); 41 } else { 42 if ((c & 0xc0) == 0x80) { 43 LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte"; 44 return false; 45 } 46 int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + 47 (c >= 0xfc); 48 int code = c & ((1 << (6 - count)) - 1); 49 while (count != 0) { 50 if (i == length) { 51 LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence"; 52 return false; 53 } 54 char cb = data[i++]; 55 if ((cb & 0xc0) != 0x80) { 56 LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte"; 57 return false; 58 } 59 code = (code << 6) | (cb & 0x3f); 60 count--; 61 } 62 if (code < 0) { 63 // This should not be able to happen. 64 LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c; 65 return false; 66 } 67 labels->push_back(code); 68 } 69 } 70 return true; 71 } 72 73 template <class Label> 74 bool LabelsToUTF8String(const vector<Label> &labels, string *str) { 75 ostringstream ostr; 76 for (size_t i = 0; i < labels.size(); ++i) { 77 int32_t code = labels[i]; 78 if (code < 0) { 79 LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code; 80 return false; 81 } else if (code < 0x80) { 82 ostr << static_cast<char>(code); 83 } else if (code < 0x800) { 84 ostr << static_cast<char>((code >> 6) | 0xc0); 85 ostr << static_cast<char>((code & 0x3f) | 0x80); 86 } else if (code < 0x10000) { 87 ostr << static_cast<char>((code >> 12) | 0xe0); 88 ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); 89 ostr << static_cast<char>((code & 0x3f) | 0x80); 90 } else if (code < 0x200000) { 91 ostr << static_cast<char>((code >> 18) | 0xf0); 92 ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); 93 ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); 94 ostr << static_cast<char>((code & 0x3f) | 0x80); 95 } else if (code < 0x4000000) { 96 ostr << static_cast<char>((code >> 24) | 0xf8); 97 ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); 98 ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); 99 ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); 100 ostr << static_cast<char>((code & 0x3f) | 0x80); 101 } else { 102 ostr << static_cast<char>((code >> 30) | 0xfc); 103 ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80); 104 ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); 105 ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); 106 ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); 107 ostr << static_cast<char>((code & 0x3f) | 0x80); 108 } 109 } 110 *str = ostr.str(); 111 return true; 112 } 113 114 } // namespace fst 115 116 #endif // FST_LIB_ICU_H_ 117