Home | History | Annotate | Download | only in fst
      1 // icu.h
      2 
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Copyright 2005-2010 Google, Inc.
     16 // Author: sorenj (at) google.com (Jeffrey Sorensen)
     17 //         roubert (at) google.com (Fredrik Roubert)
     18 //
     19 // This library implements an unrestricted Thompson/Pike UTF-8 parser and
     20 // serializer.  UTF-8 is a restricted subset of this byte stream encoding.  See
     21 // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding
     22 // details.
     23 
     24 #ifndef FST_LIB_ICU_H_
     25 #define FST_LIB_ICU_H_
     26 
     27 #include <iostream>
     28 #include <fstream>
     29 #include <sstream>
     30 
     31 namespace fst {
     32 
     33 template <class Label>
     34 bool UTF8StringToLabels(const string &str, vector<Label> *labels) {
     35   const char *data = str.data();
     36   size_t length = str.size();
     37   for (int i = 0; i < length; /* no update */) {
     38     int c = data[i++] & 0xff;
     39     if ((c & 0x80) == 0) {
     40       labels->push_back(c);
     41     } else {
     42       if ((c & 0xc0) == 0x80) {
     43         LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
     44         return false;
     45       }
     46       int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
     47                   (c >= 0xfc);
     48       int code = c & ((1 << (6 - count)) - 1);
     49       while (count != 0) {
     50         if (i == length) {
     51           LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
     52           return false;
     53         }
     54         char cb = data[i++];
     55         if ((cb & 0xc0) != 0x80) {
     56           LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
     57           return false;
     58         }
     59         code = (code << 6) | (cb & 0x3f);
     60         count--;
     61       }
     62       if (code < 0) {
     63         // This should not be able to happen.
     64         LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
     65         return false;
     66       }
     67       labels->push_back(code);
     68     }
     69   }
     70   return true;
     71 }
     72 
     73 template <class Label>
     74 bool LabelsToUTF8String(const vector<Label> &labels, string *str) {
     75   ostringstream ostr;
     76   for (size_t i = 0; i < labels.size(); ++i) {
     77     int32_t code = labels[i];
     78     if (code < 0) {
     79       LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
     80       return false;
     81     } else if (code < 0x80) {
     82       ostr << static_cast<char>(code);
     83     } else if (code < 0x800) {
     84       ostr << static_cast<char>((code >> 6) | 0xc0);
     85       ostr << static_cast<char>((code & 0x3f) | 0x80);
     86     } else if (code < 0x10000) {
     87       ostr << static_cast<char>((code >> 12) | 0xe0);
     88       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
     89       ostr << static_cast<char>((code & 0x3f) | 0x80);
     90     } else if (code < 0x200000) {
     91       ostr << static_cast<char>((code >> 18) | 0xf0);
     92       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
     93       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
     94       ostr << static_cast<char>((code & 0x3f) | 0x80);
     95     } else if (code < 0x4000000) {
     96       ostr << static_cast<char>((code >> 24) | 0xf8);
     97       ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
     98       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
     99       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
    100       ostr << static_cast<char>((code & 0x3f) | 0x80);
    101     } else {
    102       ostr << static_cast<char>((code >> 30) | 0xfc);
    103       ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80);
    104       ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80);
    105       ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80);
    106       ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80);
    107       ostr << static_cast<char>((code & 0x3f) | 0x80);
    108     }
    109   }
    110   *str = ostr.str();
    111   return true;
    112 }
    113 
    114 }  // namespace fst
    115 
    116 #endif  // FST_LIB_ICU_H_
    117