Home | History | Annotate | Download | only in i18n
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Create a state machine for validating UTF-8. The algorithm in brief:
      6 // 1. Convert the complete unicode range of code points, except for the
      7 //    surrogate code points, to an ordered array of sequences of bytes in
      8 //    UTF-8.
      9 // 2. Convert individual bytes to ranges, starting from the right of each byte
     10 //    sequence. For each range, ensure the bytes on the left and the ranges
     11 //    on the right are the identical.
     12 // 3. Convert the resulting list of ranges into a state machine, collapsing
     13 //    identical states.
     14 // 4. Convert the state machine to an array of bytes.
     15 // 5. Output as a C++ file.
     16 //
     17 // To use:
     18 //  $ ninja -C out/Release build_utf8_validator_tables
     19 //  $ out/Release/build_utf8_validator_tables
     20 //                                   --output=base/i18n/utf8_validator_tables.cc
     21 //  $ git add base/i18n/utf8_validator_tables.cc
     22 //
     23 // Because the table is not expected to ever change, it is checked into the
     24 // repository rather than being regenerated at build time.
     25 //
     26 // This code uses type uint8 throughout to represent bytes, to avoid
     27 // signed/unsigned char confusion.
     28 
     29 #include <stdio.h>
     30 #include <stdlib.h>
     31 #include <string.h>
     32 
     33 #include <algorithm>
     34 #include <map>
     35 #include <string>
     36 #include <vector>
     37 
     38 #include "base/basictypes.h"
     39 #include "base/command_line.h"
     40 #include "base/files/file_path.h"
     41 #include "base/files/file_util.h"
     42 #include "base/logging.h"
     43 #include "base/numerics/safe_conversions.h"
     44 #include "base/strings/stringprintf.h"
     45 #include "third_party/icu/source/common/unicode/utf8.h"
     46 
     47 namespace {
     48 
     49 const char kHelpText[] =
     50     "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
     51 
     52 const char kProlog[] =
     53     "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
     54     "// Use of this source code is governed by a BSD-style license that can "
     55     "be\n"
     56     "// found in the LICENSE file.\n"
     57     "\n"
     58     "// This file is auto-generated by build_utf8_validator_tables.\n"
     59     "// DO NOT EDIT.\n"
     60     "\n"
     61     "#include \"base/i18n/utf8_validator_tables.h\"\n"
     62     "\n"
     63     "namespace base {\n"
     64     "namespace internal {\n"
     65     "\n"
     66     "const uint8 kUtf8ValidatorTables[] = {\n";
     67 
     68 const char kEpilog[] =
     69     "};\n"
     70     "\n"
     71     "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
     72     "\n"
     73     "}  // namespace internal\n"
     74     "}  // namespace base\n";
     75 
     76 // Ranges are inclusive at both ends--they represent [from, to]
     77 class Range {
     78  public:
     79   // Ranges always start with just one byte.
     80   explicit Range(uint8 value) : from_(value), to_(value) {}
     81 
     82   // Range objects are copyable and assignable to be used in STL
     83   // containers. Since they only contain non-pointer POD types, the default copy
     84   // constructor, assignment operator and destructor will work.
     85 
     86   // Add a byte to the range. We intentionally only support adding a byte at the
     87   // end, since that is the only operation the code needs.
     88   void AddByte(uint8 to) {
     89     CHECK(to == to_ + 1);
     90     to_ = to;
     91   }
     92 
     93   uint8 from() const { return from_; }
     94   uint8 to() const { return to_; }
     95 
     96   bool operator<(const Range& rhs) const {
     97     return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
     98   }
     99 
    100   bool operator==(const Range& rhs) const {
    101     return from() == rhs.from() && to() == rhs.to();
    102   }
    103 
    104  private:
    105   uint8 from_;
    106   uint8 to_;
    107 };
    108 
    109 // A vector of Ranges is like a simple regular expression--it corresponds to
    110 // a set of strings of the same length that have bytes in each position in
    111 // the appropriate range.
    112 typedef std::vector<Range> StringSet;
    113 
    114 // A UTF-8 "character" is represented by a sequence of bytes.
    115 typedef std::vector<uint8> Character;
    116 
    117 // In the second stage of the algorithm, we want to convert a large list of
    118 // Characters into a small list of StringSets.
    119 struct Pair {
    120   Character character;
    121   StringSet set;
    122 };
    123 
    124 typedef std::vector<Pair> PairVector;
    125 
    126 // A class to print a table of numbers in the same style as clang-format.
    127 class TablePrinter {
    128  public:
    129   explicit TablePrinter(FILE* stream)
    130       : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
    131 
    132   void PrintValue(uint8 value) {
    133     if (values_on_this_line_ == 0) {
    134       fputs("   ", stream_);
    135     } else if (values_on_this_line_ == kMaxValuesPerLine) {
    136       fprintf(stream_, "  // 0x%02x\n   ", current_offset_);
    137       values_on_this_line_ = 0;
    138     }
    139     fprintf(stream_, " 0x%02x,", static_cast<int>(value));
    140     ++values_on_this_line_;
    141     ++current_offset_;
    142   }
    143 
    144   void NewLine() {
    145     while (values_on_this_line_ < kMaxValuesPerLine) {
    146       fputs("      ", stream_);
    147       ++values_on_this_line_;
    148     }
    149     fprintf(stream_, "  // 0x%02x\n", current_offset_);
    150     values_on_this_line_ = 0;
    151   }
    152 
    153  private:
    154   // stdio stream. Not owned.
    155   FILE* stream_;
    156 
    157   // Number of values so far printed on this line.
    158   int values_on_this_line_;
    159 
    160   // Total values printed so far.
    161   int current_offset_;
    162 
    163   static const int kMaxValuesPerLine = 8;
    164 
    165   DISALLOW_COPY_AND_ASSIGN(TablePrinter);
    166 };
    167 
    168 // Start by filling a PairVector with characters. The resulting vector goes from
    169 // "\x00" to "\xf4\x8f\xbf\xbf".
    170 PairVector InitializeCharacters() {
    171   PairVector vector;
    172   for (int i = 0; i <= 0x10FFFF; ++i) {
    173     if (i >= 0xD800 && i < 0xE000) {
    174       // Surrogate codepoints are not permitted. Non-character code points are
    175       // explicitly permitted.
    176       continue;
    177     }
    178     uint8 bytes[4];
    179     unsigned int offset = 0;
    180     UBool is_error = false;
    181     U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
    182     DCHECK(!is_error);
    183     DCHECK_GT(offset, 0u);
    184     DCHECK_LE(offset, arraysize(bytes));
    185     Pair pair = {Character(bytes, bytes + offset), StringSet()};
    186     vector.push_back(pair);
    187   }
    188   return vector;
    189 }
    190 
    191 // Construct a new Pair from |character| and the concatenation of |new_range|
    192 // and |existing_set|, and append it to |pairs|.
    193 void ConstructPairAndAppend(const Character& character,
    194                             const Range& new_range,
    195                             const StringSet& existing_set,
    196                             PairVector* pairs) {
    197   Pair new_pair = {character, StringSet(1, new_range)};
    198   new_pair.set.insert(
    199       new_pair.set.end(), existing_set.begin(), existing_set.end());
    200   pairs->push_back(new_pair);
    201 }
    202 
    203 // Each pass over the PairVector strips one byte off the right-hand-side of the
    204 // characters and adds a range to the set on the right. For example, the first
    205 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
    206 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
    207 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
    208 // [\xa0-\xbf][\x80-\xbf]).
    209 void MoveRightMostCharToSet(PairVector* pairs) {
    210   PairVector new_pairs;
    211   PairVector::const_iterator it = pairs->begin();
    212   while (it != pairs->end() && it->character.empty()) {
    213     new_pairs.push_back(*it);
    214     ++it;
    215   }
    216   CHECK(it != pairs->end());
    217   Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
    218   Range new_range(it->character.back());
    219   StringSet converted = it->set;
    220   ++it;
    221   while (it != pairs->end()) {
    222     const Pair& current_pair = *it++;
    223     if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
    224         std::equal(unconverted_bytes.begin(),
    225                    unconverted_bytes.end(),
    226                    current_pair.character.begin()) &&
    227         converted == current_pair.set) {
    228       // The particular set of UTF-8 codepoints we are validating guarantees
    229       // that each byte range will be contiguous. This would not necessarily be
    230       // true for an arbitrary set of UTF-8 codepoints.
    231       DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
    232       new_range.AddByte(current_pair.character.back());
    233       continue;
    234     }
    235     ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
    236     unconverted_bytes = Character(current_pair.character.begin(),
    237                                   current_pair.character.end() - 1);
    238     new_range = Range(current_pair.character.back());
    239     converted = current_pair.set;
    240   }
    241   ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
    242   new_pairs.swap(*pairs);
    243 }
    244 
    245 void MoveAllCharsToSets(PairVector* pairs) {
    246   // Since each pass of the function moves one character, and UTF-8 sequences
    247   // are at most 4 characters long, this simply runs the algorithm four times.
    248   for (int i = 0; i < 4; ++i) {
    249     MoveRightMostCharToSet(pairs);
    250   }
    251 #if DCHECK_IS_ON
    252   for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
    253        ++it) {
    254     DCHECK(it->character.empty());
    255   }
    256 #endif
    257 }
    258 
    259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
    260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
    261 // algorithm is working. Use the command-line option
    262 // --vmodule=build_utf8_validator_tables=1 to see this output.
    263 void LogStringSets(const PairVector& pairs) {
    264   for (PairVector::const_iterator pair_it = pairs.begin();
    265        pair_it != pairs.end();
    266        ++pair_it) {
    267     std::string set_as_string;
    268     for (StringSet::const_iterator set_it = pair_it->set.begin();
    269          set_it != pair_it->set.end();
    270          ++set_it) {
    271       set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
    272                                           static_cast<int>(set_it->from()),
    273                                           static_cast<int>(set_it->to()));
    274     }
    275     VLOG(1) << set_as_string;
    276   }
    277 }
    278 
    279 // A single state in the state machine is represented by a sorted vector of
    280 // start bytes and target states. All input bytes in the range between the start
    281 // byte and the next entry in the vector (or 0xFF) result in a transition to the
    282 // target state.
    283 struct StateRange {
    284   uint8 from;
    285   uint8 target_state;
    286 };
    287 
    288 typedef std::vector<StateRange> State;
    289 
    290 // Generates a state where all bytes go to state 1 (invalid). This is also used
    291 // as an initialiser for other states (since bytes from outside the desired
    292 // range are invalid).
    293 State GenerateInvalidState() {
    294   const StateRange range = {0, 1};
    295   return State(1, range);
    296 }
    297 
    298 // A map from a state (ie. a set of strings which will match from this state) to
    299 // a number (which is an index into the array of states).
    300 typedef std::map<StringSet, uint8> StateMap;
    301 
    302 // Create a new state corresponding to |set|, add it |states| and |state_map|
    303 // and return the index it was given in |states|.
    304 uint8 MakeState(const StringSet& set,
    305                 std::vector<State>* states,
    306                 StateMap* state_map) {
    307   DCHECK(!set.empty());
    308   const Range& range = set.front();
    309   const StringSet rest(set.begin() + 1, set.end());
    310   const StateMap::const_iterator where = state_map->find(rest);
    311   const uint8 target_state = where == state_map->end()
    312                                  ? MakeState(rest, states, state_map)
    313                                  : where->second;
    314   DCHECK_LT(0, range.from());
    315   DCHECK_LT(range.to(), 0xFF);
    316   const StateRange new_state_initializer[] = {
    317       {0, 1}, {range.from(), target_state},
    318       {static_cast<uint8>(range.to() + 1), 1}};
    319   states->push_back(
    320       State(new_state_initializer,
    321             new_state_initializer + arraysize(new_state_initializer)));
    322   const uint8 new_state_number =
    323       base::checked_cast<uint8>(states->size() - 1);
    324   CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
    325   return new_state_number;
    326 }
    327 
    328 std::vector<State> GenerateStates(const PairVector& pairs) {
    329   // States 0 and 1 are the initial/valid state and invalid state, respectively.
    330   std::vector<State> states(2, GenerateInvalidState());
    331   StateMap state_map;
    332   state_map.insert(std::make_pair(StringSet(), 0));
    333   for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
    334     DCHECK(it->character.empty());
    335     DCHECK(!it->set.empty());
    336     const Range& range = it->set.front();
    337     const StringSet rest(it->set.begin() + 1, it->set.end());
    338     const StateMap::const_iterator where = state_map.find(rest);
    339     const uint8 target_state = where == state_map.end()
    340                                    ? MakeState(rest, &states, &state_map)
    341                                    : where->second;
    342     if (states[0].back().from == range.from()) {
    343       DCHECK_EQ(1, states[0].back().target_state);
    344       states[0].back().target_state = target_state;
    345       DCHECK_LT(range.to(), 0xFF);
    346       const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};
    347       states[0].push_back(new_range);
    348     } else {
    349       DCHECK_LT(range.to(), 0xFF);
    350       const StateRange new_range_initializer[] = {{range.from(), target_state},
    351            {static_cast<uint8>(range.to() + 1), 1}};
    352       states[0]
    353           .insert(states[0].end(),
    354                   new_range_initializer,
    355                   new_range_initializer + arraysize(new_range_initializer));
    356     }
    357   }
    358   return states;
    359 }
    360 
    361 // Output the generated states as a C++ table. Two tricks are used to compact
    362 // the table: each state in the table starts with a shift value which indicates
    363 // how many bits we can discard from the right-hand-side of the byte before
    364 // doing the table lookup. Secondly, only the state-transitions for bytes
    365 // with the top-bit set are included in the table; bytes without the top-bit set
    366 // are just ASCII and are handled directly by the code.
    367 void PrintStates(const std::vector<State>& states, FILE* stream) {
    368   // First calculate the start-offset of each state. This allows the state
    369   // machine to jump directly to the correct offset, avoiding an extra
    370   // indirection. State 0 starts at offset 0.
    371   std::vector<uint8> state_offset(1, 0);
    372   std::vector<uint8> shifts;
    373   uint8 pos = 0;
    374 
    375   for (std::vector<State>::const_iterator state_it = states.begin();
    376        state_it != states.end();
    377        ++state_it) {
    378     // We want to set |shift| to the (0-based) index of the least-significant
    379     // set bit in any of the ranges for this state, since this tells us how many
    380     // bits we can discard and still determine what range a byte lies in. Sadly
    381     // it appears that ffs() is not portable, so we do it clumsily.
    382     uint8 shift = 7;
    383     for (State::const_iterator range_it = state_it->begin();
    384          range_it != state_it->end();
    385          ++range_it) {
    386       while (shift > 0 && range_it->from % (1 << shift) != 0) {
    387         --shift;
    388       }
    389     }
    390     shifts.push_back(shift);
    391     pos += 1 + (1 << (7 - shift));
    392     state_offset.push_back(pos);
    393   }
    394 
    395   DCHECK_EQ(129, state_offset[1]);
    396 
    397   fputs(kProlog, stream);
    398   TablePrinter table_printer(stream);
    399 
    400   for (uint8 state_index = 0; state_index < states.size(); ++state_index) {
    401     const uint8 shift = shifts[state_index];
    402     uint8 next_range = 0;
    403     uint8 target_state = 1;
    404     fprintf(stream,
    405             "    // State %d, offset 0x%02x\n",
    406             static_cast<int>(state_index),
    407             static_cast<int>(state_offset[state_index]));
    408     table_printer.PrintValue(shift);
    409     for (int i = 0; i < 0x100; i += (1 << shift)) {
    410       if (next_range < states[state_index].size() &&
    411           states[state_index][next_range].from == i) {
    412         target_state = states[state_index][next_range].target_state;
    413         ++next_range;
    414       }
    415       if (i >= 0x80) {
    416         table_printer.PrintValue(state_offset[target_state]);
    417       }
    418     }
    419     table_printer.NewLine();
    420   }
    421 
    422   fputs(kEpilog, stream);
    423 }
    424 
    425 }  // namespace
    426 
    427 int main(int argc, char* argv[]) {
    428   CommandLine::Init(argc, argv);
    429   logging::LoggingSettings settings;
    430   settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
    431   logging::InitLogging(settings);
    432   if (CommandLine::ForCurrentProcess()->HasSwitch("help")) {
    433     fwrite(kHelpText, 1, arraysize(kHelpText), stdout);
    434     exit(EXIT_SUCCESS);
    435   }
    436   base::FilePath filename =
    437       CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
    438 
    439   FILE* output = stdout;
    440   if (!filename.empty()) {
    441     output = base::OpenFile(filename, "wb");
    442     if (!output)
    443       PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
    444                   << "' for writing";
    445   }
    446 
    447   // Step 1: Enumerate the characters
    448   PairVector pairs = InitializeCharacters();
    449   // Step 2: Convert to sets.
    450   MoveAllCharsToSets(&pairs);
    451   if (VLOG_IS_ON(1)) {
    452     LogStringSets(pairs);
    453   }
    454   // Step 3: Generate states.
    455   std::vector<State> states = GenerateStates(pairs);
    456   // Step 4/5: Print output
    457   PrintStates(states, output);
    458 
    459   if (!filename.empty()) {
    460     if (!base::CloseFile(output))
    461       PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
    462                   << "'";
    463   }
    464 
    465   return EXIT_SUCCESS;
    466 }
    467