1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Create a state machine for validating UTF-8. The algorithm in brief: 6 // 1. Convert the complete unicode range of code points, except for the 7 // surrogate code points, to an ordered array of sequences of bytes in 8 // UTF-8. 9 // 2. Convert individual bytes to ranges, starting from the right of each byte 10 // sequence. For each range, ensure the bytes on the left and the ranges 11 // on the right are the identical. 12 // 3. Convert the resulting list of ranges into a state machine, collapsing 13 // identical states. 14 // 4. Convert the state machine to an array of bytes. 15 // 5. Output as a C++ file. 16 // 17 // To use: 18 // $ ninja -C out/Release build_utf8_validator_tables 19 // $ out/Release/build_utf8_validator_tables 20 // --output=base/i18n/utf8_validator_tables.cc 21 // $ git add base/i18n/utf8_validator_tables.cc 22 // 23 // Because the table is not expected to ever change, it is checked into the 24 // repository rather than being regenerated at build time. 25 // 26 // This code uses type uint8 throughout to represent bytes, to avoid 27 // signed/unsigned char confusion. 28 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include <algorithm> 34 #include <map> 35 #include <string> 36 #include <vector> 37 38 #include "base/basictypes.h" 39 #include "base/command_line.h" 40 #include "base/files/file_path.h" 41 #include "base/files/file_util.h" 42 #include "base/logging.h" 43 #include "base/numerics/safe_conversions.h" 44 #include "base/strings/stringprintf.h" 45 #include "third_party/icu/source/common/unicode/utf8.h" 46 47 namespace { 48 49 const char kHelpText[] = 50 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; 51 52 const char kProlog[] = 53 "// Copyright 2013 The Chromium Authors. All rights reserved.\n" 54 "// Use of this source code is governed by a BSD-style license that can " 55 "be\n" 56 "// found in the LICENSE file.\n" 57 "\n" 58 "// This file is auto-generated by build_utf8_validator_tables.\n" 59 "// DO NOT EDIT.\n" 60 "\n" 61 "#include \"base/i18n/utf8_validator_tables.h\"\n" 62 "\n" 63 "namespace base {\n" 64 "namespace internal {\n" 65 "\n" 66 "const uint8 kUtf8ValidatorTables[] = {\n"; 67 68 const char kEpilog[] = 69 "};\n" 70 "\n" 71 "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" 72 "\n" 73 "} // namespace internal\n" 74 "} // namespace base\n"; 75 76 // Ranges are inclusive at both ends--they represent [from, to] 77 class Range { 78 public: 79 // Ranges always start with just one byte. 80 explicit Range(uint8 value) : from_(value), to_(value) {} 81 82 // Range objects are copyable and assignable to be used in STL 83 // containers. Since they only contain non-pointer POD types, the default copy 84 // constructor, assignment operator and destructor will work. 85 86 // Add a byte to the range. We intentionally only support adding a byte at the 87 // end, since that is the only operation the code needs. 88 void AddByte(uint8 to) { 89 CHECK(to == to_ + 1); 90 to_ = to; 91 } 92 93 uint8 from() const { return from_; } 94 uint8 to() const { return to_; } 95 96 bool operator<(const Range& rhs) const { 97 return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); 98 } 99 100 bool operator==(const Range& rhs) const { 101 return from() == rhs.from() && to() == rhs.to(); 102 } 103 104 private: 105 uint8 from_; 106 uint8 to_; 107 }; 108 109 // A vector of Ranges is like a simple regular expression--it corresponds to 110 // a set of strings of the same length that have bytes in each position in 111 // the appropriate range. 112 typedef std::vector<Range> StringSet; 113 114 // A UTF-8 "character" is represented by a sequence of bytes. 115 typedef std::vector<uint8> Character; 116 117 // In the second stage of the algorithm, we want to convert a large list of 118 // Characters into a small list of StringSets. 119 struct Pair { 120 Character character; 121 StringSet set; 122 }; 123 124 typedef std::vector<Pair> PairVector; 125 126 // A class to print a table of numbers in the same style as clang-format. 127 class TablePrinter { 128 public: 129 explicit TablePrinter(FILE* stream) 130 : stream_(stream), values_on_this_line_(0), current_offset_(0) {} 131 132 void PrintValue(uint8 value) { 133 if (values_on_this_line_ == 0) { 134 fputs(" ", stream_); 135 } else if (values_on_this_line_ == kMaxValuesPerLine) { 136 fprintf(stream_, " // 0x%02x\n ", current_offset_); 137 values_on_this_line_ = 0; 138 } 139 fprintf(stream_, " 0x%02x,", static_cast<int>(value)); 140 ++values_on_this_line_; 141 ++current_offset_; 142 } 143 144 void NewLine() { 145 while (values_on_this_line_ < kMaxValuesPerLine) { 146 fputs(" ", stream_); 147 ++values_on_this_line_; 148 } 149 fprintf(stream_, " // 0x%02x\n", current_offset_); 150 values_on_this_line_ = 0; 151 } 152 153 private: 154 // stdio stream. Not owned. 155 FILE* stream_; 156 157 // Number of values so far printed on this line. 158 int values_on_this_line_; 159 160 // Total values printed so far. 161 int current_offset_; 162 163 static const int kMaxValuesPerLine = 8; 164 165 DISALLOW_COPY_AND_ASSIGN(TablePrinter); 166 }; 167 168 // Start by filling a PairVector with characters. The resulting vector goes from 169 // "\x00" to "\xf4\x8f\xbf\xbf". 170 PairVector InitializeCharacters() { 171 PairVector vector; 172 for (int i = 0; i <= 0x10FFFF; ++i) { 173 if (i >= 0xD800 && i < 0xE000) { 174 // Surrogate codepoints are not permitted. Non-character code points are 175 // explicitly permitted. 176 continue; 177 } 178 uint8 bytes[4]; 179 unsigned int offset = 0; 180 UBool is_error = false; 181 U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); 182 DCHECK(!is_error); 183 DCHECK_GT(offset, 0u); 184 DCHECK_LE(offset, arraysize(bytes)); 185 Pair pair = {Character(bytes, bytes + offset), StringSet()}; 186 vector.push_back(pair); 187 } 188 return vector; 189 } 190 191 // Construct a new Pair from |character| and the concatenation of |new_range| 192 // and |existing_set|, and append it to |pairs|. 193 void ConstructPairAndAppend(const Character& character, 194 const Range& new_range, 195 const StringSet& existing_set, 196 PairVector* pairs) { 197 Pair new_pair = {character, StringSet(1, new_range)}; 198 new_pair.set.insert( 199 new_pair.set.end(), existing_set.begin(), existing_set.end()); 200 pairs->push_back(new_pair); 201 } 202 203 // Each pass over the PairVector strips one byte off the right-hand-side of the 204 // characters and adds a range to the set on the right. For example, the first 205 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", 206 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", 207 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", 208 // [\xa0-\xbf][\x80-\xbf]). 209 void MoveRightMostCharToSet(PairVector* pairs) { 210 PairVector new_pairs; 211 PairVector::const_iterator it = pairs->begin(); 212 while (it != pairs->end() && it->character.empty()) { 213 new_pairs.push_back(*it); 214 ++it; 215 } 216 CHECK(it != pairs->end()); 217 Character unconverted_bytes(it->character.begin(), it->character.end() - 1); 218 Range new_range(it->character.back()); 219 StringSet converted = it->set; 220 ++it; 221 while (it != pairs->end()) { 222 const Pair& current_pair = *it++; 223 if (current_pair.character.size() == unconverted_bytes.size() + 1 && 224 std::equal(unconverted_bytes.begin(), 225 unconverted_bytes.end(), 226 current_pair.character.begin()) && 227 converted == current_pair.set) { 228 // The particular set of UTF-8 codepoints we are validating guarantees 229 // that each byte range will be contiguous. This would not necessarily be 230 // true for an arbitrary set of UTF-8 codepoints. 231 DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); 232 new_range.AddByte(current_pair.character.back()); 233 continue; 234 } 235 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 236 unconverted_bytes = Character(current_pair.character.begin(), 237 current_pair.character.end() - 1); 238 new_range = Range(current_pair.character.back()); 239 converted = current_pair.set; 240 } 241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); 242 new_pairs.swap(*pairs); 243 } 244 245 void MoveAllCharsToSets(PairVector* pairs) { 246 // Since each pass of the function moves one character, and UTF-8 sequences 247 // are at most 4 characters long, this simply runs the algorithm four times. 248 for (int i = 0; i < 4; ++i) { 249 MoveRightMostCharToSet(pairs); 250 } 251 #if DCHECK_IS_ON 252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); 253 ++it) { 254 DCHECK(it->character.empty()); 255 } 256 #endif 257 } 258 259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], 260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the 261 // algorithm is working. Use the command-line option 262 // --vmodule=build_utf8_validator_tables=1 to see this output. 263 void LogStringSets(const PairVector& pairs) { 264 for (PairVector::const_iterator pair_it = pairs.begin(); 265 pair_it != pairs.end(); 266 ++pair_it) { 267 std::string set_as_string; 268 for (StringSet::const_iterator set_it = pair_it->set.begin(); 269 set_it != pair_it->set.end(); 270 ++set_it) { 271 set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", 272 static_cast<int>(set_it->from()), 273 static_cast<int>(set_it->to())); 274 } 275 VLOG(1) << set_as_string; 276 } 277 } 278 279 // A single state in the state machine is represented by a sorted vector of 280 // start bytes and target states. All input bytes in the range between the start 281 // byte and the next entry in the vector (or 0xFF) result in a transition to the 282 // target state. 283 struct StateRange { 284 uint8 from; 285 uint8 target_state; 286 }; 287 288 typedef std::vector<StateRange> State; 289 290 // Generates a state where all bytes go to state 1 (invalid). This is also used 291 // as an initialiser for other states (since bytes from outside the desired 292 // range are invalid). 293 State GenerateInvalidState() { 294 const StateRange range = {0, 1}; 295 return State(1, range); 296 } 297 298 // A map from a state (ie. a set of strings which will match from this state) to 299 // a number (which is an index into the array of states). 300 typedef std::map<StringSet, uint8> StateMap; 301 302 // Create a new state corresponding to |set|, add it |states| and |state_map| 303 // and return the index it was given in |states|. 304 uint8 MakeState(const StringSet& set, 305 std::vector<State>* states, 306 StateMap* state_map) { 307 DCHECK(!set.empty()); 308 const Range& range = set.front(); 309 const StringSet rest(set.begin() + 1, set.end()); 310 const StateMap::const_iterator where = state_map->find(rest); 311 const uint8 target_state = where == state_map->end() 312 ? MakeState(rest, states, state_map) 313 : where->second; 314 DCHECK_LT(0, range.from()); 315 DCHECK_LT(range.to(), 0xFF); 316 const StateRange new_state_initializer[] = { 317 {0, 1}, {range.from(), target_state}, 318 {static_cast<uint8>(range.to() + 1), 1}}; 319 states->push_back( 320 State(new_state_initializer, 321 new_state_initializer + arraysize(new_state_initializer))); 322 const uint8 new_state_number = 323 base::checked_cast<uint8>(states->size() - 1); 324 CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); 325 return new_state_number; 326 } 327 328 std::vector<State> GenerateStates(const PairVector& pairs) { 329 // States 0 and 1 are the initial/valid state and invalid state, respectively. 330 std::vector<State> states(2, GenerateInvalidState()); 331 StateMap state_map; 332 state_map.insert(std::make_pair(StringSet(), 0)); 333 for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { 334 DCHECK(it->character.empty()); 335 DCHECK(!it->set.empty()); 336 const Range& range = it->set.front(); 337 const StringSet rest(it->set.begin() + 1, it->set.end()); 338 const StateMap::const_iterator where = state_map.find(rest); 339 const uint8 target_state = where == state_map.end() 340 ? MakeState(rest, &states, &state_map) 341 : where->second; 342 if (states[0].back().from == range.from()) { 343 DCHECK_EQ(1, states[0].back().target_state); 344 states[0].back().target_state = target_state; 345 DCHECK_LT(range.to(), 0xFF); 346 const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; 347 states[0].push_back(new_range); 348 } else { 349 DCHECK_LT(range.to(), 0xFF); 350 const StateRange new_range_initializer[] = {{range.from(), target_state}, 351 {static_cast<uint8>(range.to() + 1), 1}}; 352 states[0] 353 .insert(states[0].end(), 354 new_range_initializer, 355 new_range_initializer + arraysize(new_range_initializer)); 356 } 357 } 358 return states; 359 } 360 361 // Output the generated states as a C++ table. Two tricks are used to compact 362 // the table: each state in the table starts with a shift value which indicates 363 // how many bits we can discard from the right-hand-side of the byte before 364 // doing the table lookup. Secondly, only the state-transitions for bytes 365 // with the top-bit set are included in the table; bytes without the top-bit set 366 // are just ASCII and are handled directly by the code. 367 void PrintStates(const std::vector<State>& states, FILE* stream) { 368 // First calculate the start-offset of each state. This allows the state 369 // machine to jump directly to the correct offset, avoiding an extra 370 // indirection. State 0 starts at offset 0. 371 std::vector<uint8> state_offset(1, 0); 372 std::vector<uint8> shifts; 373 uint8 pos = 0; 374 375 for (std::vector<State>::const_iterator state_it = states.begin(); 376 state_it != states.end(); 377 ++state_it) { 378 // We want to set |shift| to the (0-based) index of the least-significant 379 // set bit in any of the ranges for this state, since this tells us how many 380 // bits we can discard and still determine what range a byte lies in. Sadly 381 // it appears that ffs() is not portable, so we do it clumsily. 382 uint8 shift = 7; 383 for (State::const_iterator range_it = state_it->begin(); 384 range_it != state_it->end(); 385 ++range_it) { 386 while (shift > 0 && range_it->from % (1 << shift) != 0) { 387 --shift; 388 } 389 } 390 shifts.push_back(shift); 391 pos += 1 + (1 << (7 - shift)); 392 state_offset.push_back(pos); 393 } 394 395 DCHECK_EQ(129, state_offset[1]); 396 397 fputs(kProlog, stream); 398 TablePrinter table_printer(stream); 399 400 for (uint8 state_index = 0; state_index < states.size(); ++state_index) { 401 const uint8 shift = shifts[state_index]; 402 uint8 next_range = 0; 403 uint8 target_state = 1; 404 fprintf(stream, 405 " // State %d, offset 0x%02x\n", 406 static_cast<int>(state_index), 407 static_cast<int>(state_offset[state_index])); 408 table_printer.PrintValue(shift); 409 for (int i = 0; i < 0x100; i += (1 << shift)) { 410 if (next_range < states[state_index].size() && 411 states[state_index][next_range].from == i) { 412 target_state = states[state_index][next_range].target_state; 413 ++next_range; 414 } 415 if (i >= 0x80) { 416 table_printer.PrintValue(state_offset[target_state]); 417 } 418 } 419 table_printer.NewLine(); 420 } 421 422 fputs(kEpilog, stream); 423 } 424 425 } // namespace 426 427 int main(int argc, char* argv[]) { 428 CommandLine::Init(argc, argv); 429 logging::LoggingSettings settings; 430 settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; 431 logging::InitLogging(settings); 432 if (CommandLine::ForCurrentProcess()->HasSwitch("help")) { 433 fwrite(kHelpText, 1, arraysize(kHelpText), stdout); 434 exit(EXIT_SUCCESS); 435 } 436 base::FilePath filename = 437 CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); 438 439 FILE* output = stdout; 440 if (!filename.empty()) { 441 output = base::OpenFile(filename, "wb"); 442 if (!output) 443 PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() 444 << "' for writing"; 445 } 446 447 // Step 1: Enumerate the characters 448 PairVector pairs = InitializeCharacters(); 449 // Step 2: Convert to sets. 450 MoveAllCharsToSets(&pairs); 451 if (VLOG_IS_ON(1)) { 452 LogStringSets(pairs); 453 } 454 // Step 3: Generate states. 455 std::vector<State> states = GenerateStates(pairs); 456 // Step 4/5: Print output 457 PrintStates(states, output); 458 459 if (!filename.empty()) { 460 if (!base::CloseFile(output)) 461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() 462 << "'"; 463 } 464 465 return EXIT_SUCCESS; 466 } 467