1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/i18n/streaming_utf8_validator.h" 6 7 #include <stdio.h> 8 #include <string.h> 9 10 #include <string> 11 12 #include "base/strings/string_piece.h" 13 #include "testing/gtest/include/gtest/gtest.h" 14 15 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class 16 // accepts exactly the same set of 4-byte strings as ICU-based validation. This 17 // tests every possible 4-byte string, so it is too slow to run routinely on 18 // low-powered machines. 19 // 20 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST 21 22 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST 23 24 #include "base/basictypes.h" 25 #include "base/bind.h" 26 #include "base/location.h" 27 #include "base/logging.h" 28 #include "base/memory/ref_counted.h" 29 #include "base/strings/string_util.h" 30 #include "base/strings/stringprintf.h" 31 #include "base/strings/utf_string_conversion_utils.h" 32 #include "base/synchronization/condition_variable.h" 33 #include "base/synchronization/lock.h" 34 #include "base/threading/sequenced_worker_pool.h" 35 #include "third_party/icu/source/common/unicode/utf8.h" 36 37 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST 38 39 namespace base { 40 namespace { 41 42 // Avoid having to qualify the enum values in the tests. 43 const StreamingUtf8Validator::State VALID_ENDPOINT = 44 StreamingUtf8Validator::VALID_ENDPOINT; 45 const StreamingUtf8Validator::State VALID_MIDPOINT = 46 StreamingUtf8Validator::VALID_MIDPOINT; 47 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID; 48 49 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST 50 51 const uint32 kThoroughTestChunkSize = 1 << 24; 52 53 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test { 54 protected: 55 StreamingUtf8ValidatorThoroughTest() 56 : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {} 57 58 // This uses the same logic as base::IsStringUTF8 except it considers 59 // non-characters valid (and doesn't require a string as input). 60 static bool IsStringUtf8(const char* src, int32 src_len) { 61 int32 char_index = 0; 62 63 while (char_index < src_len) { 64 int32 code_point; 65 U8_NEXT(src, char_index, src_len, code_point); 66 if (!base::IsValidCodepoint(code_point)) 67 return false; 68 } 69 return true; 70 } 71 72 // Converts the passed-in integer to a 4 byte string and then 73 // verifies that IsStringUtf8 and StreamingUtf8Validator agree on 74 // whether it is valid UTF-8 or not. 75 void TestNumber(uint32 n) const { 76 char test[sizeof n]; 77 memcpy(test, &n, sizeof n); 78 StreamingUtf8Validator validator; 79 EXPECT_EQ(IsStringUtf8(test, sizeof n), 80 validator.AddBytes(test, sizeof n) == VALID_ENDPOINT) 81 << "Difference of opinion for \"" 82 << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X", 83 test[0] & 0xFF, 84 test[1] & 0xFF, 85 test[2] & 0xFF, 86 test[3] & 0xFF) << "\""; 87 } 88 89 public: 90 // Tests the 4-byte sequences corresponding to the |size| integers 91 // starting at |begin|. This is intended to be run from a worker 92 // pool. Signals |all_done_| at the end if it thinks all tasks are 93 // finished. 94 void TestRange(uint32 begin, uint32 size) { 95 for (uint32 i = 0; i < size; ++i) { 96 TestNumber(begin + i); 97 } 98 base::AutoLock al(lock_); 99 ++tasks_finished_; 100 LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_ 101 << " tasks done\n"; 102 if (tasks_finished_ >= tasks_dispatched_) { 103 all_done_.Signal(); 104 } 105 } 106 107 protected: 108 base::Lock lock_; 109 base::ConditionVariable all_done_; 110 int tasks_dispatched_; 111 int tasks_finished_; 112 }; 113 114 TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) { 115 scoped_refptr<base::SequencedWorkerPool> pool = 116 new base::SequencedWorkerPool(32, "TestEverything"); 117 base::AutoLock al(lock_); 118 uint32 begin = 0; 119 do { 120 pool->PostWorkerTask( 121 FROM_HERE, 122 base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange, 123 base::Unretained(this), 124 begin, 125 kThoroughTestChunkSize)); 126 ++tasks_dispatched_; 127 begin += kThoroughTestChunkSize; 128 } while (begin != 0); 129 while (tasks_finished_ < tasks_dispatched_) 130 all_done_.Wait(); 131 } 132 133 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST 134 135 // These valid and invalid UTF-8 sequences are based on the tests from 136 // base/strings/string_util_unittest.cc 137 138 // All of the strings in |valid| must represent a single codepoint, because 139 // partial sequences are constructed by taking non-empty prefixes of these 140 // strings. 141 const char* const valid[] = {"\r", "\n", "a", 142 "\xc2\x81", "\xe1\x80\xbf", "\xf1\x80\xa0\xbf", 143 "\xef\xbb\xbf", // UTF-8 BOM 144 }; 145 146 const char* const* const valid_end = valid + arraysize(valid); 147 148 const char* const invalid[] = { 149 // always invalid bytes 150 "\xc0", "\xc1", 151 "\xf5", "\xf6", "\xf7", 152 "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff", 153 // surrogate code points 154 "\xed\xa0\x80", "\xed\x0a\x8f", "\xed\xbf\xbf", 155 // 156 // overlong sequences 157 "\xc0\x80" // U+0000 158 "\xc1\x80", // "A" 159 "\xc1\x81", // "B" 160 "\xe0\x80\x80", // U+0000 161 "\xe0\x82\x80", // U+0080 162 "\xe0\x9f\xbf", // U+07ff 163 "\xf0\x80\x80\x8D", // U+000D 164 "\xf0\x80\x82\x91", // U+0091 165 "\xf0\x80\xa0\x80", // U+0800 166 "\xf0\x8f\xbb\xbf", // U+FEFF (BOM) 167 "\xf8\x80\x80\x80\xbf", // U+003F 168 "\xfc\x80\x80\x80\xa0\xa5", 169 // 170 // Beyond U+10FFFF 171 "\xf4\x90\x80\x80", // U+110000 172 "\xf8\xa0\xbf\x80\xbf", // 5 bytes 173 "\xfc\x9c\xbf\x80\xbf\x80", // 6 bytes 174 // 175 // BOMs in UTF-16(BE|LE) 176 "\xfe\xff", "\xff\xfe", 177 }; 178 179 const char* const* const invalid_end = invalid + arraysize(invalid); 180 181 // A ForwardIterator which returns all the non-empty prefixes of the elements of 182 // "valid". 183 class PartialIterator { 184 public: 185 // The constructor returns the first iterator, ie. it is equivalent to 186 // begin(). 187 PartialIterator() : index_(0), prefix_length_(0) { Advance(); } 188 // The trivial destructor left intentionally undefined. 189 // This is a value type; the default copy constructor and assignment operator 190 // generated by the compiler are used. 191 192 static PartialIterator end() { return PartialIterator(arraysize(valid), 1); } 193 194 PartialIterator& operator++() { 195 Advance(); 196 return *this; 197 } 198 199 base::StringPiece operator*() const { 200 return base::StringPiece(valid[index_], prefix_length_); 201 } 202 203 bool operator==(const PartialIterator& rhs) const { 204 return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_; 205 } 206 207 bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); } 208 209 private: 210 // This constructor is used by the end() method. 211 PartialIterator(size_t index, size_t prefix_length) 212 : index_(index), prefix_length_(prefix_length) {} 213 214 void Advance() { 215 if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_])) 216 ++prefix_length_; 217 while (index_ < arraysize(valid) && 218 prefix_length_ == strlen(valid[index_])) { 219 ++index_; 220 prefix_length_ = 1; 221 } 222 } 223 224 // The UTF-8 sequence, as an offset into the |valid| array. 225 size_t index_; 226 size_t prefix_length_; 227 }; 228 229 // A test fixture for tests which test one UTF-8 sequence (or invalid 230 // byte sequence) at a time. 231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test { 232 protected: 233 // Iterator must be convertible when de-referenced to StringPiece. 234 template <typename Iterator> 235 void CheckRange(Iterator begin, 236 Iterator end, 237 StreamingUtf8Validator::State expected) { 238 for (Iterator it = begin; it != end; ++it) { 239 StreamingUtf8Validator validator; 240 base::StringPiece sequence = *it; 241 EXPECT_EQ(expected, 242 validator.AddBytes(sequence.data(), sequence.size())) 243 << "Failed for \"" << sequence << "\""; 244 } 245 } 246 247 // Adding input a byte at a time should make absolutely no difference. 248 template <typename Iterator> 249 void CheckRangeByteAtATime(Iterator begin, 250 Iterator end, 251 StreamingUtf8Validator::State expected) { 252 for (Iterator it = begin; it != end; ++it) { 253 StreamingUtf8Validator validator; 254 base::StringPiece sequence = *it; 255 StreamingUtf8Validator::State state = VALID_ENDPOINT; 256 for (base::StringPiece::const_iterator cit = sequence.begin(); 257 cit != sequence.end(); 258 ++cit) { 259 state = validator.AddBytes(&*cit, 1); 260 } 261 EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\""; 262 } 263 } 264 }; 265 266 // A test fixture for tests which test the concatenation of byte sequences. 267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test { 268 protected: 269 // Check every possible concatenation of byte sequences from two 270 // ranges, and verify that the combination matches the expected 271 // state. 272 template <typename Iterator1, typename Iterator2> 273 void CheckCombinations(Iterator1 begin1, 274 Iterator1 end1, 275 Iterator2 begin2, 276 Iterator2 end2, 277 StreamingUtf8Validator::State expected) { 278 StreamingUtf8Validator validator; 279 for (Iterator1 it1 = begin1; it1 != end1; ++it1) { 280 base::StringPiece c1 = *it1; 281 for (Iterator2 it2 = begin2; it2 != end2; ++it2) { 282 base::StringPiece c2 = *it2; 283 validator.AddBytes(c1.data(), c1.size()); 284 EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size())) 285 << "Failed for \"" << c1 << c2 << "\""; 286 validator.Reset(); 287 } 288 } 289 } 290 }; 291 292 TEST(StreamingUtf8ValidatorTest, NothingIsValid) { 293 static const char kNothing[] = ""; 294 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0)); 295 } 296 297 // Because the members of the |valid| array need to be non-zero length 298 // sequences and are measured with strlen(), |valid| cannot be used it 299 // to test the NUL character '\0', so the NUL character gets its own 300 // test. 301 TEST(StreamingUtf8ValidatorTest, NulIsValid) { 302 static const char kNul[] = "\x00"; 303 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1)); 304 } 305 306 // Just a basic sanity test before we start getting fancy. 307 TEST(StreamingUtf8ValidatorTest, HelloWorld) { 308 static const char kHelloWorld[] = "Hello, World!"; 309 EXPECT_EQ( 310 VALID_ENDPOINT, 311 StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld))); 312 } 313 314 // Check that the Reset() method works. 315 TEST(StreamingUtf8ValidatorTest, ResetWorks) { 316 StreamingUtf8Validator validator; 317 EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1)); 318 EXPECT_EQ(INVALID, validator.AddBytes("a", 1)); 319 validator.Reset(); 320 EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1)); 321 } 322 323 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) { 324 CheckRange(valid, valid_end, VALID_ENDPOINT); 325 } 326 327 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) { 328 CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT); 329 } 330 331 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) { 332 CheckRange(invalid, invalid_end, INVALID); 333 } 334 335 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) { 336 CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT); 337 } 338 339 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) { 340 CheckRangeByteAtATime( 341 PartialIterator(), PartialIterator::end(), VALID_MIDPOINT); 342 } 343 344 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) { 345 CheckRangeByteAtATime(invalid, invalid_end, INVALID); 346 } 347 348 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) { 349 CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT); 350 } 351 352 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) { 353 CheckCombinations(valid, 354 valid_end, 355 PartialIterator(), 356 PartialIterator::end(), 357 VALID_MIDPOINT); 358 } 359 360 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) { 361 CheckCombinations( 362 PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID); 363 } 364 365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) { 366 CheckCombinations(PartialIterator(), 367 PartialIterator::end(), 368 PartialIterator(), 369 PartialIterator::end(), 370 INVALID); 371 } 372 373 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) { 374 CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID); 375 } 376 377 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) { 378 CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID); 379 } 380 381 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) { 382 CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID); 383 } 384 385 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) { 386 CheckCombinations( 387 invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID); 388 } 389 390 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) { 391 CheckCombinations( 392 PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID); 393 } 394 395 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) { 396 EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string())); 397 } 398 399 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) { 400 EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81")); 401 } 402 403 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) { 404 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80")); 405 } 406 407 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) { 408 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2")); 409 } 410 411 } // namespace 412 } // namespace base 413