Home | History | Annotate | Download | only in i18n
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/i18n/streaming_utf8_validator.h"
      6 
      7 #include <stdio.h>
      8 #include <string.h>
      9 
     10 #include <string>
     11 
     12 #include "base/strings/string_piece.h"
     13 #include "testing/gtest/include/gtest/gtest.h"
     14 
     15 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class
     16 // accepts exactly the same set of 4-byte strings as ICU-based validation. This
     17 // tests every possible 4-byte string, so it is too slow to run routinely on
     18 // low-powered machines.
     19 //
     20 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
     21 
     22 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
     23 
     24 #include "base/basictypes.h"
     25 #include "base/bind.h"
     26 #include "base/location.h"
     27 #include "base/logging.h"
     28 #include "base/memory/ref_counted.h"
     29 #include "base/strings/string_util.h"
     30 #include "base/strings/stringprintf.h"
     31 #include "base/strings/utf_string_conversion_utils.h"
     32 #include "base/synchronization/condition_variable.h"
     33 #include "base/synchronization/lock.h"
     34 #include "base/threading/sequenced_worker_pool.h"
     35 #include "third_party/icu/source/common/unicode/utf8.h"
     36 
     37 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
     38 
     39 namespace base {
     40 namespace {
     41 
     42 // Avoid having to qualify the enum values in the tests.
     43 const StreamingUtf8Validator::State VALID_ENDPOINT =
     44     StreamingUtf8Validator::VALID_ENDPOINT;
     45 const StreamingUtf8Validator::State VALID_MIDPOINT =
     46     StreamingUtf8Validator::VALID_MIDPOINT;
     47 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
     48 
     49 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
     50 
     51 const uint32 kThoroughTestChunkSize = 1 << 24;
     52 
     53 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
     54  protected:
     55   StreamingUtf8ValidatorThoroughTest()
     56       : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {}
     57 
     58   // This uses the same logic as base::IsStringUTF8 except it considers
     59   // non-characters valid (and doesn't require a string as input).
     60   static bool IsStringUtf8(const char* src, int32 src_len) {
     61     int32 char_index = 0;
     62 
     63     while (char_index < src_len) {
     64       int32 code_point;
     65       U8_NEXT(src, char_index, src_len, code_point);
     66       if (!base::IsValidCodepoint(code_point))
     67         return false;
     68     }
     69     return true;
     70   }
     71 
     72   // Converts the passed-in integer to a 4 byte string and then
     73   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
     74   // whether it is valid UTF-8 or not.
     75   void TestNumber(uint32 n) const {
     76     char test[sizeof n];
     77     memcpy(test, &n, sizeof n);
     78     StreamingUtf8Validator validator;
     79     EXPECT_EQ(IsStringUtf8(test, sizeof n),
     80               validator.AddBytes(test, sizeof n) == VALID_ENDPOINT)
     81         << "Difference of opinion for \""
     82         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X",
     83                               test[0] & 0xFF,
     84                               test[1] & 0xFF,
     85                               test[2] & 0xFF,
     86                               test[3] & 0xFF) << "\"";
     87   }
     88 
     89  public:
     90   // Tests the 4-byte sequences corresponding to the |size| integers
     91   // starting at |begin|. This is intended to be run from a worker
     92   // pool. Signals |all_done_| at the end if it thinks all tasks are
     93   // finished.
     94   void TestRange(uint32 begin, uint32 size) {
     95     for (uint32 i = 0; i < size; ++i) {
     96       TestNumber(begin + i);
     97     }
     98     base::AutoLock al(lock_);
     99     ++tasks_finished_;
    100     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
    101               << " tasks done\n";
    102     if (tasks_finished_ >= tasks_dispatched_) {
    103       all_done_.Signal();
    104     }
    105   }
    106 
    107  protected:
    108   base::Lock lock_;
    109   base::ConditionVariable all_done_;
    110   int tasks_dispatched_;
    111   int tasks_finished_;
    112 };
    113 
    114 TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) {
    115   scoped_refptr<base::SequencedWorkerPool> pool =
    116       new base::SequencedWorkerPool(32, "TestEverything");
    117   base::AutoLock al(lock_);
    118   uint32 begin = 0;
    119   do {
    120     pool->PostWorkerTask(
    121         FROM_HERE,
    122         base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange,
    123                    base::Unretained(this),
    124                    begin,
    125                    kThoroughTestChunkSize));
    126     ++tasks_dispatched_;
    127     begin += kThoroughTestChunkSize;
    128   } while (begin != 0);
    129   while (tasks_finished_ < tasks_dispatched_)
    130     all_done_.Wait();
    131 }
    132 
    133 #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST
    134 
    135 // These valid and invalid UTF-8 sequences are based on the tests from
    136 // base/strings/string_util_unittest.cc
    137 
    138 // All of the strings in |valid| must represent a single codepoint, because
    139 // partial sequences are constructed by taking non-empty prefixes of these
    140 // strings.
    141 const char* const valid[] = {"\r",           "\n",           "a",
    142                              "\xc2\x81",     "\xe1\x80\xbf", "\xf1\x80\xa0\xbf",
    143                              "\xef\xbb\xbf",  // UTF-8 BOM
    144 };
    145 
    146 const char* const* const valid_end = valid + arraysize(valid);
    147 
    148 const char* const invalid[] = {
    149     // always invalid bytes
    150     "\xc0", "\xc1",
    151     "\xf5", "\xf6", "\xf7",
    152     "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    153     // surrogate code points
    154     "\xed\xa0\x80", "\xed\x0a\x8f", "\xed\xbf\xbf",
    155     //
    156     // overlong sequences
    157     "\xc0\x80"               // U+0000
    158     "\xc1\x80",              // "A"
    159     "\xc1\x81",              // "B"
    160     "\xe0\x80\x80",          // U+0000
    161     "\xe0\x82\x80",          // U+0080
    162     "\xe0\x9f\xbf",          // U+07ff
    163     "\xf0\x80\x80\x8D",      // U+000D
    164     "\xf0\x80\x82\x91",      // U+0091
    165     "\xf0\x80\xa0\x80",      // U+0800
    166     "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
    167     "\xf8\x80\x80\x80\xbf",  // U+003F
    168     "\xfc\x80\x80\x80\xa0\xa5",
    169     //
    170     // Beyond U+10FFFF
    171     "\xf4\x90\x80\x80",          // U+110000
    172     "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    173     "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    174     //
    175     // BOMs in UTF-16(BE|LE)
    176     "\xfe\xff", "\xff\xfe",
    177 };
    178 
    179 const char* const* const invalid_end = invalid + arraysize(invalid);
    180 
    181 // A ForwardIterator which returns all the non-empty prefixes of the elements of
    182 // "valid".
    183 class PartialIterator {
    184  public:
    185   // The constructor returns the first iterator, ie. it is equivalent to
    186   // begin().
    187   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
    188   // The trivial destructor left intentionally undefined.
    189   // This is a value type; the default copy constructor and assignment operator
    190   // generated by the compiler are used.
    191 
    192   static PartialIterator end() { return PartialIterator(arraysize(valid), 1); }
    193 
    194   PartialIterator& operator++() {
    195     Advance();
    196     return *this;
    197   }
    198 
    199   base::StringPiece operator*() const {
    200     return base::StringPiece(valid[index_], prefix_length_);
    201   }
    202 
    203   bool operator==(const PartialIterator& rhs) const {
    204     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
    205   }
    206 
    207   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
    208 
    209  private:
    210   // This constructor is used by the end() method.
    211   PartialIterator(size_t index, size_t prefix_length)
    212       : index_(index), prefix_length_(prefix_length) {}
    213 
    214   void Advance() {
    215     if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_]))
    216       ++prefix_length_;
    217     while (index_ < arraysize(valid) &&
    218            prefix_length_ == strlen(valid[index_])) {
    219       ++index_;
    220       prefix_length_ = 1;
    221     }
    222   }
    223 
    224   // The UTF-8 sequence, as an offset into the |valid| array.
    225   size_t index_;
    226   size_t prefix_length_;
    227 };
    228 
    229 // A test fixture for tests which test one UTF-8 sequence (or invalid
    230 // byte sequence) at a time.
    231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
    232  protected:
    233   // Iterator must be convertible when de-referenced to StringPiece.
    234   template <typename Iterator>
    235   void CheckRange(Iterator begin,
    236                   Iterator end,
    237                   StreamingUtf8Validator::State expected) {
    238     for (Iterator it = begin; it != end; ++it) {
    239       StreamingUtf8Validator validator;
    240       base::StringPiece sequence = *it;
    241       EXPECT_EQ(expected,
    242                 validator.AddBytes(sequence.data(), sequence.size()))
    243           << "Failed for \"" << sequence << "\"";
    244     }
    245   }
    246 
    247   // Adding input a byte at a time should make absolutely no difference.
    248   template <typename Iterator>
    249   void CheckRangeByteAtATime(Iterator begin,
    250                              Iterator end,
    251                              StreamingUtf8Validator::State expected) {
    252     for (Iterator it = begin; it != end; ++it) {
    253       StreamingUtf8Validator validator;
    254       base::StringPiece sequence = *it;
    255       StreamingUtf8Validator::State state = VALID_ENDPOINT;
    256       for (base::StringPiece::const_iterator cit = sequence.begin();
    257            cit != sequence.end();
    258            ++cit) {
    259         state = validator.AddBytes(&*cit, 1);
    260       }
    261       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
    262     }
    263   }
    264 };
    265 
    266 // A test fixture for tests which test the concatenation of byte sequences.
    267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
    268  protected:
    269   // Check every possible concatenation of byte sequences from two
    270   // ranges, and verify that the combination matches the expected
    271   // state.
    272   template <typename Iterator1, typename Iterator2>
    273   void CheckCombinations(Iterator1 begin1,
    274                          Iterator1 end1,
    275                          Iterator2 begin2,
    276                          Iterator2 end2,
    277                          StreamingUtf8Validator::State expected) {
    278     StreamingUtf8Validator validator;
    279     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
    280       base::StringPiece c1 = *it1;
    281       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
    282         base::StringPiece c2 = *it2;
    283         validator.AddBytes(c1.data(), c1.size());
    284         EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size()))
    285             << "Failed for \"" << c1 << c2 << "\"";
    286         validator.Reset();
    287       }
    288     }
    289   }
    290 };
    291 
    292 TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
    293   static const char kNothing[] = "";
    294   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0));
    295 }
    296 
    297 // Because the members of the |valid| array need to be non-zero length
    298 // sequences and are measured with strlen(), |valid| cannot be used it
    299 // to test the NUL character '\0', so the NUL character gets its own
    300 // test.
    301 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
    302   static const char kNul[] = "\x00";
    303   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1));
    304 }
    305 
    306 // Just a basic sanity test before we start getting fancy.
    307 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
    308   static const char kHelloWorld[] = "Hello, World!";
    309   EXPECT_EQ(
    310       VALID_ENDPOINT,
    311       StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld)));
    312 }
    313 
    314 // Check that the Reset() method works.
    315 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
    316   StreamingUtf8Validator validator;
    317   EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1));
    318   EXPECT_EQ(INVALID, validator.AddBytes("a", 1));
    319   validator.Reset();
    320   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1));
    321 }
    322 
    323 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
    324   CheckRange(valid, valid_end, VALID_ENDPOINT);
    325 }
    326 
    327 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
    328   CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
    329 }
    330 
    331 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
    332   CheckRange(invalid, invalid_end, INVALID);
    333 }
    334 
    335 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
    336   CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT);
    337 }
    338 
    339 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
    340   CheckRangeByteAtATime(
    341       PartialIterator(), PartialIterator::end(), VALID_MIDPOINT);
    342 }
    343 
    344 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
    345   CheckRangeByteAtATime(invalid, invalid_end, INVALID);
    346 }
    347 
    348 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
    349   CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT);
    350 }
    351 
    352 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
    353   CheckCombinations(valid,
    354                     valid_end,
    355                     PartialIterator(),
    356                     PartialIterator::end(),
    357                     VALID_MIDPOINT);
    358 }
    359 
    360 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
    361   CheckCombinations(
    362       PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID);
    363 }
    364 
    365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
    366   CheckCombinations(PartialIterator(),
    367                     PartialIterator::end(),
    368                     PartialIterator(),
    369                     PartialIterator::end(),
    370                     INVALID);
    371 }
    372 
    373 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
    374   CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID);
    375 }
    376 
    377 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
    378   CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID);
    379 }
    380 
    381 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
    382   CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID);
    383 }
    384 
    385 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
    386   CheckCombinations(
    387       invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID);
    388 }
    389 
    390 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
    391   CheckCombinations(
    392       PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID);
    393 }
    394 
    395 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
    396   EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string()));
    397 }
    398 
    399 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
    400   EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81"));
    401 }
    402 
    403 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
    404   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80"));
    405 }
    406 
    407 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
    408   EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2"));
    409 }
    410 
    411 }  // namespace
    412 }  // namespace base
    413