Home | History | Annotate | Download | only in i18n
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // A streaming validator for UTF-8. Validation is based on the definition in
      6 // RFC-3629. In particular, it does not reject the invalid characters rejected
      7 // by base::IsStringUTF8().
      8 //
      9 // The implementation detects errors on the first possible byte.
     10 
     11 #ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
     12 #define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
     13 
     14 #include <string>
     15 
     16 #include "base/basictypes.h"
     17 #include "base/i18n/base_i18n_export.h"
     18 
     19 namespace base {
     20 
     21 class BASE_I18N_EXPORT StreamingUtf8Validator {
     22  public:
     23   // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
     24   // processes characters it alternates between VALID_ENDPOINT and
     25   // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
     26   // state changes permanently to INVALID.
     27   enum State {
     28     VALID_ENDPOINT,
     29     VALID_MIDPOINT,
     30     INVALID
     31   };
     32 
     33   StreamingUtf8Validator() : state_(0u) {}
     34   // Trivial destructor intentionally omitted.
     35 
     36   // Validate |size| bytes starting at |data|. If the concatenation of all calls
     37   // to AddBytes() since this object was constructed or reset is a valid UTF-8
     38   // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
     39   // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
     40   // present, returns INVALID.
     41   State AddBytes(const char* data, size_t size);
     42 
     43   // Return the object to a freshly-constructed state so that it can be re-used.
     44   void Reset();
     45 
     46   // Validate a complete string using the same criteria. Returns true if the
     47   // string only contains complete, valid UTF-8 codepoints.
     48   static bool Validate(const std::string& string);
     49 
     50  private:
     51   // The current state of the validator. Value 0 is the initial/valid state.
     52   // The state is stored as an offset into |kUtf8ValidatorTables|. The special
     53   // state |kUtf8InvalidState| is invalid.
     54   uint8 state_;
     55 
     56   // This type could be made copyable but there is currently no use-case for
     57   // it.
     58   DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator);
     59 };
     60 
     61 }  // namespace base
     62 
     63 #endif  // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
     64