Home | History | Annotate | Download | only in rapidjson
      1 // Tencent is pleased to support the open source community by making RapidJSON available.
      2 //
      3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
      4 //
      5 // Licensed under the MIT License (the "License"); you may not use this file except
      6 // in compliance with the License. You may obtain a copy of the License at
      7 //
      8 // http://opensource.org/licenses/MIT
      9 //
     10 // Unless required by applicable law or agreed to in writing, software distributed
     11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
     12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
     13 // specific language governing permissions and limitations under the License.
     14 
     15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
     16 #define RAPIDJSON_ENCODEDSTREAM_H_
     17 
     18 #include "rapidjson.h"
     19 
     20 #ifdef __GNUC__
     21 RAPIDJSON_DIAG_PUSH
     22 RAPIDJSON_DIAG_OFF(effc++)
     23 #endif
     24 
     25 RAPIDJSON_NAMESPACE_BEGIN
     26 
     27 //! Input byte stream wrapper with a statically bound encoding.
     28 /*!
     29     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
     30     \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
     31 */
     32 template <typename Encoding, typename InputByteStream>
     33 class EncodedInputStream {
     34     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
     35 public:
     36     typedef typename Encoding::Ch Ch;
     37 
     38     EncodedInputStream(InputByteStream& is) : is_(is) {
     39         current_ = Encoding::TakeBOM(is_);
     40     }
     41 
     42     Ch Peek() const { return current_; }
     43     Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
     44     size_t Tell() const { return is_.Tell(); }
     45 
     46     // Not implemented
     47     void Put(Ch) { RAPIDJSON_ASSERT(false); }
     48     void Flush() { RAPIDJSON_ASSERT(false); }
     49     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
     50     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
     51 
     52 private:
     53     EncodedInputStream(const EncodedInputStream&);
     54     EncodedInputStream& operator=(const EncodedInputStream&);
     55 
     56     InputByteStream& is_;
     57     Ch current_;
     58 };
     59 
     60 //! Output byte stream wrapper with statically bound encoding.
     61 /*!
     62     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
     63     \tparam InputByteStream Type of input byte stream. For example, FileWriteStream.
     64 */
     65 template <typename Encoding, typename OutputByteStream>
     66 class EncodedOutputStream {
     67     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
     68 public:
     69     typedef typename Encoding::Ch Ch;
     70 
     71     EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
     72         if (putBOM)
     73             Encoding::PutBOM(os_);
     74     }
     75 
     76     void Put(Ch c) { Encoding::Put(os_, c);  }
     77     void Flush() { os_.Flush(); }
     78 
     79     // Not implemented
     80     Ch Peek() const { RAPIDJSON_ASSERT(false); }
     81     Ch Take() { RAPIDJSON_ASSERT(false);  }
     82     size_t Tell() const { RAPIDJSON_ASSERT(false);  return 0; }
     83     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
     84     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
     85 
     86 private:
     87     EncodedOutputStream(const EncodedOutputStream&);
     88     EncodedOutputStream& operator=(const EncodedOutputStream&);
     89 
     90     OutputByteStream& os_;
     91 };
     92 
// Expands to a comma-separated list naming the static member `x` of each
// supported encoding, instantiated for the `Ch` typedef of the class in which
// the macro is used. The expansion initialises function-pointer tables that are
// indexed directly by a UTFType value, so the order of the entries here must
// match the numeric order of that enumeration (assumed to run kUTF8 .. kUTF32BE;
// confirm against the UTFType definition). The macro is #undef'd at the end of
// this header.
#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
     94 
     95 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
     96 /*!
     97     \tparam CharType Type of character for reading.
     98     \tparam InputByteStream type of input byte stream to be wrapped.
     99 */
    100 template <typename CharType, typename InputByteStream>
    101 class AutoUTFInputStream {
    102     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
    103 public:
    104     typedef CharType Ch;
    105 
    106     //! Constructor.
    107     /*!
    108         \param is input stream to be wrapped.
    109         \param type UTF encoding type if it is not detected from the stream.
    110     */
    111     AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
    112         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
    113         DetectType();
    114         static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
    115         takeFunc_ = f[type_];
    116         current_ = takeFunc_(*is_);
    117     }
    118 
    119     UTFType GetType() const { return type_; }
    120     bool HasBOM() const { return hasBOM_; }
    121 
    122     Ch Peek() const { return current_; }
    123     Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
    124     size_t Tell() const { return is_->Tell(); }
    125 
    126     // Not implemented
    127     void Put(Ch) { RAPIDJSON_ASSERT(false); }
    128     void Flush() { RAPIDJSON_ASSERT(false); }
    129     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
    130     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
    131 
    132 private:
    133     AutoUTFInputStream(const AutoUTFInputStream&);
    134     AutoUTFInputStream& operator=(const AutoUTFInputStream&);
    135 
    136     // Detect encoding type with BOM or RFC 4627
    137     void DetectType() {
    138         // BOM (Byte Order Mark):
    139         // 00 00 FE FF  UTF-32BE
    140         // FF FE 00 00  UTF-32LE
    141         // FE FF        UTF-16BE
    142         // FF FE        UTF-16LE
    143         // EF BB BF     UTF-8
    144 
    145         const unsigned char* c = (const unsigned char *)is_->Peek4();
    146         if (!c)
    147             return;
    148 
    149         unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
    150         hasBOM_ = false;
    151         if (bom == 0xFFFE0000)                  { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
    152         else if (bom == 0x0000FEFF)             { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
    153         else if ((bom & 0xFFFF) == 0xFFFE)      { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take();                           }
    154         else if ((bom & 0xFFFF) == 0xFEFF)      { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take();                           }
    155         else if ((bom & 0xFFFFFF) == 0xBFBBEF)  { type_ = kUTF8;    hasBOM_ = true; is_->Take(); is_->Take(); is_->Take();              }
    156 
    157         // RFC 4627: Section 3
    158         // "Since the first two characters of a JSON text will always be ASCII
    159         // characters [RFC0020], it is possible to determine whether an octet
    160         // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
    161         // at the pattern of nulls in the first four octets."
    162         // 00 00 00 xx  UTF-32BE
    163         // 00 xx 00 xx  UTF-16BE
    164         // xx 00 00 00  UTF-32LE
    165         // xx 00 xx 00  UTF-16LE
    166         // xx xx xx xx  UTF-8
    167 
    168         if (!hasBOM_) {
    169             unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
    170             switch (pattern) {
    171             case 0x08: type_ = kUTF32BE; break;
    172             case 0x0A: type_ = kUTF16BE; break;
    173             case 0x01: type_ = kUTF32LE; break;
    174             case 0x05: type_ = kUTF16LE; break;
    175             case 0x0F: type_ = kUTF8;    break;
    176             default: break; // Use type defined by user.
    177             }
    178         }
    179 
    180         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
    181         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
    182         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
    183     }
    184 
    185     typedef Ch (*TakeFunc)(InputByteStream& is);
    186     InputByteStream* is_;
    187     UTFType type_;
    188     Ch current_;
    189     TakeFunc takeFunc_;
    190     bool hasBOM_;
    191 };
    192 
    193 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
    194 /*!
    195     \tparam CharType Type of character for writing.
    196     \tparam InputByteStream type of output byte stream to be wrapped.
    197 */
    198 template <typename CharType, typename OutputByteStream>
    199 class AutoUTFOutputStream {
    200     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
    201 public:
    202     typedef CharType Ch;
    203 
    204     //! Constructor.
    205     /*!
    206         \param os output stream to be wrapped.
    207         \param type UTF encoding type.
    208         \param putBOM Whether to write BOM at the beginning of the stream.
    209     */
    210     AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
    211         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
    212 
    213         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
    214         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
    215         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
    216 
    217         static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
    218         putFunc_ = f[type_];
    219 
    220         if (putBOM)
    221             PutBOM();
    222     }
    223 
    224     UTFType GetType() const { return type_; }
    225 
    226     void Put(Ch c) { putFunc_(*os_, c); }
    227     void Flush() { os_->Flush(); }
    228 
    229     // Not implemented
    230     Ch Peek() const { RAPIDJSON_ASSERT(false); }
    231     Ch Take() { RAPIDJSON_ASSERT(false); }
    232     size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
    233     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
    234     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
    235 
    236 private:
    237     AutoUTFOutputStream(const AutoUTFOutputStream&);
    238     AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
    239 
    240     void PutBOM() {
    241         typedef void (*PutBOMFunc)(OutputByteStream&);
    242         static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
    243         f[type_](*os_);
    244     }
    245 
    246     typedef void (*PutFunc)(OutputByteStream&, Ch);
    247 
    248     OutputByteStream* os_;
    249     UTFType type_;
    250     PutFunc putFunc_;
    251 };
    252 
    253 #undef RAPIDJSON_ENCODINGS_FUNC
    254 
    255 RAPIDJSON_NAMESPACE_END
    256 
    257 #ifdef __GNUC__
    258 RAPIDJSON_DIAG_POP
    259 #endif
    260 
     261 #endif // RAPIDJSON_ENCODEDSTREAM_H_
    262