1 // Tencent is pleased to support the open source community by making RapidJSON available. 2 // 3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. 4 // 5 // Licensed under the MIT License (the "License"); you may not use this file except 6 // in compliance with the License. You may obtain a copy of the License at 7 // 8 // http://opensource.org/licenses/MIT 9 // 10 // Unless required by applicable law or agreed to in writing, software distributed 11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 // specific language governing permissions and limitations under the License. 14 15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_ 16 #define RAPIDJSON_ENCODEDSTREAM_H_ 17 18 #include "rapidjson.h" 19 20 #ifdef __GNUC__ 21 RAPIDJSON_DIAG_PUSH 22 RAPIDJSON_DIAG_OFF(effc++) 23 #endif 24 25 RAPIDJSON_NAMESPACE_BEGIN 26 27 //! Input byte stream wrapper with a statically bound encoding. 28 /*! 29 \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. 30 \tparam InputByteStream Type of input byte stream. For example, FileReadStream. 31 */ 32 template <typename Encoding, typename InputByteStream> 33 class EncodedInputStream { 34 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 35 public: 36 typedef typename Encoding::Ch Ch; 37 38 EncodedInputStream(InputByteStream& is) : is_(is) { 39 current_ = Encoding::TakeBOM(is_); 40 } 41 42 Ch Peek() const { return current_; } 43 Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; } 44 size_t Tell() const { return is_.Tell(); } 45 46 // Not implemented 47 void Put(Ch) { RAPIDJSON_ASSERT(false); } 48 void Flush() { RAPIDJSON_ASSERT(false); } 49 Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } 50 size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } 51 52 private: 53 EncodedInputStream(const EncodedInputStream&); 54 EncodedInputStream& operator=(const EncodedInputStream&); 55 56 InputByteStream& is_; 57 Ch current_; 58 }; 59 60 //! Output byte stream wrapper with statically bound encoding. 61 /*! 62 \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. 63 \tparam InputByteStream Type of input byte stream. For example, FileWriteStream. 64 */ 65 template <typename Encoding, typename OutputByteStream> 66 class EncodedOutputStream { 67 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 68 public: 69 typedef typename Encoding::Ch Ch; 70 71 EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { 72 if (putBOM) 73 Encoding::PutBOM(os_); 74 } 75 76 void Put(Ch c) { Encoding::Put(os_, c); } 77 void Flush() { os_.Flush(); } 78 79 // Not implemented 80 Ch Peek() const { RAPIDJSON_ASSERT(false); } 81 Ch Take() { RAPIDJSON_ASSERT(false); } 82 size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } 83 Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } 84 size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } 85 86 private: 87 EncodedOutputStream(const EncodedOutputStream&); 88 EncodedOutputStream& operator=(const EncodedOutputStream&); 89 90 OutputByteStream& os_; 91 }; 92 93 #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x 94 95 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection. 96 /*! 97 \tparam CharType Type of character for reading. 98 \tparam InputByteStream type of input byte stream to be wrapped. 99 */ 100 template <typename CharType, typename InputByteStream> 101 class AutoUTFInputStream { 102 RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); 103 public: 104 typedef CharType Ch; 105 106 //! Constructor. 107 /*! 108 \param is input stream to be wrapped. 109 \param type UTF encoding type if it is not detected from the stream. 110 */ 111 AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) { 112 RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); 113 DetectType(); 114 static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) }; 115 takeFunc_ = f[type_]; 116 current_ = takeFunc_(*is_); 117 } 118 119 UTFType GetType() const { return type_; } 120 bool HasBOM() const { return hasBOM_; } 121 122 Ch Peek() const { return current_; } 123 Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; } 124 size_t Tell() const { return is_->Tell(); } 125 126 // Not implemented 127 void Put(Ch) { RAPIDJSON_ASSERT(false); } 128 void Flush() { RAPIDJSON_ASSERT(false); } 129 Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } 130 size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } 131 132 private: 133 AutoUTFInputStream(const AutoUTFInputStream&); 134 AutoUTFInputStream& operator=(const AutoUTFInputStream&); 135 136 // Detect encoding type with BOM or RFC 4627 137 void DetectType() { 138 // BOM (Byte Order Mark): 139 // 00 00 FE FF UTF-32BE 140 // FF FE 00 00 UTF-32LE 141 // FE FF UTF-16BE 142 // FF FE UTF-16LE 143 // EF BB BF UTF-8 144 145 const unsigned char* c = (const unsigned char *)is_->Peek4(); 146 if (!c) 147 return; 148 149 unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)); 150 hasBOM_ = false; 151 if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } 152 else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } 153 else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); } 154 else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); } 155 else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); } 156 157 // RFC 4627: Section 3 158 // "Since the first two characters of a JSON text will always be ASCII 159 // characters [RFC0020], it is possible to determine whether an octet 160 // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking 161 // at the pattern of nulls in the first four octets." 162 // 00 00 00 xx UTF-32BE 163 // 00 xx 00 xx UTF-16BE 164 // xx 00 00 00 UTF-32LE 165 // xx 00 xx 00 UTF-16LE 166 // xx xx xx xx UTF-8 167 168 if (!hasBOM_) { 169 unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); 170 switch (pattern) { 171 case 0x08: type_ = kUTF32BE; break; 172 case 0x0A: type_ = kUTF16BE; break; 173 case 0x01: type_ = kUTF32LE; break; 174 case 0x05: type_ = kUTF16LE; break; 175 case 0x0F: type_ = kUTF8; break; 176 default: break; // Use type defined by user. 177 } 178 } 179 180 // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. 181 if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); 182 if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); 183 } 184 185 typedef Ch (*TakeFunc)(InputByteStream& is); 186 InputByteStream* is_; 187 UTFType type_; 188 Ch current_; 189 TakeFunc takeFunc_; 190 bool hasBOM_; 191 }; 192 193 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection. 194 /*! 195 \tparam CharType Type of character for writing. 196 \tparam InputByteStream type of output byte stream to be wrapped. 197 */ 198 template <typename CharType, typename OutputByteStream> 199 class AutoUTFOutputStream { 200 RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); 201 public: 202 typedef CharType Ch; 203 204 //! Constructor. 205 /*! 206 \param os output stream to be wrapped. 207 \param type UTF encoding type. 208 \param putBOM Whether to write BOM at the beginning of the stream. 209 */ 210 AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) { 211 RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); 212 213 // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. 214 if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); 215 if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); 216 217 static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) }; 218 putFunc_ = f[type_]; 219 220 if (putBOM) 221 PutBOM(); 222 } 223 224 UTFType GetType() const { return type_; } 225 226 void Put(Ch c) { putFunc_(*os_, c); } 227 void Flush() { os_->Flush(); } 228 229 // Not implemented 230 Ch Peek() const { RAPIDJSON_ASSERT(false); } 231 Ch Take() { RAPIDJSON_ASSERT(false); } 232 size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } 233 Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } 234 size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } 235 236 private: 237 AutoUTFOutputStream(const AutoUTFOutputStream&); 238 AutoUTFOutputStream& operator=(const AutoUTFOutputStream&); 239 240 void PutBOM() { 241 typedef void (*PutBOMFunc)(OutputByteStream&); 242 static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) }; 243 f[type_](*os_); 244 } 245 246 typedef void (*PutFunc)(OutputByteStream&, Ch); 247 248 OutputByteStream* os_; 249 UTFType type_; 250 PutFunc putFunc_; 251 }; 252 253 #undef RAPIDJSON_ENCODINGS_FUNC 254 255 RAPIDJSON_NAMESPACE_END 256 257 #ifdef __GNUC__ 258 RAPIDJSON_DIAG_POP 259 #endif 260 261 #endif // RAPIDJSON_FILESTREAM_H_ 262