1 // Tencent is pleased to support the open source community by making RapidJSON available. 2 // 3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. 4 // 5 // Licensed under the MIT License (the "License"); you may not use this file except 6 // in compliance with the License. You may obtain a copy of the License at 7 // 8 // http://opensource.org/licenses/MIT 9 // 10 // Unless required by applicable law or agreed to in writing, software distributed 11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 // specific language governing permissions and limitations under the License. 14 15 #include "unittest.h" 16 #include "rapidjson/filereadstream.h" 17 #include "rapidjson/filewritestream.h" 18 #include "rapidjson/encodedstream.h" 19 #include "rapidjson/stringbuffer.h" 20 21 using namespace rapidjson; 22 23 // Verification of encoders/decoders with Hoehrmann's UTF8 decoder 24 25 // http://www.unicode.org/Public/UNIDATA/Blocks.txt 26 static const unsigned kCodepointRanges[] = { 27 0x0000, 0x007F, // Basic Latin 28 0x0080, 0x00FF, // Latin-1 Supplement 29 0x0100, 0x017F, // Latin Extended-A 30 0x0180, 0x024F, // Latin Extended-B 31 0x0250, 0x02AF, // IPA Extensions 32 0x02B0, 0x02FF, // Spacing Modifier Letters 33 0x0300, 0x036F, // Combining Diacritical Marks 34 0x0370, 0x03FF, // Greek and Coptic 35 0x0400, 0x04FF, // Cyrillic 36 0x0500, 0x052F, // Cyrillic Supplement 37 0x0530, 0x058F, // Armenian 38 0x0590, 0x05FF, // Hebrew 39 0x0600, 0x06FF, // Arabic 40 0x0700, 0x074F, // Syriac 41 0x0750, 0x077F, // Arabic Supplement 42 0x0780, 0x07BF, // Thaana 43 0x07C0, 0x07FF, // NKo 44 0x0800, 0x083F, // Samaritan 45 0x0840, 0x085F, // Mandaic 46 0x0900, 0x097F, // Devanagari 47 0x0980, 0x09FF, // Bengali 48 0x0A00, 0x0A7F, // Gurmukhi 49 0x0A80, 0x0AFF, // Gujarati 50 0x0B00, 0x0B7F, // Oriya 51 0x0B80, 0x0BFF, // Tamil 52 0x0C00, 0x0C7F, // Telugu 53 0x0C80, 0x0CFF, // Kannada 54 0x0D00, 0x0D7F, // Malayalam 55 0x0D80, 0x0DFF, // Sinhala 56 0x0E00, 0x0E7F, // Thai 57 0x0E80, 0x0EFF, // Lao 58 0x0F00, 0x0FFF, // Tibetan 59 0x1000, 0x109F, // Myanmar 60 0x10A0, 0x10FF, // Georgian 61 0x1100, 0x11FF, // Hangul Jamo 62 0x1200, 0x137F, // Ethiopic 63 0x1380, 0x139F, // Ethiopic Supplement 64 0x13A0, 0x13FF, // Cherokee 65 0x1400, 0x167F, // Unified Canadian Aboriginal Syllabics 66 0x1680, 0x169F, // Ogham 67 0x16A0, 0x16FF, // Runic 68 0x1700, 0x171F, // Tagalog 69 0x1720, 0x173F, // Hanunoo 70 0x1740, 0x175F, // Buhid 71 0x1760, 0x177F, // Tagbanwa 72 0x1780, 0x17FF, // Khmer 73 0x1800, 0x18AF, // Mongolian 74 0x18B0, 0x18FF, // Unified Canadian Aboriginal Syllabics Extended 75 0x1900, 0x194F, // Limbu 76 0x1950, 0x197F, // Tai Le 77 0x1980, 0x19DF, // New Tai Lue 78 0x19E0, 0x19FF, // Khmer Symbols 79 0x1A00, 0x1A1F, // Buginese 80 0x1A20, 0x1AAF, // Tai Tham 81 0x1B00, 0x1B7F, // Balinese 82 0x1B80, 0x1BBF, // Sundanese 83 0x1BC0, 0x1BFF, // Batak 84 0x1C00, 0x1C4F, // Lepcha 85 0x1C50, 0x1C7F, // Ol Chiki 86 0x1CD0, 0x1CFF, // Vedic Extensions 87 0x1D00, 0x1D7F, // Phonetic Extensions 88 0x1D80, 0x1DBF, // Phonetic Extensions Supplement 89 0x1DC0, 0x1DFF, // Combining Diacritical Marks Supplement 90 0x1E00, 0x1EFF, // Latin Extended Additional 91 0x1F00, 0x1FFF, // Greek Extended 92 0x2000, 0x206F, // General Punctuation 93 0x2070, 0x209F, // Superscripts and Subscripts 94 0x20A0, 0x20CF, // Currency Symbols 95 0x20D0, 0x20FF, // Combining Diacritical Marks for Symbols 96 0x2100, 0x214F, // Letterlike Symbols 97 0x2150, 0x218F, // Number Forms 98 0x2190, 0x21FF, // Arrows 99 0x2200, 0x22FF, // Mathematical Operators 100 0x2300, 0x23FF, // Miscellaneous Technical 101 0x2400, 0x243F, // Control Pictures 102 0x2440, 0x245F, // Optical Character Recognition 103 0x2460, 0x24FF, // Enclosed Alphanumerics 104 0x2500, 0x257F, // Box Drawing 105 0x2580, 0x259F, // Block Elements 106 0x25A0, 0x25FF, // Geometric Shapes 107 0x2600, 0x26FF, // Miscellaneous Symbols 108 0x2700, 0x27BF, // Dingbats 109 0x27C0, 0x27EF, // Miscellaneous Mathematical Symbols-A 110 0x27F0, 0x27FF, // Supplemental Arrows-A 111 0x2800, 0x28FF, // Braille Patterns 112 0x2900, 0x297F, // Supplemental Arrows-B 113 0x2980, 0x29FF, // Miscellaneous Mathematical Symbols-B 114 0x2A00, 0x2AFF, // Supplemental Mathematical Operators 115 0x2B00, 0x2BFF, // Miscellaneous Symbols and Arrows 116 0x2C00, 0x2C5F, // Glagolitic 117 0x2C60, 0x2C7F, // Latin Extended-C 118 0x2C80, 0x2CFF, // Coptic 119 0x2D00, 0x2D2F, // Georgian Supplement 120 0x2D30, 0x2D7F, // Tifinagh 121 0x2D80, 0x2DDF, // Ethiopic Extended 122 0x2DE0, 0x2DFF, // Cyrillic Extended-A 123 0x2E00, 0x2E7F, // Supplemental Punctuation 124 0x2E80, 0x2EFF, // CJK Radicals Supplement 125 0x2F00, 0x2FDF, // Kangxi Radicals 126 0x2FF0, 0x2FFF, // Ideographic Description Characters 127 0x3000, 0x303F, // CJK Symbols and Punctuation 128 0x3040, 0x309F, // Hiragana 129 0x30A0, 0x30FF, // Katakana 130 0x3100, 0x312F, // Bopomofo 131 0x3130, 0x318F, // Hangul Compatibility Jamo 132 0x3190, 0x319F, // Kanbun 133 0x31A0, 0x31BF, // Bopomofo Extended 134 0x31C0, 0x31EF, // CJK Strokes 135 0x31F0, 0x31FF, // Katakana Phonetic Extensions 136 0x3200, 0x32FF, // Enclosed CJK Letters and Months 137 0x3300, 0x33FF, // CJK Compatibility 138 0x3400, 0x4DBF, // CJK Unified Ideographs Extension A 139 0x4DC0, 0x4DFF, // Yijing Hexagram Symbols 140 0x4E00, 0x9FFF, // CJK Unified Ideographs 141 0xA000, 0xA48F, // Yi Syllables 142 0xA490, 0xA4CF, // Yi Radicals 143 0xA4D0, 0xA4FF, // Lisu 144 0xA500, 0xA63F, // Vai 145 0xA640, 0xA69F, // Cyrillic Extended-B 146 0xA6A0, 0xA6FF, // Bamum 147 0xA700, 0xA71F, // Modifier Tone Letters 148 0xA720, 0xA7FF, // Latin Extended-D 149 0xA800, 0xA82F, // Syloti Nagri 150 0xA830, 0xA83F, // Common Indic Number Forms 151 0xA840, 0xA87F, // Phags-pa 152 0xA880, 0xA8DF, // Saurashtra 153 0xA8E0, 0xA8FF, // Devanagari Extended 154 0xA900, 0xA92F, // Kayah Li 155 0xA930, 0xA95F, // Rejang 156 0xA960, 0xA97F, // Hangul Jamo Extended-A 157 0xA980, 0xA9DF, // Javanese 158 0xAA00, 0xAA5F, // Cham 159 0xAA60, 0xAA7F, // Myanmar Extended-A 160 0xAA80, 0xAADF, // Tai Viet 161 0xAB00, 0xAB2F, // Ethiopic Extended-A 162 0xABC0, 0xABFF, // Meetei Mayek 163 0xAC00, 0xD7AF, // Hangul Syllables 164 0xD7B0, 0xD7FF, // Hangul Jamo Extended-B 165 //0xD800, 0xDB7F, // High Surrogates 166 //0xDB80, 0xDBFF, // High Private Use Surrogates 167 //0xDC00, 0xDFFF, // Low Surrogates 168 0xE000, 0xF8FF, // Private Use Area 169 0xF900, 0xFAFF, // CJK Compatibility Ideographs 170 0xFB00, 0xFB4F, // Alphabetic Presentation Forms 171 0xFB50, 0xFDFF, // Arabic Presentation Forms-A 172 0xFE00, 0xFE0F, // Variation Selectors 173 0xFE10, 0xFE1F, // Vertical Forms 174 0xFE20, 0xFE2F, // Combining Half Marks 175 0xFE30, 0xFE4F, // CJK Compatibility Forms 176 0xFE50, 0xFE6F, // Small Form Variants 177 0xFE70, 0xFEFF, // Arabic Presentation Forms-B 178 0xFF00, 0xFFEF, // Halfwidth and Fullwidth Forms 179 0xFFF0, 0xFFFF, // Specials 180 0x10000, 0x1007F, // Linear B Syllabary 181 0x10080, 0x100FF, // Linear B Ideograms 182 0x10100, 0x1013F, // Aegean Numbers 183 0x10140, 0x1018F, // Ancient Greek Numbers 184 0x10190, 0x101CF, // Ancient Symbols 185 0x101D0, 0x101FF, // Phaistos Disc 186 0x10280, 0x1029F, // Lycian 187 0x102A0, 0x102DF, // Carian 188 0x10300, 0x1032F, // Old Italic 189 0x10330, 0x1034F, // Gothic 190 0x10380, 0x1039F, // Ugaritic 191 0x103A0, 0x103DF, // Old Persian 192 0x10400, 0x1044F, // Deseret 193 0x10450, 0x1047F, // Shavian 194 0x10480, 0x104AF, // Osmanya 195 0x10800, 0x1083F, // Cypriot Syllabary 196 0x10840, 0x1085F, // Imperial Aramaic 197 0x10900, 0x1091F, // Phoenician 198 0x10920, 0x1093F, // Lydian 199 0x10A00, 0x10A5F, // Kharoshthi 200 0x10A60, 0x10A7F, // Old South Arabian 201 0x10B00, 0x10B3F, // Avestan 202 0x10B40, 0x10B5F, // Inscriptional Parthian 203 0x10B60, 0x10B7F, // Inscriptional Pahlavi 204 0x10C00, 0x10C4F, // Old Turkic 205 0x10E60, 0x10E7F, // Rumi Numeral Symbols 206 0x11000, 0x1107F, // Brahmi 207 0x11080, 0x110CF, // Kaithi 208 0x12000, 0x123FF, // Cuneiform 209 0x12400, 0x1247F, // Cuneiform Numbers and Punctuation 210 0x13000, 0x1342F, // Egyptian Hieroglyphs 211 0x16800, 0x16A3F, // Bamum Supplement 212 0x1B000, 0x1B0FF, // Kana Supplement 213 0x1D000, 0x1D0FF, // Byzantine Musical Symbols 214 0x1D100, 0x1D1FF, // Musical Symbols 215 0x1D200, 0x1D24F, // Ancient Greek Musical Notation 216 0x1D300, 0x1D35F, // Tai Xuan Jing Symbols 217 0x1D360, 0x1D37F, // Counting Rod Numerals 218 0x1D400, 0x1D7FF, // Mathematical Alphanumeric Symbols 219 0x1F000, 0x1F02F, // Mahjong Tiles 220 0x1F030, 0x1F09F, // Domino Tiles 221 0x1F0A0, 0x1F0FF, // Playing Cards 222 0x1F100, 0x1F1FF, // Enclosed Alphanumeric Supplement 223 0x1F200, 0x1F2FF, // Enclosed Ideographic Supplement 224 0x1F300, 0x1F5FF, // Miscellaneous Symbols And Pictographs 225 0x1F600, 0x1F64F, // Emoticons 226 0x1F680, 0x1F6FF, // Transport And Map Symbols 227 0x1F700, 0x1F77F, // Alchemical Symbols 228 0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B 229 0x2A700, 0x2B73F, // CJK Unified Ideographs Extension C 230 0x2B740, 0x2B81F, // CJK Unified Ideographs Extension D 231 0x2F800, 0x2FA1F, // CJK Compatibility Ideographs Supplement 232 0xE0000, 0xE007F, // Tags 233 0xE0100, 0xE01EF, // Variation Selectors Supplement 234 0xF0000, 0xFFFFF, // Supplementary Private Use Area-A 235 0x100000, 0x10FFFF, // Supplementary Private Use Area-B 236 0xFFFFFFFF 237 }; 238 239 // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern (at) hoehrmann.de> 240 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 241 242 #define UTF8_ACCEPT 0u 243 #define UTF8_REJECT 12u 244 245 static const unsigned char utf8d[] = { 246 // The first part of the table maps bytes to character classes that 247 // to reduce the size of the transition table and create bitmasks. 248 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 249 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 250 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 251 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 252 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 253 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 254 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 255 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 256 257 // The second part is a transition table that maps a combination 258 // of a state of the automaton and a character class to a state. 259 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 260 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 261 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 262 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 263 12,36,12,12,12,12,12,12,12,12,12,12, 264 }; 265 266 static unsigned inline decode(unsigned* state, unsigned* codep, unsigned byte) { 267 unsigned type = utf8d[byte]; 268 269 *codep = (*state != UTF8_ACCEPT) ? 270 (byte & 0x3fu) | (*codep << 6) : 271 (0xff >> type) & (byte); 272 273 *state = utf8d[256 + *state + type]; 274 return *state; 275 } 276 277 //static bool IsUTF8(unsigned char* s) { 278 // unsigned codepoint, state = 0; 279 // 280 // while (*s) 281 // decode(&state, &codepoint, *s++); 282 // 283 // return state == UTF8_ACCEPT; 284 //} 285 286 TEST(EncodingsTest, UTF8) { 287 StringBuffer os, os2; 288 for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { 289 for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { 290 os.Clear(); 291 UTF8<>::Encode(os, codepoint); 292 const char* encodedStr = os.GetString(); 293 294 // Decode with Hoehrmann 295 { 296 unsigned decodedCodepoint = 0; 297 unsigned state = 0; 298 299 unsigned decodedCount = 0; 300 for (const char* s = encodedStr; *s; ++s) 301 if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) { 302 EXPECT_EQ(codepoint, decodedCodepoint); 303 decodedCount++; 304 } 305 306 if (*encodedStr) // This decoder cannot handle U+0000 307 EXPECT_EQ(1u, decodedCount); // Should only contain one code point 308 309 EXPECT_EQ(UTF8_ACCEPT, state); 310 if (UTF8_ACCEPT != state) 311 std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; 312 } 313 314 // Decode 315 { 316 StringStream is(encodedStr); 317 unsigned decodedCodepoint; 318 bool result = UTF8<>::Decode(is, &decodedCodepoint); 319 EXPECT_TRUE(result); 320 EXPECT_EQ(codepoint, decodedCodepoint); 321 if (!result || codepoint != decodedCodepoint) 322 std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; 323 } 324 325 // Validate 326 { 327 StringStream is(encodedStr); 328 os2.Clear(); 329 bool result = UTF8<>::Validate(is, os2); 330 EXPECT_TRUE(result); 331 EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); 332 } 333 } 334 } 335 } 336 337 TEST(EncodingsTest, UTF16) { 338 GenericStringBuffer<UTF16<> > os, os2; 339 GenericStringBuffer<UTF8<> > utf8os; 340 for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { 341 for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { 342 os.Clear(); 343 UTF16<>::Encode(os, codepoint); 344 const UTF16<>::Ch* encodedStr = os.GetString(); 345 346 // Encode with Hoehrmann's code 347 if (codepoint != 0) // cannot handle U+0000 348 { 349 // encode with UTF8<> first 350 utf8os.Clear(); 351 UTF8<>::Encode(utf8os, codepoint); 352 353 // transcode from UTF8 to UTF16 with Hoehrmann's code 354 unsigned decodedCodepoint = 0; 355 unsigned state = 0; 356 UTF16<>::Ch buffer[3], *p = &buffer[0]; 357 for (const char* s = utf8os.GetString(); *s; ++s) { 358 if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) 359 break; 360 } 361 362 if (codepoint <= 0xFFFF) 363 *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint); 364 else { 365 // Encode code points above U+FFFF as surrogate pair. 366 *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10)); 367 *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF)); 368 } 369 *p++ = '\0'; 370 371 EXPECT_EQ(0, StrCmp(buffer, encodedStr)); 372 } 373 374 // Decode 375 { 376 GenericStringStream<UTF16<> > is(encodedStr); 377 unsigned decodedCodepoint; 378 bool result = UTF16<>::Decode(is, &decodedCodepoint); 379 EXPECT_TRUE(result); 380 EXPECT_EQ(codepoint, decodedCodepoint); 381 if (!result || codepoint != decodedCodepoint) 382 std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; 383 } 384 385 // Validate 386 { 387 GenericStringStream<UTF16<> > is(encodedStr); 388 os2.Clear(); 389 bool result = UTF16<>::Validate(is, os2); 390 EXPECT_TRUE(result); 391 EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); 392 } 393 } 394 } 395 } 396 397 TEST(EncodingsTest, UTF32) { 398 GenericStringBuffer<UTF32<> > os, os2; 399 for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) { 400 for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) { 401 os.Clear(); 402 UTF32<>::Encode(os, codepoint); 403 const UTF32<>::Ch* encodedStr = os.GetString(); 404 405 // Decode 406 { 407 GenericStringStream<UTF32<> > is(encodedStr); 408 unsigned decodedCodepoint; 409 bool result = UTF32<>::Decode(is, &decodedCodepoint); 410 EXPECT_TRUE(result); 411 EXPECT_EQ(codepoint, decodedCodepoint); 412 if (!result || codepoint != decodedCodepoint) 413 std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl; 414 } 415 416 // Validate 417 { 418 GenericStringStream<UTF32<> > is(encodedStr); 419 os2.Clear(); 420 bool result = UTF32<>::Validate(is, os2); 421 EXPECT_TRUE(result); 422 EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString())); 423 } 424 } 425 } 426 } 427