<html>
<head>
<link rel="stylesheet" href="../js/resources/js-test-style.css">
<script src="../js/resources/js-test-pre.js"></script>
<script src="resources/char-decoding-utils.js"></script>
</head>
<body>
<p id="description"></p>
<div id="console"></div>
<script>

description("This tests decoding characters in various character sets.");

// Baseline check: the UTF-8 byte sequence E2 88 9A is expected to decode to U+221A.
testDecode('UTF-8', '%E2%88%9A', 'U+221A');

// <http://bugs.webkit.org/show_bug.cgi?id=17014> EUC-CN code A3A0 is mapped to U+E5E5 instead of U+3000
// Every label in the list is checked with the same bytes and must yield U+3000.
['gb2312', 'gb_2312-80', 'chinese', 'gbk', 'gb18030', 'EUC-CN'].forEach(function (label) {
    testDecode(label, '%A3%A0', 'U+3000');
});

// Shift_JIS aliases: both spellings are expected to decode 82 D0 to U+3072.
['Shift_JIS', 'shift-jis'].forEach(function (label) {
    testDecode(label, '%82%d0', 'U+3072');
});

// All Korean labels of the EUC-KR family should be treated as windows-949.
// NOTE(review): presumably encoded[i] is expected to decode to unicode[i] under
// every label in `encodings` -- confirm against batchTestDecode in
// resources/char-decoding-utils.js.
var korean = {
    encodings: [
        'korean', 'EUC-KR', 'windows-949', 'x-windows-949',
        'x-uhc', 'iso-ir-149', 'KS_C_5601-1987', 'KS_C_5601-1989'
    ],
    encoded: [
        '%A2%E6', '%A1%A4', '%A1%A9', '%A1%AA',
        '%A1%AD', '%A2%A6', '%A2%C1', '%1A',
        '%1C', '%8F%A1', '%B4%D3', '%A2%41'
    ],
    unicode: [
        'U+20AC', 'U+00B7', 'U+00AD', 'U+2015',
        'U+223C', 'U+FF5E', 'U+2299', 'U+001A',
        'U+001C', 'U+B8EA', 'U+B2D2', 'U+C910'
    ]
};

batchTestDecode(korean);

// ISO-8859-9 (Turkish) labels should be upgraded to windows-1254; the first
// entry checks that byte 0x80 comes out as the Euro sign (U+20AC).
var turkish = {
    encodings: [
        'iso-8859-9', 'latin5', 'windows-1254'
    ],
    encoded: [
        '%80', '%9F', '%FD'
    ],
    unicode: [
        'U+20AC', 'U+0178', 'U+0131'
    ]
};

batchTestDecode(turkish);

// FIXME: Have to add tests for Euro and a few new characters added to ISO-8859-x
// that are NOT subsets of the corresponding Windows codepages. For instance,
// ISO-8859-7:2003 has Euro at 0xA4 and a couple of other new characters.
// ICU 3.8.x or later has them. Perhaps, we need to have a separate test that
// can be enabled only with modern ICU.

// Baltic encodings fine points: ISO-8859-13 and windows-1257 are checked
// separately, including byte values (B4, FF) where the two are expected to differ.
// Each row is [label, percent-encoded bytes, expected code point].
[
    ['ISO-8859-13', '%A1', 'U+201D'],
    ['ISO-8859-13', '%A5', 'U+201E'],
    ['ISO-8859-13', '%B4', 'U+201C'],
    ['ISO-8859-13', '%FF', 'U+2019'],
    ['windows-1257', '%80', 'U+20AC'],
    ['windows-1257', '%B4', 'U+00B4'],
    ['windows-1257', '%FF', 'U+02D9']
].forEach(function (row) {
    testDecode(row[0], row[1], row[2]);
});

// Greek encodings fine points: iso-8859-7 and windows-1253 are expected to
// disagree on A1, B5 and B6.
[
    ['iso-8859-7', '%A1', 'U+2018'],
    ['iso-8859-7', '%B5', 'U+0385'],
    ['iso-8859-7', '%B6', 'U+0386'],
    ['windows-1253', '%80', 'U+20AC'],
    ['windows-1253', '%A1', 'U+0385'],
    ['windows-1253', '%B5', 'U+00B5'],
    ['windows-1253', '%B6', 'U+00B6']
].forEach(function (row) {
    testDecode(row[0], row[1], row[2]);
});

// KOI-8 variants: the same bytes (A4, AD) are expected to map to different
// code points in KOI8-R and KOI8-U.
[
    ['KOI8-R', '%A4', 'U+2553'],
    ['KOI8-R', '%AD', 'U+255C'],
    ['KOI8-U', '%A4', 'U+0454'],
    ['KOI8-U', '%AD', 'U+0491']
].forEach(function (row) {
    testDecode(row[0], row[1], row[2]);
});

// TIS-620 and ISO-8859-11 (Thai) should be upgraded to windows-874.
// "0xDB => U+F8C1" is a weird PUA mapping that doesn't seem to be of
// any use, even on Windows.
// NOTE(review): presumably encoded[i] is expected to decode to unicode[i] under
// every label in `encodings` -- confirm against batchTestDecode.
var thai = {
    encodings: [
        'TIS-620', 'ISO-8859-11', 'windows-874', 'dos-874'
    ],
    encoded: [
        '%80', '%96', '%A0', '%A1', '%DB'
    ],
    unicode: [
        'U+20AC', 'U+2013', 'U+00A0', 'U+0E01', 'U+F8C1'
    ]
};

batchTestDecode(thai);

// UTF-7 is expressly forbidden, so decoding it should not work.
// '+AD4' is the UTF-7 shifted form of '>' (U+003E); the expectation here is
// that it is NOT interpreted and comes through as the four literal
// characters '+', 'A', 'D', '4'.
['UTF-7', 'utf-7'].forEach(function (label) {
    testDecode(label, '+AD4', 'U+002B/U+0041/U+0044/U+0034');
});

// UTF-16LE and variants.
// Explicitly little-endian labels: bytes 69 D8 D6 DE are expected to decode
// to the surrogate pair U+D869 U+DED6.
['UTF-16LE', 'unicodeFEFF'].forEach(function (label) {
    testDecode(label, '%69%D8%D6%DE', 'U+D869/U+DED6');
});
// According to HTML5 and for IE compatibility, UTF-16 is treated as little endian. The following tests fail as of Firefox 3.6.13.
['UTF-16', 'ISO-10646-UCS-2', 'UCS-2', 'Unicode', 'csUnicode'].forEach(function (label) {
    testDecode(label, '%69%D8%D6%DE', 'U+D869/U+DED6');
});

// UTF-16BE and variants: same expected code units, byte order swapped.
['UTF-16BE', 'unicodeFFFE'].forEach(function (label) {
    testDecode(label, '%D8%69%DE%D6', 'U+D869/U+DED6');
});

// Last statement of the script, reached only if nothing above threw.
// NOTE(review): presumably read by the js-test harness to confirm the script
// ran to completion -- confirm against the harness scripts.
successfullyParsed = true;
