1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2008 Jrg Billeter <j (at) bitron.ch> 5 * Copyright (C) 2009 Dominik Rttsches <dominik.roettsches (at) access-company.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 #include "config.h" 30 #include "TextCodecGtk.h" 31 32 #include <gio/gio.h> 33 #include "GOwnPtr.h" 34 #include "Logging.h" 35 #include "PlatformString.h" 36 #include <wtf/Assertions.h> 37 #include <wtf/HashMap.h> 38 #include <wtf/text/CString.h> 39 40 using std::min; 41 42 namespace WebCore { 43 44 // TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380). 45 // That's why we need to avoid generating extra BOM's for the conversion result. 46 // This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib. 47 48 #if (G_BYTE_ORDER == G_BIG_ENDIAN) 49 static const gchar* internalEncodingName = "UTF-16BE"; 50 #else 51 static const gchar* internalEncodingName = "UTF-16LE"; 52 #endif 53 54 55 const size_t ConversionBufferSize = 16384; 56 57 58 static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*) 59 { 60 return new TextCodecGtk(encoding); 61 } 62 63 static bool isEncodingAvailable(const gchar* encodingName) 64 { 65 GIConv tester; 66 // test decoding 67 tester = g_iconv_open(internalEncodingName, encodingName); 68 if (tester == reinterpret_cast<GIConv>(-1)) { 69 return false; 70 } else { 71 g_iconv_close(tester); 72 // test encoding 73 tester = g_iconv_open(encodingName, internalEncodingName); 74 if (tester == reinterpret_cast<GIConv>(-1)) { 75 return false; 76 } else { 77 g_iconv_close(tester); 78 return true; 79 } 80 } 81 } 82 83 static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName) 84 { 85 if (isEncodingAvailable(canonicalName)) { 86 registrar(canonicalName, canonicalName); 87 return true; 88 } 89 90 return false; 91 } 92 93 static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName) 94 { 95 if (isEncodingAvailable(aliasName)) 96 registrar(aliasName, canonicalName); 97 } 98 99 static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName) 100 { 101 if (isEncodingAvailable(codecName)) 102 registrar(codecName, newTextCodecGtk, 0); 103 } 104 105 void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar) 106 { 107 // Unicode 108 registerEncodingNameIfAvailable(registrar, "UTF-8"); 109 registerEncodingNameIfAvailable(registrar, "UTF-32"); 110 registerEncodingNameIfAvailable(registrar, "UTF-32BE"); 111 registerEncodingNameIfAvailable(registrar, "UTF-32LE"); 112 113 // Western 114 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) { 115 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819"); 116 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819"); 117 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100"); 118 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1"); 119 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1"); 120 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987"); 121 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1"); 122 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1"); 123 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1"); 124 } 125 } 126 127 void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar) 128 { 129 // Unicode 130 registerCodecIfAvailable(registrar, "UTF-8"); 131 registerCodecIfAvailable(registrar, "UTF-32"); 132 registerCodecIfAvailable(registrar, "UTF-32BE"); 133 registerCodecIfAvailable(registrar, "UTF-32LE"); 134 135 // Western 136 registerCodecIfAvailable(registrar, "ISO-8859-1"); 137 } 138 139 void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar) 140 { 141 // Western 142 if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) { 143 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC"); 144 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH"); 145 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH"); 146 } 147 148 // Japanese 149 if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) { 150 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI"); 151 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS"); 152 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS"); 153 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS"); 154 } 155 if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) { 156 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP"); 157 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP"); 158 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"); 159 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE"); 160 } 161 registerEncodingNameIfAvailable(registrar, "ISO-2022-JP"); 162 163 // Traditional Chinese 164 if (registerEncodingNameIfAvailable(registrar, "BIG5")) { 165 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5"); 166 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE"); 167 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE"); 168 registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5"); 169 registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5"); 170 } 171 if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) { 172 registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004"); 173 registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS"); 174 } 175 registerEncodingNameIfAvailable(registrar, "CP950"); 176 177 // Korean 178 if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR")) 179 registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR"); 180 if (registerEncodingNameIfAvailable(registrar, "CP949")) 181 registerEncodingAliasIfAvailable(registrar, "CP949", "UHC"); 182 if (registerEncodingNameIfAvailable(registrar, "EUC-KR")) 183 registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR"); 184 185 // Arabic 186 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) { 187 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC"); 188 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708"); 189 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114"); 190 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127"); 191 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6"); 192 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6"); 193 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987"); 194 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC"); 195 } 196 // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case 197 if (registerEncodingNameIfAvailable(registrar, "windows-1256")) { 198 registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256"); 199 registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB"); 200 } 201 202 // Hebrew 203 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) { 204 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW"); 205 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8"); 206 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138"); 207 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8"); 208 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8"); 209 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988"); 210 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW"); 211 } 212 // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html 213 if (registerEncodingNameIfAvailable(registrar, "windows-1255")) { 214 registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255"); 215 registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR"); 216 } 217 218 // Greek 219 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) { 220 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118"); 221 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928"); 222 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK"); 223 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8"); 224 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126"); 225 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7"); 226 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7"); 227 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987"); 228 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003"); 229 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI"); 230 } 231 if (registerEncodingNameIfAvailable(registrar, "CP869")) { 232 registerEncodingAliasIfAvailable(registrar, "CP869", "869"); 233 registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR"); 234 registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869"); 235 registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869"); 236 } 237 registerEncodingNameIfAvailable(registrar, "WINDOWS-1253"); 238 239 // Cyrillic 240 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) { 241 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC"); 242 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144"); 243 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5"); 244 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5"); 245 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988"); 246 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC"); 247 } 248 if (registerEncodingNameIfAvailable(registrar, "KOI8-R")) 249 registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R"); 250 if (registerEncodingNameIfAvailable(registrar, "CP866")) { 251 registerEncodingAliasIfAvailable(registrar, "CP866", "866"); 252 registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866"); 253 registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866"); 254 } 255 registerEncodingNameIfAvailable(registrar, "KOI8-U"); 256 // CP1251 added to pass /fast/encoding/charset-cp1251.html 257 if (registerEncodingNameIfAvailable(registrar, "windows-1251")) 258 registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251"); 259 if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) { 260 registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC"); 261 registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic"); 262 } 263 264 // Thai 265 if (registerEncodingNameIfAvailable(registrar, "CP874")) 266 registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874"); 267 registerEncodingNameIfAvailable(registrar, "TIS-620"); 268 269 // Simplified Chinese 270 registerEncodingNameIfAvailable(registrar, "GBK"); 271 if (registerEncodingNameIfAvailable(registrar, "HZ")) 272 registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312"); 273 registerEncodingNameIfAvailable(registrar, "GB18030"); 274 if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) { 275 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN"); 276 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312"); 277 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB"); 278 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312"); 279 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN"); 280 } 281 if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) { 282 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE"); 283 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280"); 284 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0"); 285 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58"); 286 } 287 288 // Central European 289 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) { 290 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101"); 291 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2"); 292 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2"); 293 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987"); 294 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2"); 295 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2"); 296 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2"); 297 } 298 if (registerEncodingNameIfAvailable(registrar, "CP1250")) { 299 registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE"); 300 registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250"); 301 } 302 registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE"); 303 304 // Vietnamese 305 if (registerEncodingNameIfAvailable(registrar, "CP1258")) 306 registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258"); 307 308 // Turkish 309 if (registerEncodingNameIfAvailable(registrar, "CP1254")) { 310 registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK"); 311 registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254"); 312 } 313 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) { 314 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148"); 315 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9"); 316 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9"); 317 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989"); 318 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5"); 319 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5"); 320 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5"); 321 } 322 323 // Baltic 324 if (registerEncodingNameIfAvailable(registrar, "CP1257")) { 325 registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM"); 326 registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257"); 327 } 328 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) { 329 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110"); 330 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4"); 331 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4"); 332 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988"); 333 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4"); 334 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4"); 335 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4"); 336 } 337 } 338 339 void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar) 340 { 341 // Western 342 registerCodecIfAvailable(registrar, "MACROMAN"); 343 344 // Japanese 345 registerCodecIfAvailable(registrar, "Shift_JIS"); 346 registerCodecIfAvailable(registrar, "EUC-JP"); 347 registerCodecIfAvailable(registrar, "ISO-2022-JP"); 348 349 // Traditional Chinese 350 registerCodecIfAvailable(registrar, "BIG5"); 351 registerCodecIfAvailable(registrar, "BIG5-HKSCS"); 352 registerCodecIfAvailable(registrar, "CP950"); 353 354 // Korean 355 registerCodecIfAvailable(registrar, "ISO-2022-KR"); 356 registerCodecIfAvailable(registrar, "CP949"); 357 registerCodecIfAvailable(registrar, "EUC-KR"); 358 359 // Arabic 360 registerCodecIfAvailable(registrar, "ISO-8859-6"); 361 // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case 362 registerCodecIfAvailable(registrar, "windows-1256"); 363 364 // Hebrew 365 registerCodecIfAvailable(registrar, "ISO-8859-8"); 366 // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html 367 registerCodecIfAvailable(registrar, "windows-1255"); 368 369 // Greek 370 registerCodecIfAvailable(registrar, "ISO-8859-7"); 371 registerCodecIfAvailable(registrar, "CP869"); 372 registerCodecIfAvailable(registrar, "WINDOWS-1253"); 373 374 // Cyrillic 375 registerCodecIfAvailable(registrar, "ISO-8859-5"); 376 registerCodecIfAvailable(registrar, "KOI8-R"); 377 registerCodecIfAvailable(registrar, "CP866"); 378 registerCodecIfAvailable(registrar, "KOI8-U"); 379 // CP1251 added to pass /fast/encoding/charset-cp1251.html 380 registerCodecIfAvailable(registrar, "windows-1251"); 381 registerCodecIfAvailable(registrar, "mac-cyrillic"); 382 383 // Thai 384 registerCodecIfAvailable(registrar, "CP874"); 385 registerCodecIfAvailable(registrar, "TIS-620"); 386 387 // Simplified Chinese 388 registerCodecIfAvailable(registrar, "GBK"); 389 registerCodecIfAvailable(registrar, "HZ"); 390 registerCodecIfAvailable(registrar, "GB18030"); 391 registerCodecIfAvailable(registrar, "EUC-CN"); 392 registerCodecIfAvailable(registrar, "GB_2312-80"); 393 394 // Central European 395 registerCodecIfAvailable(registrar, "ISO-8859-2"); 396 registerCodecIfAvailable(registrar, "CP1250"); 397 registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE"); 398 399 // Vietnamese 400 registerCodecIfAvailable(registrar, "CP1258"); 401 402 // Turkish 403 registerCodecIfAvailable(registrar, "CP1254"); 404 registerCodecIfAvailable(registrar, "ISO-8859-9"); 405 406 // Baltic 407 registerCodecIfAvailable(registrar, "CP1257"); 408 registerCodecIfAvailable(registrar, "ISO-8859-4"); 409 } 410 411 TextCodecGtk::TextCodecGtk(const TextEncoding& encoding) 412 : m_encoding(encoding) 413 , m_numBufferedBytes(0) 414 { 415 } 416 417 TextCodecGtk::~TextCodecGtk() 418 { 419 } 420 421 void TextCodecGtk::createIConvDecoder() const 422 { 423 ASSERT(!m_iconvDecoder); 424 425 m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0)); 426 } 427 428 void TextCodecGtk::createIConvEncoder() const 429 { 430 ASSERT(!m_iconvEncoder); 431 432 m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0)); 433 } 434 435 String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 436 { 437 // Get a converter for the passed-in encoding. 438 if (!m_iconvDecoder) 439 createIConvDecoder(); 440 if (!m_iconvDecoder) { 441 LOG_ERROR("Error creating IConv encoder even though encoding was in table."); 442 return String(); 443 } 444 445 Vector<UChar> result; 446 447 gsize bytesRead = 0; 448 gsize bytesWritten = 0; 449 const gchar* input = bytes; 450 gsize inputLength = length; 451 gchar buffer[ConversionBufferSize]; 452 int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS; 453 if (flush) 454 flags |= G_CONVERTER_FLUSH; 455 456 bool bufferWasFull = false; 457 char* prefixedBytes = 0; 458 459 if (m_numBufferedBytes) { 460 inputLength = length + m_numBufferedBytes; 461 prefixedBytes = static_cast<char*>(fastMalloc(inputLength)); 462 memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes); 463 memcpy(prefixedBytes + m_numBufferedBytes, bytes, length); 464 465 input = prefixedBytes; 466 467 // all buffered bytes are consumed now 468 m_numBufferedBytes = 0; 469 } 470 471 do { 472 GOwnPtr<GError> error; 473 GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()), 474 input, inputLength, 475 buffer, sizeof(buffer), 476 static_cast<GConverterFlags>(flags), 477 &bytesRead, &bytesWritten, 478 &error.outPtr()); 479 input += bytesRead; 480 inputLength -= bytesRead; 481 482 if (res == G_CONVERTER_ERROR) { 483 if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) { 484 // There is not enough input to fully determine what the conversion should produce, 485 // save it to a buffer to prepend it to the next input. 486 memcpy(m_bufferedBytes, input, inputLength); 487 m_numBufferedBytes = inputLength; 488 inputLength = 0; 489 } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE)) 490 bufferWasFull = true; 491 else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { 492 if (stopOnError) 493 sawError = true; 494 if (inputLength) { 495 // Ignore invalid character. 496 input += 1; 497 inputLength -= 1; 498 } 499 } else { 500 sawError = true; 501 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); 502 m_numBufferedBytes = 0; // Reset state for subsequent calls to decode. 503 fastFree(prefixedBytes); 504 return String(); 505 } 506 } 507 508 result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar)); 509 } while ((inputLength || bufferWasFull) && !sawError); 510 511 fastFree(prefixedBytes); 512 513 return String::adopt(result); 514 } 515 516 CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling) 517 { 518 if (!length) 519 return ""; 520 521 if (!m_iconvEncoder) 522 createIConvEncoder(); 523 if (!m_iconvEncoder) { 524 LOG_ERROR("Error creating IConv encoder even though encoding was in table."); 525 return CString(); 526 } 527 528 gsize bytesRead = 0; 529 gsize bytesWritten = 0; 530 const gchar* input = reinterpret_cast<const char*>(characters); 531 gsize inputLength = length * sizeof(UChar); 532 gchar buffer[ConversionBufferSize]; 533 Vector<char> result; 534 GOwnPtr<GError> error; 535 536 size_t size = 0; 537 do { 538 g_converter_convert(G_CONVERTER(m_iconvEncoder.get()), 539 input, inputLength, 540 buffer, sizeof(buffer), 541 G_CONVERTER_INPUT_AT_END, 542 &bytesRead, &bytesWritten, 543 &error.outPtr()); 544 input += bytesRead; 545 inputLength -= bytesRead; 546 if (bytesWritten > 0) { 547 result.grow(size + bytesWritten); 548 memcpy(result.data() + size, buffer, bytesWritten); 549 size += bytesWritten; 550 } 551 552 if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { 553 UChar codePoint = reinterpret_cast<const UChar*>(input)[0]; 554 UnencodableReplacementArray replacement; 555 int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement); 556 557 // Consume the invalid character. 558 input += sizeof(UChar); 559 inputLength -= sizeof(UChar); 560 561 // Append replacement string to result buffer. 562 result.grow(size + replacementLength); 563 memcpy(result.data() + size, replacement, replacementLength); 564 size += replacementLength; 565 566 error.clear(); 567 } 568 } while (inputLength && !error.get()); 569 570 if (error) { 571 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message); 572 return CString(); 573 } 574 575 return CString(result.data(), size); 576 } 577 578 } // namespace WebCore 579