Home | History | Annotate | Download | only in gtk
      1 /*
      2  * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
      3  * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com>
      4  * Copyright (C) 2008 Jürg Billeter <j (at) bitron.ch>
      5  * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches (at) access-company.com>
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     20  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     24  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include "config.h"
     30 #include "TextCodecGtk.h"
     31 
     32 #include <gio/gio.h>
     33 #include "GOwnPtr.h"
     34 #include "Logging.h"
     35 #include "PlatformString.h"
     36 #include <wtf/Assertions.h>
     37 #include <wtf/HashMap.h>
     38 #include <wtf/text/CString.h>
     39 
     40 using std::min;
     41 
     42 namespace WebCore {
     43 
     44 // TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380).
     45 // That's why we need to avoid generating extra BOM's for the conversion result.
     46 // This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib.
     47 
     48 #if (G_BYTE_ORDER == G_BIG_ENDIAN)
     49 static const gchar* internalEncodingName = "UTF-16BE";
     50 #else
     51 static const gchar* internalEncodingName = "UTF-16LE";
     52 #endif
     53 
     54 
     55 const size_t ConversionBufferSize = 16384;
     56 
     57 
     58 static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
     59 {
     60     return new TextCodecGtk(encoding);
     61 }
     62 
     63 static bool isEncodingAvailable(const gchar* encodingName)
     64 {
     65     GIConv tester;
     66     // test decoding
     67     tester = g_iconv_open(internalEncodingName, encodingName);
     68     if (tester == reinterpret_cast<GIConv>(-1)) {
     69         return false;
     70     } else {
     71         g_iconv_close(tester);
     72         // test encoding
     73         tester = g_iconv_open(encodingName, internalEncodingName);
     74         if (tester == reinterpret_cast<GIConv>(-1)) {
     75             return false;
     76         } else {
     77             g_iconv_close(tester);
     78             return true;
     79         }
     80     }
     81 }
     82 
     83 static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName)
     84 {
     85     if (isEncodingAvailable(canonicalName)) {
     86         registrar(canonicalName, canonicalName);
     87         return true;
     88     }
     89 
     90     return false;
     91 }
     92 
     93 static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName)
     94 {
     95     if (isEncodingAvailable(aliasName))
     96         registrar(aliasName, canonicalName);
     97 }
     98 
     99 static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName)
    100 {
    101     if (isEncodingAvailable(codecName))
    102         registrar(codecName, newTextCodecGtk, 0);
    103 }
    104 
    105 void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
    106 {
    107     // Unicode
    108     registerEncodingNameIfAvailable(registrar, "UTF-8");
    109     registerEncodingNameIfAvailable(registrar, "UTF-32");
    110     registerEncodingNameIfAvailable(registrar, "UTF-32BE");
    111     registerEncodingNameIfAvailable(registrar, "UTF-32LE");
    112 
    113     // Western
    114     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) {
    115         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819");
    116         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819");
    117         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100");
    118         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1");
    119         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1");
    120         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987");
    121         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1");
    122         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1");
    123         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1");
    124     }
    125 }
    126 
    127 void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
    128 {
    129     // Unicode
    130     registerCodecIfAvailable(registrar, "UTF-8");
    131     registerCodecIfAvailable(registrar, "UTF-32");
    132     registerCodecIfAvailable(registrar, "UTF-32BE");
    133     registerCodecIfAvailable(registrar, "UTF-32LE");
    134 
    135     // Western
    136     registerCodecIfAvailable(registrar, "ISO-8859-1");
    137 }
    138 
    139 void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
    140 {
    141     // Western
    142     if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) {
    143         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC");
    144         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH");
    145         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH");
    146     }
    147 
    148     // Japanese
    149     if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) {
    150         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI");
    151         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS");
    152         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS");
    153         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS");
    154     }
    155     if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) {
    156         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP");
    157         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP");
    158         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE");
    159         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE");
    160     }
    161     registerEncodingNameIfAvailable(registrar, "ISO-2022-JP");
    162 
    163     // Traditional Chinese
    164     if (registerEncodingNameIfAvailable(registrar, "BIG5")) {
    165         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5");
    166         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE");
    167         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE");
    168         registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5");
    169         registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5");
    170     }
    171     if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) {
    172         registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004");
    173         registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS");
    174     }
    175     registerEncodingNameIfAvailable(registrar, "CP950");
    176 
    177     // Korean
    178     if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR"))
    179         registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR");
    180     if (registerEncodingNameIfAvailable(registrar, "CP949"))
    181         registerEncodingAliasIfAvailable(registrar, "CP949", "UHC");
    182     if (registerEncodingNameIfAvailable(registrar, "EUC-KR"))
    183         registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR");
    184 
    185     // Arabic
    186     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) {
    187         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC");
    188         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708");
    189         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114");
    190         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127");
    191         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6");
    192         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6");
    193         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987");
    194         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC");
    195     }
    196     // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
    197     if (registerEncodingNameIfAvailable(registrar, "windows-1256")) {
    198         registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256");
    199         registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB");
    200     }
    201 
    202     // Hebrew
    203     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) {
    204         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW");
    205         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8");
    206         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138");
    207         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8");
    208         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8");
    209         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988");
    210         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW");
    211     }
    212     // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
    213     if (registerEncodingNameIfAvailable(registrar, "windows-1255")) {
    214         registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255");
    215         registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR");
    216     }
    217 
    218     // Greek
    219     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) {
    220         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118");
    221         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928");
    222         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK");
    223         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8");
    224         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126");
    225         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7");
    226         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7");
    227         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987");
    228         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003");
    229         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI");
    230     }
    231     if (registerEncodingNameIfAvailable(registrar, "CP869")) {
    232         registerEncodingAliasIfAvailable(registrar, "CP869", "869");
    233         registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR");
    234         registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869");
    235         registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869");
    236     }
    237     registerEncodingNameIfAvailable(registrar, "WINDOWS-1253");
    238 
    239     // Cyrillic
    240     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) {
    241         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC");
    242         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144");
    243         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5");
    244         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5");
    245         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988");
    246         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC");
    247     }
    248     if (registerEncodingNameIfAvailable(registrar, "KOI8-R"))
    249         registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R");
    250     if (registerEncodingNameIfAvailable(registrar, "CP866")) {
    251         registerEncodingAliasIfAvailable(registrar, "CP866", "866");
    252         registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866");
    253         registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866");
    254     }
    255     registerEncodingNameIfAvailable(registrar, "KOI8-U");
    256     // CP1251 added to pass /fast/encoding/charset-cp1251.html
    257     if (registerEncodingNameIfAvailable(registrar, "windows-1251"))
    258         registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251");
    259     if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) {
    260         registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC");
    261         registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic");
    262     }
    263 
    264     // Thai
    265     if (registerEncodingNameIfAvailable(registrar, "CP874"))
    266         registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874");
    267     registerEncodingNameIfAvailable(registrar, "TIS-620");
    268 
    269     // Simplified Chinese
    270     registerEncodingNameIfAvailable(registrar, "GBK");
    271     if (registerEncodingNameIfAvailable(registrar, "HZ"))
    272         registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312");
    273     registerEncodingNameIfAvailable(registrar, "GB18030");
    274     if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) {
    275         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN");
    276         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312");
    277         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB");
    278         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312");
    279         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN");
    280     }
    281     if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) {
    282         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE");
    283         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280");
    284         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0");
    285         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58");
    286     }
    287 
    288     // Central European
    289     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) {
    290         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101");
    291         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2");
    292         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2");
    293         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987");
    294         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2");
    295         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2");
    296         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2");
    297     }
    298     if (registerEncodingNameIfAvailable(registrar, "CP1250")) {
    299         registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE");
    300         registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250");
    301     }
    302     registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE");
    303 
    304     // Vietnamese
    305     if (registerEncodingNameIfAvailable(registrar, "CP1258"))
    306         registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258");
    307 
    308     // Turkish
    309     if (registerEncodingNameIfAvailable(registrar, "CP1254")) {
    310         registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK");
    311         registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254");
    312     }
    313     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) {
    314         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148");
    315         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9");
    316         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9");
    317         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989");
    318         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5");
    319         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5");
    320         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5");
    321     }
    322 
    323     // Baltic
    324     if (registerEncodingNameIfAvailable(registrar, "CP1257")) {
    325         registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM");
    326         registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257");
    327     }
    328     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) {
    329         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110");
    330         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4");
    331         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4");
    332         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988");
    333         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4");
    334         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4");
    335         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4");
    336     }
    337 }
    338 
    339 void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
    340 {
    341     // Western
    342     registerCodecIfAvailable(registrar, "MACROMAN");
    343 
    344     // Japanese
    345     registerCodecIfAvailable(registrar, "Shift_JIS");
    346     registerCodecIfAvailable(registrar, "EUC-JP");
    347     registerCodecIfAvailable(registrar, "ISO-2022-JP");
    348 
    349     // Traditional Chinese
    350     registerCodecIfAvailable(registrar, "BIG5");
    351     registerCodecIfAvailable(registrar, "BIG5-HKSCS");
    352     registerCodecIfAvailable(registrar, "CP950");
    353 
    354     // Korean
    355     registerCodecIfAvailable(registrar, "ISO-2022-KR");
    356     registerCodecIfAvailable(registrar, "CP949");
    357     registerCodecIfAvailable(registrar, "EUC-KR");
    358 
    359     // Arabic
    360     registerCodecIfAvailable(registrar, "ISO-8859-6");
    361     // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
    362     registerCodecIfAvailable(registrar, "windows-1256");
    363 
    364     // Hebrew
    365     registerCodecIfAvailable(registrar, "ISO-8859-8");
    366     // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
    367     registerCodecIfAvailable(registrar, "windows-1255");
    368 
    369     // Greek
    370     registerCodecIfAvailable(registrar, "ISO-8859-7");
    371     registerCodecIfAvailable(registrar, "CP869");
    372     registerCodecIfAvailable(registrar, "WINDOWS-1253");
    373 
    374     // Cyrillic
    375     registerCodecIfAvailable(registrar, "ISO-8859-5");
    376     registerCodecIfAvailable(registrar, "KOI8-R");
    377     registerCodecIfAvailable(registrar, "CP866");
    378     registerCodecIfAvailable(registrar, "KOI8-U");
    379     // CP1251 added to pass /fast/encoding/charset-cp1251.html
    380     registerCodecIfAvailable(registrar, "windows-1251");
    381     registerCodecIfAvailable(registrar, "mac-cyrillic");
    382 
    383     // Thai
    384     registerCodecIfAvailable(registrar, "CP874");
    385     registerCodecIfAvailable(registrar, "TIS-620");
    386 
    387     // Simplified Chinese
    388     registerCodecIfAvailable(registrar, "GBK");
    389     registerCodecIfAvailable(registrar, "HZ");
    390     registerCodecIfAvailable(registrar, "GB18030");
    391     registerCodecIfAvailable(registrar, "EUC-CN");
    392     registerCodecIfAvailable(registrar, "GB_2312-80");
    393 
    394     // Central European
    395     registerCodecIfAvailable(registrar, "ISO-8859-2");
    396     registerCodecIfAvailable(registrar, "CP1250");
    397     registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE");
    398 
    399     // Vietnamese
    400     registerCodecIfAvailable(registrar, "CP1258");
    401 
    402     // Turkish
    403     registerCodecIfAvailable(registrar, "CP1254");
    404     registerCodecIfAvailable(registrar, "ISO-8859-9");
    405 
    406     // Baltic
    407     registerCodecIfAvailable(registrar, "CP1257");
    408     registerCodecIfAvailable(registrar, "ISO-8859-4");
    409 }
    410 
    411 TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
    412     : m_encoding(encoding)
    413     , m_numBufferedBytes(0)
    414 {
    415 }
    416 
    417 TextCodecGtk::~TextCodecGtk()
    418 {
    419 }
    420 
    421 void TextCodecGtk::createIConvDecoder() const
    422 {
    423     ASSERT(!m_iconvDecoder);
    424 
    425     m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0));
    426 }
    427 
    428 void TextCodecGtk::createIConvEncoder() const
    429 {
    430     ASSERT(!m_iconvEncoder);
    431 
    432     m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0));
    433 }
    434 
    435 String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
    436 {
    437     // Get a converter for the passed-in encoding.
    438     if (!m_iconvDecoder)
    439         createIConvDecoder();
    440     if (!m_iconvDecoder) {
    441         LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
    442         return String();
    443     }
    444 
    445     Vector<UChar> result;
    446 
    447     gsize bytesRead = 0;
    448     gsize bytesWritten = 0;
    449     const gchar* input = bytes;
    450     gsize inputLength = length;
    451     gchar buffer[ConversionBufferSize];
    452     int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS;
    453     if (flush)
    454         flags |= G_CONVERTER_FLUSH;
    455 
    456     bool bufferWasFull = false;
    457     char* prefixedBytes = 0;
    458 
    459     if (m_numBufferedBytes) {
    460         inputLength = length + m_numBufferedBytes;
    461         prefixedBytes = static_cast<char*>(fastMalloc(inputLength));
    462         memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
    463         memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
    464 
    465         input = prefixedBytes;
    466 
    467         // all buffered bytes are consumed now
    468         m_numBufferedBytes = 0;
    469     }
    470 
    471     do {
    472         GOwnPtr<GError> error;
    473         GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()),
    474                                                    input, inputLength,
    475                                                    buffer, sizeof(buffer),
    476                                                    static_cast<GConverterFlags>(flags),
    477                                                    &bytesRead, &bytesWritten,
    478                                                    &error.outPtr());
    479         input += bytesRead;
    480         inputLength -= bytesRead;
    481 
    482         if (res == G_CONVERTER_ERROR) {
    483             if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
    484                 // There is not enough input to fully determine what the conversion should produce,
    485                 // save it to a buffer to prepend it to the next input.
    486                 memcpy(m_bufferedBytes, input, inputLength);
    487                 m_numBufferedBytes = inputLength;
    488                 inputLength = 0;
    489             } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE))
    490                 bufferWasFull = true;
    491             else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
    492                 if (stopOnError)
    493                     sawError = true;
    494                 if (inputLength) {
    495                     // Ignore invalid character.
    496                     input += 1;
    497                     inputLength -= 1;
    498                 }
    499             } else {
    500                 sawError = true;
    501                 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
    502                 m_numBufferedBytes = 0; // Reset state for subsequent calls to decode.
    503                 fastFree(prefixedBytes);
    504                 return String();
    505             }
    506         }
    507 
    508         result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar));
    509     } while ((inputLength || bufferWasFull) && !sawError);
    510 
    511     fastFree(prefixedBytes);
    512 
    513     return String::adopt(result);
    514 }
    515 
    516 CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
    517 {
    518     if (!length)
    519         return "";
    520 
    521     if (!m_iconvEncoder)
    522         createIConvEncoder();
    523     if (!m_iconvEncoder) {
    524         LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
    525         return CString();
    526     }
    527 
    528     gsize bytesRead = 0;
    529     gsize bytesWritten = 0;
    530     const gchar* input = reinterpret_cast<const char*>(characters);
    531     gsize inputLength = length * sizeof(UChar);
    532     gchar buffer[ConversionBufferSize];
    533     Vector<char> result;
    534     GOwnPtr<GError> error;
    535 
    536     size_t size = 0;
    537     do {
    538         g_converter_convert(G_CONVERTER(m_iconvEncoder.get()),
    539                             input, inputLength,
    540                             buffer, sizeof(buffer),
    541                             G_CONVERTER_INPUT_AT_END,
    542                             &bytesRead, &bytesWritten,
    543                             &error.outPtr());
    544         input += bytesRead;
    545         inputLength -= bytesRead;
    546         if (bytesWritten > 0) {
    547             result.grow(size + bytesWritten);
    548             memcpy(result.data() + size, buffer, bytesWritten);
    549             size += bytesWritten;
    550         }
    551 
    552         if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
    553             UChar codePoint = reinterpret_cast<const UChar*>(input)[0];
    554             UnencodableReplacementArray replacement;
    555             int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
    556 
    557             // Consume the invalid character.
    558             input += sizeof(UChar);
    559             inputLength -= sizeof(UChar);
    560 
    561             // Append replacement string to result buffer.
    562             result.grow(size + replacementLength);
    563             memcpy(result.data() + size, replacement, replacementLength);
    564             size += replacementLength;
    565 
    566             error.clear();
    567         }
    568     } while (inputLength && !error.get());
    569 
    570     if (error) {
    571         LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
    572         return CString();
    573     }
    574 
    575     return CString(result.data(), size);
    576 }
    577 
    578 } // namespace WebCore
    579