Home | History | Annotate | Download | only in patches
      1 Add code support for ICU.
      2 
      3 diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
      4 index b86a547..0f41df9 100644
      5 --- a/third_party/libxml/encoding.c
      6 +++ b/third_party/libxml/encoding.c
      7 @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
      8  static int xmlCharEncodingAliasesNb = 0;
      9  static int xmlCharEncodingAliasesMax = 0;
     10  
     11 -#ifdef LIBXML_ICONV_ENABLED
     12 +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
     13  #if 0
     14  #define DEBUG_ENCODING  /* Define this to get encoding traces */
     15  #endif
     16 @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
     17                      NULL, 0, val, NULL, NULL, 0, 0, msg, val);
     18  }
     19  
     20 +#ifdef LIBXML_ICU_ENABLED
     21 +static uconv_t* 
     22 +openIcuConverter(const char* name, int toUnicode)
     23 +{
     24 +  UErrorCode status = U_ZERO_ERROR;
     25 +  uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
     26 +  if (conv == NULL)
     27 +    return NULL;
     28 +
     29 +  conv->uconv = ucnv_open(name, &status);
     30 +  if (U_FAILURE(status))
     31 +    goto error;
     32 +
     33 +  status = U_ZERO_ERROR;
     34 +  if (toUnicode) {
     35 +    ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, 
     36 +                        NULL, NULL, NULL, &status);
     37 +  }
     38 +  else {
     39 +    ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, 
     40 +                        NULL, NULL, NULL, &status);
     41 +  }
     42 +  if (U_FAILURE(status))
     43 +    goto error;
     44 +
     45 +  status = U_ZERO_ERROR;
     46 +  conv->utf8 = ucnv_open("UTF-8", &status);
     47 +  if (U_SUCCESS(status))
     48 +    return conv;
     49 +
     50 +error:
     51 +  if (conv->uconv) 
     52 +    ucnv_close(conv->uconv);
     53 +  xmlFree(conv);
     54 +  return NULL;
     55 +}
     56 +
     57 +static void
     58 +closeIcuConverter(uconv_t *conv)
     59 +{
     60 +  if (conv != NULL) {
     61 +    ucnv_close(conv->uconv);
     62 +    ucnv_close(conv->utf8);
     63 +    xmlFree(conv);
     64 +  }
     65 +}
     66 +#endif /* LIBXML_ICU_ENABLED */
     67 +
     68  /************************************************************************
     69   *									*
     70   *		Conversions To/From UTF8 encoding			*
     71 @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
     72  #ifdef LIBXML_ICONV_ENABLED
     73      handler->iconv_in = NULL;
     74      handler->iconv_out = NULL;
     75 -#endif /* LIBXML_ICONV_ENABLED */
     76 +#endif
     77 +#ifdef LIBXML_ICU_ENABLED
     78 +    handler->uconv_in = NULL;
     79 +    handler->uconv_out = NULL;
     80 +#endif
     81  
     82      /*
     83       * registers and returns the handler.
     84 @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
     85      xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
     86      xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
     87  #endif /* LIBXML_OUTPUT_ENABLED */
     88 -#ifndef LIBXML_ICONV_ENABLED
     89 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
     90  #ifdef LIBXML_ISO8859X_ENABLED
     91      xmlRegisterCharEncodingHandlersISO8859x ();
     92  #endif
     93 @@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
     94      xmlCharEncodingHandlerPtr enc;
     95      iconv_t icv_in, icv_out;
     96  #endif /* LIBXML_ICONV_ENABLED */
     97 +#ifdef LIBXML_ICU_ENABLED
     98 +    xmlCharEncodingHandlerPtr enc;
     99 +    uconv_t *ucv_in, *ucv_out;
    100 +#endif /* LIBXML_ICU_ENABLED */
    101      char upper[100];
    102      int i;
    103  
    104 @@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
    105  		    "iconv : problems with filters for '%s'\n", name);
    106      }
    107  #endif /* LIBXML_ICONV_ENABLED */
    108 +#ifdef LIBXML_ICU_ENABLED
    109 +    /* check whether icu can handle this */
    110 +    ucv_in = openIcuConverter(name, 1);
    111 +    ucv_out = openIcuConverter(name, 0);
    112 +    if (ucv_in != NULL && ucv_out != NULL) {
    113 +	    enc = (xmlCharEncodingHandlerPtr)
    114 +	          xmlMalloc(sizeof(xmlCharEncodingHandler));
    115 +	    if (enc == NULL) {
    116 +                closeIcuConverter(ucv_in);
    117 +                closeIcuConverter(ucv_out);
    118 +		return(NULL);
    119 +	    }
    120 +	    enc->name = xmlMemStrdup(name);
    121 +	    enc->input = NULL;
    122 +	    enc->output = NULL;
    123 +	    enc->uconv_in = ucv_in;
    124 +	    enc->uconv_out = ucv_out;
    125 +#ifdef DEBUG_ENCODING
    126 +            xmlGenericError(xmlGenericErrorContext,
    127 +		    "Found ICU converter handler for encoding %s\n", name);
    128 +#endif
    129 +	    return enc;
    130 +    } else if (ucv_in != NULL || ucv_out != NULL) {
    131 +            closeIcuConverter(ucv_in);
    132 +            closeIcuConverter(ucv_out);
    133 +	    xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
    134 +		    "ICU converter : problems with filters for '%s'\n", name);
    135 +    }
    136 +#endif /* LIBXML_ICU_ENABLED */
    137  
    138  #ifdef DEBUG_ENCODING
    139      xmlGenericError(xmlGenericErrorContext,
    140 @@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
    141  
    142  /************************************************************************
    143   *									*
    144 + *		ICU based generic conversion functions	         	*
    145 + *									*
    146 + ************************************************************************/
    147 +
    148 +#ifdef LIBXML_ICU_ENABLED
    149 +/**
    150 + * xmlUconvWrapper:
    151 + * @cd: ICU uconverter data structure
    152 + * @toUnicode : non-zero if toUnicode. 0 otherwise.
    153 + * @out:  a pointer to an array of bytes to store the result
    154 + * @outlen:  the length of @out
    155 + * @in:  a pointer to the input bytes (source encoding, or UTF-8 if !toUnicode)
    156 + * @inlen:  the length of @in
    157 + *
    158 + * Returns 0 if success, or 
    159 + *     -1 by lack of space, or
    160 + *     -2 if the transcoding fails (for *in is not valid utf8 string or
    161 + *        the result of transformation can't fit into the encoding we want), or
    162 + *     -3 for any other error (e.g. the last byte can't form a full char).
    163 + *     
    164 + * The value of @inlen after return is the number of octets consumed
    165 + *     (set whenever a conversion was attempted, even on error).
    166 + * The value of @outlen after return is the number of octets produced.
    167 + */
    168 +static int
    169 +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
    170 +                const unsigned char *in, int *inlen) {
    171 +    const char *ucv_in = (const char *) in;
    172 +    char *ucv_out = (char *) out;
    173 +    UErrorCode err = U_ZERO_ERROR;
    174 +
    175 +    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
    176 +        if (outlen != NULL) *outlen = 0;
    177 +        return(-1);
    178 +    }
    179 +
    180 +    /* 
    181 +     * TODO(jungshik)
    182 +     * 1. is ucnv_convert(To|From)Algorithmic better?
    183 +     * 2. had we better use an explicit pivot buffer?
    184 +     * 3. error returned comes from 'fromUnicode' only even
    185 +     *    when toUnicode is true !
    186 +     */
    187 +    if (toUnicode) {
    188 +        /* encoding => UTF-16 => UTF-8 */
    189 +        ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
    190 +                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
    191 +                       0, TRUE, &err);
    192 +    } else {
    193 +        /* UTF-8 => UTF-16 => encoding */
    194 +        ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
    195 +                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
    196 +                       0, TRUE, &err);
    197 +    }
    198 +    *inlen = ucv_in - (const char*) in; 
    199 +    *outlen = ucv_out - (char *) out;
    200 +    if (U_SUCCESS(err))
    201 +        return 0;
    202 +    if (err == U_BUFFER_OVERFLOW_ERROR)
    203 +        return -1;
    204 +    if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
    205 +        return -2;
    206 +    /* if (err == U_TRUNCATED_CHAR_FOUND) */
    207 +    return -3;
    208 +}
    209 +#endif /* LIBXML_ICU_ENABLED */
    210 +
    211 +/************************************************************************
    212 + *									*
    213   *		The real API used by libxml for on-the-fly conversion	*
    214   *									*
    215   ************************************************************************/
    216 @@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
    217  	if (ret == -1) ret = -3;
    218      }
    219  #endif /* LIBXML_ICONV_ENABLED */
    220 +#ifdef LIBXML_ICU_ENABLED
    221 +    else if (handler->uconv_in != NULL) {
    222 +	ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
    223 +	                      &written, in->content, &toconv);
    224 +	xmlBufferShrink(in, toconv);
    225 +	out->use += written;
    226 +	out->content[out->use] = 0;
    227 +	if (ret == -1) ret = -3;
    228 +    }
    229 +#endif /* LIBXML_ICU_ENABLED */
    230  #ifdef DEBUG_ENCODING
    231      switch (ret) {
    232          case 0:
    233 @@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
    234              ret = -3;
    235      }
    236  #endif /* LIBXML_ICONV_ENABLED */
    237 +#ifdef LIBXML_ICU_ENABLED
    238 +    else if (handler->uconv_in != NULL) {
    239 +        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
    240 +                              &written, in->content, &toconv);
    241 +        xmlBufferShrink(in, toconv);
    242 +        out->use += written;
    243 +        out->content[out->use] = 0;
    244 +        if (ret == -1)
    245 +            ret = -3;
    246 +    }
    247 +#endif /* LIBXML_ICU_ENABLED */
    248      switch (ret) {
    249          case 0:
    250  #ifdef DEBUG_ENCODING
    251 @@ -2015,6 +2190,15 @@ retry:
    252  	    out->content[out->use] = 0;
    253  	}
    254  #endif /* LIBXML_ICONV_ENABLED */
    255 +#ifdef LIBXML_ICU_ENABLED
    256 +	else if (handler->uconv_out != NULL) {
    257 +	    ret = xmlUconvWrapper(handler->uconv_out, 0,
    258 +                              &out->content[out->use],
    259 + 				              &written, NULL, &toconv);
    260 +	    out->use += written;
    261 +	    out->content[out->use] = 0;
    262 +	}
    263 +#endif /* LIBXML_ICU_ENABLED */
    264  #ifdef DEBUG_ENCODING
    265  	xmlGenericError(xmlGenericErrorContext,
    266  		"initialized encoder\n");
    267 @@ -2061,6 +2245,26 @@ retry:
    268  	}
    269      }
    270  #endif /* LIBXML_ICONV_ENABLED */
    271 +#ifdef LIBXML_ICU_ENABLED
    272 +    else if (handler->uconv_out != NULL) {
    273 +	ret = xmlUconvWrapper(handler->uconv_out, 0,
    274 +                              &out->content[out->use],
    275 +	                      &written, in->content, &toconv);
    276 +	xmlBufferShrink(in, toconv);
    277 +	out->use += written;
    278 +	writtentot += written;
    279 +	out->content[out->use] = 0;
    280 +	if (ret == -1) {
    281 +	    if (written > 0) {
    282 +		/*
    283 +		 * Can be a limitation of the converter (wording inherited from iconv)
    284 +		 */
    285 +		goto retry;
    286 +	    }
    287 +	    ret = -3;
    288 +	}
    289 +    }
    290 +#endif /* LIBXML_ICU_ENABLED */
    291      else {
    292  	xmlEncodingErr(XML_I18N_NO_OUTPUT,
    293  		       "xmlCharEncOutFunc: no output function !\n", NULL);
    294 @@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
    295  	xmlFree(handler);
    296      }
    297  #endif /* LIBXML_ICONV_ENABLED */
    298 +#ifdef LIBXML_ICU_ENABLED
    299 +    if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
    300 +	if (handler->name != NULL)
    301 +	    xmlFree(handler->name);
    302 +	handler->name = NULL;
    303 +	if (handler->uconv_out != NULL) {
    304 +	    closeIcuConverter(handler->uconv_out);
    305 +	    handler->uconv_out = NULL;
    306 +	}
    307 +	if (handler->uconv_in != NULL) {
    308 +	    closeIcuConverter(handler->uconv_in);
    309 +	    handler->uconv_in = NULL;
    310 +	}
    311 +	xmlFree(handler);
    312 +    }
    313 +#endif
    314  #ifdef DEBUG_ENCODING
    315      if (ret)
    316          xmlGenericError(xmlGenericErrorContext,
    317 @@ -2248,6 +2468,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
    318  		    cur += toconv;
    319  		} while (ret == -2);
    320  #endif
    321 +#ifdef LIBXML_ICU_ENABLED
    322 +	    } else if (handler->uconv_out != NULL) {
    323 +	        do {
    324 +		    toconv = in->end - cur;
    325 +		    written = 32000;
    326 +		    ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
    327 +	                      &written, cur, &toconv);
    328 +		    if (ret < 0) {
    329 +		        if (written > 0)
    330 +			    ret = -2;
    331 +			else
    332 +			    return(-1);
    333 +		    }
    334 +		    unused += written;
    335 +		    cur += toconv;
    336 +		} while (ret == -2);
    337 +#endif /* LIBXML_ICU_ENABLED */
    338              } else {
    339  	        /* could not find a converter */
    340  	        return(-1);
    341 @@ -2259,8 +2496,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
    342      }
    343      return(in->consumed + (in->cur - in->base));
    344  }
    345  
    346 -#ifndef LIBXML_ICONV_ENABLED
    347 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
    348  #ifdef LIBXML_ISO8859X_ENABLED
    349  
    350  /**
    351 diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
    352 index c74b25f..b5f8b48 100644
    353 --- a/third_party/libxml/include/libxml/encoding.h
    354 +++ b/third_party/libxml/include/libxml/encoding.h
    355 @@ -26,6 +26,24 @@
    356  
    357  #ifdef LIBXML_ICONV_ENABLED
    358  #include <iconv.h>
    359 +#else 
    360 +#ifdef LIBXML_ICU_ENABLED
    361 +#include <unicode/ucnv.h>
    362 +#if 0
    363 +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
    364 + * to prevent unwanted ICU symbols being exposed to users of libxml2.
    365 + * One particular case is Qt4 conflicting on UChar32.
    366 + */
    367 +#include <stdint.h>
    368 +struct UConverter;
    369 +typedef struct UConverter UConverter;
    370 +#ifdef _MSC_VER
    371 +typedef wchar_t UChar;
    372 +#else
    373 +typedef uint16_t UChar;
    374 +#endif
    375 +#endif
    376 +#endif
    377  #endif
    378  #ifdef __cplusplus
    379  extern "C" {
    380 @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
    381   * Block defining the handlers for non UTF-8 encodings.
    382   * If iconv is supported, there are two extra fields.
    383   */
    384 +#ifdef LIBXML_ICU_ENABLED
    385 +struct _uconv_t {
    386 +  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
    387 +  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
    388 +};
    389 +typedef struct _uconv_t uconv_t;
    390 +#endif
    391  
    392  typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
    393  typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
    394 @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
    395      iconv_t                    iconv_in;
    396      iconv_t                    iconv_out;
    397  #endif /* LIBXML_ICONV_ENABLED */
    398 +#ifdef LIBXML_ICU_ENABLED
    399 +    uconv_t                    *uconv_in;
    400 +    uconv_t                    *uconv_out;
    401 +#endif /* LIBXML_ICU_ENABLED */
    402  };
    403  
    404  #ifdef __cplusplus
    405 diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
    406 index dd79c42..3580b63 100644
    407 --- a/third_party/libxml/include/libxml/parser.h
    408 +++ b/third_party/libxml/include/libxml/parser.h
    409 @@ -1222,6 +1222,7 @@ typedef enum {
    410      XML_WITH_DEBUG_MEM = 29,
    411      XML_WITH_DEBUG_RUN = 30,
    412      XML_WITH_ZLIB = 31,
    413 +    XML_WITH_ICU = 32,
    414      XML_WITH_NONE = 99999 /* just to be sure of allocation size */
    415  } xmlFeature;
    416  
    417 diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
    418 index 4739f3a..de310ab 100644
    419 --- a/third_party/libxml/include/libxml/xmlversion.h.in
    420 +++ b/third_party/libxml/include/libxml/xmlversion.h.in
    421 @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
    422  #endif
    423  
    424  /**
    425 + * LIBXML_ICU_ENABLED:
    426 + *
    427 + * Whether ICU support is available
    428 + */
    429 +#if @WITH_ICU@
    430 +#define LIBXML_ICU_ENABLED
    431 +#endif
    432 +
    433 +/**
    434   * LIBXML_ISO8859X_ENABLED:
    435   *
    436   * Whether ISO-8859-* support is made available in case iconv is not
    437 diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
    438 index 85e7599..3ba2a06 100644
    439 --- a/third_party/libxml/parser.c
    440 +++ b/third_party/libxml/parser.c
    441 @@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
    442  #else
    443              return(0);
    444  #endif
    445 +        case XML_WITH_ICU:
    446 +#ifdef LIBXML_ICU_ENABLED
    447 +            return(1);
    448 +#else
    449 +            return(0);
    450 +#endif
    451          default:
    452  	    break;
    453       }
    454