Add code support for ICU.

Note: leading whitespace in this patch is reconstructed (spaces only) and
may not match the tab/space mix of the target files byte-for-byte; if hunks
are rejected, apply with `patch -l` or `git apply --ignore-whitespace`.

diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
index b86a547..0f41df9 100644
--- a/third_party/libxml/encoding.c
+++ b/third_party/libxml/encoding.c
@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
 static int xmlCharEncodingAliasesNb = 0;
 static int xmlCharEncodingAliasesMax = 0;
 
-#ifdef LIBXML_ICONV_ENABLED
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
 #if 0
 #define DEBUG_ENCODING  /* Define this to get encoding traces */
 #endif
@@ -97,6 +97,73 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
                     NULL, 0, val, NULL, NULL, 0, 0, msg, val);
 }
 
+#ifdef LIBXML_ICU_ENABLED
+/**
+ * openIcuConverter:
+ * @name:  the encoding name handed to ucnv_open()
+ * @toUnicode:  non-zero to install the STOP to-Unicode callback on the
+ *              converter for @name, 0 to install the STOP from-Unicode one
+ *
+ * Allocates a uconv_t holding an ICU converter for @name and a second
+ * ICU converter for UTF-8.
+ *
+ * Returns the new uconv_t, or NULL if the allocation, either ucnv_open()
+ *         call or the callback installation fails. The result is released
+ *         with closeIcuConverter().
+ */
+static uconv_t*
+openIcuConverter(const char* name, int toUnicode)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
+  if (conv == NULL)
+    return NULL;
+
+  conv->uconv = ucnv_open(name, &status);
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  if (toUnicode) {
+    ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
+                        NULL, NULL, NULL, &status);
+  }
+  else {
+    ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
+                          NULL, NULL, NULL, &status);
+  }
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  conv->utf8 = ucnv_open("UTF-8", &status);
+  if (U_SUCCESS(status))
+    return conv;
+
+error:
+  if (conv->uconv)
+    ucnv_close(conv->uconv);
+  xmlFree(conv);
+  return NULL;
+}
+
+/**
+ * closeIcuConverter:
+ * @conv:  a uconv_t obtained from openIcuConverter(), or NULL (no-op)
+ *
+ * Closes both ICU converters held by @conv and frees @conv itself.
+ */
+static void
+closeIcuConverter(uconv_t *conv)
+{
+  if (conv != NULL) {
+    ucnv_close(conv->uconv);
+    ucnv_close(conv->utf8);
+    xmlFree(conv);
+  }
+}
+#endif /* LIBXML_ICU_ENABLED */
+
 /************************************************************************
  *                                                                      *
  *              Conversions To/From UTF8 encoding                       *
@@ -1306,7 +1373,11 @@ xmlNewCharEncodingHandler(const char *name,
 #ifdef LIBXML_ICONV_ENABLED
     handler->iconv_in = NULL;
     handler->iconv_out = NULL;
-#endif /* LIBXML_ICONV_ENABLED */
+#endif
+#ifdef LIBXML_ICU_ENABLED
+    handler->uconv_in = NULL;
+    handler->uconv_out = NULL;
+#endif
 
     /*
      * registers and returns the handler.
@@ -1371,7 +1442,7 @@ xmlInitCharEncodingHandlers(void) {
     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
 #endif /* LIBXML_OUTPUT_ENABLED */
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
     xmlRegisterCharEncodingHandlersISO8859x ();
 #endif
@@ -1578,6 +1649,10 @@ xmlFindCharEncodingHandler(const char *name) {
     xmlCharEncodingHandlerPtr enc;
     iconv_t icv_in, icv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    xmlCharEncodingHandlerPtr enc;
+    uconv_t *ucv_in, *ucv_out;
+#endif /* LIBXML_ICU_ENABLED */
     char upper[100];
     int i;
 
@@ -1647,6 +1722,35 @@ xmlFindCharEncodingHandler(const char *name) {
                     "iconv : problems with filters for '%s'\n", name);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    /* check whether icu can handle this */
+    ucv_in = openIcuConverter(name, 1);
+    ucv_out = openIcuConverter(name, 0);
+    if (ucv_in != NULL && ucv_out != NULL) {
+            enc = (xmlCharEncodingHandlerPtr)
+                  xmlMalloc(sizeof(xmlCharEncodingHandler));
+            if (enc == NULL) {
+                closeIcuConverter(ucv_in);
+                closeIcuConverter(ucv_out);
+                return(NULL);
+            }
+            enc->name = xmlMemStrdup(name);
+            enc->input = NULL;
+            enc->output = NULL;
+            enc->uconv_in = ucv_in;
+            enc->uconv_out = ucv_out;
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                    "Found ICU converter handler for encoding %s\n", name);
+#endif
+            return enc;
+    } else if (ucv_in != NULL || ucv_out != NULL) {
+            closeIcuConverter(ucv_in);
+            closeIcuConverter(ucv_out);
+            xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
+                    "ICU converter : problems with filters for '%s'\n", name);
+    }
+#endif /* LIBXML_ICU_ENABLED */
 
 #ifdef DEBUG_ENCODING
     xmlGenericError(xmlGenericErrorContext,
@@ -1737,6 +1841,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
 
 /************************************************************************
  *                                                                      *
+ *              ICU based generic conversion functions                  *
+ *                                                                      *
+ ************************************************************************/
+
+#ifdef LIBXML_ICU_ENABLED
+/**
+ * xmlUconvWrapper:
+ * @cd: ICU uconverter data structure
+ * @toUnicode: non-zero to convert from the encoding to UTF-8,
+ *             0 to convert from UTF-8 to the encoding
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of input bytes
+ * @inlen:  the length of @in
+ *
+ * Returns 0 if success, or
+ *     -1 by lack of space or if an argument is NULL, or
+ *     -2 if the transcoding fails (the input is not valid in the source
+ *        encoding or a character can't be represented in the target one), or
+ *     -3 for any other ICU error, e.g. a truncated trailing character.
+ *
+ * The value of @inlen after return is the number of octets consumed.
+ * The value of @outlen after return is the number of octets written.
+ */
+static int
+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen) {
+    const char *ucv_in = (const char *) in;
+    char *ucv_out = (char *) out;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
+        if (outlen != NULL) *outlen = 0;
+        return(-1);
+    }
+
+    /*
+     * TODO(jungshik)
+     * 1. is ucnv_convert(To|From)Algorithmic better?
+     * 2. had we better use an explicit pivot buffer?
+     * 3. error returned comes from 'fromUnicode' only even
+     *    when toUnicode is true !
+     */
+    if (toUnicode) {
+        /* encoding => UTF-16 => UTF-8 */
+        ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    } else {
+        /* UTF-8 => UTF-16 => encoding */
+        ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    }
+    *inlen = ucv_in - (const char*) in;
+    *outlen = ucv_out - (char *) out;
+    if (U_SUCCESS(err))
+        return 0;
+    if (err == U_BUFFER_OVERFLOW_ERROR)
+        return -1;
+    if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
+        return -2;
+    /* if (err == U_TRUNCATED_CHAR_FOUND) */
+    return -3;
+}
+#endif /* LIBXML_ICU_ENABLED */
+
+/************************************************************************
+ *                                                                      *
  *              The real API used by libxml for on-the-fly conversion   *
  *                                                                      *
  ************************************************************************/
@@ -1810,6 +1983,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
         if (ret == -1) ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        out->content[out->use] = 0;
+        if (ret == -1) ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
     switch (ret) {
         case 0:
@@ -1915,6 +2098,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
             ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        out->content[out->use] = 0;
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
     switch (ret) {
         case 0:
 #ifdef DEBUG_ENCODING
@@ -2015,6 +2209,15 @@ retry:
             out->content[out->use] = 0;
         }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+        else if (handler->uconv_out != NULL) {
+            ret = xmlUconvWrapper(handler->uconv_out, 0,
+                                  &out->content[out->use],
+                                  &written, NULL, &toconv);
+            out->use += written;
+            out->content[out->use] = 0;
+        }
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
         xmlGenericError(xmlGenericErrorContext,
                 "initialized encoder\n");
@@ -2061,6 +2264,26 @@ retry:
         }
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_out != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_out, 0,
+                              &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        writtentot += written;
+        out->content[out->use] = 0;
+        if (ret == -1) {
+            if (written > 0) {
+                /*
+                 * Can be a limitation of iconv
+                 */
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICU_ENABLED */
     else {
         xmlEncodingErr(XML_I18N_NO_OUTPUT,
                        "xmlCharEncOutFunc: no output function !\n", NULL);
@@ -2173,6 +2396,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
         xmlFree(handler);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
+        if (handler->name != NULL)
+            xmlFree(handler->name);
+        handler->name = NULL;
+        if (handler->uconv_out != NULL) {
+            closeIcuConverter(handler->uconv_out);
+            handler->uconv_out = NULL;
+        }
+        if (handler->uconv_in != NULL) {
+            closeIcuConverter(handler->uconv_in);
+            handler->uconv_in = NULL;
+        }
+        xmlFree(handler);
+    }
+#endif
 #ifdef DEBUG_ENCODING
     if (ret)
         xmlGenericError(xmlGenericErrorContext,
@@ -2248,6 +2487,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
                 cur += toconv;
             } while (ret == -2);
 #endif
+#ifdef LIBXML_ICU_ENABLED
+        } else if (handler->uconv_out != NULL) {
+            do {
+                toconv = in->end - cur;
+                written = 32000;
+                ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
+                                      &written, cur, &toconv);
+                if (ret < 0) {
+                    if (written > 0)
+                        ret = -2;
+                    else
+                        return(-1);
+                }
+                unused += written;
+                cur += toconv;
+            } while (ret == -2);
+#endif
         } else {
             /* could not find a converter */
             return(-1);
@@ -2259,8 +2515,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
     }
     return(in->consumed + (in->cur - in->base));
 }
 
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
 
 /**
diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
index c74b25f..b5f8b48 100644
--- a/third_party/libxml/include/libxml/encoding.h
+++ b/third_party/libxml/include/libxml/encoding.h
@@ -26,6 +26,24 @@
 
 #ifdef LIBXML_ICONV_ENABLED
 #include <iconv.h>
+#else
+#ifdef LIBXML_ICU_ENABLED
+#include <unicode/ucnv.h>
+#if 0
+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
+ * One particular case is Qt4 conflicting on UChar32.
+ */
+#include <stdint.h>
+struct UConverter;
+typedef struct UConverter UConverter;
+#ifdef _MSC_VER
+typedef wchar_t UChar;
+#else
+typedef uint16_t UChar;
+#endif
+#endif
+#endif
 #endif
 #ifdef __cplusplus
 extern "C" {
@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
  * Block defining the handlers for non UTF-8 encodings.
  * If iconv is supported, there are two extra fields.
  */
+#ifdef LIBXML_ICU_ENABLED
+struct _uconv_t {
+  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
+  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+};
+typedef struct _uconv_t uconv_t;
+#endif
 
 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
     iconv_t                    iconv_in;
     iconv_t                    iconv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    uconv_t                    *uconv_in;
+    uconv_t                    *uconv_out;
+#endif /* LIBXML_ICU_ENABLED */
 };
 
 #ifdef __cplusplus
diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
index dd79c42..3580b63 100644
--- a/third_party/libxml/include/libxml/parser.h
+++ b/third_party/libxml/include/libxml/parser.h
@@ -1222,6 +1222,7 @@ typedef enum {
     XML_WITH_DEBUG_MEM = 29,
     XML_WITH_DEBUG_RUN = 30,
     XML_WITH_ZLIB = 31,
+    XML_WITH_ICU = 32,
     XML_WITH_NONE = 99999 /* just to be sure of allocation size */
 } xmlFeature;
 
diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
index 4739f3a..de310ab 100644
--- a/third_party/libxml/include/libxml/xmlversion.h.in
+++ b/third_party/libxml/include/libxml/xmlversion.h.in
@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
 #endif
 
 /**
+ * LIBXML_ICU_ENABLED:
+ *
+ * Whether icu support is available
+ */
+#if @WITH_ICU@
+#define LIBXML_ICU_ENABLED
+#endif
+
+/**
  * LIBXML_ISO8859X_ENABLED:
  *
  * Whether ISO-8859-* support is made available in case iconv is not
diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
index 85e7599..3ba2a06 100644
--- a/third_party/libxml/parser.c
+++ b/third_party/libxml/parser.c
@@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
 #else
             return(0);
 #endif
+        case XML_WITH_ICU:
+#ifdef LIBXML_ICU_ENABLED
+            return(1);
+#else
+            return(0);
+#endif
         default:
             break;
     }