1 From f1121648d0762cf9bf4e5117bfc1008447fb4080 Mon Sep 17 00:00:00 2001 2 From: android 3 Date: Thu, 1 Apr 2010 11:46:35 -0700 4 Subject: [PATCH] Add ICU support for libxml. 5 6 This is derived from Jungshik's patch. The encoding.c is a copy from Chrome's source, 7 which has one extra modification than Jungshik's patch. 8 9 Issue:2557315 10 Change-Id: I8e4c9e544660f3f943a15042756f7248d5afff8e 11 --- 12 Android.mk | 4 +- 13 encoding.c | 248 +++++++++++++++++++++++++++++++++++++++++- 14 include/libxml/encoding.h | 29 +++++ 15 include/libxml/parser.h | 3 +- 16 include/libxml/xmlversion.h | 11 ++- 17 parser.c | 9 ++ 18 xmlregexp.c | 2 +- 19 7 files changed, 294 insertions(+), 12 deletions(-) 20 21 diff --git a/Android.mk b/Android.mk 22 index 3d0ede8..08bf11f 100644 23 --- a/Android.mk 24 +++ b/Android.mk 25 @@ -57,7 +57,7 @@ common_C_INCLUDES += \ 26 include $(CLEAR_VARS) 27 28 LOCAL_SRC_FILES := $(common_SRC_FILES) 29 -LOCAL_C_INCLUDES += $(common_C_INCLUDES) 30 +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common 31 LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) 32 LOCAL_CFLAGS += -fvisibility=hidden 33 34 @@ -71,7 +71,7 @@ include $(BUILD_STATIC_LIBRARY) 35 36 include $(CLEAR_VARS) 37 LOCAL_SRC_FILES := $(common_SRC_FILES) 38 -LOCAL_C_INCLUDES += $(common_C_INCLUDES) 39 +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common 40 LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) 41 LOCAL_MODULE:= libxml2 42 include $(BUILD_HOST_STATIC_LIBRARY) 43 diff --git a/encoding.c b/encoding.c 44 index e2df797..2abc32e 100644 45 --- a/encoding.c 46 +++ b/encoding.c 47 @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; 48 static int xmlCharEncodingAliasesNb = 0; 49 static int xmlCharEncodingAliasesMax = 0; 50 51 -#ifdef LIBXML_ICONV_ENABLED 52 +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) 53 #if 0 54 #define DEBUG_ENCODING /* Define this to get encoding traces */ 55 #endif 56 @@ -97,6 +97,54 @@ 
xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val) 57 NULL, 0, val, NULL, NULL, 0, 0, msg, val); 58 } 59 60 +#ifdef LIBXML_ICU_ENABLED 61 +static uconv_t* 62 +openIcuConverter(const char* name, int toUnicode) 63 +{ 64 + UErrorCode status = U_ZERO_ERROR; 65 + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); 66 + if (conv == NULL) 67 + return NULL; 68 + 69 + conv->uconv = ucnv_open(name, &status); 70 + if (U_FAILURE(status)) 71 + goto error; 72 + 73 + status = U_ZERO_ERROR; 74 + if (toUnicode) { 75 + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, 76 + NULL, NULL, NULL, &status); 77 + } 78 + else { 79 + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, 80 + NULL, NULL, NULL, &status); 81 + } 82 + if (U_FAILURE(status)) 83 + goto error; 84 + 85 + status = U_ZERO_ERROR; 86 + conv->utf8 = ucnv_open("UTF-8", &status); 87 + if (U_SUCCESS(status)) 88 + return conv; 89 + 90 +error: 91 + if (conv->uconv) 92 + ucnv_close(conv->uconv); 93 + xmlFree(conv); 94 + return NULL; 95 +} 96 + 97 +static void 98 +closeIcuConverter(uconv_t *conv) 99 +{ 100 + if (conv != NULL) { 101 + ucnv_close(conv->uconv); 102 + ucnv_close(conv->utf8); 103 + xmlFree(conv); 104 + } 105 +} 106 +#endif /* LIBXML_ICU_ENABLED */ 107 + 108 /************************************************************************ 109 * * 110 * Conversions To/From UTF8 encoding * 111 @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, 112 #ifdef LIBXML_ICONV_ENABLED 113 handler->iconv_in = NULL; 114 handler->iconv_out = NULL; 115 -#endif /* LIBXML_ICONV_ENABLED */ 116 +#endif 117 +#ifdef LIBXML_ICU_ENABLED 118 + handler->uconv_in = NULL; 119 + handler->uconv_out = NULL; 120 +#endif 121 122 /* 123 * registers and returns the handler. 
124 @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { 125 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); 126 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); 127 #endif /* LIBXML_OUTPUT_ENABLED */ 128 -#ifndef LIBXML_ICONV_ENABLED 129 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) 130 #ifdef LIBXML_ISO8859X_ENABLED 131 xmlRegisterCharEncodingHandlersISO8859x (); 132 #endif 133 @@ -1576,6 +1628,10 @@ xmlFindCharEncodingHandler(const char *name) { 134 xmlCharEncodingHandlerPtr enc; 135 iconv_t icv_in, icv_out; 136 #endif /* LIBXML_ICONV_ENABLED */ 137 +#ifdef LIBXML_ICU_ENABLED 138 + xmlCharEncodingHandlerPtr enc; 139 + uconv_t *ucv_in, *ucv_out; 140 +#endif /* LIBXML_ICU_ENABLED */ 141 char upper[100]; 142 int i; 143 144 @@ -1642,6 +1698,35 @@ xmlFindCharEncodingHandler(const char *name) { 145 "iconv : problems with filters for '%s'\n", name); 146 } 147 #endif /* LIBXML_ICONV_ENABLED */ 148 +#ifdef LIBXML_ICU_ENABLED 149 + /* check whether icu can handle this */ 150 + ucv_in = openIcuConverter(name, 1); 151 + ucv_out = openIcuConverter(name, 0); 152 + if (ucv_in != NULL && ucv_out != NULL) { 153 + enc = (xmlCharEncodingHandlerPtr) 154 + xmlMalloc(sizeof(xmlCharEncodingHandler)); 155 + if (enc == NULL) { 156 + closeIcuConverter(ucv_in); 157 + closeIcuConverter(ucv_out); 158 + return(NULL); 159 + } 160 + enc->name = xmlMemStrdup(name); 161 + enc->input = NULL; 162 + enc->output = NULL; 163 + enc->uconv_in = ucv_in; 164 + enc->uconv_out = ucv_out; 165 +#ifdef DEBUG_ENCODING 166 + xmlGenericError(xmlGenericErrorContext, 167 + "Found ICU converter handler for encoding %s\n", name); 168 +#endif 169 + return enc; 170 + } else if (ucv_in != NULL || ucv_out != NULL) { 171 + closeIcuConverter(ucv_in); 172 + closeIcuConverter(ucv_out); 173 + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, 174 + "ICU converter : problems with filters for '%s'\n", name); 175 + } 176 +#endif /* LIBXML_ICU_ENABLED */ 177 178 #ifdef DEBUG_ENCODING 179 
xmlGenericError(xmlGenericErrorContext, 180 @@ -1732,6 +1817,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, 181 182 /************************************************************************ 183 * * 184 + * ICU based generic conversion functions * 185 + * * 186 + ************************************************************************/ 187 + 188 +#ifdef LIBXML_ICU_ENABLED 189 +/** 190 + * xmlUconvWrapper: 191 + * @cd: ICU converter pair (cd->uconv for the encoding, cd->utf8 for UTF-8) 192 + * @toUnicode: non-zero to convert from the encoding to UTF-8, 0 for the reverse 193 + * @out: a pointer to an array of bytes to store the result 194 + * @outlen: the length of @out 195 + * @in: a pointer to an array of input bytes 196 + * @inlen: the length of @in 197 + * 198 + * Returns 0 if success, or 199 + * -1 if an argument is NULL or by lack of space, or 200 + * -2 if the transcoding fails (ICU reported an invalid or illegal 201 + * character in the input), or 202 + * -3 for any other ICU error (e.g. a truncated trailing character). 203 + * 204 + * The value of @inlen after return is the number of octets consumed 205 + * from @in; it is left untouched when an argument is NULL. 206 + * The value of @outlen after return is the number of octets written to @out. 207 + */ 208 +static int 209 +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, 210 + const unsigned char *in, int *inlen) { 211 + const char *ucv_in = (const char *) in; 212 + char *ucv_out = (char *) out; 213 + UErrorCode err = U_ZERO_ERROR; 214 + 215 + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { 216 + if (outlen != NULL) *outlen = 0; 217 + return(-1); 218 + } 219 + 220 + /* 221 + * TODO(jungshik) 222 + * 1. is ucnv_convert(To|From)Algorithmic better? 223 + * 2. had we better use an explicit pivot buffer? 224 + * 3. error returned comes from 'fromUnicode' only even 225 + * when toUnicode is true !
226 + */ 227 + if (toUnicode) { 228 + /* encoding => UTF-16 => UTF-8 */ 229 + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, 230 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, 231 + 0, TRUE, &err); 232 + } else { 233 + /* UTF-8 => UTF-16 => encoding */ 234 + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, 235 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, 236 + 0, TRUE, &err); 237 + } 238 + *inlen = ucv_in - (const char*) in; 239 + *outlen = ucv_out - (char *) out; 240 + if (U_SUCCESS(err)) 241 + return 0; 242 + if (err == U_BUFFER_OVERFLOW_ERROR) 243 + return -1; 244 + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) 245 + return -2; 246 + /* any other error, e.g. U_TRUNCATED_CHAR_FOUND */ 247 + return -3; 248 +} 249 +#endif /* LIBXML_ICU_ENABLED */ 250 + 251 +/************************************************************************ 252 + * * 253 * The real API used by libxml for on-the-fly conversion * 254 * * 255 ************************************************************************/ 256 @@ -1794,6 +1948,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, 257 if (ret == -1) ret = -3; 258 } 259 #endif /* LIBXML_ICONV_ENABLED */ 260 +#ifdef LIBXML_ICU_ENABLED 261 + else if (handler->uconv_in != NULL) { 262 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], 263 + &written, in->content, &toconv); 264 + xmlBufferShrink(in, toconv); 265 + out->use += written; 266 + out->content[out->use] = 0; 267 + if (ret == -1) ret = -3; 268 + } 269 +#endif /* LIBXML_ICU_ENABLED */ 270 #ifdef DEBUG_ENCODING 271 switch (ret) { 272 case 0: 273 @@ -1879,6 +2043,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, 274 ret = -3; 275 } 276 #endif /* LIBXML_ICONV_ENABLED */ 277 +#ifdef LIBXML_ICU_ENABLED 278 + else if (handler->uconv_in != NULL) { 279 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], 280 + &written, in->content, &toconv); 281 + 
xmlBufferShrink(in, toconv); 282 + out->use += written; 283 + out->content[out->use] = 0; 284 + if (ret == -1) 285 + ret = -3; 286 + } 287 +#endif /* LIBXML_ICU_ENABLED */ 288 switch (ret) { 289 case 0: 290 #ifdef DEBUG_ENCODING 291 @@ -1979,6 +2154,15 @@ retry: 292 out->content[out->use] = 0; 293 } 294 #endif /* LIBXML_ICONV_ENABLED */ 295 +#ifdef LIBXML_ICU_ENABLED 296 + else if (handler->uconv_out != NULL) { 297 + ret = xmlUconvWrapper(handler->uconv_out, 0, 298 + &out->content[out->use], 299 + &written, NULL, &toconv); 300 + out->use += written; 301 + out->content[out->use] = 0; 302 + } 303 +#endif /* LIBXML_ICU_ENABLED */ 304 #ifdef DEBUG_ENCODING 305 xmlGenericError(xmlGenericErrorContext, 306 "initialized encoder\n"); 307 @@ -2003,7 +2187,7 @@ retry: 308 xmlBufferShrink(in, toconv); 309 out->use += written; 310 writtentot += written; 311 - } 312 + } 313 out->content[out->use] = 0; 314 } 315 #ifdef LIBXML_ICONV_ENABLED 316 @@ -2025,6 +2209,26 @@ retry: 317 } 318 } 319 #endif /* LIBXML_ICONV_ENABLED */ 320 +#ifdef LIBXML_ICU_ENABLED 321 + else if (handler->uconv_out != NULL) { 322 + ret = xmlUconvWrapper(handler->uconv_out, 0, 323 + &out->content[out->use], 324 + &written, in->content, &toconv); 325 + xmlBufferShrink(in, toconv); 326 + out->use += written; 327 + writtentot += written; 328 + out->content[out->use] = 0; 329 + if (ret == -1) { 330 + if (written > 0) { 331 + /* 332 + * Can be a limitation of the converter (ICU here, not iconv) 333 + */ 334 + goto retry; 335 + } 336 + ret = -3; 337 + } 338 + } 339 +#endif /* LIBXML_ICU_ENABLED */ 340 else { 341 xmlEncodingErr(XML_I18N_NO_OUTPUT, 342 "xmlCharEncOutFunc: no output function !\n", NULL); 343 @@ -2137,6 +2341,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { 344 xmlFree(handler); 345 } 346 #endif /* LIBXML_ICONV_ENABLED */ 347 +#ifdef LIBXML_ICU_ENABLED 348 + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { 349 + if (handler->name != NULL) 350 + xmlFree(handler->name); 351 + handler->name = NULL; 352 + 
if (handler->uconv_out != NULL) { 353 + closeIcuConverter(handler->uconv_out); 354 + handler->uconv_out = NULL; 355 + } 356 + if (handler->uconv_in != NULL) { 357 + closeIcuConverter(handler->uconv_in); 358 + handler->uconv_in = NULL; 359 + } 360 + xmlFree(handler); 361 + } 362 +#endif /* LIBXML_ICU_ENABLED */ 363 #ifdef DEBUG_ENCODING 364 if (ret) 365 xmlGenericError(xmlGenericErrorContext, 366 @@ -2212,6 +2432,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { 367 cur += toconv; 368 } while (ret == -2); 369 #endif 370 +#ifdef LIBXML_ICU_ENABLED 371 + } else if (handler->uconv_out != NULL) { 372 + do { 373 + toconv = in->end - cur; 374 + written = 32000; 375 + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], 376 + &written, cur, &toconv); 377 + if (ret < 0) { 378 + if (written > 0) 379 + ret = -2; 380 + else 381 + return(-1); 382 + } 383 + unused += written; 384 + cur += toconv; 385 + } while (ret == -2); 386 +#endif /* LIBXML_ICU_ENABLED: must close here so the final else and the function tail compile without ICU */ 387 } else { 388 /* could not find a converter */ 389 return(-1); 390 @@ -2223,8 +2460,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { 391 } 392 return(in->consumed + (in->cur - in->base)); 393 } 394 395 -#ifndef LIBXML_ICONV_ENABLED 396 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) 397 #ifdef LIBXML_ISO8859X_ENABLED 398 399 /** 400 @@ -3296,4 +3533,3 @@ xmlRegisterCharEncodingHandlersISO8859x (void) { 401 402 #define bottom_encoding 403 #include "elfgcchack.h" 404 - 405 diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h 406 index c74b25f..c68ec10 100644 407 --- a/include/libxml/encoding.h 408 +++ b/include/libxml/encoding.h 409 @@ -26,6 +26,24 @@ 410 411 #ifdef LIBXML_ICONV_ENABLED 412 #include <iconv.h> 413 +#else 414 +#ifdef LIBXML_ICU_ENABLED 415 +#include <unicode/ucnv.h> 416 +#if 0 417 +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> 418 + * to prevent unwanted ICU symbols being exposed to users of libxml2. 419 + * One particular case is Qt4 conflicting on UChar32. 
420 + */ 421 +#include <stdint.h> 422 +struct UConverter; 423 +typedef struct UConverter UConverter; 424 +#ifdef _MSC_VER 425 +typedef wchar_t UChar; 426 +#else 427 +typedef uint16_t UChar; 428 +#endif 429 +#endif 430 +#endif 431 #endif 432 #ifdef __cplusplus 433 extern "C" { 434 @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, 435 * Block defining the handlers for non UTF-8 encodings. 436 * If iconv is supported, there are two extra fields. 437 */ 438 +#ifdef LIBXML_ICU_ENABLED 439 +struct _uconv_t { 440 + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ 441 + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ 442 +}; 443 +typedef struct _uconv_t uconv_t; 444 +#endif 445 446 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; 447 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; 448 @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { 449 iconv_t iconv_in; 450 iconv_t iconv_out; 451 #endif /* LIBXML_ICONV_ENABLED */ 452 +#ifdef LIBXML_ICU_ENABLED 453 + uconv_t *uconv_in; 454 + uconv_t *uconv_out; 455 +#endif /* LIBXML_ICU_ENABLED */ 456 }; 457 458 #ifdef __cplusplus 459 diff --git a/include/libxml/parser.h b/include/libxml/parser.h 460 index 567addb..bd9de24 100644 461 --- a/include/libxml/parser.h 462 +++ b/include/libxml/parser.h 463 @@ -276,6 +276,7 @@ struct _xmlParserCtxt { 464 int nsNr; /* the number of inherited namespaces */ 465 int nsMax; /* the size of the arrays */ 466 const xmlChar * *nsTab; /* the array of prefix/namespace name */ 467 + struct _xmlParserCtxt *nsParent; /* parent context to inherit namespaces from */ 468 int *attallocs; /* which attribute were allocated */ 469 void * *pushTab; /* array of data for push */ 470 xmlHashTablePtr attsDefault; /* defaulted attributes if any */ 471 @@ -1213,6 +1214,7 @@ typedef enum { 472 XML_WITH_DEBUG_MEM = 29, 473 XML_WITH_DEBUG_RUN = 30, 474 XML_WITH_ZLIB = 31, 475 + XML_WITH_ICU = 32, 476 
XML_WITH_NONE = 99999 /* just to be sure of allocation size */ 477 } xmlFeature; 478 479 @@ -1223,4 +1225,3 @@ XMLPUBFUN int XMLCALL 480 } 481 #endif 482 #endif /* __XML_PARSER_H__ */ 483 - 484 diff --git a/include/libxml/xmlversion.h b/include/libxml/xmlversion.h 485 index a98e00c..fb2b8ca 100644 486 --- a/include/libxml/xmlversion.h 487 +++ b/include/libxml/xmlversion.h 488 @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); 489 #endif 490 491 /** 492 + * LIBXML_ICU_ENABLED: 493 + * 494 + * Whether icu support is available 495 + */ 496 +#if 1 497 +#define LIBXML_ICU_ENABLED 498 +#endif 499 + 500 +/** 501 * LIBXML_ISO8859X_ENABLED: 502 * 503 * Whether ISO-8859-* support is made available in case iconv is not 504 @@ -454,5 +463,3 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); 505 } 506 #endif /* __cplusplus */ 507 #endif 508 - 509 - 510 diff --git a/parser.c b/parser.c 511 index 9db664f..306b84d 100644 512 --- a/parser.c 513 +++ b/parser.c 514 @@ -937,6 +937,12 @@ xmlHasFeature(xmlFeature feature) 515 #else 516 return(0); 517 #endif 518 + case XML_WITH_ICU: 519 +#ifdef LIBXML_ICU_ENABLED 520 + return(1); 521 +#else 522 + return(0); 523 +#endif 524 default: 525 break; 526 } 527 @@ -8189,6 +8195,7 @@ xmlGetNamespace(xmlParserCtxtPtr ctxt, const xmlChar *prefix) { 528 return(NULL); 529 return(ctxt->nsTab[i + 1]); 530 } 531 + if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix); 532 return(NULL); 533 } 534 535 @@ -12538,6 +12545,8 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt, 536 ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); 537 ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36); 538 539 + ctxt->nsParent = oldctxt; 540 + 541 oldsax = ctxt->sax; 542 ctxt->sax = oldctxt->sax; 543 xmlDetectSAX2(ctxt); 544 diff --git a/xmlregexp.c b/xmlregexp.c 545 index 73598a5..4258a08 100644 546 --- a/xmlregexp.c 547 +++ b/xmlregexp.c 548 @@ -6401,7 +6401,7 @@ 
xmlExpHashNameComputeKey(const xmlChar *name) { 549 if (name != NULL) { 550 value += 30 * (*name); 551 while ((ch = *name++) != 0) { 552 - value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 553 + value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); 554 } 555 } 556 return (value); 557 -- 558 1.7.0.1 559 560