Home | History | Annotate | Download | only in libxml2
      1 /**
      2  * Test the UTF-8 decoding routines
      3  *
      4  * author: Daniel Veillard
      5  * copy: see Copyright for the status of this software.
      6  */
      7 
      8 #include <stdio.h>
      9 #include <string.h>
     10 #include <libxml/parser.h>
     11 #include <libxml/parserInternals.h>
     12 
     13 #include "buf.h"
     14 
     15 int lastError;
     16 
     17 static void errorHandler(void *unused, xmlErrorPtr err) {
     18     if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
     19         lastError = err->code;
     20     }
     21 }
     22 
/*
 * Document templates for the in-context tests.  The "XXXX" run is a
 * four byte placeholder that the tests overwrite with probe bytes:
 * it starts at offset 5 in document1 (element content) and at
 * offset 10 in document2 (attribute value).
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
     25 
     26 static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
     27                   int len,  char *data, int forbid1, int forbid2) {
     28     int i;
     29     xmlDocPtr res;
     30 
     31     for (i = 0;i <= 0xFF;i++) {
     32 	lastError = 0;
     33 	xmlCtxtReset(ctxt);
     34 
     35         data[0] = i;
     36 
     37 	res = xmlReadMemory(document, len, "test", NULL, 0);
     38 
     39 	if ((i == forbid1) || (i == forbid2)) {
     40 	    if ((lastError == 0) || (res != NULL))
     41 	        fprintf(stderr,
     42 		    "Failed to detect invalid char for Byte 0x%02X: %c\n",
     43 		        i, i);
     44 	}
     45 
     46 	else if ((i == '<') || (i == '&')) {
     47 	    if ((lastError == 0) || (res != NULL))
     48 	        fprintf(stderr,
     49 		    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
     50 	}
     51 	else if (((i < 0x20) || (i >= 0x80)) &&
     52 	    (i != 0x9) && (i != 0xA) && (i != 0xD)) {
     53 	    if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
     54 	        fprintf(stderr,
     55 		    "Failed to detect invalid char for Byte 0x%02X\n", i);
     56 	}
     57 	else if (res == NULL) {
     58 	    fprintf(stderr,
     59 		"Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
     60 	}
     61 	if (res != NULL)
     62 	    xmlFreeDoc(res);
     63     }
     64 }
     65 
     66 static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
     67                   int len,  char *data) {
     68     int i, j;
     69     xmlDocPtr res;
     70 
     71     for (i = 0x80;i <= 0xFF;i++) {
     72     for (j = 0;j <= 0xFF;j++) {
     73 	lastError = 0;
     74 	xmlCtxtReset(ctxt);
     75 
     76         data[0] = i;
     77         data[1] = j;
     78 
     79 	res = xmlReadMemory(document, len, "test", NULL, 0);
     80 
     81 	/* if first bit of first char is set, then second bit must too */
     82 	if ((i & 0x80) && ((i & 0x40) == 0)) {
     83 	    if ((lastError == 0) || (res != NULL))
     84 		fprintf(stderr,
     85 		"Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
     86 			i, j);
     87 	}
     88 
     89 	/*
     90 	 * if first bit of first char is set, then second char first
     91 	 * bits must be 10
     92 	 */
     93 	else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
     94 	    if ((lastError == 0) || (res != NULL))
     95 		fprintf(stderr,
     96 	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
     97 			i, j);
     98 	}
     99 
    100 	/*
    101 	 * if using a 2 byte encoding then the value must be greater
    102 	 * than 0x80, i.e. one of bits 5 to 1 of i must be set
    103 	 */
    104 	else if ((i & 0x80) && ((i & 0x1E) == 0)) {
    105 	    if ((lastError == 0) || (res != NULL))
    106 		fprintf(stderr,
    107 	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
    108 			i, j);
    109 	}
    110 
    111 	/*
    112 	 * if third bit of first char is set, then the sequence would need
    113 	 * at least 3 bytes, but we give only 2 !
    114 	 */
    115 	else if ((i & 0xE0) == 0xE0) {
    116 	    if ((lastError == 0) || (res != NULL))
    117 		fprintf(stderr,
    118 	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
    119 			i, j);
    120 	}
    121 
    122 	/*
    123 	 * We should see no error in remaning cases
    124 	 */
    125 	else if ((lastError != 0) || (res == NULL)) {
    126 	    fprintf(stderr,
    127 		"Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
    128 	}
    129 	if (res != NULL)
    130 	    xmlFreeDoc(res);
    131     }
    132     }
    133 }
    134 
    135 /**
    136  * testDocumentRanges:
    137  *
    138  * Test the correct UTF8 character parsing in context of XML documents
    139  * Those are in-context injection tests checking the parser behaviour on
    140  * edge case values at different point in content, beginning and end of
    141  * CDATA in text or in attribute values.
    142  */
    143 
    144 static void testDocumentRanges(void) {
    145     xmlParserCtxtPtr ctxt;
    146     char *data;
    147 
    148     /*
    149      * Set up a parsing context using the first document as
    150      * the current input source.
    151      */
    152     ctxt = xmlNewParserCtxt();
    153     if (ctxt == NULL) {
    154         fprintf(stderr, "Failed to allocate parser context\n");
    155 	return;
    156     }
    157 
    158     printf("testing 1 byte char in document: 1");
    159     fflush(stdout);
    160     data = &document1[5];
    161     data[0] = ' ';
    162     data[1] = ' ';
    163     data[2] = ' ';
    164     data[3] = ' ';
    165     /* test 1 byte injection at beginning of area */
    166     testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
    167                            data, -1, -1);
    168     printf(" 2");
    169     fflush(stdout);
    170     data[0] = ' ';
    171     data[1] = ' ';
    172     data[2] = ' ';
    173     data[3] = ' ';
    174     /* test 1 byte injection at end of area */
    175     testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
    176                            data + 3, -1, -1);
    177 
    178     printf(" 3");
    179     fflush(stdout);
    180     data = &document2[10];
    181     data[0] = ' ';
    182     data[1] = ' ';
    183     data[2] = ' ';
    184     data[3] = ' ';
    185     /* test 1 byte injection at beginning of area */
    186     testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
    187                            data, '\'', -1);
    188     printf(" 4");
    189     fflush(stdout);
    190     data[0] = ' ';
    191     data[1] = ' ';
    192     data[2] = ' ';
    193     data[3] = ' ';
    194     /* test 1 byte injection at end of area */
    195     testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
    196                            data + 3, '\'', -1);
    197     printf(" done\n");
    198 
    199     printf("testing 2 byte char in document: 1");
    200     fflush(stdout);
    201     data = &document1[5];
    202     data[0] = ' ';
    203     data[1] = ' ';
    204     data[2] = ' ';
    205     data[3] = ' ';
    206     /* test 2 byte injection at beginning of area */
    207     testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
    208                            data);
    209     printf(" 2");
    210     fflush(stdout);
    211     data[0] = ' ';
    212     data[1] = ' ';
    213     data[2] = ' ';
    214     data[3] = ' ';
    215     /* test 2 byte injection at end of area */
    216     testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
    217                            data + 2);
    218 
    219     printf(" 3");
    220     fflush(stdout);
    221     data = &document2[10];
    222     data[0] = ' ';
    223     data[1] = ' ';
    224     data[2] = ' ';
    225     data[3] = ' ';
    226     /* test 2 byte injection at beginning of area */
    227     testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
    228                            data);
    229     printf(" 4");
    230     fflush(stdout);
    231     data[0] = ' ';
    232     data[1] = ' ';
    233     data[2] = ' ';
    234     data[3] = ' ';
    235     /* test 2 byte injection at end of area */
    236     testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
    237                            data + 2);
    238     printf(" done\n");
    239 
    240     xmlFreeParserCtxt(ctxt);
    241 }
    242 
    243 static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
    244     int i = 0;
    245     int len, c;
    246 
    247     data[1] = 0;
    248     data[2] = 0;
    249     data[3] = 0;
    250     for (i = 0;i <= 0xFF;i++) {
    251         data[0] = i;
    252 	ctxt->charset = XML_CHAR_ENCODING_UTF8;
    253 
    254 	lastError = 0;
    255         c = xmlCurrentChar(ctxt, &len);
    256 	if ((i == 0) || (i >= 0x80)) {
    257 	    /* we must see an error there */
    258 	    if (lastError != XML_ERR_INVALID_CHAR)
    259 	        fprintf(stderr,
    260 		    "Failed to detect invalid char for Byte 0x%02X\n", i);
    261 	} else if (i == 0xD) {
    262 	    if ((c != 0xA) || (len != 1))
    263 		fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
    264 	} else if ((c != i) || (len != 1)) {
    265 	    fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
    266 	}
    267     }
    268 }
    269 
    270 static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
    271     int i, j;
    272     int len, c;
    273 
    274     data[2] = 0;
    275     data[3] = 0;
    276     for (i = 0x80;i <= 0xFF;i++) {
    277 	for (j = 0;j <= 0xFF;j++) {
    278 	    data[0] = i;
    279 	    data[1] = j;
    280 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
    281 
    282 	    lastError = 0;
    283 	    c = xmlCurrentChar(ctxt, &len);
    284 
    285 	    /* if first bit of first char is set, then second bit must too */
    286 	    if ((i & 0x80) && ((i & 0x40) == 0)) {
    287 		if (lastError != XML_ERR_INVALID_CHAR)
    288 		    fprintf(stderr,
    289 		    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
    290 		            i, j);
    291 	    }
    292 
    293 	    /*
    294 	     * if first bit of first char is set, then second char first
    295 	     * bits must be 10
    296 	     */
    297 	    else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
    298 		if (lastError != XML_ERR_INVALID_CHAR)
    299 		    fprintf(stderr,
    300 		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
    301 		            i, j, c);
    302 	    }
    303 
    304 	    /*
    305 	     * if using a 2 byte encoding then the value must be greater
    306 	     * than 0x80, i.e. one of bits 5 to 1 of i must be set
    307 	     */
    308 	    else if ((i & 0x80) && ((i & 0x1E) == 0)) {
    309 		if (lastError != XML_ERR_INVALID_CHAR)
    310 		    fprintf(stderr,
    311 		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
    312 		            i, j, c);
    313 	    }
    314 
    315 	    /*
    316 	     * if third bit of first char is set, then the sequence would need
    317 	     * at least 3 bytes, but we give only 2 !
    318 	     */
    319 	    else if ((i & 0xE0) == 0xE0) {
    320 		if (lastError != XML_ERR_INVALID_CHAR)
    321 		    fprintf(stderr,
    322 		"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
    323 		            i, j);
    324 	    }
    325 
    326             /*
    327 	     * We should see no error in remaning cases
    328 	     */
    329 	    else if ((lastError != 0) || (len != 2)) {
    330 		fprintf(stderr,
    331 		    "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
    332 	    }
    333 
    334             /*
    335 	     * Finally check the value is right
    336 	     */
    337 	    else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
    338 		fprintf(stderr,
    339 	"Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
    340 	                i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
    341 	    }
    342         }
    343     }
    344 }
    345 
    346 static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
    347     int i, j, k, K;
    348     int len, c;
    349     unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    350     int value;
    351 
    352     data[3] = 0;
    353     for (i = 0xE0;i <= 0xFF;i++) {
    354     for (j = 0;j <= 0xFF;j++) {
    355     for (k = 0;k < 6;k++) {
    356 	data[0] = i;
    357 	data[1] = j;
    358 	K = lows[k];
    359 	data[2] = (char) K;
    360 	value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
    361 	ctxt->charset = XML_CHAR_ENCODING_UTF8;
    362 
    363 	lastError = 0;
    364 	c = xmlCurrentChar(ctxt, &len);
    365 
    366 	/*
    367 	 * if fourth bit of first char is set, then the sequence would need
    368 	 * at least 4 bytes, but we give only 3 !
    369 	 */
    370 	if ((i & 0xF0) == 0xF0) {
    371 	    if (lastError != XML_ERR_INVALID_CHAR)
    372 		fprintf(stderr,
    373 	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
    374 			i, j, K, data[3]);
    375 	}
    376 
    377         /*
    378 	 * The second and the third bytes must start with 10
    379 	 */
    380 	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
    381 	    if (lastError != XML_ERR_INVALID_CHAR)
    382 		fprintf(stderr,
    383 	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
    384 			i, j, K);
    385 	}
    386 
    387 	/*
    388 	 * if using a 3 byte encoding then the value must be greater
    389 	 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
    390 	 * the 6th byte of data[1] must be set
    391 	 */
    392 	else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
    393 	    if (lastError != XML_ERR_INVALID_CHAR)
    394 		fprintf(stderr,
    395 	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
    396 			i, j, K);
    397 	}
    398 
    399         /*
    400 	 * There are values in that range that are not allowed in XML-1.0
    401 	 */
    402 	else if (((value > 0xD7FF) && (value <0xE000)) ||
    403 	         ((value > 0xFFFD) && (value <0x10000))) {
    404 	    if (lastError != XML_ERR_INVALID_CHAR)
    405 		fprintf(stderr,
    406 	"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
    407 			value, i, j, K);
    408 	}
    409 
    410 	/*
    411 	 * We should see no error in remaining cases
    412 	 */
    413 	else if ((lastError != 0) || (len != 3)) {
    414 	    fprintf(stderr,
    415 		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
    416 		    i, j, K);
    417 	}
    418 
    419 	/*
    420 	 * Finally check the value is right
    421 	 */
    422 	else if (c != value) {
    423 	    fprintf(stderr,
    424     "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
    425 		i, j, data[2], value, c);
    426 	}
    427     }
    428     }
    429     }
    430 }
    431 
    432 static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
    433     int i, j, k, K, l, L;
    434     int len, c;
    435     unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    436     int value;
    437 
    438     data[4] = 0;
    439     for (i = 0xF0;i <= 0xFF;i++) {
    440     for (j = 0;j <= 0xFF;j++) {
    441     for (k = 0;k < 6;k++) {
    442     for (l = 0;l < 6;l++) {
    443 	data[0] = i;
    444 	data[1] = j;
    445 	K = lows[k];
    446 	data[2] = (char) K;
    447 	L = lows[l];
    448 	data[3] = (char) L;
    449 	value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
    450 	        ((i & 0x7) << 18);
    451 	ctxt->charset = XML_CHAR_ENCODING_UTF8;
    452 
    453 	lastError = 0;
    454 	c = xmlCurrentChar(ctxt, &len);
    455 
    456 	/*
    457 	 * if fifth bit of first char is set, then the sequence would need
    458 	 * at least 5 bytes, but we give only 4 !
    459 	 */
    460 	if ((i & 0xF8) == 0xF8) {
    461 	    if (lastError != XML_ERR_INVALID_CHAR)
    462 		fprintf(stderr,
    463   "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
    464 			i, j, K, data[3]);
    465 	}
    466 
    467         /*
    468 	 * The second, third and fourth bytes must start with 10
    469 	 */
    470 	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
    471 	         ((L & 0xC0) != 0x80)) {
    472 	    if (lastError != XML_ERR_INVALID_CHAR)
    473 		fprintf(stderr,
    474 	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
    475 			i, j, K, L);
    476 	}
    477 
    478 	/*
    479 	 * if using a 3 byte encoding then the value must be greater
    480 	 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
    481 	 * the 6 or 5th byte of j must be set
    482 	 */
    483 	else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
    484 	    if (lastError != XML_ERR_INVALID_CHAR)
    485 		fprintf(stderr,
    486 	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
    487 			i, j, K, L);
    488 	}
    489 
    490         /*
    491 	 * There are values in that range that are not allowed in XML-1.0
    492 	 */
    493 	else if (((value > 0xD7FF) && (value <0xE000)) ||
    494 	         ((value > 0xFFFD) && (value <0x10000)) ||
    495 		 (value > 0x10FFFF)) {
    496 	    if (lastError != XML_ERR_INVALID_CHAR)
    497 		fprintf(stderr,
    498 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
    499 			value, i, j, K, L);
    500 	}
    501 
    502 	/*
    503 	 * We should see no error in remaining cases
    504 	 */
    505 	else if ((lastError != 0) || (len != 4)) {
    506 	    fprintf(stderr,
    507 		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
    508 		    i, j, K);
    509 	}
    510 
    511 	/*
    512 	 * Finally check the value is right
    513 	 */
    514 	else if (c != value) {
    515 	    fprintf(stderr,
    516     "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
    517 		i, j, data[2], value, c);
    518 	}
    519     }
    520     }
    521     }
    522     }
    523 }
    524 
    525 /**
    526  * testCharRanges:
    527  *
    528  * Test the correct UTF8 character parsing in isolation i.e.
    529  * not when parsing a full document, this is less expensive and we can
    530  * cover the full range of UTF-8 chars accepted by XML-1.0
    531  */
    532 
    533 static void testCharRanges(void) {
    534     char data[5];
    535     xmlParserCtxtPtr ctxt;
    536     xmlParserInputBufferPtr buf;
    537     xmlParserInputPtr input;
    538 
    539     memset(data, 0, 5);
    540 
    541     /*
    542      * Set up a parsing context using the above data buffer as
    543      * the current input source.
    544      */
    545     ctxt = xmlNewParserCtxt();
    546     if (ctxt == NULL) {
    547         fprintf(stderr, "Failed to allocate parser context\n");
    548 	return;
    549     }
    550     buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
    551                                            XML_CHAR_ENCODING_NONE);
    552     if (buf == NULL) {
    553         fprintf(stderr, "Failed to allocate input buffer\n");
    554 	goto error;
    555     }
    556     input = xmlNewInputStream(ctxt);
    557     if (input == NULL) {
    558         xmlFreeParserInputBuffer(buf);
    559 	goto error;
    560     }
    561     input->filename = NULL;
    562     input->buf = buf;
    563     input->cur =
    564     input->base = xmlBufContent(input->buf->buffer);
    565     input->end = input->base + 4;
    566     inputPush(ctxt, input);
    567 
    568     printf("testing char range: 1");
    569     fflush(stdout);
    570     testCharRangeByte1(ctxt, data);
    571     printf(" 2");
    572     fflush(stdout);
    573     testCharRangeByte2(ctxt, data);
    574     printf(" 3");
    575     fflush(stdout);
    576     testCharRangeByte3(ctxt, data);
    577     printf(" 4");
    578     fflush(stdout);
    579     testCharRangeByte4(ctxt, data);
    580     printf(" done\n");
    581     fflush(stdout);
    582 
    583 error:
    584     xmlFreeParserCtxt(ctxt);
    585 }
    586 
    587 int main(void) {
    588 
    589     /*
    590      * this initialize the library and check potential ABI mismatches
    591      * between the version it was compiled for and the actual shared
    592      * library used.
    593      */
    594     LIBXML_TEST_VERSION
    595 
    596     /*
    597      * Catch errors separately
    598      */
    599 
    600     xmlSetStructuredErrorFunc(NULL, errorHandler);
    601 
    602     /*
    603      * Run the tests
    604      */
    605     testCharRanges();
    606     testDocumentRanges();
    607 
    608     /*
    609      * Cleanup function for the XML library.
    610      */
    611     xmlCleanupParser();
    612     /*
    613      * this is to debug memory for regression tests
    614      */
    615     xmlMemoryDump();
    616     return(0);
    617 }
    618