Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * string.c : an XML string utilities module
      3  *
      4  * This module provides various utility functions for manipulating
      5  * the xmlChar* type. All functions named xmlStr* have been moved here
      6  * from the parser.c file (their original home).
      7  *
      8  * See Copyright for the status of this software.
      9  *
     10  * UTF8 string routines from:
     11  * William Brack <wbrack (at) mmm.com.hk>
     12  *
     13  * daniel (at) veillard.com
     14  */
     15 
     16 #define IN_LIBXML
     17 #include "libxml.h"
     18 
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
     24 
     25 /************************************************************************
     26  *                                                                      *
     27  *                Commodity functions to handle xmlChars                *
     28  *                                                                      *
     29  ************************************************************************/
     30 
     31 /**
     32  * xmlStrndup:
     33  * @cur:  the input xmlChar *
     34  * @len:  the len of @cur
     35  *
     36  * a strndup for array of xmlChar's
     37  *
     38  * Returns a new xmlChar * or NULL
     39  */
     40 xmlChar *
     41 xmlStrndup(const xmlChar *cur, int len) {
     42     xmlChar *ret;
     43 
     44     if ((cur == NULL) || (len < 0)) return(NULL);
     45     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
     46     if (ret == NULL) {
     47         xmlErrMemory(NULL, NULL);
     48         return(NULL);
     49     }
     50     memcpy(ret, cur, len * sizeof(xmlChar));
     51     ret[len] = 0;
     52     return(ret);
     53 }
     54 
     55 /**
     56  * xmlStrdup:
     57  * @cur:  the input xmlChar *
     58  *
     59  * a strdup for array of xmlChar's. Since they are supposed to be
     60  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
     61  * a termination mark of '0'.
     62  *
     63  * Returns a new xmlChar * or NULL
     64  */
     65 xmlChar *
     66 xmlStrdup(const xmlChar *cur) {
     67     const xmlChar *p = cur;
     68 
     69     if (cur == NULL) return(NULL);
     70     while (*p != 0) p++; /* non input consuming */
     71     return(xmlStrndup(cur, p - cur));
     72 }
     73 
     74 /**
     75  * xmlCharStrndup:
     76  * @cur:  the input char *
     77  * @len:  the len of @cur
     78  *
     79  * a strndup for char's to xmlChar's
     80  *
     81  * Returns a new xmlChar * or NULL
     82  */
     83 
     84 xmlChar *
     85 xmlCharStrndup(const char *cur, int len) {
     86     int i;
     87     xmlChar *ret;
     88 
     89     if ((cur == NULL) || (len < 0)) return(NULL);
     90     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
     91     if (ret == NULL) {
     92         xmlErrMemory(NULL, NULL);
     93         return(NULL);
     94     }
     95     for (i = 0;i < len;i++) {
     96         ret[i] = (xmlChar) cur[i];
     97         if (ret[i] == 0) return(ret);
     98     }
     99     ret[len] = 0;
    100     return(ret);
    101 }
    102 
    103 /**
    104  * xmlCharStrdup:
    105  * @cur:  the input char *
    106  *
    107  * a strdup for char's to xmlChar's
    108  *
    109  * Returns a new xmlChar * or NULL
    110  */
    111 
    112 xmlChar *
    113 xmlCharStrdup(const char *cur) {
    114     const char *p = cur;
    115 
    116     if (cur == NULL) return(NULL);
    117     while (*p != '\0') p++; /* non input consuming */
    118     return(xmlCharStrndup(cur, p - cur));
    119 }
    120 
    121 /**
    122  * xmlStrcmp:
    123  * @str1:  the first xmlChar *
    124  * @str2:  the second xmlChar *
    125  *
    126  * a strcmp for xmlChar's
    127  *
    128  * Returns the integer result of the comparison
    129  */
    130 
    131 int
    132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
    133     register int tmp;
    134 
    135     if (str1 == str2) return(0);
    136     if (str1 == NULL) return(-1);
    137     if (str2 == NULL) return(1);
    138     do {
    139         tmp = *str1++ - *str2;
    140         if (tmp != 0) return(tmp);
    141     } while (*str2++ != 0);
    142     return 0;
    143 }
    144 
    145 /**
    146  * xmlStrEqual:
    147  * @str1:  the first xmlChar *
    148  * @str2:  the second xmlChar *
    149  *
    150  * Check if both strings are equal of have same content.
    151  * Should be a bit more readable and faster than xmlStrcmp()
    152  *
    153  * Returns 1 if they are equal, 0 if they are different
    154  */
    155 
    156 int
    157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
    158     if (str1 == str2) return(1);
    159     if (str1 == NULL) return(0);
    160     if (str2 == NULL) return(0);
    161     do {
    162         if (*str1++ != *str2) return(0);
    163     } while (*str2++);
    164     return(1);
    165 }
    166 
    167 /**
    168  * xmlStrQEqual:
    169  * @pref:  the prefix of the QName
    170  * @name:  the localname of the QName
    171  * @str:  the second xmlChar *
    172  *
    173  * Check if a QName is Equal to a given string
    174  *
    175  * Returns 1 if they are equal, 0 if they are different
    176  */
    177 
    178 int
    179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
    180     if (pref == NULL) return(xmlStrEqual(name, str));
    181     if (name == NULL) return(0);
    182     if (str == NULL) return(0);
    183 
    184     do {
    185         if (*pref++ != *str) return(0);
    186     } while ((*str++) && (*pref));
    187     if (*str++ != ':') return(0);
    188     do {
    189         if (*name++ != *str) return(0);
    190     } while (*str++);
    191     return(1);
    192 }
    193 
    194 /**
    195  * xmlStrncmp:
    196  * @str1:  the first xmlChar *
    197  * @str2:  the second xmlChar *
    198  * @len:  the max comparison length
    199  *
    200  * a strncmp for xmlChar's
    201  *
    202  * Returns the integer result of the comparison
    203  */
    204 
    205 int
    206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
    207     register int tmp;
    208 
    209     if (len <= 0) return(0);
    210     if (str1 == str2) return(0);
    211     if (str1 == NULL) return(-1);
    212     if (str2 == NULL) return(1);
    213 #ifdef __GNUC__
    214     tmp = strncmp((const char *)str1, (const char *)str2, len);
    215     return tmp;
    216 #else
    217     do {
    218         tmp = *str1++ - *str2;
    219         if (tmp != 0 || --len == 0) return(tmp);
    220     } while (*str2++ != 0);
    221     return 0;
    222 #endif
    223 }
    224 
    225 static const xmlChar casemap[256] = {
    226     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    227     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    228     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    229     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    230     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    231     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    232     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    233     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    234     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    235     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    236     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    237     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
    238     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    241     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    242     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    243     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    244     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    245     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    246     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    247     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    248     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    249     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    250     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    251     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    252     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    253     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    254     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    255     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    256     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    257     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
    258 };
    259 
    260 /**
    261  * xmlStrcasecmp:
    262  * @str1:  the first xmlChar *
    263  * @str2:  the second xmlChar *
    264  *
    265  * a strcasecmp for xmlChar's
    266  *
    267  * Returns the integer result of the comparison
    268  */
    269 
    270 int
    271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
    272     register int tmp;
    273 
    274     if (str1 == str2) return(0);
    275     if (str1 == NULL) return(-1);
    276     if (str2 == NULL) return(1);
    277     do {
    278         tmp = casemap[*str1++] - casemap[*str2];
    279         if (tmp != 0) return(tmp);
    280     } while (*str2++ != 0);
    281     return 0;
    282 }
    283 
    284 /**
    285  * xmlStrncasecmp:
    286  * @str1:  the first xmlChar *
    287  * @str2:  the second xmlChar *
    288  * @len:  the max comparison length
    289  *
    290  * a strncasecmp for xmlChar's
    291  *
    292  * Returns the integer result of the comparison
    293  */
    294 
    295 int
    296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
    297     register int tmp;
    298 
    299     if (len <= 0) return(0);
    300     if (str1 == str2) return(0);
    301     if (str1 == NULL) return(-1);
    302     if (str2 == NULL) return(1);
    303     do {
    304         tmp = casemap[*str1++] - casemap[*str2];
    305         if (tmp != 0 || --len == 0) return(tmp);
    306     } while (*str2++ != 0);
    307     return 0;
    308 }
    309 
    310 /**
    311  * xmlStrchr:
    312  * @str:  the xmlChar * array
    313  * @val:  the xmlChar to search
    314  *
    315  * a strchr for xmlChar's
    316  *
    317  * Returns the xmlChar * for the first occurrence or NULL.
    318  */
    319 
    320 const xmlChar *
    321 xmlStrchr(const xmlChar *str, xmlChar val) {
    322     if (str == NULL) return(NULL);
    323     while (*str != 0) { /* non input consuming */
    324         if (*str == val) return((xmlChar *) str);
    325         str++;
    326     }
    327     return(NULL);
    328 }
    329 
    330 /**
    331  * xmlStrstr:
    332  * @str:  the xmlChar * array (haystack)
    333  * @val:  the xmlChar to search (needle)
    334  *
    335  * a strstr for xmlChar's
    336  *
    337  * Returns the xmlChar * for the first occurrence or NULL.
    338  */
    339 
    340 const xmlChar *
    341 xmlStrstr(const xmlChar *str, const xmlChar *val) {
    342     int n;
    343 
    344     if (str == NULL) return(NULL);
    345     if (val == NULL) return(NULL);
    346     n = xmlStrlen(val);
    347 
    348     if (n == 0) return(str);
    349     while (*str != 0) { /* non input consuming */
    350         if (*str == *val) {
    351             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
    352         }
    353         str++;
    354     }
    355     return(NULL);
    356 }
    357 
    358 /**
    359  * xmlStrcasestr:
    360  * @str:  the xmlChar * array (haystack)
    361  * @val:  the xmlChar to search (needle)
    362  *
    363  * a case-ignoring strstr for xmlChar's
    364  *
    365  * Returns the xmlChar * for the first occurrence or NULL.
    366  */
    367 
    368 const xmlChar *
    369 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
    370     int n;
    371 
    372     if (str == NULL) return(NULL);
    373     if (val == NULL) return(NULL);
    374     n = xmlStrlen(val);
    375 
    376     if (n == 0) return(str);
    377     while (*str != 0) { /* non input consuming */
    378         if (casemap[*str] == casemap[*val])
    379             if (!xmlStrncasecmp(str, val, n)) return(str);
    380         str++;
    381     }
    382     return(NULL);
    383 }
    384 
    385 /**
    386  * xmlStrsub:
    387  * @str:  the xmlChar * array (haystack)
    388  * @start:  the index of the first char (zero based)
    389  * @len:  the length of the substring
    390  *
    391  * Extract a substring of a given string
    392  *
    393  * Returns the xmlChar * for the first occurrence or NULL.
    394  */
    395 
    396 xmlChar *
    397 xmlStrsub(const xmlChar *str, int start, int len) {
    398     int i;
    399 
    400     if (str == NULL) return(NULL);
    401     if (start < 0) return(NULL);
    402     if (len < 0) return(NULL);
    403 
    404     for (i = 0;i < start;i++) {
    405         if (*str == 0) return(NULL);
    406         str++;
    407     }
    408     if (*str == 0) return(NULL);
    409     return(xmlStrndup(str, len));
    410 }
    411 
    412 /**
    413  * xmlStrlen:
    414  * @str:  the xmlChar * array
    415  *
    416  * length of a xmlChar's string
    417  *
    418  * Returns the number of xmlChar contained in the ARRAY.
    419  */
    420 
    421 int
    422 xmlStrlen(const xmlChar *str) {
    423     int len = 0;
    424 
    425     if (str == NULL) return(0);
    426     while (*str != 0) { /* non input consuming */
    427         str++;
    428         len++;
    429     }
    430     return(len);
    431 }
    432 
    433 /**
    434  * xmlStrncat:
    435  * @cur:  the original xmlChar * array
    436  * @add:  the xmlChar * array added
    437  * @len:  the length of @add
    438  *
    439  * a strncat for array of xmlChar's, it will extend @cur with the len
    440  * first bytes of @add. Note that if @len < 0 then this is an API error
    441  * and NULL will be returned.
    442  *
    443  * Returns a new xmlChar *, the original @cur is reallocated and should
    444  * not be freed.
    445  */
    446 
    447 xmlChar *
    448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
    449     int size;
    450     xmlChar *ret;
    451 
    452     if ((add == NULL) || (len == 0))
    453         return(cur);
    454     if (len < 0)
    455 	return(NULL);
    456     if (cur == NULL)
    457         return(xmlStrndup(add, len));
    458 
    459     size = xmlStrlen(cur);
    460     if (size < 0)
    461         return(NULL);
    462     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
    463     if (ret == NULL) {
    464         xmlErrMemory(NULL, NULL);
    465         return(cur);
    466     }
    467     memcpy(&ret[size], add, len * sizeof(xmlChar));
    468     ret[size + len] = 0;
    469     return(ret);
    470 }
    471 
    472 /**
    473  * xmlStrncatNew:
    474  * @str1:  first xmlChar string
    475  * @str2:  second xmlChar string
    476  * @len:  the len of @str2 or < 0
    477  *
    478  * same as xmlStrncat, but creates a new string.  The original
    479  * two strings are not freed. If @len is < 0 then the length
    480  * will be calculated automatically.
    481  *
    482  * Returns a new xmlChar * or NULL
    483  */
    484 xmlChar *
    485 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
    486     int size;
    487     xmlChar *ret;
    488 
    489     if (len < 0) {
    490         len = xmlStrlen(str2);
    491         if (len < 0)
    492             return(NULL);
    493     }
    494     if ((str2 == NULL) || (len == 0))
    495         return(xmlStrdup(str1));
    496     if (str1 == NULL)
    497         return(xmlStrndup(str2, len));
    498 
    499     size = xmlStrlen(str1);
    500     if (size < 0)
    501         return(NULL);
    502     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
    503     if (ret == NULL) {
    504         xmlErrMemory(NULL, NULL);
    505         return(xmlStrndup(str1, size));
    506     }
    507     memcpy(ret, str1, size * sizeof(xmlChar));
    508     memcpy(&ret[size], str2, len * sizeof(xmlChar));
    509     ret[size + len] = 0;
    510     return(ret);
    511 }
    512 
    513 /**
    514  * xmlStrcat:
    515  * @cur:  the original xmlChar * array
    516  * @add:  the xmlChar * array added
    517  *
    518  * a strcat for array of xmlChar's. Since they are supposed to be
    519  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
    520  * a termination mark of '0'.
    521  *
    522  * Returns a new xmlChar * containing the concatenated string. The original
    523  * @cur is reallocated and should not be freed.
    524  */
    525 xmlChar *
    526 xmlStrcat(xmlChar *cur, const xmlChar *add) {
    527     const xmlChar *p = add;
    528 
    529     if (add == NULL) return(cur);
    530     if (cur == NULL)
    531         return(xmlStrdup(add));
    532 
    533     while (*p != 0) p++; /* non input consuming */
    534     return(xmlStrncat(cur, add, p - add));
    535 }
    536 
    537 /**
    538  * xmlStrPrintf:
    539  * @buf:   the result buffer.
    540  * @len:   the result buffer length.
    541  * @msg:   the message with printf formatting.
    542  * @...:   extra parameters for the message.
    543  *
    544  * Formats @msg and places result into @buf.
    545  *
    546  * Returns the number of characters written to @buf or -1 if an error occurs.
    547  */
    548 int XMLCDECL
    549 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
    550     va_list args;
    551     int ret;
    552 
    553     if((buf == NULL) || (msg == NULL)) {
    554         return(-1);
    555     }
    556 
    557     va_start(args, msg);
    558     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
    559     va_end(args);
    560     buf[len - 1] = 0; /* be safe ! */
    561 
    562     return(ret);
    563 }
    564 
    565 /**
    566  * xmlStrVPrintf:
    567  * @buf:   the result buffer.
    568  * @len:   the result buffer length.
    569  * @msg:   the message with printf formatting.
    570  * @ap:    extra parameters for the message.
    571  *
    572  * Formats @msg and places result into @buf.
    573  *
    574  * Returns the number of characters written to @buf or -1 if an error occurs.
    575  */
    576 int
    577 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
    578     int ret;
    579 
    580     if((buf == NULL) || (msg == NULL)) {
    581         return(-1);
    582     }
    583 
    584     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
    585     buf[len - 1] = 0; /* be safe ! */
    586 
    587     return(ret);
    588 }
    589 
    590 /************************************************************************
    591  *                                                                      *
    592  *              Generic UTF8 handling routines                          *
    593  *                                                                      *
    594  * From rfc2044: encoding of the Unicode values on UTF-8:               *
    595  *                                                                      *
    596  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
    597  * 0000 0000-0000 007F   0xxxxxxx                                       *
    598  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
    599  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
    600  *                                                                      *
    601  * I hope we won't use values > 0xFFFF anytime soon !                   *
    602  *                                                                      *
    603  ************************************************************************/
    604 
    605 
    606 /**
    607  * xmlUTF8Size:
    608  * @utf: pointer to the UTF8 character
    609  *
    610  * calculates the internal size of a UTF8 character
    611  *
    612  * returns the numbers of bytes in the character, -1 on format error
    613  */
    614 int
    615 xmlUTF8Size(const xmlChar *utf) {
    616     xmlChar mask;
    617     int len;
    618 
    619     if (utf == NULL)
    620         return -1;
    621     if (*utf < 0x80)
    622         return 1;
    623     /* check valid UTF8 character */
    624     if (!(*utf & 0x40))
    625         return -1;
    626     /* determine number of bytes in char */
    627     len = 2;
    628     for (mask=0x20; mask != 0; mask>>=1) {
    629         if (!(*utf & mask))
    630             return len;
    631         len++;
    632     }
    633     return -1;
    634 }
    635 
    636 /**
    637  * xmlUTF8Charcmp:
    638  * @utf1: pointer to first UTF8 char
    639  * @utf2: pointer to second UTF8 char
    640  *
    641  * compares the two UCS4 values
    642  *
    643  * returns result of the compare as with xmlStrncmp
    644  */
    645 int
    646 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
    647 
    648     if (utf1 == NULL ) {
    649         if (utf2 == NULL)
    650             return 0;
    651         return -1;
    652     }
    653     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
    654 }
    655 
    656 /**
    657  * xmlUTF8Strlen:
    658  * @utf:  a sequence of UTF-8 encoded bytes
    659  *
    660  * compute the length of an UTF8 string, it doesn't do a full UTF8
    661  * checking of the content of the string.
    662  *
    663  * Returns the number of characters in the string or -1 in case of error
    664  */
    665 int
    666 xmlUTF8Strlen(const xmlChar *utf) {
    667     int ret = 0;
    668 
    669     if (utf == NULL)
    670         return(-1);
    671 
    672     while (*utf != 0) {
    673         if (utf[0] & 0x80) {
    674             if ((utf[1] & 0xc0) != 0x80)
    675                 return(-1);
    676             if ((utf[0] & 0xe0) == 0xe0) {
    677                 if ((utf[2] & 0xc0) != 0x80)
    678                     return(-1);
    679                 if ((utf[0] & 0xf0) == 0xf0) {
    680                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
    681                         return(-1);
    682                     utf += 4;
    683                 } else {
    684                     utf += 3;
    685                 }
    686             } else {
    687                 utf += 2;
    688             }
    689         } else {
    690             utf++;
    691         }
    692         ret++;
    693     }
    694     return(ret);
    695 }
    696 
    697 /**
    698  * xmlGetUTF8Char:
    699  * @utf:  a sequence of UTF-8 encoded bytes
    700  * @len:  a pointer to the minimum number of bytes present in
    701  *        the sequence.  This is used to assure the next character
    702  *        is completely contained within the sequence.
    703  *
    704  * Read the first UTF8 character from @utf
    705  *
    706  * Returns the char value or -1 in case of error, and sets *len to
    707  *        the actual number of bytes consumed (0 in case of error)
    708  */
    709 int
    710 xmlGetUTF8Char(const unsigned char *utf, int *len) {
    711     unsigned int c;
    712 
    713     if (utf == NULL)
    714         goto error;
    715     if (len == NULL)
    716         goto error;
    717     if (*len < 1)
    718         goto error;
    719 
    720     c = utf[0];
    721     if (c & 0x80) {
    722         if (*len < 2)
    723             goto error;
    724         if ((utf[1] & 0xc0) != 0x80)
    725             goto error;
    726         if ((c & 0xe0) == 0xe0) {
    727             if (*len < 3)
    728                 goto error;
    729             if ((utf[2] & 0xc0) != 0x80)
    730                 goto error;
    731             if ((c & 0xf0) == 0xf0) {
    732                 if (*len < 4)
    733                     goto error;
    734                 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
    735                     goto error;
    736                 *len = 4;
    737                 /* 4-byte code */
    738                 c = (utf[0] & 0x7) << 18;
    739                 c |= (utf[1] & 0x3f) << 12;
    740                 c |= (utf[2] & 0x3f) << 6;
    741                 c |= utf[3] & 0x3f;
    742             } else {
    743               /* 3-byte code */
    744                 *len = 3;
    745                 c = (utf[0] & 0xf) << 12;
    746                 c |= (utf[1] & 0x3f) << 6;
    747                 c |= utf[2] & 0x3f;
    748             }
    749         } else {
    750           /* 2-byte code */
    751             *len = 2;
    752             c = (utf[0] & 0x1f) << 6;
    753             c |= utf[1] & 0x3f;
    754         }
    755     } else {
    756         /* 1-byte code */
    757         *len = 1;
    758     }
    759     return(c);
    760 
    761 error:
    762     if (len != NULL)
    763 	*len = 0;
    764     return(-1);
    765 }
    766 
    767 /**
    768  * xmlCheckUTF8:
    769  * @utf: Pointer to putative UTF-8 encoded string.
    770  *
    771  * Checks @utf for being valid UTF-8. @utf is assumed to be
    772  * null-terminated. This function is not super-strict, as it will
    773  * allow longer UTF-8 sequences than necessary. Note that Java is
    774  * capable of producing these sequences if provoked. Also note, this
    775  * routine checks for the 4-byte maximum size, but does not check for
    776  * 0x10ffff maximum value.
    777  *
    778  * Return value: true if @utf is valid.
    779  **/
    780 int
    781 xmlCheckUTF8(const unsigned char *utf)
    782 {
    783     int ix;
    784     unsigned char c;
    785 
    786     if (utf == NULL)
    787         return(0);
    788     /*
    789      * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
    790      * are as follows (in "bit format"):
    791      *    0xxxxxxx                                      valid 1-byte
    792      *    110xxxxx 10xxxxxx                             valid 2-byte
    793      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
    794      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
    795      */
    796     for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
    797         if ((c & 0x80) == 0x00) {	/* 1-byte code, starts with 10 */
    798             ix++;
    799 	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
    800 	    if ((utf[ix+1] & 0xc0 ) != 0x80)
    801 	        return 0;
    802 	    ix += 2;
    803 	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
    804 	    if (((utf[ix+1] & 0xc0) != 0x80) ||
    805 	        ((utf[ix+2] & 0xc0) != 0x80))
    806 		    return 0;
    807 	    ix += 3;
    808 	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
    809 	    if (((utf[ix+1] & 0xc0) != 0x80) ||
    810 	        ((utf[ix+2] & 0xc0) != 0x80) ||
    811 		((utf[ix+3] & 0xc0) != 0x80))
    812 		    return 0;
    813 	    ix += 4;
    814 	} else				/* unknown encoding */
    815 	    return 0;
    816       }
    817       return(1);
    818 }
    819 
    820 /**
    821  * xmlUTF8Strsize:
    822  * @utf:  a sequence of UTF-8 encoded bytes
    823  * @len:  the number of characters in the array
    824  *
    825  * storage size of an UTF8 string
    826  * the behaviour is not guaranteed if the input string is not UTF-8
    827  *
    828  * Returns the storage size of
    829  * the first 'len' characters of ARRAY
    830  */
    831 
    832 int
    833 xmlUTF8Strsize(const xmlChar *utf, int len) {
    834     const xmlChar   *ptr=utf;
    835     xmlChar         ch;
    836 
    837     if (utf == NULL)
    838         return(0);
    839 
    840     if (len <= 0)
    841         return(0);
    842 
    843     while ( len-- > 0) {
    844         if ( !*ptr )
    845             break;
    846         if ( (ch = *ptr++) & 0x80)
    847             while ((ch<<=1) & 0x80 ) {
    848 		if (*ptr == 0) break;
    849                 ptr++;
    850 	    }
    851     }
    852     return (ptr - utf);
    853 }
    854 
    855 
    856 /**
    857  * xmlUTF8Strndup:
    858  * @utf:  the input UTF8 *
    859  * @len:  the len of @utf (in chars)
    860  *
    861  * a strndup for array of UTF8's
    862  *
    863  * Returns a new UTF8 * or NULL
    864  */
    865 xmlChar *
    866 xmlUTF8Strndup(const xmlChar *utf, int len) {
    867     xmlChar *ret;
    868     int i;
    869 
    870     if ((utf == NULL) || (len < 0)) return(NULL);
    871     i = xmlUTF8Strsize(utf, len);
    872     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
    873     if (ret == NULL) {
    874         xmlGenericError(xmlGenericErrorContext,
    875                 "malloc of %ld byte failed\n",
    876                 (len + 1) * (long)sizeof(xmlChar));
    877         return(NULL);
    878     }
    879     memcpy(ret, utf, i * sizeof(xmlChar));
    880     ret[i] = 0;
    881     return(ret);
    882 }
    883 
    884 /**
    885  * xmlUTF8Strpos:
    886  * @utf:  the input UTF8 *
    887  * @pos:  the position of the desired UTF8 char (in chars)
    888  *
    889  * a function to provide the equivalent of fetching a
    890  * character from a string array
    891  *
    892  * Returns a pointer to the UTF8 character or NULL
    893  */
    894 const xmlChar *
    895 xmlUTF8Strpos(const xmlChar *utf, int pos) {
    896     xmlChar ch;
    897 
    898     if (utf == NULL) return(NULL);
    899     if (pos < 0)
    900         return(NULL);
    901     while (pos--) {
    902         if ((ch=*utf++) == 0) return(NULL);
    903         if ( ch & 0x80 ) {
    904             /* if not simple ascii, verify proper format */
    905             if ( (ch & 0xc0) != 0xc0 )
    906                 return(NULL);
    907             /* then skip over remaining bytes for this char */
    908             while ( (ch <<= 1) & 0x80 )
    909                 if ( (*utf++ & 0xc0) != 0x80 )
    910                     return(NULL);
    911         }
    912     }
    913     return((xmlChar *)utf);
    914 }
    915 
    916 /**
    917  * xmlUTF8Strloc:
    918  * @utf:  the input UTF8 *
    919  * @utfchar:  the UTF8 character to be found
    920  *
    921  * a function to provide the relative location of a UTF8 char
    922  *
    923  * Returns the relative character position of the desired char
    924  * or -1 if not found
    925  */
    926 int
    927 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
    928     int i, size;
    929     xmlChar ch;
    930 
    931     if (utf==NULL || utfchar==NULL) return -1;
    932     size = xmlUTF8Strsize(utfchar, 1);
    933         for(i=0; (ch=*utf) != 0; i++) {
    934             if (xmlStrncmp(utf, utfchar, size)==0)
    935                 return(i);
    936             utf++;
    937             if ( ch & 0x80 ) {
    938                 /* if not simple ascii, verify proper format */
    939                 if ( (ch & 0xc0) != 0xc0 )
    940                     return(-1);
    941                 /* then skip over remaining bytes for this char */
    942                 while ( (ch <<= 1) & 0x80 )
    943                     if ( (*utf++ & 0xc0) != 0x80 )
    944                         return(-1);
    945             }
    946         }
    947 
    948     return(-1);
    949 }
    950 /**
    951  * xmlUTF8Strsub:
    952  * @utf:  a sequence of UTF-8 encoded bytes
    953  * @start: relative pos of first char
    954  * @len:   total number to copy
    955  *
    956  * Create a substring from a given UTF-8 string
    957  * Note:  positions are given in units of UTF-8 chars
    958  *
    959  * Returns a pointer to a newly created string
    960  * or NULL if any problem
    961  */
    962 
    963 xmlChar *
    964 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
    965     int            i;
    966     xmlChar ch;
    967 
    968     if (utf == NULL) return(NULL);
    969     if (start < 0) return(NULL);
    970     if (len < 0) return(NULL);
    971 
    972     /*
    973      * Skip over any leading chars
    974      */
    975     for (i = 0;i < start;i++) {
    976         if ((ch=*utf++) == 0) return(NULL);
    977         if ( ch & 0x80 ) {
    978             /* if not simple ascii, verify proper format */
    979             if ( (ch & 0xc0) != 0xc0 )
    980                 return(NULL);
    981             /* then skip over remaining bytes for this char */
    982             while ( (ch <<= 1) & 0x80 )
    983                 if ( (*utf++ & 0xc0) != 0x80 )
    984                     return(NULL);
    985         }
    986     }
    987 
    988     return(xmlUTF8Strndup(utf, len));
    989 }
    990 
    991 /**
    992  * xmlEscapeFormatString:
    993  * @msg:  a pointer to the string in which to escape '%' characters.
    994  * Must be a heap-allocated buffer created by libxml2 that may be
    995  * returned, or that may be freed and replaced.
    996  *
    997  * Replaces the string pointed to by 'msg' with an escaped string.
    998  * Returns the same string with all '%' characters escaped.
    999  */
   1000 xmlChar *
   1001 xmlEscapeFormatString(xmlChar **msg)
   1002 {
   1003     xmlChar *msgPtr = NULL;
   1004     xmlChar *result = NULL;
   1005     xmlChar *resultPtr = NULL;
   1006     size_t count = 0;
   1007     size_t msgLen = 0;
   1008     size_t resultLen = 0;
   1009 
   1010     if (!msg || !*msg)
   1011         return(NULL);
   1012 
   1013     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
   1014         ++msgLen;
   1015         if (*msgPtr == '%')
   1016             ++count;
   1017     }
   1018 
   1019     if (count == 0)
   1020         return(*msg);
   1021 
   1022     resultLen = msgLen + count + 1;
   1023     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
   1024     if (result == NULL) {
   1025         /* Clear *msg to prevent format string vulnerabilities in
   1026            out-of-memory situations. */
   1027         xmlFree(*msg);
   1028         *msg = NULL;
   1029         xmlErrMemory(NULL, NULL);
   1030         return(NULL);
   1031     }
   1032 
   1033     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
   1034         *resultPtr = *msgPtr;
   1035         if (*msgPtr == '%')
   1036             *(++resultPtr) = '%';
   1037     }
   1038     result[resultLen - 1] = '\0';
   1039 
   1040     xmlFree(*msg);
   1041     *msg = result;
   1042 
   1043     return *msg;
   1044 }
   1045 
   1046 #define bottom_xmlstring
   1047 #include "elfgcchack.h"
   1048