Home | History | Annotate | Download | only in libxml2
      1 /*
      2  * string.c : an XML string utilities module
      3  *
      4  * This module provides various utility functions for manipulating
      5  * the xmlChar* type. All functions named xmlStr* have been moved here
      6  * from the parser.c file (their original home).
      7  *
      8  * See Copyright for the status of this software.
      9  *
     10  * UTF8 string routines from:
     11  * William Brack <wbrack (at) mmm.com.hk>
     12  *
     13  * daniel (at) veillard.com
     14  */
     15 
     16 #define IN_LIBXML
     17 #include "libxml.h"
     18 
     19 #include <stdlib.h>
     20 #include <string.h>
     21 #include <libxml/xmlmemory.h>
     22 #include <libxml/parserInternals.h>
     23 #include <libxml/xmlstring.h>
     24 
     25 /************************************************************************
     26  *                                                                      *
     27  *                Commodity functions to handle xmlChars                *
     28  *                                                                      *
     29  ************************************************************************/
     30 
     31 /**
     32  * xmlStrndup:
     33  * @cur:  the input xmlChar *
     34  * @len:  the len of @cur
     35  *
     36  * a strndup for array of xmlChar's
     37  *
     38  * Returns a new xmlChar * or NULL
     39  */
     40 xmlChar *
     41 xmlStrndup(const xmlChar *cur, int len) {
     42     xmlChar *ret;
     43 
     44     if ((cur == NULL) || (len < 0)) return(NULL);
     45     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
     46     if (ret == NULL) {
     47         xmlErrMemory(NULL, NULL);
     48         return(NULL);
     49     }
     50     memcpy(ret, cur, len * sizeof(xmlChar));
     51     ret[len] = 0;
     52     return(ret);
     53 }
     54 
     55 /**
     56  * xmlStrdup:
     57  * @cur:  the input xmlChar *
     58  *
     59  * a strdup for array of xmlChar's. Since they are supposed to be
     60  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
     61  * a termination mark of '0'.
     62  *
     63  * Returns a new xmlChar * or NULL
     64  */
     65 xmlChar *
     66 xmlStrdup(const xmlChar *cur) {
     67     const xmlChar *p = cur;
     68 
     69     if (cur == NULL) return(NULL);
     70     while (*p != 0) p++; /* non input consuming */
     71     return(xmlStrndup(cur, p - cur));
     72 }
     73 
     74 /**
     75  * xmlCharStrndup:
     76  * @cur:  the input char *
     77  * @len:  the len of @cur
     78  *
     79  * a strndup for char's to xmlChar's
     80  *
     81  * Returns a new xmlChar * or NULL
     82  */
     83 
     84 xmlChar *
     85 xmlCharStrndup(const char *cur, int len) {
     86     int i;
     87     xmlChar *ret;
     88 
     89     if ((cur == NULL) || (len < 0)) return(NULL);
     90     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
     91     if (ret == NULL) {
     92         xmlErrMemory(NULL, NULL);
     93         return(NULL);
     94     }
     95     for (i = 0;i < len;i++) {
     96         ret[i] = (xmlChar) cur[i];
     97         if (ret[i] == 0) return(ret);
     98     }
     99     ret[len] = 0;
    100     return(ret);
    101 }
    102 
    103 /**
    104  * xmlCharStrdup:
    105  * @cur:  the input char *
    106  *
    107  * a strdup for char's to xmlChar's
    108  *
    109  * Returns a new xmlChar * or NULL
    110  */
    111 
    112 xmlChar *
    113 xmlCharStrdup(const char *cur) {
    114     const char *p = cur;
    115 
    116     if (cur == NULL) return(NULL);
    117     while (*p != '\0') p++; /* non input consuming */
    118     return(xmlCharStrndup(cur, p - cur));
    119 }
    120 
    121 /**
    122  * xmlStrcmp:
    123  * @str1:  the first xmlChar *
    124  * @str2:  the second xmlChar *
    125  *
    126  * a strcmp for xmlChar's
    127  *
    128  * Returns the integer result of the comparison
    129  */
    130 
    131 int
    132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
    133     register int tmp;
    134 
    135     if (str1 == str2) return(0);
    136     if (str1 == NULL) return(-1);
    137     if (str2 == NULL) return(1);
    138     do {
    139         tmp = *str1++ - *str2;
    140         if (tmp != 0) return(tmp);
    141     } while (*str2++ != 0);
    142     return 0;
    143 }
    144 
    145 /**
    146  * xmlStrEqual:
    147  * @str1:  the first xmlChar *
    148  * @str2:  the second xmlChar *
    149  *
    150  * Check if both strings are equal of have same content.
    151  * Should be a bit more readable and faster than xmlStrcmp()
    152  *
    153  * Returns 1 if they are equal, 0 if they are different
    154  */
    155 
    156 int
    157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
    158     if (str1 == str2) return(1);
    159     if (str1 == NULL) return(0);
    160     if (str2 == NULL) return(0);
    161     do {
    162         if (*str1++ != *str2) return(0);
    163     } while (*str2++);
    164     return(1);
    165 }
    166 
    167 /**
    168  * xmlStrQEqual:
    169  * @pref:  the prefix of the QName
    170  * @name:  the localname of the QName
    171  * @str:  the second xmlChar *
    172  *
    173  * Check if a QName is Equal to a given string
    174  *
    175  * Returns 1 if they are equal, 0 if they are different
    176  */
    177 
    178 int
    179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
    180     if (pref == NULL) return(xmlStrEqual(name, str));
    181     if (name == NULL) return(0);
    182     if (str == NULL) return(0);
    183 
    184     do {
    185         if (*pref++ != *str) return(0);
    186     } while ((*str++) && (*pref));
    187     if (*str++ != ':') return(0);
    188     do {
    189         if (*name++ != *str) return(0);
    190     } while (*str++);
    191     return(1);
    192 }
    193 
    194 /**
    195  * xmlStrncmp:
    196  * @str1:  the first xmlChar *
    197  * @str2:  the second xmlChar *
    198  * @len:  the max comparison length
    199  *
    200  * a strncmp for xmlChar's
    201  *
    202  * Returns the integer result of the comparison
    203  */
    204 
    205 int
    206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
    207     register int tmp;
    208 
    209     if (len <= 0) return(0);
    210     if (str1 == str2) return(0);
    211     if (str1 == NULL) return(-1);
    212     if (str2 == NULL) return(1);
    213 #ifdef __GNUC__
    214     tmp = strncmp((const char *)str1, (const char *)str2, len);
    215     return tmp;
    216 #else
    217     do {
    218         tmp = *str1++ - *str2;
    219         if (tmp != 0 || --len == 0) return(tmp);
    220     } while (*str2++ != 0);
    221     return 0;
    222 #endif
    223 }
    224 
    225 static const xmlChar casemap[256] = {
    226     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    227     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    228     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    229     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    230     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    231     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    232     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    233     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    234     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    235     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    236     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    237     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
    238     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    241     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    242     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    243     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    244     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    245     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    246     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    247     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    248     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    249     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    250     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    251     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    252     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    253     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    254     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    255     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    256     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    257     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
    258 };
    259 
    260 /**
    261  * xmlStrcasecmp:
    262  * @str1:  the first xmlChar *
    263  * @str2:  the second xmlChar *
    264  *
    265  * a strcasecmp for xmlChar's
    266  *
    267  * Returns the integer result of the comparison
    268  */
    269 
    270 int
    271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
    272     register int tmp;
    273 
    274     if (str1 == str2) return(0);
    275     if (str1 == NULL) return(-1);
    276     if (str2 == NULL) return(1);
    277     do {
    278         tmp = casemap[*str1++] - casemap[*str2];
    279         if (tmp != 0) return(tmp);
    280     } while (*str2++ != 0);
    281     return 0;
    282 }
    283 
    284 /**
    285  * xmlStrncasecmp:
    286  * @str1:  the first xmlChar *
    287  * @str2:  the second xmlChar *
    288  * @len:  the max comparison length
    289  *
    290  * a strncasecmp for xmlChar's
    291  *
    292  * Returns the integer result of the comparison
    293  */
    294 
    295 int
    296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
    297     register int tmp;
    298 
    299     if (len <= 0) return(0);
    300     if (str1 == str2) return(0);
    301     if (str1 == NULL) return(-1);
    302     if (str2 == NULL) return(1);
    303     do {
    304         tmp = casemap[*str1++] - casemap[*str2];
    305         if (tmp != 0 || --len == 0) return(tmp);
    306     } while (*str2++ != 0);
    307     return 0;
    308 }
    309 
    310 /**
    311  * xmlStrchr:
    312  * @str:  the xmlChar * array
    313  * @val:  the xmlChar to search
    314  *
    315  * a strchr for xmlChar's
    316  *
    317  * Returns the xmlChar * for the first occurrence or NULL.
    318  */
    319 
    320 const xmlChar *
    321 xmlStrchr(const xmlChar *str, xmlChar val) {
    322     if (str == NULL) return(NULL);
    323     while (*str != 0) { /* non input consuming */
    324         if (*str == val) return((xmlChar *) str);
    325         str++;
    326     }
    327     return(NULL);
    328 }
    329 
    330 /**
    331  * xmlStrstr:
    332  * @str:  the xmlChar * array (haystack)
    333  * @val:  the xmlChar to search (needle)
    334  *
    335  * a strstr for xmlChar's
    336  *
    337  * Returns the xmlChar * for the first occurrence or NULL.
    338  */
    339 
    340 const xmlChar *
    341 xmlStrstr(const xmlChar *str, const xmlChar *val) {
    342     int n;
    343 
    344     if (str == NULL) return(NULL);
    345     if (val == NULL) return(NULL);
    346     n = xmlStrlen(val);
    347 
    348     if (n == 0) return(str);
    349     while (*str != 0) { /* non input consuming */
    350         if (*str == *val) {
    351             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
    352         }
    353         str++;
    354     }
    355     return(NULL);
    356 }
    357 
    358 /**
    359  * xmlStrcasestr:
    360  * @str:  the xmlChar * array (haystack)
    361  * @val:  the xmlChar to search (needle)
    362  *
    363  * a case-ignoring strstr for xmlChar's
    364  *
    365  * Returns the xmlChar * for the first occurrence or NULL.
    366  */
    367 
    368 const xmlChar *
    369 xmlStrcasestr(const xmlChar *str, xmlChar *val) {
    370     int n;
    371 
    372     if (str == NULL) return(NULL);
    373     if (val == NULL) return(NULL);
    374     n = xmlStrlen(val);
    375 
    376     if (n == 0) return(str);
    377     while (*str != 0) { /* non input consuming */
    378         if (casemap[*str] == casemap[*val])
    379             if (!xmlStrncasecmp(str, val, n)) return(str);
    380         str++;
    381     }
    382     return(NULL);
    383 }
    384 
    385 /**
    386  * xmlStrsub:
    387  * @str:  the xmlChar * array (haystack)
    388  * @start:  the index of the first char (zero based)
    389  * @len:  the length of the substring
    390  *
    391  * Extract a substring of a given string
    392  *
    393  * Returns the xmlChar * for the first occurrence or NULL.
    394  */
    395 
    396 xmlChar *
    397 xmlStrsub(const xmlChar *str, int start, int len) {
    398     int i;
    399 
    400     if (str == NULL) return(NULL);
    401     if (start < 0) return(NULL);
    402     if (len < 0) return(NULL);
    403 
    404     for (i = 0;i < start;i++) {
    405         if (*str == 0) return(NULL);
    406         str++;
    407     }
    408     if (*str == 0) return(NULL);
    409     return(xmlStrndup(str, len));
    410 }
    411 
    412 /**
    413  * xmlStrlen:
    414  * @str:  the xmlChar * array
    415  *
    416  * length of a xmlChar's string
    417  *
    418  * Returns the number of xmlChar contained in the ARRAY.
    419  */
    420 
    421 int
    422 xmlStrlen(const xmlChar *str) {
    423     int len = 0;
    424 
    425     if (str == NULL) return(0);
    426     while (*str != 0) { /* non input consuming */
    427         str++;
    428         len++;
    429     }
    430     return(len);
    431 }
    432 
    433 /**
    434  * xmlStrncat:
    435  * @cur:  the original xmlChar * array
    436  * @add:  the xmlChar * array added
    437  * @len:  the length of @add
    438  *
    439  * a strncat for array of xmlChar's, it will extend @cur with the len
    440  * first bytes of @add. Note that if @len < 0 then this is an API error
    441  * and NULL will be returned.
    442  *
    443  * Returns a new xmlChar *, the original @cur is reallocated if needed
    444  * and should not be freed
    445  */
    446 
    447 xmlChar *
    448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
    449     int size;
    450     xmlChar *ret;
    451 
    452     if ((add == NULL) || (len == 0))
    453         return(cur);
    454     if (len < 0)
    455 	return(NULL);
    456     if (cur == NULL)
    457         return(xmlStrndup(add, len));
    458 
    459     size = xmlStrlen(cur);
    460     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
    461     if (ret == NULL) {
    462         xmlErrMemory(NULL, NULL);
    463         return(cur);
    464     }
    465     memcpy(&ret[size], add, len * sizeof(xmlChar));
    466     ret[size + len] = 0;
    467     return(ret);
    468 }
    469 
    470 /**
    471  * xmlStrncatNew:
    472  * @str1:  first xmlChar string
    473  * @str2:  second xmlChar string
    474  * @len:  the len of @str2 or < 0
    475  *
    476  * same as xmlStrncat, but creates a new string.  The original
    477  * two strings are not freed. If @len is < 0 then the length
    478  * will be calculated automatically.
    479  *
    480  * Returns a new xmlChar * or NULL
    481  */
    482 xmlChar *
    483 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
    484     int size;
    485     xmlChar *ret;
    486 
    487     if (len < 0)
    488         len = xmlStrlen(str2);
    489     if ((str2 == NULL) || (len == 0))
    490         return(xmlStrdup(str1));
    491     if (str1 == NULL)
    492         return(xmlStrndup(str2, len));
    493 
    494     size = xmlStrlen(str1);
    495     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
    496     if (ret == NULL) {
    497         xmlErrMemory(NULL, NULL);
    498         return(xmlStrndup(str1, size));
    499     }
    500     memcpy(ret, str1, size * sizeof(xmlChar));
    501     memcpy(&ret[size], str2, len * sizeof(xmlChar));
    502     ret[size + len] = 0;
    503     return(ret);
    504 }
    505 
    506 /**
    507  * xmlStrcat:
    508  * @cur:  the original xmlChar * array
    509  * @add:  the xmlChar * array added
    510  *
    511  * a strcat for array of xmlChar's. Since they are supposed to be
    512  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
    513  * a termination mark of '0'.
    514  *
    515  * Returns a new xmlChar * containing the concatenated string.
    516  */
    517 xmlChar *
    518 xmlStrcat(xmlChar *cur, const xmlChar *add) {
    519     const xmlChar *p = add;
    520 
    521     if (add == NULL) return(cur);
    522     if (cur == NULL)
    523         return(xmlStrdup(add));
    524 
    525     while (*p != 0) p++; /* non input consuming */
    526     return(xmlStrncat(cur, add, p - add));
    527 }
    528 
    529 /**
    530  * xmlStrPrintf:
    531  * @buf:   the result buffer.
    532  * @len:   the result buffer length.
    533  * @msg:   the message with printf formatting.
    534  * @...:   extra parameters for the message.
    535  *
    536  * Formats @msg and places result into @buf.
    537  *
    538  * Returns the number of characters written to @buf or -1 if an error occurs.
    539  */
    540 int XMLCDECL
    541 xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
    542     va_list args;
    543     int ret;
    544 
    545     if((buf == NULL) || (msg == NULL)) {
    546         return(-1);
    547     }
    548 
    549     va_start(args, msg);
    550     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
    551     va_end(args);
    552     buf[len - 1] = 0; /* be safe ! */
    553 
    554     return(ret);
    555 }
    556 
    557 /**
    558  * xmlStrVPrintf:
    559  * @buf:   the result buffer.
    560  * @len:   the result buffer length.
    561  * @msg:   the message with printf formatting.
    562  * @ap:    extra parameters for the message.
    563  *
    564  * Formats @msg and places result into @buf.
    565  *
    566  * Returns the number of characters written to @buf or -1 if an error occurs.
    567  */
    568 int
    569 xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
    570     int ret;
    571 
    572     if((buf == NULL) || (msg == NULL)) {
    573         return(-1);
    574     }
    575 
    576     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
    577     buf[len - 1] = 0; /* be safe ! */
    578 
    579     return(ret);
    580 }
    581 
    582 /************************************************************************
    583  *                                                                      *
    584  *              Generic UTF8 handling routines                          *
    585  *                                                                      *
    586  * From rfc2044: encoding of the Unicode values on UTF-8:               *
    587  *                                                                      *
    588  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
    589  * 0000 0000-0000 007F   0xxxxxxx                                       *
    590  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
    591  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
    592  *                                                                      *
    593  * I hope we won't use values > 0xFFFF anytime soon !                   *
    594  *                                                                      *
    595  ************************************************************************/
    596 
    597 
    598 /**
    599  * xmlUTF8Size:
    600  * @utf: pointer to the UTF8 character
    601  *
    602  * calculates the internal size of a UTF8 character
    603  *
    604  * returns the numbers of bytes in the character, -1 on format error
    605  */
    606 int
    607 xmlUTF8Size(const xmlChar *utf) {
    608     xmlChar mask;
    609     int len;
    610 
    611     if (utf == NULL)
    612         return -1;
    613     if (*utf < 0x80)
    614         return 1;
    615     /* check valid UTF8 character */
    616     if (!(*utf & 0x40))
    617         return -1;
    618     /* determine number of bytes in char */
    619     len = 2;
    620     for (mask=0x20; mask != 0; mask>>=1) {
    621         if (!(*utf & mask))
    622             return len;
    623         len++;
    624     }
    625     return -1;
    626 }
    627 
    628 /**
    629  * xmlUTF8Charcmp:
    630  * @utf1: pointer to first UTF8 char
    631  * @utf2: pointer to second UTF8 char
    632  *
    633  * compares the two UCS4 values
    634  *
    635  * returns result of the compare as with xmlStrncmp
    636  */
    637 int
    638 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
    639 
    640     if (utf1 == NULL ) {
    641         if (utf2 == NULL)
    642             return 0;
    643         return -1;
    644     }
    645     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
    646 }
    647 
    648 /**
    649  * xmlUTF8Strlen:
    650  * @utf:  a sequence of UTF-8 encoded bytes
    651  *
    652  * compute the length of an UTF8 string, it doesn't do a full UTF8
    653  * checking of the content of the string.
    654  *
    655  * Returns the number of characters in the string or -1 in case of error
    656  */
    657 int
    658 xmlUTF8Strlen(const xmlChar *utf) {
    659     int ret = 0;
    660 
    661     if (utf == NULL)
    662         return(-1);
    663 
    664     while (*utf != 0) {
    665         if (utf[0] & 0x80) {
    666             if ((utf[1] & 0xc0) != 0x80)
    667                 return(-1);
    668             if ((utf[0] & 0xe0) == 0xe0) {
    669                 if ((utf[2] & 0xc0) != 0x80)
    670                     return(-1);
    671                 if ((utf[0] & 0xf0) == 0xf0) {
    672                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
    673                         return(-1);
    674                     utf += 4;
    675                 } else {
    676                     utf += 3;
    677                 }
    678             } else {
    679                 utf += 2;
    680             }
    681         } else {
    682             utf++;
    683         }
    684         ret++;
    685     }
    686     return(ret);
    687 }
    688 
    689 /**
    690  * xmlGetUTF8Char:
    691  * @utf:  a sequence of UTF-8 encoded bytes
    692  * @len:  a pointer to the minimum number of bytes present in
    693  *        the sequence.  This is used to assure the next character
    694  *        is completely contained within the sequence.
    695  *
    696  * Read the first UTF8 character from @utf
    697  *
    698  * Returns the char value or -1 in case of error, and sets *len to
    699  *        the actual number of bytes consumed (0 in case of error)
    700  */
    701 int
    702 xmlGetUTF8Char(const unsigned char *utf, int *len) {
    703     unsigned int c;
    704 
    705     if (utf == NULL)
    706         goto error;
    707     if (len == NULL)
    708         goto error;
    709     if (*len < 1)
    710         goto error;
    711 
    712     c = utf[0];
    713     if (c & 0x80) {
    714         if (*len < 2)
    715             goto error;
    716         if ((utf[1] & 0xc0) != 0x80)
    717             goto error;
    718         if ((c & 0xe0) == 0xe0) {
    719             if (*len < 3)
    720                 goto error;
    721             if ((utf[2] & 0xc0) != 0x80)
    722                 goto error;
    723             if ((c & 0xf0) == 0xf0) {
    724                 if (*len < 4)
    725                     goto error;
    726                 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
    727                     goto error;
    728                 *len = 4;
    729                 /* 4-byte code */
    730                 c = (utf[0] & 0x7) << 18;
    731                 c |= (utf[1] & 0x3f) << 12;
    732                 c |= (utf[2] & 0x3f) << 6;
    733                 c |= utf[3] & 0x3f;
    734             } else {
    735               /* 3-byte code */
    736                 *len = 3;
    737                 c = (utf[0] & 0xf) << 12;
    738                 c |= (utf[1] & 0x3f) << 6;
    739                 c |= utf[2] & 0x3f;
    740             }
    741         } else {
    742           /* 2-byte code */
    743             *len = 2;
    744             c = (utf[0] & 0x1f) << 6;
    745             c |= utf[1] & 0x3f;
    746         }
    747     } else {
    748         /* 1-byte code */
    749         *len = 1;
    750     }
    751     return(c);
    752 
    753 error:
    754     if (len != NULL)
    755 	*len = 0;
    756     return(-1);
    757 }
    758 
    759 /**
    760  * xmlCheckUTF8:
    761  * @utf: Pointer to putative UTF-8 encoded string.
    762  *
    763  * Checks @utf for being valid UTF-8. @utf is assumed to be
    764  * null-terminated. This function is not super-strict, as it will
    765  * allow longer UTF-8 sequences than necessary. Note that Java is
    766  * capable of producing these sequences if provoked. Also note, this
    767  * routine checks for the 4-byte maximum size, but does not check for
    768  * 0x10ffff maximum value.
    769  *
    770  * Return value: true if @utf is valid.
    771  **/
    772 int
    773 xmlCheckUTF8(const unsigned char *utf)
    774 {
    775     int ix;
    776     unsigned char c;
    777 
    778     if (utf == NULL)
    779         return(0);
    780     /*
    781      * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
    782      * are as follows (in "bit format"):
    783      *    0xxxxxxx                                      valid 1-byte
    784      *    110xxxxx 10xxxxxx                             valid 2-byte
    785      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
    786      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
    787      */
    788     for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
    789         if ((c & 0x80) == 0x00) {	/* 1-byte code, starts with 10 */
    790             ix++;
    791 	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
    792 	    if ((utf[ix+1] & 0xc0 ) != 0x80)
    793 	        return 0;
    794 	    ix += 2;
    795 	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
    796 	    if (((utf[ix+1] & 0xc0) != 0x80) ||
    797 	        ((utf[ix+2] & 0xc0) != 0x80))
    798 		    return 0;
    799 	    ix += 3;
    800 	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
    801 	    if (((utf[ix+1] & 0xc0) != 0x80) ||
    802 	        ((utf[ix+2] & 0xc0) != 0x80) ||
    803 		((utf[ix+3] & 0xc0) != 0x80))
    804 		    return 0;
    805 	    ix += 4;
    806 	} else				/* unknown encoding */
    807 	    return 0;
    808       }
    809       return(1);
    810 }
    811 
    812 /**
    813  * xmlUTF8Strsize:
    814  * @utf:  a sequence of UTF-8 encoded bytes
    815  * @len:  the number of characters in the array
    816  *
    817  * storage size of an UTF8 string
    818  * the behaviour is not garanteed if the input string is not UTF-8
    819  *
    820  * Returns the storage size of
    821  * the first 'len' characters of ARRAY
    822  */
    823 
    824 int
    825 xmlUTF8Strsize(const xmlChar *utf, int len) {
    826     const xmlChar   *ptr=utf;
    827     xmlChar         ch;
    828 
    829     if (utf == NULL)
    830         return(0);
    831 
    832     if (len <= 0)
    833         return(0);
    834 
    835     while ( len-- > 0) {
    836         if ( !*ptr )
    837             break;
    838         if ( (ch = *ptr++) & 0x80)
    839             while ((ch<<=1) & 0x80 ) {
    840                 ptr++;
    841 		if (*ptr == 0) break;
    842 	    }
    843     }
    844     return (ptr - utf);
    845 }
    846 
    847 
    848 /**
    849  * xmlUTF8Strndup:
    850  * @utf:  the input UTF8 *
    851  * @len:  the len of @utf (in chars)
    852  *
    853  * a strndup for array of UTF8's
    854  *
    855  * Returns a new UTF8 * or NULL
    856  */
    857 xmlChar *
    858 xmlUTF8Strndup(const xmlChar *utf, int len) {
    859     xmlChar *ret;
    860     int i;
    861 
    862     if ((utf == NULL) || (len < 0)) return(NULL);
    863     i = xmlUTF8Strsize(utf, len);
    864     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
    865     if (ret == NULL) {
    866         xmlGenericError(xmlGenericErrorContext,
    867                 "malloc of %ld byte failed\n",
    868                 (len + 1) * (long)sizeof(xmlChar));
    869         return(NULL);
    870     }
    871     memcpy(ret, utf, i * sizeof(xmlChar));
    872     ret[i] = 0;
    873     return(ret);
    874 }
    875 
    876 /**
    877  * xmlUTF8Strpos:
    878  * @utf:  the input UTF8 *
    879  * @pos:  the position of the desired UTF8 char (in chars)
    880  *
    881  * a function to provide the equivalent of fetching a
    882  * character from a string array
    883  *
    884  * Returns a pointer to the UTF8 character or NULL
    885  */
    886 const xmlChar *
    887 xmlUTF8Strpos(const xmlChar *utf, int pos) {
    888     xmlChar ch;
    889 
    890     if (utf == NULL) return(NULL);
    891     if (pos < 0)
    892         return(NULL);
    893     while (pos--) {
    894         if ((ch=*utf++) == 0) return(NULL);
    895         if ( ch & 0x80 ) {
    896             /* if not simple ascii, verify proper format */
    897             if ( (ch & 0xc0) != 0xc0 )
    898                 return(NULL);
    899             /* then skip over remaining bytes for this char */
    900             while ( (ch <<= 1) & 0x80 )
    901                 if ( (*utf++ & 0xc0) != 0x80 )
    902                     return(NULL);
    903         }
    904     }
    905     return((xmlChar *)utf);
    906 }
    907 
    908 /**
    909  * xmlUTF8Strloc:
    910  * @utf:  the input UTF8 *
    911  * @utfchar:  the UTF8 character to be found
    912  *
    913  * a function to provide the relative location of a UTF8 char
    914  *
    915  * Returns the relative character position of the desired char
    916  * or -1 if not found
    917  */
    918 int
    919 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
    920     int i, size;
    921     xmlChar ch;
    922 
    923     if (utf==NULL || utfchar==NULL) return -1;
    924     size = xmlUTF8Strsize(utfchar, 1);
    925         for(i=0; (ch=*utf) != 0; i++) {
    926             if (xmlStrncmp(utf, utfchar, size)==0)
    927                 return(i);
    928             utf++;
    929             if ( ch & 0x80 ) {
    930                 /* if not simple ascii, verify proper format */
    931                 if ( (ch & 0xc0) != 0xc0 )
    932                     return(-1);
    933                 /* then skip over remaining bytes for this char */
    934                 while ( (ch <<= 1) & 0x80 )
    935                     if ( (*utf++ & 0xc0) != 0x80 )
    936                         return(-1);
    937             }
    938         }
    939 
    940     return(-1);
    941 }
    942 /**
    943  * xmlUTF8Strsub:
    944  * @utf:  a sequence of UTF-8 encoded bytes
    945  * @start: relative pos of first char
    946  * @len:   total number to copy
    947  *
    948  * Create a substring from a given UTF-8 string
    949  * Note:  positions are given in units of UTF-8 chars
    950  *
    951  * Returns a pointer to a newly created string
    952  * or NULL if any problem
    953  */
    954 
    955 xmlChar *
    956 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
    957     int            i;
    958     xmlChar ch;
    959 
    960     if (utf == NULL) return(NULL);
    961     if (start < 0) return(NULL);
    962     if (len < 0) return(NULL);
    963 
    964     /*
    965      * Skip over any leading chars
    966      */
    967     for (i = 0;i < start;i++) {
    968         if ((ch=*utf++) == 0) return(NULL);
    969         if ( ch & 0x80 ) {
    970             /* if not simple ascii, verify proper format */
    971             if ( (ch & 0xc0) != 0xc0 )
    972                 return(NULL);
    973             /* then skip over remaining bytes for this char */
    974             while ( (ch <<= 1) & 0x80 )
    975                 if ( (*utf++ & 0xc0) != 0x80 )
    976                     return(NULL);
    977         }
    978     }
    979 
    980     return(xmlUTF8Strndup(utf, len));
    981 }
    982 
    983 #define bottom_xmlstring
    984 #include "elfgcchack.h"
    985