Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uparse.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000apr18
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file provides a parser for files that are delimited by one single
     17 *   character like ';' or TAB. Example: the Unicode Character Properties files
     18 *   like UnicodeData.txt are semicolon-delimited.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/utf16.h"
     25 #include "cstring.h"
     26 #include "filestrm.h"
     27 #include "uparse.h"
     28 #include "ustr_imp.h"
     29 
     30 #include <stdio.h>
     31 
     32 U_CAPI const char * U_EXPORT2
     33 u_skipWhitespace(const char *s) {
     34     while(U_IS_INV_WHITESPACE(*s)) {
     35         ++s;
     36     }
     37     return s;
     38 }
     39 
     40 U_CAPI char * U_EXPORT2
     41 u_rtrim(char *s) {
     42     char *end=uprv_strchr(s, 0);
     43     while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
     44         *--end = 0;
     45     }
     46     return end;
     47 }
     48 
     49 /*
     50  * If the string starts with # @missing: then return the pointer to the
     51  * following non-whitespace character.
     52  * Otherwise return the original pointer.
     53  * Unicode 5.0 adds such lines in some data files to document
     54  * default property values.
     55  * Poor man's regex for variable amounts of white space.
     56  */
     57 static const char *
     58 getMissingLimit(const char *s) {
     59     const char *s0=s;
     60     if(
     61         *(s=u_skipWhitespace(s))=='#' &&
     62         *(s=u_skipWhitespace(s+1))=='@' &&
     63         0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
     64         *(s=u_skipWhitespace(s+7))==':'
     65     ) {
     66         return u_skipWhitespace(s+1);
     67     } else {
     68         return s0;
     69     }
     70 }
     71 
     72 U_CAPI void U_EXPORT2
     73 u_parseDelimitedFile(const char *filename, char delimiter,
     74                      char *fields[][2], int32_t fieldCount,
     75                      UParseLineFn *lineFn, void *context,
     76                      UErrorCode *pErrorCode) {
     77     FileStream *file;
     78     char line[300];
     79     char *start, *limit;
     80     int32_t i, length;
     81 
     82     if(U_FAILURE(*pErrorCode)) {
     83         return;
     84     }
     85 
     86     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
     87         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     88         return;
     89     }
     90 
     91     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     92         filename=NULL;
     93         file=T_FileStream_stdin();
     94     } else {
     95         file=T_FileStream_open(filename, "r");
     96     }
     97     if(file==NULL) {
     98         *pErrorCode=U_FILE_ACCESS_ERROR;
     99         return;
    100     }
    101 
    102     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
    103         /* remove trailing newline characters */
    104         length=(int32_t)(u_rtrim(line)-line);
    105 
    106         /*
    107          * detect a line with # @missing:
    108          * start parsing after that, or else from the beginning of the line
    109          * set the default warning for @missing lines
    110          */
    111         start=(char *)getMissingLimit(line);
    112         if(start==line) {
    113             *pErrorCode=U_ZERO_ERROR;
    114         } else {
    115             *pErrorCode=U_USING_DEFAULT_WARNING;
    116         }
    117 
    118         /* skip this line if it is empty or a comment */
    119         if(*start==0 || *start=='#') {
    120             continue;
    121         }
    122 
    123         /* remove in-line comments */
    124         limit=uprv_strchr(start, '#');
    125         if(limit!=NULL) {
    126             /* get white space before the pound sign */
    127             while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
    128                 --limit;
    129             }
    130 
    131             /* truncate the line */
    132             *limit=0;
    133         }
    134 
    135         /* skip lines with only whitespace */
    136         if(u_skipWhitespace(start)[0]==0) {
    137             continue;
    138         }
    139 
    140         /* for each field, call the corresponding field function */
    141         for(i=0; i<fieldCount; ++i) {
    142             /* set the limit pointer of this field */
    143             limit=start;
    144             while(*limit!=delimiter && *limit!=0) {
    145                 ++limit;
    146             }
    147 
    148             /* set the field start and limit in the fields array */
    149             fields[i][0]=start;
    150             fields[i][1]=limit;
    151 
    152             /* set start to the beginning of the next field, if any */
    153             start=limit;
    154             if(*start!=0) {
    155                 ++start;
    156             } else if(i+1<fieldCount) {
    157                 *pErrorCode=U_PARSE_ERROR;
    158                 limit=line+length;
    159                 i=fieldCount;
    160                 break;
    161             }
    162         }
    163 
    164         /* error in a field function? */
    165         if(U_FAILURE(*pErrorCode)) {
    166             break;
    167         }
    168 
    169         /* call the field function */
    170         lineFn(context, fields, fieldCount, pErrorCode);
    171         if(U_FAILURE(*pErrorCode)) {
    172             break;
    173         }
    174     }
    175 
    176     if(filename!=NULL) {
    177         T_FileStream_close(file);
    178     }
    179 }
    180 
    181 /*
    182  * parse a list of code points
    183  * store them as a UTF-32 string in dest[destCapacity]
    184  * return the number of code points
    185  */
    186 U_CAPI int32_t U_EXPORT2
    187 u_parseCodePoints(const char *s,
    188                   uint32_t *dest, int32_t destCapacity,
    189                   UErrorCode *pErrorCode) {
    190     char *end;
    191     uint32_t value;
    192     int32_t count;
    193 
    194     if(U_FAILURE(*pErrorCode)) {
    195         return 0;
    196     }
    197     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
    198         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    199         return 0;
    200     }
    201 
    202     count=0;
    203     for(;;) {
    204         s=u_skipWhitespace(s);
    205         if(*s==';' || *s==0) {
    206             return count;
    207         }
    208 
    209         /* read one code point */
    210         value=(uint32_t)uprv_strtoul(s, &end, 16);
    211         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
    212             *pErrorCode=U_PARSE_ERROR;
    213             return 0;
    214         }
    215 
    216         /* append it to the destination array */
    217         if(count<destCapacity) {
    218             dest[count++]=value;
    219         } else {
    220             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    221         }
    222 
    223         /* go to the following characters */
    224         s=end;
    225     }
    226 }
    227 
    228 /*
    229  * parse a list of code points
    230  * store them as a string in dest[destCapacity]
    231  * set the first code point in *pFirst
    232  * @return The length of the string in numbers of UChars.
    233  */
    234 U_CAPI int32_t U_EXPORT2
    235 u_parseString(const char *s,
    236               UChar *dest, int32_t destCapacity,
    237               uint32_t *pFirst,
    238               UErrorCode *pErrorCode) {
    239     char *end;
    240     uint32_t value;
    241     int32_t destLength;
    242 
    243     if(U_FAILURE(*pErrorCode)) {
    244         return 0;
    245     }
    246     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
    247         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    248         return 0;
    249     }
    250 
    251     if(pFirst!=NULL) {
    252         *pFirst=0xffffffff;
    253     }
    254 
    255     destLength=0;
    256     for(;;) {
    257         s=u_skipWhitespace(s);
    258         if(*s==';' || *s==0) {
    259             if(destLength<destCapacity) {
    260                 dest[destLength]=0;
    261             } else if(destLength==destCapacity) {
    262                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
    263             } else {
    264                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    265             }
    266             return destLength;
    267         }
    268 
    269         /* read one code point */
    270         value=(uint32_t)uprv_strtoul(s, &end, 16);
    271         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
    272             *pErrorCode=U_PARSE_ERROR;
    273             return 0;
    274         }
    275 
    276         /* store the first code point */
    277         if(pFirst!=NULL) {
    278             *pFirst=value;
    279             pFirst=NULL;
    280         }
    281 
    282         /* append it to the destination array */
    283         if((destLength+U16_LENGTH(value))<=destCapacity) {
    284             U16_APPEND_UNSAFE(dest, destLength, value);
    285         } else {
    286             destLength+=U16_LENGTH(value);
    287         }
    288 
    289         /* go to the following characters */
    290         s=end;
    291     }
    292 }
    293 
    294 /* read a range like start or start..end */
    295 U_CAPI int32_t U_EXPORT2
    296 u_parseCodePointRangeAnyTerminator(const char *s,
    297                                    uint32_t *pStart, uint32_t *pEnd,
    298                                    const char **terminator,
    299                                    UErrorCode *pErrorCode) {
    300     char *end;
    301     uint32_t value;
    302 
    303     if(U_FAILURE(*pErrorCode)) {
    304         return 0;
    305     }
    306     if(s==NULL || pStart==NULL || pEnd==NULL) {
    307         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    308         return 0;
    309     }
    310 
    311     /* read the start code point */
    312     s=u_skipWhitespace(s);
    313     value=(uint32_t)uprv_strtoul(s, &end, 16);
    314     if(end<=s || value>=0x110000) {
    315         *pErrorCode=U_PARSE_ERROR;
    316         return 0;
    317     }
    318     *pStart=*pEnd=value;
    319 
    320     /* is there a "..end"? */
    321     s=u_skipWhitespace(end);
    322     if(*s!='.' || s[1]!='.') {
    323         *terminator=end;
    324         return 1;
    325     }
    326     s=u_skipWhitespace(s+2);
    327 
    328     /* read the end code point */
    329     value=(uint32_t)uprv_strtoul(s, &end, 16);
    330     if(end<=s || value>=0x110000) {
    331         *pErrorCode=U_PARSE_ERROR;
    332         return 0;
    333     }
    334     *pEnd=value;
    335 
    336     /* is this a valid range? */
    337     if(value<*pStart) {
    338         *pErrorCode=U_PARSE_ERROR;
    339         return 0;
    340     }
    341 
    342     *terminator=end;
    343     return value-*pStart+1;
    344 }
    345 
    346 U_CAPI int32_t U_EXPORT2
    347 u_parseCodePointRange(const char *s,
    348                       uint32_t *pStart, uint32_t *pEnd,
    349                       UErrorCode *pErrorCode) {
    350     const char *terminator;
    351     int32_t rangeLength=
    352         u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
    353     if(U_SUCCESS(*pErrorCode)) {
    354         terminator=u_skipWhitespace(terminator);
    355         if(*terminator!=';' && *terminator!=0) {
    356             *pErrorCode=U_PARSE_ERROR;
    357             return 0;
    358         }
    359     }
    360     return rangeLength;
    361 }
    362 
    363 U_CAPI int32_t U_EXPORT2
    364 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
    365     const char *read = source;
    366     int32_t i = 0;
    367     unsigned int value = 0;
    368     if(sLen == -1) {
    369         sLen = (int32_t)strlen(source);
    370     }
    371 
    372     while(read < source+sLen) {
    373         sscanf(read, "%2x", &value);
    374         if(i < destCapacity) {
    375             dest[i] = (char)value;
    376         }
    377         i++;
    378         read += 2;
    379     }
    380     return u_terminateChars(dest, destCapacity, i, status);
    381 }
    382