Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2007, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uparse.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000apr18
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file provides a parser for files that are delimited by one single
     17 *   character like ';' or TAB. Example: the Unicode Character Properties files
     18 *   like UnicodeData.txt are semicolon-delimited.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "cstring.h"
     23 #include "filestrm.h"
     24 #include "uparse.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/ustring.h"
     27 #include "ustr_imp.h"
     28 
     29 #include <stdio.h>
     30 
     31 U_CAPI const char * U_EXPORT2
     32 u_skipWhitespace(const char *s) {
     33     while(*s==' ' || *s=='\t') {
     34         ++s;
     35     }
     36     return s;
     37 }
     38 
     39 /*
     40  * If the string starts with # @missing: then return the pointer to the
     41  * following non-whitespace character.
     42  * Otherwise return the original pointer.
     43  * Unicode 5.0 adds such lines in some data files to document
     44  * default property values.
     45  * Poor man's regex for variable amounts of white space.
     46  */
     47 static const char *
     48 getMissingLimit(const char *s) {
     49     const char *s0=s;
     50     if(
     51         *(s=u_skipWhitespace(s))=='#' &&
     52         *(s=u_skipWhitespace(s+1))=='@' &&
     53         0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
     54         *(s=u_skipWhitespace(s+7))==':'
     55     ) {
     56         return u_skipWhitespace(s+1);
     57     } else {
     58         return s0;
     59     }
     60 }
     61 
     62 U_CAPI void U_EXPORT2
     63 u_parseDelimitedFile(const char *filename, char delimiter,
     64                      char *fields[][2], int32_t fieldCount,
     65                      UParseLineFn *lineFn, void *context,
     66                      UErrorCode *pErrorCode) {
     67     FileStream *file;
     68     char line[300];
     69     char *start, *limit;
     70     int32_t i, length;
     71 
     72     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
     73         return;
     74     }
     75 
     76     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
     77         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     78         return;
     79     }
     80 
     81     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     82         filename=NULL;
     83         file=T_FileStream_stdin();
     84     } else {
     85         file=T_FileStream_open(filename, "r");
     86     }
     87     if(file==NULL) {
     88         *pErrorCode=U_FILE_ACCESS_ERROR;
     89         return;
     90     }
     91 
     92     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
     93         length=(int32_t)uprv_strlen(line);
     94 
     95         /* remove trailing newline characters */
     96         while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
     97             line[--length]=0;
     98         }
     99 
    100         /*
    101          * detect a line with # @missing:
    102          * start parsing after that, or else from the beginning of the line
    103          * set the default warning for @missing lines
    104          */
    105         start=(char *)getMissingLimit(line);
    106         if(start==line) {
    107             *pErrorCode=U_ZERO_ERROR;
    108         } else {
    109             *pErrorCode=U_USING_DEFAULT_WARNING;
    110         }
    111 
    112         /* skip this line if it is empty or a comment */
    113         if(*start==0 || *start=='#') {
    114             continue;
    115         }
    116 
    117         /* remove in-line comments */
    118         limit=uprv_strchr(start, '#');
    119         if(limit!=NULL) {
    120             /* get white space before the pound sign */
    121             while(limit>start && (*(limit-1)==' ' || *(limit-1)=='\t')) {
    122                 --limit;
    123             }
    124 
    125             /* truncate the line */
    126             *limit=0;
    127         }
    128 
    129         /* skip lines with only whitespace */
    130         if(u_skipWhitespace(start)[0]==0) {
    131             continue;
    132         }
    133 
    134         /* for each field, call the corresponding field function */
    135         for(i=0; i<fieldCount; ++i) {
    136             /* set the limit pointer of this field */
    137             limit=start;
    138             while(*limit!=delimiter && *limit!=0) {
    139                 ++limit;
    140             }
    141 
    142             /* set the field start and limit in the fields array */
    143             fields[i][0]=start;
    144             fields[i][1]=limit;
    145 
    146             /* set start to the beginning of the next field, if any */
    147             start=limit;
    148             if(*start!=0) {
    149                 ++start;
    150             } else if(i+1<fieldCount) {
    151                 *pErrorCode=U_PARSE_ERROR;
    152                 limit=line+length;
    153                 i=fieldCount;
    154                 break;
    155             }
    156         }
    157 
    158         /* error in a field function? */
    159         if(U_FAILURE(*pErrorCode)) {
    160             break;
    161         }
    162 
    163         /* call the field function */
    164         lineFn(context, fields, fieldCount, pErrorCode);
    165         if(U_FAILURE(*pErrorCode)) {
    166             break;
    167         }
    168     }
    169 
    170     if(filename!=NULL) {
    171         T_FileStream_close(file);
    172     }
    173 }
    174 
    175 /*
    176  * parse a list of code points
    177  * store them as a UTF-32 string in dest[destCapacity]
    178  * return the number of code points
    179  */
    180 U_CAPI int32_t U_EXPORT2
    181 u_parseCodePoints(const char *s,
    182                   uint32_t *dest, int32_t destCapacity,
    183                   UErrorCode *pErrorCode) {
    184     char *end;
    185     uint32_t value;
    186     int32_t count;
    187 
    188     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    189         return 0;
    190     }
    191     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
    192         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    193         return 0;
    194     }
    195 
    196     count=0;
    197     for(;;) {
    198         s=u_skipWhitespace(s);
    199         if(*s==';' || *s==0) {
    200             return count;
    201         }
    202 
    203         /* read one code point */
    204         value=(uint32_t)uprv_strtoul(s, &end, 16);
    205         if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
    206             *pErrorCode=U_PARSE_ERROR;
    207             return 0;
    208         }
    209 
    210         /* append it to the destination array */
    211         if(count<destCapacity) {
    212             dest[count++]=value;
    213         } else {
    214             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    215         }
    216 
    217         /* go to the following characters */
    218         s=end;
    219     }
    220 }
    221 
    222 /*
    223  * parse a list of code points
    224  * store them as a string in dest[destCapacity]
    225  * set the first code point in *pFirst
    226  * @return The length of the string in numbers of UChars.
    227  */
    228 U_CAPI int32_t U_EXPORT2
    229 u_parseString(const char *s,
    230               UChar *dest, int32_t destCapacity,
    231               uint32_t *pFirst,
    232               UErrorCode *pErrorCode) {
    233     char *end;
    234     uint32_t value;
    235     int32_t destLength;
    236 
    237     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    238         return 0;
    239     }
    240     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
    241         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    242     }
    243 
    244     if(pFirst!=NULL) {
    245         *pFirst=0xffffffff;
    246     }
    247 
    248     destLength=0;
    249     for(;;) {
    250         s=u_skipWhitespace(s);
    251         if(*s==';' || *s==0) {
    252             if(destLength<destCapacity) {
    253                 dest[destLength]=0;
    254             } else if(destLength==destCapacity) {
    255                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
    256             } else {
    257                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    258             }
    259             return destLength;
    260         }
    261 
    262         /* read one code point */
    263         value=(uint32_t)uprv_strtoul(s, &end, 16);
    264         if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
    265             *pErrorCode=U_PARSE_ERROR;
    266             return 0;
    267         }
    268 
    269         /* store the first code point */
    270         if(destLength==0 && pFirst!=NULL) {
    271             *pFirst=value;
    272         }
    273 
    274         /* append it to the destination array */
    275         if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
    276             UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
    277         } else {
    278             destLength+=UTF_CHAR_LENGTH(value);
    279         }
    280 
    281         /* go to the following characters */
    282         s=end;
    283     }
    284 }
    285 
    286 /* read a range like start or start..end */
    287 U_CAPI int32_t U_EXPORT2
    288 u_parseCodePointRange(const char *s,
    289                       uint32_t *pStart, uint32_t *pEnd,
    290                       UErrorCode *pErrorCode) {
    291     char *end;
    292     uint32_t value;
    293 
    294     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    295         return 0;
    296     }
    297     if(s==NULL || pStart==NULL || pEnd==NULL) {
    298         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    299         return 0;
    300     }
    301 
    302     s=u_skipWhitespace(s);
    303     if(*s==';' || *s==0) {
    304         *pErrorCode=U_PARSE_ERROR;
    305         return 0;
    306     }
    307 
    308     /* read the start code point */
    309     value=(uint32_t)uprv_strtoul(s, &end, 16);
    310     if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
    311         *pErrorCode=U_PARSE_ERROR;
    312         return 0;
    313     }
    314     *pStart=*pEnd=value;
    315 
    316     /* is there a "..end"? */
    317     s=u_skipWhitespace(end);
    318     if(*s==';' || *s==0) {
    319         return 1;
    320     }
    321 
    322     if(*s!='.' || s[1]!='.') {
    323         *pErrorCode=U_PARSE_ERROR;
    324         return 0;
    325     }
    326     s+=2;
    327 
    328     /* read the end code point */
    329     value=(uint32_t)uprv_strtoul(s, &end, 16);
    330     if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
    331         *pErrorCode=U_PARSE_ERROR;
    332         return 0;
    333     }
    334     *pEnd=value;
    335 
    336     /* is this a valid range? */
    337     if(value<*pStart) {
    338         *pErrorCode=U_PARSE_ERROR;
    339         return 0;
    340     }
    341 
    342     /* no garbage after that? */
    343     s=u_skipWhitespace(end);
    344     if(*s==';' || *s==0) {
    345         return value-*pStart+1;
    346     } else {
    347         *pErrorCode=U_PARSE_ERROR;
    348         return 0;
    349     }
    350 }
    351 
    352 U_CAPI int32_t U_EXPORT2
    353 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
    354     const char *read = source;
    355     int32_t i = 0;
    356     unsigned int value = 0;
    357     if(sLen == -1) {
    358         sLen = (int32_t)strlen(source);
    359     }
    360 
    361     while(read < source+sLen) {
    362         sscanf(read, "%2x", &value);
    363         if(i < destCapacity) {
    364             dest[i] = (char)value;
    365         }
    366         i++;
    367         read += 2;
    368     }
    369     return u_terminateChars(dest, destCapacity, i, status);
    370 }
    371