Home | History | Annotate | Download | only in genrb
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1998-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *
      9 * File read.c
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   05/26/99    stephen     Creation.
     15 *   5/10/01     Ram         removed ustdio dependency
     16 *******************************************************************************
     17 */
     18 
     19 #include "read.h"
     20 #include "errmsg.h"
     21 #include "unicode/ustring.h"
     22 
     23 #define OPENBRACE    0x007B
     24 #define CLOSEBRACE   0x007D
     25 #define COMMA        0x002C
     26 #define QUOTE        0x0022
     27 #define ESCAPE       0x005C
     28 #define SLASH        0x002F
     29 #define ASTERISK     0x002A
     30 #define SPACE        0x0020
     31 #define COLON        0x003A
     32 #define BADBOM       0xFFFE
     33 #define CR           0x000D
     34 #define LF           0x000A
     35 
     36 static int32_t lineCount;
     37 
     38 /* Protos */
     39 static enum ETokenType getStringToken(UCHARBUF *buf,
     40                                       UChar32 initialChar,
     41                                       struct UString *token,
     42                                       UErrorCode *status);
     43 
     44 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
     45 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     46 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     47 static UBool   isWhitespace          (UChar32 c);
     48 static UBool   isNewline             (UChar32 c);
     49 
     50 void resetLineNumber() {
     51     lineCount = 1;
     52 }
     53 
     54 /* Read and return the next token from the stream.  If the token is of
     55    type eString, fill in the token parameter with the token.  If the
     56    token is eError, then the status parameter will contain the
     57    specific error.  This will be eItemNotFound at the end of file,
     58    indicating that all tokens have been returned.  This method will
     59    never return eString twice in a row; instead, multiple adjacent
     60    string tokens will be merged into one, with no intervening
     61    space. */
     62 enum ETokenType getNextToken(UCHARBUF* buf,
     63                              struct UString *token,
     64                              uint32_t *linenumber, /* out: linenumber of token */
     65                              struct UString *comment,
     66                              UErrorCode *status) {
     67     enum ETokenType result;
     68     UChar32         c;
     69 
     70     if (U_FAILURE(*status)) {
     71         return TOK_ERROR;
     72     }
     73 
     74     /* Skip whitespace */
     75     c = getNextChar(buf, TRUE, comment, status);
     76 
     77     if (U_FAILURE(*status)) {
     78         return TOK_ERROR;
     79     }
     80 
     81     *linenumber = lineCount;
     82 
     83     switch(c) {
     84     case BADBOM:
     85         return TOK_ERROR;
     86     case OPENBRACE:
     87         return TOK_OPEN_BRACE;
     88     case CLOSEBRACE:
     89         return TOK_CLOSE_BRACE;
     90     case COMMA:
     91         return TOK_COMMA;
     92     case U_EOF:
     93         return TOK_EOF;
     94     case COLON:
     95         return TOK_COLON;
     96 
     97     default:
     98         result = getStringToken(buf, c, token, status);
     99     }
    100 
    101     *linenumber = lineCount;
    102     return result;
    103 }
    104 
    105 /* Copy a string token into the given UnicodeString.  Upon entry, we
    106    have already read the first character of the string token, which is
    107    not a whitespace character (but may be a QUOTE or ESCAPE). This
    108    function reads all subsequent characters that belong with this
    109    string, and copy them into the token parameter. The other
    110    important, and slightly convoluted purpose of this function is to
    111    merge adjacent strings.  It looks forward a bit, and if the next
    112    non comment, non whitespace item is a string, it reads it in as
    113    well.  If two adjacent strings are quoted, they are merged without
    114    intervening space.  Otherwise a single SPACE character is
    115    inserted. */
    116 static enum ETokenType getStringToken(UCHARBUF* buf,
    117                                       UChar32 initialChar,
    118                                       struct UString *token,
    119                                       UErrorCode *status) {
    120     UBool    lastStringWasQuoted;
    121     UChar32  c;
    122     UChar    target[3] = { '\0' };
    123     UChar    *pTarget   = target;
    124     int      len=0;
    125     UBool    isFollowingCharEscaped=FALSE;
    126     UBool    isNLUnescaped = FALSE;
    127     UChar32  prevC=0;
    128 
    129     /* We are guaranteed on entry that initialChar is not a whitespace
    130        character. If we are at the EOF, or have some other problem, it
    131        doesn't matter; we still want to validly return the initialChar
    132        (if nothing else) as a string token. */
    133 
    134     if (U_FAILURE(*status)) {
    135         return TOK_ERROR;
    136     }
    137 
    138     /* setup */
    139     lastStringWasQuoted = FALSE;
    140     c = initialChar;
    141     ustr_setlen(token, 0, status);
    142 
    143     if (U_FAILURE(*status)) {
    144         return TOK_ERROR;
    145     }
    146 
    147     for (;;) {
    148         if (c == QUOTE) {
    149             if (!lastStringWasQuoted && token->fLength > 0) {
    150                 ustr_ucat(token, SPACE, status);
    151 
    152                 if (U_FAILURE(*status)) {
    153                     return TOK_ERROR;
    154                 }
    155             }
    156 
    157             lastStringWasQuoted = TRUE;
    158 
    159             for (;;) {
    160                 c = ucbuf_getc(buf,status);
    161 
    162                 /* EOF reached */
    163                 if (c == U_EOF) {
    164                     return TOK_EOF;
    165                 }
    166 
    167                 /* Unterminated quoted strings */
    168                 if (U_FAILURE(*status)) {
    169                     return TOK_ERROR;
    170                 }
    171 
    172                 if (c == QUOTE && !isFollowingCharEscaped) {
    173                     break;
    174                 }
    175 
    176                 if (c == ESCAPE  && !isFollowingCharEscaped) {
    177                     pTarget = target;
    178                     c       = unescape(buf, status);
    179 
    180                     if (c == U_ERR) {
    181                         return TOK_ERROR;
    182                     }
    183                     if(c == CR || c == LF){
    184                         isNLUnescaped = TRUE;
    185                     }
    186                 }
    187 
    188                 if(c==ESCAPE && !isFollowingCharEscaped){
    189                     isFollowingCharEscaped = TRUE;
    190                 }else{
    191                     U_APPEND_CHAR32(c, pTarget,len);
    192                     pTarget = target;
    193                     ustr_uscat(token, pTarget,len, status);
    194                     isFollowingCharEscaped = FALSE;
    195                     len=0;
    196                     if(c == CR || c == LF){
    197                         if(isNLUnescaped == FALSE && prevC!=CR){
    198                             lineCount++;
    199                         }
    200                         isNLUnescaped = FALSE;
    201                     }
    202                 }
    203 
    204                 if (U_FAILURE(*status)) {
    205                     return TOK_ERROR;
    206                 }
    207                 prevC = c;
    208             }
    209         } else {
    210             if (token->fLength > 0) {
    211                 ustr_ucat(token, SPACE, status);
    212 
    213                 if (U_FAILURE(*status)) {
    214                     return TOK_ERROR;
    215                 }
    216             }
    217 
    218             if(lastStringWasQuoted){
    219                 if(getShowWarning()){
    220                     warning(lineCount, "Mixing quoted and unquoted strings");
    221                 }
    222                 if(isStrict()){
    223                     return TOK_ERROR;
    224                 }
    225 
    226             }
    227 
    228             lastStringWasQuoted = FALSE;
    229 
    230             /* if we reach here we are mixing
    231              * quoted and unquoted strings
    232              * warn in normal mode and error in
    233              * pedantic mode
    234              */
    235 
    236             if (c == ESCAPE) {
    237                 pTarget = target;
    238                 c       = unescape(buf, status);
    239 
    240                 /* EOF reached */
    241                 if (c == U_EOF) {
    242                     return TOK_ERROR;
    243                 }
    244             }
    245 
    246             U_APPEND_CHAR32(c, pTarget,len);
    247             pTarget = target;
    248             ustr_uscat(token, pTarget,len, status);
    249             len=0;
    250 
    251             if (U_FAILURE(*status)) {
    252                 return TOK_ERROR;
    253             }
    254 
    255             for (;;) {
    256                 /* DON'T skip whitespace */
    257                 c = getNextChar(buf, FALSE, NULL, status);
    258 
    259                 /* EOF reached */
    260                 if (c == U_EOF) {
    261                     ucbuf_ungetc(c, buf);
    262                     return TOK_STRING;
    263                 }
    264 
    265                 if (U_FAILURE(*status)) {
    266                     return TOK_STRING;
    267                 }
    268 
    269                 if (c == QUOTE
    270                         || c == OPENBRACE
    271                         || c == CLOSEBRACE
    272                         || c == COMMA
    273                         || c == COLON) {
    274                     ucbuf_ungetc(c, buf);
    275                     break;
    276                 }
    277 
    278                 if (isWhitespace(c)) {
    279                     break;
    280                 }
    281 
    282                 if (c == ESCAPE) {
    283                     pTarget = target;
    284                     c       = unescape(buf, status);
    285 
    286                     if (c == U_ERR) {
    287                         return TOK_ERROR;
    288                     }
    289                 }
    290 
    291                 U_APPEND_CHAR32(c, pTarget,len);
    292                 pTarget = target;
    293                 ustr_uscat(token, pTarget,len, status);
    294                 len=0;
    295                 if (U_FAILURE(*status)) {
    296                     return TOK_ERROR;
    297                 }
    298             }
    299         }
    300 
    301         /* DO skip whitespace */
    302         c = getNextChar(buf, TRUE, NULL, status);
    303 
    304         if (U_FAILURE(*status)) {
    305             return TOK_STRING;
    306         }
    307 
    308         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
    309             ucbuf_ungetc(c, buf);
    310             return TOK_STRING;
    311         }
    312     }
    313 }
    314 
    315 /* Retrieve the next character.  If skipwhite is
    316    true, whitespace is skipped as well. */
    317 static UChar32 getNextChar(UCHARBUF* buf,
    318                            UBool skipwhite,
    319                            struct UString *token,
    320                            UErrorCode *status) {
    321     UChar32 c, c2;
    322 
    323     if (U_FAILURE(*status)) {
    324         return U_EOF;
    325     }
    326 
    327     for (;;) {
    328         c = ucbuf_getc(buf,status);
    329 
    330         if (c == U_EOF) {
    331             return U_EOF;
    332         }
    333 
    334         if (skipwhite && isWhitespace(c)) {
    335             continue;
    336         }
    337 
    338         /* This also handles the get() failing case */
    339         if (c != SLASH) {
    340             return c;
    341         }
    342 
    343         c = ucbuf_getc(buf,status); /* "/c" */
    344 
    345         if (c == U_EOF) {
    346             return U_EOF;
    347         }
    348 
    349         switch (c) {
    350         case SLASH:  /* "//" */
    351             seekUntilNewline(buf, NULL, status);
    352             break;
    353 
    354         case ASTERISK:  /* " / * " */
    355             c2 = ucbuf_getc(buf, status); /* "/ * c" */
    356             if(c2 == ASTERISK){  /* "/ * *" */
    357                 /* parse multi-line comment and store it in token*/
    358                 seekUntilEndOfComment(buf, token, status);
    359             } else {
    360                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
    361                 seekUntilEndOfComment(buf, NULL, status);
    362             }
    363             break;
    364 
    365         default:
    366             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
    367             /* If get() failed this is a NOP */
    368             return SLASH;
    369         }
    370 
    371     }
    372 }
    373 
    374 static void seekUntilNewline(UCHARBUF* buf,
    375                              struct UString *token,
    376                              UErrorCode *status) {
    377     UChar32 c;
    378 
    379     if (U_FAILURE(*status)) {
    380         return;
    381     }
    382 
    383     do {
    384         c = ucbuf_getc(buf,status);
    385         /* add the char to token */
    386         if(token!=NULL){
    387             ustr_u32cat(token, c, status);
    388         }
    389     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
    390 }
    391 
    392 static void seekUntilEndOfComment(UCHARBUF *buf,
    393                                   struct UString *token,
    394                                   UErrorCode *status) {
    395     UChar32  c, d;
    396     uint32_t line;
    397 
    398     if (U_FAILURE(*status)) {
    399         return;
    400     }
    401 
    402     line = lineCount;
    403 
    404     do {
    405         c = ucbuf_getc(buf, status);
    406 
    407         if (c == ASTERISK) {
    408             d = ucbuf_getc(buf, status);
    409 
    410             if (d != SLASH) {
    411                 ucbuf_ungetc(d, buf);
    412             } else {
    413                 break;
    414             }
    415         }
    416         /* add the char to token */
    417         if(token!=NULL){
    418             ustr_u32cat(token, c, status);
    419         }
    420         /* increment the lineCount */
    421         isNewline(c);
    422 
    423     } while (c != U_EOF && *status == U_ZERO_ERROR);
    424 
    425     if (c == U_EOF) {
    426         *status = U_INVALID_FORMAT_ERROR;
    427         error(line, "unterminated comment detected");
    428     }
    429 }
    430 
    431 UChar32 unescape(UCHARBUF *buf,
    432                  UErrorCode *status) {
    433     if (U_FAILURE(*status)) {
    434         return U_EOF;
    435     }
    436 
    437     /* We expect to be called after the ESCAPE has been seen, but
    438      * u_fgetcx needs an ESCAPE to do its magic. */
    439     ucbuf_ungetc(ESCAPE, buf);
    440 
    441     return ucbuf_getcx32(buf, status);
    442 }
    443 
    444 static UBool isWhitespace(UChar32 c) {
    445     switch (c) {
    446         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
    447     case 0x000A:
    448     case 0x2029:
    449         lineCount++;
    450     case 0x000D:
    451     case 0x0020:
    452     case 0x0009:
    453     case 0xFEFF:
    454         return TRUE;
    455 
    456     default:
    457         return FALSE;
    458     }
    459 }
    460 
    461 static UBool isNewline(UChar32 c) {
    462     switch (c) {
    463         /* '\n', '\r', 0x2029 */
    464     case 0x000A:
    465     case 0x2029:
    466         lineCount++;
    467     case 0x000D:
    468         return TRUE;
    469 
    470     default:
    471         return FALSE;
    472     }
    473 }
    474