Home | History | Annotate | Download | only in genrb
/*
*******************************************************************************
*
*   Copyright (C) 1998-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File read.c
*
* Modification History:
*
*   Date        Name        Description
*   05/26/99    stephen     Creation.
*   5/10/01     Ram         removed ustdio dependency
*******************************************************************************
*/
     18 
     19 #include "read.h"
     20 #include "errmsg.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/utf16.h"
     23 
     24 #define OPENBRACE    0x007B
     25 #define CLOSEBRACE   0x007D
     26 #define COMMA        0x002C
     27 #define QUOTE        0x0022
     28 #define ESCAPE       0x005C
     29 #define SLASH        0x002F
     30 #define ASTERISK     0x002A
     31 #define SPACE        0x0020
     32 #define COLON        0x003A
     33 #define BADBOM       0xFFFE
     34 #define CR           0x000D
     35 #define LF           0x000A
     36 
     37 static int32_t lineCount;
     38 
     39 /* Protos */
     40 static enum ETokenType getStringToken(UCHARBUF *buf,
     41                                       UChar32 initialChar,
     42                                       struct UString *token,
     43                                       UErrorCode *status);
     44 
     45 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
     46 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     47 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     48 static UBool   isWhitespace          (UChar32 c);
     49 static UBool   isNewline             (UChar32 c);
     50 
     51 U_CFUNC void resetLineNumber() {
     52     lineCount = 1;
     53 }
     54 
     55 /* Read and return the next token from the stream.  If the token is of
     56    type eString, fill in the token parameter with the token.  If the
     57    token is eError, then the status parameter will contain the
     58    specific error.  This will be eItemNotFound at the end of file,
     59    indicating that all tokens have been returned.  This method will
     60    never return eString twice in a row; instead, multiple adjacent
     61    string tokens will be merged into one, with no intervening
     62    space. */
     63 U_CFUNC enum ETokenType
     64 getNextToken(UCHARBUF* buf,
     65              struct UString *token,
     66              uint32_t *linenumber, /* out: linenumber of token */
     67              struct UString *comment,
     68              UErrorCode *status) {
     69     enum ETokenType result;
     70     UChar32         c;
     71 
     72     if (U_FAILURE(*status)) {
     73         return TOK_ERROR;
     74     }
     75 
     76     /* Skip whitespace */
     77     c = getNextChar(buf, TRUE, comment, status);
     78 
     79     if (U_FAILURE(*status)) {
     80         return TOK_ERROR;
     81     }
     82 
     83     *linenumber = lineCount;
     84 
     85     switch(c) {
     86     case BADBOM:
     87         return TOK_ERROR;
     88     case OPENBRACE:
     89         return TOK_OPEN_BRACE;
     90     case CLOSEBRACE:
     91         return TOK_CLOSE_BRACE;
     92     case COMMA:
     93         return TOK_COMMA;
     94     case U_EOF:
     95         return TOK_EOF;
     96     case COLON:
     97         return TOK_COLON;
     98 
     99     default:
    100         result = getStringToken(buf, c, token, status);
    101     }
    102 
    103     *linenumber = lineCount;
    104     return result;
    105 }
    106 
    107 /* Copy a string token into the given UnicodeString.  Upon entry, we
    108    have already read the first character of the string token, which is
    109    not a whitespace character (but may be a QUOTE or ESCAPE). This
    110    function reads all subsequent characters that belong with this
    111    string, and copy them into the token parameter. The other
    112    important, and slightly convoluted purpose of this function is to
    113    merge adjacent strings.  It looks forward a bit, and if the next
    114    non comment, non whitespace item is a string, it reads it in as
    115    well.  If two adjacent strings are quoted, they are merged without
    116    intervening space.  Otherwise a single SPACE character is
    117    inserted. */
    118 static enum ETokenType getStringToken(UCHARBUF* buf,
    119                                       UChar32 initialChar,
    120                                       struct UString *token,
    121                                       UErrorCode *status) {
    122     UBool    lastStringWasQuoted;
    123     UChar32  c;
    124     UChar    target[3] = { '\0' };
    125     UChar    *pTarget   = target;
    126     int      len=0;
    127     UBool    isFollowingCharEscaped=FALSE;
    128     UBool    isNLUnescaped = FALSE;
    129     UChar32  prevC=0;
    130 
    131     /* We are guaranteed on entry that initialChar is not a whitespace
    132        character. If we are at the EOF, or have some other problem, it
    133        doesn't matter; we still want to validly return the initialChar
    134        (if nothing else) as a string token. */
    135 
    136     if (U_FAILURE(*status)) {
    137         return TOK_ERROR;
    138     }
    139 
    140     /* setup */
    141     lastStringWasQuoted = FALSE;
    142     c = initialChar;
    143     ustr_setlen(token, 0, status);
    144 
    145     if (U_FAILURE(*status)) {
    146         return TOK_ERROR;
    147     }
    148 
    149     for (;;) {
    150         if (c == QUOTE) {
    151             if (!lastStringWasQuoted && token->fLength > 0) {
    152                 ustr_ucat(token, SPACE, status);
    153 
    154                 if (U_FAILURE(*status)) {
    155                     return TOK_ERROR;
    156                 }
    157             }
    158 
    159             lastStringWasQuoted = TRUE;
    160 
    161             for (;;) {
    162                 c = ucbuf_getc(buf,status);
    163 
    164                 /* EOF reached */
    165                 if (c == U_EOF) {
    166                     return TOK_EOF;
    167                 }
    168 
    169                 /* Unterminated quoted strings */
    170                 if (U_FAILURE(*status)) {
    171                     return TOK_ERROR;
    172                 }
    173 
    174                 if (c == QUOTE && !isFollowingCharEscaped) {
    175                     break;
    176                 }
    177 
    178                 if (c == ESCAPE  && !isFollowingCharEscaped) {
    179                     pTarget = target;
    180                     c       = unescape(buf, status);
    181 
    182                     if (c == U_ERR) {
    183                         return TOK_ERROR;
    184                     }
    185                     if(c == CR || c == LF){
    186                         isNLUnescaped = TRUE;
    187                     }
    188                 }
    189 
    190                 if(c==ESCAPE && !isFollowingCharEscaped){
    191                     isFollowingCharEscaped = TRUE;
    192                 }else{
    193                     U_APPEND_CHAR32(c, pTarget,len);
    194                     pTarget = target;
    195                     ustr_uscat(token, pTarget,len, status);
    196                     isFollowingCharEscaped = FALSE;
    197                     len=0;
    198                     if(c == CR || c == LF){
    199                         if(isNLUnescaped == FALSE && prevC!=CR){
    200                             lineCount++;
    201                         }
    202                         isNLUnescaped = FALSE;
    203                     }
    204                 }
    205 
    206                 if (U_FAILURE(*status)) {
    207                     return TOK_ERROR;
    208                 }
    209                 prevC = c;
    210             }
    211         } else {
    212             if (token->fLength > 0) {
    213                 ustr_ucat(token, SPACE, status);
    214 
    215                 if (U_FAILURE(*status)) {
    216                     return TOK_ERROR;
    217                 }
    218             }
    219 
    220             if(lastStringWasQuoted){
    221                 if(getShowWarning()){
    222                     warning(lineCount, "Mixing quoted and unquoted strings");
    223                 }
    224                 if(isStrict()){
    225                     return TOK_ERROR;
    226                 }
    227 
    228             }
    229 
    230             lastStringWasQuoted = FALSE;
    231 
    232             /* if we reach here we are mixing
    233              * quoted and unquoted strings
    234              * warn in normal mode and error in
    235              * pedantic mode
    236              */
    237 
    238             if (c == ESCAPE) {
    239                 pTarget = target;
    240                 c       = unescape(buf, status);
    241 
    242                 /* EOF reached */
    243                 if (c == U_EOF) {
    244                     return TOK_ERROR;
    245                 }
    246             }
    247 
    248             U_APPEND_CHAR32(c, pTarget,len);
    249             pTarget = target;
    250             ustr_uscat(token, pTarget,len, status);
    251             len=0;
    252 
    253             if (U_FAILURE(*status)) {
    254                 return TOK_ERROR;
    255             }
    256 
    257             for (;;) {
    258                 /* DON'T skip whitespace */
    259                 c = getNextChar(buf, FALSE, NULL, status);
    260 
    261                 /* EOF reached */
    262                 if (c == U_EOF) {
    263                     ucbuf_ungetc(c, buf);
    264                     return TOK_STRING;
    265                 }
    266 
    267                 if (U_FAILURE(*status)) {
    268                     return TOK_STRING;
    269                 }
    270 
    271                 if (c == QUOTE
    272                         || c == OPENBRACE
    273                         || c == CLOSEBRACE
    274                         || c == COMMA
    275                         || c == COLON) {
    276                     ucbuf_ungetc(c, buf);
    277                     break;
    278                 }
    279 
    280                 if (isWhitespace(c)) {
    281                     break;
    282                 }
    283 
    284                 if (c == ESCAPE) {
    285                     pTarget = target;
    286                     c       = unescape(buf, status);
    287 
    288                     if (c == U_ERR) {
    289                         return TOK_ERROR;
    290                     }
    291                 }
    292 
    293                 U_APPEND_CHAR32(c, pTarget,len);
    294                 pTarget = target;
    295                 ustr_uscat(token, pTarget,len, status);
    296                 len=0;
    297                 if (U_FAILURE(*status)) {
    298                     return TOK_ERROR;
    299                 }
    300             }
    301         }
    302 
    303         /* DO skip whitespace */
    304         c = getNextChar(buf, TRUE, NULL, status);
    305 
    306         if (U_FAILURE(*status)) {
    307             return TOK_STRING;
    308         }
    309 
    310         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
    311             ucbuf_ungetc(c, buf);
    312             return TOK_STRING;
    313         }
    314     }
    315 }
    316 
    317 /* Retrieve the next character.  If skipwhite is
    318    true, whitespace is skipped as well. */
    319 static UChar32 getNextChar(UCHARBUF* buf,
    320                            UBool skipwhite,
    321                            struct UString *token,
    322                            UErrorCode *status) {
    323     UChar32 c, c2;
    324 
    325     if (U_FAILURE(*status)) {
    326         return U_EOF;
    327     }
    328 
    329     for (;;) {
    330         c = ucbuf_getc(buf,status);
    331 
    332         if (c == U_EOF) {
    333             return U_EOF;
    334         }
    335 
    336         if (skipwhite && isWhitespace(c)) {
    337             continue;
    338         }
    339 
    340         /* This also handles the get() failing case */
    341         if (c != SLASH) {
    342             return c;
    343         }
    344 
    345         c = ucbuf_getc(buf,status); /* "/c" */
    346 
    347         if (c == U_EOF) {
    348             return U_EOF;
    349         }
    350 
    351         switch (c) {
    352         case SLASH:  /* "//" */
    353             seekUntilNewline(buf, NULL, status);
    354             break;
    355 
    356         case ASTERISK:  /* " / * " */
    357             c2 = ucbuf_getc(buf, status); /* "/ * c" */
    358             if(c2 == ASTERISK){  /* "/ * *" */
    359                 /* parse multi-line comment and store it in token*/
    360                 seekUntilEndOfComment(buf, token, status);
    361             } else {
    362                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
    363                 seekUntilEndOfComment(buf, NULL, status);
    364             }
    365             break;
    366 
    367         default:
    368             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
    369             /* If get() failed this is a NOP */
    370             return SLASH;
    371         }
    372 
    373     }
    374 }
    375 
    376 static void seekUntilNewline(UCHARBUF* buf,
    377                              struct UString *token,
    378                              UErrorCode *status) {
    379     UChar32 c;
    380 
    381     if (U_FAILURE(*status)) {
    382         return;
    383     }
    384 
    385     do {
    386         c = ucbuf_getc(buf,status);
    387         /* add the char to token */
    388         if(token!=NULL){
    389             ustr_u32cat(token, c, status);
    390         }
    391     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
    392 }
    393 
    394 static void seekUntilEndOfComment(UCHARBUF *buf,
    395                                   struct UString *token,
    396                                   UErrorCode *status) {
    397     UChar32  c, d;
    398     uint32_t line;
    399 
    400     if (U_FAILURE(*status)) {
    401         return;
    402     }
    403 
    404     line = lineCount;
    405 
    406     do {
    407         c = ucbuf_getc(buf, status);
    408 
    409         if (c == ASTERISK) {
    410             d = ucbuf_getc(buf, status);
    411 
    412             if (d != SLASH) {
    413                 ucbuf_ungetc(d, buf);
    414             } else {
    415                 break;
    416             }
    417         }
    418         /* add the char to token */
    419         if(token!=NULL){
    420             ustr_u32cat(token, c, status);
    421         }
    422         /* increment the lineCount */
    423         isNewline(c);
    424 
    425     } while (c != U_EOF && *status == U_ZERO_ERROR);
    426 
    427     if (c == U_EOF) {
    428         *status = U_INVALID_FORMAT_ERROR;
    429         error(line, "unterminated comment detected");
    430     }
    431 }
    432 
    433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
    434     if (U_FAILURE(*status)) {
    435         return U_EOF;
    436     }
    437 
    438     /* We expect to be called after the ESCAPE has been seen, but
    439      * u_fgetcx needs an ESCAPE to do its magic. */
    440     ucbuf_ungetc(ESCAPE, buf);
    441 
    442     return ucbuf_getcx32(buf, status);
    443 }
    444 
    445 static UBool isWhitespace(UChar32 c) {
    446     switch (c) {
    447         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
    448     case 0x000A:
    449     case 0x2029:
    450         lineCount++;
    451     case 0x000D:
    452     case 0x0020:
    453     case 0x0009:
    454     case 0xFEFF:
    455         return TRUE;
    456 
    457     default:
    458         return FALSE;
    459     }
    460 }
    461 
    462 static UBool isNewline(UChar32 c) {
    463     switch (c) {
    464         /* '\n', '\r', 0x2029 */
    465     case 0x000A:
    466     case 0x2029:
    467         lineCount++;
    468     case 0x000D:
    469         return TRUE;
    470 
    471     default:
    472         return FALSE;
    473     }
    474 }
    475