Home | History | Annotate | Download | only in genrb
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1998-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *
     11 * File read.c
     12 *
     13 * Modification History:
     14 *
     15 *   Date        Name        Description
     16 *   05/26/99    stephen     Creation.
     17 *   5/10/01     Ram         removed ustdio dependency
     18 *******************************************************************************
     19 */
     20 
     21 #include "read.h"
     22 #include "errmsg.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/utf16.h"
     25 
     26 #define OPENBRACE    0x007B
     27 #define CLOSEBRACE   0x007D
     28 #define COMMA        0x002C
     29 #define QUOTE        0x0022
     30 #define ESCAPE       0x005C
     31 #define SLASH        0x002F
     32 #define ASTERISK     0x002A
     33 #define SPACE        0x0020
     34 #define COLON        0x003A
     35 #define BADBOM       0xFFFE
     36 #define CR           0x000D
     37 #define LF           0x000A
     38 
     39 static int32_t lineCount;
     40 
     41 /* Protos */
     42 static enum ETokenType getStringToken(UCHARBUF *buf,
     43                                       UChar32 initialChar,
     44                                       struct UString *token,
     45                                       UErrorCode *status);
     46 
     47 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
     48 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     49 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
     50 static UBool   isWhitespace          (UChar32 c);
     51 static UBool   isNewline             (UChar32 c);
     52 
     53 U_CFUNC void resetLineNumber() {
     54     lineCount = 1;
     55 }
     56 
     57 /* Read and return the next token from the stream.  If the token is of
     58    type eString, fill in the token parameter with the token.  If the
     59    token is eError, then the status parameter will contain the
     60    specific error.  This will be eItemNotFound at the end of file,
     61    indicating that all tokens have been returned.  This method will
     62    never return eString twice in a row; instead, multiple adjacent
     63    string tokens will be merged into one, with no intervening
     64    space. */
     65 U_CFUNC enum ETokenType
     66 getNextToken(UCHARBUF* buf,
     67              struct UString *token,
     68              uint32_t *linenumber, /* out: linenumber of token */
     69              struct UString *comment,
     70              UErrorCode *status) {
     71     enum ETokenType result;
     72     UChar32         c;
     73 
     74     if (U_FAILURE(*status)) {
     75         return TOK_ERROR;
     76     }
     77 
     78     /* Skip whitespace */
     79     c = getNextChar(buf, TRUE, comment, status);
     80 
     81     if (U_FAILURE(*status)) {
     82         return TOK_ERROR;
     83     }
     84 
     85     *linenumber = lineCount;
     86 
     87     switch(c) {
     88     case BADBOM:
     89         return TOK_ERROR;
     90     case OPENBRACE:
     91         return TOK_OPEN_BRACE;
     92     case CLOSEBRACE:
     93         return TOK_CLOSE_BRACE;
     94     case COMMA:
     95         return TOK_COMMA;
     96     case U_EOF:
     97         return TOK_EOF;
     98     case COLON:
     99         return TOK_COLON;
    100 
    101     default:
    102         result = getStringToken(buf, c, token, status);
    103     }
    104 
    105     *linenumber = lineCount;
    106     return result;
    107 }
    108 
    109 /* Copy a string token into the given UnicodeString.  Upon entry, we
    110    have already read the first character of the string token, which is
    111    not a whitespace character (but may be a QUOTE or ESCAPE). This
    112    function reads all subsequent characters that belong with this
    113    string, and copy them into the token parameter. The other
    114    important, and slightly convoluted purpose of this function is to
    115    merge adjacent strings.  It looks forward a bit, and if the next
    116    non comment, non whitespace item is a string, it reads it in as
    117    well.  If two adjacent strings are quoted, they are merged without
    118    intervening space.  Otherwise a single SPACE character is
    119    inserted. */
    120 static enum ETokenType getStringToken(UCHARBUF* buf,
    121                                       UChar32 initialChar,
    122                                       struct UString *token,
    123                                       UErrorCode *status) {
    124     UBool    lastStringWasQuoted;
    125     UChar32  c;
    126     UChar    target[3] = { '\0' };
    127     UChar    *pTarget   = target;
    128     int      len=0;
    129     UBool    isFollowingCharEscaped=FALSE;
    130     UBool    isNLUnescaped = FALSE;
    131     UChar32  prevC=0;
    132 
    133     /* We are guaranteed on entry that initialChar is not a whitespace
    134        character. If we are at the EOF, or have some other problem, it
    135        doesn't matter; we still want to validly return the initialChar
    136        (if nothing else) as a string token. */
    137 
    138     if (U_FAILURE(*status)) {
    139         return TOK_ERROR;
    140     }
    141 
    142     /* setup */
    143     lastStringWasQuoted = FALSE;
    144     c = initialChar;
    145     ustr_setlen(token, 0, status);
    146 
    147     if (U_FAILURE(*status)) {
    148         return TOK_ERROR;
    149     }
    150 
    151     for (;;) {
    152         if (c == QUOTE) {
    153             if (!lastStringWasQuoted && token->fLength > 0) {
    154                 ustr_ucat(token, SPACE, status);
    155 
    156                 if (U_FAILURE(*status)) {
    157                     return TOK_ERROR;
    158                 }
    159             }
    160 
    161             lastStringWasQuoted = TRUE;
    162 
    163             for (;;) {
    164                 c = ucbuf_getc(buf,status);
    165 
    166                 /* EOF reached */
    167                 if (c == U_EOF) {
    168                     return TOK_EOF;
    169                 }
    170 
    171                 /* Unterminated quoted strings */
    172                 if (U_FAILURE(*status)) {
    173                     return TOK_ERROR;
    174                 }
    175 
    176                 if (c == QUOTE && !isFollowingCharEscaped) {
    177                     break;
    178                 }
    179 
    180                 if (c == ESCAPE  && !isFollowingCharEscaped) {
    181                     pTarget = target;
    182                     c       = unescape(buf, status);
    183 
    184                     if (c == U_ERR) {
    185                         return TOK_ERROR;
    186                     }
    187                     if(c == CR || c == LF){
    188                         isNLUnescaped = TRUE;
    189                     }
    190                 }
    191 
    192                 if(c==ESCAPE && !isFollowingCharEscaped){
    193                     isFollowingCharEscaped = TRUE;
    194                 }else{
    195                     U_APPEND_CHAR32(c, pTarget,len);
    196                     pTarget = target;
    197                     ustr_uscat(token, pTarget,len, status);
    198                     isFollowingCharEscaped = FALSE;
    199                     len=0;
    200                     if(c == CR || c == LF){
    201                         if(isNLUnescaped == FALSE && prevC!=CR){
    202                             lineCount++;
    203                         }
    204                         isNLUnescaped = FALSE;
    205                     }
    206                 }
    207 
    208                 if (U_FAILURE(*status)) {
    209                     return TOK_ERROR;
    210                 }
    211                 prevC = c;
    212             }
    213         } else {
    214             if (token->fLength > 0) {
    215                 ustr_ucat(token, SPACE, status);
    216 
    217                 if (U_FAILURE(*status)) {
    218                     return TOK_ERROR;
    219                 }
    220             }
    221 
    222             if(lastStringWasQuoted){
    223                 if(getShowWarning()){
    224                     warning(lineCount, "Mixing quoted and unquoted strings");
    225                 }
    226                 if(isStrict()){
    227                     return TOK_ERROR;
    228                 }
    229 
    230             }
    231 
    232             lastStringWasQuoted = FALSE;
    233 
    234             /* if we reach here we are mixing
    235              * quoted and unquoted strings
    236              * warn in normal mode and error in
    237              * pedantic mode
    238              */
    239 
    240             if (c == ESCAPE) {
    241                 pTarget = target;
    242                 c       = unescape(buf, status);
    243 
    244                 /* EOF reached */
    245                 if (c == U_EOF) {
    246                     return TOK_ERROR;
    247                 }
    248             }
    249 
    250             U_APPEND_CHAR32(c, pTarget,len);
    251             pTarget = target;
    252             ustr_uscat(token, pTarget,len, status);
    253             len=0;
    254 
    255             if (U_FAILURE(*status)) {
    256                 return TOK_ERROR;
    257             }
    258 
    259             for (;;) {
    260                 /* DON'T skip whitespace */
    261                 c = getNextChar(buf, FALSE, NULL, status);
    262 
    263                 /* EOF reached */
    264                 if (c == U_EOF) {
    265                     ucbuf_ungetc(c, buf);
    266                     return TOK_STRING;
    267                 }
    268 
    269                 if (U_FAILURE(*status)) {
    270                     return TOK_STRING;
    271                 }
    272 
    273                 if (c == QUOTE
    274                         || c == OPENBRACE
    275                         || c == CLOSEBRACE
    276                         || c == COMMA
    277                         || c == COLON) {
    278                     ucbuf_ungetc(c, buf);
    279                     break;
    280                 }
    281 
    282                 if (isWhitespace(c)) {
    283                     break;
    284                 }
    285 
    286                 if (c == ESCAPE) {
    287                     pTarget = target;
    288                     c       = unescape(buf, status);
    289 
    290                     if (c == U_ERR) {
    291                         return TOK_ERROR;
    292                     }
    293                 }
    294 
    295                 U_APPEND_CHAR32(c, pTarget,len);
    296                 pTarget = target;
    297                 ustr_uscat(token, pTarget,len, status);
    298                 len=0;
    299                 if (U_FAILURE(*status)) {
    300                     return TOK_ERROR;
    301                 }
    302             }
    303         }
    304 
    305         /* DO skip whitespace */
    306         c = getNextChar(buf, TRUE, NULL, status);
    307 
    308         if (U_FAILURE(*status)) {
    309             return TOK_STRING;
    310         }
    311 
    312         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
    313             ucbuf_ungetc(c, buf);
    314             return TOK_STRING;
    315         }
    316     }
    317 }
    318 
    319 /* Retrieve the next character.  If skipwhite is
    320    true, whitespace is skipped as well. */
    321 static UChar32 getNextChar(UCHARBUF* buf,
    322                            UBool skipwhite,
    323                            struct UString *token,
    324                            UErrorCode *status) {
    325     UChar32 c, c2;
    326 
    327     if (U_FAILURE(*status)) {
    328         return U_EOF;
    329     }
    330 
    331     for (;;) {
    332         c = ucbuf_getc(buf,status);
    333 
    334         if (c == U_EOF) {
    335             return U_EOF;
    336         }
    337 
    338         if (skipwhite && isWhitespace(c)) {
    339             continue;
    340         }
    341 
    342         /* This also handles the get() failing case */
    343         if (c != SLASH) {
    344             return c;
    345         }
    346 
    347         c = ucbuf_getc(buf,status); /* "/c" */
    348 
    349         if (c == U_EOF) {
    350             return U_EOF;
    351         }
    352 
    353         switch (c) {
    354         case SLASH:  /* "//" */
    355             seekUntilNewline(buf, NULL, status);
    356             break;
    357 
    358         case ASTERISK:  /* " / * " */
    359             c2 = ucbuf_getc(buf, status); /* "/ * c" */
    360             if(c2 == ASTERISK){  /* "/ * *" */
    361                 /* parse multi-line comment and store it in token*/
    362                 seekUntilEndOfComment(buf, token, status);
    363             } else {
    364                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
    365                 seekUntilEndOfComment(buf, NULL, status);
    366             }
    367             break;
    368 
    369         default:
    370             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
    371             /* If get() failed this is a NOP */
    372             return SLASH;
    373         }
    374 
    375     }
    376 }
    377 
    378 static void seekUntilNewline(UCHARBUF* buf,
    379                              struct UString *token,
    380                              UErrorCode *status) {
    381     UChar32 c;
    382 
    383     if (U_FAILURE(*status)) {
    384         return;
    385     }
    386 
    387     do {
    388         c = ucbuf_getc(buf,status);
    389         /* add the char to token */
    390         if(token!=NULL){
    391             ustr_u32cat(token, c, status);
    392         }
    393     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
    394 }
    395 
    396 static void seekUntilEndOfComment(UCHARBUF *buf,
    397                                   struct UString *token,
    398                                   UErrorCode *status) {
    399     UChar32  c, d;
    400     uint32_t line;
    401 
    402     if (U_FAILURE(*status)) {
    403         return;
    404     }
    405 
    406     line = lineCount;
    407 
    408     do {
    409         c = ucbuf_getc(buf, status);
    410 
    411         if (c == ASTERISK) {
    412             d = ucbuf_getc(buf, status);
    413 
    414             if (d != SLASH) {
    415                 ucbuf_ungetc(d, buf);
    416             } else {
    417                 break;
    418             }
    419         }
    420         /* add the char to token */
    421         if(token!=NULL){
    422             ustr_u32cat(token, c, status);
    423         }
    424         /* increment the lineCount */
    425         isNewline(c);
    426 
    427     } while (c != U_EOF && *status == U_ZERO_ERROR);
    428 
    429     if (c == U_EOF) {
    430         *status = U_INVALID_FORMAT_ERROR;
    431         error(line, "unterminated comment detected");
    432     }
    433 }
    434 
    435 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
    436     if (U_FAILURE(*status)) {
    437         return U_EOF;
    438     }
    439 
    440     /* We expect to be called after the ESCAPE has been seen, but
    441      * u_fgetcx needs an ESCAPE to do its magic. */
    442     ucbuf_ungetc(ESCAPE, buf);
    443 
    444     return ucbuf_getcx32(buf, status);
    445 }
    446 
    447 static UBool isWhitespace(UChar32 c) {
    448     switch (c) {
    449         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
    450     case 0x000A:
    451     case 0x2029:
    452         lineCount++;
    453     case 0x000D:
    454     case 0x0020:
    455     case 0x0009:
    456     case 0xFEFF:
    457         return TRUE;
    458 
    459     default:
    460         return FALSE;
    461     }
    462 }
    463 
    464 static UBool isNewline(UChar32 c) {
    465     switch (c) {
    466         /* '\n', '\r', 0x2029 */
    467     case 0x000A:
    468     case 0x2029:
    469         lineCount++;
    470     case 0x000D:
    471         return TRUE;
    472 
    473     default:
    474         return FALSE;
    475     }
    476 }
    477