1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1998-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * 9 * File read.c 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 05/26/99 stephen Creation. 15 * 5/10/01 Ram removed ustdio dependency 16 ******************************************************************************* 17 */ 18 19 #include "read.h" 20 #include "errmsg.h" 21 #include "unicode/ustring.h" 22 #include "unicode/utf16.h" 23 24 #define OPENBRACE 0x007B 25 #define CLOSEBRACE 0x007D 26 #define COMMA 0x002C 27 #define QUOTE 0x0022 28 #define ESCAPE 0x005C 29 #define SLASH 0x002F 30 #define ASTERISK 0x002A 31 #define SPACE 0x0020 32 #define COLON 0x003A 33 #define BADBOM 0xFFFE 34 #define CR 0x000D 35 #define LF 0x000A 36 37 static int32_t lineCount; 38 39 /* Protos */ 40 static enum ETokenType getStringToken(UCHARBUF *buf, 41 UChar32 initialChar, 42 struct UString *token, 43 UErrorCode *status); 44 45 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); 46 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); 47 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); 48 static UBool isWhitespace (UChar32 c); 49 static UBool isNewline (UChar32 c); 50 51 U_CFUNC void resetLineNumber() { 52 lineCount = 1; 53 } 54 55 /* Read and return the next token from the stream. If the token is of 56 type eString, fill in the token parameter with the token. If the 57 token is eError, then the status parameter will contain the 58 specific error. This will be eItemNotFound at the end of file, 59 indicating that all tokens have been returned. This method will 60 never return eString twice in a row; instead, multiple adjacent 61 string tokens will be merged into one, with no intervening 62 space. */ 63 U_CFUNC enum ETokenType 64 getNextToken(UCHARBUF* buf, 65 struct UString *token, 66 uint32_t *linenumber, /* out: linenumber of token */ 67 struct UString *comment, 68 UErrorCode *status) { 69 enum ETokenType result; 70 UChar32 c; 71 72 if (U_FAILURE(*status)) { 73 return TOK_ERROR; 74 } 75 76 /* Skip whitespace */ 77 c = getNextChar(buf, TRUE, comment, status); 78 79 if (U_FAILURE(*status)) { 80 return TOK_ERROR; 81 } 82 83 *linenumber = lineCount; 84 85 switch(c) { 86 case BADBOM: 87 return TOK_ERROR; 88 case OPENBRACE: 89 return TOK_OPEN_BRACE; 90 case CLOSEBRACE: 91 return TOK_CLOSE_BRACE; 92 case COMMA: 93 return TOK_COMMA; 94 case U_EOF: 95 return TOK_EOF; 96 case COLON: 97 return TOK_COLON; 98 99 default: 100 result = getStringToken(buf, c, token, status); 101 } 102 103 *linenumber = lineCount; 104 return result; 105 } 106 107 /* Copy a string token into the given UnicodeString. Upon entry, we 108 have already read the first character of the string token, which is 109 not a whitespace character (but may be a QUOTE or ESCAPE). This 110 function reads all subsequent characters that belong with this 111 string, and copy them into the token parameter. The other 112 important, and slightly convoluted purpose of this function is to 113 merge adjacent strings. It looks forward a bit, and if the next 114 non comment, non whitespace item is a string, it reads it in as 115 well. If two adjacent strings are quoted, they are merged without 116 intervening space. Otherwise a single SPACE character is 117 inserted. */ 118 static enum ETokenType getStringToken(UCHARBUF* buf, 119 UChar32 initialChar, 120 struct UString *token, 121 UErrorCode *status) { 122 UBool lastStringWasQuoted; 123 UChar32 c; 124 UChar target[3] = { '\0' }; 125 UChar *pTarget = target; 126 int len=0; 127 UBool isFollowingCharEscaped=FALSE; 128 UBool isNLUnescaped = FALSE; 129 UChar32 prevC=0; 130 131 /* We are guaranteed on entry that initialChar is not a whitespace 132 character. If we are at the EOF, or have some other problem, it 133 doesn't matter; we still want to validly return the initialChar 134 (if nothing else) as a string token. */ 135 136 if (U_FAILURE(*status)) { 137 return TOK_ERROR; 138 } 139 140 /* setup */ 141 lastStringWasQuoted = FALSE; 142 c = initialChar; 143 ustr_setlen(token, 0, status); 144 145 if (U_FAILURE(*status)) { 146 return TOK_ERROR; 147 } 148 149 for (;;) { 150 if (c == QUOTE) { 151 if (!lastStringWasQuoted && token->fLength > 0) { 152 ustr_ucat(token, SPACE, status); 153 154 if (U_FAILURE(*status)) { 155 return TOK_ERROR; 156 } 157 } 158 159 lastStringWasQuoted = TRUE; 160 161 for (;;) { 162 c = ucbuf_getc(buf,status); 163 164 /* EOF reached */ 165 if (c == U_EOF) { 166 return TOK_EOF; 167 } 168 169 /* Unterminated quoted strings */ 170 if (U_FAILURE(*status)) { 171 return TOK_ERROR; 172 } 173 174 if (c == QUOTE && !isFollowingCharEscaped) { 175 break; 176 } 177 178 if (c == ESCAPE && !isFollowingCharEscaped) { 179 pTarget = target; 180 c = unescape(buf, status); 181 182 if (c == U_ERR) { 183 return TOK_ERROR; 184 } 185 if(c == CR || c == LF){ 186 isNLUnescaped = TRUE; 187 } 188 } 189 190 if(c==ESCAPE && !isFollowingCharEscaped){ 191 isFollowingCharEscaped = TRUE; 192 }else{ 193 U_APPEND_CHAR32(c, pTarget,len); 194 pTarget = target; 195 ustr_uscat(token, pTarget,len, status); 196 isFollowingCharEscaped = FALSE; 197 len=0; 198 if(c == CR || c == LF){ 199 if(isNLUnescaped == FALSE && prevC!=CR){ 200 lineCount++; 201 } 202 isNLUnescaped = FALSE; 203 } 204 } 205 206 if (U_FAILURE(*status)) { 207 return TOK_ERROR; 208 } 209 prevC = c; 210 } 211 } else { 212 if (token->fLength > 0) { 213 ustr_ucat(token, SPACE, status); 214 215 if (U_FAILURE(*status)) { 216 return TOK_ERROR; 217 } 218 } 219 220 if(lastStringWasQuoted){ 221 if(getShowWarning()){ 222 warning(lineCount, "Mixing quoted and unquoted strings"); 223 } 224 if(isStrict()){ 225 return TOK_ERROR; 226 } 227 228 } 229 230 lastStringWasQuoted = FALSE; 231 232 /* if we reach here we are mixing 233 * quoted and unquoted strings 234 * warn in normal mode and error in 235 * pedantic mode 236 */ 237 238 if (c == ESCAPE) { 239 pTarget = target; 240 c = unescape(buf, status); 241 242 /* EOF reached */ 243 if (c == U_EOF) { 244 return TOK_ERROR; 245 } 246 } 247 248 U_APPEND_CHAR32(c, pTarget,len); 249 pTarget = target; 250 ustr_uscat(token, pTarget,len, status); 251 len=0; 252 253 if (U_FAILURE(*status)) { 254 return TOK_ERROR; 255 } 256 257 for (;;) { 258 /* DON'T skip whitespace */ 259 c = getNextChar(buf, FALSE, NULL, status); 260 261 /* EOF reached */ 262 if (c == U_EOF) { 263 ucbuf_ungetc(c, buf); 264 return TOK_STRING; 265 } 266 267 if (U_FAILURE(*status)) { 268 return TOK_STRING; 269 } 270 271 if (c == QUOTE 272 || c == OPENBRACE 273 || c == CLOSEBRACE 274 || c == COMMA 275 || c == COLON) { 276 ucbuf_ungetc(c, buf); 277 break; 278 } 279 280 if (isWhitespace(c)) { 281 break; 282 } 283 284 if (c == ESCAPE) { 285 pTarget = target; 286 c = unescape(buf, status); 287 288 if (c == U_ERR) { 289 return TOK_ERROR; 290 } 291 } 292 293 U_APPEND_CHAR32(c, pTarget,len); 294 pTarget = target; 295 ustr_uscat(token, pTarget,len, status); 296 len=0; 297 if (U_FAILURE(*status)) { 298 return TOK_ERROR; 299 } 300 } 301 } 302 303 /* DO skip whitespace */ 304 c = getNextChar(buf, TRUE, NULL, status); 305 306 if (U_FAILURE(*status)) { 307 return TOK_STRING; 308 } 309 310 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { 311 ucbuf_ungetc(c, buf); 312 return TOK_STRING; 313 } 314 } 315 } 316 317 /* Retrieve the next character. If skipwhite is 318 true, whitespace is skipped as well. */ 319 static UChar32 getNextChar(UCHARBUF* buf, 320 UBool skipwhite, 321 struct UString *token, 322 UErrorCode *status) { 323 UChar32 c, c2; 324 325 if (U_FAILURE(*status)) { 326 return U_EOF; 327 } 328 329 for (;;) { 330 c = ucbuf_getc(buf,status); 331 332 if (c == U_EOF) { 333 return U_EOF; 334 } 335 336 if (skipwhite && isWhitespace(c)) { 337 continue; 338 } 339 340 /* This also handles the get() failing case */ 341 if (c != SLASH) { 342 return c; 343 } 344 345 c = ucbuf_getc(buf,status); /* "/c" */ 346 347 if (c == U_EOF) { 348 return U_EOF; 349 } 350 351 switch (c) { 352 case SLASH: /* "//" */ 353 seekUntilNewline(buf, NULL, status); 354 break; 355 356 case ASTERISK: /* " / * " */ 357 c2 = ucbuf_getc(buf, status); /* "/ * c" */ 358 if(c2 == ASTERISK){ /* "/ * *" */ 359 /* parse multi-line comment and store it in token*/ 360 seekUntilEndOfComment(buf, token, status); 361 } else { 362 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ 363 seekUntilEndOfComment(buf, NULL, status); 364 } 365 break; 366 367 default: 368 ucbuf_ungetc(c, buf); /* "/c" - put back the c */ 369 /* If get() failed this is a NOP */ 370 return SLASH; 371 } 372 373 } 374 } 375 376 static void seekUntilNewline(UCHARBUF* buf, 377 struct UString *token, 378 UErrorCode *status) { 379 UChar32 c; 380 381 if (U_FAILURE(*status)) { 382 return; 383 } 384 385 do { 386 c = ucbuf_getc(buf,status); 387 /* add the char to token */ 388 if(token!=NULL){ 389 ustr_u32cat(token, c, status); 390 } 391 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); 392 } 393 394 static void seekUntilEndOfComment(UCHARBUF *buf, 395 struct UString *token, 396 UErrorCode *status) { 397 UChar32 c, d; 398 uint32_t line; 399 400 if (U_FAILURE(*status)) { 401 return; 402 } 403 404 line = lineCount; 405 406 do { 407 c = ucbuf_getc(buf, status); 408 409 if (c == ASTERISK) { 410 d = ucbuf_getc(buf, status); 411 412 if (d != SLASH) { 413 ucbuf_ungetc(d, buf); 414 } else { 415 break; 416 } 417 } 418 /* add the char to token */ 419 if(token!=NULL){ 420 ustr_u32cat(token, c, status); 421 } 422 /* increment the lineCount */ 423 isNewline(c); 424 425 } while (c != U_EOF && *status == U_ZERO_ERROR); 426 427 if (c == U_EOF) { 428 *status = U_INVALID_FORMAT_ERROR; 429 error(line, "unterminated comment detected"); 430 } 431 } 432 433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { 434 if (U_FAILURE(*status)) { 435 return U_EOF; 436 } 437 438 /* We expect to be called after the ESCAPE has been seen, but 439 * u_fgetcx needs an ESCAPE to do its magic. */ 440 ucbuf_ungetc(ESCAPE, buf); 441 442 return ucbuf_getcx32(buf, status); 443 } 444 445 static UBool isWhitespace(UChar32 c) { 446 switch (c) { 447 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ 448 case 0x000A: 449 case 0x2029: 450 lineCount++; 451 case 0x000D: 452 case 0x0020: 453 case 0x0009: 454 case 0xFEFF: 455 return TRUE; 456 457 default: 458 return FALSE; 459 } 460 } 461 462 static UBool isNewline(UChar32 c) { 463 switch (c) { 464 /* '\n', '\r', 0x2029 */ 465 case 0x000A: 466 case 0x2029: 467 lineCount++; 468 case 0x000D: 469 return TRUE; 470 471 default: 472 return FALSE; 473 } 474 } 475