1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1998-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * 9 * File read.c 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 05/26/99 stephen Creation. 15 * 5/10/01 Ram removed ustdio dependency 16 ******************************************************************************* 17 */ 18 19 #include "read.h" 20 #include "errmsg.h" 21 #include "unicode/ustring.h" 22 23 #define OPENBRACE 0x007B 24 #define CLOSEBRACE 0x007D 25 #define COMMA 0x002C 26 #define QUOTE 0x0022 27 #define ESCAPE 0x005C 28 #define SLASH 0x002F 29 #define ASTERISK 0x002A 30 #define SPACE 0x0020 31 #define COLON 0x003A 32 #define BADBOM 0xFFFE 33 #define CR 0x000D 34 #define LF 0x000A 35 36 static int32_t lineCount; 37 38 /* Protos */ 39 static enum ETokenType getStringToken(UCHARBUF *buf, 40 UChar32 initialChar, 41 struct UString *token, 42 UErrorCode *status); 43 44 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); 45 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); 46 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); 47 static UBool isWhitespace (UChar32 c); 48 static UBool isNewline (UChar32 c); 49 50 void resetLineNumber() { 51 lineCount = 1; 52 } 53 54 /* Read and return the next token from the stream. If the token is of 55 type eString, fill in the token parameter with the token. If the 56 token is eError, then the status parameter will contain the 57 specific error. This will be eItemNotFound at the end of file, 58 indicating that all tokens have been returned. This method will 59 never return eString twice in a row; instead, multiple adjacent 60 string tokens will be merged into one, with no intervening 61 space. */ 62 enum ETokenType getNextToken(UCHARBUF* buf, 63 struct UString *token, 64 uint32_t *linenumber, /* out: linenumber of token */ 65 struct UString *comment, 66 UErrorCode *status) { 67 enum ETokenType result; 68 UChar32 c; 69 70 if (U_FAILURE(*status)) { 71 return TOK_ERROR; 72 } 73 74 /* Skip whitespace */ 75 c = getNextChar(buf, TRUE, comment, status); 76 77 if (U_FAILURE(*status)) { 78 return TOK_ERROR; 79 } 80 81 *linenumber = lineCount; 82 83 switch(c) { 84 case BADBOM: 85 return TOK_ERROR; 86 case OPENBRACE: 87 return TOK_OPEN_BRACE; 88 case CLOSEBRACE: 89 return TOK_CLOSE_BRACE; 90 case COMMA: 91 return TOK_COMMA; 92 case U_EOF: 93 return TOK_EOF; 94 case COLON: 95 return TOK_COLON; 96 97 default: 98 result = getStringToken(buf, c, token, status); 99 } 100 101 *linenumber = lineCount; 102 return result; 103 } 104 105 /* Copy a string token into the given UnicodeString. Upon entry, we 106 have already read the first character of the string token, which is 107 not a whitespace character (but may be a QUOTE or ESCAPE). This 108 function reads all subsequent characters that belong with this 109 string, and copy them into the token parameter. The other 110 important, and slightly convoluted purpose of this function is to 111 merge adjacent strings. It looks forward a bit, and if the next 112 non comment, non whitespace item is a string, it reads it in as 113 well. If two adjacent strings are quoted, they are merged without 114 intervening space. Otherwise a single SPACE character is 115 inserted. */ 116 static enum ETokenType getStringToken(UCHARBUF* buf, 117 UChar32 initialChar, 118 struct UString *token, 119 UErrorCode *status) { 120 UBool lastStringWasQuoted; 121 UChar32 c; 122 UChar target[3] = { '\0' }; 123 UChar *pTarget = target; 124 int len=0; 125 UBool isFollowingCharEscaped=FALSE; 126 UBool isNLUnescaped = FALSE; 127 UChar32 prevC=0; 128 129 /* We are guaranteed on entry that initialChar is not a whitespace 130 character. If we are at the EOF, or have some other problem, it 131 doesn't matter; we still want to validly return the initialChar 132 (if nothing else) as a string token. */ 133 134 if (U_FAILURE(*status)) { 135 return TOK_ERROR; 136 } 137 138 /* setup */ 139 lastStringWasQuoted = FALSE; 140 c = initialChar; 141 ustr_setlen(token, 0, status); 142 143 if (U_FAILURE(*status)) { 144 return TOK_ERROR; 145 } 146 147 for (;;) { 148 if (c == QUOTE) { 149 if (!lastStringWasQuoted && token->fLength > 0) { 150 ustr_ucat(token, SPACE, status); 151 152 if (U_FAILURE(*status)) { 153 return TOK_ERROR; 154 } 155 } 156 157 lastStringWasQuoted = TRUE; 158 159 for (;;) { 160 c = ucbuf_getc(buf,status); 161 162 /* EOF reached */ 163 if (c == U_EOF) { 164 return TOK_EOF; 165 } 166 167 /* Unterminated quoted strings */ 168 if (U_FAILURE(*status)) { 169 return TOK_ERROR; 170 } 171 172 if (c == QUOTE && !isFollowingCharEscaped) { 173 break; 174 } 175 176 if (c == ESCAPE && !isFollowingCharEscaped) { 177 pTarget = target; 178 c = unescape(buf, status); 179 180 if (c == U_ERR) { 181 return TOK_ERROR; 182 } 183 if(c == CR || c == LF){ 184 isNLUnescaped = TRUE; 185 } 186 } 187 188 if(c==ESCAPE && !isFollowingCharEscaped){ 189 isFollowingCharEscaped = TRUE; 190 }else{ 191 U_APPEND_CHAR32(c, pTarget,len); 192 pTarget = target; 193 ustr_uscat(token, pTarget,len, status); 194 isFollowingCharEscaped = FALSE; 195 len=0; 196 if(c == CR || c == LF){ 197 if(isNLUnescaped == FALSE && prevC!=CR){ 198 lineCount++; 199 } 200 isNLUnescaped = FALSE; 201 } 202 } 203 204 if (U_FAILURE(*status)) { 205 return TOK_ERROR; 206 } 207 prevC = c; 208 } 209 } else { 210 if (token->fLength > 0) { 211 ustr_ucat(token, SPACE, status); 212 213 if (U_FAILURE(*status)) { 214 return TOK_ERROR; 215 } 216 } 217 218 if(lastStringWasQuoted){ 219 if(getShowWarning()){ 220 warning(lineCount, "Mixing quoted and unquoted strings"); 221 } 222 if(isStrict()){ 223 return TOK_ERROR; 224 } 225 226 } 227 228 lastStringWasQuoted = FALSE; 229 230 /* if we reach here we are mixing 231 * quoted and unquoted strings 232 * warn in normal mode and error in 233 * pedantic mode 234 */ 235 236 if (c == ESCAPE) { 237 pTarget = target; 238 c = unescape(buf, status); 239 240 /* EOF reached */ 241 if (c == U_EOF) { 242 return TOK_ERROR; 243 } 244 } 245 246 U_APPEND_CHAR32(c, pTarget,len); 247 pTarget = target; 248 ustr_uscat(token, pTarget,len, status); 249 len=0; 250 251 if (U_FAILURE(*status)) { 252 return TOK_ERROR; 253 } 254 255 for (;;) { 256 /* DON'T skip whitespace */ 257 c = getNextChar(buf, FALSE, NULL, status); 258 259 /* EOF reached */ 260 if (c == U_EOF) { 261 ucbuf_ungetc(c, buf); 262 return TOK_STRING; 263 } 264 265 if (U_FAILURE(*status)) { 266 return TOK_STRING; 267 } 268 269 if (c == QUOTE 270 || c == OPENBRACE 271 || c == CLOSEBRACE 272 || c == COMMA 273 || c == COLON) { 274 ucbuf_ungetc(c, buf); 275 break; 276 } 277 278 if (isWhitespace(c)) { 279 break; 280 } 281 282 if (c == ESCAPE) { 283 pTarget = target; 284 c = unescape(buf, status); 285 286 if (c == U_ERR) { 287 return TOK_ERROR; 288 } 289 } 290 291 U_APPEND_CHAR32(c, pTarget,len); 292 pTarget = target; 293 ustr_uscat(token, pTarget,len, status); 294 len=0; 295 if (U_FAILURE(*status)) { 296 return TOK_ERROR; 297 } 298 } 299 } 300 301 /* DO skip whitespace */ 302 c = getNextChar(buf, TRUE, NULL, status); 303 304 if (U_FAILURE(*status)) { 305 return TOK_STRING; 306 } 307 308 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { 309 ucbuf_ungetc(c, buf); 310 return TOK_STRING; 311 } 312 } 313 } 314 315 /* Retrieve the next character. If skipwhite is 316 true, whitespace is skipped as well. */ 317 static UChar32 getNextChar(UCHARBUF* buf, 318 UBool skipwhite, 319 struct UString *token, 320 UErrorCode *status) { 321 UChar32 c, c2; 322 323 if (U_FAILURE(*status)) { 324 return U_EOF; 325 } 326 327 for (;;) { 328 c = ucbuf_getc(buf,status); 329 330 if (c == U_EOF) { 331 return U_EOF; 332 } 333 334 if (skipwhite && isWhitespace(c)) { 335 continue; 336 } 337 338 /* This also handles the get() failing case */ 339 if (c != SLASH) { 340 return c; 341 } 342 343 c = ucbuf_getc(buf,status); /* "/c" */ 344 345 if (c == U_EOF) { 346 return U_EOF; 347 } 348 349 switch (c) { 350 case SLASH: /* "//" */ 351 seekUntilNewline(buf, NULL, status); 352 break; 353 354 case ASTERISK: /* " / * " */ 355 c2 = ucbuf_getc(buf, status); /* "/ * c" */ 356 if(c2 == ASTERISK){ /* "/ * *" */ 357 /* parse multi-line comment and store it in token*/ 358 seekUntilEndOfComment(buf, token, status); 359 } else { 360 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ 361 seekUntilEndOfComment(buf, NULL, status); 362 } 363 break; 364 365 default: 366 ucbuf_ungetc(c, buf); /* "/c" - put back the c */ 367 /* If get() failed this is a NOP */ 368 return SLASH; 369 } 370 371 } 372 } 373 374 static void seekUntilNewline(UCHARBUF* buf, 375 struct UString *token, 376 UErrorCode *status) { 377 UChar32 c; 378 379 if (U_FAILURE(*status)) { 380 return; 381 } 382 383 do { 384 c = ucbuf_getc(buf,status); 385 /* add the char to token */ 386 if(token!=NULL){ 387 ustr_u32cat(token, c, status); 388 } 389 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); 390 } 391 392 static void seekUntilEndOfComment(UCHARBUF *buf, 393 struct UString *token, 394 UErrorCode *status) { 395 UChar32 c, d; 396 uint32_t line; 397 398 if (U_FAILURE(*status)) { 399 return; 400 } 401 402 line = lineCount; 403 404 do { 405 c = ucbuf_getc(buf, status); 406 407 if (c == ASTERISK) { 408 d = ucbuf_getc(buf, status); 409 410 if (d != SLASH) { 411 ucbuf_ungetc(d, buf); 412 } else { 413 break; 414 } 415 } 416 /* add the char to token */ 417 if(token!=NULL){ 418 ustr_u32cat(token, c, status); 419 } 420 /* increment the lineCount */ 421 isNewline(c); 422 423 } while (c != U_EOF && *status == U_ZERO_ERROR); 424 425 if (c == U_EOF) { 426 *status = U_INVALID_FORMAT_ERROR; 427 error(line, "unterminated comment detected"); 428 } 429 } 430 431 UChar32 unescape(UCHARBUF *buf, 432 UErrorCode *status) { 433 if (U_FAILURE(*status)) { 434 return U_EOF; 435 } 436 437 /* We expect to be called after the ESCAPE has been seen, but 438 * u_fgetcx needs an ESCAPE to do its magic. */ 439 ucbuf_ungetc(ESCAPE, buf); 440 441 return ucbuf_getcx32(buf, status); 442 } 443 444 static UBool isWhitespace(UChar32 c) { 445 switch (c) { 446 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ 447 case 0x000A: 448 case 0x2029: 449 lineCount++; 450 case 0x000D: 451 case 0x0020: 452 case 0x0009: 453 case 0xFEFF: 454 return TRUE; 455 456 default: 457 return FALSE; 458 } 459 } 460 461 static UBool isNewline(UChar32 c) { 462 switch (c) { 463 /* '\n', '\r', 0x2029 */ 464 case 0x000A: 465 case 0x2029: 466 lineCount++; 467 case 0x000D: 468 return TRUE; 469 470 default: 471 return FALSE; 472 } 473 } 474