1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This class implements the lexer for assembly files. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/MC/MCParser/AsmLexer.h" 15 #include "llvm/MC/MCAsmInfo.h" 16 #include "llvm/Support/MemoryBuffer.h" 17 #include "llvm/Support/SMLoc.h" 18 #include <cctype> 19 #include <cerrno> 20 #include <cstdio> 21 #include <cstdlib> 22 using namespace llvm; 23 24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurPtr = nullptr; 26 isAtStartOfLine = true; 27 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); 28 } 29 30 AsmLexer::~AsmLexer() { 31 } 32 33 void AsmLexer::setBuffer(StringRef Buf, const char *ptr) { 34 CurBuf = Buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf.begin(); 40 41 TokStart = nullptr; 42 } 43 44 /// ReturnError - Set the error to the specified string at the specified 45 /// location. This is defined to always return AsmToken::Error. 46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50 } 51 52 int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr - 1 != CurBuf.end()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67 } 68 69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70 /// 71 /// The leading integral digit sequence and dot should have already been 72 /// consumed, some or all of the fractional digit sequence *can* have been 73 /// consumed. 74 AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92 } 93 94 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 95 /// while making sure there are enough actual digits around for the constant to 96 /// be valid. 97 /// 98 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 99 /// before we get here. 100 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 101 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 102 "unexpected parse state in floating hex"); 103 bool NoFracDigits = true; 104 105 // Skip the fractional part if there is one 106 if (*CurPtr == '.') { 107 ++CurPtr; 108 109 const char *FracStart = CurPtr; 110 while (isxdigit(*CurPtr)) 111 ++CurPtr; 112 113 NoFracDigits = CurPtr == FracStart; 114 } 115 116 if (NoIntDigits && NoFracDigits) 117 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 118 "expected at least one significand digit"); 119 120 // Make sure we do have some kind of proper exponent part 121 if (*CurPtr != 'p' && *CurPtr != 'P') 122 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 123 "expected exponent part 'p'"); 124 ++CurPtr; 125 126 if (*CurPtr == '+' || *CurPtr == '-') 127 ++CurPtr; 128 129 // N.b. exponent digits are *not* hex 130 const char *ExpStart = CurPtr; 131 while (isdigit(*CurPtr)) 132 ++CurPtr; 133 134 if (CurPtr == ExpStart) 135 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 136 "expected at least one exponent digit"); 137 138 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 139 } 140 141 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]* 142 static bool IsIdentifierChar(char c, bool AllowAt) { 143 return isalnum(c) || c == '_' || c == '$' || c == '.' || 144 (c == '@' && AllowAt) || c == '?'; 145 } 146 AsmToken AsmLexer::LexIdentifier() { 147 // Check for floating point literals. 148 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 149 // Disambiguate a .1243foo identifier from a floating literal. 150 while (isdigit(*CurPtr)) 151 ++CurPtr; 152 if (*CurPtr == 'e' || *CurPtr == 'E' || 153 !IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) 154 return LexFloatLiteral(); 155 } 156 157 while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) 158 ++CurPtr; 159 160 // Handle . as a special case. 161 if (CurPtr == TokStart+1 && TokStart[0] == '.') 162 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 163 164 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 165 } 166 167 /// LexSlash: Slash: / 168 /// C-Style Comment: /* ... */ 169 AsmToken AsmLexer::LexSlash() { 170 switch (*CurPtr) { 171 case '*': break; // C style comment. 172 case '/': return ++CurPtr, LexLineComment(); 173 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 174 } 175 176 // C Style comment. 177 ++CurPtr; // skip the star. 178 while (1) { 179 int CurChar = getNextChar(); 180 switch (CurChar) { 181 case EOF: 182 return ReturnError(TokStart, "unterminated comment"); 183 case '*': 184 // End of the comment? 185 if (CurPtr[0] != '/') break; 186 187 ++CurPtr; // End the */. 188 return LexToken(); 189 } 190 } 191 } 192 193 /// LexLineComment: Comment: #[^\n]* 194 /// : //[^\n]* 195 AsmToken AsmLexer::LexLineComment() { 196 // FIXME: This is broken if we happen to a comment at the end of a file, which 197 // was .included, and which doesn't end with a newline. 198 int CurChar = getNextChar(); 199 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 200 CurChar = getNextChar(); 201 202 if (CurChar == EOF) 203 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 204 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 205 } 206 207 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 208 // Skip ULL, UL, U, L and LL suffices. 209 if (CurPtr[0] == 'U') 210 ++CurPtr; 211 if (CurPtr[0] == 'L') 212 ++CurPtr; 213 if (CurPtr[0] == 'L') 214 ++CurPtr; 215 } 216 217 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 218 // integer as a hexadecimal, possibly with leading zeroes. 219 static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 220 const char *FirstHex = nullptr; 221 const char *LookAhead = CurPtr; 222 while (1) { 223 if (isdigit(*LookAhead)) { 224 ++LookAhead; 225 } else if (isxdigit(*LookAhead)) { 226 if (!FirstHex) 227 FirstHex = LookAhead; 228 ++LookAhead; 229 } else { 230 break; 231 } 232 } 233 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 234 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 235 if (isHex) 236 return 16; 237 return DefaultRadix; 238 } 239 240 static AsmToken intToken(StringRef Ref, APInt &Value) 241 { 242 if (Value.isIntN(64)) 243 return AsmToken(AsmToken::Integer, Ref, Value); 244 return AsmToken(AsmToken::BigNum, Ref, Value); 245 } 246 247 /// LexDigit: First character is [0-9]. 248 /// Local Label: [0-9][:] 249 /// Forward/Backward Label: [0-9][fb] 250 /// Binary integer: 0b[01]+ 251 /// Octal integer: 0[0-7]+ 252 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 253 /// Decimal integer: [1-9][0-9]* 254 AsmToken AsmLexer::LexDigit() { 255 // Decimal integer: [1-9][0-9]* 256 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 257 unsigned Radix = doLookAhead(CurPtr, 10); 258 bool isHex = Radix == 16; 259 // Check for floating point literals. 260 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 261 ++CurPtr; 262 return LexFloatLiteral(); 263 } 264 265 StringRef Result(TokStart, CurPtr - TokStart); 266 267 APInt Value(128, 0, true); 268 if (Result.getAsInteger(Radix, Value)) 269 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 270 "invalid hexdecimal number"); 271 272 // Consume the [bB][hH]. 273 if (Radix == 2 || Radix == 16) 274 ++CurPtr; 275 276 // The darwin/x86 (and x86-64) assembler accepts and ignores type 277 // suffices on integer literals. 278 SkipIgnoredIntegerSuffix(CurPtr); 279 280 return intToken(Result, Value); 281 } 282 283 if (*CurPtr == 'b') { 284 ++CurPtr; 285 // See if we actually have "0b" as part of something like "jmp 0b\n" 286 if (!isdigit(CurPtr[0])) { 287 --CurPtr; 288 StringRef Result(TokStart, CurPtr - TokStart); 289 return AsmToken(AsmToken::Integer, Result, 0); 290 } 291 const char *NumStart = CurPtr; 292 while (CurPtr[0] == '0' || CurPtr[0] == '1') 293 ++CurPtr; 294 295 // Requires at least one binary digit. 296 if (CurPtr == NumStart) 297 return ReturnError(TokStart, "invalid binary number"); 298 299 StringRef Result(TokStart, CurPtr - TokStart); 300 301 APInt Value(128, 0, true); 302 if (Result.substr(2).getAsInteger(2, Value)) 303 return ReturnError(TokStart, "invalid binary number"); 304 305 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 306 // suffixes on integer literals. 307 SkipIgnoredIntegerSuffix(CurPtr); 308 309 return intToken(Result, Value); 310 } 311 312 if (*CurPtr == 'x') { 313 ++CurPtr; 314 const char *NumStart = CurPtr; 315 while (isxdigit(CurPtr[0])) 316 ++CurPtr; 317 318 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 319 // diagnosed by LexHexFloatLiteral). 320 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 321 return LexHexFloatLiteral(NumStart == CurPtr); 322 323 // Otherwise requires at least one hex digit. 324 if (CurPtr == NumStart) 325 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 326 327 APInt Result(128, 0); 328 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 329 return ReturnError(TokStart, "invalid hexadecimal number"); 330 331 // Consume the optional [hH]. 332 if (*CurPtr == 'h' || *CurPtr == 'H') 333 ++CurPtr; 334 335 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 336 // suffixes on integer literals. 337 SkipIgnoredIntegerSuffix(CurPtr); 338 339 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 340 } 341 342 // Either octal or hexadecimal. 343 APInt Value(128, 0, true); 344 unsigned Radix = doLookAhead(CurPtr, 8); 345 bool isHex = Radix == 16; 346 StringRef Result(TokStart, CurPtr - TokStart); 347 if (Result.getAsInteger(Radix, Value)) 348 return ReturnError(TokStart, !isHex ? "invalid octal number" : 349 "invalid hexdecimal number"); 350 351 // Consume the [hH]. 352 if (Radix == 16) 353 ++CurPtr; 354 355 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 356 // suffixes on integer literals. 357 SkipIgnoredIntegerSuffix(CurPtr); 358 359 return intToken(Result, Value); 360 } 361 362 /// LexSingleQuote: Integer: 'b' 363 AsmToken AsmLexer::LexSingleQuote() { 364 int CurChar = getNextChar(); 365 366 if (CurChar == '\\') 367 CurChar = getNextChar(); 368 369 if (CurChar == EOF) 370 return ReturnError(TokStart, "unterminated single quote"); 371 372 CurChar = getNextChar(); 373 374 if (CurChar != '\'') 375 return ReturnError(TokStart, "single quote way too long"); 376 377 // The idea here being that 'c' is basically just an integral 378 // constant. 379 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 380 long long Value; 381 382 if (Res.startswith("\'\\")) { 383 char theChar = Res[2]; 384 switch (theChar) { 385 default: Value = theChar; break; 386 case '\'': Value = '\''; break; 387 case 't': Value = '\t'; break; 388 case 'n': Value = '\n'; break; 389 case 'b': Value = '\b'; break; 390 } 391 } else 392 Value = TokStart[1]; 393 394 return AsmToken(AsmToken::Integer, Res, Value); 395 } 396 397 398 /// LexQuote: String: "..." 399 AsmToken AsmLexer::LexQuote() { 400 int CurChar = getNextChar(); 401 // TODO: does gas allow multiline string constants? 402 while (CurChar != '"') { 403 if (CurChar == '\\') { 404 // Allow \", etc. 405 CurChar = getNextChar(); 406 } 407 408 if (CurChar == EOF) 409 return ReturnError(TokStart, "unterminated string constant"); 410 411 CurChar = getNextChar(); 412 } 413 414 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 415 } 416 417 StringRef AsmLexer::LexUntilEndOfStatement() { 418 TokStart = CurPtr; 419 420 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 421 !isAtStatementSeparator(CurPtr) && // End of statement marker. 422 *CurPtr != '\n' && *CurPtr != '\r' && 423 (*CurPtr != 0 || CurPtr != CurBuf.end())) { 424 ++CurPtr; 425 } 426 return StringRef(TokStart, CurPtr-TokStart); 427 } 428 429 StringRef AsmLexer::LexUntilEndOfLine() { 430 TokStart = CurPtr; 431 432 while (*CurPtr != '\n' && *CurPtr != '\r' && 433 (*CurPtr != 0 || CurPtr != CurBuf.end())) { 434 ++CurPtr; 435 } 436 return StringRef(TokStart, CurPtr-TokStart); 437 } 438 439 const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { 440 const char *SavedTokStart = TokStart; 441 const char *SavedCurPtr = CurPtr; 442 bool SavedAtStartOfLine = isAtStartOfLine; 443 bool SavedSkipSpace = SkipSpace; 444 445 std::string SavedErr = getErr(); 446 SMLoc SavedErrLoc = getErrLoc(); 447 448 SkipSpace = ShouldSkipSpace; 449 AsmToken Token = LexToken(); 450 451 SetError(SavedErrLoc, SavedErr); 452 453 SkipSpace = SavedSkipSpace; 454 isAtStartOfLine = SavedAtStartOfLine; 455 CurPtr = SavedCurPtr; 456 TokStart = SavedTokStart; 457 458 return Token; 459 } 460 461 bool AsmLexer::isAtStartOfComment(char Char) { 462 // FIXME: This won't work for multi-character comment indicators like "//". 463 return Char == *MAI.getCommentString(); 464 } 465 466 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 467 return strncmp(Ptr, MAI.getSeparatorString(), 468 strlen(MAI.getSeparatorString())) == 0; 469 } 470 471 AsmToken AsmLexer::LexToken() { 472 TokStart = CurPtr; 473 // This always consumes at least one character. 474 int CurChar = getNextChar(); 475 476 if (isAtStartOfComment(CurChar)) { 477 // If this comment starts with a '#', then return the Hash token and let 478 // the assembler parser see if it can be parsed as a cpp line filename 479 // comment. We do this only if we are at the start of a line. 480 if (CurChar == '#' && isAtStartOfLine) 481 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 482 isAtStartOfLine = true; 483 return LexLineComment(); 484 } 485 if (isAtStatementSeparator(TokStart)) { 486 CurPtr += strlen(MAI.getSeparatorString()) - 1; 487 return AsmToken(AsmToken::EndOfStatement, 488 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 489 } 490 491 // If we're missing a newline at EOF, make sure we still get an 492 // EndOfStatement token before the Eof token. 493 if (CurChar == EOF && !isAtStartOfLine) { 494 isAtStartOfLine = true; 495 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 496 } 497 498 isAtStartOfLine = false; 499 switch (CurChar) { 500 default: 501 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 502 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 503 return LexIdentifier(); 504 505 // Unknown character, emit an error. 506 return ReturnError(TokStart, "invalid character in input"); 507 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 508 case 0: 509 case ' ': 510 case '\t': 511 if (SkipSpace) { 512 // Ignore whitespace. 513 return LexToken(); 514 } else { 515 int len = 1; 516 while (*CurPtr==' ' || *CurPtr=='\t') { 517 CurPtr++; 518 len++; 519 } 520 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 521 } 522 case '\n': // FALL THROUGH. 523 case '\r': 524 isAtStartOfLine = true; 525 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 526 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 527 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 528 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 529 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 530 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 531 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 532 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 533 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 534 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 535 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 536 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 537 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 538 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 539 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 540 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 541 case '=': 542 if (*CurPtr == '=') 543 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 544 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 545 case '|': 546 if (*CurPtr == '|') 547 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 548 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 549 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 550 case '&': 551 if (*CurPtr == '&') 552 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 553 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 554 case '!': 555 if (*CurPtr == '=') 556 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 557 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 558 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 559 case '/': return LexSlash(); 560 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 561 case '\'': return LexSingleQuote(); 562 case '"': return LexQuote(); 563 case '0': case '1': case '2': case '3': case '4': 564 case '5': case '6': case '7': case '8': case '9': 565 return LexDigit(); 566 case '<': 567 switch (*CurPtr) { 568 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 569 StringRef(TokStart, 2)); 570 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 571 StringRef(TokStart, 2)); 572 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 573 StringRef(TokStart, 2)); 574 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 575 } 576 case '>': 577 switch (*CurPtr) { 578 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 579 StringRef(TokStart, 2)); 580 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 581 StringRef(TokStart, 2)); 582 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 583 } 584 585 // TODO: Quoted identifiers (objc methods etc) 586 // local labels: [0-9][:] 587 // Forward/backward labels: [0-9][fb] 588 // Integers, fp constants, character constants. 589 } 590 } 591