1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This class implements the lexer for assembly files. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/MC/MCParser/AsmLexer.h" 15 #include "llvm/MC/MCAsmInfo.h" 16 #include "llvm/Support/MemoryBuffer.h" 17 #include "llvm/Support/SMLoc.h" 18 #include <cctype> 19 #include <cerrno> 20 #include <cstdio> 21 #include <cstdlib> 22 using namespace llvm; 23 24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27 isAtStartOfLine = true; 28 } 29 30 AsmLexer::~AsmLexer() { 31 } 32 33 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 34 CurBuf = buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf->getBufferStart(); 40 41 TokStart = 0; 42 } 43 44 /// ReturnError - Set the error to the specified string at the specified 45 /// location. This is defined to always return AsmToken::Error. 46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50 } 51 52 int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr-1 != CurBuf->getBufferEnd()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67 } 68 69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70 /// 71 /// The leading integral digit sequence and dot should have already been 72 /// consumed, some or all of the fractional digit sequence *can* have been 73 /// consumed. 74 AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92 } 93 94 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 95 static bool IsIdentifierChar(char c) { 96 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 97 } 98 AsmToken AsmLexer::LexIdentifier() { 99 // Check for floating point literals. 100 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 101 // Disambiguate a .1243foo identifier from a floating literal. 102 while (isdigit(*CurPtr)) 103 ++CurPtr; 104 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 105 return LexFloatLiteral(); 106 } 107 108 while (IsIdentifierChar(*CurPtr)) 109 ++CurPtr; 110 111 // Handle . as a special case. 112 if (CurPtr == TokStart+1 && TokStart[0] == '.') 113 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 114 115 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 116 } 117 118 /// LexSlash: Slash: / 119 /// C-Style Comment: /* ... */ 120 AsmToken AsmLexer::LexSlash() { 121 switch (*CurPtr) { 122 case '*': break; // C style comment. 123 case '/': return ++CurPtr, LexLineComment(); 124 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 125 } 126 127 // C Style comment. 128 ++CurPtr; // skip the star. 129 while (1) { 130 int CurChar = getNextChar(); 131 switch (CurChar) { 132 case EOF: 133 return ReturnError(TokStart, "unterminated comment"); 134 case '*': 135 // End of the comment? 136 if (CurPtr[0] != '/') break; 137 138 ++CurPtr; // End the */. 139 return LexToken(); 140 } 141 } 142 } 143 144 /// LexLineComment: Comment: #[^\n]* 145 /// : //[^\n]* 146 AsmToken AsmLexer::LexLineComment() { 147 // FIXME: This is broken if we happen to a comment at the end of a file, which 148 // was .included, and which doesn't end with a newline. 149 int CurChar = getNextChar(); 150 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 151 CurChar = getNextChar(); 152 153 if (CurChar == EOF) 154 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 155 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 156 } 157 158 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 159 // Skip ULL, UL, U, L and LL suffices. 160 if (CurPtr[0] == 'U') 161 ++CurPtr; 162 if (CurPtr[0] == 'L') 163 ++CurPtr; 164 if (CurPtr[0] == 'L') 165 ++CurPtr; 166 } 167 168 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 169 // integer as a hexadecimal, possibly with leading zeroes. 170 static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 171 const char *FirstHex = 0; 172 const char *LookAhead = CurPtr; 173 while (1) { 174 if (isdigit(*LookAhead)) { 175 ++LookAhead; 176 } else if (isxdigit(*LookAhead)) { 177 if (!FirstHex) 178 FirstHex = LookAhead; 179 ++LookAhead; 180 } else { 181 break; 182 } 183 } 184 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 185 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 186 if (isHex) 187 return 16; 188 return DefaultRadix; 189 } 190 191 /// LexDigit: First character is [0-9]. 192 /// Local Label: [0-9][:] 193 /// Forward/Backward Label: [0-9][fb] 194 /// Binary integer: 0b[01]+ 195 /// Octal integer: 0[0-7]+ 196 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 197 /// Decimal integer: [1-9][0-9]* 198 AsmToken AsmLexer::LexDigit() { 199 // Decimal integer: [1-9][0-9]* 200 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 201 unsigned Radix = doLookAhead(CurPtr, 10); 202 bool isHex = Radix == 16; 203 // Check for floating point literals. 204 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 205 ++CurPtr; 206 return LexFloatLiteral(); 207 } 208 209 StringRef Result(TokStart, CurPtr - TokStart); 210 211 long long Value; 212 if (Result.getAsInteger(Radix, Value)) { 213 // Allow positive values that are too large to fit into a signed 64-bit 214 // integer, but that do fit in an unsigned one, we just convert them over. 215 unsigned long long UValue; 216 if (Result.getAsInteger(Radix, UValue)) 217 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 218 "invalid hexdecimal number"); 219 Value = (long long)UValue; 220 } 221 222 // Consume the [bB][hH]. 223 if (Radix == 2 || Radix == 16) 224 ++CurPtr; 225 226 // The darwin/x86 (and x86-64) assembler accepts and ignores type 227 // suffices on integer literals. 228 SkipIgnoredIntegerSuffix(CurPtr); 229 230 return AsmToken(AsmToken::Integer, Result, Value); 231 } 232 233 if (*CurPtr == 'b') { 234 ++CurPtr; 235 // See if we actually have "0b" as part of something like "jmp 0b\n" 236 if (!isdigit(CurPtr[0])) { 237 --CurPtr; 238 StringRef Result(TokStart, CurPtr - TokStart); 239 return AsmToken(AsmToken::Integer, Result, 0); 240 } 241 const char *NumStart = CurPtr; 242 while (CurPtr[0] == '0' || CurPtr[0] == '1') 243 ++CurPtr; 244 245 // Requires at least one binary digit. 246 if (CurPtr == NumStart) 247 return ReturnError(TokStart, "invalid binary number"); 248 249 StringRef Result(TokStart, CurPtr - TokStart); 250 251 long long Value; 252 if (Result.substr(2).getAsInteger(2, Value)) 253 return ReturnError(TokStart, "invalid binary number"); 254 255 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 256 // suffixes on integer literals. 257 SkipIgnoredIntegerSuffix(CurPtr); 258 259 return AsmToken(AsmToken::Integer, Result, Value); 260 } 261 262 if (*CurPtr == 'x') { 263 ++CurPtr; 264 const char *NumStart = CurPtr; 265 while (isxdigit(CurPtr[0])) 266 ++CurPtr; 267 268 // Requires at least one hex digit. 269 if (CurPtr == NumStart) 270 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 271 272 unsigned long long Result; 273 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 274 return ReturnError(TokStart, "invalid hexadecimal number"); 275 276 // Consume the optional [hH]. 277 if (*CurPtr == 'h' || *CurPtr == 'H') 278 ++CurPtr; 279 280 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 281 // suffixes on integer literals. 282 SkipIgnoredIntegerSuffix(CurPtr); 283 284 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 285 (int64_t)Result); 286 } 287 288 // Either octal or hexadecimal. 289 long long Value; 290 unsigned Radix = doLookAhead(CurPtr, 8); 291 bool isHex = Radix == 16; 292 StringRef Result(TokStart, CurPtr - TokStart); 293 if (Result.getAsInteger(Radix, Value)) 294 return ReturnError(TokStart, !isHex ? "invalid octal number" : 295 "invalid hexdecimal number"); 296 297 // Consume the [hH]. 298 if (Radix == 16) 299 ++CurPtr; 300 301 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 302 // suffixes on integer literals. 303 SkipIgnoredIntegerSuffix(CurPtr); 304 305 return AsmToken(AsmToken::Integer, Result, Value); 306 } 307 308 /// LexSingleQuote: Integer: 'b' 309 AsmToken AsmLexer::LexSingleQuote() { 310 int CurChar = getNextChar(); 311 312 if (CurChar == '\\') 313 CurChar = getNextChar(); 314 315 if (CurChar == EOF) 316 return ReturnError(TokStart, "unterminated single quote"); 317 318 CurChar = getNextChar(); 319 320 if (CurChar != '\'') 321 return ReturnError(TokStart, "single quote way too long"); 322 323 // The idea here being that 'c' is basically just an integral 324 // constant. 325 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 326 long long Value; 327 328 if (Res.startswith("\'\\")) { 329 char theChar = Res[2]; 330 switch (theChar) { 331 default: Value = theChar; break; 332 case '\'': Value = '\''; break; 333 case 't': Value = '\t'; break; 334 case 'n': Value = '\n'; break; 335 case 'b': Value = '\b'; break; 336 } 337 } else 338 Value = TokStart[1]; 339 340 return AsmToken(AsmToken::Integer, Res, Value); 341 } 342 343 344 /// LexQuote: String: "..." 345 AsmToken AsmLexer::LexQuote() { 346 int CurChar = getNextChar(); 347 // TODO: does gas allow multiline string constants? 348 while (CurChar != '"') { 349 if (CurChar == '\\') { 350 // Allow \", etc. 351 CurChar = getNextChar(); 352 } 353 354 if (CurChar == EOF) 355 return ReturnError(TokStart, "unterminated string constant"); 356 357 CurChar = getNextChar(); 358 } 359 360 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 361 } 362 363 StringRef AsmLexer::LexUntilEndOfStatement() { 364 TokStart = CurPtr; 365 366 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 367 !isAtStatementSeparator(CurPtr) && // End of statement marker. 368 *CurPtr != '\n' && 369 *CurPtr != '\r' && 370 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 371 ++CurPtr; 372 } 373 return StringRef(TokStart, CurPtr-TokStart); 374 } 375 376 StringRef AsmLexer::LexUntilEndOfLine() { 377 TokStart = CurPtr; 378 379 while (*CurPtr != '\n' && 380 *CurPtr != '\r' && 381 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 382 ++CurPtr; 383 } 384 return StringRef(TokStart, CurPtr-TokStart); 385 } 386 387 bool AsmLexer::isAtStartOfComment(char Char) { 388 // FIXME: This won't work for multi-character comment indicators like "//". 389 return Char == *MAI.getCommentString(); 390 } 391 392 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 393 return strncmp(Ptr, MAI.getSeparatorString(), 394 strlen(MAI.getSeparatorString())) == 0; 395 } 396 397 AsmToken AsmLexer::LexToken() { 398 TokStart = CurPtr; 399 // This always consumes at least one character. 400 int CurChar = getNextChar(); 401 402 if (isAtStartOfComment(CurChar)) { 403 // If this comment starts with a '#', then return the Hash token and let 404 // the assembler parser see if it can be parsed as a cpp line filename 405 // comment. We do this only if we are at the start of a line. 406 if (CurChar == '#' && isAtStartOfLine) 407 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 408 isAtStartOfLine = true; 409 return LexLineComment(); 410 } 411 if (isAtStatementSeparator(TokStart)) { 412 CurPtr += strlen(MAI.getSeparatorString()) - 1; 413 return AsmToken(AsmToken::EndOfStatement, 414 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 415 } 416 417 // If we're missing a newline at EOF, make sure we still get an 418 // EndOfStatement token before the Eof token. 419 if (CurChar == EOF && !isAtStartOfLine) { 420 isAtStartOfLine = true; 421 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 422 } 423 424 isAtStartOfLine = false; 425 switch (CurChar) { 426 default: 427 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 428 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 429 return LexIdentifier(); 430 431 // Unknown character, emit an error. 432 return ReturnError(TokStart, "invalid character in input"); 433 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 434 case 0: 435 case ' ': 436 case '\t': 437 if (SkipSpace) { 438 // Ignore whitespace. 439 return LexToken(); 440 } else { 441 int len = 1; 442 while (*CurPtr==' ' || *CurPtr=='\t') { 443 CurPtr++; 444 len++; 445 } 446 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 447 } 448 case '\n': // FALL THROUGH. 449 case '\r': 450 isAtStartOfLine = true; 451 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 452 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 453 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 454 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 455 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 456 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 457 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 458 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 459 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 460 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 461 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 462 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 463 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 464 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 465 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 466 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 467 case '=': 468 if (*CurPtr == '=') 469 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 470 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 471 case '|': 472 if (*CurPtr == '|') 473 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 474 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 475 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 476 case '&': 477 if (*CurPtr == '&') 478 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 479 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 480 case '!': 481 if (*CurPtr == '=') 482 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 483 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 484 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 485 case '/': return LexSlash(); 486 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 487 case '\'': return LexSingleQuote(); 488 case '"': return LexQuote(); 489 case '0': case '1': case '2': case '3': case '4': 490 case '5': case '6': case '7': case '8': case '9': 491 return LexDigit(); 492 case '<': 493 switch (*CurPtr) { 494 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 495 StringRef(TokStart, 2)); 496 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 497 StringRef(TokStart, 2)); 498 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 499 StringRef(TokStart, 2)); 500 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 501 } 502 case '>': 503 switch (*CurPtr) { 504 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 505 StringRef(TokStart, 2)); 506 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 507 StringRef(TokStart, 2)); 508 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 509 } 510 511 // TODO: Quoted identifiers (objc methods etc) 512 // local labels: [0-9][:] 513 // Forward/backward labels: [0-9][fb] 514 // Integers, fp constants, character constants. 515 } 516 } 517