1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This class implements the lexer for assembly files. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/MC/MCParser/AsmLexer.h" 15 #include "llvm/Support/SMLoc.h" 16 #include "llvm/Support/MemoryBuffer.h" 17 #include "llvm/MC/MCAsmInfo.h" 18 #include <cctype> 19 #include <cerrno> 20 #include <cstdio> 21 #include <cstdlib> 22 using namespace llvm; 23 24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27 isAtStartOfLine = true; 28 } 29 30 AsmLexer::~AsmLexer() { 31 } 32 33 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 34 CurBuf = buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf->getBufferStart(); 40 41 TokStart = 0; 42 } 43 44 /// ReturnError - Set the error to the specified string at the specified 45 /// location. This is defined to always return AsmToken::Error. 46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50 } 51 52 int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr-1 != CurBuf->getBufferEnd()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67 } 68 69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70 /// 71 /// The leading integral digit sequence and dot should have already been 72 /// consumed, some or all of the fractional digit sequence *can* have been 73 /// consumed. 74 AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92 } 93 94 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 95 static bool IsIdentifierChar(char c) { 96 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 97 } 98 AsmToken AsmLexer::LexIdentifier() { 99 // Check for floating point literals. 100 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 101 // Disambiguate a .1243foo identifier from a floating literal. 102 while (isdigit(*CurPtr)) 103 ++CurPtr; 104 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 105 return LexFloatLiteral(); 106 } 107 108 while (IsIdentifierChar(*CurPtr)) 109 ++CurPtr; 110 111 // Handle . as a special case. 112 if (CurPtr == TokStart+1 && TokStart[0] == '.') 113 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 114 115 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 116 } 117 118 /// LexSlash: Slash: / 119 /// C-Style Comment: /* ... */ 120 AsmToken AsmLexer::LexSlash() { 121 switch (*CurPtr) { 122 case '*': break; // C style comment. 123 case '/': return ++CurPtr, LexLineComment(); 124 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 125 } 126 127 // C Style comment. 128 ++CurPtr; // skip the star. 129 while (1) { 130 int CurChar = getNextChar(); 131 switch (CurChar) { 132 case EOF: 133 return ReturnError(TokStart, "unterminated comment"); 134 case '*': 135 // End of the comment? 136 if (CurPtr[0] != '/') break; 137 138 ++CurPtr; // End the */. 139 return LexToken(); 140 } 141 } 142 } 143 144 /// LexLineComment: Comment: #[^\n]* 145 /// : //[^\n]* 146 AsmToken AsmLexer::LexLineComment() { 147 // FIXME: This is broken if we happen to a comment at the end of a file, which 148 // was .included, and which doesn't end with a newline. 149 int CurChar = getNextChar(); 150 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 151 CurChar = getNextChar(); 152 153 if (CurChar == EOF) 154 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 155 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 156 } 157 158 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 159 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 160 CurPtr += 2; 161 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 162 CurPtr += 3; 163 } 164 165 /// LexDigit: First character is [0-9]. 166 /// Local Label: [0-9][:] 167 /// Forward/Backward Label: [0-9][fb] 168 /// Binary integer: 0b[01]+ 169 /// Octal integer: 0[0-7]+ 170 /// Hex integer: 0x[0-9a-fA-F]+ 171 /// Decimal integer: [1-9][0-9]* 172 AsmToken AsmLexer::LexDigit() { 173 // Decimal integer: [1-9][0-9]* 174 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 175 while (isdigit(*CurPtr)) 176 ++CurPtr; 177 178 // Check for floating point literals. 179 if (*CurPtr == '.' || *CurPtr == 'e') { 180 ++CurPtr; 181 return LexFloatLiteral(); 182 } 183 184 StringRef Result(TokStart, CurPtr - TokStart); 185 186 long long Value; 187 if (Result.getAsInteger(10, Value)) { 188 // Allow positive values that are too large to fit into a signed 64-bit 189 // integer, but that do fit in an unsigned one, we just convert them over. 190 unsigned long long UValue; 191 if (Result.getAsInteger(10, UValue)) 192 return ReturnError(TokStart, "invalid decimal number"); 193 Value = (long long)UValue; 194 } 195 196 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 197 // suffixes on integer literals. 198 SkipIgnoredIntegerSuffix(CurPtr); 199 200 return AsmToken(AsmToken::Integer, Result, Value); 201 } 202 203 if (*CurPtr == 'b') { 204 ++CurPtr; 205 // See if we actually have "0b" as part of something like "jmp 0b\n" 206 if (!isdigit(CurPtr[0])) { 207 --CurPtr; 208 StringRef Result(TokStart, CurPtr - TokStart); 209 return AsmToken(AsmToken::Integer, Result, 0); 210 } 211 const char *NumStart = CurPtr; 212 while (CurPtr[0] == '0' || CurPtr[0] == '1') 213 ++CurPtr; 214 215 // Requires at least one binary digit. 216 if (CurPtr == NumStart) 217 return ReturnError(TokStart, "invalid binary number"); 218 219 StringRef Result(TokStart, CurPtr - TokStart); 220 221 long long Value; 222 if (Result.substr(2).getAsInteger(2, Value)) 223 return ReturnError(TokStart, "invalid binary number"); 224 225 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 226 // suffixes on integer literals. 227 SkipIgnoredIntegerSuffix(CurPtr); 228 229 return AsmToken(AsmToken::Integer, Result, Value); 230 } 231 232 if (*CurPtr == 'x') { 233 ++CurPtr; 234 const char *NumStart = CurPtr; 235 while (isxdigit(CurPtr[0])) 236 ++CurPtr; 237 238 // Requires at least one hex digit. 239 if (CurPtr == NumStart) 240 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 241 242 unsigned long long Result; 243 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 244 return ReturnError(TokStart, "invalid hexadecimal number"); 245 246 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 247 // suffixes on integer literals. 248 SkipIgnoredIntegerSuffix(CurPtr); 249 250 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 251 (int64_t)Result); 252 } 253 254 // Must be an octal number, it starts with 0. 255 while (*CurPtr >= '0' && *CurPtr <= '9') 256 ++CurPtr; 257 258 StringRef Result(TokStart, CurPtr - TokStart); 259 long long Value; 260 if (Result.getAsInteger(8, Value)) 261 return ReturnError(TokStart, "invalid octal number"); 262 263 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 264 // suffixes on integer literals. 265 SkipIgnoredIntegerSuffix(CurPtr); 266 267 return AsmToken(AsmToken::Integer, Result, Value); 268 } 269 270 /// LexSingleQuote: Integer: 'b' 271 AsmToken AsmLexer::LexSingleQuote() { 272 int CurChar = getNextChar(); 273 274 if (CurChar == '\\') 275 CurChar = getNextChar(); 276 277 if (CurChar == EOF) 278 return ReturnError(TokStart, "unterminated single quote"); 279 280 CurChar = getNextChar(); 281 282 if (CurChar != '\'') 283 return ReturnError(TokStart, "single quote way too long"); 284 285 // The idea here being that 'c' is basically just an integral 286 // constant. 287 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 288 long long Value; 289 290 if (Res.startswith("\'\\")) { 291 char theChar = Res[2]; 292 switch (theChar) { 293 default: Value = theChar; break; 294 case '\'': Value = '\''; break; 295 case 't': Value = '\t'; break; 296 case 'n': Value = '\n'; break; 297 case 'b': Value = '\b'; break; 298 } 299 } else 300 Value = TokStart[1]; 301 302 return AsmToken(AsmToken::Integer, Res, Value); 303 } 304 305 306 /// LexQuote: String: "..." 307 AsmToken AsmLexer::LexQuote() { 308 int CurChar = getNextChar(); 309 // TODO: does gas allow multiline string constants? 310 while (CurChar != '"') { 311 if (CurChar == '\\') { 312 // Allow \", etc. 313 CurChar = getNextChar(); 314 } 315 316 if (CurChar == EOF) 317 return ReturnError(TokStart, "unterminated string constant"); 318 319 CurChar = getNextChar(); 320 } 321 322 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 323 } 324 325 StringRef AsmLexer::LexUntilEndOfStatement() { 326 TokStart = CurPtr; 327 328 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 329 !isAtStatementSeparator(CurPtr) && // End of statement marker. 330 *CurPtr != '\n' && 331 *CurPtr != '\r' && 332 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 333 ++CurPtr; 334 } 335 return StringRef(TokStart, CurPtr-TokStart); 336 } 337 338 StringRef AsmLexer::LexUntilEndOfLine() { 339 TokStart = CurPtr; 340 341 while (*CurPtr != '\n' && 342 *CurPtr != '\r' && 343 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 344 ++CurPtr; 345 } 346 return StringRef(TokStart, CurPtr-TokStart); 347 } 348 349 bool AsmLexer::isAtStartOfComment(char Char) { 350 // FIXME: This won't work for multi-character comment indicators like "//". 351 return Char == *MAI.getCommentString(); 352 } 353 354 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 355 return strncmp(Ptr, MAI.getSeparatorString(), 356 strlen(MAI.getSeparatorString())) == 0; 357 } 358 359 AsmToken AsmLexer::LexToken() { 360 TokStart = CurPtr; 361 // This always consumes at least one character. 362 int CurChar = getNextChar(); 363 364 if (isAtStartOfComment(CurChar)) { 365 // If this comment starts with a '#', then return the Hash token and let 366 // the assembler parser see if it can be parsed as a cpp line filename 367 // comment. We do this only if we are at the start of a line. 368 if (CurChar == '#' && isAtStartOfLine) 369 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 370 isAtStartOfLine = true; 371 return LexLineComment(); 372 } 373 if (isAtStatementSeparator(TokStart)) { 374 CurPtr += strlen(MAI.getSeparatorString()) - 1; 375 return AsmToken(AsmToken::EndOfStatement, 376 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 377 } 378 379 // If we're missing a newline at EOF, make sure we still get an 380 // EndOfStatement token before the Eof token. 381 if (CurChar == EOF && !isAtStartOfLine) { 382 isAtStartOfLine = true; 383 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 384 } 385 386 isAtStartOfLine = false; 387 switch (CurChar) { 388 default: 389 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 390 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 391 return LexIdentifier(); 392 393 // Unknown character, emit an error. 394 return ReturnError(TokStart, "invalid character in input"); 395 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 396 case 0: 397 case ' ': 398 case '\t': 399 // Ignore whitespace. 400 return LexToken(); 401 case '\n': // FALL THROUGH. 402 case '\r': 403 isAtStartOfLine = true; 404 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 405 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 406 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 407 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 408 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 409 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 410 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 411 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 412 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 413 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 414 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 415 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 416 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 417 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 418 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 419 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 420 case '=': 421 if (*CurPtr == '=') 422 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 423 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 424 case '|': 425 if (*CurPtr == '|') 426 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 427 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 428 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 429 case '&': 430 if (*CurPtr == '&') 431 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 432 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 433 case '!': 434 if (*CurPtr == '=') 435 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 436 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 437 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 438 case '/': return LexSlash(); 439 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 440 case '\'': return LexSingleQuote(); 441 case '"': return LexQuote(); 442 case '0': case '1': case '2': case '3': case '4': 443 case '5': case '6': case '7': case '8': case '9': 444 return LexDigit(); 445 case '<': 446 switch (*CurPtr) { 447 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 448 StringRef(TokStart, 2)); 449 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 450 StringRef(TokStart, 2)); 451 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 452 StringRef(TokStart, 2)); 453 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 454 } 455 case '>': 456 switch (*CurPtr) { 457 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 458 StringRef(TokStart, 2)); 459 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 460 StringRef(TokStart, 2)); 461 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 462 } 463 464 // TODO: Quoted identifiers (objc methods etc) 465 // local labels: [0-9][:] 466 // Forward/backward labels: [0-9][fb] 467 // Integers, fp constants, character constants. 468 } 469 } 470