1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Implement the Lexer for TableGen. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "TGLexer.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/ADT/Twine.h" 17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 18 #include "llvm/Support/Compiler.h" 19 #include "llvm/Support/MemoryBuffer.h" 20 #include "llvm/Support/SourceMgr.h" 21 #include "llvm/TableGen/Error.h" 22 #include <cctype> 23 #include <cerrno> 24 #include <cstdint> 25 #include <cstdio> 26 #include <cstdlib> 27 #include <cstring> 28 29 using namespace llvm; 30 31 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 32 CurBuffer = SrcMgr.getMainFileID(); 33 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 34 CurPtr = CurBuf.begin(); 35 TokStart = nullptr; 36 } 37 38 SMLoc TGLexer::getLoc() const { 39 return SMLoc::getFromPointer(TokStart); 40 } 41 42 /// ReturnError - Set the error to the specified string at the specified 43 /// location. This is defined to always return tgtok::Error. 44 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 45 PrintError(Loc, Msg); 46 return tgtok::Error; 47 } 48 49 int TGLexer::getNextChar() { 50 char CurChar = *CurPtr++; 51 switch (CurChar) { 52 default: 53 return (unsigned char)CurChar; 54 case 0: { 55 // A nul character in the stream is either the end of the current buffer or 56 // a random nul in the file. Disambiguate that here. 57 if (CurPtr-1 != CurBuf.end()) 58 return 0; // Just whitespace. 59 60 // If this is the end of an included file, pop the parent file off the 61 // include stack. 62 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 63 if (ParentIncludeLoc != SMLoc()) { 64 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 65 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 66 CurPtr = ParentIncludeLoc.getPointer(); 67 return getNextChar(); 68 } 69 70 // Otherwise, return end of file. 71 --CurPtr; // Another call to lex will return EOF again. 72 return EOF; 73 } 74 case '\n': 75 case '\r': 76 // Handle the newline character by ignoring it and incrementing the line 77 // count. However, be careful about 'dos style' files with \n\r in them. 78 // Only treat a \n\r or \r\n as a single line. 79 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 80 *CurPtr != CurChar) 81 ++CurPtr; // Eat the two char newline sequence. 82 return '\n'; 83 } 84 } 85 86 int TGLexer::peekNextChar(int Index) { 87 return *(CurPtr + Index); 88 } 89 90 tgtok::TokKind TGLexer::LexToken() { 91 TokStart = CurPtr; 92 // This always consumes at least one character. 93 int CurChar = getNextChar(); 94 95 switch (CurChar) { 96 default: 97 // Handle letters: [a-zA-Z_] 98 if (isalpha(CurChar) || CurChar == '_') 99 return LexIdentifier(); 100 101 // Unknown character, emit an error. 102 return ReturnError(TokStart, "Unexpected character"); 103 case EOF: return tgtok::Eof; 104 case ':': return tgtok::colon; 105 case ';': return tgtok::semi; 106 case '.': return tgtok::period; 107 case ',': return tgtok::comma; 108 case '<': return tgtok::less; 109 case '>': return tgtok::greater; 110 case ']': return tgtok::r_square; 111 case '{': return tgtok::l_brace; 112 case '}': return tgtok::r_brace; 113 case '(': return tgtok::l_paren; 114 case ')': return tgtok::r_paren; 115 case '=': return tgtok::equal; 116 case '?': return tgtok::question; 117 case '#': return tgtok::paste; 118 119 case 0: 120 case ' ': 121 case '\t': 122 case '\n': 123 case '\r': 124 // Ignore whitespace. 125 return LexToken(); 126 case '/': 127 // If this is the start of a // comment, skip until the end of the line or 128 // the end of the buffer. 129 if (*CurPtr == '/') 130 SkipBCPLComment(); 131 else if (*CurPtr == '*') { 132 if (SkipCComment()) 133 return tgtok::Error; 134 } else // Otherwise, this is an error. 135 return ReturnError(TokStart, "Unexpected character"); 136 return LexToken(); 137 case '-': case '+': 138 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 139 case '7': case '8': case '9': { 140 int NextChar = 0; 141 if (isdigit(CurChar)) { 142 // Allow identifiers to start with a number if it is followed by 143 // an identifier. This can happen with paste operations like 144 // foo#8i. 145 int i = 0; 146 do { 147 NextChar = peekNextChar(i++); 148 } while (isdigit(NextChar)); 149 150 if (NextChar == 'x' || NextChar == 'b') { 151 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 152 // likely a number. 153 int NextNextChar = peekNextChar(i); 154 switch (NextNextChar) { 155 default: 156 break; 157 case '0': case '1': 158 if (NextChar == 'b') 159 return LexNumber(); 160 LLVM_FALLTHROUGH; 161 case '2': case '3': case '4': case '5': 162 case '6': case '7': case '8': case '9': 163 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 164 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 165 if (NextChar == 'x') 166 return LexNumber(); 167 break; 168 } 169 } 170 } 171 172 if (isalpha(NextChar) || NextChar == '_') 173 return LexIdentifier(); 174 175 return LexNumber(); 176 } 177 case '"': return LexString(); 178 case '$': return LexVarName(); 179 case '[': return LexBracket(); 180 case '!': return LexExclaim(); 181 } 182 } 183 184 /// LexString - Lex "[^"]*" 185 tgtok::TokKind TGLexer::LexString() { 186 const char *StrStart = CurPtr; 187 188 CurStrVal = ""; 189 190 while (*CurPtr != '"') { 191 // If we hit the end of the buffer, report an error. 192 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 193 return ReturnError(StrStart, "End of file in string literal"); 194 195 if (*CurPtr == '\n' || *CurPtr == '\r') 196 return ReturnError(StrStart, "End of line in string literal"); 197 198 if (*CurPtr != '\\') { 199 CurStrVal += *CurPtr++; 200 continue; 201 } 202 203 ++CurPtr; 204 205 switch (*CurPtr) { 206 case '\\': case '\'': case '"': 207 // These turn into their literal character. 208 CurStrVal += *CurPtr++; 209 break; 210 case 't': 211 CurStrVal += '\t'; 212 ++CurPtr; 213 break; 214 case 'n': 215 CurStrVal += '\n'; 216 ++CurPtr; 217 break; 218 219 case '\n': 220 case '\r': 221 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 222 223 // If we hit the end of the buffer, report an error. 224 case '\0': 225 if (CurPtr == CurBuf.end()) 226 return ReturnError(StrStart, "End of file in string literal"); 227 LLVM_FALLTHROUGH; 228 default: 229 return ReturnError(CurPtr, "invalid escape in string literal"); 230 } 231 } 232 233 ++CurPtr; 234 return tgtok::StrVal; 235 } 236 237 tgtok::TokKind TGLexer::LexVarName() { 238 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 239 return ReturnError(TokStart, "Invalid variable name"); 240 241 // Otherwise, we're ok, consume the rest of the characters. 242 const char *VarNameStart = CurPtr++; 243 244 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 245 ++CurPtr; 246 247 CurStrVal.assign(VarNameStart, CurPtr); 248 return tgtok::VarName; 249 } 250 251 tgtok::TokKind TGLexer::LexIdentifier() { 252 // The first letter is [a-zA-Z_#]. 253 const char *IdentStart = TokStart; 254 255 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 256 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 257 ++CurPtr; 258 259 // Check to see if this identifier is a keyword. 260 StringRef Str(IdentStart, CurPtr-IdentStart); 261 262 if (Str == "include") { 263 if (LexInclude()) return tgtok::Error; 264 return Lex(); 265 } 266 267 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 268 .Case("int", tgtok::Int) 269 .Case("bit", tgtok::Bit) 270 .Case("bits", tgtok::Bits) 271 .Case("string", tgtok::String) 272 .Case("list", tgtok::List) 273 .Case("code", tgtok::Code) 274 .Case("dag", tgtok::Dag) 275 .Case("class", tgtok::Class) 276 .Case("def", tgtok::Def) 277 .Case("foreach", tgtok::Foreach) 278 .Case("defm", tgtok::Defm) 279 .Case("defset", tgtok::Defset) 280 .Case("multiclass", tgtok::MultiClass) 281 .Case("field", tgtok::Field) 282 .Case("let", tgtok::Let) 283 .Case("in", tgtok::In) 284 .Default(tgtok::Id); 285 286 if (Kind == tgtok::Id) 287 CurStrVal.assign(Str.begin(), Str.end()); 288 return Kind; 289 } 290 291 /// LexInclude - We just read the "include" token. Get the string token that 292 /// comes next and enter the include. 293 bool TGLexer::LexInclude() { 294 // The token after the include must be a string. 295 tgtok::TokKind Tok = LexToken(); 296 if (Tok == tgtok::Error) return true; 297 if (Tok != tgtok::StrVal) { 298 PrintError(getLoc(), "Expected filename after include"); 299 return true; 300 } 301 302 // Get the string. 303 std::string Filename = CurStrVal; 304 std::string IncludedFile; 305 306 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 307 IncludedFile); 308 if (!CurBuffer) { 309 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 310 return true; 311 } 312 313 DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); 314 if (Found != Dependencies.end()) { 315 PrintError(getLoc(), 316 "File '" + IncludedFile + "' has already been included."); 317 SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, 318 "previously included here"); 319 return true; 320 } 321 Dependencies.insert(std::make_pair(IncludedFile, getLoc())); 322 // Save the line number and lex buffer of the includer. 323 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 324 CurPtr = CurBuf.begin(); 325 return false; 326 } 327 328 void TGLexer::SkipBCPLComment() { 329 ++CurPtr; // skip the second slash. 330 while (true) { 331 switch (*CurPtr) { 332 case '\n': 333 case '\r': 334 return; // Newline is end of comment. 335 case 0: 336 // If this is the end of the buffer, end the comment. 337 if (CurPtr == CurBuf.end()) 338 return; 339 break; 340 } 341 // Otherwise, skip the character. 342 ++CurPtr; 343 } 344 } 345 346 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 347 /// is that we allow nesting. 348 bool TGLexer::SkipCComment() { 349 ++CurPtr; // skip the star. 350 unsigned CommentDepth = 1; 351 352 while (true) { 353 int CurChar = getNextChar(); 354 switch (CurChar) { 355 case EOF: 356 PrintError(TokStart, "Unterminated comment!"); 357 return true; 358 case '*': 359 // End of the comment? 360 if (CurPtr[0] != '/') break; 361 362 ++CurPtr; // End the */. 363 if (--CommentDepth == 0) 364 return false; 365 break; 366 case '/': 367 // Start of a nested comment? 368 if (CurPtr[0] != '*') break; 369 ++CurPtr; 370 ++CommentDepth; 371 break; 372 } 373 } 374 } 375 376 /// LexNumber - Lex: 377 /// [-+]?[0-9]+ 378 /// 0x[0-9a-fA-F]+ 379 /// 0b[01]+ 380 tgtok::TokKind TGLexer::LexNumber() { 381 if (CurPtr[-1] == '0') { 382 if (CurPtr[0] == 'x') { 383 ++CurPtr; 384 const char *NumStart = CurPtr; 385 while (isxdigit(CurPtr[0])) 386 ++CurPtr; 387 388 // Requires at least one hex digit. 389 if (CurPtr == NumStart) 390 return ReturnError(TokStart, "Invalid hexadecimal number"); 391 392 errno = 0; 393 CurIntVal = strtoll(NumStart, nullptr, 16); 394 if (errno == EINVAL) 395 return ReturnError(TokStart, "Invalid hexadecimal number"); 396 if (errno == ERANGE) { 397 errno = 0; 398 CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); 399 if (errno == EINVAL) 400 return ReturnError(TokStart, "Invalid hexadecimal number"); 401 if (errno == ERANGE) 402 return ReturnError(TokStart, "Hexadecimal number out of range"); 403 } 404 return tgtok::IntVal; 405 } else if (CurPtr[0] == 'b') { 406 ++CurPtr; 407 const char *NumStart = CurPtr; 408 while (CurPtr[0] == '0' || CurPtr[0] == '1') 409 ++CurPtr; 410 411 // Requires at least one binary digit. 412 if (CurPtr == NumStart) 413 return ReturnError(CurPtr-2, "Invalid binary number"); 414 CurIntVal = strtoll(NumStart, nullptr, 2); 415 return tgtok::BinaryIntVal; 416 } 417 } 418 419 // Check for a sign without a digit. 420 if (!isdigit(CurPtr[0])) { 421 if (CurPtr[-1] == '-') 422 return tgtok::minus; 423 else if (CurPtr[-1] == '+') 424 return tgtok::plus; 425 } 426 427 while (isdigit(CurPtr[0])) 428 ++CurPtr; 429 CurIntVal = strtoll(TokStart, nullptr, 10); 430 return tgtok::IntVal; 431 } 432 433 /// LexBracket - We just read '['. If this is a code block, return it, 434 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 435 tgtok::TokKind TGLexer::LexBracket() { 436 if (CurPtr[0] != '{') 437 return tgtok::l_square; 438 ++CurPtr; 439 const char *CodeStart = CurPtr; 440 while (true) { 441 int Char = getNextChar(); 442 if (Char == EOF) break; 443 444 if (Char != '}') continue; 445 446 Char = getNextChar(); 447 if (Char == EOF) break; 448 if (Char == ']') { 449 CurStrVal.assign(CodeStart, CurPtr-2); 450 return tgtok::CodeFragment; 451 } 452 } 453 454 return ReturnError(CodeStart-2, "Unterminated Code Block"); 455 } 456 457 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 458 tgtok::TokKind TGLexer::LexExclaim() { 459 if (!isalpha(*CurPtr)) 460 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 461 462 const char *Start = CurPtr++; 463 while (isalpha(*CurPtr)) 464 ++CurPtr; 465 466 // Check to see which operator this is. 467 tgtok::TokKind Kind = 468 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 469 .Case("eq", tgtok::XEq) 470 .Case("ne", tgtok::XNe) 471 .Case("le", tgtok::XLe) 472 .Case("lt", tgtok::XLt) 473 .Case("ge", tgtok::XGe) 474 .Case("gt", tgtok::XGt) 475 .Case("if", tgtok::XIf) 476 .Case("isa", tgtok::XIsA) 477 .Case("head", tgtok::XHead) 478 .Case("tail", tgtok::XTail) 479 .Case("size", tgtok::XSize) 480 .Case("con", tgtok::XConcat) 481 .Case("dag", tgtok::XDag) 482 .Case("add", tgtok::XADD) 483 .Case("and", tgtok::XAND) 484 .Case("or", tgtok::XOR) 485 .Case("shl", tgtok::XSHL) 486 .Case("sra", tgtok::XSRA) 487 .Case("srl", tgtok::XSRL) 488 .Case("cast", tgtok::XCast) 489 .Case("empty", tgtok::XEmpty) 490 .Case("subst", tgtok::XSubst) 491 .Case("foldl", tgtok::XFoldl) 492 .Case("foreach", tgtok::XForEach) 493 .Case("listconcat", tgtok::XListConcat) 494 .Case("strconcat", tgtok::XStrConcat) 495 .Default(tgtok::Error); 496 497 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 498 } 499