1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Implement the Lexer for TableGen. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "TGLexer.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/ADT/Twine.h" 17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 18 #include "llvm/Support/MemoryBuffer.h" 19 #include "llvm/Support/SourceMgr.h" 20 #include "llvm/TableGen/Error.h" 21 #include <cctype> 22 #include <cerrno> 23 #include <cstdio> 24 #include <cstdlib> 25 #include <cstring> 26 27 using namespace llvm; 28 29 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 30 CurBuffer = SrcMgr.getMainFileID(); 31 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 32 CurPtr = CurBuf.begin(); 33 TokStart = nullptr; 34 } 35 36 SMLoc TGLexer::getLoc() const { 37 return SMLoc::getFromPointer(TokStart); 38 } 39 40 /// ReturnError - Set the error to the specified string at the specified 41 /// location. This is defined to always return tgtok::Error. 42 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 43 PrintError(Loc, Msg); 44 return tgtok::Error; 45 } 46 47 int TGLexer::getNextChar() { 48 char CurChar = *CurPtr++; 49 switch (CurChar) { 50 default: 51 return (unsigned char)CurChar; 52 case 0: { 53 // A nul character in the stream is either the end of the current buffer or 54 // a random nul in the file. Disambiguate that here. 55 if (CurPtr-1 != CurBuf.end()) 56 return 0; // Just whitespace. 57 58 // If this is the end of an included file, pop the parent file off the 59 // include stack. 60 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 61 if (ParentIncludeLoc != SMLoc()) { 62 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 63 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 64 CurPtr = ParentIncludeLoc.getPointer(); 65 return getNextChar(); 66 } 67 68 // Otherwise, return end of file. 69 --CurPtr; // Another call to lex will return EOF again. 70 return EOF; 71 } 72 case '\n': 73 case '\r': 74 // Handle the newline character by ignoring it and incrementing the line 75 // count. However, be careful about 'dos style' files with \n\r in them. 76 // Only treat a \n\r or \r\n as a single line. 77 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 78 *CurPtr != CurChar) 79 ++CurPtr; // Eat the two char newline sequence. 80 return '\n'; 81 } 82 } 83 84 int TGLexer::peekNextChar(int Index) { 85 return *(CurPtr + Index); 86 } 87 88 tgtok::TokKind TGLexer::LexToken() { 89 TokStart = CurPtr; 90 // This always consumes at least one character. 91 int CurChar = getNextChar(); 92 93 switch (CurChar) { 94 default: 95 // Handle letters: [a-zA-Z_] 96 if (isalpha(CurChar) || CurChar == '_') 97 return LexIdentifier(); 98 99 // Unknown character, emit an error. 100 return ReturnError(TokStart, "Unexpected character"); 101 case EOF: return tgtok::Eof; 102 case ':': return tgtok::colon; 103 case ';': return tgtok::semi; 104 case '.': return tgtok::period; 105 case ',': return tgtok::comma; 106 case '<': return tgtok::less; 107 case '>': return tgtok::greater; 108 case ']': return tgtok::r_square; 109 case '{': return tgtok::l_brace; 110 case '}': return tgtok::r_brace; 111 case '(': return tgtok::l_paren; 112 case ')': return tgtok::r_paren; 113 case '=': return tgtok::equal; 114 case '?': return tgtok::question; 115 case '#': return tgtok::paste; 116 117 case 0: 118 case ' ': 119 case '\t': 120 case '\n': 121 case '\r': 122 // Ignore whitespace. 123 return LexToken(); 124 case '/': 125 // If this is the start of a // comment, skip until the end of the line or 126 // the end of the buffer. 127 if (*CurPtr == '/') 128 SkipBCPLComment(); 129 else if (*CurPtr == '*') { 130 if (SkipCComment()) 131 return tgtok::Error; 132 } else // Otherwise, this is an error. 133 return ReturnError(TokStart, "Unexpected character"); 134 return LexToken(); 135 case '-': case '+': 136 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 137 case '7': case '8': case '9': { 138 int NextChar = 0; 139 if (isdigit(CurChar)) { 140 // Allow identifiers to start with a number if it is followed by 141 // an identifier. This can happen with paste operations like 142 // foo#8i. 143 int i = 0; 144 do { 145 NextChar = peekNextChar(i++); 146 } while (isdigit(NextChar)); 147 148 if (NextChar == 'x' || NextChar == 'b') { 149 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 150 // likely a number. 151 int NextNextChar = peekNextChar(i); 152 switch (NextNextChar) { 153 default: 154 break; 155 case '0': case '1': 156 if (NextChar == 'b') 157 return LexNumber(); 158 // Fallthrough 159 case '2': case '3': case '4': case '5': 160 case '6': case '7': case '8': case '9': 161 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 162 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 163 if (NextChar == 'x') 164 return LexNumber(); 165 break; 166 } 167 } 168 } 169 170 if (isalpha(NextChar) || NextChar == '_') 171 return LexIdentifier(); 172 173 return LexNumber(); 174 } 175 case '"': return LexString(); 176 case '$': return LexVarName(); 177 case '[': return LexBracket(); 178 case '!': return LexExclaim(); 179 } 180 } 181 182 /// LexString - Lex "[^"]*" 183 tgtok::TokKind TGLexer::LexString() { 184 const char *StrStart = CurPtr; 185 186 CurStrVal = ""; 187 188 while (*CurPtr != '"') { 189 // If we hit the end of the buffer, report an error. 190 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 191 return ReturnError(StrStart, "End of file in string literal"); 192 193 if (*CurPtr == '\n' || *CurPtr == '\r') 194 return ReturnError(StrStart, "End of line in string literal"); 195 196 if (*CurPtr != '\\') { 197 CurStrVal += *CurPtr++; 198 continue; 199 } 200 201 ++CurPtr; 202 203 switch (*CurPtr) { 204 case '\\': case '\'': case '"': 205 // These turn into their literal character. 206 CurStrVal += *CurPtr++; 207 break; 208 case 't': 209 CurStrVal += '\t'; 210 ++CurPtr; 211 break; 212 case 'n': 213 CurStrVal += '\n'; 214 ++CurPtr; 215 break; 216 217 case '\n': 218 case '\r': 219 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 220 221 // If we hit the end of the buffer, report an error. 222 case '\0': 223 if (CurPtr == CurBuf.end()) 224 return ReturnError(StrStart, "End of file in string literal"); 225 // FALL THROUGH 226 default: 227 return ReturnError(CurPtr, "invalid escape in string literal"); 228 } 229 } 230 231 ++CurPtr; 232 return tgtok::StrVal; 233 } 234 235 tgtok::TokKind TGLexer::LexVarName() { 236 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 237 return ReturnError(TokStart, "Invalid variable name"); 238 239 // Otherwise, we're ok, consume the rest of the characters. 240 const char *VarNameStart = CurPtr++; 241 242 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 243 ++CurPtr; 244 245 CurStrVal.assign(VarNameStart, CurPtr); 246 return tgtok::VarName; 247 } 248 249 250 tgtok::TokKind TGLexer::LexIdentifier() { 251 // The first letter is [a-zA-Z_#]. 252 const char *IdentStart = TokStart; 253 254 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 255 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 256 ++CurPtr; 257 258 // Check to see if this identifier is a keyword. 259 StringRef Str(IdentStart, CurPtr-IdentStart); 260 261 if (Str == "include") { 262 if (LexInclude()) return tgtok::Error; 263 return Lex(); 264 } 265 266 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 267 .Case("int", tgtok::Int) 268 .Case("bit", tgtok::Bit) 269 .Case("bits", tgtok::Bits) 270 .Case("string", tgtok::String) 271 .Case("list", tgtok::List) 272 .Case("code", tgtok::Code) 273 .Case("dag", tgtok::Dag) 274 .Case("class", tgtok::Class) 275 .Case("def", tgtok::Def) 276 .Case("foreach", tgtok::Foreach) 277 .Case("defm", tgtok::Defm) 278 .Case("multiclass", tgtok::MultiClass) 279 .Case("field", tgtok::Field) 280 .Case("let", tgtok::Let) 281 .Case("in", tgtok::In) 282 .Default(tgtok::Id); 283 284 if (Kind == tgtok::Id) 285 CurStrVal.assign(Str.begin(), Str.end()); 286 return Kind; 287 } 288 289 /// LexInclude - We just read the "include" token. Get the string token that 290 /// comes next and enter the include. 291 bool TGLexer::LexInclude() { 292 // The token after the include must be a string. 293 tgtok::TokKind Tok = LexToken(); 294 if (Tok == tgtok::Error) return true; 295 if (Tok != tgtok::StrVal) { 296 PrintError(getLoc(), "Expected filename after include"); 297 return true; 298 } 299 300 // Get the string. 301 std::string Filename = CurStrVal; 302 std::string IncludedFile; 303 304 305 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 306 IncludedFile); 307 if (!CurBuffer) { 308 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 309 return true; 310 } 311 312 DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); 313 if (Found != Dependencies.end()) { 314 PrintError(getLoc(), 315 "File '" + IncludedFile + "' has already been included."); 316 SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, 317 "previously included here"); 318 return true; 319 } 320 Dependencies.insert(std::make_pair(IncludedFile, getLoc())); 321 // Save the line number and lex buffer of the includer. 322 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 323 CurPtr = CurBuf.begin(); 324 return false; 325 } 326 327 void TGLexer::SkipBCPLComment() { 328 ++CurPtr; // skip the second slash. 329 while (1) { 330 switch (*CurPtr) { 331 case '\n': 332 case '\r': 333 return; // Newline is end of comment. 334 case 0: 335 // If this is the end of the buffer, end the comment. 336 if (CurPtr == CurBuf.end()) 337 return; 338 break; 339 } 340 // Otherwise, skip the character. 341 ++CurPtr; 342 } 343 } 344 345 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 346 /// is that we allow nesting. 347 bool TGLexer::SkipCComment() { 348 ++CurPtr; // skip the star. 349 unsigned CommentDepth = 1; 350 351 while (1) { 352 int CurChar = getNextChar(); 353 switch (CurChar) { 354 case EOF: 355 PrintError(TokStart, "Unterminated comment!"); 356 return true; 357 case '*': 358 // End of the comment? 359 if (CurPtr[0] != '/') break; 360 361 ++CurPtr; // End the */. 362 if (--CommentDepth == 0) 363 return false; 364 break; 365 case '/': 366 // Start of a nested comment? 367 if (CurPtr[0] != '*') break; 368 ++CurPtr; 369 ++CommentDepth; 370 break; 371 } 372 } 373 } 374 375 /// LexNumber - Lex: 376 /// [-+]?[0-9]+ 377 /// 0x[0-9a-fA-F]+ 378 /// 0b[01]+ 379 tgtok::TokKind TGLexer::LexNumber() { 380 if (CurPtr[-1] == '0') { 381 if (CurPtr[0] == 'x') { 382 ++CurPtr; 383 const char *NumStart = CurPtr; 384 while (isxdigit(CurPtr[0])) 385 ++CurPtr; 386 387 // Requires at least one hex digit. 388 if (CurPtr == NumStart) 389 return ReturnError(TokStart, "Invalid hexadecimal number"); 390 391 errno = 0; 392 CurIntVal = strtoll(NumStart, nullptr, 16); 393 if (errno == EINVAL) 394 return ReturnError(TokStart, "Invalid hexadecimal number"); 395 if (errno == ERANGE) { 396 errno = 0; 397 CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); 398 if (errno == EINVAL) 399 return ReturnError(TokStart, "Invalid hexadecimal number"); 400 if (errno == ERANGE) 401 return ReturnError(TokStart, "Hexadecimal number out of range"); 402 } 403 return tgtok::IntVal; 404 } else if (CurPtr[0] == 'b') { 405 ++CurPtr; 406 const char *NumStart = CurPtr; 407 while (CurPtr[0] == '0' || CurPtr[0] == '1') 408 ++CurPtr; 409 410 // Requires at least one binary digit. 411 if (CurPtr == NumStart) 412 return ReturnError(CurPtr-2, "Invalid binary number"); 413 CurIntVal = strtoll(NumStart, nullptr, 2); 414 return tgtok::BinaryIntVal; 415 } 416 } 417 418 // Check for a sign without a digit. 419 if (!isdigit(CurPtr[0])) { 420 if (CurPtr[-1] == '-') 421 return tgtok::minus; 422 else if (CurPtr[-1] == '+') 423 return tgtok::plus; 424 } 425 426 while (isdigit(CurPtr[0])) 427 ++CurPtr; 428 CurIntVal = strtoll(TokStart, nullptr, 10); 429 return tgtok::IntVal; 430 } 431 432 /// LexBracket - We just read '['. If this is a code block, return it, 433 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 434 tgtok::TokKind TGLexer::LexBracket() { 435 if (CurPtr[0] != '{') 436 return tgtok::l_square; 437 ++CurPtr; 438 const char *CodeStart = CurPtr; 439 while (1) { 440 int Char = getNextChar(); 441 if (Char == EOF) break; 442 443 if (Char != '}') continue; 444 445 Char = getNextChar(); 446 if (Char == EOF) break; 447 if (Char == ']') { 448 CurStrVal.assign(CodeStart, CurPtr-2); 449 return tgtok::CodeFragment; 450 } 451 } 452 453 return ReturnError(CodeStart-2, "Unterminated Code Block"); 454 } 455 456 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 457 tgtok::TokKind TGLexer::LexExclaim() { 458 if (!isalpha(*CurPtr)) 459 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 460 461 const char *Start = CurPtr++; 462 while (isalpha(*CurPtr)) 463 ++CurPtr; 464 465 // Check to see which operator this is. 466 tgtok::TokKind Kind = 467 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 468 .Case("eq", tgtok::XEq) 469 .Case("if", tgtok::XIf) 470 .Case("head", tgtok::XHead) 471 .Case("tail", tgtok::XTail) 472 .Case("con", tgtok::XConcat) 473 .Case("add", tgtok::XADD) 474 .Case("and", tgtok::XAND) 475 .Case("shl", tgtok::XSHL) 476 .Case("sra", tgtok::XSRA) 477 .Case("srl", tgtok::XSRL) 478 .Case("cast", tgtok::XCast) 479 .Case("empty", tgtok::XEmpty) 480 .Case("subst", tgtok::XSubst) 481 .Case("foreach", tgtok::XForEach) 482 .Case("listconcat", tgtok::XListConcat) 483 .Case("strconcat", tgtok::XStrConcat) 484 .Default(tgtok::Error); 485 486 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 487 } 488 489