1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Implement the Lexer for TableGen. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "TGLexer.h" 15 #include "llvm/TableGen/Error.h" 16 #include "llvm/Support/SourceMgr.h" 17 #include "llvm/Support/MemoryBuffer.h" 18 #include "llvm/ADT/StringSwitch.h" 19 #include "llvm/ADT/Twine.h" 20 #include <cctype> 21 #include <cstdio> 22 #include <cstdlib> 23 #include <cstring> 24 #include <cerrno> 25 26 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 27 28 using namespace llvm; 29 30 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 31 CurBuffer = 0; 32 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 33 CurPtr = CurBuf->getBufferStart(); 34 TokStart = 0; 35 } 36 37 SMLoc TGLexer::getLoc() const { 38 return SMLoc::getFromPointer(TokStart); 39 } 40 41 /// ReturnError - Set the error to the specified string at the specified 42 /// location. This is defined to always return tgtok::Error. 43 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 44 PrintError(Loc, Msg); 45 return tgtok::Error; 46 } 47 48 int TGLexer::getNextChar() { 49 char CurChar = *CurPtr++; 50 switch (CurChar) { 51 default: 52 return (unsigned char)CurChar; 53 case 0: { 54 // A nul character in the stream is either the end of the current buffer or 55 // a random nul in the file. Disambiguate that here. 56 if (CurPtr-1 != CurBuf->getBufferEnd()) 57 return 0; // Just whitespace. 58 59 // If this is the end of an included file, pop the parent file off the 60 // include stack. 61 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 62 if (ParentIncludeLoc != SMLoc()) { 63 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 64 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 65 CurPtr = ParentIncludeLoc.getPointer(); 66 return getNextChar(); 67 } 68 69 // Otherwise, return end of file. 70 --CurPtr; // Another call to lex will return EOF again. 71 return EOF; 72 } 73 case '\n': 74 case '\r': 75 // Handle the newline character by ignoring it and incrementing the line 76 // count. However, be careful about 'dos style' files with \n\r in them. 77 // Only treat a \n\r or \r\n as a single line. 78 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 79 *CurPtr != CurChar) 80 ++CurPtr; // Eat the two char newline sequence. 81 return '\n'; 82 } 83 } 84 85 int TGLexer::peekNextChar(int Index) { 86 return *(CurPtr + Index); 87 } 88 89 tgtok::TokKind TGLexer::LexToken() { 90 TokStart = CurPtr; 91 // This always consumes at least one character. 92 int CurChar = getNextChar(); 93 94 switch (CurChar) { 95 default: 96 // Handle letters: [a-zA-Z_] 97 if (isalpha(CurChar) || CurChar == '_') 98 return LexIdentifier(); 99 100 // Unknown character, emit an error. 101 return ReturnError(TokStart, "Unexpected character"); 102 case EOF: return tgtok::Eof; 103 case ':': return tgtok::colon; 104 case ';': return tgtok::semi; 105 case '.': return tgtok::period; 106 case ',': return tgtok::comma; 107 case '<': return tgtok::less; 108 case '>': return tgtok::greater; 109 case ']': return tgtok::r_square; 110 case '{': return tgtok::l_brace; 111 case '}': return tgtok::r_brace; 112 case '(': return tgtok::l_paren; 113 case ')': return tgtok::r_paren; 114 case '=': return tgtok::equal; 115 case '?': return tgtok::question; 116 case '#': return tgtok::paste; 117 118 case 0: 119 case ' ': 120 case '\t': 121 case '\n': 122 case '\r': 123 // Ignore whitespace. 124 return LexToken(); 125 case '/': 126 // If this is the start of a // comment, skip until the end of the line or 127 // the end of the buffer. 128 if (*CurPtr == '/') 129 SkipBCPLComment(); 130 else if (*CurPtr == '*') { 131 if (SkipCComment()) 132 return tgtok::Error; 133 } else // Otherwise, this is an error. 134 return ReturnError(TokStart, "Unexpected character"); 135 return LexToken(); 136 case '-': case '+': 137 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 138 case '7': case '8': case '9': { 139 int NextChar = 0; 140 if (isdigit(CurChar)) { 141 // Allow identifiers to start with a number if it is followed by 142 // an identifier. This can happen with paste operations like 143 // foo#8i. 144 int i = 0; 145 do { 146 NextChar = peekNextChar(i++); 147 } while (isdigit(NextChar)); 148 149 if (NextChar == 'x' || NextChar == 'b') { 150 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 151 // likely a number. 152 int NextNextChar = peekNextChar(i); 153 switch (NextNextChar) { 154 default: 155 break; 156 case '0': case '1': 157 if (NextChar == 'b') 158 return LexNumber(); 159 // Fallthrough 160 case '2': case '3': case '4': case '5': 161 case '6': case '7': case '8': case '9': 162 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 163 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 164 if (NextChar == 'x') 165 return LexNumber(); 166 break; 167 } 168 } 169 } 170 171 if (isalpha(NextChar) || NextChar == '_') 172 return LexIdentifier(); 173 174 return LexNumber(); 175 } 176 case '"': return LexString(); 177 case '$': return LexVarName(); 178 case '[': return LexBracket(); 179 case '!': return LexExclaim(); 180 } 181 } 182 183 /// LexString - Lex "[^"]*" 184 tgtok::TokKind TGLexer::LexString() { 185 const char *StrStart = CurPtr; 186 187 CurStrVal = ""; 188 189 while (*CurPtr != '"') { 190 // If we hit the end of the buffer, report an error. 191 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) 192 return ReturnError(StrStart, "End of file in string literal"); 193 194 if (*CurPtr == '\n' || *CurPtr == '\r') 195 return ReturnError(StrStart, "End of line in string literal"); 196 197 if (*CurPtr != '\\') { 198 CurStrVal += *CurPtr++; 199 continue; 200 } 201 202 ++CurPtr; 203 204 switch (*CurPtr) { 205 case '\\': case '\'': case '"': 206 // These turn into their literal character. 207 CurStrVal += *CurPtr++; 208 break; 209 case 't': 210 CurStrVal += '\t'; 211 ++CurPtr; 212 break; 213 case 'n': 214 CurStrVal += '\n'; 215 ++CurPtr; 216 break; 217 218 case '\n': 219 case '\r': 220 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 221 222 // If we hit the end of the buffer, report an error. 223 case '\0': 224 if (CurPtr == CurBuf->getBufferEnd()) 225 return ReturnError(StrStart, "End of file in string literal"); 226 // FALL THROUGH 227 default: 228 return ReturnError(CurPtr, "invalid escape in string literal"); 229 } 230 } 231 232 ++CurPtr; 233 return tgtok::StrVal; 234 } 235 236 tgtok::TokKind TGLexer::LexVarName() { 237 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 238 return ReturnError(TokStart, "Invalid variable name"); 239 240 // Otherwise, we're ok, consume the rest of the characters. 241 const char *VarNameStart = CurPtr++; 242 243 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 244 ++CurPtr; 245 246 CurStrVal.assign(VarNameStart, CurPtr); 247 return tgtok::VarName; 248 } 249 250 251 tgtok::TokKind TGLexer::LexIdentifier() { 252 // The first letter is [a-zA-Z_#]. 253 const char *IdentStart = TokStart; 254 255 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 256 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 257 ++CurPtr; 258 259 // Check to see if this identifier is a keyword. 260 StringRef Str(IdentStart, CurPtr-IdentStart); 261 262 if (Str == "include") { 263 if (LexInclude()) return tgtok::Error; 264 return Lex(); 265 } 266 267 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 268 .Case("int", tgtok::Int) 269 .Case("bit", tgtok::Bit) 270 .Case("bits", tgtok::Bits) 271 .Case("string", tgtok::String) 272 .Case("list", tgtok::List) 273 .Case("code", tgtok::Code) 274 .Case("dag", tgtok::Dag) 275 .Case("class", tgtok::Class) 276 .Case("def", tgtok::Def) 277 .Case("foreach", tgtok::Foreach) 278 .Case("defm", tgtok::Defm) 279 .Case("multiclass", tgtok::MultiClass) 280 .Case("field", tgtok::Field) 281 .Case("let", tgtok::Let) 282 .Case("in", tgtok::In) 283 .Default(tgtok::Id); 284 285 if (Kind == tgtok::Id) 286 CurStrVal.assign(Str.begin(), Str.end()); 287 return Kind; 288 } 289 290 /// LexInclude - We just read the "include" token. Get the string token that 291 /// comes next and enter the include. 292 bool TGLexer::LexInclude() { 293 // The token after the include must be a string. 294 tgtok::TokKind Tok = LexToken(); 295 if (Tok == tgtok::Error) return true; 296 if (Tok != tgtok::StrVal) { 297 PrintError(getLoc(), "Expected filename after include"); 298 return true; 299 } 300 301 // Get the string. 302 std::string Filename = CurStrVal; 303 std::string IncludedFile; 304 305 306 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 307 IncludedFile); 308 if (CurBuffer == -1) { 309 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 310 return true; 311 } 312 313 Dependencies.push_back(IncludedFile); 314 // Save the line number and lex buffer of the includer. 315 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 316 CurPtr = CurBuf->getBufferStart(); 317 return false; 318 } 319 320 void TGLexer::SkipBCPLComment() { 321 ++CurPtr; // skip the second slash. 322 while (1) { 323 switch (*CurPtr) { 324 case '\n': 325 case '\r': 326 return; // Newline is end of comment. 327 case 0: 328 // If this is the end of the buffer, end the comment. 329 if (CurPtr == CurBuf->getBufferEnd()) 330 return; 331 break; 332 } 333 // Otherwise, skip the character. 334 ++CurPtr; 335 } 336 } 337 338 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 339 /// is that we allow nesting. 340 bool TGLexer::SkipCComment() { 341 ++CurPtr; // skip the star. 342 unsigned CommentDepth = 1; 343 344 while (1) { 345 int CurChar = getNextChar(); 346 switch (CurChar) { 347 case EOF: 348 PrintError(TokStart, "Unterminated comment!"); 349 return true; 350 case '*': 351 // End of the comment? 352 if (CurPtr[0] != '/') break; 353 354 ++CurPtr; // End the */. 355 if (--CommentDepth == 0) 356 return false; 357 break; 358 case '/': 359 // Start of a nested comment? 360 if (CurPtr[0] != '*') break; 361 ++CurPtr; 362 ++CommentDepth; 363 break; 364 } 365 } 366 } 367 368 /// LexNumber - Lex: 369 /// [-+]?[0-9]+ 370 /// 0x[0-9a-fA-F]+ 371 /// 0b[01]+ 372 tgtok::TokKind TGLexer::LexNumber() { 373 if (CurPtr[-1] == '0') { 374 if (CurPtr[0] == 'x') { 375 ++CurPtr; 376 const char *NumStart = CurPtr; 377 while (isxdigit(CurPtr[0])) 378 ++CurPtr; 379 380 // Requires at least one hex digit. 381 if (CurPtr == NumStart) 382 return ReturnError(TokStart, "Invalid hexadecimal number"); 383 384 errno = 0; 385 CurIntVal = strtoll(NumStart, 0, 16); 386 if (errno == EINVAL) 387 return ReturnError(TokStart, "Invalid hexadecimal number"); 388 if (errno == ERANGE) { 389 errno = 0; 390 CurIntVal = (int64_t)strtoull(NumStart, 0, 16); 391 if (errno == EINVAL) 392 return ReturnError(TokStart, "Invalid hexadecimal number"); 393 if (errno == ERANGE) 394 return ReturnError(TokStart, "Hexadecimal number out of range"); 395 } 396 return tgtok::IntVal; 397 } else if (CurPtr[0] == 'b') { 398 ++CurPtr; 399 const char *NumStart = CurPtr; 400 while (CurPtr[0] == '0' || CurPtr[0] == '1') 401 ++CurPtr; 402 403 // Requires at least one binary digit. 404 if (CurPtr == NumStart) 405 return ReturnError(CurPtr-2, "Invalid binary number"); 406 CurIntVal = strtoll(NumStart, 0, 2); 407 return tgtok::IntVal; 408 } 409 } 410 411 // Check for a sign without a digit. 412 if (!isdigit(CurPtr[0])) { 413 if (CurPtr[-1] == '-') 414 return tgtok::minus; 415 else if (CurPtr[-1] == '+') 416 return tgtok::plus; 417 } 418 419 while (isdigit(CurPtr[0])) 420 ++CurPtr; 421 CurIntVal = strtoll(TokStart, 0, 10); 422 return tgtok::IntVal; 423 } 424 425 /// LexBracket - We just read '['. If this is a code block, return it, 426 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 427 tgtok::TokKind TGLexer::LexBracket() { 428 if (CurPtr[0] != '{') 429 return tgtok::l_square; 430 ++CurPtr; 431 const char *CodeStart = CurPtr; 432 while (1) { 433 int Char = getNextChar(); 434 if (Char == EOF) break; 435 436 if (Char != '}') continue; 437 438 Char = getNextChar(); 439 if (Char == EOF) break; 440 if (Char == ']') { 441 CurStrVal.assign(CodeStart, CurPtr-2); 442 return tgtok::CodeFragment; 443 } 444 } 445 446 return ReturnError(CodeStart-2, "Unterminated Code Block"); 447 } 448 449 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 450 tgtok::TokKind TGLexer::LexExclaim() { 451 if (!isalpha(*CurPtr)) 452 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 453 454 const char *Start = CurPtr++; 455 while (isalpha(*CurPtr)) 456 ++CurPtr; 457 458 // Check to see which operator this is. 459 tgtok::TokKind Kind = 460 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 461 .Case("eq", tgtok::XEq) 462 .Case("if", tgtok::XIf) 463 .Case("head", tgtok::XHead) 464 .Case("tail", tgtok::XTail) 465 .Case("con", tgtok::XConcat) 466 .Case("shl", tgtok::XSHL) 467 .Case("sra", tgtok::XSRA) 468 .Case("srl", tgtok::XSRL) 469 .Case("cast", tgtok::XCast) 470 .Case("empty", tgtok::XEmpty) 471 .Case("subst", tgtok::XSubst) 472 .Case("foreach", tgtok::XForEach) 473 .Case("strconcat", tgtok::XStrConcat) 474 .Default(tgtok::Error); 475 476 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 477 } 478 479