1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Implement the Lexer for TableGen. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "TGLexer.h" 15 #include "llvm/TableGen/Error.h" 16 #include "llvm/Support/SourceMgr.h" 17 #include "llvm/Support/MemoryBuffer.h" 18 #include "llvm/Config/config.h" 19 #include "llvm/ADT/StringSwitch.h" 20 #include "llvm/ADT/Twine.h" 21 #include <cctype> 22 #include <cstdio> 23 #include <cstdlib> 24 #include <cstring> 25 #include <cerrno> 26 using namespace llvm; 27 28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 29 CurBuffer = 0; 30 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 31 CurPtr = CurBuf->getBufferStart(); 32 TokStart = 0; 33 } 34 35 SMLoc TGLexer::getLoc() const { 36 return SMLoc::getFromPointer(TokStart); 37 } 38 39 /// ReturnError - Set the error to the specified string at the specified 40 /// location. This is defined to always return tgtok::Error. 41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 42 PrintError(Loc, Msg); 43 return tgtok::Error; 44 } 45 46 int TGLexer::getNextChar() { 47 char CurChar = *CurPtr++; 48 switch (CurChar) { 49 default: 50 return (unsigned char)CurChar; 51 case 0: { 52 // A nul character in the stream is either the end of the current buffer or 53 // a random nul in the file. Disambiguate that here. 54 if (CurPtr-1 != CurBuf->getBufferEnd()) 55 return 0; // Just whitespace. 56 57 // If this is the end of an included file, pop the parent file off the 58 // include stack. 59 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 60 if (ParentIncludeLoc != SMLoc()) { 61 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 62 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 63 CurPtr = ParentIncludeLoc.getPointer(); 64 return getNextChar(); 65 } 66 67 // Otherwise, return end of file. 68 --CurPtr; // Another call to lex will return EOF again. 69 return EOF; 70 } 71 case '\n': 72 case '\r': 73 // Handle the newline character by ignoring it and incrementing the line 74 // count. However, be careful about 'dos style' files with \n\r in them. 75 // Only treat a \n\r or \r\n as a single line. 76 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 77 *CurPtr != CurChar) 78 ++CurPtr; // Eat the two char newline sequence. 79 return '\n'; 80 } 81 } 82 83 int TGLexer::peekNextChar(int Index) { 84 return *(CurPtr + Index); 85 } 86 87 tgtok::TokKind TGLexer::LexToken() { 88 TokStart = CurPtr; 89 // This always consumes at least one character. 90 int CurChar = getNextChar(); 91 92 switch (CurChar) { 93 default: 94 // Handle letters: [a-zA-Z_] 95 if (isalpha(CurChar) || CurChar == '_') 96 return LexIdentifier(); 97 98 // Unknown character, emit an error. 99 return ReturnError(TokStart, "Unexpected character"); 100 case EOF: return tgtok::Eof; 101 case ':': return tgtok::colon; 102 case ';': return tgtok::semi; 103 case '.': return tgtok::period; 104 case ',': return tgtok::comma; 105 case '<': return tgtok::less; 106 case '>': return tgtok::greater; 107 case ']': return tgtok::r_square; 108 case '{': return tgtok::l_brace; 109 case '}': return tgtok::r_brace; 110 case '(': return tgtok::l_paren; 111 case ')': return tgtok::r_paren; 112 case '=': return tgtok::equal; 113 case '?': return tgtok::question; 114 case '#': return tgtok::paste; 115 116 case 0: 117 case ' ': 118 case '\t': 119 case '\n': 120 case '\r': 121 // Ignore whitespace. 122 return LexToken(); 123 case '/': 124 // If this is the start of a // comment, skip until the end of the line or 125 // the end of the buffer. 126 if (*CurPtr == '/') 127 SkipBCPLComment(); 128 else if (*CurPtr == '*') { 129 if (SkipCComment()) 130 return tgtok::Error; 131 } else // Otherwise, this is an error. 132 return ReturnError(TokStart, "Unexpected character"); 133 return LexToken(); 134 case '-': case '+': 135 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 136 case '7': case '8': case '9': { 137 int NextChar = 0; 138 if (isdigit(CurChar)) { 139 // Allow identifiers to start with a number if it is followed by 140 // an identifier. This can happen with paste operations like 141 // foo#8i. 142 int i = 0; 143 do { 144 NextChar = peekNextChar(i++); 145 } while (isdigit(NextChar)); 146 147 if (NextChar == 'x' || NextChar == 'b') { 148 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 149 // likely a number. 150 int NextNextChar = peekNextChar(i); 151 switch (NextNextChar) { 152 default: 153 break; 154 case '0': case '1': 155 if (NextChar == 'b') 156 return LexNumber(); 157 // Fallthrough 158 case '2': case '3': case '4': case '5': 159 case '6': case '7': case '8': case '9': 160 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 161 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 162 if (NextChar == 'x') 163 return LexNumber(); 164 break; 165 } 166 } 167 } 168 169 if (isalpha(NextChar) || NextChar == '_') 170 return LexIdentifier(); 171 172 return LexNumber(); 173 } 174 case '"': return LexString(); 175 case '$': return LexVarName(); 176 case '[': return LexBracket(); 177 case '!': return LexExclaim(); 178 } 179 } 180 181 /// LexString - Lex "[^"]*" 182 tgtok::TokKind TGLexer::LexString() { 183 const char *StrStart = CurPtr; 184 185 CurStrVal = ""; 186 187 while (*CurPtr != '"') { 188 // If we hit the end of the buffer, report an error. 189 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) 190 return ReturnError(StrStart, "End of file in string literal"); 191 192 if (*CurPtr == '\n' || *CurPtr == '\r') 193 return ReturnError(StrStart, "End of line in string literal"); 194 195 if (*CurPtr != '\\') { 196 CurStrVal += *CurPtr++; 197 continue; 198 } 199 200 ++CurPtr; 201 202 switch (*CurPtr) { 203 case '\\': case '\'': case '"': 204 // These turn into their literal character. 205 CurStrVal += *CurPtr++; 206 break; 207 case 't': 208 CurStrVal += '\t'; 209 ++CurPtr; 210 break; 211 case 'n': 212 CurStrVal += '\n'; 213 ++CurPtr; 214 break; 215 216 case '\n': 217 case '\r': 218 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 219 220 // If we hit the end of the buffer, report an error. 221 case '\0': 222 if (CurPtr == CurBuf->getBufferEnd()) 223 return ReturnError(StrStart, "End of file in string literal"); 224 // FALL THROUGH 225 default: 226 return ReturnError(CurPtr, "invalid escape in string literal"); 227 } 228 } 229 230 ++CurPtr; 231 return tgtok::StrVal; 232 } 233 234 tgtok::TokKind TGLexer::LexVarName() { 235 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 236 return ReturnError(TokStart, "Invalid variable name"); 237 238 // Otherwise, we're ok, consume the rest of the characters. 239 const char *VarNameStart = CurPtr++; 240 241 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 242 ++CurPtr; 243 244 CurStrVal.assign(VarNameStart, CurPtr); 245 return tgtok::VarName; 246 } 247 248 249 tgtok::TokKind TGLexer::LexIdentifier() { 250 // The first letter is [a-zA-Z_#]. 251 const char *IdentStart = TokStart; 252 253 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 254 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 255 ++CurPtr; 256 257 // Check to see if this identifier is a keyword. 258 StringRef Str(IdentStart, CurPtr-IdentStart); 259 260 if (Str == "include") { 261 if (LexInclude()) return tgtok::Error; 262 return Lex(); 263 } 264 265 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 266 .Case("int", tgtok::Int) 267 .Case("bit", tgtok::Bit) 268 .Case("bits", tgtok::Bits) 269 .Case("string", tgtok::String) 270 .Case("list", tgtok::List) 271 .Case("code", tgtok::Code) 272 .Case("dag", tgtok::Dag) 273 .Case("class", tgtok::Class) 274 .Case("def", tgtok::Def) 275 .Case("defm", tgtok::Defm) 276 .Case("multiclass", tgtok::MultiClass) 277 .Case("field", tgtok::Field) 278 .Case("let", tgtok::Let) 279 .Case("in", tgtok::In) 280 .Default(tgtok::Id); 281 282 if (Kind == tgtok::Id) 283 CurStrVal.assign(Str.begin(), Str.end()); 284 return Kind; 285 } 286 287 /// LexInclude - We just read the "include" token. Get the string token that 288 /// comes next and enter the include. 289 bool TGLexer::LexInclude() { 290 // The token after the include must be a string. 291 tgtok::TokKind Tok = LexToken(); 292 if (Tok == tgtok::Error) return true; 293 if (Tok != tgtok::StrVal) { 294 PrintError(getLoc(), "Expected filename after include"); 295 return true; 296 } 297 298 // Get the string. 299 std::string Filename = CurStrVal; 300 std::string IncludedFile; 301 302 303 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 304 IncludedFile); 305 if (CurBuffer == -1) { 306 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 307 return true; 308 } 309 310 Dependencies.push_back(IncludedFile); 311 // Save the line number and lex buffer of the includer. 312 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 313 CurPtr = CurBuf->getBufferStart(); 314 return false; 315 } 316 317 void TGLexer::SkipBCPLComment() { 318 ++CurPtr; // skip the second slash. 319 while (1) { 320 switch (*CurPtr) { 321 case '\n': 322 case '\r': 323 return; // Newline is end of comment. 324 case 0: 325 // If this is the end of the buffer, end the comment. 326 if (CurPtr == CurBuf->getBufferEnd()) 327 return; 328 break; 329 } 330 // Otherwise, skip the character. 331 ++CurPtr; 332 } 333 } 334 335 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 336 /// is that we allow nesting. 337 bool TGLexer::SkipCComment() { 338 ++CurPtr; // skip the star. 339 unsigned CommentDepth = 1; 340 341 while (1) { 342 int CurChar = getNextChar(); 343 switch (CurChar) { 344 case EOF: 345 PrintError(TokStart, "Unterminated comment!"); 346 return true; 347 case '*': 348 // End of the comment? 349 if (CurPtr[0] != '/') break; 350 351 ++CurPtr; // End the */. 352 if (--CommentDepth == 0) 353 return false; 354 break; 355 case '/': 356 // Start of a nested comment? 357 if (CurPtr[0] != '*') break; 358 ++CurPtr; 359 ++CommentDepth; 360 break; 361 } 362 } 363 } 364 365 /// LexNumber - Lex: 366 /// [-+]?[0-9]+ 367 /// 0x[0-9a-fA-F]+ 368 /// 0b[01]+ 369 tgtok::TokKind TGLexer::LexNumber() { 370 if (CurPtr[-1] == '0') { 371 if (CurPtr[0] == 'x') { 372 ++CurPtr; 373 const char *NumStart = CurPtr; 374 while (isxdigit(CurPtr[0])) 375 ++CurPtr; 376 377 // Requires at least one hex digit. 378 if (CurPtr == NumStart) 379 return ReturnError(TokStart, "Invalid hexadecimal number"); 380 381 errno = 0; 382 CurIntVal = strtoll(NumStart, 0, 16); 383 if (errno == EINVAL) 384 return ReturnError(TokStart, "Invalid hexadecimal number"); 385 if (errno == ERANGE) { 386 errno = 0; 387 CurIntVal = (int64_t)strtoull(NumStart, 0, 16); 388 if (errno == EINVAL) 389 return ReturnError(TokStart, "Invalid hexadecimal number"); 390 if (errno == ERANGE) 391 return ReturnError(TokStart, "Hexadecimal number out of range"); 392 } 393 return tgtok::IntVal; 394 } else if (CurPtr[0] == 'b') { 395 ++CurPtr; 396 const char *NumStart = CurPtr; 397 while (CurPtr[0] == '0' || CurPtr[0] == '1') 398 ++CurPtr; 399 400 // Requires at least one binary digit. 401 if (CurPtr == NumStart) 402 return ReturnError(CurPtr-2, "Invalid binary number"); 403 CurIntVal = strtoll(NumStart, 0, 2); 404 return tgtok::IntVal; 405 } 406 } 407 408 // Check for a sign without a digit. 409 if (!isdigit(CurPtr[0])) { 410 if (CurPtr[-1] == '-') 411 return tgtok::minus; 412 else if (CurPtr[-1] == '+') 413 return tgtok::plus; 414 } 415 416 while (isdigit(CurPtr[0])) 417 ++CurPtr; 418 CurIntVal = strtoll(TokStart, 0, 10); 419 return tgtok::IntVal; 420 } 421 422 /// LexBracket - We just read '['. If this is a code block, return it, 423 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 424 tgtok::TokKind TGLexer::LexBracket() { 425 if (CurPtr[0] != '{') 426 return tgtok::l_square; 427 ++CurPtr; 428 const char *CodeStart = CurPtr; 429 while (1) { 430 int Char = getNextChar(); 431 if (Char == EOF) break; 432 433 if (Char != '}') continue; 434 435 Char = getNextChar(); 436 if (Char == EOF) break; 437 if (Char == ']') { 438 CurStrVal.assign(CodeStart, CurPtr-2); 439 return tgtok::CodeFragment; 440 } 441 } 442 443 return ReturnError(CodeStart-2, "Unterminated Code Block"); 444 } 445 446 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 447 tgtok::TokKind TGLexer::LexExclaim() { 448 if (!isalpha(*CurPtr)) 449 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 450 451 const char *Start = CurPtr++; 452 while (isalpha(*CurPtr)) 453 ++CurPtr; 454 455 // Check to see which operator this is. 456 tgtok::TokKind Kind = 457 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 458 .Case("eq", tgtok::XEq) 459 .Case("if", tgtok::XIf) 460 .Case("head", tgtok::XHead) 461 .Case("tail", tgtok::XTail) 462 .Case("con", tgtok::XConcat) 463 .Case("shl", tgtok::XSHL) 464 .Case("sra", tgtok::XSRA) 465 .Case("srl", tgtok::XSRL) 466 .Case("cast", tgtok::XCast) 467 .Case("empty", tgtok::XEmpty) 468 .Case("subst", tgtok::XSubst) 469 .Case("foreach", tgtok::XForEach) 470 .Case("strconcat", tgtok::XStrConcat) 471 .Default(tgtok::Error); 472 473 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 474 } 475 476