Home | History | Annotate | Download | only in TableGen
      1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Implement the Lexer for TableGen.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "TGLexer.h"
     15 #include "llvm/ADT/StringSwitch.h"
     16 #include "llvm/ADT/Twine.h"
     17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
     18 #include "llvm/Support/Compiler.h"
     19 #include "llvm/Support/MemoryBuffer.h"
     20 #include "llvm/Support/SourceMgr.h"
     21 #include "llvm/TableGen/Error.h"
     22 #include <cctype>
     23 #include <cerrno>
     24 #include <cstdint>
     25 #include <cstdio>
     26 #include <cstdlib>
     27 #include <cstring>
     28 
     29 using namespace llvm;
     30 
     31 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
     32   CurBuffer = SrcMgr.getMainFileID();
     33   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
     34   CurPtr = CurBuf.begin();
     35   TokStart = nullptr;
     36 }
     37 
     38 SMLoc TGLexer::getLoc() const {
     39   return SMLoc::getFromPointer(TokStart);
     40 }
     41 
     42 /// ReturnError - Set the error to the specified string at the specified
     43 /// location.  This is defined to always return tgtok::Error.
     44 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
     45   PrintError(Loc, Msg);
     46   return tgtok::Error;
     47 }
     48 
     49 int TGLexer::getNextChar() {
     50   char CurChar = *CurPtr++;
     51   switch (CurChar) {
     52   default:
     53     return (unsigned char)CurChar;
     54   case 0: {
     55     // A nul character in the stream is either the end of the current buffer or
     56     // a random nul in the file.  Disambiguate that here.
     57     if (CurPtr-1 != CurBuf.end())
     58       return 0;  // Just whitespace.
     59 
     60     // If this is the end of an included file, pop the parent file off the
     61     // include stack.
     62     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     63     if (ParentIncludeLoc != SMLoc()) {
     64       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
     65       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
     66       CurPtr = ParentIncludeLoc.getPointer();
     67       return getNextChar();
     68     }
     69 
     70     // Otherwise, return end of file.
     71     --CurPtr;  // Another call to lex will return EOF again.
     72     return EOF;
     73   }
     74   case '\n':
     75   case '\r':
     76     // Handle the newline character by ignoring it and incrementing the line
     77     // count.  However, be careful about 'dos style' files with \n\r in them.
     78     // Only treat a \n\r or \r\n as a single line.
     79     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
     80         *CurPtr != CurChar)
     81       ++CurPtr;  // Eat the two char newline sequence.
     82     return '\n';
     83   }
     84 }
     85 
     86 int TGLexer::peekNextChar(int Index) {
     87   return *(CurPtr + Index);
     88 }
     89 
     90 tgtok::TokKind TGLexer::LexToken() {
     91   TokStart = CurPtr;
     92   // This always consumes at least one character.
     93   int CurChar = getNextChar();
     94 
     95   switch (CurChar) {
     96   default:
     97     // Handle letters: [a-zA-Z_]
     98     if (isalpha(CurChar) || CurChar == '_')
     99       return LexIdentifier();
    100 
    101     // Unknown character, emit an error.
    102     return ReturnError(TokStart, "Unexpected character");
    103   case EOF: return tgtok::Eof;
    104   case ':': return tgtok::colon;
    105   case ';': return tgtok::semi;
    106   case '.': return tgtok::period;
    107   case ',': return tgtok::comma;
    108   case '<': return tgtok::less;
    109   case '>': return tgtok::greater;
    110   case ']': return tgtok::r_square;
    111   case '{': return tgtok::l_brace;
    112   case '}': return tgtok::r_brace;
    113   case '(': return tgtok::l_paren;
    114   case ')': return tgtok::r_paren;
    115   case '=': return tgtok::equal;
    116   case '?': return tgtok::question;
    117   case '#': return tgtok::paste;
    118 
    119   case 0:
    120   case ' ':
    121   case '\t':
    122   case '\n':
    123   case '\r':
    124     // Ignore whitespace.
    125     return LexToken();
    126   case '/':
    127     // If this is the start of a // comment, skip until the end of the line or
    128     // the end of the buffer.
    129     if (*CurPtr == '/')
    130       SkipBCPLComment();
    131     else if (*CurPtr == '*') {
    132       if (SkipCComment())
    133         return tgtok::Error;
    134     } else // Otherwise, this is an error.
    135       return ReturnError(TokStart, "Unexpected character");
    136     return LexToken();
    137   case '-': case '+':
    138   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    139   case '7': case '8': case '9': {
    140     int NextChar = 0;
    141     if (isdigit(CurChar)) {
    142       // Allow identifiers to start with a number if it is followed by
    143       // an identifier.  This can happen with paste operations like
    144       // foo#8i.
    145       int i = 0;
    146       do {
    147         NextChar = peekNextChar(i++);
    148       } while (isdigit(NextChar));
    149 
    150       if (NextChar == 'x' || NextChar == 'b') {
    151         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
    152         // likely a number.
    153         int NextNextChar = peekNextChar(i);
    154         switch (NextNextChar) {
    155         default:
    156           break;
    157         case '0': case '1':
    158           if (NextChar == 'b')
    159             return LexNumber();
    160           LLVM_FALLTHROUGH;
    161         case '2': case '3': case '4': case '5':
    162         case '6': case '7': case '8': case '9':
    163         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    164         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    165           if (NextChar == 'x')
    166             return LexNumber();
    167           break;
    168         }
    169       }
    170     }
    171 
    172     if (isalpha(NextChar) || NextChar == '_')
    173       return LexIdentifier();
    174 
    175     return LexNumber();
    176   }
    177   case '"': return LexString();
    178   case '$': return LexVarName();
    179   case '[': return LexBracket();
    180   case '!': return LexExclaim();
    181   }
    182 }
    183 
    184 /// LexString - Lex "[^"]*"
    185 tgtok::TokKind TGLexer::LexString() {
    186   const char *StrStart = CurPtr;
    187 
    188   CurStrVal = "";
    189 
    190   while (*CurPtr != '"') {
    191     // If we hit the end of the buffer, report an error.
    192     if (*CurPtr == 0 && CurPtr == CurBuf.end())
    193       return ReturnError(StrStart, "End of file in string literal");
    194 
    195     if (*CurPtr == '\n' || *CurPtr == '\r')
    196       return ReturnError(StrStart, "End of line in string literal");
    197 
    198     if (*CurPtr != '\\') {
    199       CurStrVal += *CurPtr++;
    200       continue;
    201     }
    202 
    203     ++CurPtr;
    204 
    205     switch (*CurPtr) {
    206     case '\\': case '\'': case '"':
    207       // These turn into their literal character.
    208       CurStrVal += *CurPtr++;
    209       break;
    210     case 't':
    211       CurStrVal += '\t';
    212       ++CurPtr;
    213       break;
    214     case 'n':
    215       CurStrVal += '\n';
    216       ++CurPtr;
    217       break;
    218 
    219     case '\n':
    220     case '\r':
    221       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
    222 
    223     // If we hit the end of the buffer, report an error.
    224     case '\0':
    225       if (CurPtr == CurBuf.end())
    226         return ReturnError(StrStart, "End of file in string literal");
    227       LLVM_FALLTHROUGH;
    228     default:
    229       return ReturnError(CurPtr, "invalid escape in string literal");
    230     }
    231   }
    232 
    233   ++CurPtr;
    234   return tgtok::StrVal;
    235 }
    236 
    237 tgtok::TokKind TGLexer::LexVarName() {
    238   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    239     return ReturnError(TokStart, "Invalid variable name");
    240 
    241   // Otherwise, we're ok, consume the rest of the characters.
    242   const char *VarNameStart = CurPtr++;
    243 
    244   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    245     ++CurPtr;
    246 
    247   CurStrVal.assign(VarNameStart, CurPtr);
    248   return tgtok::VarName;
    249 }
    250 
    251 tgtok::TokKind TGLexer::LexIdentifier() {
    252   // The first letter is [a-zA-Z_#].
    253   const char *IdentStart = TokStart;
    254 
    255   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
    256   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    257     ++CurPtr;
    258 
    259   // Check to see if this identifier is a keyword.
    260   StringRef Str(IdentStart, CurPtr-IdentStart);
    261 
    262   if (Str == "include") {
    263     if (LexInclude()) return tgtok::Error;
    264     return Lex();
    265   }
    266 
    267   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    268     .Case("int", tgtok::Int)
    269     .Case("bit", tgtok::Bit)
    270     .Case("bits", tgtok::Bits)
    271     .Case("string", tgtok::String)
    272     .Case("list", tgtok::List)
    273     .Case("code", tgtok::Code)
    274     .Case("dag", tgtok::Dag)
    275     .Case("class", tgtok::Class)
    276     .Case("def", tgtok::Def)
    277     .Case("foreach", tgtok::Foreach)
    278     .Case("defm", tgtok::Defm)
    279     .Case("defset", tgtok::Defset)
    280     .Case("multiclass", tgtok::MultiClass)
    281     .Case("field", tgtok::Field)
    282     .Case("let", tgtok::Let)
    283     .Case("in", tgtok::In)
    284     .Default(tgtok::Id);
    285 
    286   if (Kind == tgtok::Id)
    287     CurStrVal.assign(Str.begin(), Str.end());
    288   return Kind;
    289 }
    290 
    291 /// LexInclude - We just read the "include" token.  Get the string token that
    292 /// comes next and enter the include.
    293 bool TGLexer::LexInclude() {
    294   // The token after the include must be a string.
    295   tgtok::TokKind Tok = LexToken();
    296   if (Tok == tgtok::Error) return true;
    297   if (Tok != tgtok::StrVal) {
    298     PrintError(getLoc(), "Expected filename after include");
    299     return true;
    300   }
    301 
    302   // Get the string.
    303   std::string Filename = CurStrVal;
    304   std::string IncludedFile;
    305 
    306   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
    307                                     IncludedFile);
    308   if (!CurBuffer) {
    309     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    310     return true;
    311   }
    312 
    313   DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
    314   if (Found != Dependencies.end()) {
    315     PrintError(getLoc(),
    316                "File '" + IncludedFile + "' has already been included.");
    317     SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
    318                         "previously included here");
    319     return true;
    320   }
    321   Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
    322   // Save the line number and lex buffer of the includer.
    323   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    324   CurPtr = CurBuf.begin();
    325   return false;
    326 }
    327 
    328 void TGLexer::SkipBCPLComment() {
    329   ++CurPtr;  // skip the second slash.
    330   while (true) {
    331     switch (*CurPtr) {
    332     case '\n':
    333     case '\r':
    334       return;  // Newline is end of comment.
    335     case 0:
    336       // If this is the end of the buffer, end the comment.
    337       if (CurPtr == CurBuf.end())
    338         return;
    339       break;
    340     }
    341     // Otherwise, skip the character.
    342     ++CurPtr;
    343   }
    344 }
    345 
    346 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
    347 /// is that we allow nesting.
    348 bool TGLexer::SkipCComment() {
    349   ++CurPtr;  // skip the star.
    350   unsigned CommentDepth = 1;
    351 
    352   while (true) {
    353     int CurChar = getNextChar();
    354     switch (CurChar) {
    355     case EOF:
    356       PrintError(TokStart, "Unterminated comment!");
    357       return true;
    358     case '*':
    359       // End of the comment?
    360       if (CurPtr[0] != '/') break;
    361 
    362       ++CurPtr;   // End the */.
    363       if (--CommentDepth == 0)
    364         return false;
    365       break;
    366     case '/':
    367       // Start of a nested comment?
    368       if (CurPtr[0] != '*') break;
    369       ++CurPtr;
    370       ++CommentDepth;
    371       break;
    372     }
    373   }
    374 }
    375 
    376 /// LexNumber - Lex:
    377 ///    [-+]?[0-9]+
    378 ///    0x[0-9a-fA-F]+
    379 ///    0b[01]+
    380 tgtok::TokKind TGLexer::LexNumber() {
    381   if (CurPtr[-1] == '0') {
    382     if (CurPtr[0] == 'x') {
    383       ++CurPtr;
    384       const char *NumStart = CurPtr;
    385       while (isxdigit(CurPtr[0]))
    386         ++CurPtr;
    387 
    388       // Requires at least one hex digit.
    389       if (CurPtr == NumStart)
    390         return ReturnError(TokStart, "Invalid hexadecimal number");
    391 
    392       errno = 0;
    393       CurIntVal = strtoll(NumStart, nullptr, 16);
    394       if (errno == EINVAL)
    395         return ReturnError(TokStart, "Invalid hexadecimal number");
    396       if (errno == ERANGE) {
    397         errno = 0;
    398         CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
    399         if (errno == EINVAL)
    400           return ReturnError(TokStart, "Invalid hexadecimal number");
    401         if (errno == ERANGE)
    402           return ReturnError(TokStart, "Hexadecimal number out of range");
    403       }
    404       return tgtok::IntVal;
    405     } else if (CurPtr[0] == 'b') {
    406       ++CurPtr;
    407       const char *NumStart = CurPtr;
    408       while (CurPtr[0] == '0' || CurPtr[0] == '1')
    409         ++CurPtr;
    410 
    411       // Requires at least one binary digit.
    412       if (CurPtr == NumStart)
    413         return ReturnError(CurPtr-2, "Invalid binary number");
    414       CurIntVal = strtoll(NumStart, nullptr, 2);
    415       return tgtok::BinaryIntVal;
    416     }
    417   }
    418 
    419   // Check for a sign without a digit.
    420   if (!isdigit(CurPtr[0])) {
    421     if (CurPtr[-1] == '-')
    422       return tgtok::minus;
    423     else if (CurPtr[-1] == '+')
    424       return tgtok::plus;
    425   }
    426 
    427   while (isdigit(CurPtr[0]))
    428     ++CurPtr;
    429   CurIntVal = strtoll(TokStart, nullptr, 10);
    430   return tgtok::IntVal;
    431 }
    432 
    433 /// LexBracket - We just read '['.  If this is a code block, return it,
    434 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
    435 tgtok::TokKind TGLexer::LexBracket() {
    436   if (CurPtr[0] != '{')
    437     return tgtok::l_square;
    438   ++CurPtr;
    439   const char *CodeStart = CurPtr;
    440   while (true) {
    441     int Char = getNextChar();
    442     if (Char == EOF) break;
    443 
    444     if (Char != '}') continue;
    445 
    446     Char = getNextChar();
    447     if (Char == EOF) break;
    448     if (Char == ']') {
    449       CurStrVal.assign(CodeStart, CurPtr-2);
    450       return tgtok::CodeFragment;
    451     }
    452   }
    453 
    454   return ReturnError(CodeStart-2, "Unterminated Code Block");
    455 }
    456 
    457 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
    458 tgtok::TokKind TGLexer::LexExclaim() {
    459   if (!isalpha(*CurPtr))
    460     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
    461 
    462   const char *Start = CurPtr++;
    463   while (isalpha(*CurPtr))
    464     ++CurPtr;
    465 
    466   // Check to see which operator this is.
    467   tgtok::TokKind Kind =
    468     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
    469     .Case("eq", tgtok::XEq)
    470     .Case("ne", tgtok::XNe)
    471     .Case("le", tgtok::XLe)
    472     .Case("lt", tgtok::XLt)
    473     .Case("ge", tgtok::XGe)
    474     .Case("gt", tgtok::XGt)
    475     .Case("if", tgtok::XIf)
    476     .Case("isa", tgtok::XIsA)
    477     .Case("head", tgtok::XHead)
    478     .Case("tail", tgtok::XTail)
    479     .Case("size", tgtok::XSize)
    480     .Case("con", tgtok::XConcat)
    481     .Case("dag", tgtok::XDag)
    482     .Case("add", tgtok::XADD)
    483     .Case("and", tgtok::XAND)
    484     .Case("or", tgtok::XOR)
    485     .Case("shl", tgtok::XSHL)
    486     .Case("sra", tgtok::XSRA)
    487     .Case("srl", tgtok::XSRL)
    488     .Case("cast", tgtok::XCast)
    489     .Case("empty", tgtok::XEmpty)
    490     .Case("subst", tgtok::XSubst)
    491     .Case("foldl", tgtok::XFoldl)
    492     .Case("foreach", tgtok::XForEach)
    493     .Case("listconcat", tgtok::XListConcat)
    494     .Case("strconcat", tgtok::XStrConcat)
    495     .Default(tgtok::Error);
    496 
    497   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
    498 }
    499