Home | History | Annotate | Download | only in TableGen
      1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Implement the Lexer for TableGen.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "TGLexer.h"
     15 #include "llvm/ADT/StringSwitch.h"
     16 #include "llvm/ADT/Twine.h"
     17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
     18 #include "llvm/Support/MemoryBuffer.h"
     19 #include "llvm/Support/SourceMgr.h"
     20 #include "llvm/TableGen/Error.h"
     21 #include <cctype>
     22 #include <cerrno>
     23 #include <cstdio>
     24 #include <cstdlib>
     25 #include <cstring>
     26 
     27 using namespace llvm;
     28 
     29 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
     30   CurBuffer = 0;
     31   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     32   CurPtr = CurBuf->getBufferStart();
     33   TokStart = 0;
     34 }
     35 
     36 SMLoc TGLexer::getLoc() const {
     37   return SMLoc::getFromPointer(TokStart);
     38 }
     39 
     40 /// ReturnError - Set the error to the specified string at the specified
     41 /// location.  This is defined to always return tgtok::Error.
     42 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
     43   PrintError(Loc, Msg);
     44   return tgtok::Error;
     45 }
     46 
     47 int TGLexer::getNextChar() {
     48   char CurChar = *CurPtr++;
     49   switch (CurChar) {
     50   default:
     51     return (unsigned char)CurChar;
     52   case 0: {
     53     // A nul character in the stream is either the end of the current buffer or
     54     // a random nul in the file.  Disambiguate that here.
     55     if (CurPtr-1 != CurBuf->getBufferEnd())
     56       return 0;  // Just whitespace.
     57 
     58     // If this is the end of an included file, pop the parent file off the
     59     // include stack.
     60     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     61     if (ParentIncludeLoc != SMLoc()) {
     62       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
     63       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     64       CurPtr = ParentIncludeLoc.getPointer();
     65       return getNextChar();
     66     }
     67 
     68     // Otherwise, return end of file.
     69     --CurPtr;  // Another call to lex will return EOF again.
     70     return EOF;
     71   }
     72   case '\n':
     73   case '\r':
     74     // Handle the newline character by ignoring it and incrementing the line
     75     // count.  However, be careful about 'dos style' files with \n\r in them.
     76     // Only treat a \n\r or \r\n as a single line.
     77     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
     78         *CurPtr != CurChar)
     79       ++CurPtr;  // Eat the two char newline sequence.
     80     return '\n';
     81   }
     82 }
     83 
     84 int TGLexer::peekNextChar(int Index) {
     85   return *(CurPtr + Index);
     86 }
     87 
     88 tgtok::TokKind TGLexer::LexToken() {
     89   TokStart = CurPtr;
     90   // This always consumes at least one character.
     91   int CurChar = getNextChar();
     92 
     93   switch (CurChar) {
     94   default:
     95     // Handle letters: [a-zA-Z_]
     96     if (isalpha(CurChar) || CurChar == '_')
     97       return LexIdentifier();
     98 
     99     // Unknown character, emit an error.
    100     return ReturnError(TokStart, "Unexpected character");
    101   case EOF: return tgtok::Eof;
    102   case ':': return tgtok::colon;
    103   case ';': return tgtok::semi;
    104   case '.': return tgtok::period;
    105   case ',': return tgtok::comma;
    106   case '<': return tgtok::less;
    107   case '>': return tgtok::greater;
    108   case ']': return tgtok::r_square;
    109   case '{': return tgtok::l_brace;
    110   case '}': return tgtok::r_brace;
    111   case '(': return tgtok::l_paren;
    112   case ')': return tgtok::r_paren;
    113   case '=': return tgtok::equal;
    114   case '?': return tgtok::question;
    115   case '#': return tgtok::paste;
    116 
    117   case 0:
    118   case ' ':
    119   case '\t':
    120   case '\n':
    121   case '\r':
    122     // Ignore whitespace.
    123     return LexToken();
    124   case '/':
    125     // If this is the start of a // comment, skip until the end of the line or
    126     // the end of the buffer.
    127     if (*CurPtr == '/')
    128       SkipBCPLComment();
    129     else if (*CurPtr == '*') {
    130       if (SkipCComment())
    131         return tgtok::Error;
    132     } else // Otherwise, this is an error.
    133       return ReturnError(TokStart, "Unexpected character");
    134     return LexToken();
    135   case '-': case '+':
    136   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    137   case '7': case '8': case '9': {
    138     int NextChar = 0;
    139     if (isdigit(CurChar)) {
    140       // Allow identifiers to start with a number if it is followed by
    141       // an identifier.  This can happen with paste operations like
    142       // foo#8i.
    143       int i = 0;
    144       do {
    145         NextChar = peekNextChar(i++);
    146       } while (isdigit(NextChar));
    147 
    148       if (NextChar == 'x' || NextChar == 'b') {
    149         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
    150         // likely a number.
    151         int NextNextChar = peekNextChar(i);
    152         switch (NextNextChar) {
    153         default:
    154           break;
    155         case '0': case '1':
    156           if (NextChar == 'b')
    157             return LexNumber();
    158           // Fallthrough
    159         case '2': case '3': case '4': case '5':
    160         case '6': case '7': case '8': case '9':
    161         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    162         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    163           if (NextChar == 'x')
    164             return LexNumber();
    165           break;
    166         }
    167       }
    168     }
    169 
    170     if (isalpha(NextChar) || NextChar == '_')
    171       return LexIdentifier();
    172 
    173     return LexNumber();
    174   }
    175   case '"': return LexString();
    176   case '$': return LexVarName();
    177   case '[': return LexBracket();
    178   case '!': return LexExclaim();
    179   }
    180 }
    181 
    182 /// LexString - Lex "[^"]*"
    183 tgtok::TokKind TGLexer::LexString() {
    184   const char *StrStart = CurPtr;
    185 
    186   CurStrVal = "";
    187 
    188   while (*CurPtr != '"') {
    189     // If we hit the end of the buffer, report an error.
    190     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
    191       return ReturnError(StrStart, "End of file in string literal");
    192 
    193     if (*CurPtr == '\n' || *CurPtr == '\r')
    194       return ReturnError(StrStart, "End of line in string literal");
    195 
    196     if (*CurPtr != '\\') {
    197       CurStrVal += *CurPtr++;
    198       continue;
    199     }
    200 
    201     ++CurPtr;
    202 
    203     switch (*CurPtr) {
    204     case '\\': case '\'': case '"':
    205       // These turn into their literal character.
    206       CurStrVal += *CurPtr++;
    207       break;
    208     case 't':
    209       CurStrVal += '\t';
    210       ++CurPtr;
    211       break;
    212     case 'n':
    213       CurStrVal += '\n';
    214       ++CurPtr;
    215       break;
    216 
    217     case '\n':
    218     case '\r':
    219       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
    220 
    221     // If we hit the end of the buffer, report an error.
    222     case '\0':
    223       if (CurPtr == CurBuf->getBufferEnd())
    224         return ReturnError(StrStart, "End of file in string literal");
    225       // FALL THROUGH
    226     default:
    227       return ReturnError(CurPtr, "invalid escape in string literal");
    228     }
    229   }
    230 
    231   ++CurPtr;
    232   return tgtok::StrVal;
    233 }
    234 
    235 tgtok::TokKind TGLexer::LexVarName() {
    236   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    237     return ReturnError(TokStart, "Invalid variable name");
    238 
    239   // Otherwise, we're ok, consume the rest of the characters.
    240   const char *VarNameStart = CurPtr++;
    241 
    242   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    243     ++CurPtr;
    244 
    245   CurStrVal.assign(VarNameStart, CurPtr);
    246   return tgtok::VarName;
    247 }
    248 
    249 
    250 tgtok::TokKind TGLexer::LexIdentifier() {
    251   // The first letter is [a-zA-Z_#].
    252   const char *IdentStart = TokStart;
    253 
    254   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
    255   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    256     ++CurPtr;
    257 
    258   // Check to see if this identifier is a keyword.
    259   StringRef Str(IdentStart, CurPtr-IdentStart);
    260 
    261   if (Str == "include") {
    262     if (LexInclude()) return tgtok::Error;
    263     return Lex();
    264   }
    265 
    266   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    267     .Case("int", tgtok::Int)
    268     .Case("bit", tgtok::Bit)
    269     .Case("bits", tgtok::Bits)
    270     .Case("string", tgtok::String)
    271     .Case("list", tgtok::List)
    272     .Case("code", tgtok::Code)
    273     .Case("dag", tgtok::Dag)
    274     .Case("class", tgtok::Class)
    275     .Case("def", tgtok::Def)
    276     .Case("foreach", tgtok::Foreach)
    277     .Case("defm", tgtok::Defm)
    278     .Case("multiclass", tgtok::MultiClass)
    279     .Case("field", tgtok::Field)
    280     .Case("let", tgtok::Let)
    281     .Case("in", tgtok::In)
    282     .Default(tgtok::Id);
    283 
    284   if (Kind == tgtok::Id)
    285     CurStrVal.assign(Str.begin(), Str.end());
    286   return Kind;
    287 }
    288 
    289 /// LexInclude - We just read the "include" token.  Get the string token that
    290 /// comes next and enter the include.
    291 bool TGLexer::LexInclude() {
    292   // The token after the include must be a string.
    293   tgtok::TokKind Tok = LexToken();
    294   if (Tok == tgtok::Error) return true;
    295   if (Tok != tgtok::StrVal) {
    296     PrintError(getLoc(), "Expected filename after include");
    297     return true;
    298   }
    299 
    300   // Get the string.
    301   std::string Filename = CurStrVal;
    302   std::string IncludedFile;
    303 
    304 
    305   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
    306                                     IncludedFile);
    307   if (CurBuffer == -1) {
    308     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    309     return true;
    310   }
    311 
    312   DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
    313   if (Found != Dependencies.end()) {
    314     PrintError(getLoc(),
    315                "File '" + IncludedFile + "' has already been included.");
    316     SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
    317                         "previously included here");
    318     return true;
    319   }
    320   Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
    321   // Save the line number and lex buffer of the includer.
    322   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
    323   CurPtr = CurBuf->getBufferStart();
    324   return false;
    325 }
    326 
    327 void TGLexer::SkipBCPLComment() {
    328   ++CurPtr;  // skip the second slash.
    329   while (1) {
    330     switch (*CurPtr) {
    331     case '\n':
    332     case '\r':
    333       return;  // Newline is end of comment.
    334     case 0:
    335       // If this is the end of the buffer, end the comment.
    336       if (CurPtr == CurBuf->getBufferEnd())
    337         return;
    338       break;
    339     }
    340     // Otherwise, skip the character.
    341     ++CurPtr;
    342   }
    343 }
    344 
    345 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
    346 /// is that we allow nesting.
    347 bool TGLexer::SkipCComment() {
    348   ++CurPtr;  // skip the star.
    349   unsigned CommentDepth = 1;
    350 
    351   while (1) {
    352     int CurChar = getNextChar();
    353     switch (CurChar) {
    354     case EOF:
    355       PrintError(TokStart, "Unterminated comment!");
    356       return true;
    357     case '*':
    358       // End of the comment?
    359       if (CurPtr[0] != '/') break;
    360 
    361       ++CurPtr;   // End the */.
    362       if (--CommentDepth == 0)
    363         return false;
    364       break;
    365     case '/':
    366       // Start of a nested comment?
    367       if (CurPtr[0] != '*') break;
    368       ++CurPtr;
    369       ++CommentDepth;
    370       break;
    371     }
    372   }
    373 }
    374 
    375 /// LexNumber - Lex:
    376 ///    [-+]?[0-9]+
    377 ///    0x[0-9a-fA-F]+
    378 ///    0b[01]+
    379 tgtok::TokKind TGLexer::LexNumber() {
    380   if (CurPtr[-1] == '0') {
    381     if (CurPtr[0] == 'x') {
    382       ++CurPtr;
    383       const char *NumStart = CurPtr;
    384       while (isxdigit(CurPtr[0]))
    385         ++CurPtr;
    386 
    387       // Requires at least one hex digit.
    388       if (CurPtr == NumStart)
    389         return ReturnError(TokStart, "Invalid hexadecimal number");
    390 
    391       errno = 0;
    392       CurIntVal = strtoll(NumStart, 0, 16);
    393       if (errno == EINVAL)
    394         return ReturnError(TokStart, "Invalid hexadecimal number");
    395       if (errno == ERANGE) {
    396         errno = 0;
    397         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
    398         if (errno == EINVAL)
    399           return ReturnError(TokStart, "Invalid hexadecimal number");
    400         if (errno == ERANGE)
    401           return ReturnError(TokStart, "Hexadecimal number out of range");
    402       }
    403       return tgtok::IntVal;
    404     } else if (CurPtr[0] == 'b') {
    405       ++CurPtr;
    406       const char *NumStart = CurPtr;
    407       while (CurPtr[0] == '0' || CurPtr[0] == '1')
    408         ++CurPtr;
    409 
    410       // Requires at least one binary digit.
    411       if (CurPtr == NumStart)
    412         return ReturnError(CurPtr-2, "Invalid binary number");
    413       CurIntVal = strtoll(NumStart, 0, 2);
    414       return tgtok::IntVal;
    415     }
    416   }
    417 
    418   // Check for a sign without a digit.
    419   if (!isdigit(CurPtr[0])) {
    420     if (CurPtr[-1] == '-')
    421       return tgtok::minus;
    422     else if (CurPtr[-1] == '+')
    423       return tgtok::plus;
    424   }
    425 
    426   while (isdigit(CurPtr[0]))
    427     ++CurPtr;
    428   CurIntVal = strtoll(TokStart, 0, 10);
    429   return tgtok::IntVal;
    430 }
    431 
    432 /// LexBracket - We just read '['.  If this is a code block, return it,
    433 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
    434 tgtok::TokKind TGLexer::LexBracket() {
    435   if (CurPtr[0] != '{')
    436     return tgtok::l_square;
    437   ++CurPtr;
    438   const char *CodeStart = CurPtr;
    439   while (1) {
    440     int Char = getNextChar();
    441     if (Char == EOF) break;
    442 
    443     if (Char != '}') continue;
    444 
    445     Char = getNextChar();
    446     if (Char == EOF) break;
    447     if (Char == ']') {
    448       CurStrVal.assign(CodeStart, CurPtr-2);
    449       return tgtok::CodeFragment;
    450     }
    451   }
    452 
    453   return ReturnError(CodeStart-2, "Unterminated Code Block");
    454 }
    455 
    456 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
    457 tgtok::TokKind TGLexer::LexExclaim() {
    458   if (!isalpha(*CurPtr))
    459     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
    460 
    461   const char *Start = CurPtr++;
    462   while (isalpha(*CurPtr))
    463     ++CurPtr;
    464 
    465   // Check to see which operator this is.
    466   tgtok::TokKind Kind =
    467     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
    468     .Case("eq", tgtok::XEq)
    469     .Case("if", tgtok::XIf)
    470     .Case("head", tgtok::XHead)
    471     .Case("tail", tgtok::XTail)
    472     .Case("con", tgtok::XConcat)
    473     .Case("add", tgtok::XADD)
    474     .Case("shl", tgtok::XSHL)
    475     .Case("sra", tgtok::XSRA)
    476     .Case("srl", tgtok::XSRL)
    477     .Case("cast", tgtok::XCast)
    478     .Case("empty", tgtok::XEmpty)
    479     .Case("subst", tgtok::XSubst)
    480     .Case("foreach", tgtok::XForEach)
    481     .Case("strconcat", tgtok::XStrConcat)
    482     .Default(tgtok::Error);
    483 
    484   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
    485 }
    486 
    487