Home | History | Annotate | Download | only in TableGen
      1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Implement the Lexer for TableGen.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "TGLexer.h"
     15 #include "llvm/TableGen/Error.h"
     16 #include "llvm/Support/SourceMgr.h"
     17 #include "llvm/Support/MemoryBuffer.h"
     18 #include "llvm/Config/config.h"
     19 #include "llvm/ADT/StringSwitch.h"
     20 #include "llvm/ADT/Twine.h"
     21 #include <cctype>
     22 #include <cstdio>
     23 #include <cstdlib>
     24 #include <cstring>
     25 #include <cerrno>
     26 using namespace llvm;
     27 
     28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
     29   CurBuffer = 0;
     30   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     31   CurPtr = CurBuf->getBufferStart();
     32   TokStart = 0;
     33 }
     34 
     35 SMLoc TGLexer::getLoc() const {
     36   return SMLoc::getFromPointer(TokStart);
     37 }
     38 
     39 /// ReturnError - Set the error to the specified string at the specified
     40 /// location.  This is defined to always return tgtok::Error.
     41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
     42   PrintError(Loc, Msg);
     43   return tgtok::Error;
     44 }
     45 
     46 int TGLexer::getNextChar() {
     47   char CurChar = *CurPtr++;
     48   switch (CurChar) {
     49   default:
     50     return (unsigned char)CurChar;
     51   case 0: {
     52     // A nul character in the stream is either the end of the current buffer or
     53     // a random nul in the file.  Disambiguate that here.
     54     if (CurPtr-1 != CurBuf->getBufferEnd())
     55       return 0;  // Just whitespace.
     56 
     57     // If this is the end of an included file, pop the parent file off the
     58     // include stack.
     59     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     60     if (ParentIncludeLoc != SMLoc()) {
     61       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
     62       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     63       CurPtr = ParentIncludeLoc.getPointer();
     64       return getNextChar();
     65     }
     66 
     67     // Otherwise, return end of file.
     68     --CurPtr;  // Another call to lex will return EOF again.
     69     return EOF;
     70   }
     71   case '\n':
     72   case '\r':
     73     // Handle the newline character by ignoring it and incrementing the line
     74     // count.  However, be careful about 'dos style' files with \n\r in them.
     75     // Only treat a \n\r or \r\n as a single line.
     76     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
     77         *CurPtr != CurChar)
     78       ++CurPtr;  // Eat the two char newline sequence.
     79     return '\n';
     80   }
     81 }
     82 
     83 int TGLexer::peekNextChar(int Index) {
     84   return *(CurPtr + Index);
     85 }
     86 
     87 tgtok::TokKind TGLexer::LexToken() {
     88   TokStart = CurPtr;
     89   // This always consumes at least one character.
     90   int CurChar = getNextChar();
     91 
     92   switch (CurChar) {
     93   default:
     94     // Handle letters: [a-zA-Z_]
     95     if (isalpha(CurChar) || CurChar == '_')
     96       return LexIdentifier();
     97 
     98     // Unknown character, emit an error.
     99     return ReturnError(TokStart, "Unexpected character");
    100   case EOF: return tgtok::Eof;
    101   case ':': return tgtok::colon;
    102   case ';': return tgtok::semi;
    103   case '.': return tgtok::period;
    104   case ',': return tgtok::comma;
    105   case '<': return tgtok::less;
    106   case '>': return tgtok::greater;
    107   case ']': return tgtok::r_square;
    108   case '{': return tgtok::l_brace;
    109   case '}': return tgtok::r_brace;
    110   case '(': return tgtok::l_paren;
    111   case ')': return tgtok::r_paren;
    112   case '=': return tgtok::equal;
    113   case '?': return tgtok::question;
    114   case '#': return tgtok::paste;
    115 
    116   case 0:
    117   case ' ':
    118   case '\t':
    119   case '\n':
    120   case '\r':
    121     // Ignore whitespace.
    122     return LexToken();
    123   case '/':
    124     // If this is the start of a // comment, skip until the end of the line or
    125     // the end of the buffer.
    126     if (*CurPtr == '/')
    127       SkipBCPLComment();
    128     else if (*CurPtr == '*') {
    129       if (SkipCComment())
    130         return tgtok::Error;
    131     } else // Otherwise, this is an error.
    132       return ReturnError(TokStart, "Unexpected character");
    133     return LexToken();
    134   case '-': case '+':
    135   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    136   case '7': case '8': case '9': {
    137     int NextChar = 0;
    138     if (isdigit(CurChar)) {
    139       // Allow identifiers to start with a number if it is followed by
    140       // an identifier.  This can happen with paste operations like
    141       // foo#8i.
    142       int i = 0;
    143       do {
    144         NextChar = peekNextChar(i++);
    145       } while (isdigit(NextChar));
    146 
    147       if (NextChar == 'x' || NextChar == 'b') {
    148         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
    149         // likely a number.
    150         int NextNextChar = peekNextChar(i);
    151         switch (NextNextChar) {
    152         default:
    153           break;
    154         case '0': case '1':
    155           if (NextChar == 'b')
    156             return LexNumber();
    157           // Fallthrough
    158         case '2': case '3': case '4': case '5':
    159         case '6': case '7': case '8': case '9':
    160         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    161         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    162           if (NextChar == 'x')
    163             return LexNumber();
    164           break;
    165         }
    166       }
    167     }
    168 
    169     if (isalpha(NextChar) || NextChar == '_')
    170       return LexIdentifier();
    171 
    172     return LexNumber();
    173   }
    174   case '"': return LexString();
    175   case '$': return LexVarName();
    176   case '[': return LexBracket();
    177   case '!': return LexExclaim();
    178   }
    179 }
    180 
    181 /// LexString - Lex "[^"]*"
    182 tgtok::TokKind TGLexer::LexString() {
    183   const char *StrStart = CurPtr;
    184 
    185   CurStrVal = "";
    186 
    187   while (*CurPtr != '"') {
    188     // If we hit the end of the buffer, report an error.
    189     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
    190       return ReturnError(StrStart, "End of file in string literal");
    191 
    192     if (*CurPtr == '\n' || *CurPtr == '\r')
    193       return ReturnError(StrStart, "End of line in string literal");
    194 
    195     if (*CurPtr != '\\') {
    196       CurStrVal += *CurPtr++;
    197       continue;
    198     }
    199 
    200     ++CurPtr;
    201 
    202     switch (*CurPtr) {
    203     case '\\': case '\'': case '"':
    204       // These turn into their literal character.
    205       CurStrVal += *CurPtr++;
    206       break;
    207     case 't':
    208       CurStrVal += '\t';
    209       ++CurPtr;
    210       break;
    211     case 'n':
    212       CurStrVal += '\n';
    213       ++CurPtr;
    214       break;
    215 
    216     case '\n':
    217     case '\r':
    218       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
    219 
    220     // If we hit the end of the buffer, report an error.
    221     case '\0':
    222       if (CurPtr == CurBuf->getBufferEnd())
    223         return ReturnError(StrStart, "End of file in string literal");
    224       // FALL THROUGH
    225     default:
    226       return ReturnError(CurPtr, "invalid escape in string literal");
    227     }
    228   }
    229 
    230   ++CurPtr;
    231   return tgtok::StrVal;
    232 }
    233 
    234 tgtok::TokKind TGLexer::LexVarName() {
    235   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    236     return ReturnError(TokStart, "Invalid variable name");
    237 
    238   // Otherwise, we're ok, consume the rest of the characters.
    239   const char *VarNameStart = CurPtr++;
    240 
    241   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    242     ++CurPtr;
    243 
    244   CurStrVal.assign(VarNameStart, CurPtr);
    245   return tgtok::VarName;
    246 }
    247 
    248 
    249 tgtok::TokKind TGLexer::LexIdentifier() {
    250   // The first letter is [a-zA-Z_#].
    251   const char *IdentStart = TokStart;
    252 
    253   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
    254   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    255     ++CurPtr;
    256 
    257   // Check to see if this identifier is a keyword.
    258   StringRef Str(IdentStart, CurPtr-IdentStart);
    259 
    260   if (Str == "include") {
    261     if (LexInclude()) return tgtok::Error;
    262     return Lex();
    263   }
    264 
    265   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    266     .Case("int", tgtok::Int)
    267     .Case("bit", tgtok::Bit)
    268     .Case("bits", tgtok::Bits)
    269     .Case("string", tgtok::String)
    270     .Case("list", tgtok::List)
    271     .Case("code", tgtok::Code)
    272     .Case("dag", tgtok::Dag)
    273     .Case("class", tgtok::Class)
    274     .Case("def", tgtok::Def)
    275     .Case("defm", tgtok::Defm)
    276     .Case("multiclass", tgtok::MultiClass)
    277     .Case("field", tgtok::Field)
    278     .Case("let", tgtok::Let)
    279     .Case("in", tgtok::In)
    280     .Default(tgtok::Id);
    281 
    282   if (Kind == tgtok::Id)
    283     CurStrVal.assign(Str.begin(), Str.end());
    284   return Kind;
    285 }
    286 
    287 /// LexInclude - We just read the "include" token.  Get the string token that
    288 /// comes next and enter the include.
    289 bool TGLexer::LexInclude() {
    290   // The token after the include must be a string.
    291   tgtok::TokKind Tok = LexToken();
    292   if (Tok == tgtok::Error) return true;
    293   if (Tok != tgtok::StrVal) {
    294     PrintError(getLoc(), "Expected filename after include");
    295     return true;
    296   }
    297 
    298   // Get the string.
    299   std::string Filename = CurStrVal;
    300   std::string IncludedFile;
    301 
    302 
    303   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
    304                                     IncludedFile);
    305   if (CurBuffer == -1) {
    306     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    307     return true;
    308   }
    309 
    310   Dependencies.push_back(IncludedFile);
    311   // Save the line number and lex buffer of the includer.
    312   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
    313   CurPtr = CurBuf->getBufferStart();
    314   return false;
    315 }
    316 
    317 void TGLexer::SkipBCPLComment() {
    318   ++CurPtr;  // skip the second slash.
    319   while (1) {
    320     switch (*CurPtr) {
    321     case '\n':
    322     case '\r':
    323       return;  // Newline is end of comment.
    324     case 0:
    325       // If this is the end of the buffer, end the comment.
    326       if (CurPtr == CurBuf->getBufferEnd())
    327         return;
    328       break;
    329     }
    330     // Otherwise, skip the character.
    331     ++CurPtr;
    332   }
    333 }
    334 
    335 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
    336 /// is that we allow nesting.
    337 bool TGLexer::SkipCComment() {
    338   ++CurPtr;  // skip the star.
    339   unsigned CommentDepth = 1;
    340 
    341   while (1) {
    342     int CurChar = getNextChar();
    343     switch (CurChar) {
    344     case EOF:
    345       PrintError(TokStart, "Unterminated comment!");
    346       return true;
    347     case '*':
    348       // End of the comment?
    349       if (CurPtr[0] != '/') break;
    350 
    351       ++CurPtr;   // End the */.
    352       if (--CommentDepth == 0)
    353         return false;
    354       break;
    355     case '/':
    356       // Start of a nested comment?
    357       if (CurPtr[0] != '*') break;
    358       ++CurPtr;
    359       ++CommentDepth;
    360       break;
    361     }
    362   }
    363 }
    364 
    365 /// LexNumber - Lex:
    366 ///    [-+]?[0-9]+
    367 ///    0x[0-9a-fA-F]+
    368 ///    0b[01]+
    369 tgtok::TokKind TGLexer::LexNumber() {
    370   if (CurPtr[-1] == '0') {
    371     if (CurPtr[0] == 'x') {
    372       ++CurPtr;
    373       const char *NumStart = CurPtr;
    374       while (isxdigit(CurPtr[0]))
    375         ++CurPtr;
    376 
    377       // Requires at least one hex digit.
    378       if (CurPtr == NumStart)
    379         return ReturnError(TokStart, "Invalid hexadecimal number");
    380 
    381       errno = 0;
    382       CurIntVal = strtoll(NumStart, 0, 16);
    383       if (errno == EINVAL)
    384         return ReturnError(TokStart, "Invalid hexadecimal number");
    385       if (errno == ERANGE) {
    386         errno = 0;
    387         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
    388         if (errno == EINVAL)
    389           return ReturnError(TokStart, "Invalid hexadecimal number");
    390         if (errno == ERANGE)
    391           return ReturnError(TokStart, "Hexadecimal number out of range");
    392       }
    393       return tgtok::IntVal;
    394     } else if (CurPtr[0] == 'b') {
    395       ++CurPtr;
    396       const char *NumStart = CurPtr;
    397       while (CurPtr[0] == '0' || CurPtr[0] == '1')
    398         ++CurPtr;
    399 
    400       // Requires at least one binary digit.
    401       if (CurPtr == NumStart)
    402         return ReturnError(CurPtr-2, "Invalid binary number");
    403       CurIntVal = strtoll(NumStart, 0, 2);
    404       return tgtok::IntVal;
    405     }
    406   }
    407 
    408   // Check for a sign without a digit.
    409   if (!isdigit(CurPtr[0])) {
    410     if (CurPtr[-1] == '-')
    411       return tgtok::minus;
    412     else if (CurPtr[-1] == '+')
    413       return tgtok::plus;
    414   }
    415 
    416   while (isdigit(CurPtr[0]))
    417     ++CurPtr;
    418   CurIntVal = strtoll(TokStart, 0, 10);
    419   return tgtok::IntVal;
    420 }
    421 
    422 /// LexBracket - We just read '['.  If this is a code block, return it,
    423 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
    424 tgtok::TokKind TGLexer::LexBracket() {
    425   if (CurPtr[0] != '{')
    426     return tgtok::l_square;
    427   ++CurPtr;
    428   const char *CodeStart = CurPtr;
    429   while (1) {
    430     int Char = getNextChar();
    431     if (Char == EOF) break;
    432 
    433     if (Char != '}') continue;
    434 
    435     Char = getNextChar();
    436     if (Char == EOF) break;
    437     if (Char == ']') {
    438       CurStrVal.assign(CodeStart, CurPtr-2);
    439       return tgtok::CodeFragment;
    440     }
    441   }
    442 
    443   return ReturnError(CodeStart-2, "Unterminated Code Block");
    444 }
    445 
    446 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
    447 tgtok::TokKind TGLexer::LexExclaim() {
    448   if (!isalpha(*CurPtr))
    449     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
    450 
    451   const char *Start = CurPtr++;
    452   while (isalpha(*CurPtr))
    453     ++CurPtr;
    454 
    455   // Check to see which operator this is.
    456   tgtok::TokKind Kind =
    457     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
    458     .Case("eq", tgtok::XEq)
    459     .Case("if", tgtok::XIf)
    460     .Case("head", tgtok::XHead)
    461     .Case("tail", tgtok::XTail)
    462     .Case("con", tgtok::XConcat)
    463     .Case("shl", tgtok::XSHL)
    464     .Case("sra", tgtok::XSRA)
    465     .Case("srl", tgtok::XSRL)
    466     .Case("cast", tgtok::XCast)
    467     .Case("empty", tgtok::XEmpty)
    468     .Case("subst", tgtok::XSubst)
    469     .Case("foreach", tgtok::XForEach)
    470     .Case("strconcat", tgtok::XStrConcat)
    471     .Default(tgtok::Error);
    472 
    473   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
    474 }
    475 
    476