Home | History | Annotate | Download | only in TableGen
      1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Implement the Lexer for TableGen.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "TGLexer.h"
     15 #include "llvm/TableGen/Error.h"
     16 #include "llvm/Support/SourceMgr.h"
     17 #include "llvm/Support/MemoryBuffer.h"
     18 #include "llvm/Config/config.h"
     19 #include "llvm/ADT/StringSwitch.h"
     20 #include "llvm/ADT/Twine.h"
     21 #include <cctype>
     22 #include <cstdio>
     23 #include <cstdlib>
     24 #include <cstring>
     25 #include <cerrno>
     26 using namespace llvm;
     27 
     28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
     29   CurBuffer = 0;
     30   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     31   CurPtr = CurBuf->getBufferStart();
     32   TokStart = 0;
     33 }
     34 
     35 SMLoc TGLexer::getLoc() const {
     36   return SMLoc::getFromPointer(TokStart);
     37 }
     38 
     39 /// ReturnError - Set the error to the specified string at the specified
     40 /// location.  This is defined to always return tgtok::Error.
     41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
     42   PrintError(Loc, Msg);
     43   return tgtok::Error;
     44 }
     45 
     46 int TGLexer::getNextChar() {
     47   char CurChar = *CurPtr++;
     48   switch (CurChar) {
     49   default:
     50     return (unsigned char)CurChar;
     51   case 0: {
     52     // A nul character in the stream is either the end of the current buffer or
     53     // a random nul in the file.  Disambiguate that here.
     54     if (CurPtr-1 != CurBuf->getBufferEnd())
     55       return 0;  // Just whitespace.
     56 
     57     // If this is the end of an included file, pop the parent file off the
     58     // include stack.
     59     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     60     if (ParentIncludeLoc != SMLoc()) {
     61       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
     62       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     63       CurPtr = ParentIncludeLoc.getPointer();
     64       return getNextChar();
     65     }
     66 
     67     // Otherwise, return end of file.
     68     --CurPtr;  // Another call to lex will return EOF again.
     69     return EOF;
     70   }
     71   case '\n':
     72   case '\r':
     73     // Handle the newline character by ignoring it and incrementing the line
     74     // count.  However, be careful about 'dos style' files with \n\r in them.
     75     // Only treat a \n\r or \r\n as a single line.
     76     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
     77         *CurPtr != CurChar)
     78       ++CurPtr;  // Eat the two char newline sequence.
     79     return '\n';
     80   }
     81 }
     82 
     83 tgtok::TokKind TGLexer::LexToken() {
     84   TokStart = CurPtr;
     85   // This always consumes at least one character.
     86   int CurChar = getNextChar();
     87 
     88   switch (CurChar) {
     89   default:
     90     // Handle letters: [a-zA-Z_#]
     91     if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
     92       return LexIdentifier();
     93 
     94     // Unknown character, emit an error.
     95     return ReturnError(TokStart, "Unexpected character");
     96   case EOF: return tgtok::Eof;
     97   case ':': return tgtok::colon;
     98   case ';': return tgtok::semi;
     99   case '.': return tgtok::period;
    100   case ',': return tgtok::comma;
    101   case '<': return tgtok::less;
    102   case '>': return tgtok::greater;
    103   case ']': return tgtok::r_square;
    104   case '{': return tgtok::l_brace;
    105   case '}': return tgtok::r_brace;
    106   case '(': return tgtok::l_paren;
    107   case ')': return tgtok::r_paren;
    108   case '=': return tgtok::equal;
    109   case '?': return tgtok::question;
    110 
    111   case 0:
    112   case ' ':
    113   case '\t':
    114   case '\n':
    115   case '\r':
    116     // Ignore whitespace.
    117     return LexToken();
    118   case '/':
    119     // If this is the start of a // comment, skip until the end of the line or
    120     // the end of the buffer.
    121     if (*CurPtr == '/')
    122       SkipBCPLComment();
    123     else if (*CurPtr == '*') {
    124       if (SkipCComment())
    125         return tgtok::Error;
    126     } else // Otherwise, this is an error.
    127       return ReturnError(TokStart, "Unexpected character");
    128     return LexToken();
    129   case '-': case '+':
    130   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    131   case '7': case '8': case '9':
    132     return LexNumber();
    133   case '"': return LexString();
    134   case '$': return LexVarName();
    135   case '[': return LexBracket();
    136   case '!': return LexExclaim();
    137   }
    138 }
    139 
    140 /// LexString - Lex "[^"]*"
    141 tgtok::TokKind TGLexer::LexString() {
    142   const char *StrStart = CurPtr;
    143 
    144   CurStrVal = "";
    145 
    146   while (*CurPtr != '"') {
    147     // If we hit the end of the buffer, report an error.
    148     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
    149       return ReturnError(StrStart, "End of file in string literal");
    150 
    151     if (*CurPtr == '\n' || *CurPtr == '\r')
    152       return ReturnError(StrStart, "End of line in string literal");
    153 
    154     if (*CurPtr != '\\') {
    155       CurStrVal += *CurPtr++;
    156       continue;
    157     }
    158 
    159     ++CurPtr;
    160 
    161     switch (*CurPtr) {
    162     case '\\': case '\'': case '"':
    163       // These turn into their literal character.
    164       CurStrVal += *CurPtr++;
    165       break;
    166     case 't':
    167       CurStrVal += '\t';
    168       ++CurPtr;
    169       break;
    170     case 'n':
    171       CurStrVal += '\n';
    172       ++CurPtr;
    173       break;
    174 
    175     case '\n':
    176     case '\r':
    177       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
    178 
    179     // If we hit the end of the buffer, report an error.
    180     case '\0':
    181       if (CurPtr == CurBuf->getBufferEnd())
    182         return ReturnError(StrStart, "End of file in string literal");
    183       // FALL THROUGH
    184     default:
    185       return ReturnError(CurPtr, "invalid escape in string literal");
    186     }
    187   }
    188 
    189   ++CurPtr;
    190   return tgtok::StrVal;
    191 }
    192 
    193 tgtok::TokKind TGLexer::LexVarName() {
    194   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    195     return ReturnError(TokStart, "Invalid variable name");
    196 
    197   // Otherwise, we're ok, consume the rest of the characters.
    198   const char *VarNameStart = CurPtr++;
    199 
    200   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    201     ++CurPtr;
    202 
    203   CurStrVal.assign(VarNameStart, CurPtr);
    204   return tgtok::VarName;
    205 }
    206 
    207 
    208 tgtok::TokKind TGLexer::LexIdentifier() {
    209   // The first letter is [a-zA-Z_#].
    210   const char *IdentStart = TokStart;
    211 
    212   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
    213   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
    214          *CurPtr == '#')
    215     ++CurPtr;
    216 
    217   // Check to see if this identifier is a keyword.
    218   StringRef Str(IdentStart, CurPtr-IdentStart);
    219 
    220   if (Str == "include") {
    221     if (LexInclude()) return tgtok::Error;
    222     return Lex();
    223   }
    224 
    225   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    226     .Case("int", tgtok::Int)
    227     .Case("bit", tgtok::Bit)
    228     .Case("bits", tgtok::Bits)
    229     .Case("string", tgtok::String)
    230     .Case("list", tgtok::List)
    231     .Case("code", tgtok::Code)
    232     .Case("dag", tgtok::Dag)
    233     .Case("class", tgtok::Class)
    234     .Case("def", tgtok::Def)
    235     .Case("defm", tgtok::Defm)
    236     .Case("multiclass", tgtok::MultiClass)
    237     .Case("field", tgtok::Field)
    238     .Case("let", tgtok::Let)
    239     .Case("in", tgtok::In)
    240     .Default(tgtok::Id);
    241 
    242   if (Kind == tgtok::Id)
    243     CurStrVal.assign(Str.begin(), Str.end());
    244   return Kind;
    245 }
    246 
    247 /// LexInclude - We just read the "include" token.  Get the string token that
    248 /// comes next and enter the include.
    249 bool TGLexer::LexInclude() {
    250   // The token after the include must be a string.
    251   tgtok::TokKind Tok = LexToken();
    252   if (Tok == tgtok::Error) return true;
    253   if (Tok != tgtok::StrVal) {
    254     PrintError(getLoc(), "Expected filename after include");
    255     return true;
    256   }
    257 
    258   // Get the string.
    259   std::string Filename = CurStrVal;
    260   std::string IncludedFile;
    261 
    262 
    263   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
    264                                     IncludedFile);
    265   if (CurBuffer == -1) {
    266     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    267     return true;
    268   }
    269 
    270   Dependencies.push_back(IncludedFile);
    271   // Save the line number and lex buffer of the includer.
    272   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
    273   CurPtr = CurBuf->getBufferStart();
    274   return false;
    275 }
    276 
    277 void TGLexer::SkipBCPLComment() {
    278   ++CurPtr;  // skip the second slash.
    279   while (1) {
    280     switch (*CurPtr) {
    281     case '\n':
    282     case '\r':
    283       return;  // Newline is end of comment.
    284     case 0:
    285       // If this is the end of the buffer, end the comment.
    286       if (CurPtr == CurBuf->getBufferEnd())
    287         return;
    288       break;
    289     }
    290     // Otherwise, skip the character.
    291     ++CurPtr;
    292   }
    293 }
    294 
    295 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
    296 /// is that we allow nesting.
    297 bool TGLexer::SkipCComment() {
    298   ++CurPtr;  // skip the star.
    299   unsigned CommentDepth = 1;
    300 
    301   while (1) {
    302     int CurChar = getNextChar();
    303     switch (CurChar) {
    304     case EOF:
    305       PrintError(TokStart, "Unterminated comment!");
    306       return true;
    307     case '*':
    308       // End of the comment?
    309       if (CurPtr[0] != '/') break;
    310 
    311       ++CurPtr;   // End the */.
    312       if (--CommentDepth == 0)
    313         return false;
    314       break;
    315     case '/':
    316       // Start of a nested comment?
    317       if (CurPtr[0] != '*') break;
    318       ++CurPtr;
    319       ++CommentDepth;
    320       break;
    321     }
    322   }
    323 }
    324 
    325 /// LexNumber - Lex:
    326 ///    [-+]?[0-9]+
    327 ///    0x[0-9a-fA-F]+
    328 ///    0b[01]+
    329 tgtok::TokKind TGLexer::LexNumber() {
    330   if (CurPtr[-1] == '0') {
    331     if (CurPtr[0] == 'x') {
    332       ++CurPtr;
    333       const char *NumStart = CurPtr;
    334       while (isxdigit(CurPtr[0]))
    335         ++CurPtr;
    336 
    337       // Requires at least one hex digit.
    338       if (CurPtr == NumStart)
    339         return ReturnError(TokStart, "Invalid hexadecimal number");
    340 
    341       errno = 0;
    342       CurIntVal = strtoll(NumStart, 0, 16);
    343       if (errno == EINVAL)
    344         return ReturnError(TokStart, "Invalid hexadecimal number");
    345       if (errno == ERANGE) {
    346         errno = 0;
    347         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
    348         if (errno == EINVAL)
    349           return ReturnError(TokStart, "Invalid hexadecimal number");
    350         if (errno == ERANGE)
    351           return ReturnError(TokStart, "Hexadecimal number out of range");
    352       }
    353       return tgtok::IntVal;
    354     } else if (CurPtr[0] == 'b') {
    355       ++CurPtr;
    356       const char *NumStart = CurPtr;
    357       while (CurPtr[0] == '0' || CurPtr[0] == '1')
    358         ++CurPtr;
    359 
    360       // Requires at least one binary digit.
    361       if (CurPtr == NumStart)
    362         return ReturnError(CurPtr-2, "Invalid binary number");
    363       CurIntVal = strtoll(NumStart, 0, 2);
    364       return tgtok::IntVal;
    365     }
    366   }
    367 
    368   // Check for a sign without a digit.
    369   if (!isdigit(CurPtr[0])) {
    370     if (CurPtr[-1] == '-')
    371       return tgtok::minus;
    372     else if (CurPtr[-1] == '+')
    373       return tgtok::plus;
    374   }
    375 
    376   while (isdigit(CurPtr[0]))
    377     ++CurPtr;
    378   CurIntVal = strtoll(TokStart, 0, 10);
    379   return tgtok::IntVal;
    380 }
    381 
    382 /// LexBracket - We just read '['.  If this is a code block, return it,
    383 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
    384 tgtok::TokKind TGLexer::LexBracket() {
    385   if (CurPtr[0] != '{')
    386     return tgtok::l_square;
    387   ++CurPtr;
    388   const char *CodeStart = CurPtr;
    389   while (1) {
    390     int Char = getNextChar();
    391     if (Char == EOF) break;
    392 
    393     if (Char != '}') continue;
    394 
    395     Char = getNextChar();
    396     if (Char == EOF) break;
    397     if (Char == ']') {
    398       CurStrVal.assign(CodeStart, CurPtr-2);
    399       return tgtok::CodeFragment;
    400     }
    401   }
    402 
    403   return ReturnError(CodeStart-2, "Unterminated Code Block");
    404 }
    405 
    406 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
    407 tgtok::TokKind TGLexer::LexExclaim() {
    408   if (!isalpha(*CurPtr))
    409     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
    410 
    411   const char *Start = CurPtr++;
    412   while (isalpha(*CurPtr))
    413     ++CurPtr;
    414 
    415   // Check to see which operator this is.
    416   tgtok::TokKind Kind =
    417     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
    418     .Case("eq", tgtok::XEq)
    419     .Case("if", tgtok::XIf)
    420     .Case("head", tgtok::XHead)
    421     .Case("tail", tgtok::XTail)
    422     .Case("con", tgtok::XConcat)
    423     .Case("shl", tgtok::XSHL)
    424     .Case("sra", tgtok::XSRA)
    425     .Case("srl", tgtok::XSRL)
    426     .Case("cast", tgtok::XCast)
    427     .Case("empty", tgtok::XEmpty)
    428     .Case("subst", tgtok::XSubst)
    429     .Case("foreach", tgtok::XForEach)
    430     .Case("strconcat", tgtok::XStrConcat)
    431     .Default(tgtok::Error);
    432 
    433   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
    434 }
    435 
    436