Home | History | Annotate | Download | only in TableGen
      1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Implement the Lexer for TableGen.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "TGLexer.h"
     15 #include "llvm/TableGen/Error.h"
     16 #include "llvm/Support/SourceMgr.h"
     17 #include "llvm/Support/MemoryBuffer.h"
     18 #include "llvm/ADT/StringSwitch.h"
     19 #include "llvm/ADT/Twine.h"
     20 #include <cctype>
     21 #include <cstdio>
     22 #include <cstdlib>
     23 #include <cstring>
     24 #include <cerrno>
     25 
     26 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
     27 
     28 using namespace llvm;
     29 
     30 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
     31   CurBuffer = 0;
     32   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     33   CurPtr = CurBuf->getBufferStart();
     34   TokStart = 0;
     35 }
     36 
     37 SMLoc TGLexer::getLoc() const {
     38   return SMLoc::getFromPointer(TokStart);
     39 }
     40 
     41 /// ReturnError - Set the error to the specified string at the specified
     42 /// location.  This is defined to always return tgtok::Error.
     43 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
     44   PrintError(Loc, Msg);
     45   return tgtok::Error;
     46 }
     47 
     48 int TGLexer::getNextChar() {
     49   char CurChar = *CurPtr++;
     50   switch (CurChar) {
     51   default:
     52     return (unsigned char)CurChar;
     53   case 0: {
     54     // A nul character in the stream is either the end of the current buffer or
     55     // a random nul in the file.  Disambiguate that here.
     56     if (CurPtr-1 != CurBuf->getBufferEnd())
     57       return 0;  // Just whitespace.
     58 
     59     // If this is the end of an included file, pop the parent file off the
     60     // include stack.
     61     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     62     if (ParentIncludeLoc != SMLoc()) {
     63       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
     64       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
     65       CurPtr = ParentIncludeLoc.getPointer();
     66       return getNextChar();
     67     }
     68 
     69     // Otherwise, return end of file.
     70     --CurPtr;  // Another call to lex will return EOF again.
     71     return EOF;
     72   }
     73   case '\n':
     74   case '\r':
     75     // Handle the newline character by ignoring it and incrementing the line
     76     // count.  However, be careful about 'dos style' files with \n\r in them.
     77     // Only treat a \n\r or \r\n as a single line.
     78     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
     79         *CurPtr != CurChar)
     80       ++CurPtr;  // Eat the two char newline sequence.
     81     return '\n';
     82   }
     83 }
     84 
     85 int TGLexer::peekNextChar(int Index) {
     86   return *(CurPtr + Index);
     87 }
     88 
     89 tgtok::TokKind TGLexer::LexToken() {
     90   TokStart = CurPtr;
     91   // This always consumes at least one character.
     92   int CurChar = getNextChar();
     93 
     94   switch (CurChar) {
     95   default:
     96     // Handle letters: [a-zA-Z_]
     97     if (isalpha(CurChar) || CurChar == '_')
     98       return LexIdentifier();
     99 
    100     // Unknown character, emit an error.
    101     return ReturnError(TokStart, "Unexpected character");
    102   case EOF: return tgtok::Eof;
    103   case ':': return tgtok::colon;
    104   case ';': return tgtok::semi;
    105   case '.': return tgtok::period;
    106   case ',': return tgtok::comma;
    107   case '<': return tgtok::less;
    108   case '>': return tgtok::greater;
    109   case ']': return tgtok::r_square;
    110   case '{': return tgtok::l_brace;
    111   case '}': return tgtok::r_brace;
    112   case '(': return tgtok::l_paren;
    113   case ')': return tgtok::r_paren;
    114   case '=': return tgtok::equal;
    115   case '?': return tgtok::question;
    116   case '#': return tgtok::paste;
    117 
    118   case 0:
    119   case ' ':
    120   case '\t':
    121   case '\n':
    122   case '\r':
    123     // Ignore whitespace.
    124     return LexToken();
    125   case '/':
    126     // If this is the start of a // comment, skip until the end of the line or
    127     // the end of the buffer.
    128     if (*CurPtr == '/')
    129       SkipBCPLComment();
    130     else if (*CurPtr == '*') {
    131       if (SkipCComment())
    132         return tgtok::Error;
    133     } else // Otherwise, this is an error.
    134       return ReturnError(TokStart, "Unexpected character");
    135     return LexToken();
    136   case '-': case '+':
    137   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    138   case '7': case '8': case '9': {
    139     int NextChar = 0;
    140     if (isdigit(CurChar)) {
    141       // Allow identifiers to start with a number if it is followed by
    142       // an identifier.  This can happen with paste operations like
    143       // foo#8i.
    144       int i = 0;
    145       do {
    146         NextChar = peekNextChar(i++);
    147       } while (isdigit(NextChar));
    148 
    149       if (NextChar == 'x' || NextChar == 'b') {
    150         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
    151         // likely a number.
    152         int NextNextChar = peekNextChar(i);
    153         switch (NextNextChar) {
    154         default:
    155           break;
    156         case '0': case '1':
    157           if (NextChar == 'b')
    158             return LexNumber();
    159           // Fallthrough
    160         case '2': case '3': case '4': case '5':
    161         case '6': case '7': case '8': case '9':
    162         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    163         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    164           if (NextChar == 'x')
    165             return LexNumber();
    166           break;
    167         }
    168       }
    169     }
    170 
    171     if (isalpha(NextChar) || NextChar == '_')
    172       return LexIdentifier();
    173 
    174     return LexNumber();
    175   }
    176   case '"': return LexString();
    177   case '$': return LexVarName();
    178   case '[': return LexBracket();
    179   case '!': return LexExclaim();
    180   }
    181 }
    182 
    183 /// LexString - Lex "[^"]*"
    184 tgtok::TokKind TGLexer::LexString() {
    185   const char *StrStart = CurPtr;
    186 
    187   CurStrVal = "";
    188 
    189   while (*CurPtr != '"') {
    190     // If we hit the end of the buffer, report an error.
    191     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
    192       return ReturnError(StrStart, "End of file in string literal");
    193 
    194     if (*CurPtr == '\n' || *CurPtr == '\r')
    195       return ReturnError(StrStart, "End of line in string literal");
    196 
    197     if (*CurPtr != '\\') {
    198       CurStrVal += *CurPtr++;
    199       continue;
    200     }
    201 
    202     ++CurPtr;
    203 
    204     switch (*CurPtr) {
    205     case '\\': case '\'': case '"':
    206       // These turn into their literal character.
    207       CurStrVal += *CurPtr++;
    208       break;
    209     case 't':
    210       CurStrVal += '\t';
    211       ++CurPtr;
    212       break;
    213     case 'n':
    214       CurStrVal += '\n';
    215       ++CurPtr;
    216       break;
    217 
    218     case '\n':
    219     case '\r':
    220       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
    221 
    222     // If we hit the end of the buffer, report an error.
    223     case '\0':
    224       if (CurPtr == CurBuf->getBufferEnd())
    225         return ReturnError(StrStart, "End of file in string literal");
    226       // FALL THROUGH
    227     default:
    228       return ReturnError(CurPtr, "invalid escape in string literal");
    229     }
    230   }
    231 
    232   ++CurPtr;
    233   return tgtok::StrVal;
    234 }
    235 
    236 tgtok::TokKind TGLexer::LexVarName() {
    237   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    238     return ReturnError(TokStart, "Invalid variable name");
    239 
    240   // Otherwise, we're ok, consume the rest of the characters.
    241   const char *VarNameStart = CurPtr++;
    242 
    243   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    244     ++CurPtr;
    245 
    246   CurStrVal.assign(VarNameStart, CurPtr);
    247   return tgtok::VarName;
    248 }
    249 
    250 
    251 tgtok::TokKind TGLexer::LexIdentifier() {
    252   // The first letter is [a-zA-Z_#].
    253   const char *IdentStart = TokStart;
    254 
    255   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
    256   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    257     ++CurPtr;
    258 
    259   // Check to see if this identifier is a keyword.
    260   StringRef Str(IdentStart, CurPtr-IdentStart);
    261 
    262   if (Str == "include") {
    263     if (LexInclude()) return tgtok::Error;
    264     return Lex();
    265   }
    266 
    267   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    268     .Case("int", tgtok::Int)
    269     .Case("bit", tgtok::Bit)
    270     .Case("bits", tgtok::Bits)
    271     .Case("string", tgtok::String)
    272     .Case("list", tgtok::List)
    273     .Case("code", tgtok::Code)
    274     .Case("dag", tgtok::Dag)
    275     .Case("class", tgtok::Class)
    276     .Case("def", tgtok::Def)
    277     .Case("foreach", tgtok::Foreach)
    278     .Case("defm", tgtok::Defm)
    279     .Case("multiclass", tgtok::MultiClass)
    280     .Case("field", tgtok::Field)
    281     .Case("let", tgtok::Let)
    282     .Case("in", tgtok::In)
    283     .Default(tgtok::Id);
    284 
    285   if (Kind == tgtok::Id)
    286     CurStrVal.assign(Str.begin(), Str.end());
    287   return Kind;
    288 }
    289 
    290 /// LexInclude - We just read the "include" token.  Get the string token that
    291 /// comes next and enter the include.
    292 bool TGLexer::LexInclude() {
    293   // The token after the include must be a string.
    294   tgtok::TokKind Tok = LexToken();
    295   if (Tok == tgtok::Error) return true;
    296   if (Tok != tgtok::StrVal) {
    297     PrintError(getLoc(), "Expected filename after include");
    298     return true;
    299   }
    300 
    301   // Get the string.
    302   std::string Filename = CurStrVal;
    303   std::string IncludedFile;
    304 
    305 
    306   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
    307                                     IncludedFile);
    308   if (CurBuffer == -1) {
    309     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    310     return true;
    311   }
    312 
    313   Dependencies.push_back(IncludedFile);
    314   // Save the line number and lex buffer of the includer.
    315   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
    316   CurPtr = CurBuf->getBufferStart();
    317   return false;
    318 }
    319 
    320 void TGLexer::SkipBCPLComment() {
    321   ++CurPtr;  // skip the second slash.
    322   while (1) {
    323     switch (*CurPtr) {
    324     case '\n':
    325     case '\r':
    326       return;  // Newline is end of comment.
    327     case 0:
    328       // If this is the end of the buffer, end the comment.
    329       if (CurPtr == CurBuf->getBufferEnd())
    330         return;
    331       break;
    332     }
    333     // Otherwise, skip the character.
    334     ++CurPtr;
    335   }
    336 }
    337 
    338 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
    339 /// is that we allow nesting.
    340 bool TGLexer::SkipCComment() {
    341   ++CurPtr;  // skip the star.
    342   unsigned CommentDepth = 1;
    343 
    344   while (1) {
    345     int CurChar = getNextChar();
    346     switch (CurChar) {
    347     case EOF:
    348       PrintError(TokStart, "Unterminated comment!");
    349       return true;
    350     case '*':
    351       // End of the comment?
    352       if (CurPtr[0] != '/') break;
    353 
    354       ++CurPtr;   // End the */.
    355       if (--CommentDepth == 0)
    356         return false;
    357       break;
    358     case '/':
    359       // Start of a nested comment?
    360       if (CurPtr[0] != '*') break;
    361       ++CurPtr;
    362       ++CommentDepth;
    363       break;
    364     }
    365   }
    366 }
    367 
    368 /// LexNumber - Lex:
    369 ///    [-+]?[0-9]+
    370 ///    0x[0-9a-fA-F]+
    371 ///    0b[01]+
    372 tgtok::TokKind TGLexer::LexNumber() {
    373   if (CurPtr[-1] == '0') {
    374     if (CurPtr[0] == 'x') {
    375       ++CurPtr;
    376       const char *NumStart = CurPtr;
    377       while (isxdigit(CurPtr[0]))
    378         ++CurPtr;
    379 
    380       // Requires at least one hex digit.
    381       if (CurPtr == NumStart)
    382         return ReturnError(TokStart, "Invalid hexadecimal number");
    383 
    384       errno = 0;
    385       CurIntVal = strtoll(NumStart, 0, 16);
    386       if (errno == EINVAL)
    387         return ReturnError(TokStart, "Invalid hexadecimal number");
    388       if (errno == ERANGE) {
    389         errno = 0;
    390         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
    391         if (errno == EINVAL)
    392           return ReturnError(TokStart, "Invalid hexadecimal number");
    393         if (errno == ERANGE)
    394           return ReturnError(TokStart, "Hexadecimal number out of range");
    395       }
    396       return tgtok::IntVal;
    397     } else if (CurPtr[0] == 'b') {
    398       ++CurPtr;
    399       const char *NumStart = CurPtr;
    400       while (CurPtr[0] == '0' || CurPtr[0] == '1')
    401         ++CurPtr;
    402 
    403       // Requires at least one binary digit.
    404       if (CurPtr == NumStart)
    405         return ReturnError(CurPtr-2, "Invalid binary number");
    406       CurIntVal = strtoll(NumStart, 0, 2);
    407       return tgtok::IntVal;
    408     }
    409   }
    410 
    411   // Check for a sign without a digit.
    412   if (!isdigit(CurPtr[0])) {
    413     if (CurPtr[-1] == '-')
    414       return tgtok::minus;
    415     else if (CurPtr[-1] == '+')
    416       return tgtok::plus;
    417   }
    418 
    419   while (isdigit(CurPtr[0]))
    420     ++CurPtr;
    421   CurIntVal = strtoll(TokStart, 0, 10);
    422   return tgtok::IntVal;
    423 }
    424 
    425 /// LexBracket - We just read '['.  If this is a code block, return it,
    426 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
    427 tgtok::TokKind TGLexer::LexBracket() {
    428   if (CurPtr[0] != '{')
    429     return tgtok::l_square;
    430   ++CurPtr;
    431   const char *CodeStart = CurPtr;
    432   while (1) {
    433     int Char = getNextChar();
    434     if (Char == EOF) break;
    435 
    436     if (Char != '}') continue;
    437 
    438     Char = getNextChar();
    439     if (Char == EOF) break;
    440     if (Char == ']') {
    441       CurStrVal.assign(CodeStart, CurPtr-2);
    442       return tgtok::CodeFragment;
    443     }
    444   }
    445 
    446   return ReturnError(CodeStart-2, "Unterminated Code Block");
    447 }
    448 
    449 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
    450 tgtok::TokKind TGLexer::LexExclaim() {
    451   if (!isalpha(*CurPtr))
    452     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
    453 
    454   const char *Start = CurPtr++;
    455   while (isalpha(*CurPtr))
    456     ++CurPtr;
    457 
    458   // Check to see which operator this is.
    459   tgtok::TokKind Kind =
    460     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
    461     .Case("eq", tgtok::XEq)
    462     .Case("if", tgtok::XIf)
    463     .Case("head", tgtok::XHead)
    464     .Case("tail", tgtok::XTail)
    465     .Case("con", tgtok::XConcat)
    466     .Case("shl", tgtok::XSHL)
    467     .Case("sra", tgtok::XSRA)
    468     .Case("srl", tgtok::XSRL)
    469     .Case("cast", tgtok::XCast)
    470     .Case("empty", tgtok::XEmpty)
    471     .Case("subst", tgtok::XSubst)
    472     .Case("foreach", tgtok::XForEach)
    473     .Case("strconcat", tgtok::XStrConcat)
    474     .Default(tgtok::Error);
    475 
    476   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
    477 }
    478 
    479