Home | History | Annotate | Download | only in llvm-rc
      1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===---------------------------------------------------------------------===//
      9 //
     10 // This file implements an interface defined in ResourceScriptToken.h.
     11 // In particular, it defines an .rc script tokenizer.
     12 //
     13 //===---------------------------------------------------------------------===//
     14 
     15 #include "ResourceScriptToken.h"
     16 #include "llvm/Support/raw_ostream.h"
     17 
     18 #include <algorithm>
     19 #include <cassert>
     20 #include <cctype>
     21 #include <cstdlib>
     22 #include <utility>
     23 
     24 using namespace llvm;
     25 
     26 using Kind = RCToken::Kind;
     27 
     28 // Checks if Representation is a correct description of an RC integer.
     29 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
     30 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
     31 // character (that is the difference between our representation and
     32 // StringRef's one). If Representation is correct, 'true' is returned and
     33 // the return value is put back in Num.
     34 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
     35   size_t Length = Representation.size();
     36   if (Length == 0)
     37     return false;
     38   // Strip the last 'L' if unnecessary.
     39   if (std::toupper(Representation.back()) == 'L')
     40     Representation = Representation.drop_back(1);
     41 
     42   return !Representation.getAsInteger<uint32_t>(0, Num);
     43 }
     44 
// Constructs a token of the given kind; Value is a StringRef, i.e. a
// non-owning view into the tokenized input buffer.
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
     47 
     48 uint32_t RCToken::intValue() const {
     49   assert(TokenKind == Kind::Int);
     50   // We assume that the token already is a correct integer (checked by
     51   // rcGetAsInteger).
     52   uint32_t Result;
     53   bool IsSuccess = rcGetAsInteger(TokenValue, Result);
     54   assert(IsSuccess);
     55   (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
     56   return Result;
     57 }
     58 
     59 bool RCToken::isLongInt() const {
     60   return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
     61 }
     62 
// Returns the raw text of the token as it appeared in the input.
StringRef RCToken::value() const { return TokenValue; }
     64 
// Returns the classified kind of this token.
Kind RCToken::kind() const { return TokenKind; }
     66 
     67 bool RCToken::isBinaryOp() const {
     68   switch (TokenKind) {
     69   case Kind::Plus:
     70   case Kind::Minus:
     71   case Kind::Pipe:
     72   case Kind::Amp:
     73     return true;
     74   default:
     75     return false;
     76   }
     77 }
     78 
// Produces a StringError with the common "Error parsing file: " prefix.
// inconvertibleErrorCode() is used because there is no meaningful
// std::error_code to attach to a script-parsing failure.
static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}
     83 
     84 namespace {
     85 
     86 class Tokenizer {
     87 public:
     88   Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
     89 
     90   Expected<std::vector<RCToken>> run();
     91 
     92 private:
     93   // All 'advancing' methods return boolean values; if they're equal to false,
     94   // the stream has ended or failed.
     95   bool advance(size_t Amount = 1);
     96   bool skipWhitespaces();
     97 
     98   // Consumes a token. If any problem occurred, a non-empty Error is returned.
     99   Error consumeToken(const Kind TokenKind);
    100 
    101   // Check if tokenizer is about to read FollowingChars.
    102   bool willNowRead(StringRef FollowingChars) const;
    103 
    104   // Check if tokenizer can start reading an identifier at current position.
    105   // The original tool did non specify the rules to determine what is a correct
    106   // identifier. We assume they should follow the C convention:
    107   // [a-zA-Z_][a-zA-Z0-9_]*.
    108   bool canStartIdentifier() const;
    109   // Check if tokenizer can continue reading an identifier.
    110   bool canContinueIdentifier() const;
    111 
    112   // Check if tokenizer can start reading an integer.
    113   // A correct integer always starts with a 0-9 digit,
    114   // can contain characters 0-9A-Fa-f (digits),
    115   // Ll (marking the integer is 32-bit), Xx (marking the representation
    116   // is hexadecimal). As some kind of separator should come after the
    117   // integer, we can consume the integer until a non-alphanumeric
    118   // character.
    119   bool canStartInt() const;
    120   bool canContinueInt() const;
    121 
    122   bool canStartString() const;
    123 
    124   // Check if tokenizer can start reading a single line comment (e.g. a comment
    125   // that begins with '//')
    126   bool canStartLineComment() const;
    127 
    128   // Check if tokenizer can start or finish reading a block comment (e.g. a
    129   // comment that begins with '/*' and ends with '*/')
    130   bool canStartBlockComment() const;
    131 
    132   // Throw away all remaining characters on the current line.
    133   void skipCurrentLine();
    134 
    135   bool streamEof() const;
    136 
    137   // Classify the token that is about to be read from the current position.
    138   Kind classifyCurrentToken() const;
    139 
    140   // Process the Kind::Identifier token - check if it is
    141   // an identifier describing a block start or end.
    142   void processIdentifier(RCToken &token) const;
    143 
    144   StringRef Data;
    145   size_t DataLength, Pos;
    146 };
    147 
    148 void Tokenizer::skipCurrentLine() {
    149   Pos = Data.find_first_of("\r\n", Pos);
    150   Pos = Data.find_first_not_of("\r\n", Pos);
    151 
    152   if (Pos == StringRef::npos)
    153     Pos = DataLength;
    154 }
    155 
    156 Expected<std::vector<RCToken>> Tokenizer::run() {
    157   Pos = 0;
    158   std::vector<RCToken> Result;
    159 
    160   // Consume an optional UTF-8 Byte Order Mark.
    161   if (willNowRead("\xef\xbb\xbf"))
    162     advance(3);
    163 
    164   while (!streamEof()) {
    165     if (!skipWhitespaces())
    166       break;
    167 
    168     Kind TokenKind = classifyCurrentToken();
    169     if (TokenKind == Kind::Invalid)
    170       return getStringError("Invalid token found at position " + Twine(Pos));
    171 
    172     const size_t TokenStart = Pos;
    173     if (Error TokenError = consumeToken(TokenKind))
    174       return std::move(TokenError);
    175 
    176     // Comments are just deleted, don't bother saving them.
    177     if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
    178       continue;
    179 
    180     RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    181     if (TokenKind == Kind::Identifier) {
    182       processIdentifier(Token);
    183     } else if (TokenKind == Kind::Int) {
    184       uint32_t TokenInt;
    185       if (!rcGetAsInteger(Token.value(), TokenInt)) {
    186         // The integer has incorrect format or cannot be represented in
    187         // a 32-bit integer.
    188         return getStringError("Integer invalid or too large: " +
    189                               Token.value().str());
    190       }
    191     }
    192 
    193     Result.push_back(Token);
    194   }
    195 
    196   return Result;
    197 }
    198 
// Move the read position forward by Amount characters.
// Returns false once the end of the stream has been reached.
bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}
    203 
    204 bool Tokenizer::skipWhitespaces() {
    205   while (!streamEof() && std::isspace(Data[Pos]))
    206     advance();
    207   return !streamEof();
    208 }
    209 
// Consume the token of kind TokenKind beginning at the current position,
// advancing Pos past its last character. Returns a non-empty Error for
// malformed constructs (unclosed block comments, unterminated strings).
Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
  // The case labels are generated from the token list definition file; all
  // short tokens share the single advance() below.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.def"
    advance();
    return Error::success();

  case Kind::LineComment:
    // Skip the "//" marker, then discard the rest of the line.
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    // Skip the "/*" marker and search for the matching "*/".
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    // Jump to the terminator, then past its two characters.
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    // (classifyCurrentToken guarantees Data[Pos] is '"', 'L' or 'l' here.)
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    // run() never passes Invalid to consumeToken; in NDEBUG builds this
    // falls through to the llvm_unreachable below.
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
    275 
    276 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
    277   return Data.drop_front(Pos).startswith(FollowingChars);
    278 }
    279 
    280 bool Tokenizer::canStartIdentifier() const {
    281   assert(!streamEof());
    282 
    283   const char CurChar = Data[Pos];
    284   return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
    285 }
    286 
    287 bool Tokenizer::canContinueIdentifier() const {
    288   assert(!streamEof());
    289   const char CurChar = Data[Pos];
    290   return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
    291          CurChar == '/' || CurChar == '\\';
    292 }
    293 
    294 bool Tokenizer::canStartInt() const {
    295   assert(!streamEof());
    296   return std::isdigit(Data[Pos]);
    297 }
    298 
    299 bool Tokenizer::canStartBlockComment() const {
    300   assert(!streamEof());
    301   return Data.drop_front(Pos).startswith("/*");
    302 }
    303 
    304 bool Tokenizer::canStartLineComment() const {
    305   assert(!streamEof());
    306   return Data.drop_front(Pos).startswith("//");
    307 }
    308 
    309 bool Tokenizer::canContinueInt() const {
    310   assert(!streamEof());
    311   return std::isalnum(Data[Pos]);
    312 }
    313 
// A string literal starts with '"', optionally preceded by the wide-string
// marker 'L' or 'l'.
bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}
    317 
    318 bool Tokenizer::streamEof() const { return Pos == DataLength; }
    319 
// Decide which token kind begins at the current position, without consuming
// any input. Returns Kind::Invalid if no rule matches.
Kind Tokenizer::classifyCurrentToken() const {
  // Comment openers are checked before everything else.
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  // String must be tested before identifier: the wide-string 'L'/'l' prefix
  // would otherwise classify as an identifier start.
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
  // The case labels are generated from the token list definition file.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}
    348 
    349 void Tokenizer::processIdentifier(RCToken &Token) const {
    350   assert(Token.kind() == Kind::Identifier);
    351   StringRef Name = Token.value();
    352 
    353   if (Name.equals_lower("begin"))
    354     Token = RCToken(Kind::BlockBegin, Name);
    355   else if (Name.equals_lower("end"))
    356     Token = RCToken(Kind::BlockEnd, Name);
    357 }
    358 
    359 } // anonymous namespace
    360 
    361 namespace llvm {
    362 
    363 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
    364   return Tokenizer(Input).run();
    365 }
    366 
    367 } // namespace llvm
    368