Home | History | Annotate | Download | only in Dynamic
      1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Recursive parser implementation for the matcher expression grammar.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include <string>
     16 #include <vector>
     17 
     18 #include "clang/ASTMatchers/Dynamic/Parser.h"
     19 #include "clang/ASTMatchers/Dynamic/Registry.h"
     20 #include "clang/Basic/CharInfo.h"
     21 #include "llvm/ADT/Twine.h"
     22 
     23 namespace clang {
     24 namespace ast_matchers {
     25 namespace dynamic {
     26 
     27 /// \brief Simple structure to hold information for one token from the parser.
     28 struct Parser::TokenInfo {
     29   /// \brief Different possible tokens.
     30   enum TokenKind {
     31     TK_Eof = 0,
     32     TK_OpenParen = 1,
     33     TK_CloseParen = 2,
     34     TK_Comma = 3,
     35     TK_Period = 4,
     36     TK_Literal = 5,
     37     TK_Ident = 6,
     38     TK_InvalidChar = 7,
     39     TK_Error = 8
     40   };
     41 
     42   /// \brief Some known identifiers.
     43   static const char* const ID_Bind;
     44 
     45   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
     46 
     47   StringRef Text;
     48   TokenKind Kind;
     49   SourceRange Range;
     50   VariantValue Value;
     51 };
     52 
     53 const char* const Parser::TokenInfo::ID_Bind = "bind";
     54 
     55 /// \brief Simple tokenizer for the parser.
     56 class Parser::CodeTokenizer {
     57 public:
     58   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
     59       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
     60     NextToken = getNextToken();
     61   }
     62 
     63   /// \brief Returns but doesn't consume the next token.
     64   const TokenInfo &peekNextToken() const { return NextToken; }
     65 
     66   /// \brief Consumes and returns the next token.
     67   TokenInfo consumeNextToken() {
     68     TokenInfo ThisToken = NextToken;
     69     NextToken = getNextToken();
     70     return ThisToken;
     71   }
     72 
     73   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
     74 
     75 private:
     76   TokenInfo getNextToken() {
     77     consumeWhitespace();
     78     TokenInfo Result;
     79     Result.Range.Start = currentLocation();
     80 
     81     if (Code.empty()) {
     82       Result.Kind = TokenInfo::TK_Eof;
     83       Result.Text = "";
     84       return Result;
     85     }
     86 
     87     switch (Code[0]) {
     88     case ',':
     89       Result.Kind = TokenInfo::TK_Comma;
     90       Result.Text = Code.substr(0, 1);
     91       Code = Code.drop_front();
     92       break;
     93     case '.':
     94       Result.Kind = TokenInfo::TK_Period;
     95       Result.Text = Code.substr(0, 1);
     96       Code = Code.drop_front();
     97       break;
     98     case '(':
     99       Result.Kind = TokenInfo::TK_OpenParen;
    100       Result.Text = Code.substr(0, 1);
    101       Code = Code.drop_front();
    102       break;
    103     case ')':
    104       Result.Kind = TokenInfo::TK_CloseParen;
    105       Result.Text = Code.substr(0, 1);
    106       Code = Code.drop_front();
    107       break;
    108 
    109     case '"':
    110     case '\'':
    111       // Parse a string literal.
    112       consumeStringLiteral(&Result);
    113       break;
    114 
    115     case '0': case '1': case '2': case '3': case '4':
    116     case '5': case '6': case '7': case '8': case '9':
    117       // Parse an unsigned literal.
    118       consumeUnsignedLiteral(&Result);
    119       break;
    120 
    121     default:
    122       if (isAlphanumeric(Code[0])) {
    123         // Parse an identifier
    124         size_t TokenLength = 1;
    125         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
    126           ++TokenLength;
    127         Result.Kind = TokenInfo::TK_Ident;
    128         Result.Text = Code.substr(0, TokenLength);
    129         Code = Code.drop_front(TokenLength);
    130       } else {
    131         Result.Kind = TokenInfo::TK_InvalidChar;
    132         Result.Text = Code.substr(0, 1);
    133         Code = Code.drop_front(1);
    134       }
    135       break;
    136     }
    137 
    138     Result.Range.End = currentLocation();
    139     return Result;
    140   }
    141 
    142   /// \brief Consume an unsigned literal.
    143   void consumeUnsignedLiteral(TokenInfo *Result) {
    144     unsigned Length = 1;
    145     if (Code.size() > 1) {
    146       // Consume the 'x' or 'b' radix modifier, if present.
    147       switch (toLowercase(Code[1])) {
    148       case 'x': case 'b': Length = 2;
    149       }
    150     }
    151     while (Length < Code.size() && isHexDigit(Code[Length]))
    152       ++Length;
    153 
    154     Result->Text = Code.substr(0, Length);
    155     Code = Code.drop_front(Length);
    156 
    157     unsigned Value;
    158     if (!Result->Text.getAsInteger(0, Value)) {
    159       Result->Kind = TokenInfo::TK_Literal;
    160       Result->Value = Value;
    161     } else {
    162       SourceRange Range;
    163       Range.Start = Result->Range.Start;
    164       Range.End = currentLocation();
    165       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
    166       Result->Kind = TokenInfo::TK_Error;
    167     }
    168   }
    169 
    170   /// \brief Consume a string literal.
    171   ///
    172   /// \c Code must be positioned at the start of the literal (the opening
    173   /// quote). Consumed until it finds the same closing quote character.
    174   void consumeStringLiteral(TokenInfo *Result) {
    175     bool InEscape = false;
    176     const char Marker = Code[0];
    177     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
    178       if (InEscape) {
    179         InEscape = false;
    180         continue;
    181       }
    182       if (Code[Length] == '\\') {
    183         InEscape = true;
    184         continue;
    185       }
    186       if (Code[Length] == Marker) {
    187         Result->Kind = TokenInfo::TK_Literal;
    188         Result->Text = Code.substr(0, Length + 1);
    189         Result->Value = Code.substr(1, Length - 1).str();
    190         Code = Code.drop_front(Length + 1);
    191         return;
    192       }
    193     }
    194 
    195     StringRef ErrorText = Code;
    196     Code = Code.drop_front(Code.size());
    197     SourceRange Range;
    198     Range.Start = Result->Range.Start;
    199     Range.End = currentLocation();
    200     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
    201     Result->Kind = TokenInfo::TK_Error;
    202   }
    203 
    204   /// \brief Consume all leading whitespace from \c Code.
    205   void consumeWhitespace() {
    206     while (!Code.empty() && isWhitespace(Code[0])) {
    207       if (Code[0] == '\n') {
    208         ++Line;
    209         StartOfLine = Code.drop_front();
    210       }
    211       Code = Code.drop_front();
    212     }
    213   }
    214 
    215   SourceLocation currentLocation() {
    216     SourceLocation Location;
    217     Location.Line = Line;
    218     Location.Column = Code.data() - StartOfLine.data() + 1;
    219     return Location;
    220   }
    221 
    222   StringRef Code;
    223   StringRef StartOfLine;
    224   unsigned Line;
    225   Diagnostics *Error;
    226   TokenInfo NextToken;
    227 };
    228 
    229 Parser::Sema::~Sema() {}
    230 
    231 /// \brief Parse and validate a matcher expression.
    232 /// \return \c true on success, in which case \c Value has the matcher parsed.
    233 ///   If the input is malformed, or some argument has an error, it
    234 ///   returns \c false.
    235 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
    236   const TokenInfo NameToken = Tokenizer->consumeNextToken();
    237   assert(NameToken.Kind == TokenInfo::TK_Ident);
    238   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    239   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    240     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
    241         << OpenToken.Text;
    242     return false;
    243   }
    244 
    245   std::vector<ParserValue> Args;
    246   TokenInfo EndToken;
    247   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
    248     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
    249       // End of args.
    250       EndToken = Tokenizer->consumeNextToken();
    251       break;
    252     }
    253     if (Args.size() > 0) {
    254       // We must find a , token to continue.
    255       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
    256       if (CommaToken.Kind != TokenInfo::TK_Comma) {
    257         Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
    258             << CommaToken.Text;
    259         return false;
    260       }
    261     }
    262 
    263     Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
    264                              NameToken.Text, NameToken.Range, Args.size() + 1);
    265     ParserValue ArgValue;
    266     ArgValue.Text = Tokenizer->peekNextToken().Text;
    267     ArgValue.Range = Tokenizer->peekNextToken().Range;
    268     if (!parseExpressionImpl(&ArgValue.Value)) return false;
    269 
    270     Args.push_back(ArgValue);
    271   }
    272 
    273   if (EndToken.Kind == TokenInfo::TK_Eof) {
    274     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
    275     return false;
    276   }
    277 
    278   std::string BindID;
    279   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
    280     // Parse .bind("foo")
    281     Tokenizer->consumeNextToken();  // consume the period.
    282     const TokenInfo BindToken = Tokenizer->consumeNextToken();
    283     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    284     const TokenInfo IDToken = Tokenizer->consumeNextToken();
    285     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
    286 
    287     // TODO: We could use different error codes for each/some to be more
    288     //       explicit about the syntax error.
    289     if (BindToken.Kind != TokenInfo::TK_Ident ||
    290         BindToken.Text != TokenInfo::ID_Bind) {
    291       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
    292       return false;
    293     }
    294     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    295       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
    296       return false;
    297     }
    298     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
    299       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
    300       return false;
    301     }
    302     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
    303       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
    304       return false;
    305     }
    306     BindID = IDToken.Value.getString();
    307   }
    308 
    309   // Merge the start and end infos.
    310   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
    311                            NameToken.Text, NameToken.Range);
    312   SourceRange MatcherRange = NameToken.Range;
    313   MatcherRange.End = EndToken.Range.End;
    314   MatcherList Result = S->actOnMatcherExpression(
    315       NameToken.Text, MatcherRange, BindID, Args, Error);
    316   if (Result.empty()) return false;
    317 
    318   *Value = Result;
    319   return true;
    320 }
    321 
    322 /// \brief Parse an <Expresssion>
    323 bool Parser::parseExpressionImpl(VariantValue *Value) {
    324   switch (Tokenizer->nextTokenKind()) {
    325   case TokenInfo::TK_Literal:
    326     *Value = Tokenizer->consumeNextToken().Value;
    327     return true;
    328 
    329   case TokenInfo::TK_Ident:
    330     return parseMatcherExpressionImpl(Value);
    331 
    332   case TokenInfo::TK_Eof:
    333     Error->addError(Tokenizer->consumeNextToken().Range,
    334                     Error->ET_ParserNoCode);
    335     return false;
    336 
    337   case TokenInfo::TK_Error:
    338     // This error was already reported by the tokenizer.
    339     return false;
    340 
    341   case TokenInfo::TK_OpenParen:
    342   case TokenInfo::TK_CloseParen:
    343   case TokenInfo::TK_Comma:
    344   case TokenInfo::TK_Period:
    345   case TokenInfo::TK_InvalidChar:
    346     const TokenInfo Token = Tokenizer->consumeNextToken();
    347     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
    348     return false;
    349   }
    350 
    351   llvm_unreachable("Unknown token kind.");
    352 }
    353 
    354 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
    355                Diagnostics *Error)
    356     : Tokenizer(Tokenizer), S(S), Error(Error) {}
    357 
    358 class RegistrySema : public Parser::Sema {
    359 public:
    360   virtual ~RegistrySema() {}
    361   MatcherList actOnMatcherExpression(StringRef MatcherName,
    362                                      const SourceRange &NameRange,
    363                                      StringRef BindID,
    364                                      ArrayRef<ParserValue> Args,
    365                                      Diagnostics *Error) {
    366     if (BindID.empty()) {
    367       return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
    368     } else {
    369       return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
    370                                              Args, Error);
    371     }
    372   }
    373 };
    374 
    375 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
    376                              Diagnostics *Error) {
    377   RegistrySema S;
    378   return parseExpression(Code, &S, Value, Error);
    379 }
    380 
    381 bool Parser::parseExpression(StringRef Code, Sema *S,
    382                              VariantValue *Value, Diagnostics *Error) {
    383   CodeTokenizer Tokenizer(Code, Error);
    384   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
    385   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
    386     Error->addError(Tokenizer.peekNextToken().Range,
    387                     Error->ET_ParserTrailingCode);
    388     return false;
    389   }
    390   return true;
    391 }
    392 
    393 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
    394                                                 Diagnostics *Error) {
    395   RegistrySema S;
    396   return parseMatcherExpression(Code, &S, Error);
    397 }
    398 
    399 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
    400                                                 Parser::Sema *S,
    401                                                 Diagnostics *Error) {
    402   VariantValue Value;
    403   if (!parseExpression(Code, S, &Value, Error))
    404     return NULL;
    405   if (!Value.isMatchers()) {
    406     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
    407     return NULL;
    408   }
    409   if (Value.getMatchers().matchers().size() != 1) {
    410     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
    411         << Value.getTypeAsString();
    412     return NULL;
    413   }
    414   return Value.getMatchers().matchers()[0]->clone();
    415 }
    416 
    417 }  // namespace dynamic
    418 }  // namespace ast_matchers
    419 }  // namespace clang
    420