Home | History | Annotate | Download | only in Dynamic
      1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Recursive parser implementation for the matcher expression grammar.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "clang/ASTMatchers/Dynamic/Parser.h"
     16 #include "clang/ASTMatchers/Dynamic/Registry.h"
     17 #include "clang/Basic/CharInfo.h"
     18 #include "llvm/ADT/Optional.h"
     19 #include "llvm/ADT/Twine.h"
     20 #include <string>
     21 #include <vector>
     22 
     23 namespace clang {
     24 namespace ast_matchers {
     25 namespace dynamic {
     26 
     27 /// \brief Simple structure to hold information for one token from the parser.
     28 struct Parser::TokenInfo {
     29   /// \brief Different possible tokens.
     30   enum TokenKind {
     31     TK_Eof,
     32     TK_OpenParen,
     33     TK_CloseParen,
     34     TK_Comma,
     35     TK_Period,
     36     TK_Literal,
     37     TK_Ident,
     38     TK_InvalidChar,
     39     TK_Error,
     40     TK_CodeCompletion
     41   };
     42 
     43   /// \brief Some known identifiers.
     44   static const char* const ID_Bind;
     45 
     46   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
     47 
     48   StringRef Text;
     49   TokenKind Kind;
     50   SourceRange Range;
     51   VariantValue Value;
     52 };
     53 
     54 const char* const Parser::TokenInfo::ID_Bind = "bind";
     55 
     56 /// \brief Simple tokenizer for the parser.
     57 class Parser::CodeTokenizer {
     58 public:
     59   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
     60       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
     61         CodeCompletionLocation(nullptr) {
     62     NextToken = getNextToken();
     63   }
     64 
     65   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
     66                 unsigned CodeCompletionOffset)
     67       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
     68         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
     69     NextToken = getNextToken();
     70   }
     71 
     72   /// \brief Returns but doesn't consume the next token.
     73   const TokenInfo &peekNextToken() const { return NextToken; }
     74 
     75   /// \brief Consumes and returns the next token.
     76   TokenInfo consumeNextToken() {
     77     TokenInfo ThisToken = NextToken;
     78     NextToken = getNextToken();
     79     return ThisToken;
     80   }
     81 
     82   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
     83 
     84 private:
     85   TokenInfo getNextToken() {
     86     consumeWhitespace();
     87     TokenInfo Result;
     88     Result.Range.Start = currentLocation();
     89 
     90     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
     91       Result.Kind = TokenInfo::TK_CodeCompletion;
     92       Result.Text = StringRef(CodeCompletionLocation, 0);
     93       CodeCompletionLocation = nullptr;
     94       return Result;
     95     }
     96 
     97     if (Code.empty()) {
     98       Result.Kind = TokenInfo::TK_Eof;
     99       Result.Text = "";
    100       return Result;
    101     }
    102 
    103     switch (Code[0]) {
    104     case ',':
    105       Result.Kind = TokenInfo::TK_Comma;
    106       Result.Text = Code.substr(0, 1);
    107       Code = Code.drop_front();
    108       break;
    109     case '.':
    110       Result.Kind = TokenInfo::TK_Period;
    111       Result.Text = Code.substr(0, 1);
    112       Code = Code.drop_front();
    113       break;
    114     case '(':
    115       Result.Kind = TokenInfo::TK_OpenParen;
    116       Result.Text = Code.substr(0, 1);
    117       Code = Code.drop_front();
    118       break;
    119     case ')':
    120       Result.Kind = TokenInfo::TK_CloseParen;
    121       Result.Text = Code.substr(0, 1);
    122       Code = Code.drop_front();
    123       break;
    124 
    125     case '"':
    126     case '\'':
    127       // Parse a string literal.
    128       consumeStringLiteral(&Result);
    129       break;
    130 
    131     case '0': case '1': case '2': case '3': case '4':
    132     case '5': case '6': case '7': case '8': case '9':
    133       // Parse an unsigned literal.
    134       consumeUnsignedLiteral(&Result);
    135       break;
    136 
    137     default:
    138       if (isAlphanumeric(Code[0])) {
    139         // Parse an identifier
    140         size_t TokenLength = 1;
    141         while (1) {
    142           // A code completion location in/immediately after an identifier will
    143           // cause the portion of the identifier before the code completion
    144           // location to become a code completion token.
    145           if (CodeCompletionLocation == Code.data() + TokenLength) {
    146             CodeCompletionLocation = nullptr;
    147             Result.Kind = TokenInfo::TK_CodeCompletion;
    148             Result.Text = Code.substr(0, TokenLength);
    149             Code = Code.drop_front(TokenLength);
    150             return Result;
    151           }
    152           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
    153             break;
    154           ++TokenLength;
    155         }
    156         Result.Kind = TokenInfo::TK_Ident;
    157         Result.Text = Code.substr(0, TokenLength);
    158         Code = Code.drop_front(TokenLength);
    159       } else {
    160         Result.Kind = TokenInfo::TK_InvalidChar;
    161         Result.Text = Code.substr(0, 1);
    162         Code = Code.drop_front(1);
    163       }
    164       break;
    165     }
    166 
    167     Result.Range.End = currentLocation();
    168     return Result;
    169   }
    170 
    171   /// \brief Consume an unsigned literal.
    172   void consumeUnsignedLiteral(TokenInfo *Result) {
    173     unsigned Length = 1;
    174     if (Code.size() > 1) {
    175       // Consume the 'x' or 'b' radix modifier, if present.
    176       switch (toLowercase(Code[1])) {
    177       case 'x': case 'b': Length = 2;
    178       }
    179     }
    180     while (Length < Code.size() && isHexDigit(Code[Length]))
    181       ++Length;
    182 
    183     Result->Text = Code.substr(0, Length);
    184     Code = Code.drop_front(Length);
    185 
    186     unsigned Value;
    187     if (!Result->Text.getAsInteger(0, Value)) {
    188       Result->Kind = TokenInfo::TK_Literal;
    189       Result->Value = Value;
    190     } else {
    191       SourceRange Range;
    192       Range.Start = Result->Range.Start;
    193       Range.End = currentLocation();
    194       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
    195       Result->Kind = TokenInfo::TK_Error;
    196     }
    197   }
    198 
    199   /// \brief Consume a string literal.
    200   ///
    201   /// \c Code must be positioned at the start of the literal (the opening
    202   /// quote). Consumed until it finds the same closing quote character.
    203   void consumeStringLiteral(TokenInfo *Result) {
    204     bool InEscape = false;
    205     const char Marker = Code[0];
    206     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
    207       if (InEscape) {
    208         InEscape = false;
    209         continue;
    210       }
    211       if (Code[Length] == '\\') {
    212         InEscape = true;
    213         continue;
    214       }
    215       if (Code[Length] == Marker) {
    216         Result->Kind = TokenInfo::TK_Literal;
    217         Result->Text = Code.substr(0, Length + 1);
    218         Result->Value = Code.substr(1, Length - 1).str();
    219         Code = Code.drop_front(Length + 1);
    220         return;
    221       }
    222     }
    223 
    224     StringRef ErrorText = Code;
    225     Code = Code.drop_front(Code.size());
    226     SourceRange Range;
    227     Range.Start = Result->Range.Start;
    228     Range.End = currentLocation();
    229     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
    230     Result->Kind = TokenInfo::TK_Error;
    231   }
    232 
    233   /// \brief Consume all leading whitespace from \c Code.
    234   void consumeWhitespace() {
    235     while (!Code.empty() && isWhitespace(Code[0])) {
    236       if (Code[0] == '\n') {
    237         ++Line;
    238         StartOfLine = Code.drop_front();
    239       }
    240       Code = Code.drop_front();
    241     }
    242   }
    243 
    244   SourceLocation currentLocation() {
    245     SourceLocation Location;
    246     Location.Line = Line;
    247     Location.Column = Code.data() - StartOfLine.data() + 1;
    248     return Location;
    249   }
    250 
    251   StringRef Code;
    252   StringRef StartOfLine;
    253   unsigned Line;
    254   Diagnostics *Error;
    255   TokenInfo NextToken;
    256   const char *CodeCompletionLocation;
    257 };
    258 
    259 Parser::Sema::~Sema() {}
    260 
    261 VariantValue Parser::Sema::getNamedValue(StringRef Name) {
    262   return VariantValue();
    263 }
    264 
    265 struct Parser::ScopedContextEntry {
    266   Parser *P;
    267 
    268   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
    269     P->ContextStack.push_back(std::make_pair(C, 0u));
    270   }
    271 
    272   ~ScopedContextEntry() {
    273     P->ContextStack.pop_back();
    274   }
    275 
    276   void nextArg() {
    277     ++P->ContextStack.back().second;
    278   }
    279 };
    280 
    281 /// \brief Parse expressions that start with an identifier.
    282 ///
    283 /// This function can parse named values and matchers.
    284 /// In case of failure it will try to determine the user's intent to give
    285 /// an appropriate error message.
    286 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
    287   const TokenInfo NameToken = Tokenizer->consumeNextToken();
    288 
    289   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
    290     // Parse as a named value.
    291     if (const VariantValue NamedValue = S->getNamedValue(NameToken.Text)) {
    292       *Value = NamedValue;
    293       return true;
    294     }
    295     // If the syntax is correct and the name is not a matcher either, report
    296     // unknown named value.
    297     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
    298          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
    299          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
    300         !S->lookupMatcherCtor(NameToken.Text)) {
    301       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
    302           << NameToken.Text;
    303       return false;
    304     }
    305     // Otherwise, fallback to the matcher parser.
    306   }
    307 
    308   // Parse as a matcher expression.
    309   return parseMatcherExpressionImpl(NameToken, Value);
    310 }
    311 
    312 /// \brief Parse and validate a matcher expression.
    313 /// \return \c true on success, in which case \c Value has the matcher parsed.
    314 ///   If the input is malformed, or some argument has an error, it
    315 ///   returns \c false.
    316 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
    317                                         VariantValue *Value) {
    318   assert(NameToken.Kind == TokenInfo::TK_Ident);
    319   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    320   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    321     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
    322         << OpenToken.Text;
    323     return false;
    324   }
    325 
    326   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
    327 
    328   if (!Ctor) {
    329     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
    330         << NameToken.Text;
    331     // Do not return here. We need to continue to give completion suggestions.
    332   }
    333 
    334   std::vector<ParserValue> Args;
    335   TokenInfo EndToken;
    336 
    337   {
    338     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
    339 
    340     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
    341       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
    342         // End of args.
    343         EndToken = Tokenizer->consumeNextToken();
    344         break;
    345       }
    346       if (Args.size() > 0) {
    347         // We must find a , token to continue.
    348         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
    349         if (CommaToken.Kind != TokenInfo::TK_Comma) {
    350           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
    351               << CommaToken.Text;
    352           return false;
    353         }
    354       }
    355 
    356       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
    357                                NameToken.Text, NameToken.Range,
    358                                Args.size() + 1);
    359       ParserValue ArgValue;
    360       ArgValue.Text = Tokenizer->peekNextToken().Text;
    361       ArgValue.Range = Tokenizer->peekNextToken().Range;
    362       if (!parseExpressionImpl(&ArgValue.Value)) {
    363         return false;
    364       }
    365 
    366       Args.push_back(ArgValue);
    367       SCE.nextArg();
    368     }
    369   }
    370 
    371   if (EndToken.Kind == TokenInfo::TK_Eof) {
    372     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
    373     return false;
    374   }
    375 
    376   std::string BindID;
    377   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
    378     // Parse .bind("foo")
    379     Tokenizer->consumeNextToken();  // consume the period.
    380     const TokenInfo BindToken = Tokenizer->consumeNextToken();
    381     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
    382       addCompletion(BindToken, "bind(\"", "bind");
    383       return false;
    384     }
    385 
    386     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    387     const TokenInfo IDToken = Tokenizer->consumeNextToken();
    388     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
    389 
    390     // TODO: We could use different error codes for each/some to be more
    391     //       explicit about the syntax error.
    392     if (BindToken.Kind != TokenInfo::TK_Ident ||
    393         BindToken.Text != TokenInfo::ID_Bind) {
    394       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
    395       return false;
    396     }
    397     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    398       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
    399       return false;
    400     }
    401     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
    402       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
    403       return false;
    404     }
    405     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
    406       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
    407       return false;
    408     }
    409     BindID = IDToken.Value.getString();
    410   }
    411 
    412   if (!Ctor)
    413     return false;
    414 
    415   // Merge the start and end infos.
    416   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
    417                            NameToken.Text, NameToken.Range);
    418   SourceRange MatcherRange = NameToken.Range;
    419   MatcherRange.End = EndToken.Range.End;
    420   VariantMatcher Result = S->actOnMatcherExpression(
    421       *Ctor, MatcherRange, BindID, Args, Error);
    422   if (Result.isNull()) return false;
    423 
    424   *Value = Result;
    425   return true;
    426 }
    427 
    428 // If the prefix of this completion matches the completion token, add it to
    429 // Completions minus the prefix.
    430 void Parser::addCompletion(const TokenInfo &CompToken, StringRef TypedText,
    431                            StringRef Decl) {
    432   if (TypedText.size() >= CompToken.Text.size() &&
    433       TypedText.substr(0, CompToken.Text.size()) == CompToken.Text) {
    434     Completions.push_back(
    435         MatcherCompletion(TypedText.substr(CompToken.Text.size()), Decl));
    436   }
    437 }
    438 
    439 void Parser::addExpressionCompletions() {
    440   const TokenInfo CompToken = Tokenizer->consumeNextToken();
    441   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
    442 
    443   // We cannot complete code if there is an invalid element on the context
    444   // stack.
    445   for (ContextStackTy::iterator I = ContextStack.begin(),
    446                                 E = ContextStack.end();
    447        I != E; ++I) {
    448     if (!I->first)
    449       return;
    450   }
    451 
    452   std::vector<MatcherCompletion> RegCompletions =
    453       Registry::getCompletions(ContextStack);
    454   for (std::vector<MatcherCompletion>::iterator I = RegCompletions.begin(),
    455                                                 E = RegCompletions.end();
    456        I != E; ++I) {
    457     addCompletion(CompToken, I->TypedText, I->MatcherDecl);
    458   }
    459 }
    460 
    461 /// \brief Parse an <Expresssion>
    462 bool Parser::parseExpressionImpl(VariantValue *Value) {
    463   switch (Tokenizer->nextTokenKind()) {
    464   case TokenInfo::TK_Literal:
    465     *Value = Tokenizer->consumeNextToken().Value;
    466     return true;
    467 
    468   case TokenInfo::TK_Ident:
    469     return parseIdentifierPrefixImpl(Value);
    470 
    471   case TokenInfo::TK_CodeCompletion:
    472     addExpressionCompletions();
    473     return false;
    474 
    475   case TokenInfo::TK_Eof:
    476     Error->addError(Tokenizer->consumeNextToken().Range,
    477                     Error->ET_ParserNoCode);
    478     return false;
    479 
    480   case TokenInfo::TK_Error:
    481     // This error was already reported by the tokenizer.
    482     return false;
    483 
    484   case TokenInfo::TK_OpenParen:
    485   case TokenInfo::TK_CloseParen:
    486   case TokenInfo::TK_Comma:
    487   case TokenInfo::TK_Period:
    488   case TokenInfo::TK_InvalidChar:
    489     const TokenInfo Token = Tokenizer->consumeNextToken();
    490     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
    491     return false;
    492   }
    493 
    494   llvm_unreachable("Unknown token kind.");
    495 }
    496 
    497 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
    498                Diagnostics *Error)
    499     : Tokenizer(Tokenizer), S(S), Error(Error) {}
    500 
    501 Parser::RegistrySema::~RegistrySema() {}
    502 
    503 llvm::Optional<MatcherCtor>
    504 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
    505   return Registry::lookupMatcherCtor(MatcherName);
    506 }
    507 
    508 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
    509     MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID,
    510     ArrayRef<ParserValue> Args, Diagnostics *Error) {
    511   if (BindID.empty()) {
    512     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
    513   } else {
    514     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
    515                                            Error);
    516   }
    517 }
    518 
    519 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
    520                              Diagnostics *Error) {
    521   RegistrySema S;
    522   return parseExpression(Code, &S, Value, Error);
    523 }
    524 
    525 bool Parser::parseExpression(StringRef Code, Sema *S,
    526                              VariantValue *Value, Diagnostics *Error) {
    527   CodeTokenizer Tokenizer(Code, Error);
    528   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
    529   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
    530     Error->addError(Tokenizer.peekNextToken().Range,
    531                     Error->ET_ParserTrailingCode);
    532     return false;
    533   }
    534   return true;
    535 }
    536 
    537 std::vector<MatcherCompletion>
    538 Parser::completeExpression(StringRef Code, unsigned CompletionOffset) {
    539   Diagnostics Error;
    540   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
    541   RegistrySema S;
    542   Parser P(&Tokenizer, &S, &Error);
    543   VariantValue Dummy;
    544   P.parseExpressionImpl(&Dummy);
    545 
    546   return P.Completions;
    547 }
    548 
    549 llvm::Optional<DynTypedMatcher>
    550 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
    551   RegistrySema S;
    552   return parseMatcherExpression(Code, &S, Error);
    553 }
    554 
    555 llvm::Optional<DynTypedMatcher>
    556 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
    557                                Diagnostics *Error) {
    558   VariantValue Value;
    559   if (!parseExpression(Code, S, &Value, Error))
    560     return llvm::Optional<DynTypedMatcher>();
    561   if (!Value.isMatcher()) {
    562     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
    563     return llvm::Optional<DynTypedMatcher>();
    564   }
    565   llvm::Optional<DynTypedMatcher> Result =
    566       Value.getMatcher().getSingleMatcher();
    567   if (!Result.hasValue()) {
    568     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
    569         << Value.getTypeAsString();
    570   }
    571   return Result;
    572 }
    573 
    574 }  // namespace dynamic
    575 }  // namespace ast_matchers
    576 }  // namespace clang
    577