Home | History | Annotate | Download | only in Dynamic
      1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Recursive parser implementation for the matcher expression grammar.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "clang/ASTMatchers/Dynamic/Parser.h"
     16 #include "clang/ASTMatchers/Dynamic/Registry.h"
     17 #include "clang/Basic/CharInfo.h"
     18 #include "llvm/ADT/Optional.h"
     19 #include "llvm/ADT/Twine.h"
     20 #include "llvm/Support/ManagedStatic.h"
     21 #include <string>
     22 #include <vector>
     23 
     24 namespace clang {
     25 namespace ast_matchers {
     26 namespace dynamic {
     27 
     28 /// \brief Simple structure to hold information for one token from the parser.
     29 struct Parser::TokenInfo {
     30   /// \brief Different possible tokens.
     31   enum TokenKind {
     32     TK_Eof,
     33     TK_OpenParen,
     34     TK_CloseParen,
     35     TK_Comma,
     36     TK_Period,
     37     TK_Literal,
     38     TK_Ident,
     39     TK_InvalidChar,
     40     TK_Error,
     41     TK_CodeCompletion
     42   };
     43 
     44   /// \brief Some known identifiers.
     45   static const char* const ID_Bind;
     46 
     47   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
     48 
     49   StringRef Text;
     50   TokenKind Kind;
     51   SourceRange Range;
     52   VariantValue Value;
     53 };
     54 
     55 const char* const Parser::TokenInfo::ID_Bind = "bind";
     56 
     57 /// \brief Simple tokenizer for the parser.
     58 class Parser::CodeTokenizer {
     59 public:
     60   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
     61       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
     62         CodeCompletionLocation(nullptr) {
     63     NextToken = getNextToken();
     64   }
     65 
     66   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
     67                 unsigned CodeCompletionOffset)
     68       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
     69         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
     70     NextToken = getNextToken();
     71   }
     72 
     73   /// \brief Returns but doesn't consume the next token.
     74   const TokenInfo &peekNextToken() const { return NextToken; }
     75 
     76   /// \brief Consumes and returns the next token.
     77   TokenInfo consumeNextToken() {
     78     TokenInfo ThisToken = NextToken;
     79     NextToken = getNextToken();
     80     return ThisToken;
     81   }
     82 
     83   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
     84 
     85 private:
     86   TokenInfo getNextToken() {
     87     consumeWhitespace();
     88     TokenInfo Result;
     89     Result.Range.Start = currentLocation();
     90 
     91     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
     92       Result.Kind = TokenInfo::TK_CodeCompletion;
     93       Result.Text = StringRef(CodeCompletionLocation, 0);
     94       CodeCompletionLocation = nullptr;
     95       return Result;
     96     }
     97 
     98     if (Code.empty()) {
     99       Result.Kind = TokenInfo::TK_Eof;
    100       Result.Text = "";
    101       return Result;
    102     }
    103 
    104     switch (Code[0]) {
    105     case ',':
    106       Result.Kind = TokenInfo::TK_Comma;
    107       Result.Text = Code.substr(0, 1);
    108       Code = Code.drop_front();
    109       break;
    110     case '.':
    111       Result.Kind = TokenInfo::TK_Period;
    112       Result.Text = Code.substr(0, 1);
    113       Code = Code.drop_front();
    114       break;
    115     case '(':
    116       Result.Kind = TokenInfo::TK_OpenParen;
    117       Result.Text = Code.substr(0, 1);
    118       Code = Code.drop_front();
    119       break;
    120     case ')':
    121       Result.Kind = TokenInfo::TK_CloseParen;
    122       Result.Text = Code.substr(0, 1);
    123       Code = Code.drop_front();
    124       break;
    125 
    126     case '"':
    127     case '\'':
    128       // Parse a string literal.
    129       consumeStringLiteral(&Result);
    130       break;
    131 
    132     case '0': case '1': case '2': case '3': case '4':
    133     case '5': case '6': case '7': case '8': case '9':
    134       // Parse an unsigned literal.
    135       consumeUnsignedLiteral(&Result);
    136       break;
    137 
    138     default:
    139       if (isAlphanumeric(Code[0])) {
    140         // Parse an identifier
    141         size_t TokenLength = 1;
    142         while (1) {
    143           // A code completion location in/immediately after an identifier will
    144           // cause the portion of the identifier before the code completion
    145           // location to become a code completion token.
    146           if (CodeCompletionLocation == Code.data() + TokenLength) {
    147             CodeCompletionLocation = nullptr;
    148             Result.Kind = TokenInfo::TK_CodeCompletion;
    149             Result.Text = Code.substr(0, TokenLength);
    150             Code = Code.drop_front(TokenLength);
    151             return Result;
    152           }
    153           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
    154             break;
    155           ++TokenLength;
    156         }
    157         Result.Kind = TokenInfo::TK_Ident;
    158         Result.Text = Code.substr(0, TokenLength);
    159         Code = Code.drop_front(TokenLength);
    160       } else {
    161         Result.Kind = TokenInfo::TK_InvalidChar;
    162         Result.Text = Code.substr(0, 1);
    163         Code = Code.drop_front(1);
    164       }
    165       break;
    166     }
    167 
    168     Result.Range.End = currentLocation();
    169     return Result;
    170   }
    171 
    172   /// \brief Consume an unsigned literal.
    173   void consumeUnsignedLiteral(TokenInfo *Result) {
    174     unsigned Length = 1;
    175     if (Code.size() > 1) {
    176       // Consume the 'x' or 'b' radix modifier, if present.
    177       switch (toLowercase(Code[1])) {
    178       case 'x': case 'b': Length = 2;
    179       }
    180     }
    181     while (Length < Code.size() && isHexDigit(Code[Length]))
    182       ++Length;
    183 
    184     Result->Text = Code.substr(0, Length);
    185     Code = Code.drop_front(Length);
    186 
    187     unsigned Value;
    188     if (!Result->Text.getAsInteger(0, Value)) {
    189       Result->Kind = TokenInfo::TK_Literal;
    190       Result->Value = Value;
    191     } else {
    192       SourceRange Range;
    193       Range.Start = Result->Range.Start;
    194       Range.End = currentLocation();
    195       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
    196       Result->Kind = TokenInfo::TK_Error;
    197     }
    198   }
    199 
    200   /// \brief Consume a string literal.
    201   ///
    202   /// \c Code must be positioned at the start of the literal (the opening
    203   /// quote). Consumed until it finds the same closing quote character.
    204   void consumeStringLiteral(TokenInfo *Result) {
    205     bool InEscape = false;
    206     const char Marker = Code[0];
    207     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
    208       if (InEscape) {
    209         InEscape = false;
    210         continue;
    211       }
    212       if (Code[Length] == '\\') {
    213         InEscape = true;
    214         continue;
    215       }
    216       if (Code[Length] == Marker) {
    217         Result->Kind = TokenInfo::TK_Literal;
    218         Result->Text = Code.substr(0, Length + 1);
    219         Result->Value = Code.substr(1, Length - 1);
    220         Code = Code.drop_front(Length + 1);
    221         return;
    222       }
    223     }
    224 
    225     StringRef ErrorText = Code;
    226     Code = Code.drop_front(Code.size());
    227     SourceRange Range;
    228     Range.Start = Result->Range.Start;
    229     Range.End = currentLocation();
    230     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
    231     Result->Kind = TokenInfo::TK_Error;
    232   }
    233 
    234   /// \brief Consume all leading whitespace from \c Code.
    235   void consumeWhitespace() {
    236     while (!Code.empty() && isWhitespace(Code[0])) {
    237       if (Code[0] == '\n') {
    238         ++Line;
    239         StartOfLine = Code.drop_front();
    240       }
    241       Code = Code.drop_front();
    242     }
    243   }
    244 
    245   SourceLocation currentLocation() {
    246     SourceLocation Location;
    247     Location.Line = Line;
    248     Location.Column = Code.data() - StartOfLine.data() + 1;
    249     return Location;
    250   }
    251 
    252   StringRef Code;
    253   StringRef StartOfLine;
    254   unsigned Line;
    255   Diagnostics *Error;
    256   TokenInfo NextToken;
    257   const char *CodeCompletionLocation;
    258 };
    259 
    260 Parser::Sema::~Sema() {}
    261 
    262 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
    263     llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
    264   return std::vector<ArgKind>();
    265 }
    266 
    267 std::vector<MatcherCompletion>
    268 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
    269   return std::vector<MatcherCompletion>();
    270 }
    271 
    272 struct Parser::ScopedContextEntry {
    273   Parser *P;
    274 
    275   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
    276     P->ContextStack.push_back(std::make_pair(C, 0u));
    277   }
    278 
    279   ~ScopedContextEntry() {
    280     P->ContextStack.pop_back();
    281   }
    282 
    283   void nextArg() {
    284     ++P->ContextStack.back().second;
    285   }
    286 };
    287 
    288 /// \brief Parse expressions that start with an identifier.
    289 ///
    290 /// This function can parse named values and matchers.
    291 /// In case of failure it will try to determine the user's intent to give
    292 /// an appropriate error message.
    293 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
    294   const TokenInfo NameToken = Tokenizer->consumeNextToken();
    295 
    296   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
    297     // Parse as a named value.
    298     if (const VariantValue NamedValue =
    299             NamedValues ? NamedValues->lookup(NameToken.Text)
    300                         : VariantValue()) {
    301       *Value = NamedValue;
    302       return true;
    303     }
    304     // If the syntax is correct and the name is not a matcher either, report
    305     // unknown named value.
    306     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
    307          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
    308          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
    309         !S->lookupMatcherCtor(NameToken.Text)) {
    310       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
    311           << NameToken.Text;
    312       return false;
    313     }
    314     // Otherwise, fallback to the matcher parser.
    315   }
    316 
    317   // Parse as a matcher expression.
    318   return parseMatcherExpressionImpl(NameToken, Value);
    319 }
    320 
    321 /// \brief Parse and validate a matcher expression.
    322 /// \return \c true on success, in which case \c Value has the matcher parsed.
    323 ///   If the input is malformed, or some argument has an error, it
    324 ///   returns \c false.
    325 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
    326                                         VariantValue *Value) {
    327   assert(NameToken.Kind == TokenInfo::TK_Ident);
    328   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    329   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    330     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
    331         << OpenToken.Text;
    332     return false;
    333   }
    334 
    335   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
    336 
    337   if (!Ctor) {
    338     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
    339         << NameToken.Text;
    340     // Do not return here. We need to continue to give completion suggestions.
    341   }
    342 
    343   std::vector<ParserValue> Args;
    344   TokenInfo EndToken;
    345 
    346   {
    347     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
    348 
    349     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
    350       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
    351         // End of args.
    352         EndToken = Tokenizer->consumeNextToken();
    353         break;
    354       }
    355       if (Args.size() > 0) {
    356         // We must find a , token to continue.
    357         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
    358         if (CommaToken.Kind != TokenInfo::TK_Comma) {
    359           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
    360               << CommaToken.Text;
    361           return false;
    362         }
    363       }
    364 
    365       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
    366                                NameToken.Text, NameToken.Range,
    367                                Args.size() + 1);
    368       ParserValue ArgValue;
    369       ArgValue.Text = Tokenizer->peekNextToken().Text;
    370       ArgValue.Range = Tokenizer->peekNextToken().Range;
    371       if (!parseExpressionImpl(&ArgValue.Value)) {
    372         return false;
    373       }
    374 
    375       Args.push_back(ArgValue);
    376       SCE.nextArg();
    377     }
    378   }
    379 
    380   if (EndToken.Kind == TokenInfo::TK_Eof) {
    381     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
    382     return false;
    383   }
    384 
    385   std::string BindID;
    386   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
    387     // Parse .bind("foo")
    388     Tokenizer->consumeNextToken();  // consume the period.
    389     const TokenInfo BindToken = Tokenizer->consumeNextToken();
    390     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
    391       addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
    392       return false;
    393     }
    394 
    395     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    396     const TokenInfo IDToken = Tokenizer->consumeNextToken();
    397     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
    398 
    399     // TODO: We could use different error codes for each/some to be more
    400     //       explicit about the syntax error.
    401     if (BindToken.Kind != TokenInfo::TK_Ident ||
    402         BindToken.Text != TokenInfo::ID_Bind) {
    403       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
    404       return false;
    405     }
    406     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    407       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
    408       return false;
    409     }
    410     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
    411       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
    412       return false;
    413     }
    414     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
    415       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
    416       return false;
    417     }
    418     BindID = IDToken.Value.getString();
    419   }
    420 
    421   if (!Ctor)
    422     return false;
    423 
    424   // Merge the start and end infos.
    425   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
    426                            NameToken.Text, NameToken.Range);
    427   SourceRange MatcherRange = NameToken.Range;
    428   MatcherRange.End = EndToken.Range.End;
    429   VariantMatcher Result = S->actOnMatcherExpression(
    430       *Ctor, MatcherRange, BindID, Args, Error);
    431   if (Result.isNull()) return false;
    432 
    433   *Value = Result;
    434   return true;
    435 }
    436 
    437 // If the prefix of this completion matches the completion token, add it to
    438 // Completions minus the prefix.
    439 void Parser::addCompletion(const TokenInfo &CompToken,
    440                            const MatcherCompletion& Completion) {
    441   if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
    442       Completion.Specificity > 0) {
    443     Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
    444                              Completion.MatcherDecl, Completion.Specificity);
    445   }
    446 }
    447 
    448 std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
    449     ArrayRef<ArgKind> AcceptedTypes) {
    450   if (!NamedValues) return std::vector<MatcherCompletion>();
    451   std::vector<MatcherCompletion> Result;
    452   for (const auto &Entry : *NamedValues) {
    453     unsigned Specificity;
    454     if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
    455       std::string Decl =
    456           (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
    457       Result.emplace_back(Entry.getKey(), Decl, Specificity);
    458     }
    459   }
    460   return Result;
    461 }
    462 
    463 void Parser::addExpressionCompletions() {
    464   const TokenInfo CompToken = Tokenizer->consumeNextToken();
    465   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
    466 
    467   // We cannot complete code if there is an invalid element on the context
    468   // stack.
    469   for (ContextStackTy::iterator I = ContextStack.begin(),
    470                                 E = ContextStack.end();
    471        I != E; ++I) {
    472     if (!I->first)
    473       return;
    474   }
    475 
    476   auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
    477   for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
    478     addCompletion(CompToken, Completion);
    479   }
    480 
    481   for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
    482     addCompletion(CompToken, Completion);
    483   }
    484 }
    485 
    486 /// \brief Parse an <Expresssion>
    487 bool Parser::parseExpressionImpl(VariantValue *Value) {
    488   switch (Tokenizer->nextTokenKind()) {
    489   case TokenInfo::TK_Literal:
    490     *Value = Tokenizer->consumeNextToken().Value;
    491     return true;
    492 
    493   case TokenInfo::TK_Ident:
    494     return parseIdentifierPrefixImpl(Value);
    495 
    496   case TokenInfo::TK_CodeCompletion:
    497     addExpressionCompletions();
    498     return false;
    499 
    500   case TokenInfo::TK_Eof:
    501     Error->addError(Tokenizer->consumeNextToken().Range,
    502                     Error->ET_ParserNoCode);
    503     return false;
    504 
    505   case TokenInfo::TK_Error:
    506     // This error was already reported by the tokenizer.
    507     return false;
    508 
    509   case TokenInfo::TK_OpenParen:
    510   case TokenInfo::TK_CloseParen:
    511   case TokenInfo::TK_Comma:
    512   case TokenInfo::TK_Period:
    513   case TokenInfo::TK_InvalidChar:
    514     const TokenInfo Token = Tokenizer->consumeNextToken();
    515     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
    516     return false;
    517   }
    518 
    519   llvm_unreachable("Unknown token kind.");
    520 }
    521 
    522 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
    523 
    524 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
    525                const NamedValueMap *NamedValues, Diagnostics *Error)
    526     : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
    527       NamedValues(NamedValues), Error(Error) {}
    528 
    529 Parser::RegistrySema::~RegistrySema() {}
    530 
    531 llvm::Optional<MatcherCtor>
    532 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
    533   return Registry::lookupMatcherCtor(MatcherName);
    534 }
    535 
    536 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
    537     MatcherCtor Ctor, SourceRange NameRange, StringRef BindID,
    538     ArrayRef<ParserValue> Args, Diagnostics *Error) {
    539   if (BindID.empty()) {
    540     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
    541   } else {
    542     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
    543                                            Error);
    544   }
    545 }
    546 
    547 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
    548     ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
    549   return Registry::getAcceptedCompletionTypes(Context);
    550 }
    551 
    552 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
    553     ArrayRef<ArgKind> AcceptedTypes) {
    554   return Registry::getMatcherCompletions(AcceptedTypes);
    555 }
    556 
    557 bool Parser::parseExpression(StringRef Code, Sema *S,
    558                              const NamedValueMap *NamedValues,
    559                              VariantValue *Value, Diagnostics *Error) {
    560   CodeTokenizer Tokenizer(Code, Error);
    561   if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
    562     return false;
    563   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
    564     Error->addError(Tokenizer.peekNextToken().Range,
    565                     Error->ET_ParserTrailingCode);
    566     return false;
    567   }
    568   return true;
    569 }
    570 
    571 std::vector<MatcherCompletion>
    572 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
    573                            const NamedValueMap *NamedValues) {
    574   Diagnostics Error;
    575   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
    576   Parser P(&Tokenizer, S, NamedValues, &Error);
    577   VariantValue Dummy;
    578   P.parseExpressionImpl(&Dummy);
    579 
    580   // Sort by specificity, then by name.
    581   std::sort(P.Completions.begin(), P.Completions.end(),
    582             [](const MatcherCompletion &A, const MatcherCompletion &B) {
    583     if (A.Specificity != B.Specificity)
    584       return A.Specificity > B.Specificity;
    585     return A.TypedText < B.TypedText;
    586   });
    587 
    588   return P.Completions;
    589 }
    590 
    591 llvm::Optional<DynTypedMatcher>
    592 Parser::parseMatcherExpression(StringRef Code, Sema *S,
    593                                const NamedValueMap *NamedValues,
    594                                Diagnostics *Error) {
    595   VariantValue Value;
    596   if (!parseExpression(Code, S, NamedValues, &Value, Error))
    597     return llvm::Optional<DynTypedMatcher>();
    598   if (!Value.isMatcher()) {
    599     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
    600     return llvm::Optional<DynTypedMatcher>();
    601   }
    602   llvm::Optional<DynTypedMatcher> Result =
    603       Value.getMatcher().getSingleMatcher();
    604   if (!Result.hasValue()) {
    605     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
    606         << Value.getTypeAsString();
    607   }
    608   return Result;
    609 }
    610 
    611 }  // namespace dynamic
    612 }  // namespace ast_matchers
    613 }  // namespace clang
    614