Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
      6 #define V8_REGEXP_REGEXP_PARSER_H_
      7 
      8 #include "src/objects.h"
      9 #include "src/regexp/regexp-ast.h"
     10 #include "src/zone/zone.h"
     11 
     12 namespace v8 {
     13 namespace internal {
     14 
     15 struct RegExpCompileData;
     16 
     17 
     18 // A BufferedZoneList is an automatically growing list, just like (and backed
     19 // by) a ZoneList, that is optimized for the case of adding and removing
     20 // a single element. The last element added is stored outside the backing list,
     21 // and if no more than one element is ever added, the ZoneList isn't even
     22 // allocated.
     23 // Elements must not be NULL pointers.
     24 template <typename T, int initial_size>
     25 class BufferedZoneList {
     26  public:
     27   BufferedZoneList() : list_(NULL), last_(NULL) {}
     28 
     29   // Adds element at end of list. This element is buffered and can
     30   // be read using last() or removed using RemoveLast until a new Add or until
     31   // RemoveLast or GetList has been called.
     32   void Add(T* value, Zone* zone) {
     33     if (last_ != NULL) {
     34       if (list_ == NULL) {
     35         list_ = new (zone) ZoneList<T*>(initial_size, zone);
     36       }
     37       list_->Add(last_, zone);
     38     }
     39     last_ = value;
     40   }
     41 
     42   T* last() {
     43     DCHECK(last_ != NULL);
     44     return last_;
     45   }
     46 
     47   T* RemoveLast() {
     48     DCHECK(last_ != NULL);
     49     T* result = last_;
     50     if ((list_ != NULL) && (list_->length() > 0))
     51       last_ = list_->RemoveLast();
     52     else
     53       last_ = NULL;
     54     return result;
     55   }
     56 
     57   T* Get(int i) {
     58     DCHECK((0 <= i) && (i < length()));
     59     if (list_ == NULL) {
     60       DCHECK_EQ(0, i);
     61       return last_;
     62     } else {
     63       if (i == list_->length()) {
     64         DCHECK(last_ != NULL);
     65         return last_;
     66       } else {
     67         return list_->at(i);
     68       }
     69     }
     70   }
     71 
     72   void Clear() {
     73     list_ = NULL;
     74     last_ = NULL;
     75   }
     76 
     77   int length() {
     78     int length = (list_ == NULL) ? 0 : list_->length();
     79     return length + ((last_ == NULL) ? 0 : 1);
     80   }
     81 
     82   ZoneList<T*>* GetList(Zone* zone) {
     83     if (list_ == NULL) {
     84       list_ = new (zone) ZoneList<T*>(initial_size, zone);
     85     }
     86     if (last_ != NULL) {
     87       list_->Add(last_, zone);
     88       last_ = NULL;
     89     }
     90     return list_;
     91   }
     92 
     93  private:
     94   ZoneList<T*>* list_;
     95   T* last_;
     96 };
     97 
     98 
     99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
    100 class RegExpBuilder : public ZoneObject {
    101  public:
    102   RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
    103   void AddCharacter(uc16 character);
    104   void AddUnicodeCharacter(uc32 character);
    105   void AddEscapedUnicodeCharacter(uc32 character);
    106   // "Adds" an empty expression. Does nothing except consume a
    107   // following quantifier
    108   void AddEmpty();
    109   void AddCharacterClass(RegExpCharacterClass* cc);
    110   void AddCharacterClassForDesugaring(uc32 c);
    111   void AddAtom(RegExpTree* tree);
    112   void AddTerm(RegExpTree* tree);
    113   void AddAssertion(RegExpTree* tree);
    114   void NewAlternative();  // '|'
    115   bool AddQuantifierToAtom(int min, int max,
    116                            RegExpQuantifier::QuantifierType type);
    117   RegExpTree* ToRegExp();
    118 
    119  private:
    120   static const uc16 kNoPendingSurrogate = 0;
    121   void AddLeadSurrogate(uc16 lead_surrogate);
    122   void AddTrailSurrogate(uc16 trail_surrogate);
    123   void FlushPendingSurrogate();
    124   void FlushCharacters();
    125   void FlushText();
    126   void FlushTerms();
    127   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
    128   bool NeedsDesugaringForIgnoreCase(uc32 c);
    129   Zone* zone() const { return zone_; }
    130   bool ignore_case() const { return ignore_case_; }
    131   bool unicode() const { return unicode_; }
    132 
    133   Zone* zone_;
    134   bool pending_empty_;
    135   bool ignore_case_;
    136   bool unicode_;
    137   ZoneList<uc16>* characters_;
    138   uc16 pending_surrogate_;
    139   BufferedZoneList<RegExpTree, 2> terms_;
    140   BufferedZoneList<RegExpTree, 2> text_;
    141   BufferedZoneList<RegExpTree, 2> alternatives_;
    142 #ifdef DEBUG
    143   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
    144 #define LAST(x) last_added_ = x;
    145 #else
    146 #define LAST(x)
    147 #endif
    148 };
    149 
    150 
    151 class RegExpParser BASE_EMBEDDED {
    152  public:
    153   RegExpParser(FlatStringReader* in, Handle<String>* error,
    154                JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
    155 
    156   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
    157                           JSRegExp::Flags flags, RegExpCompileData* result);
    158 
    159   RegExpTree* ParsePattern();
    160   RegExpTree* ParseDisjunction();
    161   RegExpTree* ParseGroup();
    162   RegExpTree* ParseCharacterClass();
    163 
    164   // Parses a {...,...} quantifier and stores the range in the given
    165   // out parameters.
    166   bool ParseIntervalQuantifier(int* min_out, int* max_out);
    167 
    168   // Parses and returns a single escaped character.  The character
    169   // must not be 'b' or 'B' since they are usually handle specially.
    170   uc32 ParseClassCharacterEscape();
    171 
    172   // Checks whether the following is a length-digit hexadecimal number,
    173   // and sets the value if it is.
    174   bool ParseHexEscape(int length, uc32* value);
    175   bool ParseUnicodeEscape(uc32* value);
    176   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
    177   bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
    178 
    179   uc32 ParseOctalLiteral();
    180 
    181   // Tries to parse the input as a back reference.  If successful it
    182   // stores the result in the output parameter and returns true.  If
    183   // it fails it will push back the characters read so the same characters
    184   // can be reparsed.
    185   bool ParseBackReferenceIndex(int* index_out);
    186 
    187   bool ParseClassProperty(ZoneList<CharacterRange>* result);
    188   CharacterRange ParseClassAtom(uc16* char_class);
    189   RegExpTree* ReportError(Vector<const char> message);
    190   void Advance();
    191   void Advance(int dist);
    192   void Reset(int pos);
    193 
    194   // Reports whether the pattern might be used as a literal search string.
    195   // Only use if the result of the parse is a single atom node.
    196   bool simple();
    197   bool contains_anchor() { return contains_anchor_; }
    198   void set_contains_anchor() { contains_anchor_ = true; }
    199   int captures_started() { return captures_started_; }
    200   int position() { return next_pos_ - 1; }
    201   bool failed() { return failed_; }
    202   bool ignore_case() const { return ignore_case_; }
    203   bool multiline() const { return multiline_; }
    204   bool unicode() const { return unicode_; }
    205 
    206   static bool IsSyntaxCharacterOrSlash(uc32 c);
    207 
    208   static const int kMaxCaptures = 1 << 16;
    209   static const uc32 kEndMarker = (1 << 21);
    210 
    211  private:
    212   enum SubexpressionType {
    213     INITIAL,
    214     CAPTURE,  // All positive values represent captures.
    215     POSITIVE_LOOKAROUND,
    216     NEGATIVE_LOOKAROUND,
    217     GROUPING
    218   };
    219 
    220   class RegExpParserState : public ZoneObject {
    221    public:
    222     RegExpParserState(RegExpParserState* previous_state,
    223                       SubexpressionType group_type,
    224                       RegExpLookaround::Type lookaround_type,
    225                       int disjunction_capture_index,
    226                       const ZoneVector<uc16>* capture_name, bool ignore_case,
    227                       bool unicode, Zone* zone)
    228         : previous_state_(previous_state),
    229           builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
    230           group_type_(group_type),
    231           lookaround_type_(lookaround_type),
    232           disjunction_capture_index_(disjunction_capture_index),
    233           capture_name_(capture_name) {}
    234     // Parser state of containing expression, if any.
    235     RegExpParserState* previous_state() { return previous_state_; }
    236     bool IsSubexpression() { return previous_state_ != NULL; }
    237     // RegExpBuilder building this regexp's AST.
    238     RegExpBuilder* builder() { return builder_; }
    239     // Type of regexp being parsed (parenthesized group or entire regexp).
    240     SubexpressionType group_type() { return group_type_; }
    241     // Lookahead or Lookbehind.
    242     RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
    243     // Index in captures array of first capture in this sub-expression, if any.
    244     // Also the capture index of this sub-expression itself, if group_type
    245     // is CAPTURE.
    246     int capture_index() { return disjunction_capture_index_; }
    247     // The name of the current sub-expression, if group_type is CAPTURE. Only
    248     // used for named captures.
    249     const ZoneVector<uc16>* capture_name() { return capture_name_; }
    250 
    251     bool IsNamedCapture() const { return capture_name_ != nullptr; }
    252 
    253     // Check whether the parser is inside a capture group with the given index.
    254     bool IsInsideCaptureGroup(int index);
    255     // Check whether the parser is inside a capture group with the given name.
    256     bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
    257 
    258    private:
    259     // Linked list implementation of stack of states.
    260     RegExpParserState* previous_state_;
    261     // Builder for the stored disjunction.
    262     RegExpBuilder* builder_;
    263     // Stored disjunction type (capture, look-ahead or grouping), if any.
    264     SubexpressionType group_type_;
    265     // Stored read direction.
    266     RegExpLookaround::Type lookaround_type_;
    267     // Stored disjunction's capture index (if any).
    268     int disjunction_capture_index_;
    269     // Stored capture name (if any).
    270     const ZoneVector<uc16>* capture_name_;
    271   };
    272 
    273   // Return the 1-indexed RegExpCapture object, allocate if necessary.
    274   RegExpCapture* GetCapture(int index);
    275 
    276   // Creates a new named capture at the specified index. Must be called exactly
    277   // once for each named capture. Fails if a capture with the same name is
    278   // encountered.
    279   bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
    280 
    281   // Parses the name of a capture group (?<name>pattern). The name must adhere
    282   // to IdentifierName in the ECMAScript standard.
    283   const ZoneVector<uc16>* ParseCaptureGroupName();
    284 
    285   bool ParseNamedBackReference(RegExpBuilder* builder,
    286                                RegExpParserState* state);
    287 
    288   // After the initial parsing pass, patch corresponding RegExpCapture objects
    289   // into all RegExpBackReferences. This is done after initial parsing in order
    290   // to avoid complicating cases in which references comes before the capture.
    291   void PatchNamedBackReferences();
    292 
    293   Handle<FixedArray> CreateCaptureNameMap();
    294 
    295   Isolate* isolate() { return isolate_; }
    296   Zone* zone() const { return zone_; }
    297 
    298   uc32 current() { return current_; }
    299   bool has_more() { return has_more_; }
    300   bool has_next() { return next_pos_ < in()->length(); }
    301   uc32 Next();
    302   template <bool update_position>
    303   uc32 ReadNext();
    304   FlatStringReader* in() { return in_; }
    305   void ScanForCaptures();
    306 
    307   Isolate* isolate_;
    308   Zone* zone_;
    309   Handle<String>* error_;
    310   ZoneList<RegExpCapture*>* captures_;
    311   ZoneList<RegExpCapture*>* named_captures_;
    312   ZoneList<RegExpBackReference*>* named_back_references_;
    313   FlatStringReader* in_;
    314   uc32 current_;
    315   bool ignore_case_;
    316   bool multiline_;
    317   bool unicode_;
    318   int next_pos_;
    319   int captures_started_;
    320   // The capture count is only valid after we have scanned for captures.
    321   int capture_count_;
    322   bool has_more_;
    323   bool simple_;
    324   bool contains_anchor_;
    325   bool is_scanned_for_captures_;
    326   bool failed_;
    327 };
    328 
    329 }  // namespace internal
    330 }  // namespace v8
    331 
    332 #endif  // V8_REGEXP_REGEXP_PARSER_H_
    333