Home | History | Annotate | Download | only in re2
      1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Compiled representation of regular expressions.
      6 // See regexp.h for the Regexp class, which represents a regular
      7 // expression symbolically.
      8 
      9 #ifndef RE2_PROG_H__
     10 #define RE2_PROG_H__
     11 
     12 #include "util/util.h"
     13 #include "re2/re2.h"
     14 
     15 namespace re2 {
     16 
     17 // Simple fixed-size bitmap.
     18 template<int Bits>
     19 class Bitmap {
     20  public:
     21   Bitmap() { Reset(); }
     22   int Size() { return Bits; }
     23 
     24   void Reset() {
     25     for (int i = 0; i < Words; i++)
     26       w_[i] = 0;
     27   }
     28   bool Get(int k) const {
     29     return w_[k >> WordLog] & (1<<(k & 31));
     30   }
     31   void Set(int k) {
     32     w_[k >> WordLog] |= 1<<(k & 31);
     33   }
     34   void Clear(int k) {
     35     w_[k >> WordLog] &= ~(1<<(k & 31));
     36   }
     37   uint32 Word(int i) const {
     38     return w_[i];
     39   }
     40 
     41  private:
     42   static const int WordLog = 5;
     43   static const int Words = (Bits+31)/32;
     44   uint32 w_[Words];
     45   DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
     46 };
     47 
     48 
     49 // Opcodes for Inst
     50 enum InstOp {
     51   kInstAlt = 0,      // choose between out_ and out1_
     52   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
     53   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
     54   kInstCapture,      // capturing parenthesis number cap_
     55   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
     56   kInstMatch,        // found a match!
     57   kInstNop,          // no-op; occasionally unavoidable
     58   kInstFail,         // never match; occasionally unavoidable
     59 };
     60 
     61 // Bit flags for empty-width specials
     62 enum EmptyOp {
     63   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
     64   kEmptyEndLine          = 1<<1,      // $ - end of line
     65   kEmptyBeginText        = 1<<2,      // \A - beginning of text
     66   kEmptyEndText          = 1<<3,      // \z - end of text
     67   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
     68   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
     69   kEmptyAllFlags         = (1<<6)-1,
     70 };
     71 
     72 class Regexp;
     73 
     74 class DFA;
     75 struct OneState;
     76 
     77 // Compiled form of regexp program.
     78 class Prog {
     79  public:
     80   Prog();
     81   ~Prog();
     82 
     83   // Single instruction in regexp program.
     84   class Inst {
     85    public:
     86     Inst() : out_opcode_(0), out1_(0) { }
     87 
     88     // Constructors per opcode
     89     void InitAlt(uint32 out, uint32 out1);
     90     void InitByteRange(int lo, int hi, int foldcase, uint32 out);
     91     void InitCapture(int cap, uint32 out);
     92     void InitEmptyWidth(EmptyOp empty, uint32 out);
     93     void InitMatch(int id);
     94     void InitNop(uint32 out);
     95     void InitFail();
     96 
     97     // Getters
     98     int id(Prog* p) { return this - p->inst_; }
     99     InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
    100     int out()     { return out_opcode_>>3; }
    101     int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
    102     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
    103     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
    104     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
    105     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
    106     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
    107     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
    108     bool greedy(Prog *p) {
    109       DCHECK_EQ(opcode(), kInstAltMatch);
    110       return p->inst(out())->opcode() == kInstByteRange;
    111     }
    112 
    113     // Does this inst (an kInstByteRange) match c?
    114     inline bool Matches(int c) {
    115       DCHECK_EQ(opcode(), kInstByteRange);
    116       if (foldcase_ && 'A' <= c && c <= 'Z')
    117         c += 'a' - 'A';
    118       return lo_ <= c && c <= hi_;
    119     }
    120 
    121     // Returns string representation for debugging.
    122     string Dump();
    123 
    124     // Maximum instruction id.
    125     // (Must fit in out_opcode_, and PatchList steals another bit.)
    126     static const int kMaxInst = (1<<28) - 1;
    127 
    128    private:
    129     void set_opcode(InstOp opcode) {
    130       out_opcode_ = (out()<<3) | opcode;
    131     }
    132 
    133     void set_out(int out) {
    134       out_opcode_ = (out<<3) | opcode();
    135     }
    136 
    137     void set_out_opcode(int out, InstOp opcode) {
    138       out_opcode_ = (out<<3) | opcode;
    139     }
    140 
    141     uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
    142     union {              // additional instruction arguments:
    143       uint32 out1_;      // opcode == kInstAlt
    144                          //   alternate next instruction
    145 
    146       int32 cap_;        // opcode == kInstCapture
    147                          //   Index of capture register (holds text
    148                          //   position recorded by capturing parentheses).
    149                          //   For \n (the submatch for the nth parentheses),
    150                          //   the left parenthesis captures into register 2*n
    151                          //   and the right one captures into register 2*n+1.
    152 
    153       int32 match_id_;   // opcode == kInstMatch
    154                          //   Match ID to identify this match (for re2::Set).
    155 
    156       struct {           // opcode == kInstByteRange
    157         uint8 lo_;       //   byte range is lo_-hi_ inclusive
    158         uint8 hi_;       //
    159         uint8 foldcase_; //   convert A-Z to a-z before checking range.
    160       };
    161 
    162       EmptyOp empty_;    // opcode == kInstEmptyWidth
    163                          //   empty_ is bitwise OR of kEmpty* flags above.
    164     };
    165 
    166     friend class Compiler;
    167     friend struct PatchList;
    168     friend class Prog;
    169 
    170     DISALLOW_EVIL_CONSTRUCTORS(Inst);
    171   };
    172 
    173   // Whether to anchor the search.
    174   enum Anchor {
    175     kUnanchored,  // match anywhere
    176     kAnchored,    // match only starting at beginning of text
    177   };
    178 
    179   // Kind of match to look for (for anchor != kFullMatch)
    180   //
    181   // kLongestMatch mode finds the overall longest
    182   // match but still makes its submatch choices the way
    183   // Perl would, not in the way prescribed by POSIX.
    184   // The POSIX rules are much more expensive to implement,
    185   // and no one has needed them.
    186   //
    187   // kFullMatch is not strictly necessary -- we could use
    188   // kLongestMatch and then check the length of the match -- but
    189   // the matching code can run faster if it knows to consider only
    190   // full matches.
    191   enum MatchKind {
    192     kFirstMatch,     // like Perl, PCRE
    193     kLongestMatch,   // like egrep or POSIX
    194     kFullMatch,      // match only entire text; implies anchor==kAnchored
    195     kManyMatch       // for SearchDFA, records set of matches
    196   };
    197 
    198   Inst *inst(int id) { return &inst_[id]; }
    199   int start() { return start_; }
    200   int start_unanchored() { return start_unanchored_; }
    201   void set_start(int start) { start_ = start; }
    202   void set_start_unanchored(int start) { start_unanchored_ = start; }
    203   int64 size() { return size_; }
    204   bool reversed() { return reversed_; }
    205   void set_reversed(bool reversed) { reversed_ = reversed; }
    206   int64 byte_inst_count() { return byte_inst_count_; }
    207   const Bitmap<256>& byterange() { return byterange_; }
    208   void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
    209   int64 dfa_mem() { return dfa_mem_; }
    210   int flags() { return flags_; }
    211   void set_flags(int flags) { flags_ = flags; }
    212   bool anchor_start() { return anchor_start_; }
    213   void set_anchor_start(bool b) { anchor_start_ = b; }
    214   bool anchor_end() { return anchor_end_; }
    215   void set_anchor_end(bool b) { anchor_end_ = b; }
    216   int bytemap_range() { return bytemap_range_; }
    217   const uint8* bytemap() { return bytemap_; }
    218 
    219   // Returns string representation of program for debugging.
    220   string Dump();
    221   string DumpUnanchored();
    222 
    223   // Record that at some point in the prog, the bytes in the range
    224   // lo-hi (inclusive) are treated as different from bytes outside the range.
    225   // Tracking this lets the DFA collapse commonly-treated byte ranges
    226   // when recording state pointers, greatly reducing its memory footprint.
    227   void MarkByteRange(int lo, int hi);
    228 
    229   // Returns the set of kEmpty flags that are in effect at
    230   // position p within context.
    231   static uint32 EmptyFlags(const StringPiece& context, const char* p);
    232 
    233   // Returns whether byte c is a word character: ASCII only.
    234   // Used by the implementation of \b and \B.
    235   // This is not right for Unicode, but:
    236   //   - it's hard to get right in a byte-at-a-time matching world
    237   //     (the DFA has only one-byte lookahead).
    238   //   - even if the lookahead were possible, the Progs would be huge.
    239   // This crude approximation is the same one PCRE uses.
    240   static bool IsWordChar(uint8 c) {
    241     return ('A' <= c && c <= 'Z') ||
    242            ('a' <= c && c <= 'z') ||
    243            ('0' <= c && c <= '9') ||
    244            c == '_';
    245   }
    246 
    247   // Execution engines.  They all search for the regexp (run the prog)
    248   // in text, which is in the larger context (used for ^ $ \b etc).
    249   // Anchor and kind control the kind of search.
    250   // Returns true if match found, false if not.
    251   // If match found, fills match[0..nmatch-1] with submatch info.
    252   // match[0] is overall match, match[1] is first set of parens, etc.
    253   // If a particular submatch is not matched during the regexp match,
    254   // it is set to NULL.
    255   //
    256   // Matching text == StringPiece(NULL, 0) is treated as any other empty
    257   // string, but note that on return, it will not be possible to distinguish
    258   // submatches that matched that empty string from submatches that didn't
    259   // match anything.  Either way, match[i] == NULL.
    260 
    261   // Search using NFA: can find submatches but kind of slow.
    262   bool SearchNFA(const StringPiece& text, const StringPiece& context,
    263                  Anchor anchor, MatchKind kind,
    264                  StringPiece* match, int nmatch);
    265 
    266   // Search using DFA: much faster than NFA but only finds
    267   // end of match and can use a lot more memory.
    268   // Returns whether a match was found.
    269   // If the DFA runs out of memory, sets *failed to true and returns false.
    270   // If matches != NULL and kind == kManyMatch and there is a match,
    271   // SearchDFA fills matches with the match IDs of the final matching state.
    272   bool SearchDFA(const StringPiece& text, const StringPiece& context,
    273                  Anchor anchor, MatchKind kind,
    274                  StringPiece* match0, bool* failed,
    275                  vector<int>* matches);
    276 
    277   // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
    278   // Usually the DFA is built out incrementally, as needed, which
    279   // avoids lots of unnecessary work.  This function is useful only
    280   // for testing purposes.  Returns number of states.
    281   int BuildEntireDFA(MatchKind kind);
    282 
    283   // Compute byte map.
    284   void ComputeByteMap();
    285 
    286   // Run peep-hole optimizer on program.
    287   void Optimize();
    288 
    289   // One-pass NFA: only correct if IsOnePass() is true,
    290   // but much faster than NFA (competitive with PCRE)
    291   // for those expressions.
    292   bool IsOnePass();
    293   bool SearchOnePass(const StringPiece& text, const StringPiece& context,
    294                      Anchor anchor, MatchKind kind,
    295                      StringPiece* match, int nmatch);
    296 
    297   // Bit-state backtracking.  Fast on small cases but uses memory
    298   // proportional to the product of the program size and the text size.
    299   bool SearchBitState(const StringPiece& text, const StringPiece& context,
    300                       Anchor anchor, MatchKind kind,
    301                       StringPiece* match, int nmatch);
    302 
    303   static const int kMaxOnePassCapture = 5;  // $0 through $4
    304 
    305   // Backtracking search: the gold standard against which the other
    306   // implementations are checked.  FOR TESTING ONLY.
    307   // It allocates a ton of memory to avoid running forever.
    308   // It is also recursive, so can't use in production (will overflow stacks).
    309   // The name "Unsafe" here is supposed to be a flag that
    310   // you should not be using this function.
    311   bool UnsafeSearchBacktrack(const StringPiece& text,
    312                              const StringPiece& context,
    313                              Anchor anchor, MatchKind kind,
    314                              StringPiece* match, int nmatch);
    315 
    316   // Computes range for any strings matching regexp. The min and max can in
    317   // some cases be arbitrarily precise, so the caller gets to specify the
    318   // maximum desired length of string returned.
    319   //
    320   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
    321   // string s that is an anchored match for this regexp satisfies
    322   //   min <= s && s <= max.
    323   //
    324   // Note that PossibleMatchRange() will only consider the first copy of an
    325   // infinitely repeated element (i.e., any regexp element followed by a '*' or
    326   // '+' operator). Regexps with "{N}" constructions are not affected, as those
    327   // do not compile down to infinite repetitions.
    328   //
    329   // Returns true on success, false on error.
    330   bool PossibleMatchRange(string* min, string* max, int maxlen);
    331 
    332   // Compiles a collection of regexps to Prog.  Each regexp will have
    333   // its own Match instruction recording the index in the vector.
    334   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
    335                           Regexp* re);
    336 
    337  private:
    338   friend class Compiler;
    339 
    340   DFA* GetDFA(MatchKind kind);
    341 
    342   bool anchor_start_;       // regexp has explicit start anchor
    343   bool anchor_end_;         // regexp has explicit end anchor
    344   bool reversed_;           // whether program runs backward over input
    345   bool did_onepass_;        // has IsOnePass been called?
    346 
    347   int start_;               // entry point for program
    348   int start_unanchored_;    // unanchored entry point for program
    349   int size_;                // number of instructions
    350   int byte_inst_count_;     // number of kInstByteRange instructions
    351   int bytemap_range_;       // bytemap_[x] < bytemap_range_
    352   int flags_;               // regexp parse flags
    353   int onepass_statesize_;   // byte size of each OneState* node
    354 
    355   Inst* inst_;              // pointer to instruction array
    356 
    357   Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
    358   DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
    359   DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
    360   int64 dfa_mem_;      // Maximum memory for DFAs.
    361   void (*delete_dfa_)(DFA* dfa);
    362 
    363   Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
    364                              // commonly-treated byte range.
    365   uint8 bytemap_[256];       // map from input bytes to byte classes
    366   uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
    367 
    368   uint8* onepass_nodes_;     // data for OnePass nodes
    369   OneState* onepass_start_;  // start node for OnePass program
    370 
    371   DISALLOW_EVIL_CONSTRUCTORS(Prog);
    372 };
    373 
    374 }  // namespace re2
    375 
    376 #endif  // RE2_PROG_H__
    377