Home | History | Annotate | Download | only in re2
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Format a regular expression structure as a string.
      6 // Tested by parse_test.cc
      7 
      8 #include "util/util.h"
      9 #include "re2/regexp.h"
     10 #include "re2/walker-inl.h"
     11 
     12 namespace re2 {
     13 
     14 enum {
     15   PrecAtom,
     16   PrecUnary,
     17   PrecConcat,
     18   PrecAlternate,
     19   PrecEmpty,
     20   PrecParen,
     21   PrecToplevel,
     22 };
     23 
     24 // Helper function.  See description below.
     25 static void AppendCCRange(string* t, Rune lo, Rune hi);
     26 
     27 // Walker to generate string in s_.
     28 // The arg pointers are actually integers giving the
     29 // context precedence.
     30 // The child_args are always NULL.
     31 class ToStringWalker : public Regexp::Walker<int> {
     32  public:
     33   explicit ToStringWalker(string* t) : t_(t) {}
     34 
     35   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
     36   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
     37                         int* child_args, int nchild_args);
     38   virtual int ShortVisit(Regexp* re, int parent_arg) {
     39     return 0;
     40   }
     41 
     42  private:
     43   string* t_;  // The string the walker appends to.
     44 
     45   DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
     46 };
     47 
     48 string Regexp::ToString() {
     49   string t;
     50   ToStringWalker w(&t);
     51   w.WalkExponential(this, PrecToplevel, 100000);
     52   if (w.stopped_early())
     53     t += " [truncated]";
     54   return t;
     55 }
     56 
     57 #define ToString DontCallToString  // Avoid accidental recursion.
     58 
     59 // Visits re before children are processed.
     60 // Appends ( if needed and passes new precedence to children.
     61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
     62   int prec = parent_arg;
     63   int nprec = PrecAtom;
     64 
     65   switch (re->op()) {
     66     case kRegexpNoMatch:
     67     case kRegexpEmptyMatch:
     68     case kRegexpLiteral:
     69     case kRegexpAnyChar:
     70     case kRegexpAnyByte:
     71     case kRegexpBeginLine:
     72     case kRegexpEndLine:
     73     case kRegexpBeginText:
     74     case kRegexpEndText:
     75     case kRegexpWordBoundary:
     76     case kRegexpNoWordBoundary:
     77     case kRegexpCharClass:
     78     case kRegexpHaveMatch:
     79       nprec = PrecAtom;
     80       break;
     81 
     82     case kRegexpConcat:
     83     case kRegexpLiteralString:
     84       if (prec < PrecConcat)
     85         t_->append("(?:");
     86       nprec = PrecConcat;
     87       break;
     88 
     89     case kRegexpAlternate:
     90       if (prec < PrecAlternate)
     91         t_->append("(?:");
     92       nprec = PrecAlternate;
     93       break;
     94 
     95     case kRegexpCapture:
     96       t_->append("(");
     97       if (re->name()) {
     98         t_->append("?P<");
     99         t_->append(*re->name());
    100         t_->append(">");
    101       }
    102       nprec = PrecParen;
    103       break;
    104 
    105     case kRegexpStar:
    106     case kRegexpPlus:
    107     case kRegexpQuest:
    108     case kRegexpRepeat:
    109       if (prec < PrecUnary)
    110         t_->append("(?:");
    111       // The subprecedence here is PrecAtom instead of PrecUnary
    112       // because PCRE treats two unary ops in a row as a parse error.
    113       nprec = PrecAtom;
    114       break;
    115   }
    116 
    117   return nprec;
    118 }
    119 
    120 static void AppendLiteral(string *t, Rune r, bool foldcase) {
    121   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
    122     t->append(1, '\\');
    123     t->append(1, r);
    124   } else if (foldcase && 'a' <= r && r <= 'z') {
    125     if ('a' <= r && r <= 'z')
    126       r += 'A' - 'a';
    127     t->append(1, '[');
    128     t->append(1, r);
    129     t->append(1, r + 'a' - 'A');
    130     t->append(1, ']');
    131   } else {
    132     AppendCCRange(t, r, r);
    133   }
    134 }
    135 
    136 // Visits re after children are processed.
    137 // For childless regexps, all the work is done here.
    138 // For regexps with children, append any unary suffixes or ).
    139 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
    140                               int* child_args, int nchild_args) {
    141   int prec = parent_arg;
    142   switch (re->op()) {
    143     case kRegexpNoMatch:
    144       // There's no simple symbol for "no match", but
    145       // [^0-Runemax] excludes everything.
    146       t_->append("[^\\x00-\\x{10ffff}]");
    147       break;
    148 
    149     case kRegexpEmptyMatch:
    150       // Append (?:) to make empty string visible,
    151       // unless this is already being parenthesized.
    152       if (prec < PrecEmpty)
    153         t_->append("(?:)");
    154       break;
    155 
    156     case kRegexpLiteral:
    157       AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
    158       break;
    159 
    160     case kRegexpLiteralString:
    161       for (int i = 0; i < re->nrunes(); i++)
    162         AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
    163       if (prec < PrecConcat)
    164         t_->append(")");
    165       break;
    166 
    167     case kRegexpConcat:
    168       if (prec < PrecConcat)
    169         t_->append(")");
    170       break;
    171 
    172     case kRegexpAlternate:
    173       // Clumsy but workable: the children all appended |
    174       // at the end of their strings, so just remove the last one.
    175       if ((*t_)[t_->size()-1] == '|')
    176         t_->erase(t_->size()-1);
    177       else
    178         LOG(DFATAL) << "Bad final char: " << t_;
    179       if (prec < PrecAlternate)
    180         t_->append(")");
    181       break;
    182 
    183     case kRegexpStar:
    184       t_->append("*");
    185       if (re->parse_flags() & Regexp::NonGreedy)
    186         t_->append("?");
    187       if (prec < PrecUnary)
    188         t_->append(")");
    189       break;
    190 
    191     case kRegexpPlus:
    192       t_->append("+");
    193       if (re->parse_flags() & Regexp::NonGreedy)
    194         t_->append("?");
    195       if (prec < PrecUnary)
    196         t_->append(")");
    197       break;
    198 
    199     case kRegexpQuest:
    200       t_->append("?");
    201       if (re->parse_flags() & Regexp::NonGreedy)
    202         t_->append("?");
    203       if (prec < PrecUnary)
    204         t_->append(")");
    205       break;
    206 
    207     case kRegexpRepeat:
    208       if (re->max() == -1)
    209         t_->append(StringPrintf("{%d,}", re->min()));
    210       else if (re->min() == re->max())
    211         t_->append(StringPrintf("{%d}", re->min()));
    212       else
    213         t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
    214       if (re->parse_flags() & Regexp::NonGreedy)
    215         t_->append("?");
    216       if (prec < PrecUnary)
    217         t_->append(")");
    218       break;
    219 
    220     case kRegexpAnyChar:
    221       t_->append(".");
    222       break;
    223 
    224     case kRegexpAnyByte:
    225       t_->append("\\C");
    226       break;
    227 
    228     case kRegexpBeginLine:
    229       t_->append("^");
    230       break;
    231 
    232     case kRegexpEndLine:
    233       t_->append("$");
    234       break;
    235 
    236     case kRegexpBeginText:
    237       t_->append("(?-m:^)");
    238       break;
    239 
    240     case kRegexpEndText:
    241       if (re->parse_flags() & Regexp::WasDollar)
    242         t_->append("(?-m:$)");
    243       else
    244         t_->append("\\z");
    245       break;
    246 
    247     case kRegexpWordBoundary:
    248       t_->append("\\b");
    249       break;
    250 
    251     case kRegexpNoWordBoundary:
    252       t_->append("\\B");
    253       break;
    254 
    255     case kRegexpCharClass: {
    256       if (re->cc()->size() == 0) {
    257         t_->append("[^\\x00-\\x{10ffff}]");
    258         break;
    259       }
    260       t_->append("[");
    261       // Heuristic: show class as negated if it contains the
    262       // non-character 0xFFFE.
    263       CharClass* cc = re->cc();
    264       if (cc->Contains(0xFFFE)) {
    265         cc = cc->Negate();
    266         t_->append("^");
    267       }
    268       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
    269         AppendCCRange(t_, i->lo, i->hi);
    270       if (cc != re->cc())
    271         cc->Delete();
    272       t_->append("]");
    273       break;
    274     }
    275 
    276     case kRegexpCapture:
    277       t_->append(")");
    278       break;
    279 
    280     case kRegexpHaveMatch:
    281       // There's no syntax accepted by the parser to generate
    282       // this node (it is generated by RE2::Set) so make something
    283       // up that is readable but won't compile.
    284       t_->append("(?HaveMatch:%d)", re->match_id());
    285       break;
    286   }
    287 
    288   // If the parent is an alternation, append the | for it.
    289   if (prec == PrecAlternate)
    290     t_->append("|");
    291 
    292   return 0;
    293 }
    294 
    295 // Appends a rune for use in a character class to the string t.
    296 static void AppendCCChar(string* t, Rune r) {
    297   if (0x20 <= r && r <= 0x7E) {
    298     if (strchr("[]^-\\", r))
    299       t->append("\\");
    300     t->append(1, r);
    301     return;
    302   }
    303   switch (r) {
    304     default:
    305       break;
    306 
    307     case '\r':
    308       t->append("\\r");
    309       return;
    310 
    311     case '\t':
    312       t->append("\\t");
    313       return;
    314 
    315     case '\n':
    316       t->append("\\n");
    317       return;
    318 
    319     case '\f':
    320       t->append("\\f");
    321       return;
    322   }
    323 
    324   if (r < 0x100) {
    325     StringAppendF(t, "\\x%02x", static_cast<int>(r));
    326     return;
    327   }
    328   StringAppendF(t, "\\x{%x}", static_cast<int>(r));
    329 }
    330 
    331 static void AppendCCRange(string* t, Rune lo, Rune hi) {
    332   if (lo > hi)
    333     return;
    334   AppendCCChar(t, lo);
    335   if (lo < hi) {
    336     t->append("-");
    337     AppendCCChar(t, hi);
    338   }
    339 }
    340 
    341 }  // namespace re2
    342