Home | History | Annotate | Download | only in syntax
      1 // Copyright 2011 The Go Authors.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package syntax
      6 
      7 import "testing"
      8 
      9 var simplifyTests = []struct {
     10 	Regexp string
     11 	Simple string
     12 }{
     13 	// Already-simple constructs
     14 	{`a`, `a`},
     15 	{`ab`, `ab`},
     16 	{`a|b`, `[a-b]`},
     17 	{`ab|cd`, `ab|cd`},
     18 	{`(ab)*`, `(ab)*`},
     19 	{`(ab)+`, `(ab)+`},
     20 	{`(ab)?`, `(ab)?`},
     21 	{`.`, `(?s:.)`},
     22 	{`^`, `^`},
     23 	{`$`, `$`},
     24 	{`[ac]`, `[ac]`},
     25 	{`[^ac]`, `[^ac]`},
     26 
     27 	// Posix character classes
     28 	{`[[:alnum:]]`, `[0-9A-Za-z]`},
     29 	{`[[:alpha:]]`, `[A-Za-z]`},
     30 	{`[[:blank:]]`, `[\t ]`},
     31 	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
     32 	{`[[:digit:]]`, `[0-9]`},
     33 	{`[[:graph:]]`, `[!-~]`},
     34 	{`[[:lower:]]`, `[a-z]`},
     35 	{`[[:print:]]`, `[ -~]`},
     36 	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
     37 	{`[[:space:]]`, `[\t-\r ]`},
     38 	{`[[:upper:]]`, `[A-Z]`},
     39 	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
     40 
     41 	// Perl character classes
     42 	{`\d`, `[0-9]`},
     43 	{`\s`, `[\t-\n\f-\r ]`},
     44 	{`\w`, `[0-9A-Z_a-z]`},
     45 	{`\D`, `[^0-9]`},
     46 	{`\S`, `[^\t-\n\f-\r ]`},
     47 	{`\W`, `[^0-9A-Z_a-z]`},
     48 	{`[\d]`, `[0-9]`},
     49 	{`[\s]`, `[\t-\n\f-\r ]`},
     50 	{`[\w]`, `[0-9A-Z_a-z]`},
     51 	{`[\D]`, `[^0-9]`},
     52 	{`[\S]`, `[^\t-\n\f-\r ]`},
     53 	{`[\W]`, `[^0-9A-Z_a-z]`},
     54 
     55 	// Posix repetitions
     56 	{`a{1}`, `a`},
     57 	{`a{2}`, `aa`},
     58 	{`a{5}`, `aaaaa`},
     59 	{`a{0,1}`, `a?`},
     60 	// The next three are illegible because Simplify inserts (?:)
     61 	// parens instead of () parens to avoid creating extra
     62 	// captured subexpressions.  The comments show a version with fewer parens.
     63 	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
     64 	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
     65 	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
     66 	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
     67 	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
     68 	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
     69 	{`a{0,}`, `a*`},
     70 	{`a{1,}`, `a+`},
     71 	{`a{2,}`, `aa+`},
     72 	{`a{5,}`, `aaaaa+`},
     73 
     74 	// Test that operators simplify their arguments.
     75 	{`(?:a{1,}){1,}`, `a+`},
     76 	{`(a{1,}b{1,})`, `(a+b+)`},
     77 	{`a{1,}|b{1,}`, `a+|b+`},
     78 	{`(?:a{1,})*`, `(?:a+)*`},
     79 	{`(?:a{1,})+`, `a+`},
     80 	{`(?:a{1,})?`, `(?:a+)?`},
     81 	{``, `(?:)`},
     82 	{`a{0}`, `(?:)`},
     83 
     84 	// Character class simplification
     85 	{`[ab]`, `[a-b]`},
     86 	{`[a-za-za-z]`, `[a-z]`},
     87 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
     88 	{`[ABCDEFGH]`, `[A-H]`},
     89 	{`[AB-CD-EF-GH]`, `[A-H]`},
     90 	{`[W-ZP-XE-R]`, `[E-Z]`},
     91 	{`[a-ee-gg-m]`, `[a-m]`},
     92 	{`[a-ea-ha-m]`, `[a-m]`},
     93 	{`[a-ma-ha-e]`, `[a-m]`},
     94 	{`[a-zA-Z0-9 -~]`, `[ -~]`},
     95 
     96 	// Empty character classes
     97 	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
     98 
     99 	// Full character classes
    100 	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
    101 
    102 	// Unicode case folding.
    103 	{`(?i)A`, `(?i:A)`},
    104 	{`(?i)a`, `(?i:A)`},
    105 	{`(?i)[A]`, `(?i:A)`},
    106 	{`(?i)[a]`, `(?i:A)`},
    107 	{`(?i)K`, `(?i:K)`},
    108 	{`(?i)k`, `(?i:K)`},
    109 	{`(?i)\x{212a}`, "(?i:K)"},
    110 	{`(?i)[K]`, "[Kk\u212A]"},
    111 	{`(?i)[k]`, "[Kk\u212A]"},
    112 	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
    113 	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
    114 	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
    115 	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
    116 
    117 	// Empty string as a regular expression.
    118 	// The empty string must be preserved inside parens in order
    119 	// to make submatches work right, so these tests are less
    120 	// interesting than they might otherwise be.  String inserts
    121 	// explicit (?:) in place of non-parenthesized empty strings,
    122 	// to make them easier to spot for other parsers.
    123 	{`(a|b|)`, `([a-b]|(?:))`},
    124 	{`(|)`, `()`},
    125 	{`a()`, `a()`},
    126 	{`(()|())`, `(()|())`},
    127 	{`(a|)`, `(a|(?:))`},
    128 	{`ab()cd()`, `ab()cd()`},
    129 	{`()`, `()`},
    130 	{`()*`, `()*`},
    131 	{`()+`, `()+`},
    132 	{`()?`, `()?`},
    133 	{`(){0}`, `(?:)`},
    134 	{`(){1}`, `()`},
    135 	{`(){1,}`, `()+`},
    136 	{`(){0,2}`, `(?:()()?)?`},
    137 }
    138 
    139 func TestSimplify(t *testing.T) {
    140 	for _, tt := range simplifyTests {
    141 		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
    142 		if err != nil {
    143 			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
    144 			continue
    145 		}
    146 		s := re.Simplify().String()
    147 		if s != tt.Simple {
    148 			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
    149 		}
    150 	}
    151 }
    152