Home | History | Annotate | Download | only in syntax
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package syntax
      6 
      7 // Note to implementers:
      8 // In this package, re is always a *Regexp and r is always a rune.
      9 
     10 import (
     11 	"bytes"
     12 	"strconv"
     13 	"strings"
     14 	"unicode"
     15 )
     16 
     17 // A Regexp is a node in a regular expression syntax tree.
     18 type Regexp struct {
     19 	Op       Op // operator
     20 	Flags    Flags
     21 	Sub      []*Regexp  // subexpressions, if any
     22 	Sub0     [1]*Regexp // storage for short Sub
     23 	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
     24 	Rune0    [2]rune    // storage for short Rune
     25 	Min, Max int        // min, max for OpRepeat
     26 	Cap      int        // capturing index, for OpCapture
     27 	Name     string     // capturing name, for OpCapture
     28 }
     29 
     30 // An Op is a single regular expression operator.
     31 type Op uint8
     32 
     33 // Operators are listed in precedence order, tightest binding to weakest.
     34 // Character class operators are listed simplest to most complex
     35 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
     36 
     37 const (
     38 	OpNoMatch        Op = 1 + iota // matches no strings
     39 	OpEmptyMatch                   // matches empty string
     40 	OpLiteral                      // matches Runes sequence
     41 	OpCharClass                    // matches Runes interpreted as range pair list
     42 	OpAnyCharNotNL                 // matches any character except newline
     43 	OpAnyChar                      // matches any character
     44 	OpBeginLine                    // matches empty string at beginning of line
     45 	OpEndLine                      // matches empty string at end of line
     46 	OpBeginText                    // matches empty string at beginning of text
     47 	OpEndText                      // matches empty string at end of text
     48 	OpWordBoundary                 // matches word boundary `\b`
     49 	OpNoWordBoundary               // matches word non-boundary `\B`
     50 	OpCapture                      // capturing subexpression with index Cap, optional name Name
     51 	OpStar                         // matches Sub[0] zero or more times
     52 	OpPlus                         // matches Sub[0] one or more times
     53 	OpQuest                        // matches Sub[0] zero or one times
     54 	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
     55 	OpConcat                       // matches concatenation of Subs
     56 	OpAlternate                    // matches alternation of Subs
     57 )
     58 
     59 const opPseudo Op = 128 // where pseudo-ops start
     60 
     61 // Equal returns true if x and y have identical structure.
     62 func (x *Regexp) Equal(y *Regexp) bool {
     63 	if x == nil || y == nil {
     64 		return x == y
     65 	}
     66 	if x.Op != y.Op {
     67 		return false
     68 	}
     69 	switch x.Op {
     70 	case OpEndText:
     71 		// The parse flags remember whether this is \z or \Z.
     72 		if x.Flags&WasDollar != y.Flags&WasDollar {
     73 			return false
     74 		}
     75 
     76 	case OpLiteral, OpCharClass:
     77 		if len(x.Rune) != len(y.Rune) {
     78 			return false
     79 		}
     80 		for i, r := range x.Rune {
     81 			if r != y.Rune[i] {
     82 				return false
     83 			}
     84 		}
     85 
     86 	case OpAlternate, OpConcat:
     87 		if len(x.Sub) != len(y.Sub) {
     88 			return false
     89 		}
     90 		for i, sub := range x.Sub {
     91 			if !sub.Equal(y.Sub[i]) {
     92 				return false
     93 			}
     94 		}
     95 
     96 	case OpStar, OpPlus, OpQuest:
     97 		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
     98 			return false
     99 		}
    100 
    101 	case OpRepeat:
    102 		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
    103 			return false
    104 		}
    105 
    106 	case OpCapture:
    107 		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
    108 			return false
    109 		}
    110 	}
    111 	return true
    112 }
    113 
    114 // writeRegexp writes the Perl syntax for the regular expression re to b.
    115 func writeRegexp(b *bytes.Buffer, re *Regexp) {
    116 	switch re.Op {
    117 	default:
    118 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
    119 	case OpNoMatch:
    120 		b.WriteString(`[^\x00-\x{10FFFF}]`)
    121 	case OpEmptyMatch:
    122 		b.WriteString(`(?:)`)
    123 	case OpLiteral:
    124 		if re.Flags&FoldCase != 0 {
    125 			b.WriteString(`(?i:`)
    126 		}
    127 		for _, r := range re.Rune {
    128 			escape(b, r, false)
    129 		}
    130 		if re.Flags&FoldCase != 0 {
    131 			b.WriteString(`)`)
    132 		}
    133 	case OpCharClass:
    134 		if len(re.Rune)%2 != 0 {
    135 			b.WriteString(`[invalid char class]`)
    136 			break
    137 		}
    138 		b.WriteRune('[')
    139 		if len(re.Rune) == 0 {
    140 			b.WriteString(`^\x00-\x{10FFFF}`)
    141 		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
    142 			// Contains 0 and MaxRune. Probably a negated class.
    143 			// Print the gaps.
    144 			b.WriteRune('^')
    145 			for i := 1; i < len(re.Rune)-1; i += 2 {
    146 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
    147 				escape(b, lo, lo == '-')
    148 				if lo != hi {
    149 					b.WriteRune('-')
    150 					escape(b, hi, hi == '-')
    151 				}
    152 			}
    153 		} else {
    154 			for i := 0; i < len(re.Rune); i += 2 {
    155 				lo, hi := re.Rune[i], re.Rune[i+1]
    156 				escape(b, lo, lo == '-')
    157 				if lo != hi {
    158 					b.WriteRune('-')
    159 					escape(b, hi, hi == '-')
    160 				}
    161 			}
    162 		}
    163 		b.WriteRune(']')
    164 	case OpAnyCharNotNL:
    165 		b.WriteString(`(?-s:.)`)
    166 	case OpAnyChar:
    167 		b.WriteString(`(?s:.)`)
    168 	case OpBeginLine:
    169 		b.WriteString(`(?m:^)`)
    170 	case OpEndLine:
    171 		b.WriteString(`(?m:$)`)
    172 	case OpBeginText:
    173 		b.WriteString(`\A`)
    174 	case OpEndText:
    175 		if re.Flags&WasDollar != 0 {
    176 			b.WriteString(`(?-m:$)`)
    177 		} else {
    178 			b.WriteString(`\z`)
    179 		}
    180 	case OpWordBoundary:
    181 		b.WriteString(`\b`)
    182 	case OpNoWordBoundary:
    183 		b.WriteString(`\B`)
    184 	case OpCapture:
    185 		if re.Name != "" {
    186 			b.WriteString(`(?P<`)
    187 			b.WriteString(re.Name)
    188 			b.WriteRune('>')
    189 		} else {
    190 			b.WriteRune('(')
    191 		}
    192 		if re.Sub[0].Op != OpEmptyMatch {
    193 			writeRegexp(b, re.Sub[0])
    194 		}
    195 		b.WriteRune(')')
    196 	case OpStar, OpPlus, OpQuest, OpRepeat:
    197 		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
    198 			b.WriteString(`(?:`)
    199 			writeRegexp(b, sub)
    200 			b.WriteString(`)`)
    201 		} else {
    202 			writeRegexp(b, sub)
    203 		}
    204 		switch re.Op {
    205 		case OpStar:
    206 			b.WriteRune('*')
    207 		case OpPlus:
    208 			b.WriteRune('+')
    209 		case OpQuest:
    210 			b.WriteRune('?')
    211 		case OpRepeat:
    212 			b.WriteRune('{')
    213 			b.WriteString(strconv.Itoa(re.Min))
    214 			if re.Max != re.Min {
    215 				b.WriteRune(',')
    216 				if re.Max >= 0 {
    217 					b.WriteString(strconv.Itoa(re.Max))
    218 				}
    219 			}
    220 			b.WriteRune('}')
    221 		}
    222 		if re.Flags&NonGreedy != 0 {
    223 			b.WriteRune('?')
    224 		}
    225 	case OpConcat:
    226 		for _, sub := range re.Sub {
    227 			if sub.Op == OpAlternate {
    228 				b.WriteString(`(?:`)
    229 				writeRegexp(b, sub)
    230 				b.WriteString(`)`)
    231 			} else {
    232 				writeRegexp(b, sub)
    233 			}
    234 		}
    235 	case OpAlternate:
    236 		for i, sub := range re.Sub {
    237 			if i > 0 {
    238 				b.WriteRune('|')
    239 			}
    240 			writeRegexp(b, sub)
    241 		}
    242 	}
    243 }
    244 
    245 func (re *Regexp) String() string {
    246 	var b bytes.Buffer
    247 	writeRegexp(&b, re)
    248 	return b.String()
    249 }
    250 
    251 const meta = `\.+*?()|[]{}^$`
    252 
    253 func escape(b *bytes.Buffer, r rune, force bool) {
    254 	if unicode.IsPrint(r) {
    255 		if strings.ContainsRune(meta, r) || force {
    256 			b.WriteRune('\\')
    257 		}
    258 		b.WriteRune(r)
    259 		return
    260 	}
    261 
    262 	switch r {
    263 	case '\a':
    264 		b.WriteString(`\a`)
    265 	case '\f':
    266 		b.WriteString(`\f`)
    267 	case '\n':
    268 		b.WriteString(`\n`)
    269 	case '\r':
    270 		b.WriteString(`\r`)
    271 	case '\t':
    272 		b.WriteString(`\t`)
    273 	case '\v':
    274 		b.WriteString(`\v`)
    275 	default:
    276 		if r < 0x100 {
    277 			b.WriteString(`\x`)
    278 			s := strconv.FormatInt(int64(r), 16)
    279 			if len(s) == 1 {
    280 				b.WriteRune('0')
    281 			}
    282 			b.WriteString(s)
    283 			break
    284 		}
    285 		b.WriteString(`\x{`)
    286 		b.WriteString(strconv.FormatInt(int64(r), 16))
    287 		b.WriteString(`}`)
    288 	}
    289 }
    290 
    291 // MaxCap walks the regexp to find the maximum capture index.
    292 func (re *Regexp) MaxCap() int {
    293 	m := 0
    294 	if re.Op == OpCapture {
    295 		m = re.Cap
    296 	}
    297 	for _, sub := range re.Sub {
    298 		if n := sub.MaxCap(); m < n {
    299 			m = n
    300 		}
    301 	}
    302 	return m
    303 }
    304 
    305 // CapNames walks the regexp to find the names of capturing groups.
    306 func (re *Regexp) CapNames() []string {
    307 	names := make([]string, re.MaxCap()+1)
    308 	re.capNames(names)
    309 	return names
    310 }
    311 
    312 func (re *Regexp) capNames(names []string) {
    313 	if re.Op == OpCapture {
    314 		names[re.Cap] = re.Name
    315 	}
    316 	for _, sub := range re.Sub {
    317 		sub.capNames(names)
    318 	}
    319 }
    320