Home | History | Annotate | Download | only in syntax
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package syntax
      6 
      7 import "unicode"
      8 
      9 // A patchList is a list of instruction pointers that need to be filled in (patched).
     10 // Because the pointers haven't been filled in yet, we can reuse their storage
     11 // to hold the list. It's kind of sleazy, but works well in practice.
     12 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
     13 //
     14 // These aren't really pointers: they're integers, so we can reinterpret them
     15 // this way without using package unsafe. A value l denotes
     16 // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
     17 // l == 0 denotes the empty list, okay because we start every program
     18 // with a fail instruction, so we'll never want to point at its output link.
     19 type patchList uint32
     20 
     21 func (l patchList) next(p *Prog) patchList {
     22 	i := &p.Inst[l>>1]
     23 	if l&1 == 0 {
     24 		return patchList(i.Out)
     25 	}
     26 	return patchList(i.Arg)
     27 }
     28 
     29 func (l patchList) patch(p *Prog, val uint32) {
     30 	for l != 0 {
     31 		i := &p.Inst[l>>1]
     32 		if l&1 == 0 {
     33 			l = patchList(i.Out)
     34 			i.Out = val
     35 		} else {
     36 			l = patchList(i.Arg)
     37 			i.Arg = val
     38 		}
     39 	}
     40 }
     41 
     42 func (l1 patchList) append(p *Prog, l2 patchList) patchList {
     43 	if l1 == 0 {
     44 		return l2
     45 	}
     46 	if l2 == 0 {
     47 		return l1
     48 	}
     49 
     50 	last := l1
     51 	for {
     52 		next := last.next(p)
     53 		if next == 0 {
     54 			break
     55 		}
     56 		last = next
     57 	}
     58 
     59 	i := &p.Inst[last>>1]
     60 	if last&1 == 0 {
     61 		i.Out = uint32(l2)
     62 	} else {
     63 		i.Arg = uint32(l2)
     64 	}
     65 	return l1
     66 }
     67 
     68 // A frag represents a compiled program fragment.
     69 type frag struct {
     70 	i   uint32    // index of first instruction
     71 	out patchList // where to record end instruction
     72 }
     73 
     74 type compiler struct {
     75 	p *Prog
     76 }
     77 
     78 // Compile compiles the regexp into a program to be executed.
     79 // The regexp should have been simplified already (returned from re.Simplify).
     80 func Compile(re *Regexp) (*Prog, error) {
     81 	var c compiler
     82 	c.init()
     83 	f := c.compile(re)
     84 	f.out.patch(c.p, c.inst(InstMatch).i)
     85 	c.p.Start = int(f.i)
     86 	return c.p, nil
     87 }
     88 
     89 func (c *compiler) init() {
     90 	c.p = new(Prog)
     91 	c.p.NumCap = 2 // implicit ( and ) for whole match $0
     92 	c.inst(InstFail)
     93 }
     94 
     95 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
     96 var anyRune = []rune{0, unicode.MaxRune}
     97 
     98 func (c *compiler) compile(re *Regexp) frag {
     99 	switch re.Op {
    100 	case OpNoMatch:
    101 		return c.fail()
    102 	case OpEmptyMatch:
    103 		return c.nop()
    104 	case OpLiteral:
    105 		if len(re.Rune) == 0 {
    106 			return c.nop()
    107 		}
    108 		var f frag
    109 		for j := range re.Rune {
    110 			f1 := c.rune(re.Rune[j:j+1], re.Flags)
    111 			if j == 0 {
    112 				f = f1
    113 			} else {
    114 				f = c.cat(f, f1)
    115 			}
    116 		}
    117 		return f
    118 	case OpCharClass:
    119 		return c.rune(re.Rune, re.Flags)
    120 	case OpAnyCharNotNL:
    121 		return c.rune(anyRuneNotNL, 0)
    122 	case OpAnyChar:
    123 		return c.rune(anyRune, 0)
    124 	case OpBeginLine:
    125 		return c.empty(EmptyBeginLine)
    126 	case OpEndLine:
    127 		return c.empty(EmptyEndLine)
    128 	case OpBeginText:
    129 		return c.empty(EmptyBeginText)
    130 	case OpEndText:
    131 		return c.empty(EmptyEndText)
    132 	case OpWordBoundary:
    133 		return c.empty(EmptyWordBoundary)
    134 	case OpNoWordBoundary:
    135 		return c.empty(EmptyNoWordBoundary)
    136 	case OpCapture:
    137 		bra := c.cap(uint32(re.Cap << 1))
    138 		sub := c.compile(re.Sub[0])
    139 		ket := c.cap(uint32(re.Cap<<1 | 1))
    140 		return c.cat(c.cat(bra, sub), ket)
    141 	case OpStar:
    142 		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
    143 	case OpPlus:
    144 		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
    145 	case OpQuest:
    146 		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
    147 	case OpConcat:
    148 		if len(re.Sub) == 0 {
    149 			return c.nop()
    150 		}
    151 		var f frag
    152 		for i, sub := range re.Sub {
    153 			if i == 0 {
    154 				f = c.compile(sub)
    155 			} else {
    156 				f = c.cat(f, c.compile(sub))
    157 			}
    158 		}
    159 		return f
    160 	case OpAlternate:
    161 		var f frag
    162 		for _, sub := range re.Sub {
    163 			f = c.alt(f, c.compile(sub))
    164 		}
    165 		return f
    166 	}
    167 	panic("regexp: unhandled case in compile")
    168 }
    169 
    170 func (c *compiler) inst(op InstOp) frag {
    171 	// TODO: impose length limit
    172 	f := frag{i: uint32(len(c.p.Inst))}
    173 	c.p.Inst = append(c.p.Inst, Inst{Op: op})
    174 	return f
    175 }
    176 
    177 func (c *compiler) nop() frag {
    178 	f := c.inst(InstNop)
    179 	f.out = patchList(f.i << 1)
    180 	return f
    181 }
    182 
    183 func (c *compiler) fail() frag {
    184 	return frag{}
    185 }
    186 
    187 func (c *compiler) cap(arg uint32) frag {
    188 	f := c.inst(InstCapture)
    189 	f.out = patchList(f.i << 1)
    190 	c.p.Inst[f.i].Arg = arg
    191 
    192 	if c.p.NumCap < int(arg)+1 {
    193 		c.p.NumCap = int(arg) + 1
    194 	}
    195 	return f
    196 }
    197 
    198 func (c *compiler) cat(f1, f2 frag) frag {
    199 	// concat of failure is failure
    200 	if f1.i == 0 || f2.i == 0 {
    201 		return frag{}
    202 	}
    203 
    204 	// TODO: elide nop
    205 
    206 	f1.out.patch(c.p, f2.i)
    207 	return frag{f1.i, f2.out}
    208 }
    209 
    210 func (c *compiler) alt(f1, f2 frag) frag {
    211 	// alt of failure is other
    212 	if f1.i == 0 {
    213 		return f2
    214 	}
    215 	if f2.i == 0 {
    216 		return f1
    217 	}
    218 
    219 	f := c.inst(InstAlt)
    220 	i := &c.p.Inst[f.i]
    221 	i.Out = f1.i
    222 	i.Arg = f2.i
    223 	f.out = f1.out.append(c.p, f2.out)
    224 	return f
    225 }
    226 
    227 func (c *compiler) quest(f1 frag, nongreedy bool) frag {
    228 	f := c.inst(InstAlt)
    229 	i := &c.p.Inst[f.i]
    230 	if nongreedy {
    231 		i.Arg = f1.i
    232 		f.out = patchList(f.i << 1)
    233 	} else {
    234 		i.Out = f1.i
    235 		f.out = patchList(f.i<<1 | 1)
    236 	}
    237 	f.out = f.out.append(c.p, f1.out)
    238 	return f
    239 }
    240 
    241 func (c *compiler) star(f1 frag, nongreedy bool) frag {
    242 	f := c.inst(InstAlt)
    243 	i := &c.p.Inst[f.i]
    244 	if nongreedy {
    245 		i.Arg = f1.i
    246 		f.out = patchList(f.i << 1)
    247 	} else {
    248 		i.Out = f1.i
    249 		f.out = patchList(f.i<<1 | 1)
    250 	}
    251 	f1.out.patch(c.p, f.i)
    252 	return f
    253 }
    254 
    255 func (c *compiler) plus(f1 frag, nongreedy bool) frag {
    256 	return frag{f1.i, c.star(f1, nongreedy).out}
    257 }
    258 
    259 func (c *compiler) empty(op EmptyOp) frag {
    260 	f := c.inst(InstEmptyWidth)
    261 	c.p.Inst[f.i].Arg = uint32(op)
    262 	f.out = patchList(f.i << 1)
    263 	return f
    264 }
    265 
    266 func (c *compiler) rune(r []rune, flags Flags) frag {
    267 	f := c.inst(InstRune)
    268 	i := &c.p.Inst[f.i]
    269 	i.Rune = r
    270 	flags &= FoldCase // only relevant flag is FoldCase
    271 	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
    272 		// and sometimes not even that
    273 		flags &^= FoldCase
    274 	}
    275 	i.Arg = uint32(flags)
    276 	f.out = patchList(f.i << 1)
    277 
    278 	// Special cases for exec machine.
    279 	switch {
    280 	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
    281 		i.Op = InstRune1
    282 	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
    283 		i.Op = InstRuneAny
    284 	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
    285 		i.Op = InstRuneAnyNotNL
    286 	}
    287 
    288 	return f
    289 }
    290