1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import "unicode" 8 9 // A patchList is a list of instruction pointers that need to be filled in (patched). 10 // Because the pointers haven't been filled in yet, we can reuse their storage 11 // to hold the list. It's kind of sleazy, but works well in practice. 12 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 13 // 14 // These aren't really pointers: they're integers, so we can reinterpret them 15 // this way without using package unsafe. A value l denotes 16 // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). 17 // l == 0 denotes the empty list, okay because we start every program 18 // with a fail instruction, so we'll never want to point at its output link. 19 type patchList uint32 20 21 func (l patchList) next(p *Prog) patchList { 22 i := &p.Inst[l>>1] 23 if l&1 == 0 { 24 return patchList(i.Out) 25 } 26 return patchList(i.Arg) 27 } 28 29 func (l patchList) patch(p *Prog, val uint32) { 30 for l != 0 { 31 i := &p.Inst[l>>1] 32 if l&1 == 0 { 33 l = patchList(i.Out) 34 i.Out = val 35 } else { 36 l = patchList(i.Arg) 37 i.Arg = val 38 } 39 } 40 } 41 42 func (l1 patchList) append(p *Prog, l2 patchList) patchList { 43 if l1 == 0 { 44 return l2 45 } 46 if l2 == 0 { 47 return l1 48 } 49 50 last := l1 51 for { 52 next := last.next(p) 53 if next == 0 { 54 break 55 } 56 last = next 57 } 58 59 i := &p.Inst[last>>1] 60 if last&1 == 0 { 61 i.Out = uint32(l2) 62 } else { 63 i.Arg = uint32(l2) 64 } 65 return l1 66 } 67 68 // A frag represents a compiled program fragment. 69 type frag struct { 70 i uint32 // index of first instruction 71 out patchList // where to record end instruction 72 } 73 74 type compiler struct { 75 p *Prog 76 } 77 78 // Compile compiles the regexp into a program to be executed. 79 // The regexp should have been simplified already (returned from re.Simplify). 80 func Compile(re *Regexp) (*Prog, error) { 81 var c compiler 82 c.init() 83 f := c.compile(re) 84 f.out.patch(c.p, c.inst(InstMatch).i) 85 c.p.Start = int(f.i) 86 return c.p, nil 87 } 88 89 func (c *compiler) init() { 90 c.p = new(Prog) 91 c.p.NumCap = 2 // implicit ( and ) for whole match $0 92 c.inst(InstFail) 93 } 94 95 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 96 var anyRune = []rune{0, unicode.MaxRune} 97 98 func (c *compiler) compile(re *Regexp) frag { 99 switch re.Op { 100 case OpNoMatch: 101 return c.fail() 102 case OpEmptyMatch: 103 return c.nop() 104 case OpLiteral: 105 if len(re.Rune) == 0 { 106 return c.nop() 107 } 108 var f frag 109 for j := range re.Rune { 110 f1 := c.rune(re.Rune[j:j+1], re.Flags) 111 if j == 0 { 112 f = f1 113 } else { 114 f = c.cat(f, f1) 115 } 116 } 117 return f 118 case OpCharClass: 119 return c.rune(re.Rune, re.Flags) 120 case OpAnyCharNotNL: 121 return c.rune(anyRuneNotNL, 0) 122 case OpAnyChar: 123 return c.rune(anyRune, 0) 124 case OpBeginLine: 125 return c.empty(EmptyBeginLine) 126 case OpEndLine: 127 return c.empty(EmptyEndLine) 128 case OpBeginText: 129 return c.empty(EmptyBeginText) 130 case OpEndText: 131 return c.empty(EmptyEndText) 132 case OpWordBoundary: 133 return c.empty(EmptyWordBoundary) 134 case OpNoWordBoundary: 135 return c.empty(EmptyNoWordBoundary) 136 case OpCapture: 137 bra := c.cap(uint32(re.Cap << 1)) 138 sub := c.compile(re.Sub[0]) 139 ket := c.cap(uint32(re.Cap<<1 | 1)) 140 return c.cat(c.cat(bra, sub), ket) 141 case OpStar: 142 return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 143 case OpPlus: 144 return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 145 case OpQuest: 146 return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 147 case OpConcat: 148 if len(re.Sub) == 0 { 149 return c.nop() 150 } 151 var f frag 152 for i, sub := range re.Sub { 153 if i == 0 { 154 f = c.compile(sub) 155 } else { 156 f = c.cat(f, c.compile(sub)) 157 } 158 } 159 return f 160 case OpAlternate: 161 var f frag 162 for _, sub := range re.Sub { 163 f = c.alt(f, c.compile(sub)) 164 } 165 return f 166 } 167 panic("regexp: unhandled case in compile") 168 } 169 170 func (c *compiler) inst(op InstOp) frag { 171 // TODO: impose length limit 172 f := frag{i: uint32(len(c.p.Inst))} 173 c.p.Inst = append(c.p.Inst, Inst{Op: op}) 174 return f 175 } 176 177 func (c *compiler) nop() frag { 178 f := c.inst(InstNop) 179 f.out = patchList(f.i << 1) 180 return f 181 } 182 183 func (c *compiler) fail() frag { 184 return frag{} 185 } 186 187 func (c *compiler) cap(arg uint32) frag { 188 f := c.inst(InstCapture) 189 f.out = patchList(f.i << 1) 190 c.p.Inst[f.i].Arg = arg 191 192 if c.p.NumCap < int(arg)+1 { 193 c.p.NumCap = int(arg) + 1 194 } 195 return f 196 } 197 198 func (c *compiler) cat(f1, f2 frag) frag { 199 // concat of failure is failure 200 if f1.i == 0 || f2.i == 0 { 201 return frag{} 202 } 203 204 // TODO: elide nop 205 206 f1.out.patch(c.p, f2.i) 207 return frag{f1.i, f2.out} 208 } 209 210 func (c *compiler) alt(f1, f2 frag) frag { 211 // alt of failure is other 212 if f1.i == 0 { 213 return f2 214 } 215 if f2.i == 0 { 216 return f1 217 } 218 219 f := c.inst(InstAlt) 220 i := &c.p.Inst[f.i] 221 i.Out = f1.i 222 i.Arg = f2.i 223 f.out = f1.out.append(c.p, f2.out) 224 return f 225 } 226 227 func (c *compiler) quest(f1 frag, nongreedy bool) frag { 228 f := c.inst(InstAlt) 229 i := &c.p.Inst[f.i] 230 if nongreedy { 231 i.Arg = f1.i 232 f.out = patchList(f.i << 1) 233 } else { 234 i.Out = f1.i 235 f.out = patchList(f.i<<1 | 1) 236 } 237 f.out = f.out.append(c.p, f1.out) 238 return f 239 } 240 241 func (c *compiler) star(f1 frag, nongreedy bool) frag { 242 f := c.inst(InstAlt) 243 i := &c.p.Inst[f.i] 244 if nongreedy { 245 i.Arg = f1.i 246 f.out = patchList(f.i << 1) 247 } else { 248 i.Out = f1.i 249 f.out = patchList(f.i<<1 | 1) 250 } 251 f1.out.patch(c.p, f.i) 252 return f 253 } 254 255 func (c *compiler) plus(f1 frag, nongreedy bool) frag { 256 return frag{f1.i, c.star(f1, nongreedy).out} 257 } 258 259 func (c *compiler) empty(op EmptyOp) frag { 260 f := c.inst(InstEmptyWidth) 261 c.p.Inst[f.i].Arg = uint32(op) 262 f.out = patchList(f.i << 1) 263 return f 264 } 265 266 func (c *compiler) rune(r []rune, flags Flags) frag { 267 f := c.inst(InstRune) 268 i := &c.p.Inst[f.i] 269 i.Rune = r 270 flags &= FoldCase // only relevant flag is FoldCase 271 if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { 272 // and sometimes not even that 273 flags &^= FoldCase 274 } 275 i.Arg = uint32(flags) 276 f.out = patchList(f.i << 1) 277 278 // Special cases for exec machine. 279 switch { 280 case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): 281 i.Op = InstRune1 282 case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: 283 i.Op = InstRuneAny 284 case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: 285 i.Op = InstRuneAnyNotNL 286 } 287 288 return f 289 } 290