1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package regexp 6 7 import ( 8 "io" 9 "regexp/syntax" 10 ) 11 12 // A queue is a 'sparse array' holding pending threads of execution. 13 // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 14 type queue struct { 15 sparse []uint32 16 dense []entry 17 } 18 19 // A entry is an entry on a queue. 20 // It holds both the instruction pc and the actual thread. 21 // Some queue entries are just place holders so that the machine 22 // knows it has considered that pc. Such entries have t == nil. 23 type entry struct { 24 pc uint32 25 t *thread 26 } 27 28 // A thread is the state of a single path through the machine: 29 // an instruction and a corresponding capture array. 30 // See http://swtch.com/~rsc/regexp/regexp2.html 31 type thread struct { 32 inst *syntax.Inst 33 cap []int 34 } 35 36 // A machine holds all the state during an NFA simulation for p. 37 type machine struct { 38 re *Regexp // corresponding Regexp 39 p *syntax.Prog // compiled program 40 op *onePassProg // compiled onepass program, or notOnePass 41 maxBitStateLen int // max length of string to search with bitstate 42 b *bitState // state for backtracker, allocated lazily 43 q0, q1 queue // two queues for runq, nextq 44 pool []*thread // pool of available threads 45 matched bool // whether a match was found 46 matchcap []int // capture information for the match 47 48 // cached inputs, to avoid allocation 49 inputBytes inputBytes 50 inputString inputString 51 inputReader inputReader 52 } 53 54 func (m *machine) newInputBytes(b []byte) input { 55 m.inputBytes.str = b 56 return &m.inputBytes 57 } 58 59 func (m *machine) newInputString(s string) input { 60 m.inputString.str = s 61 return &m.inputString 62 } 63 64 func (m *machine) newInputReader(r io.RuneReader) input { 65 m.inputReader.r = r 66 m.inputReader.atEOT = false 67 m.inputReader.pos = 0 68 return &m.inputReader 69 } 70 71 // progMachine returns a new machine running the prog p. 72 func progMachine(p *syntax.Prog, op *onePassProg) *machine { 73 m := &machine{p: p, op: op} 74 n := len(m.p.Inst) 75 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 76 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 77 ncap := p.NumCap 78 if ncap < 2 { 79 ncap = 2 80 } 81 if op == notOnePass { 82 m.maxBitStateLen = maxBitStateLen(p) 83 } 84 m.matchcap = make([]int, ncap) 85 return m 86 } 87 88 func (m *machine) init(ncap int) { 89 for _, t := range m.pool { 90 t.cap = t.cap[:ncap] 91 } 92 m.matchcap = m.matchcap[:ncap] 93 } 94 95 // alloc allocates a new thread with the given instruction. 96 // It uses the free pool if possible. 97 func (m *machine) alloc(i *syntax.Inst) *thread { 98 var t *thread 99 if n := len(m.pool); n > 0 { 100 t = m.pool[n-1] 101 m.pool = m.pool[:n-1] 102 } else { 103 t = new(thread) 104 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 105 } 106 t.inst = i 107 return t 108 } 109 110 // free returns t to the free pool. 111 func (m *machine) free(t *thread) { 112 m.inputBytes.str = nil 113 m.inputString.str = "" 114 m.inputReader.r = nil 115 m.pool = append(m.pool, t) 116 } 117 118 // match runs the machine over the input starting at pos. 119 // It reports whether a match was found. 120 // If so, m.matchcap holds the submatch information. 121 func (m *machine) match(i input, pos int) bool { 122 startCond := m.re.cond 123 if startCond == ^syntax.EmptyOp(0) { // impossible 124 return false 125 } 126 m.matched = false 127 for i := range m.matchcap { 128 m.matchcap[i] = -1 129 } 130 runq, nextq := &m.q0, &m.q1 131 r, r1 := endOfText, endOfText 132 width, width1 := 0, 0 133 r, width = i.step(pos) 134 if r != endOfText { 135 r1, width1 = i.step(pos + width) 136 } 137 var flag syntax.EmptyOp 138 if pos == 0 { 139 flag = syntax.EmptyOpContext(-1, r) 140 } else { 141 flag = i.context(pos) 142 } 143 for { 144 if len(runq.dense) == 0 { 145 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 146 // Anchored match, past beginning of text. 147 break 148 } 149 if m.matched { 150 // Have match; finished exploring alternatives. 151 break 152 } 153 if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { 154 // Match requires literal prefix; fast search for it. 155 advance := i.index(m.re, pos) 156 if advance < 0 { 157 break 158 } 159 pos += advance 160 r, width = i.step(pos) 161 r1, width1 = i.step(pos + width) 162 } 163 } 164 if !m.matched { 165 if len(m.matchcap) > 0 { 166 m.matchcap[0] = pos 167 } 168 m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil) 169 } 170 flag = syntax.EmptyOpContext(r, r1) 171 m.step(runq, nextq, pos, pos+width, r, flag) 172 if width == 0 { 173 break 174 } 175 if len(m.matchcap) == 0 && m.matched { 176 // Found a match and not paying attention 177 // to where it is, so any match will do. 178 break 179 } 180 pos += width 181 r, width = r1, width1 182 if r != endOfText { 183 r1, width1 = i.step(pos + width) 184 } 185 runq, nextq = nextq, runq 186 } 187 m.clear(nextq) 188 return m.matched 189 } 190 191 // clear frees all threads on the thread queue. 192 func (m *machine) clear(q *queue) { 193 for _, d := range q.dense { 194 if d.t != nil { 195 // m.free(d.t) 196 m.pool = append(m.pool, d.t) 197 } 198 } 199 q.dense = q.dense[:0] 200 } 201 202 // step executes one step of the machine, running each of the threads 203 // on runq and appending new threads to nextq. 204 // The step processes the rune c (which may be endOfText), 205 // which starts at position pos and ends at nextPos. 206 // nextCond gives the setting for the empty-width flags after c. 207 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) { 208 longest := m.re.longest 209 for j := 0; j < len(runq.dense); j++ { 210 d := &runq.dense[j] 211 t := d.t 212 if t == nil { 213 continue 214 } 215 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 216 // m.free(t) 217 m.pool = append(m.pool, t) 218 continue 219 } 220 i := t.inst 221 add := false 222 switch i.Op { 223 default: 224 panic("bad inst") 225 226 case syntax.InstMatch: 227 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 228 t.cap[1] = pos 229 copy(m.matchcap, t.cap) 230 } 231 if !longest { 232 // First-match mode: cut off all lower-priority threads. 233 for _, d := range runq.dense[j+1:] { 234 if d.t != nil { 235 // m.free(d.t) 236 m.pool = append(m.pool, d.t) 237 } 238 } 239 runq.dense = runq.dense[:0] 240 } 241 m.matched = true 242 243 case syntax.InstRune: 244 add = i.MatchRune(c) 245 case syntax.InstRune1: 246 add = c == i.Rune[0] 247 case syntax.InstRuneAny: 248 add = true 249 case syntax.InstRuneAnyNotNL: 250 add = c != '\n' 251 } 252 if add { 253 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 254 } 255 if t != nil { 256 // m.free(t) 257 m.pool = append(m.pool, t) 258 } 259 } 260 runq.dense = runq.dense[:0] 261 } 262 263 // add adds an entry to q for pc, unless the q already has such an entry. 264 // It also recursively adds an entry for all instructions reachable from pc by following 265 // empty-width conditions satisfied by cond. pos gives the current position 266 // in the input. 267 func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread { 268 if pc == 0 { 269 return t 270 } 271 if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 272 return t 273 } 274 275 j := len(q.dense) 276 q.dense = q.dense[:j+1] 277 d := &q.dense[j] 278 d.t = nil 279 d.pc = pc 280 q.sparse[pc] = uint32(j) 281 282 i := &m.p.Inst[pc] 283 switch i.Op { 284 default: 285 panic("unhandled") 286 case syntax.InstFail: 287 // nothing 288 case syntax.InstAlt, syntax.InstAltMatch: 289 t = m.add(q, i.Out, pos, cap, cond, t) 290 t = m.add(q, i.Arg, pos, cap, cond, t) 291 case syntax.InstEmptyWidth: 292 if syntax.EmptyOp(i.Arg)&^cond == 0 { 293 t = m.add(q, i.Out, pos, cap, cond, t) 294 } 295 case syntax.InstNop: 296 t = m.add(q, i.Out, pos, cap, cond, t) 297 case syntax.InstCapture: 298 if int(i.Arg) < len(cap) { 299 opos := cap[i.Arg] 300 cap[i.Arg] = pos 301 m.add(q, i.Out, pos, cap, cond, nil) 302 cap[i.Arg] = opos 303 } else { 304 t = m.add(q, i.Out, pos, cap, cond, t) 305 } 306 case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 307 if t == nil { 308 t = m.alloc(i) 309 } else { 310 t.inst = i 311 } 312 if len(cap) > 0 && &t.cap[0] != &cap[0] { 313 copy(t.cap, cap) 314 } 315 d.t = t 316 t = nil 317 } 318 return t 319 } 320 321 // onepass runs the machine over the input starting at pos. 322 // It reports whether a match was found. 323 // If so, m.matchcap holds the submatch information. 324 func (m *machine) onepass(i input, pos int) bool { 325 startCond := m.re.cond 326 if startCond == ^syntax.EmptyOp(0) { // impossible 327 return false 328 } 329 m.matched = false 330 for i := range m.matchcap { 331 m.matchcap[i] = -1 332 } 333 r, r1 := endOfText, endOfText 334 width, width1 := 0, 0 335 r, width = i.step(pos) 336 if r != endOfText { 337 r1, width1 = i.step(pos + width) 338 } 339 var flag syntax.EmptyOp 340 if pos == 0 { 341 flag = syntax.EmptyOpContext(-1, r) 342 } else { 343 flag = i.context(pos) 344 } 345 pc := m.op.Start 346 inst := m.op.Inst[pc] 347 // If there is a simple literal prefix, skip over it. 348 if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 && 349 len(m.re.prefix) > 0 && i.canCheckPrefix() { 350 // Match requires literal prefix; fast search for it. 351 if i.hasPrefix(m.re) { 352 pos += len(m.re.prefix) 353 r, width = i.step(pos) 354 r1, width1 = i.step(pos + width) 355 flag = i.context(pos) 356 pc = int(m.re.prefixEnd) 357 } else { 358 return m.matched 359 } 360 } 361 for { 362 inst = m.op.Inst[pc] 363 pc = int(inst.Out) 364 switch inst.Op { 365 default: 366 panic("bad inst") 367 case syntax.InstMatch: 368 m.matched = true 369 if len(m.matchcap) > 0 { 370 m.matchcap[0] = 0 371 m.matchcap[1] = pos 372 } 373 return m.matched 374 case syntax.InstRune: 375 if !inst.MatchRune(r) { 376 return m.matched 377 } 378 case syntax.InstRune1: 379 if r != inst.Rune[0] { 380 return m.matched 381 } 382 case syntax.InstRuneAny: 383 // Nothing 384 case syntax.InstRuneAnyNotNL: 385 if r == '\n' { 386 return m.matched 387 } 388 // peek at the input rune to see which branch of the Alt to take 389 case syntax.InstAlt, syntax.InstAltMatch: 390 pc = int(onePassNext(&inst, r)) 391 continue 392 case syntax.InstFail: 393 return m.matched 394 case syntax.InstNop: 395 continue 396 case syntax.InstEmptyWidth: 397 if syntax.EmptyOp(inst.Arg)&^flag != 0 { 398 return m.matched 399 } 400 continue 401 case syntax.InstCapture: 402 if int(inst.Arg) < len(m.matchcap) { 403 m.matchcap[inst.Arg] = pos 404 } 405 continue 406 } 407 if width == 0 { 408 break 409 } 410 flag = syntax.EmptyOpContext(r, r1) 411 pos += width 412 r, width = r1, width1 413 if r != endOfText { 414 r1, width1 = i.step(pos + width) 415 } 416 } 417 return m.matched 418 } 419 420 // empty is a non-nil 0-element slice, 421 // so doExecute can avoid an allocation 422 // when 0 captures are requested from a successful match. 423 var empty = make([]int, 0) 424 425 // doExecute finds the leftmost match in the input and returns 426 // the position of its subexpressions. 427 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int { 428 m := re.get() 429 var i input 430 var size int 431 if r != nil { 432 i = m.newInputReader(r) 433 } else if b != nil { 434 i = m.newInputBytes(b) 435 size = len(b) 436 } else { 437 i = m.newInputString(s) 438 size = len(s) 439 } 440 if m.op != notOnePass { 441 if !m.onepass(i, pos) { 442 re.put(m) 443 return nil 444 } 445 } else if size < m.maxBitStateLen && r == nil { 446 if m.b == nil { 447 m.b = newBitState(m.p) 448 } 449 if !m.backtrack(i, pos, size, ncap) { 450 re.put(m) 451 return nil 452 } 453 } else { 454 m.init(ncap) 455 if !m.match(i, pos) { 456 re.put(m) 457 return nil 458 } 459 } 460 if ncap == 0 { 461 re.put(m) 462 return empty // empty but not nil 463 } 464 cap := make([]int, len(m.matchcap)) 465 copy(cap, m.matchcap) 466 re.put(m) 467 return cap 468 } 469