Home | History | Annotate | Download | only in regexp
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package regexp
      6 
      7 import (
      8 	"io"
      9 	"regexp/syntax"
     10 )
     11 
     12 // A queue is a 'sparse array' holding pending threads of execution.
     13 // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
     14 type queue struct {
     15 	sparse []uint32
     16 	dense  []entry
     17 }
     18 
     19 // A entry is an entry on a queue.
     20 // It holds both the instruction pc and the actual thread.
     21 // Some queue entries are just place holders so that the machine
     22 // knows it has considered that pc.  Such entries have t == nil.
     23 type entry struct {
     24 	pc uint32
     25 	t  *thread
     26 }
     27 
     28 // A thread is the state of a single path through the machine:
     29 // an instruction and a corresponding capture array.
     30 // See http://swtch.com/~rsc/regexp/regexp2.html
     31 type thread struct {
     32 	inst *syntax.Inst
     33 	cap  []int
     34 }
     35 
     36 // A machine holds all the state during an NFA simulation for p.
     37 type machine struct {
     38 	re             *Regexp      // corresponding Regexp
     39 	p              *syntax.Prog // compiled program
     40 	op             *onePassProg // compiled onepass program, or notOnePass
     41 	maxBitStateLen int          // max length of string to search with bitstate
     42 	b              *bitState    // state for backtracker, allocated lazily
     43 	q0, q1         queue        // two queues for runq, nextq
     44 	pool           []*thread    // pool of available threads
     45 	matched        bool         // whether a match was found
     46 	matchcap       []int        // capture information for the match
     47 
     48 	// cached inputs, to avoid allocation
     49 	inputBytes  inputBytes
     50 	inputString inputString
     51 	inputReader inputReader
     52 }
     53 
     54 func (m *machine) newInputBytes(b []byte) input {
     55 	m.inputBytes.str = b
     56 	return &m.inputBytes
     57 }
     58 
     59 func (m *machine) newInputString(s string) input {
     60 	m.inputString.str = s
     61 	return &m.inputString
     62 }
     63 
     64 func (m *machine) newInputReader(r io.RuneReader) input {
     65 	m.inputReader.r = r
     66 	m.inputReader.atEOT = false
     67 	m.inputReader.pos = 0
     68 	return &m.inputReader
     69 }
     70 
     71 // progMachine returns a new machine running the prog p.
     72 func progMachine(p *syntax.Prog, op *onePassProg) *machine {
     73 	m := &machine{p: p, op: op}
     74 	n := len(m.p.Inst)
     75 	m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
     76 	m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
     77 	ncap := p.NumCap
     78 	if ncap < 2 {
     79 		ncap = 2
     80 	}
     81 	if op == notOnePass {
     82 		m.maxBitStateLen = maxBitStateLen(p)
     83 	}
     84 	m.matchcap = make([]int, ncap)
     85 	return m
     86 }
     87 
     88 func (m *machine) init(ncap int) {
     89 	for _, t := range m.pool {
     90 		t.cap = t.cap[:ncap]
     91 	}
     92 	m.matchcap = m.matchcap[:ncap]
     93 }
     94 
     95 // alloc allocates a new thread with the given instruction.
     96 // It uses the free pool if possible.
     97 func (m *machine) alloc(i *syntax.Inst) *thread {
     98 	var t *thread
     99 	if n := len(m.pool); n > 0 {
    100 		t = m.pool[n-1]
    101 		m.pool = m.pool[:n-1]
    102 	} else {
    103 		t = new(thread)
    104 		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
    105 	}
    106 	t.inst = i
    107 	return t
    108 }
    109 
    110 // free returns t to the free pool.
    111 func (m *machine) free(t *thread) {
    112 	m.inputBytes.str = nil
    113 	m.inputString.str = ""
    114 	m.inputReader.r = nil
    115 	m.pool = append(m.pool, t)
    116 }
    117 
    118 // match runs the machine over the input starting at pos.
    119 // It reports whether a match was found.
    120 // If so, m.matchcap holds the submatch information.
    121 func (m *machine) match(i input, pos int) bool {
    122 	startCond := m.re.cond
    123 	if startCond == ^syntax.EmptyOp(0) { // impossible
    124 		return false
    125 	}
    126 	m.matched = false
    127 	for i := range m.matchcap {
    128 		m.matchcap[i] = -1
    129 	}
    130 	runq, nextq := &m.q0, &m.q1
    131 	r, r1 := endOfText, endOfText
    132 	width, width1 := 0, 0
    133 	r, width = i.step(pos)
    134 	if r != endOfText {
    135 		r1, width1 = i.step(pos + width)
    136 	}
    137 	var flag syntax.EmptyOp
    138 	if pos == 0 {
    139 		flag = syntax.EmptyOpContext(-1, r)
    140 	} else {
    141 		flag = i.context(pos)
    142 	}
    143 	for {
    144 		if len(runq.dense) == 0 {
    145 			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
    146 				// Anchored match, past beginning of text.
    147 				break
    148 			}
    149 			if m.matched {
    150 				// Have match; finished exploring alternatives.
    151 				break
    152 			}
    153 			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
    154 				// Match requires literal prefix; fast search for it.
    155 				advance := i.index(m.re, pos)
    156 				if advance < 0 {
    157 					break
    158 				}
    159 				pos += advance
    160 				r, width = i.step(pos)
    161 				r1, width1 = i.step(pos + width)
    162 			}
    163 		}
    164 		if !m.matched {
    165 			if len(m.matchcap) > 0 {
    166 				m.matchcap[0] = pos
    167 			}
    168 			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
    169 		}
    170 		flag = syntax.EmptyOpContext(r, r1)
    171 		m.step(runq, nextq, pos, pos+width, r, flag)
    172 		if width == 0 {
    173 			break
    174 		}
    175 		if len(m.matchcap) == 0 && m.matched {
    176 			// Found a match and not paying attention
    177 			// to where it is, so any match will do.
    178 			break
    179 		}
    180 		pos += width
    181 		r, width = r1, width1
    182 		if r != endOfText {
    183 			r1, width1 = i.step(pos + width)
    184 		}
    185 		runq, nextq = nextq, runq
    186 	}
    187 	m.clear(nextq)
    188 	return m.matched
    189 }
    190 
    191 // clear frees all threads on the thread queue.
    192 func (m *machine) clear(q *queue) {
    193 	for _, d := range q.dense {
    194 		if d.t != nil {
    195 			// m.free(d.t)
    196 			m.pool = append(m.pool, d.t)
    197 		}
    198 	}
    199 	q.dense = q.dense[:0]
    200 }
    201 
    202 // step executes one step of the machine, running each of the threads
    203 // on runq and appending new threads to nextq.
    204 // The step processes the rune c (which may be endOfText),
    205 // which starts at position pos and ends at nextPos.
    206 // nextCond gives the setting for the empty-width flags after c.
    207 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
    208 	longest := m.re.longest
    209 	for j := 0; j < len(runq.dense); j++ {
    210 		d := &runq.dense[j]
    211 		t := d.t
    212 		if t == nil {
    213 			continue
    214 		}
    215 		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
    216 			// m.free(t)
    217 			m.pool = append(m.pool, t)
    218 			continue
    219 		}
    220 		i := t.inst
    221 		add := false
    222 		switch i.Op {
    223 		default:
    224 			panic("bad inst")
    225 
    226 		case syntax.InstMatch:
    227 			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
    228 				t.cap[1] = pos
    229 				copy(m.matchcap, t.cap)
    230 			}
    231 			if !longest {
    232 				// First-match mode: cut off all lower-priority threads.
    233 				for _, d := range runq.dense[j+1:] {
    234 					if d.t != nil {
    235 						// m.free(d.t)
    236 						m.pool = append(m.pool, d.t)
    237 					}
    238 				}
    239 				runq.dense = runq.dense[:0]
    240 			}
    241 			m.matched = true
    242 
    243 		case syntax.InstRune:
    244 			add = i.MatchRune(c)
    245 		case syntax.InstRune1:
    246 			add = c == i.Rune[0]
    247 		case syntax.InstRuneAny:
    248 			add = true
    249 		case syntax.InstRuneAnyNotNL:
    250 			add = c != '\n'
    251 		}
    252 		if add {
    253 			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
    254 		}
    255 		if t != nil {
    256 			// m.free(t)
    257 			m.pool = append(m.pool, t)
    258 		}
    259 	}
    260 	runq.dense = runq.dense[:0]
    261 }
    262 
    263 // add adds an entry to q for pc, unless the q already has such an entry.
    264 // It also recursively adds an entry for all instructions reachable from pc by following
    265 // empty-width conditions satisfied by cond.  pos gives the current position
    266 // in the input.
    267 func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
    268 	if pc == 0 {
    269 		return t
    270 	}
    271 	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
    272 		return t
    273 	}
    274 
    275 	j := len(q.dense)
    276 	q.dense = q.dense[:j+1]
    277 	d := &q.dense[j]
    278 	d.t = nil
    279 	d.pc = pc
    280 	q.sparse[pc] = uint32(j)
    281 
    282 	i := &m.p.Inst[pc]
    283 	switch i.Op {
    284 	default:
    285 		panic("unhandled")
    286 	case syntax.InstFail:
    287 		// nothing
    288 	case syntax.InstAlt, syntax.InstAltMatch:
    289 		t = m.add(q, i.Out, pos, cap, cond, t)
    290 		t = m.add(q, i.Arg, pos, cap, cond, t)
    291 	case syntax.InstEmptyWidth:
    292 		if syntax.EmptyOp(i.Arg)&^cond == 0 {
    293 			t = m.add(q, i.Out, pos, cap, cond, t)
    294 		}
    295 	case syntax.InstNop:
    296 		t = m.add(q, i.Out, pos, cap, cond, t)
    297 	case syntax.InstCapture:
    298 		if int(i.Arg) < len(cap) {
    299 			opos := cap[i.Arg]
    300 			cap[i.Arg] = pos
    301 			m.add(q, i.Out, pos, cap, cond, nil)
    302 			cap[i.Arg] = opos
    303 		} else {
    304 			t = m.add(q, i.Out, pos, cap, cond, t)
    305 		}
    306 	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    307 		if t == nil {
    308 			t = m.alloc(i)
    309 		} else {
    310 			t.inst = i
    311 		}
    312 		if len(cap) > 0 && &t.cap[0] != &cap[0] {
    313 			copy(t.cap, cap)
    314 		}
    315 		d.t = t
    316 		t = nil
    317 	}
    318 	return t
    319 }
    320 
    321 // onepass runs the machine over the input starting at pos.
    322 // It reports whether a match was found.
    323 // If so, m.matchcap holds the submatch information.
    324 func (m *machine) onepass(i input, pos int) bool {
    325 	startCond := m.re.cond
    326 	if startCond == ^syntax.EmptyOp(0) { // impossible
    327 		return false
    328 	}
    329 	m.matched = false
    330 	for i := range m.matchcap {
    331 		m.matchcap[i] = -1
    332 	}
    333 	r, r1 := endOfText, endOfText
    334 	width, width1 := 0, 0
    335 	r, width = i.step(pos)
    336 	if r != endOfText {
    337 		r1, width1 = i.step(pos + width)
    338 	}
    339 	var flag syntax.EmptyOp
    340 	if pos == 0 {
    341 		flag = syntax.EmptyOpContext(-1, r)
    342 	} else {
    343 		flag = i.context(pos)
    344 	}
    345 	pc := m.op.Start
    346 	inst := m.op.Inst[pc]
    347 	// If there is a simple literal prefix, skip over it.
    348 	if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 &&
    349 		len(m.re.prefix) > 0 && i.canCheckPrefix() {
    350 		// Match requires literal prefix; fast search for it.
    351 		if i.hasPrefix(m.re) {
    352 			pos += len(m.re.prefix)
    353 			r, width = i.step(pos)
    354 			r1, width1 = i.step(pos + width)
    355 			flag = i.context(pos)
    356 			pc = int(m.re.prefixEnd)
    357 		} else {
    358 			return m.matched
    359 		}
    360 	}
    361 	for {
    362 		inst = m.op.Inst[pc]
    363 		pc = int(inst.Out)
    364 		switch inst.Op {
    365 		default:
    366 			panic("bad inst")
    367 		case syntax.InstMatch:
    368 			m.matched = true
    369 			if len(m.matchcap) > 0 {
    370 				m.matchcap[0] = 0
    371 				m.matchcap[1] = pos
    372 			}
    373 			return m.matched
    374 		case syntax.InstRune:
    375 			if !inst.MatchRune(r) {
    376 				return m.matched
    377 			}
    378 		case syntax.InstRune1:
    379 			if r != inst.Rune[0] {
    380 				return m.matched
    381 			}
    382 		case syntax.InstRuneAny:
    383 			// Nothing
    384 		case syntax.InstRuneAnyNotNL:
    385 			if r == '\n' {
    386 				return m.matched
    387 			}
    388 		// peek at the input rune to see which branch of the Alt to take
    389 		case syntax.InstAlt, syntax.InstAltMatch:
    390 			pc = int(onePassNext(&inst, r))
    391 			continue
    392 		case syntax.InstFail:
    393 			return m.matched
    394 		case syntax.InstNop:
    395 			continue
    396 		case syntax.InstEmptyWidth:
    397 			if syntax.EmptyOp(inst.Arg)&^flag != 0 {
    398 				return m.matched
    399 			}
    400 			continue
    401 		case syntax.InstCapture:
    402 			if int(inst.Arg) < len(m.matchcap) {
    403 				m.matchcap[inst.Arg] = pos
    404 			}
    405 			continue
    406 		}
    407 		if width == 0 {
    408 			break
    409 		}
    410 		flag = syntax.EmptyOpContext(r, r1)
    411 		pos += width
    412 		r, width = r1, width1
    413 		if r != endOfText {
    414 			r1, width1 = i.step(pos + width)
    415 		}
    416 	}
    417 	return m.matched
    418 }
    419 
    420 // empty is a non-nil 0-element slice,
    421 // so doExecute can avoid an allocation
    422 // when 0 captures are requested from a successful match.
    423 var empty = make([]int, 0)
    424 
    425 // doExecute finds the leftmost match in the input and returns
    426 // the position of its subexpressions.
    427 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int {
    428 	m := re.get()
    429 	var i input
    430 	var size int
    431 	if r != nil {
    432 		i = m.newInputReader(r)
    433 	} else if b != nil {
    434 		i = m.newInputBytes(b)
    435 		size = len(b)
    436 	} else {
    437 		i = m.newInputString(s)
    438 		size = len(s)
    439 	}
    440 	if m.op != notOnePass {
    441 		if !m.onepass(i, pos) {
    442 			re.put(m)
    443 			return nil
    444 		}
    445 	} else if size < m.maxBitStateLen && r == nil {
    446 		if m.b == nil {
    447 			m.b = newBitState(m.p)
    448 		}
    449 		if !m.backtrack(i, pos, size, ncap) {
    450 			re.put(m)
    451 			return nil
    452 		}
    453 	} else {
    454 		m.init(ncap)
    455 		if !m.match(i, pos) {
    456 			re.put(m)
    457 			return nil
    458 		}
    459 	}
    460 	if ncap == 0 {
    461 		re.put(m)
    462 		return empty // empty but not nil
    463 	}
    464 	cap := make([]int, len(m.matchcap))
    465 	copy(cap, m.matchcap)
    466 	re.put(m)
    467 	return cap
    468 }
    469