Home | History | Annotate | Download | only in regexp
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package regexp
      6 
      7 import (
      8 	"io"
      9 	"regexp/syntax"
     10 )
     11 
     12 // A queue is a 'sparse array' holding pending threads of execution.
     13 // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
     14 type queue struct {
     15 	sparse []uint32
     16 	dense  []entry
     17 }
     18 
     19 // An entry is an entry on a queue.
     20 // It holds both the instruction pc and the actual thread.
     21 // Some queue entries are just place holders so that the machine
     22 // knows it has considered that pc. Such entries have t == nil.
     23 type entry struct {
     24 	pc uint32
     25 	t  *thread
     26 }
     27 
     28 // A thread is the state of a single path through the machine:
     29 // an instruction and a corresponding capture array.
     30 // See http://swtch.com/~rsc/regexp/regexp2.html
     31 type thread struct {
     32 	inst *syntax.Inst
     33 	cap  []int
     34 }
     35 
     36 // A machine holds all the state during an NFA simulation for p.
     37 type machine struct {
     38 	re             *Regexp      // corresponding Regexp
     39 	p              *syntax.Prog // compiled program
     40 	op             *onePassProg // compiled onepass program, or notOnePass
     41 	maxBitStateLen int          // max length of string to search with bitstate
     42 	b              *bitState    // state for backtracker, allocated lazily
     43 	q0, q1         queue        // two queues for runq, nextq
     44 	pool           []*thread    // pool of available threads
     45 	matched        bool         // whether a match was found
     46 	matchcap       []int        // capture information for the match
     47 
     48 	// cached inputs, to avoid allocation
     49 	inputBytes  inputBytes
     50 	inputString inputString
     51 	inputReader inputReader
     52 }
     53 
     54 func (m *machine) newInputBytes(b []byte) input {
     55 	m.inputBytes.str = b
     56 	return &m.inputBytes
     57 }
     58 
     59 func (m *machine) newInputString(s string) input {
     60 	m.inputString.str = s
     61 	return &m.inputString
     62 }
     63 
     64 func (m *machine) newInputReader(r io.RuneReader) input {
     65 	m.inputReader.r = r
     66 	m.inputReader.atEOT = false
     67 	m.inputReader.pos = 0
     68 	return &m.inputReader
     69 }
     70 
     71 // progMachine returns a new machine running the prog p.
     72 func progMachine(p *syntax.Prog, op *onePassProg) *machine {
     73 	m := &machine{p: p, op: op}
     74 	n := len(m.p.Inst)
     75 	m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
     76 	m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
     77 	ncap := p.NumCap
     78 	if ncap < 2 {
     79 		ncap = 2
     80 	}
     81 	if op == notOnePass {
     82 		m.maxBitStateLen = maxBitStateLen(p)
     83 	}
     84 	m.matchcap = make([]int, ncap)
     85 	return m
     86 }
     87 
     88 func (m *machine) init(ncap int) {
     89 	for _, t := range m.pool {
     90 		t.cap = t.cap[:ncap]
     91 	}
     92 	m.matchcap = m.matchcap[:ncap]
     93 }
     94 
     95 // alloc allocates a new thread with the given instruction.
     96 // It uses the free pool if possible.
     97 func (m *machine) alloc(i *syntax.Inst) *thread {
     98 	var t *thread
     99 	if n := len(m.pool); n > 0 {
    100 		t = m.pool[n-1]
    101 		m.pool = m.pool[:n-1]
    102 	} else {
    103 		t = new(thread)
    104 		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
    105 	}
    106 	t.inst = i
    107 	return t
    108 }
    109 
    110 // match runs the machine over the input starting at pos.
    111 // It reports whether a match was found.
    112 // If so, m.matchcap holds the submatch information.
    113 func (m *machine) match(i input, pos int) bool {
    114 	startCond := m.re.cond
    115 	if startCond == ^syntax.EmptyOp(0) { // impossible
    116 		return false
    117 	}
    118 	m.matched = false
    119 	for i := range m.matchcap {
    120 		m.matchcap[i] = -1
    121 	}
    122 	runq, nextq := &m.q0, &m.q1
    123 	r, r1 := endOfText, endOfText
    124 	width, width1 := 0, 0
    125 	r, width = i.step(pos)
    126 	if r != endOfText {
    127 		r1, width1 = i.step(pos + width)
    128 	}
    129 	var flag syntax.EmptyOp
    130 	if pos == 0 {
    131 		flag = syntax.EmptyOpContext(-1, r)
    132 	} else {
    133 		flag = i.context(pos)
    134 	}
    135 	for {
    136 		if len(runq.dense) == 0 {
    137 			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
    138 				// Anchored match, past beginning of text.
    139 				break
    140 			}
    141 			if m.matched {
    142 				// Have match; finished exploring alternatives.
    143 				break
    144 			}
    145 			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
    146 				// Match requires literal prefix; fast search for it.
    147 				advance := i.index(m.re, pos)
    148 				if advance < 0 {
    149 					break
    150 				}
    151 				pos += advance
    152 				r, width = i.step(pos)
    153 				r1, width1 = i.step(pos + width)
    154 			}
    155 		}
    156 		if !m.matched {
    157 			if len(m.matchcap) > 0 {
    158 				m.matchcap[0] = pos
    159 			}
    160 			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
    161 		}
    162 		flag = syntax.EmptyOpContext(r, r1)
    163 		m.step(runq, nextq, pos, pos+width, r, flag)
    164 		if width == 0 {
    165 			break
    166 		}
    167 		if len(m.matchcap) == 0 && m.matched {
    168 			// Found a match and not paying attention
    169 			// to where it is, so any match will do.
    170 			break
    171 		}
    172 		pos += width
    173 		r, width = r1, width1
    174 		if r != endOfText {
    175 			r1, width1 = i.step(pos + width)
    176 		}
    177 		runq, nextq = nextq, runq
    178 	}
    179 	m.clear(nextq)
    180 	return m.matched
    181 }
    182 
    183 // clear frees all threads on the thread queue.
    184 func (m *machine) clear(q *queue) {
    185 	for _, d := range q.dense {
    186 		if d.t != nil {
    187 			m.pool = append(m.pool, d.t)
    188 		}
    189 	}
    190 	q.dense = q.dense[:0]
    191 }
    192 
    193 // step executes one step of the machine, running each of the threads
    194 // on runq and appending new threads to nextq.
    195 // The step processes the rune c (which may be endOfText),
    196 // which starts at position pos and ends at nextPos.
    197 // nextCond gives the setting for the empty-width flags after c.
    198 func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
    199 	longest := m.re.longest
    200 	for j := 0; j < len(runq.dense); j++ {
    201 		d := &runq.dense[j]
    202 		t := d.t
    203 		if t == nil {
    204 			continue
    205 		}
    206 		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
    207 			m.pool = append(m.pool, t)
    208 			continue
    209 		}
    210 		i := t.inst
    211 		add := false
    212 		switch i.Op {
    213 		default:
    214 			panic("bad inst")
    215 
    216 		case syntax.InstMatch:
    217 			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
    218 				t.cap[1] = pos
    219 				copy(m.matchcap, t.cap)
    220 			}
    221 			if !longest {
    222 				// First-match mode: cut off all lower-priority threads.
    223 				for _, d := range runq.dense[j+1:] {
    224 					if d.t != nil {
    225 						m.pool = append(m.pool, d.t)
    226 					}
    227 				}
    228 				runq.dense = runq.dense[:0]
    229 			}
    230 			m.matched = true
    231 
    232 		case syntax.InstRune:
    233 			add = i.MatchRune(c)
    234 		case syntax.InstRune1:
    235 			add = c == i.Rune[0]
    236 		case syntax.InstRuneAny:
    237 			add = true
    238 		case syntax.InstRuneAnyNotNL:
    239 			add = c != '\n'
    240 		}
    241 		if add {
    242 			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
    243 		}
    244 		if t != nil {
    245 			m.pool = append(m.pool, t)
    246 		}
    247 	}
    248 	runq.dense = runq.dense[:0]
    249 }
    250 
    251 // add adds an entry to q for pc, unless the q already has such an entry.
    252 // It also recursively adds an entry for all instructions reachable from pc by following
    253 // empty-width conditions satisfied by cond.  pos gives the current position
    254 // in the input.
    255 func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
    256 	if pc == 0 {
    257 		return t
    258 	}
    259 	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
    260 		return t
    261 	}
    262 
    263 	j := len(q.dense)
    264 	q.dense = q.dense[:j+1]
    265 	d := &q.dense[j]
    266 	d.t = nil
    267 	d.pc = pc
    268 	q.sparse[pc] = uint32(j)
    269 
    270 	i := &m.p.Inst[pc]
    271 	switch i.Op {
    272 	default:
    273 		panic("unhandled")
    274 	case syntax.InstFail:
    275 		// nothing
    276 	case syntax.InstAlt, syntax.InstAltMatch:
    277 		t = m.add(q, i.Out, pos, cap, cond, t)
    278 		t = m.add(q, i.Arg, pos, cap, cond, t)
    279 	case syntax.InstEmptyWidth:
    280 		if syntax.EmptyOp(i.Arg)&^cond == 0 {
    281 			t = m.add(q, i.Out, pos, cap, cond, t)
    282 		}
    283 	case syntax.InstNop:
    284 		t = m.add(q, i.Out, pos, cap, cond, t)
    285 	case syntax.InstCapture:
    286 		if int(i.Arg) < len(cap) {
    287 			opos := cap[i.Arg]
    288 			cap[i.Arg] = pos
    289 			m.add(q, i.Out, pos, cap, cond, nil)
    290 			cap[i.Arg] = opos
    291 		} else {
    292 			t = m.add(q, i.Out, pos, cap, cond, t)
    293 		}
    294 	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    295 		if t == nil {
    296 			t = m.alloc(i)
    297 		} else {
    298 			t.inst = i
    299 		}
    300 		if len(cap) > 0 && &t.cap[0] != &cap[0] {
    301 			copy(t.cap, cap)
    302 		}
    303 		d.t = t
    304 		t = nil
    305 	}
    306 	return t
    307 }
    308 
    309 // onepass runs the machine over the input starting at pos.
    310 // It reports whether a match was found.
    311 // If so, m.matchcap holds the submatch information.
    312 // ncap is the number of captures.
    313 func (m *machine) onepass(i input, pos, ncap int) bool {
    314 	startCond := m.re.cond
    315 	if startCond == ^syntax.EmptyOp(0) { // impossible
    316 		return false
    317 	}
    318 	m.matched = false
    319 	m.matchcap = m.matchcap[:ncap]
    320 	for i := range m.matchcap {
    321 		m.matchcap[i] = -1
    322 	}
    323 	r, r1 := endOfText, endOfText
    324 	width, width1 := 0, 0
    325 	r, width = i.step(pos)
    326 	if r != endOfText {
    327 		r1, width1 = i.step(pos + width)
    328 	}
    329 	var flag syntax.EmptyOp
    330 	if pos == 0 {
    331 		flag = syntax.EmptyOpContext(-1, r)
    332 	} else {
    333 		flag = i.context(pos)
    334 	}
    335 	pc := m.op.Start
    336 	inst := m.op.Inst[pc]
    337 	// If there is a simple literal prefix, skip over it.
    338 	if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 &&
    339 		len(m.re.prefix) > 0 && i.canCheckPrefix() {
    340 		// Match requires literal prefix; fast search for it.
    341 		if !i.hasPrefix(m.re) {
    342 			return m.matched
    343 		}
    344 		pos += len(m.re.prefix)
    345 		r, width = i.step(pos)
    346 		r1, width1 = i.step(pos + width)
    347 		flag = i.context(pos)
    348 		pc = int(m.re.prefixEnd)
    349 	}
    350 	for {
    351 		inst = m.op.Inst[pc]
    352 		pc = int(inst.Out)
    353 		switch inst.Op {
    354 		default:
    355 			panic("bad inst")
    356 		case syntax.InstMatch:
    357 			m.matched = true
    358 			if len(m.matchcap) > 0 {
    359 				m.matchcap[0] = 0
    360 				m.matchcap[1] = pos
    361 			}
    362 			return m.matched
    363 		case syntax.InstRune:
    364 			if !inst.MatchRune(r) {
    365 				return m.matched
    366 			}
    367 		case syntax.InstRune1:
    368 			if r != inst.Rune[0] {
    369 				return m.matched
    370 			}
    371 		case syntax.InstRuneAny:
    372 			// Nothing
    373 		case syntax.InstRuneAnyNotNL:
    374 			if r == '\n' {
    375 				return m.matched
    376 			}
    377 		// peek at the input rune to see which branch of the Alt to take
    378 		case syntax.InstAlt, syntax.InstAltMatch:
    379 			pc = int(onePassNext(&inst, r))
    380 			continue
    381 		case syntax.InstFail:
    382 			return m.matched
    383 		case syntax.InstNop:
    384 			continue
    385 		case syntax.InstEmptyWidth:
    386 			if syntax.EmptyOp(inst.Arg)&^flag != 0 {
    387 				return m.matched
    388 			}
    389 			continue
    390 		case syntax.InstCapture:
    391 			if int(inst.Arg) < len(m.matchcap) {
    392 				m.matchcap[inst.Arg] = pos
    393 			}
    394 			continue
    395 		}
    396 		if width == 0 {
    397 			break
    398 		}
    399 		flag = syntax.EmptyOpContext(r, r1)
    400 		pos += width
    401 		r, width = r1, width1
    402 		if r != endOfText {
    403 			r1, width1 = i.step(pos + width)
    404 		}
    405 	}
    406 	return m.matched
    407 }
    408 
    409 // doMatch reports whether either r, b or s match the regexp.
    410 func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
    411 	return re.doExecute(r, b, s, 0, 0, nil) != nil
    412 }
    413 
    414 // doExecute finds the leftmost match in the input, appends the position
    415 // of its subexpressions to dstCap and returns dstCap.
    416 //
    417 // nil is returned if no matches are found and non-nil if matches are found.
    418 func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
    419 	m := re.get()
    420 	var i input
    421 	var size int
    422 	if r != nil {
    423 		i = m.newInputReader(r)
    424 	} else if b != nil {
    425 		i = m.newInputBytes(b)
    426 		size = len(b)
    427 	} else {
    428 		i = m.newInputString(s)
    429 		size = len(s)
    430 	}
    431 	if m.op != notOnePass {
    432 		if !m.onepass(i, pos, ncap) {
    433 			re.put(m)
    434 			return nil
    435 		}
    436 	} else if size < m.maxBitStateLen && r == nil {
    437 		if m.b == nil {
    438 			m.b = newBitState(m.p)
    439 		}
    440 		if !m.backtrack(i, pos, size, ncap) {
    441 			re.put(m)
    442 			return nil
    443 		}
    444 	} else {
    445 		m.init(ncap)
    446 		if !m.match(i, pos) {
    447 			re.put(m)
    448 			return nil
    449 		}
    450 	}
    451 	dstCap = append(dstCap, m.matchcap...)
    452 	if dstCap == nil {
    453 		// Keep the promise of returning non-nil value on match.
    454 		dstCap = arrayNoInts[:0]
    455 	}
    456 	re.put(m)
    457 	return dstCap
    458 }
    459 
    460 // arrayNoInts is returned by doExecute match if nil dstCap is passed
    461 // to it with ncap=0.
    462 var arrayNoInts [0]int
    463