Home | History | Annotate | Download | only in regexp
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // backtrack is a regular expression search with submatch
      6 // tracking for small regular expressions and texts. It allocates
      7 // a bit vector with (length of input) * (length of prog) bits,
      8 // to make sure it never explores the same (character position, instruction)
      9 // state multiple times. This limits the search to run in time linear in
     10 // the length of the test.
     11 //
     12 // backtrack is a fast replacement for the NFA code on small
     13 // regexps when onepass cannot be used.
     14 
     15 package regexp
     16 
     17 import "regexp/syntax"
     18 
     19 // A job is an entry on the backtracker's job stack. It holds
     20 // the instruction pc and the position in the input.
     21 type job struct {
     22 	pc  uint32
     23 	arg bool
     24 	pos int
     25 }
     26 
     27 const (
     28 	visitedBits        = 32
     29 	maxBacktrackProg   = 500        // len(prog.Inst) <= max
     30 	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
     31 )
     32 
     33 // bitState holds state for the backtracker.
     34 type bitState struct {
     35 	prog *syntax.Prog
     36 
     37 	end     int
     38 	cap     []int
     39 	jobs    []job
     40 	visited []uint32
     41 }
     42 
     43 var notBacktrack *bitState = nil
     44 
     45 // maxBitStateLen returns the maximum length of a string to search with
     46 // the backtracker using prog.
     47 func maxBitStateLen(prog *syntax.Prog) int {
     48 	if !shouldBacktrack(prog) {
     49 		return 0
     50 	}
     51 	return maxBacktrackVector / len(prog.Inst)
     52 }
     53 
     54 // newBitState returns a new bitState for the given prog,
     55 // or notBacktrack if the size of the prog exceeds the maximum size that
     56 // the backtracker will be run for.
     57 func newBitState(prog *syntax.Prog) *bitState {
     58 	if !shouldBacktrack(prog) {
     59 		return notBacktrack
     60 	}
     61 	return &bitState{
     62 		prog: prog,
     63 	}
     64 }
     65 
     66 // shouldBacktrack reports whether the program is too
     67 // long for the backtracker to run.
     68 func shouldBacktrack(prog *syntax.Prog) bool {
     69 	return len(prog.Inst) <= maxBacktrackProg
     70 }
     71 
     72 // reset resets the state of the backtracker.
     73 // end is the end position in the input.
     74 // ncap is the number of captures.
     75 func (b *bitState) reset(end int, ncap int) {
     76 	b.end = end
     77 
     78 	if cap(b.jobs) == 0 {
     79 		b.jobs = make([]job, 0, 256)
     80 	} else {
     81 		b.jobs = b.jobs[:0]
     82 	}
     83 
     84 	visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
     85 	if cap(b.visited) < visitedSize {
     86 		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
     87 	} else {
     88 		b.visited = b.visited[:visitedSize]
     89 		for i := range b.visited {
     90 			b.visited[i] = 0
     91 		}
     92 	}
     93 
     94 	if cap(b.cap) < ncap {
     95 		b.cap = make([]int, ncap)
     96 	} else {
     97 		b.cap = b.cap[:ncap]
     98 	}
     99 	for i := range b.cap {
    100 		b.cap[i] = -1
    101 	}
    102 }
    103 
    104 // shouldVisit reports whether the combination of (pc, pos) has not
    105 // been visited yet.
    106 func (b *bitState) shouldVisit(pc uint32, pos int) bool {
    107 	n := uint(int(pc)*(b.end+1) + pos)
    108 	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
    109 		return false
    110 	}
    111 	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
    112 	return true
    113 }
    114 
    115 // push pushes (pc, pos, arg) onto the job stack if it should be
    116 // visited.
    117 func (b *bitState) push(pc uint32, pos int, arg bool) {
    118 	// Only check shouldVisit when arg is false.
    119 	// When arg is true, we are continuing a previous visit.
    120 	if b.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
    121 		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
    122 	}
    123 }
    124 
    125 // tryBacktrack runs a backtracking search starting at pos.
    126 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
    127 	longest := m.re.longest
    128 	m.matched = false
    129 
    130 	b.push(pc, pos, false)
    131 	for len(b.jobs) > 0 {
    132 		l := len(b.jobs) - 1
    133 		// Pop job off the stack.
    134 		pc := b.jobs[l].pc
    135 		pos := b.jobs[l].pos
    136 		arg := b.jobs[l].arg
    137 		b.jobs = b.jobs[:l]
    138 
    139 		// Optimization: rather than push and pop,
    140 		// code that is going to Push and continue
    141 		// the loop simply updates ip, p, and arg
    142 		// and jumps to CheckAndLoop. We have to
    143 		// do the ShouldVisit check that Push
    144 		// would have, but we avoid the stack
    145 		// manipulation.
    146 		goto Skip
    147 	CheckAndLoop:
    148 		if !b.shouldVisit(pc, pos) {
    149 			continue
    150 		}
    151 	Skip:
    152 
    153 		inst := b.prog.Inst[pc]
    154 
    155 		switch inst.Op {
    156 		default:
    157 			panic("bad inst")
    158 		case syntax.InstFail:
    159 			panic("unexpected InstFail")
    160 		case syntax.InstAlt:
    161 			// Cannot just
    162 			//   b.push(inst.Out, pos, false)
    163 			//   b.push(inst.Arg, pos, false)
    164 			// If during the processing of inst.Out, we encounter
    165 			// inst.Arg via another path, we want to process it then.
    166 			// Pushing it here will inhibit that. Instead, re-push
    167 			// inst with arg==true as a reminder to push inst.Arg out
    168 			// later.
    169 			if arg {
    170 				// Finished inst.Out; try inst.Arg.
    171 				arg = false
    172 				pc = inst.Arg
    173 				goto CheckAndLoop
    174 			} else {
    175 				b.push(pc, pos, true)
    176 				pc = inst.Out
    177 				goto CheckAndLoop
    178 			}
    179 
    180 		case syntax.InstAltMatch:
    181 			// One opcode consumes runes; the other leads to match.
    182 			switch b.prog.Inst[inst.Out].Op {
    183 			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    184 				// inst.Arg is the match.
    185 				b.push(inst.Arg, pos, false)
    186 				pc = inst.Arg
    187 				pos = b.end
    188 				goto CheckAndLoop
    189 			}
    190 			// inst.Out is the match - non-greedy
    191 			b.push(inst.Out, b.end, false)
    192 			pc = inst.Out
    193 			goto CheckAndLoop
    194 
    195 		case syntax.InstRune:
    196 			r, width := i.step(pos)
    197 			if !inst.MatchRune(r) {
    198 				continue
    199 			}
    200 			pos += width
    201 			pc = inst.Out
    202 			goto CheckAndLoop
    203 
    204 		case syntax.InstRune1:
    205 			r, width := i.step(pos)
    206 			if r != inst.Rune[0] {
    207 				continue
    208 			}
    209 			pos += width
    210 			pc = inst.Out
    211 			goto CheckAndLoop
    212 
    213 		case syntax.InstRuneAnyNotNL:
    214 			r, width := i.step(pos)
    215 			if r == '\n' || r == endOfText {
    216 				continue
    217 			}
    218 			pos += width
    219 			pc = inst.Out
    220 			goto CheckAndLoop
    221 
    222 		case syntax.InstRuneAny:
    223 			r, width := i.step(pos)
    224 			if r == endOfText {
    225 				continue
    226 			}
    227 			pos += width
    228 			pc = inst.Out
    229 			goto CheckAndLoop
    230 
    231 		case syntax.InstCapture:
    232 			if arg {
    233 				// Finished inst.Out; restore the old value.
    234 				b.cap[inst.Arg] = pos
    235 				continue
    236 			} else {
    237 				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
    238 					// Capture pos to register, but save old value.
    239 					b.push(pc, b.cap[inst.Arg], true) // come back when we're done.
    240 					b.cap[inst.Arg] = pos
    241 				}
    242 				pc = inst.Out
    243 				goto CheckAndLoop
    244 			}
    245 
    246 		case syntax.InstEmptyWidth:
    247 			if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
    248 				continue
    249 			}
    250 			pc = inst.Out
    251 			goto CheckAndLoop
    252 
    253 		case syntax.InstNop:
    254 			pc = inst.Out
    255 			goto CheckAndLoop
    256 
    257 		case syntax.InstMatch:
    258 			// We found a match. If the caller doesn't care
    259 			// where the match is, no point going further.
    260 			if len(b.cap) == 0 {
    261 				m.matched = true
    262 				return m.matched
    263 			}
    264 
    265 			// Record best match so far.
    266 			// Only need to check end point, because this entire
    267 			// call is only considering one start position.
    268 			if len(b.cap) > 1 {
    269 				b.cap[1] = pos
    270 			}
    271 			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
    272 				copy(m.matchcap, b.cap)
    273 			}
    274 			m.matched = true
    275 
    276 			// If going for first match, we're done.
    277 			if !longest {
    278 				return m.matched
    279 			}
    280 
    281 			// If we used the entire text, no longer match is possible.
    282 			if pos == b.end {
    283 				return m.matched
    284 			}
    285 
    286 			// Otherwise, continue on in hope of a longer match.
    287 			continue
    288 		}
    289 	}
    290 
    291 	return m.matched
    292 }
    293 
    294 // backtrack runs a backtracking search of prog on the input starting at pos.
    295 func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
    296 	if !i.canCheckPrefix() {
    297 		panic("backtrack called for a RuneReader")
    298 	}
    299 
    300 	startCond := m.re.cond
    301 	if startCond == ^syntax.EmptyOp(0) { // impossible
    302 		return false
    303 	}
    304 	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
    305 		// Anchored match, past beginning of text.
    306 		return false
    307 	}
    308 
    309 	b := m.b
    310 	b.reset(end, ncap)
    311 
    312 	m.matchcap = m.matchcap[:ncap]
    313 	for i := range m.matchcap {
    314 		m.matchcap[i] = -1
    315 	}
    316 
    317 	// Anchored search must start at the beginning of the input
    318 	if startCond&syntax.EmptyBeginText != 0 {
    319 		if len(b.cap) > 0 {
    320 			b.cap[0] = pos
    321 		}
    322 		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
    323 	}
    324 
    325 	// Unanchored search, starting from each possible text position.
    326 	// Notice that we have to try the empty string at the end of
    327 	// the text, so the loop condition is pos <= end, not pos < end.
    328 	// This looks like it's quadratic in the size of the text,
    329 	// but we are not clearing visited between calls to TrySearch,
    330 	// so no work is duplicated and it ends up still being linear.
    331 	width := -1
    332 	for ; pos <= end && width != 0; pos += width {
    333 		if len(m.re.prefix) > 0 {
    334 			// Match requires literal prefix; fast search for it.
    335 			advance := i.index(m.re, pos)
    336 			if advance < 0 {
    337 				return false
    338 			}
    339 			pos += advance
    340 		}
    341 
    342 		if len(b.cap) > 0 {
    343 			b.cap[0] = pos
    344 		}
    345 		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
    346 			// Match must be leftmost; done.
    347 			return true
    348 		}
    349 		_, width = i.step(pos)
    350 	}
    351 	return false
    352 }
    353