Home | History | Annotate | Download | only in regexp
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // backtrack is a regular expression search with submatch
      6 // tracking for small regular expressions and texts. It allocates
      7 // a bit vector with (length of input) * (length of prog) bits,
      8 // to make sure it never explores the same (character position, instruction)
      9 // state multiple times. This limits the search to run in time linear in
     10 // the length of the test.
     11 //
     12 // backtrack is a fast replacement for the NFA code on small
     13 // regexps when onepass cannot be used.
     14 
     15 package regexp
     16 
     17 import "regexp/syntax"
     18 
     19 // A job is an entry on the backtracker's job stack. It holds
     20 // the instruction pc and the position in the input.
     21 type job struct {
     22 	pc  uint32
     23 	arg int
     24 	pos int
     25 }
     26 
     27 const (
     28 	visitedBits        = 32
     29 	maxBacktrackProg   = 500        // len(prog.Inst) <= max
     30 	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
     31 )
     32 
     33 // bitState holds state for the backtracker.
     34 type bitState struct {
     35 	prog *syntax.Prog
     36 
     37 	end     int
     38 	cap     []int
     39 	jobs    []job
     40 	visited []uint32
     41 }
     42 
     43 var notBacktrack *bitState = nil
     44 
     45 // maxBitStateLen returns the maximum length of a string to search with
     46 // the backtracker using prog.
     47 func maxBitStateLen(prog *syntax.Prog) int {
     48 	if !shouldBacktrack(prog) {
     49 		return 0
     50 	}
     51 	return maxBacktrackVector / len(prog.Inst)
     52 }
     53 
     54 // newBitState returns a new bitState for the given prog,
     55 // or notBacktrack if the size of the prog exceeds the maximum size that
     56 // the backtracker will be run for.
     57 func newBitState(prog *syntax.Prog) *bitState {
     58 	if !shouldBacktrack(prog) {
     59 		return notBacktrack
     60 	}
     61 	return &bitState{
     62 		prog: prog,
     63 	}
     64 }
     65 
     66 // shouldBacktrack reports whether the program is too
     67 // long for the backtracker to run.
     68 func shouldBacktrack(prog *syntax.Prog) bool {
     69 	return len(prog.Inst) <= maxBacktrackProg
     70 }
     71 
     72 // reset resets the state of the backtracker.
     73 // end is the end position in the input.
     74 // ncap is the number of captures.
     75 func (b *bitState) reset(end int, ncap int) {
     76 	b.end = end
     77 
     78 	if cap(b.jobs) == 0 {
     79 		b.jobs = make([]job, 0, 256)
     80 	} else {
     81 		b.jobs = b.jobs[:0]
     82 	}
     83 
     84 	visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
     85 	if cap(b.visited) < visitedSize {
     86 		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
     87 	} else {
     88 		b.visited = b.visited[:visitedSize]
     89 		for i := range b.visited {
     90 			b.visited[i] = 0
     91 		}
     92 	}
     93 
     94 	if cap(b.cap) < ncap {
     95 		b.cap = make([]int, ncap)
     96 	} else {
     97 		b.cap = b.cap[:ncap]
     98 	}
     99 	for i := range b.cap {
    100 		b.cap[i] = -1
    101 	}
    102 }
    103 
    104 // shouldVisit reports whether the combination of (pc, pos) has not
    105 // been visited yet.
    106 func (b *bitState) shouldVisit(pc uint32, pos int) bool {
    107 	n := uint(int(pc)*(b.end+1) + pos)
    108 	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
    109 		return false
    110 	}
    111 	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
    112 	return true
    113 }
    114 
    115 // push pushes (pc, pos, arg) onto the job stack if it should be
    116 // visited.
    117 func (b *bitState) push(pc uint32, pos int, arg int) {
    118 	if b.prog.Inst[pc].Op == syntax.InstFail {
    119 		return
    120 	}
    121 
    122 	// Only check shouldVisit when arg == 0.
    123 	// When arg > 0, we are continuing a previous visit.
    124 	if arg == 0 && !b.shouldVisit(pc, pos) {
    125 		return
    126 	}
    127 
    128 	b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
    129 }
    130 
    131 // tryBacktrack runs a backtracking search starting at pos.
    132 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
    133 	longest := m.re.longest
    134 	m.matched = false
    135 
    136 	b.push(pc, pos, 0)
    137 	for len(b.jobs) > 0 {
    138 		l := len(b.jobs) - 1
    139 		// Pop job off the stack.
    140 		pc := b.jobs[l].pc
    141 		pos := b.jobs[l].pos
    142 		arg := b.jobs[l].arg
    143 		b.jobs = b.jobs[:l]
    144 
    145 		// Optimization: rather than push and pop,
    146 		// code that is going to Push and continue
    147 		// the loop simply updates ip, p, and arg
    148 		// and jumps to CheckAndLoop. We have to
    149 		// do the ShouldVisit check that Push
    150 		// would have, but we avoid the stack
    151 		// manipulation.
    152 		goto Skip
    153 	CheckAndLoop:
    154 		if !b.shouldVisit(pc, pos) {
    155 			continue
    156 		}
    157 	Skip:
    158 
    159 		inst := b.prog.Inst[pc]
    160 
    161 		switch inst.Op {
    162 		default:
    163 			panic("bad inst")
    164 		case syntax.InstFail:
    165 			panic("unexpected InstFail")
    166 		case syntax.InstAlt:
    167 			// Cannot just
    168 			//   b.push(inst.Out, pos, 0)
    169 			//   b.push(inst.Arg, pos, 0)
    170 			// If during the processing of inst.Out, we encounter
    171 			// inst.Arg via another path, we want to process it then.
    172 			// Pushing it here will inhibit that. Instead, re-push
    173 			// inst with arg==1 as a reminder to push inst.Arg out
    174 			// later.
    175 			switch arg {
    176 			case 0:
    177 				b.push(pc, pos, 1)
    178 				pc = inst.Out
    179 				goto CheckAndLoop
    180 			case 1:
    181 				// Finished inst.Out; try inst.Arg.
    182 				arg = 0
    183 				pc = inst.Arg
    184 				goto CheckAndLoop
    185 			}
    186 			panic("bad arg in InstAlt")
    187 
    188 		case syntax.InstAltMatch:
    189 			// One opcode consumes runes; the other leads to match.
    190 			switch b.prog.Inst[inst.Out].Op {
    191 			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    192 				// inst.Arg is the match.
    193 				b.push(inst.Arg, pos, 0)
    194 				pc = inst.Arg
    195 				pos = b.end
    196 				goto CheckAndLoop
    197 			}
    198 			// inst.Out is the match - non-greedy
    199 			b.push(inst.Out, b.end, 0)
    200 			pc = inst.Out
    201 			goto CheckAndLoop
    202 
    203 		case syntax.InstRune:
    204 			r, width := i.step(pos)
    205 			if !inst.MatchRune(r) {
    206 				continue
    207 			}
    208 			pos += width
    209 			pc = inst.Out
    210 			goto CheckAndLoop
    211 
    212 		case syntax.InstRune1:
    213 			r, width := i.step(pos)
    214 			if r != inst.Rune[0] {
    215 				continue
    216 			}
    217 			pos += width
    218 			pc = inst.Out
    219 			goto CheckAndLoop
    220 
    221 		case syntax.InstRuneAnyNotNL:
    222 			r, width := i.step(pos)
    223 			if r == '\n' || r == endOfText {
    224 				continue
    225 			}
    226 			pos += width
    227 			pc = inst.Out
    228 			goto CheckAndLoop
    229 
    230 		case syntax.InstRuneAny:
    231 			r, width := i.step(pos)
    232 			if r == endOfText {
    233 				continue
    234 			}
    235 			pos += width
    236 			pc = inst.Out
    237 			goto CheckAndLoop
    238 
    239 		case syntax.InstCapture:
    240 			switch arg {
    241 			case 0:
    242 				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
    243 					// Capture pos to register, but save old value.
    244 					b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
    245 					b.cap[inst.Arg] = pos
    246 				}
    247 				pc = inst.Out
    248 				goto CheckAndLoop
    249 			case 1:
    250 				// Finished inst.Out; restore the old value.
    251 				b.cap[inst.Arg] = pos
    252 				continue
    253 
    254 			}
    255 			panic("bad arg in InstCapture")
    256 
    257 		case syntax.InstEmptyWidth:
    258 			if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
    259 				continue
    260 			}
    261 			pc = inst.Out
    262 			goto CheckAndLoop
    263 
    264 		case syntax.InstNop:
    265 			pc = inst.Out
    266 			goto CheckAndLoop
    267 
    268 		case syntax.InstMatch:
    269 			// We found a match. If the caller doesn't care
    270 			// where the match is, no point going further.
    271 			if len(b.cap) == 0 {
    272 				m.matched = true
    273 				return m.matched
    274 			}
    275 
    276 			// Record best match so far.
    277 			// Only need to check end point, because this entire
    278 			// call is only considering one start position.
    279 			if len(b.cap) > 1 {
    280 				b.cap[1] = pos
    281 			}
    282 			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
    283 				copy(m.matchcap, b.cap)
    284 			}
    285 			m.matched = true
    286 
    287 			// If going for first match, we're done.
    288 			if !longest {
    289 				return m.matched
    290 			}
    291 
    292 			// If we used the entire text, no longer match is possible.
    293 			if pos == b.end {
    294 				return m.matched
    295 			}
    296 
    297 			// Otherwise, continue on in hope of a longer match.
    298 			continue
    299 		}
    300 	}
    301 
    302 	return m.matched
    303 }
    304 
    305 // backtrack runs a backtracking search of prog on the input starting at pos.
    306 func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
    307 	if !i.canCheckPrefix() {
    308 		panic("backtrack called for a RuneReader")
    309 	}
    310 
    311 	startCond := m.re.cond
    312 	if startCond == ^syntax.EmptyOp(0) { // impossible
    313 		return false
    314 	}
    315 	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
    316 		// Anchored match, past beginning of text.
    317 		return false
    318 	}
    319 
    320 	b := m.b
    321 	b.reset(end, ncap)
    322 
    323 	m.matchcap = m.matchcap[:ncap]
    324 	for i := range m.matchcap {
    325 		m.matchcap[i] = -1
    326 	}
    327 
    328 	// Anchored search must start at the beginning of the input
    329 	if startCond&syntax.EmptyBeginText != 0 {
    330 		if len(b.cap) > 0 {
    331 			b.cap[0] = pos
    332 		}
    333 		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
    334 	}
    335 
    336 	// Unanchored search, starting from each possible text position.
    337 	// Notice that we have to try the empty string at the end of
    338 	// the text, so the loop condition is pos <= end, not pos < end.
    339 	// This looks like it's quadratic in the size of the text,
    340 	// but we are not clearing visited between calls to TrySearch,
    341 	// so no work is duplicated and it ends up still being linear.
    342 	width := -1
    343 	for ; pos <= end && width != 0; pos += width {
    344 		if len(m.re.prefix) > 0 {
    345 			// Match requires literal prefix; fast search for it.
    346 			advance := i.index(m.re, pos)
    347 			if advance < 0 {
    348 				return false
    349 			}
    350 			pos += advance
    351 		}
    352 
    353 		if len(b.cap) > 0 {
    354 			b.cap[0] = pos
    355 		}
    356 		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
    357 			// Match must be leftmost; done.
    358 			return true
    359 		}
    360 		_, width = i.step(pos)
    361 	}
    362 	return false
    363 }
    364