Home | History | Annotate | Download | only in regexp
      1 // Copyright 2014 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package regexp
      6 
      7 import (
      8 	"bytes"
      9 	"regexp/syntax"
     10 	"sort"
     11 	"unicode"
     12 )
     13 
     14 // "One-pass" regexp execution.
     15 // Some regexps can be analyzed to determine that they never need
     16 // backtracking: they are guaranteed to run in one pass over the string
     17 // without bothering to save all the usual NFA state.
     18 // Detect those and execute them more quickly.
     19 
     20 // A onePassProg is a compiled one-pass regular expression program.
     21 // It is the same as syntax.Prog except for the use of onePassInst.
     22 type onePassProg struct {
     23 	Inst   []onePassInst
     24 	Start  int // index of start instruction
     25 	NumCap int // number of InstCapture insts in re
     26 }
     27 
     28 // A onePassInst is a single instruction in a one-pass regular expression program.
     29 // It is the same as syntax.Inst except for the new 'Next' field.
     30 type onePassInst struct {
     31 	syntax.Inst
     32 	Next []uint32
     33 }
     34 
     35 // OnePassPrefix returns a literal string that all matches for the
     36 // regexp must start with. Complete is true if the prefix
     37 // is the entire match. Pc is the index of the last rune instruction
     38 // in the string. The OnePassPrefix skips over the mandatory
     39 // EmptyBeginText
     40 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
     41 	i := &p.Inst[p.Start]
     42 	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
     43 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
     44 	}
     45 	pc = i.Out
     46 	i = &p.Inst[pc]
     47 	for i.Op == syntax.InstNop {
     48 		pc = i.Out
     49 		i = &p.Inst[pc]
     50 	}
     51 	// Avoid allocation of buffer if prefix is empty.
     52 	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
     53 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
     54 	}
     55 
     56 	// Have prefix; gather characters.
     57 	var buf bytes.Buffer
     58 	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
     59 		buf.WriteRune(i.Rune[0])
     60 		pc, i = i.Out, &p.Inst[i.Out]
     61 	}
     62 	if i.Op == syntax.InstEmptyWidth &&
     63 		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
     64 		p.Inst[i.Out].Op == syntax.InstMatch {
     65 		complete = true
     66 	}
     67 	return buf.String(), complete, pc
     68 }
     69 
     70 // OnePassNext selects the next actionable state of the prog, based on the input character.
     71 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
     72 // One of the alternates may ultimately lead without input to end of line. If the instruction
     73 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
     74 func onePassNext(i *onePassInst, r rune) uint32 {
     75 	next := i.MatchRunePos(r)
     76 	if next >= 0 {
     77 		return i.Next[next]
     78 	}
     79 	if i.Op == syntax.InstAltMatch {
     80 		return i.Out
     81 	}
     82 	return 0
     83 }
     84 
     85 func iop(i *syntax.Inst) syntax.InstOp {
     86 	op := i.Op
     87 	switch op {
     88 	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
     89 		op = syntax.InstRune
     90 	}
     91 	return op
     92 }
     93 
     94 // Sparse Array implementation is used as a queueOnePass.
     95 type queueOnePass struct {
     96 	sparse          []uint32
     97 	dense           []uint32
     98 	size, nextIndex uint32
     99 }
    100 
    101 func (q *queueOnePass) empty() bool {
    102 	return q.nextIndex >= q.size
    103 }
    104 
    105 func (q *queueOnePass) next() (n uint32) {
    106 	n = q.dense[q.nextIndex]
    107 	q.nextIndex++
    108 	return
    109 }
    110 
    111 func (q *queueOnePass) clear() {
    112 	q.size = 0
    113 	q.nextIndex = 0
    114 }
    115 
    116 func (q *queueOnePass) contains(u uint32) bool {
    117 	if u >= uint32(len(q.sparse)) {
    118 		return false
    119 	}
    120 	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
    121 }
    122 
    123 func (q *queueOnePass) insert(u uint32) {
    124 	if !q.contains(u) {
    125 		q.insertNew(u)
    126 	}
    127 }
    128 
    129 func (q *queueOnePass) insertNew(u uint32) {
    130 	if u >= uint32(len(q.sparse)) {
    131 		return
    132 	}
    133 	q.sparse[u] = q.size
    134 	q.dense[q.size] = u
    135 	q.size++
    136 }
    137 
    138 func newQueue(size int) (q *queueOnePass) {
    139 	return &queueOnePass{
    140 		sparse: make([]uint32, size),
    141 		dense:  make([]uint32, size),
    142 	}
    143 }
    144 
    145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
    146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
    147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
    148 // NextIp array with the single element mergeFailed is returned.
    149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
    150 const mergeFailed = uint32(0xffffffff)
    151 
    152 var (
    153 	noRune = []rune{}
    154 	noNext = []uint32{mergeFailed}
    155 )
    156 
    157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
    158 	leftLen := len(*leftRunes)
    159 	rightLen := len(*rightRunes)
    160 	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
    161 		panic("mergeRuneSets odd length []rune")
    162 	}
    163 	var (
    164 		lx, rx int
    165 	)
    166 	merged := make([]rune, 0)
    167 	next := make([]uint32, 0)
    168 	ok := true
    169 	defer func() {
    170 		if !ok {
    171 			merged = nil
    172 			next = nil
    173 		}
    174 	}()
    175 
    176 	ix := -1
    177 	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
    178 		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
    179 			return false
    180 		}
    181 		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
    182 		*newLow += 2
    183 		ix += 2
    184 		next = append(next, pc)
    185 		return true
    186 	}
    187 
    188 	for lx < leftLen || rx < rightLen {
    189 		switch {
    190 		case rx >= rightLen:
    191 			ok = extend(&lx, leftRunes, leftPC)
    192 		case lx >= leftLen:
    193 			ok = extend(&rx, rightRunes, rightPC)
    194 		case (*rightRunes)[rx] < (*leftRunes)[lx]:
    195 			ok = extend(&rx, rightRunes, rightPC)
    196 		default:
    197 			ok = extend(&lx, leftRunes, leftPC)
    198 		}
    199 		if !ok {
    200 			return noRune, noNext
    201 		}
    202 	}
    203 	return merged, next
    204 }
    205 
    206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
    207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
    208 	for ix, instOriginal := range original.Inst {
    209 		switch instOriginal.Op {
    210 		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
    211 		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
    212 			prog.Inst[ix].Next = nil
    213 		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    214 			prog.Inst[ix].Next = nil
    215 			prog.Inst[ix] = onePassInst{Inst: instOriginal}
    216 		}
    217 	}
    218 }
    219 
    220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
    221 func onePassCopy(prog *syntax.Prog) *onePassProg {
    222 	p := &onePassProg{
    223 		Start:  prog.Start,
    224 		NumCap: prog.NumCap,
    225 		Inst:   make([]onePassInst, len(prog.Inst)),
    226 	}
    227 	for i, inst := range prog.Inst {
    228 		p.Inst[i] = onePassInst{Inst: inst}
    229 	}
    230 
    231 	// rewrites one or more common Prog constructs that enable some otherwise
    232 	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
    233 	// ip A, that points to ips B & C.
    234 	// A:BC + B:DA => A:BC + B:CD
    235 	// A:BC + B:DC => A:DC + B:DC
    236 	for pc := range p.Inst {
    237 		switch p.Inst[pc].Op {
    238 		default:
    239 			continue
    240 		case syntax.InstAlt, syntax.InstAltMatch:
    241 			// A:Bx + B:Ay
    242 			p_A_Other := &p.Inst[pc].Out
    243 			p_A_Alt := &p.Inst[pc].Arg
    244 			// make sure a target is another Alt
    245 			instAlt := p.Inst[*p_A_Alt]
    246 			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
    247 				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
    248 				instAlt = p.Inst[*p_A_Alt]
    249 				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
    250 					continue
    251 				}
    252 			}
    253 			instOther := p.Inst[*p_A_Other]
    254 			// Analyzing both legs pointing to Alts is for another day
    255 			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
    256 				// too complicated
    257 				continue
    258 			}
    259 			// simple empty transition loop
    260 			// A:BC + B:DA => A:BC + B:DC
    261 			p_B_Alt := &p.Inst[*p_A_Alt].Out
    262 			p_B_Other := &p.Inst[*p_A_Alt].Arg
    263 			patch := false
    264 			if instAlt.Out == uint32(pc) {
    265 				patch = true
    266 			} else if instAlt.Arg == uint32(pc) {
    267 				patch = true
    268 				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
    269 			}
    270 			if patch {
    271 				*p_B_Alt = *p_A_Other
    272 			}
    273 
    274 			// empty transition to common target
    275 			// A:BC + B:DC => A:DC + B:DC
    276 			if *p_A_Other == *p_B_Alt {
    277 				*p_A_Alt = *p_B_Other
    278 			}
    279 		}
    280 	}
    281 	return p
    282 }
    283 
    284 // runeSlice exists to permit sorting the case-folded rune sets.
    285 type runeSlice []rune
    286 
    287 func (p runeSlice) Len() int           { return len(p) }
    288 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
    289 func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
    290 
    291 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
    292 var anyRune = []rune{0, unicode.MaxRune}
    293 
    294 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
    295 // the match engine can always tell which branch to take. The routine may modify
    296 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
    297 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
    298 // to the size of the Prog.
    299 func makeOnePass(p *onePassProg) *onePassProg {
    300 	// If the machine is very long, it's not worth the time to check if we can use one pass.
    301 	if len(p.Inst) >= 1000 {
    302 		return notOnePass
    303 	}
    304 
    305 	var (
    306 		instQueue    = newQueue(len(p.Inst))
    307 		visitQueue   = newQueue(len(p.Inst))
    308 		check        func(uint32, []bool) bool
    309 		onePassRunes = make([][]rune, len(p.Inst))
    310 	)
    311 
    312 	// check that paths from Alt instructions are unambiguous, and rebuild the new
    313 	// program as a onepass program
    314 	check = func(pc uint32, m []bool) (ok bool) {
    315 		ok = true
    316 		inst := &p.Inst[pc]
    317 		if visitQueue.contains(pc) {
    318 			return
    319 		}
    320 		visitQueue.insert(pc)
    321 		switch inst.Op {
    322 		case syntax.InstAlt, syntax.InstAltMatch:
    323 			ok = check(inst.Out, m) && check(inst.Arg, m)
    324 			// check no-input paths to InstMatch
    325 			matchOut := m[inst.Out]
    326 			matchArg := m[inst.Arg]
    327 			if matchOut && matchArg {
    328 				ok = false
    329 				break
    330 			}
    331 			// Match on empty goes in inst.Out
    332 			if matchArg {
    333 				inst.Out, inst.Arg = inst.Arg, inst.Out
    334 				matchOut, matchArg = matchArg, matchOut
    335 			}
    336 			if matchOut {
    337 				m[pc] = true
    338 				inst.Op = syntax.InstAltMatch
    339 			}
    340 
    341 			// build a dispatch operator from the two legs of the alt.
    342 			onePassRunes[pc], inst.Next = mergeRuneSets(
    343 				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
    344 			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
    345 				ok = false
    346 				break
    347 			}
    348 		case syntax.InstCapture, syntax.InstNop:
    349 			ok = check(inst.Out, m)
    350 			m[pc] = m[inst.Out]
    351 			// pass matching runes back through these no-ops.
    352 			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
    353 			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
    354 			for i := range inst.Next {
    355 				inst.Next[i] = inst.Out
    356 			}
    357 		case syntax.InstEmptyWidth:
    358 			ok = check(inst.Out, m)
    359 			m[pc] = m[inst.Out]
    360 			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
    361 			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
    362 			for i := range inst.Next {
    363 				inst.Next[i] = inst.Out
    364 			}
    365 		case syntax.InstMatch, syntax.InstFail:
    366 			m[pc] = inst.Op == syntax.InstMatch
    367 		case syntax.InstRune:
    368 			m[pc] = false
    369 			if len(inst.Next) > 0 {
    370 				break
    371 			}
    372 			instQueue.insert(inst.Out)
    373 			if len(inst.Rune) == 0 {
    374 				onePassRunes[pc] = []rune{}
    375 				inst.Next = []uint32{inst.Out}
    376 				break
    377 			}
    378 			runes := make([]rune, 0)
    379 			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
    380 				r0 := inst.Rune[0]
    381 				runes = append(runes, r0, r0)
    382 				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
    383 					runes = append(runes, r1, r1)
    384 				}
    385 				sort.Sort(runeSlice(runes))
    386 			} else {
    387 				runes = append(runes, inst.Rune...)
    388 			}
    389 			onePassRunes[pc] = runes
    390 			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
    391 			for i := range inst.Next {
    392 				inst.Next[i] = inst.Out
    393 			}
    394 			inst.Op = syntax.InstRune
    395 		case syntax.InstRune1:
    396 			m[pc] = false
    397 			if len(inst.Next) > 0 {
    398 				break
    399 			}
    400 			instQueue.insert(inst.Out)
    401 			runes := []rune{}
    402 			// expand case-folded runes
    403 			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
    404 				r0 := inst.Rune[0]
    405 				runes = append(runes, r0, r0)
    406 				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
    407 					runes = append(runes, r1, r1)
    408 				}
    409 				sort.Sort(runeSlice(runes))
    410 			} else {
    411 				runes = append(runes, inst.Rune[0], inst.Rune[0])
    412 			}
    413 			onePassRunes[pc] = runes
    414 			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
    415 			for i := range inst.Next {
    416 				inst.Next[i] = inst.Out
    417 			}
    418 			inst.Op = syntax.InstRune
    419 		case syntax.InstRuneAny:
    420 			m[pc] = false
    421 			if len(inst.Next) > 0 {
    422 				break
    423 			}
    424 			instQueue.insert(inst.Out)
    425 			onePassRunes[pc] = append([]rune{}, anyRune...)
    426 			inst.Next = []uint32{inst.Out}
    427 		case syntax.InstRuneAnyNotNL:
    428 			m[pc] = false
    429 			if len(inst.Next) > 0 {
    430 				break
    431 			}
    432 			instQueue.insert(inst.Out)
    433 			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
    434 			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
    435 			for i := range inst.Next {
    436 				inst.Next[i] = inst.Out
    437 			}
    438 		}
    439 		return
    440 	}
    441 
    442 	instQueue.clear()
    443 	instQueue.insert(uint32(p.Start))
    444 	m := make([]bool, len(p.Inst))
    445 	for !instQueue.empty() {
    446 		visitQueue.clear()
    447 		pc := instQueue.next()
    448 		if !check(pc, m) {
    449 			p = notOnePass
    450 			break
    451 		}
    452 	}
    453 	if p != notOnePass {
    454 		for i := range p.Inst {
    455 			p.Inst[i].Rune = onePassRunes[i]
    456 		}
    457 	}
    458 	return p
    459 }
    460 
    461 var notOnePass *onePassProg = nil
    462 
    463 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
    464 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
    465 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
    466 // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
    467 func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
    468 	if prog.Start == 0 {
    469 		return notOnePass
    470 	}
    471 	// onepass regexp is anchored
    472 	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
    473 		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
    474 		return notOnePass
    475 	}
    476 	// every instruction leading to InstMatch must be EmptyEndText
    477 	for _, inst := range prog.Inst {
    478 		opOut := prog.Inst[inst.Out].Op
    479 		switch inst.Op {
    480 		default:
    481 			if opOut == syntax.InstMatch {
    482 				return notOnePass
    483 			}
    484 		case syntax.InstAlt, syntax.InstAltMatch:
    485 			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
    486 				return notOnePass
    487 			}
    488 		case syntax.InstEmptyWidth:
    489 			if opOut == syntax.InstMatch {
    490 				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
    491 					continue
    492 				}
    493 				return notOnePass
    494 			}
    495 		}
    496 	}
    497 	// Creates a slightly optimized copy of the original Prog
    498 	// that cleans up some Prog idioms that block valid onepass programs
    499 	p = onePassCopy(prog)
    500 
    501 	// checkAmbiguity on InstAlts, build onepass Prog if possible
    502 	p = makeOnePass(p)
    503 
    504 	if p != notOnePass {
    505 		cleanupOnePass(p, prog)
    506 	}
    507 	return p
    508 }
    509