Home | History | Annotate | Download | only in syntax
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package syntax
      6 
      7 import (
      8 	"bytes"
      9 	"strconv"
     10 	"unicode"
     11 )
     12 
     13 // Compiled program.
     14 // May not belong in this package, but convenient for now.
     15 
     16 // A Prog is a compiled regular expression program.
     17 type Prog struct {
     18 	Inst   []Inst
     19 	Start  int // index of start instruction
     20 	NumCap int // number of InstCapture insts in re
     21 }
     22 
     23 // An InstOp is an instruction opcode.
     24 type InstOp uint8
     25 
     26 const (
     27 	InstAlt InstOp = iota
     28 	InstAltMatch
     29 	InstCapture
     30 	InstEmptyWidth
     31 	InstMatch
     32 	InstFail
     33 	InstNop
     34 	InstRune
     35 	InstRune1
     36 	InstRuneAny
     37 	InstRuneAnyNotNL
     38 )
     39 
     40 var instOpNames = []string{
     41 	"InstAlt",
     42 	"InstAltMatch",
     43 	"InstCapture",
     44 	"InstEmptyWidth",
     45 	"InstMatch",
     46 	"InstFail",
     47 	"InstNop",
     48 	"InstRune",
     49 	"InstRune1",
     50 	"InstRuneAny",
     51 	"InstRuneAnyNotNL",
     52 }
     53 
     54 func (i InstOp) String() string {
     55 	if uint(i) >= uint(len(instOpNames)) {
     56 		return ""
     57 	}
     58 	return instOpNames[i]
     59 }
     60 
     61 // An EmptyOp specifies a kind or mixture of zero-width assertions.
     62 type EmptyOp uint8
     63 
     64 const (
     65 	EmptyBeginLine EmptyOp = 1 << iota
     66 	EmptyEndLine
     67 	EmptyBeginText
     68 	EmptyEndText
     69 	EmptyWordBoundary
     70 	EmptyNoWordBoundary
     71 )
     72 
     73 // EmptyOpContext returns the zero-width assertions
     74 // satisfied at the position between the runes r1 and r2.
     75 // Passing r1 == -1 indicates that the position is
     76 // at the beginning of the text.
     77 // Passing r2 == -1 indicates that the position is
     78 // at the end of the text.
     79 func EmptyOpContext(r1, r2 rune) EmptyOp {
     80 	var op EmptyOp = EmptyNoWordBoundary
     81 	var boundary byte
     82 	switch {
     83 	case IsWordChar(r1):
     84 		boundary = 1
     85 	case r1 == '\n':
     86 		op |= EmptyBeginLine
     87 	case r1 < 0:
     88 		op |= EmptyBeginText | EmptyBeginLine
     89 	}
     90 	switch {
     91 	case IsWordChar(r2):
     92 		boundary ^= 1
     93 	case r2 == '\n':
     94 		op |= EmptyEndLine
     95 	case r2 < 0:
     96 		op |= EmptyEndText | EmptyEndLine
     97 	}
     98 	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
     99 		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
    100 	}
    101 	return op
    102 }
    103 
    104 // IsWordChar reports whether r is consider a ``word character''
    105 // during the evaluation of the \b and \B zero-width assertions.
    106 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
    107 func IsWordChar(r rune) bool {
    108 	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
    109 }
    110 
    111 // An Inst is a single instruction in a regular expression program.
    112 type Inst struct {
    113 	Op   InstOp
    114 	Out  uint32 // all but InstMatch, InstFail
    115 	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
    116 	Rune []rune
    117 }
    118 
    119 func (p *Prog) String() string {
    120 	var b bytes.Buffer
    121 	dumpProg(&b, p)
    122 	return b.String()
    123 }
    124 
    125 // skipNop follows any no-op or capturing instructions
    126 // and returns the resulting pc.
    127 func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
    128 	i := &p.Inst[pc]
    129 	for i.Op == InstNop || i.Op == InstCapture {
    130 		pc = i.Out
    131 		i = &p.Inst[pc]
    132 	}
    133 	return i, pc
    134 }
    135 
    136 // op returns i.Op but merges all the Rune special cases into InstRune
    137 func (i *Inst) op() InstOp {
    138 	op := i.Op
    139 	switch op {
    140 	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
    141 		op = InstRune
    142 	}
    143 	return op
    144 }
    145 
    146 // Prefix returns a literal string that all matches for the
    147 // regexp must start with. Complete is true if the prefix
    148 // is the entire match.
    149 func (p *Prog) Prefix() (prefix string, complete bool) {
    150 	i, _ := p.skipNop(uint32(p.Start))
    151 
    152 	// Avoid allocation of buffer if prefix is empty.
    153 	if i.op() != InstRune || len(i.Rune) != 1 {
    154 		return "", i.Op == InstMatch
    155 	}
    156 
    157 	// Have prefix; gather characters.
    158 	var buf bytes.Buffer
    159 	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
    160 		buf.WriteRune(i.Rune[0])
    161 		i, _ = p.skipNop(i.Out)
    162 	}
    163 	return buf.String(), i.Op == InstMatch
    164 }
    165 
    166 // StartCond returns the leading empty-width conditions that must
    167 // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
    168 func (p *Prog) StartCond() EmptyOp {
    169 	var flag EmptyOp
    170 	pc := uint32(p.Start)
    171 	i := &p.Inst[pc]
    172 Loop:
    173 	for {
    174 		switch i.Op {
    175 		case InstEmptyWidth:
    176 			flag |= EmptyOp(i.Arg)
    177 		case InstFail:
    178 			return ^EmptyOp(0)
    179 		case InstCapture, InstNop:
    180 			// skip
    181 		default:
    182 			break Loop
    183 		}
    184 		pc = i.Out
    185 		i = &p.Inst[pc]
    186 	}
    187 	return flag
    188 }
    189 
    190 const noMatch = -1
    191 
    192 // MatchRune reports whether the instruction matches (and consumes) r.
    193 // It should only be called when i.Op == InstRune.
    194 func (i *Inst) MatchRune(r rune) bool {
    195 	return i.MatchRunePos(r) != noMatch
    196 }
    197 
    198 // MatchRunePos checks whether the instruction matches (and consumes) r.
    199 // If so, MatchRunePos returns the index of the matching rune pair
    200 // (or, when len(i.Rune) == 1, rune singleton).
    201 // If not, MatchRunePos returns -1.
    202 // MatchRunePos should only be called when i.Op == InstRune.
    203 func (i *Inst) MatchRunePos(r rune) int {
    204 	rune := i.Rune
    205 
    206 	// Special case: single-rune slice is from literal string, not char class.
    207 	if len(rune) == 1 {
    208 		r0 := rune[0]
    209 		if r == r0 {
    210 			return 0
    211 		}
    212 		if Flags(i.Arg)&FoldCase != 0 {
    213 			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
    214 				if r == r1 {
    215 					return 0
    216 				}
    217 			}
    218 		}
    219 		return noMatch
    220 	}
    221 
    222 	// Peek at the first few pairs.
    223 	// Should handle ASCII well.
    224 	for j := 0; j < len(rune) && j <= 8; j += 2 {
    225 		if r < rune[j] {
    226 			return noMatch
    227 		}
    228 		if r <= rune[j+1] {
    229 			return j / 2
    230 		}
    231 	}
    232 
    233 	// Otherwise binary search.
    234 	lo := 0
    235 	hi := len(rune) / 2
    236 	for lo < hi {
    237 		m := lo + (hi-lo)/2
    238 		if c := rune[2*m]; c <= r {
    239 			if r <= rune[2*m+1] {
    240 				return m
    241 			}
    242 			lo = m + 1
    243 		} else {
    244 			hi = m
    245 		}
    246 	}
    247 	return noMatch
    248 }
    249 
    250 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
    251 // Since we act on runes, it would be easy to support Unicode here.
    252 func wordRune(r rune) bool {
    253 	return r == '_' ||
    254 		('A' <= r && r <= 'Z') ||
    255 		('a' <= r && r <= 'z') ||
    256 		('0' <= r && r <= '9')
    257 }
    258 
    259 // MatchEmptyWidth reports whether the instruction matches
    260 // an empty string between the runes before and after.
    261 // It should only be called when i.Op == InstEmptyWidth.
    262 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
    263 	switch EmptyOp(i.Arg) {
    264 	case EmptyBeginLine:
    265 		return before == '\n' || before == -1
    266 	case EmptyEndLine:
    267 		return after == '\n' || after == -1
    268 	case EmptyBeginText:
    269 		return before == -1
    270 	case EmptyEndText:
    271 		return after == -1
    272 	case EmptyWordBoundary:
    273 		return wordRune(before) != wordRune(after)
    274 	case EmptyNoWordBoundary:
    275 		return wordRune(before) == wordRune(after)
    276 	}
    277 	panic("unknown empty width arg")
    278 }
    279 
    280 func (i *Inst) String() string {
    281 	var b bytes.Buffer
    282 	dumpInst(&b, i)
    283 	return b.String()
    284 }
    285 
    286 func bw(b *bytes.Buffer, args ...string) {
    287 	for _, s := range args {
    288 		b.WriteString(s)
    289 	}
    290 }
    291 
    292 func dumpProg(b *bytes.Buffer, p *Prog) {
    293 	for j := range p.Inst {
    294 		i := &p.Inst[j]
    295 		pc := strconv.Itoa(j)
    296 		if len(pc) < 3 {
    297 			b.WriteString("   "[len(pc):])
    298 		}
    299 		if j == p.Start {
    300 			pc += "*"
    301 		}
    302 		bw(b, pc, "\t")
    303 		dumpInst(b, i)
    304 		bw(b, "\n")
    305 	}
    306 }
    307 
    308 func u32(i uint32) string {
    309 	return strconv.FormatUint(uint64(i), 10)
    310 }
    311 
    312 func dumpInst(b *bytes.Buffer, i *Inst) {
    313 	switch i.Op {
    314 	case InstAlt:
    315 		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
    316 	case InstAltMatch:
    317 		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
    318 	case InstCapture:
    319 		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
    320 	case InstEmptyWidth:
    321 		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
    322 	case InstMatch:
    323 		bw(b, "match")
    324 	case InstFail:
    325 		bw(b, "fail")
    326 	case InstNop:
    327 		bw(b, "nop -> ", u32(i.Out))
    328 	case InstRune:
    329 		if i.Rune == nil {
    330 			// shouldn't happen
    331 			bw(b, "rune <nil>")
    332 		}
    333 		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
    334 		if Flags(i.Arg)&FoldCase != 0 {
    335 			bw(b, "/i")
    336 		}
    337 		bw(b, " -> ", u32(i.Out))
    338 	case InstRune1:
    339 		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
    340 	case InstRuneAny:
    341 		bw(b, "any -> ", u32(i.Out))
    342 	case InstRuneAnyNotNL:
    343 		bw(b, "anynotnl -> ", u32(i.Out))
    344 	}
    345 }
    346