1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "bytes" 9 "strconv" 10 "unicode" 11 ) 12 13 // Compiled program. 14 // May not belong in this package, but convenient for now. 15 16 // A Prog is a compiled regular expression program. 17 type Prog struct { 18 Inst []Inst 19 Start int // index of start instruction 20 NumCap int // number of InstCapture insts in re 21 } 22 23 // An InstOp is an instruction opcode. 24 type InstOp uint8 25 26 const ( 27 InstAlt InstOp = iota 28 InstAltMatch 29 InstCapture 30 InstEmptyWidth 31 InstMatch 32 InstFail 33 InstNop 34 InstRune 35 InstRune1 36 InstRuneAny 37 InstRuneAnyNotNL 38 ) 39 40 var instOpNames = []string{ 41 "InstAlt", 42 "InstAltMatch", 43 "InstCapture", 44 "InstEmptyWidth", 45 "InstMatch", 46 "InstFail", 47 "InstNop", 48 "InstRune", 49 "InstRune1", 50 "InstRuneAny", 51 "InstRuneAnyNotNL", 52 } 53 54 func (i InstOp) String() string { 55 if uint(i) >= uint(len(instOpNames)) { 56 return "" 57 } 58 return instOpNames[i] 59 } 60 61 // An EmptyOp specifies a kind or mixture of zero-width assertions. 62 type EmptyOp uint8 63 64 const ( 65 EmptyBeginLine EmptyOp = 1 << iota 66 EmptyEndLine 67 EmptyBeginText 68 EmptyEndText 69 EmptyWordBoundary 70 EmptyNoWordBoundary 71 ) 72 73 // EmptyOpContext returns the zero-width assertions 74 // satisfied at the position between the runes r1 and r2. 75 // Passing r1 == -1 indicates that the position is 76 // at the beginning of the text. 77 // Passing r2 == -1 indicates that the position is 78 // at the end of the text. 79 func EmptyOpContext(r1, r2 rune) EmptyOp { 80 var op EmptyOp = EmptyNoWordBoundary 81 var boundary byte 82 switch { 83 case IsWordChar(r1): 84 boundary = 1 85 case r1 == '\n': 86 op |= EmptyBeginLine 87 case r1 < 0: 88 op |= EmptyBeginText | EmptyBeginLine 89 } 90 switch { 91 case IsWordChar(r2): 92 boundary ^= 1 93 case r2 == '\n': 94 op |= EmptyEndLine 95 case r2 < 0: 96 op |= EmptyEndText | EmptyEndLine 97 } 98 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 99 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 100 } 101 return op 102 } 103 104 // IsWordChar reports whether r is consider a ``word character'' 105 // during the evaluation of the \b and \B zero-width assertions. 106 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 107 func IsWordChar(r rune) bool { 108 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 109 } 110 111 // An Inst is a single instruction in a regular expression program. 112 type Inst struct { 113 Op InstOp 114 Out uint32 // all but InstMatch, InstFail 115 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 116 Rune []rune 117 } 118 119 func (p *Prog) String() string { 120 var b bytes.Buffer 121 dumpProg(&b, p) 122 return b.String() 123 } 124 125 // skipNop follows any no-op or capturing instructions 126 // and returns the resulting pc. 127 func (p *Prog) skipNop(pc uint32) (*Inst, uint32) { 128 i := &p.Inst[pc] 129 for i.Op == InstNop || i.Op == InstCapture { 130 pc = i.Out 131 i = &p.Inst[pc] 132 } 133 return i, pc 134 } 135 136 // op returns i.Op but merges all the Rune special cases into InstRune 137 func (i *Inst) op() InstOp { 138 op := i.Op 139 switch op { 140 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 141 op = InstRune 142 } 143 return op 144 } 145 146 // Prefix returns a literal string that all matches for the 147 // regexp must start with. Complete is true if the prefix 148 // is the entire match. 149 func (p *Prog) Prefix() (prefix string, complete bool) { 150 i, _ := p.skipNop(uint32(p.Start)) 151 152 // Avoid allocation of buffer if prefix is empty. 153 if i.op() != InstRune || len(i.Rune) != 1 { 154 return "", i.Op == InstMatch 155 } 156 157 // Have prefix; gather characters. 158 var buf bytes.Buffer 159 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 160 buf.WriteRune(i.Rune[0]) 161 i, _ = p.skipNop(i.Out) 162 } 163 return buf.String(), i.Op == InstMatch 164 } 165 166 // StartCond returns the leading empty-width conditions that must 167 // be true in any match. It returns ^EmptyOp(0) if no matches are possible. 168 func (p *Prog) StartCond() EmptyOp { 169 var flag EmptyOp 170 pc := uint32(p.Start) 171 i := &p.Inst[pc] 172 Loop: 173 for { 174 switch i.Op { 175 case InstEmptyWidth: 176 flag |= EmptyOp(i.Arg) 177 case InstFail: 178 return ^EmptyOp(0) 179 case InstCapture, InstNop: 180 // skip 181 default: 182 break Loop 183 } 184 pc = i.Out 185 i = &p.Inst[pc] 186 } 187 return flag 188 } 189 190 const noMatch = -1 191 192 // MatchRune reports whether the instruction matches (and consumes) r. 193 // It should only be called when i.Op == InstRune. 194 func (i *Inst) MatchRune(r rune) bool { 195 return i.MatchRunePos(r) != noMatch 196 } 197 198 // MatchRunePos checks whether the instruction matches (and consumes) r. 199 // If so, MatchRunePos returns the index of the matching rune pair 200 // (or, when len(i.Rune) == 1, rune singleton). 201 // If not, MatchRunePos returns -1. 202 // MatchRunePos should only be called when i.Op == InstRune. 203 func (i *Inst) MatchRunePos(r rune) int { 204 rune := i.Rune 205 206 // Special case: single-rune slice is from literal string, not char class. 207 if len(rune) == 1 { 208 r0 := rune[0] 209 if r == r0 { 210 return 0 211 } 212 if Flags(i.Arg)&FoldCase != 0 { 213 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 214 if r == r1 { 215 return 0 216 } 217 } 218 } 219 return noMatch 220 } 221 222 // Peek at the first few pairs. 223 // Should handle ASCII well. 224 for j := 0; j < len(rune) && j <= 8; j += 2 { 225 if r < rune[j] { 226 return noMatch 227 } 228 if r <= rune[j+1] { 229 return j / 2 230 } 231 } 232 233 // Otherwise binary search. 234 lo := 0 235 hi := len(rune) / 2 236 for lo < hi { 237 m := lo + (hi-lo)/2 238 if c := rune[2*m]; c <= r { 239 if r <= rune[2*m+1] { 240 return m 241 } 242 lo = m + 1 243 } else { 244 hi = m 245 } 246 } 247 return noMatch 248 } 249 250 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. 251 // Since we act on runes, it would be easy to support Unicode here. 252 func wordRune(r rune) bool { 253 return r == '_' || 254 ('A' <= r && r <= 'Z') || 255 ('a' <= r && r <= 'z') || 256 ('0' <= r && r <= '9') 257 } 258 259 // MatchEmptyWidth reports whether the instruction matches 260 // an empty string between the runes before and after. 261 // It should only be called when i.Op == InstEmptyWidth. 262 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 263 switch EmptyOp(i.Arg) { 264 case EmptyBeginLine: 265 return before == '\n' || before == -1 266 case EmptyEndLine: 267 return after == '\n' || after == -1 268 case EmptyBeginText: 269 return before == -1 270 case EmptyEndText: 271 return after == -1 272 case EmptyWordBoundary: 273 return wordRune(before) != wordRune(after) 274 case EmptyNoWordBoundary: 275 return wordRune(before) == wordRune(after) 276 } 277 panic("unknown empty width arg") 278 } 279 280 func (i *Inst) String() string { 281 var b bytes.Buffer 282 dumpInst(&b, i) 283 return b.String() 284 } 285 286 func bw(b *bytes.Buffer, args ...string) { 287 for _, s := range args { 288 b.WriteString(s) 289 } 290 } 291 292 func dumpProg(b *bytes.Buffer, p *Prog) { 293 for j := range p.Inst { 294 i := &p.Inst[j] 295 pc := strconv.Itoa(j) 296 if len(pc) < 3 { 297 b.WriteString(" "[len(pc):]) 298 } 299 if j == p.Start { 300 pc += "*" 301 } 302 bw(b, pc, "\t") 303 dumpInst(b, i) 304 bw(b, "\n") 305 } 306 } 307 308 func u32(i uint32) string { 309 return strconv.FormatUint(uint64(i), 10) 310 } 311 312 func dumpInst(b *bytes.Buffer, i *Inst) { 313 switch i.Op { 314 case InstAlt: 315 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 316 case InstAltMatch: 317 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 318 case InstCapture: 319 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 320 case InstEmptyWidth: 321 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 322 case InstMatch: 323 bw(b, "match") 324 case InstFail: 325 bw(b, "fail") 326 case InstNop: 327 bw(b, "nop -> ", u32(i.Out)) 328 case InstRune: 329 if i.Rune == nil { 330 // shouldn't happen 331 bw(b, "rune <nil>") 332 } 333 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 334 if Flags(i.Arg)&FoldCase != 0 { 335 bw(b, "/i") 336 } 337 bw(b, " -> ", u32(i.Out)) 338 case InstRune1: 339 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 340 case InstRuneAny: 341 bw(b, "any -> ", u32(i.Out)) 342 case InstRuneAnyNotNL: 343 bw(b, "anynotnl -> ", u32(i.Out)) 344 } 345 } 346