Home | History | Annotate | Download | only in regexp
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package regexp implements regular expression search.
      6 //
      7 // The syntax of the regular expressions accepted is the same
      8 // general syntax used by Perl, Python, and other languages.
      9 // More precisely, it is the syntax accepted by RE2 and described at
     10 // https://golang.org/s/re2syntax, except for \C.
     11 // For an overview of the syntax, run
     12 //   go doc regexp/syntax
     13 //
     14 // The regexp implementation provided by this package is
     15 // guaranteed to run in time linear in the size of the input.
     16 // (This is a property not guaranteed by most open source
     17 // implementations of regular expressions.) For more information
     18 // about this property, see
     19 //	http://swtch.com/~rsc/regexp/regexp1.html
     20 // or any book about automata theory.
     21 //
     22 // All characters are UTF-8-encoded code points.
     23 //
     24 // There are 16 methods of Regexp that match a regular expression and identify
     25 // the matched text. Their names are matched by this regular expression:
     26 //
     27 //	Find(All)?(String)?(Submatch)?(Index)?
     28 //
     29 // If 'All' is present, the routine matches successive non-overlapping
     30 // matches of the entire expression. Empty matches abutting a preceding
     31 // match are ignored. The return value is a slice containing the successive
     32 // return values of the corresponding non-'All' routine. These routines take
     33 // an extra integer argument, n; if n >= 0, the function returns at most n
     34 // matches/submatches.
     35 //
     36 // If 'String' is present, the argument is a string; otherwise it is a slice
     37 // of bytes; return values are adjusted as appropriate.
     38 //
     39 // If 'Submatch' is present, the return value is a slice identifying the
     40 // successive submatches of the expression. Submatches are matches of
     41 // parenthesized subexpressions (also known as capturing groups) within the
     42 // regular expression, numbered from left to right in order of opening
     43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
     44 // the match of the first parenthesized subexpression, and so on.
     45 //
     46 // If 'Index' is present, matches and submatches are identified by byte index
     47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
     48 // the nth submatch. The pair for n==0 identifies the match of the entire
     49 // expression. If 'Index' is not present, the match is identified by the
     50 // text of the match/submatch. If an index is negative, it means that
     51 // subexpression did not match any string in the input.
     52 //
     53 // There is also a subset of the methods that can be applied to text read
     54 // from a RuneReader:
     55 //
     56 //	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
     57 //
     58 // This set may grow. Note that regular expression matches may need to
     59 // examine text beyond the text returned by a match, so the methods that
     60 // match text from a RuneReader may read arbitrarily far into the input
     61 // before returning.
     62 //
     63 // (There are a few other methods that do not match this pattern.)
     64 //
     65 package regexp
     66 
     67 import (
     68 	"bytes"
     69 	"io"
     70 	"regexp/syntax"
     71 	"strconv"
     72 	"strings"
     73 	"sync"
     74 	"unicode"
     75 	"unicode/utf8"
     76 )
     77 
     78 // Regexp is the representation of a compiled regular expression.
     79 // A Regexp is safe for concurrent use by multiple goroutines,
     80 // except for configuration methods, such as Longest.
     81 type Regexp struct {
     82 	// read-only after Compile
     83 	regexpRO
     84 
     85 	// cache of machines for running regexp
     86 	mu      sync.Mutex
     87 	machine []*machine
     88 }
     89 
     90 type regexpRO struct {
     91 	expr           string         // as passed to Compile
     92 	prog           *syntax.Prog   // compiled program
     93 	onepass        *onePassProg   // onepass program or nil
     94 	prefix         string         // required prefix in unanchored matches
     95 	prefixBytes    []byte         // prefix, as a []byte
     96 	prefixComplete bool           // prefix is the entire regexp
     97 	prefixRune     rune           // first rune in prefix
     98 	prefixEnd      uint32         // pc for last rune in prefix
     99 	cond           syntax.EmptyOp // empty-width conditions required at start of match
    100 	numSubexp      int
    101 	subexpNames    []string
    102 	longest        bool
    103 }
    104 
    105 // String returns the source text used to compile the regular expression.
    106 func (re *Regexp) String() string {
    107 	return re.expr
    108 }
    109 
    110 // Copy returns a new Regexp object copied from re.
    111 //
    112 // When using a Regexp in multiple goroutines, giving each goroutine
    113 // its own copy helps to avoid lock contention.
    114 func (re *Regexp) Copy() *Regexp {
    115 	// It is not safe to copy Regexp by value
    116 	// since it contains a sync.Mutex.
    117 	return &Regexp{
    118 		regexpRO: re.regexpRO,
    119 	}
    120 }
    121 
    122 // Compile parses a regular expression and returns, if successful,
    123 // a Regexp object that can be used to match against text.
    124 //
    125 // When matching against text, the regexp returns a match that
    126 // begins as early as possible in the input (leftmost), and among those
    127 // it chooses the one that a backtracking search would have found first.
    128 // This so-called leftmost-first matching is the same semantics
    129 // that Perl, Python, and other implementations use, although this
    130 // package implements it without the expense of backtracking.
    131 // For POSIX leftmost-longest matching, see CompilePOSIX.
    132 func Compile(expr string) (*Regexp, error) {
    133 	return compile(expr, syntax.Perl, false)
    134 }
    135 
    136 // CompilePOSIX is like Compile but restricts the regular expression
    137 // to POSIX ERE (egrep) syntax and changes the match semantics to
    138 // leftmost-longest.
    139 //
    140 // That is, when matching against text, the regexp returns a match that
    141 // begins as early as possible in the input (leftmost), and among those
    142 // it chooses a match that is as long as possible.
    143 // This so-called leftmost-longest matching is the same semantics
    144 // that early regular expression implementations used and that POSIX
    145 // specifies.
    146 //
    147 // However, there can be multiple leftmost-longest matches, with different
    148 // submatch choices, and here this package diverges from POSIX.
    149 // Among the possible leftmost-longest matches, this package chooses
    150 // the one that a backtracking search would have found first, while POSIX
    151 // specifies that the match be chosen to maximize the length of the first
    152 // subexpression, then the second, and so on from left to right.
    153 // The POSIX rule is computationally prohibitive and not even well-defined.
    154 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
    155 func CompilePOSIX(expr string) (*Regexp, error) {
    156 	return compile(expr, syntax.POSIX, true)
    157 }
    158 
    159 // Longest makes future searches prefer the leftmost-longest match.
    160 // That is, when matching against text, the regexp returns a match that
    161 // begins as early as possible in the input (leftmost), and among those
    162 // it chooses a match that is as long as possible.
    163 // This method modifies the Regexp and may not be called concurrently
    164 // with any other methods.
    165 func (re *Regexp) Longest() {
    166 	re.longest = true
    167 }
    168 
    169 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
    170 	re, err := syntax.Parse(expr, mode)
    171 	if err != nil {
    172 		return nil, err
    173 	}
    174 	maxCap := re.MaxCap()
    175 	capNames := re.CapNames()
    176 
    177 	re = re.Simplify()
    178 	prog, err := syntax.Compile(re)
    179 	if err != nil {
    180 		return nil, err
    181 	}
    182 	regexp := &Regexp{
    183 		regexpRO: regexpRO{
    184 			expr:        expr,
    185 			prog:        prog,
    186 			onepass:     compileOnePass(prog),
    187 			numSubexp:   maxCap,
    188 			subexpNames: capNames,
    189 			cond:        prog.StartCond(),
    190 			longest:     longest,
    191 		},
    192 	}
    193 	if regexp.onepass == notOnePass {
    194 		regexp.prefix, regexp.prefixComplete = prog.Prefix()
    195 	} else {
    196 		regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
    197 	}
    198 	if regexp.prefix != "" {
    199 		// TODO(rsc): Remove this allocation by adding
    200 		// IndexString to package bytes.
    201 		regexp.prefixBytes = []byte(regexp.prefix)
    202 		regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
    203 	}
    204 	return regexp, nil
    205 }
    206 
    207 // get returns a machine to use for matching re.
    208 // It uses the re's machine cache if possible, to avoid
    209 // unnecessary allocation.
    210 func (re *Regexp) get() *machine {
    211 	re.mu.Lock()
    212 	if n := len(re.machine); n > 0 {
    213 		z := re.machine[n-1]
    214 		re.machine = re.machine[:n-1]
    215 		re.mu.Unlock()
    216 		return z
    217 	}
    218 	re.mu.Unlock()
    219 	z := progMachine(re.prog, re.onepass)
    220 	z.re = re
    221 	return z
    222 }
    223 
    224 // put returns a machine to the re's machine cache.
    225 // There is no attempt to limit the size of the cache, so it will
    226 // grow to the maximum number of simultaneous matches
    227 // run using re.  (The cache empties when re gets garbage collected.)
    228 func (re *Regexp) put(z *machine) {
    229 	re.mu.Lock()
    230 	re.machine = append(re.machine, z)
    231 	re.mu.Unlock()
    232 }
    233 
    234 // MustCompile is like Compile but panics if the expression cannot be parsed.
    235 // It simplifies safe initialization of global variables holding compiled regular
    236 // expressions.
    237 func MustCompile(str string) *Regexp {
    238 	regexp, error := Compile(str)
    239 	if error != nil {
    240 		panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
    241 	}
    242 	return regexp
    243 }
    244 
    245 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
    246 // It simplifies safe initialization of global variables holding compiled regular
    247 // expressions.
    248 func MustCompilePOSIX(str string) *Regexp {
    249 	regexp, error := CompilePOSIX(str)
    250 	if error != nil {
    251 		panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
    252 	}
    253 	return regexp
    254 }
    255 
    256 func quote(s string) string {
    257 	if strconv.CanBackquote(s) {
    258 		return "`" + s + "`"
    259 	}
    260 	return strconv.Quote(s)
    261 }
    262 
    263 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
    264 func (re *Regexp) NumSubexp() int {
    265 	return re.numSubexp
    266 }
    267 
    268 // SubexpNames returns the names of the parenthesized subexpressions
    269 // in this Regexp. The name for the first sub-expression is names[1],
    270 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
    271 // Since the Regexp as a whole cannot be named, names[0] is always
    272 // the empty string. The slice should not be modified.
    273 func (re *Regexp) SubexpNames() []string {
    274 	return re.subexpNames
    275 }
    276 
    277 const endOfText rune = -1
    278 
    279 // input abstracts different representations of the input text. It provides
    280 // one-character lookahead.
    281 type input interface {
    282 	step(pos int) (r rune, width int) // advance one rune
    283 	canCheckPrefix() bool             // can we look ahead without losing info?
    284 	hasPrefix(re *Regexp) bool
    285 	index(re *Regexp, pos int) int
    286 	context(pos int) syntax.EmptyOp
    287 }
    288 
    289 // inputString scans a string.
    290 type inputString struct {
    291 	str string
    292 }
    293 
    294 func (i *inputString) step(pos int) (rune, int) {
    295 	if pos < len(i.str) {
    296 		c := i.str[pos]
    297 		if c < utf8.RuneSelf {
    298 			return rune(c), 1
    299 		}
    300 		return utf8.DecodeRuneInString(i.str[pos:])
    301 	}
    302 	return endOfText, 0
    303 }
    304 
    305 func (i *inputString) canCheckPrefix() bool {
    306 	return true
    307 }
    308 
    309 func (i *inputString) hasPrefix(re *Regexp) bool {
    310 	return strings.HasPrefix(i.str, re.prefix)
    311 }
    312 
    313 func (i *inputString) index(re *Regexp, pos int) int {
    314 	return strings.Index(i.str[pos:], re.prefix)
    315 }
    316 
    317 func (i *inputString) context(pos int) syntax.EmptyOp {
    318 	r1, r2 := endOfText, endOfText
    319 	// 0 < pos && pos <= len(i.str)
    320 	if uint(pos-1) < uint(len(i.str)) {
    321 		r1 = rune(i.str[pos-1])
    322 		if r1 >= utf8.RuneSelf {
    323 			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
    324 		}
    325 	}
    326 	// 0 <= pos && pos < len(i.str)
    327 	if uint(pos) < uint(len(i.str)) {
    328 		r2 = rune(i.str[pos])
    329 		if r2 >= utf8.RuneSelf {
    330 			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
    331 		}
    332 	}
    333 	return syntax.EmptyOpContext(r1, r2)
    334 }
    335 
    336 // inputBytes scans a byte slice.
    337 type inputBytes struct {
    338 	str []byte
    339 }
    340 
    341 func (i *inputBytes) step(pos int) (rune, int) {
    342 	if pos < len(i.str) {
    343 		c := i.str[pos]
    344 		if c < utf8.RuneSelf {
    345 			return rune(c), 1
    346 		}
    347 		return utf8.DecodeRune(i.str[pos:])
    348 	}
    349 	return endOfText, 0
    350 }
    351 
    352 func (i *inputBytes) canCheckPrefix() bool {
    353 	return true
    354 }
    355 
    356 func (i *inputBytes) hasPrefix(re *Regexp) bool {
    357 	return bytes.HasPrefix(i.str, re.prefixBytes)
    358 }
    359 
    360 func (i *inputBytes) index(re *Regexp, pos int) int {
    361 	return bytes.Index(i.str[pos:], re.prefixBytes)
    362 }
    363 
    364 func (i *inputBytes) context(pos int) syntax.EmptyOp {
    365 	r1, r2 := endOfText, endOfText
    366 	// 0 < pos && pos <= len(i.str)
    367 	if uint(pos-1) < uint(len(i.str)) {
    368 		r1 = rune(i.str[pos-1])
    369 		if r1 >= utf8.RuneSelf {
    370 			r1, _ = utf8.DecodeLastRune(i.str[:pos])
    371 		}
    372 	}
    373 	// 0 <= pos && pos < len(i.str)
    374 	if uint(pos) < uint(len(i.str)) {
    375 		r2 = rune(i.str[pos])
    376 		if r2 >= utf8.RuneSelf {
    377 			r2, _ = utf8.DecodeRune(i.str[pos:])
    378 		}
    379 	}
    380 	return syntax.EmptyOpContext(r1, r2)
    381 }
    382 
    383 // inputReader scans a RuneReader.
    384 type inputReader struct {
    385 	r     io.RuneReader
    386 	atEOT bool
    387 	pos   int
    388 }
    389 
    390 func (i *inputReader) step(pos int) (rune, int) {
    391 	if !i.atEOT && pos != i.pos {
    392 		return endOfText, 0
    393 
    394 	}
    395 	r, w, err := i.r.ReadRune()
    396 	if err != nil {
    397 		i.atEOT = true
    398 		return endOfText, 0
    399 	}
    400 	i.pos += w
    401 	return r, w
    402 }
    403 
    404 func (i *inputReader) canCheckPrefix() bool {
    405 	return false
    406 }
    407 
    408 func (i *inputReader) hasPrefix(re *Regexp) bool {
    409 	return false
    410 }
    411 
    412 func (i *inputReader) index(re *Regexp, pos int) int {
    413 	return -1
    414 }
    415 
    416 func (i *inputReader) context(pos int) syntax.EmptyOp {
    417 	return 0
    418 }
    419 
    420 // LiteralPrefix returns a literal string that must begin any match
    421 // of the regular expression re. It returns the boolean true if the
    422 // literal string comprises the entire regular expression.
    423 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
    424 	return re.prefix, re.prefixComplete
    425 }
    426 
    427 // MatchReader reports whether the Regexp matches the text read by the
    428 // RuneReader.
    429 func (re *Regexp) MatchReader(r io.RuneReader) bool {
    430 	return re.doMatch(r, nil, "")
    431 }
    432 
    433 // MatchString reports whether the Regexp matches the string s.
    434 func (re *Regexp) MatchString(s string) bool {
    435 	return re.doMatch(nil, nil, s)
    436 }
    437 
    438 // Match reports whether the Regexp matches the byte slice b.
    439 func (re *Regexp) Match(b []byte) bool {
    440 	return re.doMatch(nil, b, "")
    441 }
    442 
    443 // MatchReader checks whether a textual regular expression matches the text
    444 // read by the RuneReader. More complicated queries need to use Compile and
    445 // the full Regexp interface.
    446 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
    447 	re, err := Compile(pattern)
    448 	if err != nil {
    449 		return false, err
    450 	}
    451 	return re.MatchReader(r), nil
    452 }
    453 
    454 // MatchString checks whether a textual regular expression
    455 // matches a string. More complicated queries need
    456 // to use Compile and the full Regexp interface.
    457 func MatchString(pattern string, s string) (matched bool, err error) {
    458 	re, err := Compile(pattern)
    459 	if err != nil {
    460 		return false, err
    461 	}
    462 	return re.MatchString(s), nil
    463 }
    464 
    465 // Match checks whether a textual regular expression
    466 // matches a byte slice. More complicated queries need
    467 // to use Compile and the full Regexp interface.
    468 func Match(pattern string, b []byte) (matched bool, err error) {
    469 	re, err := Compile(pattern)
    470 	if err != nil {
    471 		return false, err
    472 	}
    473 	return re.Match(b), nil
    474 }
    475 
    476 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
    477 // with the replacement string repl. Inside repl, $ signs are interpreted as
    478 // in Expand, so for instance $1 represents the text of the first submatch.
    479 func (re *Regexp) ReplaceAllString(src, repl string) string {
    480 	n := 2
    481 	if strings.Contains(repl, "$") {
    482 		n = 2 * (re.numSubexp + 1)
    483 	}
    484 	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
    485 		return re.expand(dst, repl, nil, src, match)
    486 	})
    487 	return string(b)
    488 }
    489 
    490 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
    491 // with the replacement string repl. The replacement repl is substituted directly,
    492 // without using Expand.
    493 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
    494 	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
    495 		return append(dst, repl...)
    496 	}))
    497 }
    498 
    499 // ReplaceAllStringFunc returns a copy of src in which all matches of the
    500 // Regexp have been replaced by the return value of function repl applied
    501 // to the matched substring. The replacement returned by repl is substituted
    502 // directly, without using Expand.
    503 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
    504 	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
    505 		return append(dst, repl(src[match[0]:match[1]])...)
    506 	})
    507 	return string(b)
    508 }
    509 
    510 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
    511 	lastMatchEnd := 0 // end position of the most recent match
    512 	searchPos := 0    // position where we next look for a match
    513 	var buf []byte
    514 	var endPos int
    515 	if bsrc != nil {
    516 		endPos = len(bsrc)
    517 	} else {
    518 		endPos = len(src)
    519 	}
    520 	if nmatch > re.prog.NumCap {
    521 		nmatch = re.prog.NumCap
    522 	}
    523 
    524 	var dstCap [2]int
    525 	for searchPos <= endPos {
    526 		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
    527 		if len(a) == 0 {
    528 			break // no more matches
    529 		}
    530 
    531 		// Copy the unmatched characters before this match.
    532 		if bsrc != nil {
    533 			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
    534 		} else {
    535 			buf = append(buf, src[lastMatchEnd:a[0]]...)
    536 		}
    537 
    538 		// Now insert a copy of the replacement string, but not for a
    539 		// match of the empty string immediately after another match.
    540 		// (Otherwise, we get double replacement for patterns that
    541 		// match both empty and nonempty strings.)
    542 		if a[1] > lastMatchEnd || a[0] == 0 {
    543 			buf = repl(buf, a)
    544 		}
    545 		lastMatchEnd = a[1]
    546 
    547 		// Advance past this match; always advance at least one character.
    548 		var width int
    549 		if bsrc != nil {
    550 			_, width = utf8.DecodeRune(bsrc[searchPos:])
    551 		} else {
    552 			_, width = utf8.DecodeRuneInString(src[searchPos:])
    553 		}
    554 		if searchPos+width > a[1] {
    555 			searchPos += width
    556 		} else if searchPos+1 > a[1] {
    557 			// This clause is only needed at the end of the input
    558 			// string. In that case, DecodeRuneInString returns width=0.
    559 			searchPos++
    560 		} else {
    561 			searchPos = a[1]
    562 		}
    563 	}
    564 
    565 	// Copy the unmatched characters after the last match.
    566 	if bsrc != nil {
    567 		buf = append(buf, bsrc[lastMatchEnd:]...)
    568 	} else {
    569 		buf = append(buf, src[lastMatchEnd:]...)
    570 	}
    571 
    572 	return buf
    573 }
    574 
    575 // ReplaceAll returns a copy of src, replacing matches of the Regexp
    576 // with the replacement text repl. Inside repl, $ signs are interpreted as
    577 // in Expand, so for instance $1 represents the text of the first submatch.
    578 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
    579 	n := 2
    580 	if bytes.IndexByte(repl, '$') >= 0 {
    581 		n = 2 * (re.numSubexp + 1)
    582 	}
    583 	srepl := ""
    584 	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
    585 		if len(srepl) != len(repl) {
    586 			srepl = string(repl)
    587 		}
    588 		return re.expand(dst, srepl, src, "", match)
    589 	})
    590 	return b
    591 }
    592 
    593 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
    594 // with the replacement bytes repl. The replacement repl is substituted directly,
    595 // without using Expand.
    596 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
    597 	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
    598 		return append(dst, repl...)
    599 	})
    600 }
    601 
    602 // ReplaceAllFunc returns a copy of src in which all matches of the
    603 // Regexp have been replaced by the return value of function repl applied
    604 // to the matched byte slice. The replacement returned by repl is substituted
    605 // directly, without using Expand.
    606 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
    607 	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
    608 		return append(dst, repl(src[match[0]:match[1]])...)
    609 	})
    610 }
    611 
    612 // Bitmap used by func special to check whether a character needs to be escaped.
    613 var specialBytes [16]byte
    614 
    615 // special reports whether byte b needs to be escaped by QuoteMeta.
    616 func special(b byte) bool {
    617 	return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
    618 }
    619 
    620 func init() {
    621 	for _, b := range []byte(`\.+*?()|[]{}^$`) {
    622 		specialBytes[b%16] |= 1 << (b / 16)
    623 	}
    624 }
    625 
    626 // QuoteMeta returns a string that quotes all regular expression metacharacters
    627 // inside the argument text; the returned string is a regular expression matching
    628 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
    629 func QuoteMeta(s string) string {
    630 	// A byte loop is correct because all metacharacters are ASCII.
    631 	var i int
    632 	for i = 0; i < len(s); i++ {
    633 		if special(s[i]) {
    634 			break
    635 		}
    636 	}
    637 	// No meta characters found, so return original string.
    638 	if i >= len(s) {
    639 		return s
    640 	}
    641 
    642 	b := make([]byte, 2*len(s)-i)
    643 	copy(b, s[:i])
    644 	j := i
    645 	for ; i < len(s); i++ {
    646 		if special(s[i]) {
    647 			b[j] = '\\'
    648 			j++
    649 		}
    650 		b[j] = s[i]
    651 		j++
    652 	}
    653 	return string(b[:j])
    654 }
    655 
    656 // The number of capture values in the program may correspond
    657 // to fewer capturing expressions than are in the regexp.
    658 // For example, "(a){0}" turns into an empty program, so the
    659 // maximum capture in the program is 0 but we need to return
    660 // an expression for \1.  Pad appends -1s to the slice a as needed.
    661 func (re *Regexp) pad(a []int) []int {
    662 	if a == nil {
    663 		// No match.
    664 		return nil
    665 	}
    666 	n := (1 + re.numSubexp) * 2
    667 	for len(a) < n {
    668 		a = append(a, -1)
    669 	}
    670 	return a
    671 }
    672 
    673 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
    674 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
    675 	var end int
    676 	if b == nil {
    677 		end = len(s)
    678 	} else {
    679 		end = len(b)
    680 	}
    681 
    682 	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
    683 		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
    684 		if len(matches) == 0 {
    685 			break
    686 		}
    687 
    688 		accept := true
    689 		if matches[1] == pos {
    690 			// We've found an empty match.
    691 			if matches[0] == prevMatchEnd {
    692 				// We don't allow an empty match right
    693 				// after a previous match, so ignore it.
    694 				accept = false
    695 			}
    696 			var width int
    697 			// TODO: use step()
    698 			if b == nil {
    699 				_, width = utf8.DecodeRuneInString(s[pos:end])
    700 			} else {
    701 				_, width = utf8.DecodeRune(b[pos:end])
    702 			}
    703 			if width > 0 {
    704 				pos += width
    705 			} else {
    706 				pos = end + 1
    707 			}
    708 		} else {
    709 			pos = matches[1]
    710 		}
    711 		prevMatchEnd = matches[1]
    712 
    713 		if accept {
    714 			deliver(re.pad(matches))
    715 			i++
    716 		}
    717 	}
    718 }
    719 
    720 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
    721 // A return value of nil indicates no match.
    722 func (re *Regexp) Find(b []byte) []byte {
    723 	var dstCap [2]int
    724 	a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
    725 	if a == nil {
    726 		return nil
    727 	}
    728 	return b[a[0]:a[1]]
    729 }
    730 
    731 // FindIndex returns a two-element slice of integers defining the location of
    732 // the leftmost match in b of the regular expression. The match itself is at
    733 // b[loc[0]:loc[1]].
    734 // A return value of nil indicates no match.
    735 func (re *Regexp) FindIndex(b []byte) (loc []int) {
    736 	a := re.doExecute(nil, b, "", 0, 2, nil)
    737 	if a == nil {
    738 		return nil
    739 	}
    740 	return a[0:2]
    741 }
    742 
    743 // FindString returns a string holding the text of the leftmost match in s of the regular
    744 // expression. If there is no match, the return value is an empty string,
    745 // but it will also be empty if the regular expression successfully matches
    746 // an empty string. Use FindStringIndex or FindStringSubmatch if it is
    747 // necessary to distinguish these cases.
    748 func (re *Regexp) FindString(s string) string {
    749 	var dstCap [2]int
    750 	a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
    751 	if a == nil {
    752 		return ""
    753 	}
    754 	return s[a[0]:a[1]]
    755 }
    756 
    757 // FindStringIndex returns a two-element slice of integers defining the
    758 // location of the leftmost match in s of the regular expression. The match
    759 // itself is at s[loc[0]:loc[1]].
    760 // A return value of nil indicates no match.
    761 func (re *Regexp) FindStringIndex(s string) (loc []int) {
    762 	a := re.doExecute(nil, nil, s, 0, 2, nil)
    763 	if a == nil {
    764 		return nil
    765 	}
    766 	return a[0:2]
    767 }
    768 
    769 // FindReaderIndex returns a two-element slice of integers defining the
    770 // location of the leftmost match of the regular expression in text read from
    771 // the RuneReader. The match text was found in the input stream at
    772 // byte offset loc[0] through loc[1]-1.
    773 // A return value of nil indicates no match.
    774 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
    775 	a := re.doExecute(r, nil, "", 0, 2, nil)
    776 	if a == nil {
    777 		return nil
    778 	}
    779 	return a[0:2]
    780 }
    781 
    782 // FindSubmatch returns a slice of slices holding the text of the leftmost
    783 // match of the regular expression in b and the matches, if any, of its
    784 // subexpressions, as defined by the 'Submatch' descriptions in the package
    785 // comment.
    786 // A return value of nil indicates no match.
    787 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
    788 	var dstCap [4]int
    789 	a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
    790 	if a == nil {
    791 		return nil
    792 	}
    793 	ret := make([][]byte, 1+re.numSubexp)
    794 	for i := range ret {
    795 		if 2*i < len(a) && a[2*i] >= 0 {
    796 			ret[i] = b[a[2*i]:a[2*i+1]]
    797 		}
    798 	}
    799 	return ret
    800 }
    801 
    802 // Expand appends template to dst and returns the result; during the
    803 // append, Expand replaces variables in the template with corresponding
    804 // matches drawn from src. The match slice should have been returned by
    805 // FindSubmatchIndex.
    806 //
    807 // In the template, a variable is denoted by a substring of the form
    808 // $name or ${name}, where name is a non-empty sequence of letters,
    809 // digits, and underscores. A purely numeric name like $1 refers to
    810 // the submatch with the corresponding index; other names refer to
    811 // capturing parentheses named with the (?P<name>...) syntax. A
    812 // reference to an out of range or unmatched index or a name that is not
    813 // present in the regular expression is replaced with an empty slice.
    814 //
    815 // In the $name form, name is taken to be as long as possible: $1x is
    816 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
    817 //
    818 // To insert a literal $ in the output, use $$ in the template.
    819 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
    820 	return re.expand(dst, string(template), src, "", match)
    821 }
    822 
    823 // ExpandString is like Expand but the template and source are strings.
    824 // It appends to and returns a byte slice in order to give the calling
    825 // code control over allocation.
    826 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
    827 	return re.expand(dst, template, nil, src, match)
    828 }
    829 
    830 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
    831 	for len(template) > 0 {
    832 		i := strings.Index(template, "$")
    833 		if i < 0 {
    834 			break
    835 		}
    836 		dst = append(dst, template[:i]...)
    837 		template = template[i:]
    838 		if len(template) > 1 && template[1] == '$' {
    839 			// Treat $$ as $.
    840 			dst = append(dst, '$')
    841 			template = template[2:]
    842 			continue
    843 		}
    844 		name, num, rest, ok := extract(template)
    845 		if !ok {
    846 			// Malformed; treat $ as raw text.
    847 			dst = append(dst, '$')
    848 			template = template[1:]
    849 			continue
    850 		}
    851 		template = rest
    852 		if num >= 0 {
    853 			if 2*num+1 < len(match) && match[2*num] >= 0 {
    854 				if bsrc != nil {
    855 					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
    856 				} else {
    857 					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
    858 				}
    859 			}
    860 		} else {
    861 			for i, namei := range re.subexpNames {
    862 				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
    863 					if bsrc != nil {
    864 						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
    865 					} else {
    866 						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
    867 					}
    868 					break
    869 				}
    870 			}
    871 		}
    872 	}
    873 	dst = append(dst, template...)
    874 	return dst
    875 }
    876 
    877 // extract returns the name from a leading "$name" or "${name}" in str.
    878 // If it is a number, extract returns num set to that number; otherwise num = -1.
    879 func extract(str string) (name string, num int, rest string, ok bool) {
    880 	if len(str) < 2 || str[0] != '$' {
    881 		return
    882 	}
    883 	brace := false
    884 	if str[1] == '{' {
    885 		brace = true
    886 		str = str[2:]
    887 	} else {
    888 		str = str[1:]
    889 	}
    890 	i := 0
    891 	for i < len(str) {
    892 		rune, size := utf8.DecodeRuneInString(str[i:])
    893 		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
    894 			break
    895 		}
    896 		i += size
    897 	}
    898 	if i == 0 {
    899 		// empty name is not okay
    900 		return
    901 	}
    902 	name = str[:i]
    903 	if brace {
    904 		if i >= len(str) || str[i] != '}' {
    905 			// missing closing brace
    906 			return
    907 		}
    908 		i++
    909 	}
    910 
    911 	// Parse number.
    912 	num = 0
    913 	for i := 0; i < len(name); i++ {
    914 		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
    915 			num = -1
    916 			break
    917 		}
    918 		num = num*10 + int(name[i]) - '0'
    919 	}
    920 	// Disallow leading zeros.
    921 	if name[0] == '0' && len(name) > 1 {
    922 		num = -1
    923 	}
    924 
    925 	rest = str[i:]
    926 	ok = true
    927 	return
    928 }
    929 
    930 // FindSubmatchIndex returns a slice holding the index pairs identifying the
    931 // leftmost match of the regular expression in b and the matches, if any, of
    932 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
    933 // in the package comment.
    934 // A return value of nil indicates no match.
    935 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
    936 	return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
    937 }
    938 
    939 // FindStringSubmatch returns a slice of strings holding the text of the
    940 // leftmost match of the regular expression in s and the matches, if any, of
    941 // its subexpressions, as defined by the 'Submatch' description in the
    942 // package comment.
    943 // A return value of nil indicates no match.
    944 func (re *Regexp) FindStringSubmatch(s string) []string {
    945 	var dstCap [4]int
    946 	a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
    947 	if a == nil {
    948 		return nil
    949 	}
    950 	ret := make([]string, 1+re.numSubexp)
    951 	for i := range ret {
    952 		if 2*i < len(a) && a[2*i] >= 0 {
    953 			ret[i] = s[a[2*i]:a[2*i+1]]
    954 		}
    955 	}
    956 	return ret
    957 }
    958 
    959 // FindStringSubmatchIndex returns a slice holding the index pairs
    960 // identifying the leftmost match of the regular expression in s and the
    961 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
    962 // 'Index' descriptions in the package comment.
    963 // A return value of nil indicates no match.
    964 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
    965 	return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
    966 }
    967 
    968 // FindReaderSubmatchIndex returns a slice holding the index pairs
    969 // identifying the leftmost match of the regular expression of text read by
    970 // the RuneReader, and the matches, if any, of its subexpressions, as defined
    971 // by the 'Submatch' and 'Index' descriptions in the package comment. A
    972 // return value of nil indicates no match.
    973 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
    974 	return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
    975 }
    976 
    977 const startSize = 10 // The size at which to start a slice in the 'All' routines.
    978 
    979 // FindAll is the 'All' version of Find; it returns a slice of all successive
    980 // matches of the expression, as defined by the 'All' description in the
    981 // package comment.
    982 // A return value of nil indicates no match.
    983 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
    984 	if n < 0 {
    985 		n = len(b) + 1
    986 	}
    987 	result := make([][]byte, 0, startSize)
    988 	re.allMatches("", b, n, func(match []int) {
    989 		result = append(result, b[match[0]:match[1]])
    990 	})
    991 	if len(result) == 0 {
    992 		return nil
    993 	}
    994 	return result
    995 }
    996 
    997 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
    998 // successive matches of the expression, as defined by the 'All' description
    999 // in the package comment.
   1000 // A return value of nil indicates no match.
   1001 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   1002 	if n < 0 {
   1003 		n = len(b) + 1
   1004 	}
   1005 	result := make([][]int, 0, startSize)
   1006 	re.allMatches("", b, n, func(match []int) {
   1007 		result = append(result, match[0:2])
   1008 	})
   1009 	if len(result) == 0 {
   1010 		return nil
   1011 	}
   1012 	return result
   1013 }
   1014 
   1015 // FindAllString is the 'All' version of FindString; it returns a slice of all
   1016 // successive matches of the expression, as defined by the 'All' description
   1017 // in the package comment.
   1018 // A return value of nil indicates no match.
   1019 func (re *Regexp) FindAllString(s string, n int) []string {
   1020 	if n < 0 {
   1021 		n = len(s) + 1
   1022 	}
   1023 	result := make([]string, 0, startSize)
   1024 	re.allMatches(s, nil, n, func(match []int) {
   1025 		result = append(result, s[match[0]:match[1]])
   1026 	})
   1027 	if len(result) == 0 {
   1028 		return nil
   1029 	}
   1030 	return result
   1031 }
   1032 
   1033 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   1034 // slice of all successive matches of the expression, as defined by the 'All'
   1035 // description in the package comment.
   1036 // A return value of nil indicates no match.
   1037 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   1038 	if n < 0 {
   1039 		n = len(s) + 1
   1040 	}
   1041 	result := make([][]int, 0, startSize)
   1042 	re.allMatches(s, nil, n, func(match []int) {
   1043 		result = append(result, match[0:2])
   1044 	})
   1045 	if len(result) == 0 {
   1046 		return nil
   1047 	}
   1048 	return result
   1049 }
   1050 
   1051 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
   1052 // of all successive matches of the expression, as defined by the 'All'
   1053 // description in the package comment.
   1054 // A return value of nil indicates no match.
   1055 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   1056 	if n < 0 {
   1057 		n = len(b) + 1
   1058 	}
   1059 	result := make([][][]byte, 0, startSize)
   1060 	re.allMatches("", b, n, func(match []int) {
   1061 		slice := make([][]byte, len(match)/2)
   1062 		for j := range slice {
   1063 			if match[2*j] >= 0 {
   1064 				slice[j] = b[match[2*j]:match[2*j+1]]
   1065 			}
   1066 		}
   1067 		result = append(result, slice)
   1068 	})
   1069 	if len(result) == 0 {
   1070 		return nil
   1071 	}
   1072 	return result
   1073 }
   1074 
   1075 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
   1076 // a slice of all successive matches of the expression, as defined by the
   1077 // 'All' description in the package comment.
   1078 // A return value of nil indicates no match.
   1079 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
   1080 	if n < 0 {
   1081 		n = len(b) + 1
   1082 	}
   1083 	result := make([][]int, 0, startSize)
   1084 	re.allMatches("", b, n, func(match []int) {
   1085 		result = append(result, match)
   1086 	})
   1087 	if len(result) == 0 {
   1088 		return nil
   1089 	}
   1090 	return result
   1091 }
   1092 
   1093 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
   1094 // returns a slice of all successive matches of the expression, as defined by
   1095 // the 'All' description in the package comment.
   1096 // A return value of nil indicates no match.
   1097 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
   1098 	if n < 0 {
   1099 		n = len(s) + 1
   1100 	}
   1101 	result := make([][]string, 0, startSize)
   1102 	re.allMatches(s, nil, n, func(match []int) {
   1103 		slice := make([]string, len(match)/2)
   1104 		for j := range slice {
   1105 			if match[2*j] >= 0 {
   1106 				slice[j] = s[match[2*j]:match[2*j+1]]
   1107 			}
   1108 		}
   1109 		result = append(result, slice)
   1110 	})
   1111 	if len(result) == 0 {
   1112 		return nil
   1113 	}
   1114 	return result
   1115 }
   1116 
   1117 // FindAllStringSubmatchIndex is the 'All' version of
   1118 // FindStringSubmatchIndex; it returns a slice of all successive matches of
   1119 // the expression, as defined by the 'All' description in the package
   1120 // comment.
   1121 // A return value of nil indicates no match.
   1122 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
   1123 	if n < 0 {
   1124 		n = len(s) + 1
   1125 	}
   1126 	result := make([][]int, 0, startSize)
   1127 	re.allMatches(s, nil, n, func(match []int) {
   1128 		result = append(result, match)
   1129 	})
   1130 	if len(result) == 0 {
   1131 		return nil
   1132 	}
   1133 	return result
   1134 }
   1135 
   1136 // Split slices s into substrings separated by the expression and returns a slice of
   1137 // the substrings between those expression matches.
   1138 //
   1139 // The slice returned by this method consists of all the substrings of s
   1140 // not contained in the slice returned by FindAllString. When called on an expression
   1141 // that contains no metacharacters, it is equivalent to strings.SplitN.
   1142 //
   1143 // Example:
   1144 //   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
   1145 //   // s: ["", "b", "b", "c", "cadaaae"]
   1146 //
   1147 // The count determines the number of substrings to return:
   1148 //   n > 0: at most n substrings; the last substring will be the unsplit remainder.
   1149 //   n == 0: the result is nil (zero substrings)
   1150 //   n < 0: all substrings
   1151 func (re *Regexp) Split(s string, n int) []string {
   1152 
   1153 	if n == 0 {
   1154 		return nil
   1155 	}
   1156 
   1157 	if len(re.expr) > 0 && len(s) == 0 {
   1158 		return []string{""}
   1159 	}
   1160 
   1161 	matches := re.FindAllStringIndex(s, n)
   1162 	strings := make([]string, 0, len(matches))
   1163 
   1164 	beg := 0
   1165 	end := 0
   1166 	for _, match := range matches {
   1167 		if n > 0 && len(strings) >= n-1 {
   1168 			break
   1169 		}
   1170 
   1171 		end = match[0]
   1172 		if match[1] != 0 {
   1173 			strings = append(strings, s[beg:end])
   1174 		}
   1175 		beg = match[1]
   1176 	}
   1177 
   1178 	if end != len(s) {
   1179 		strings = append(strings, s[beg:])
   1180 	}
   1181 
   1182 	return strings
   1183 }
   1184