1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // http://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n; if n >= 0, the function returns at most n 34 // matches/submatches. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 38 // 39 // If 'Submatch' is present, the return value is a slice identifying the 40 // successive submatches of the expression. Submatches are matches of 41 // parenthesized subexpressions (also known as capturing groups) within the 42 // regular expression, numbered from left to right in order of opening 43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44 // the match of the first parenthesized subexpression, and so on. 45 // 46 // If 'Index' is present, matches and submatches are identified by byte index 47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of 48 // the nth submatch. The pair for n==0 identifies the match of the entire 49 // expression. If 'Index' is not present, the match is identified by the 50 // text of the match/submatch. If an index is negative, it means that 51 // subexpression did not match any string in the input. 52 // 53 // There is also a subset of the methods that can be applied to text read 54 // from a RuneReader: 55 // 56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57 // 58 // This set may grow. Note that regular expression matches may need to 59 // examine text beyond the text returned by a match, so the methods that 60 // match text from a RuneReader may read arbitrarily far into the input 61 // before returning. 62 // 63 // (There are a few other methods that do not match this pattern.) 64 // 65 package regexp 66 67 import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76 ) 77 78 // Regexp is the representation of a compiled regular expression. 79 // A Regexp is safe for concurrent use by multiple goroutines, 80 // except for configuration methods, such as Longest. 81 type Regexp struct { 82 // read-only after Compile 83 regexpRO 84 85 // cache of machines for running regexp 86 mu sync.Mutex 87 machine []*machine 88 } 89 90 type regexpRO struct { 91 expr string // as passed to Compile 92 prog *syntax.Prog // compiled program 93 onepass *onePassProg // onepass program or nil 94 prefix string // required prefix in unanchored matches 95 prefixBytes []byte // prefix, as a []byte 96 prefixComplete bool // prefix is the entire regexp 97 prefixRune rune // first rune in prefix 98 prefixEnd uint32 // pc for last rune in prefix 99 cond syntax.EmptyOp // empty-width conditions required at start of match 100 numSubexp int 101 subexpNames []string 102 longest bool 103 } 104 105 // String returns the source text used to compile the regular expression. 106 func (re *Regexp) String() string { 107 return re.expr 108 } 109 110 // Copy returns a new Regexp object copied from re. 111 // 112 // When using a Regexp in multiple goroutines, giving each goroutine 113 // its own copy helps to avoid lock contention. 114 func (re *Regexp) Copy() *Regexp { 115 // It is not safe to copy Regexp by value 116 // since it contains a sync.Mutex. 117 return &Regexp{ 118 regexpRO: re.regexpRO, 119 } 120 } 121 122 // Compile parses a regular expression and returns, if successful, 123 // a Regexp object that can be used to match against text. 124 // 125 // When matching against text, the regexp returns a match that 126 // begins as early as possible in the input (leftmost), and among those 127 // it chooses the one that a backtracking search would have found first. 128 // This so-called leftmost-first matching is the same semantics 129 // that Perl, Python, and other implementations use, although this 130 // package implements it without the expense of backtracking. 131 // For POSIX leftmost-longest matching, see CompilePOSIX. 132 func Compile(expr string) (*Regexp, error) { 133 return compile(expr, syntax.Perl, false) 134 } 135 136 // CompilePOSIX is like Compile but restricts the regular expression 137 // to POSIX ERE (egrep) syntax and changes the match semantics to 138 // leftmost-longest. 139 // 140 // That is, when matching against text, the regexp returns a match that 141 // begins as early as possible in the input (leftmost), and among those 142 // it chooses a match that is as long as possible. 143 // This so-called leftmost-longest matching is the same semantics 144 // that early regular expression implementations used and that POSIX 145 // specifies. 146 // 147 // However, there can be multiple leftmost-longest matches, with different 148 // submatch choices, and here this package diverges from POSIX. 149 // Among the possible leftmost-longest matches, this package chooses 150 // the one that a backtracking search would have found first, while POSIX 151 // specifies that the match be chosen to maximize the length of the first 152 // subexpression, then the second, and so on from left to right. 153 // The POSIX rule is computationally prohibitive and not even well-defined. 154 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 155 func CompilePOSIX(expr string) (*Regexp, error) { 156 return compile(expr, syntax.POSIX, true) 157 } 158 159 // Longest makes future searches prefer the leftmost-longest match. 160 // That is, when matching against text, the regexp returns a match that 161 // begins as early as possible in the input (leftmost), and among those 162 // it chooses a match that is as long as possible. 163 // This method modifies the Regexp and may not be called concurrently 164 // with any other methods. 165 func (re *Regexp) Longest() { 166 re.longest = true 167 } 168 169 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 170 re, err := syntax.Parse(expr, mode) 171 if err != nil { 172 return nil, err 173 } 174 maxCap := re.MaxCap() 175 capNames := re.CapNames() 176 177 re = re.Simplify() 178 prog, err := syntax.Compile(re) 179 if err != nil { 180 return nil, err 181 } 182 regexp := &Regexp{ 183 regexpRO: regexpRO{ 184 expr: expr, 185 prog: prog, 186 onepass: compileOnePass(prog), 187 numSubexp: maxCap, 188 subexpNames: capNames, 189 cond: prog.StartCond(), 190 longest: longest, 191 }, 192 } 193 if regexp.onepass == notOnePass { 194 regexp.prefix, regexp.prefixComplete = prog.Prefix() 195 } else { 196 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 197 } 198 if regexp.prefix != "" { 199 // TODO(rsc): Remove this allocation by adding 200 // IndexString to package bytes. 201 regexp.prefixBytes = []byte(regexp.prefix) 202 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 203 } 204 return regexp, nil 205 } 206 207 // get returns a machine to use for matching re. 208 // It uses the re's machine cache if possible, to avoid 209 // unnecessary allocation. 210 func (re *Regexp) get() *machine { 211 re.mu.Lock() 212 if n := len(re.machine); n > 0 { 213 z := re.machine[n-1] 214 re.machine = re.machine[:n-1] 215 re.mu.Unlock() 216 return z 217 } 218 re.mu.Unlock() 219 z := progMachine(re.prog, re.onepass) 220 z.re = re 221 return z 222 } 223 224 // put returns a machine to the re's machine cache. 225 // There is no attempt to limit the size of the cache, so it will 226 // grow to the maximum number of simultaneous matches 227 // run using re. (The cache empties when re gets garbage collected.) 228 func (re *Regexp) put(z *machine) { 229 re.mu.Lock() 230 re.machine = append(re.machine, z) 231 re.mu.Unlock() 232 } 233 234 // MustCompile is like Compile but panics if the expression cannot be parsed. 235 // It simplifies safe initialization of global variables holding compiled regular 236 // expressions. 237 func MustCompile(str string) *Regexp { 238 regexp, error := Compile(str) 239 if error != nil { 240 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 241 } 242 return regexp 243 } 244 245 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 246 // It simplifies safe initialization of global variables holding compiled regular 247 // expressions. 248 func MustCompilePOSIX(str string) *Regexp { 249 regexp, error := CompilePOSIX(str) 250 if error != nil { 251 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 252 } 253 return regexp 254 } 255 256 func quote(s string) string { 257 if strconv.CanBackquote(s) { 258 return "`" + s + "`" 259 } 260 return strconv.Quote(s) 261 } 262 263 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 264 func (re *Regexp) NumSubexp() int { 265 return re.numSubexp 266 } 267 268 // SubexpNames returns the names of the parenthesized subexpressions 269 // in this Regexp. The name for the first sub-expression is names[1], 270 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 271 // Since the Regexp as a whole cannot be named, names[0] is always 272 // the empty string. The slice should not be modified. 273 func (re *Regexp) SubexpNames() []string { 274 return re.subexpNames 275 } 276 277 const endOfText rune = -1 278 279 // input abstracts different representations of the input text. It provides 280 // one-character lookahead. 281 type input interface { 282 step(pos int) (r rune, width int) // advance one rune 283 canCheckPrefix() bool // can we look ahead without losing info? 284 hasPrefix(re *Regexp) bool 285 index(re *Regexp, pos int) int 286 context(pos int) syntax.EmptyOp 287 } 288 289 // inputString scans a string. 290 type inputString struct { 291 str string 292 } 293 294 func (i *inputString) step(pos int) (rune, int) { 295 if pos < len(i.str) { 296 c := i.str[pos] 297 if c < utf8.RuneSelf { 298 return rune(c), 1 299 } 300 return utf8.DecodeRuneInString(i.str[pos:]) 301 } 302 return endOfText, 0 303 } 304 305 func (i *inputString) canCheckPrefix() bool { 306 return true 307 } 308 309 func (i *inputString) hasPrefix(re *Regexp) bool { 310 return strings.HasPrefix(i.str, re.prefix) 311 } 312 313 func (i *inputString) index(re *Regexp, pos int) int { 314 return strings.Index(i.str[pos:], re.prefix) 315 } 316 317 func (i *inputString) context(pos int) syntax.EmptyOp { 318 r1, r2 := endOfText, endOfText 319 // 0 < pos && pos <= len(i.str) 320 if uint(pos-1) < uint(len(i.str)) { 321 r1 = rune(i.str[pos-1]) 322 if r1 >= utf8.RuneSelf { 323 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 324 } 325 } 326 // 0 <= pos && pos < len(i.str) 327 if uint(pos) < uint(len(i.str)) { 328 r2 = rune(i.str[pos]) 329 if r2 >= utf8.RuneSelf { 330 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 331 } 332 } 333 return syntax.EmptyOpContext(r1, r2) 334 } 335 336 // inputBytes scans a byte slice. 337 type inputBytes struct { 338 str []byte 339 } 340 341 func (i *inputBytes) step(pos int) (rune, int) { 342 if pos < len(i.str) { 343 c := i.str[pos] 344 if c < utf8.RuneSelf { 345 return rune(c), 1 346 } 347 return utf8.DecodeRune(i.str[pos:]) 348 } 349 return endOfText, 0 350 } 351 352 func (i *inputBytes) canCheckPrefix() bool { 353 return true 354 } 355 356 func (i *inputBytes) hasPrefix(re *Regexp) bool { 357 return bytes.HasPrefix(i.str, re.prefixBytes) 358 } 359 360 func (i *inputBytes) index(re *Regexp, pos int) int { 361 return bytes.Index(i.str[pos:], re.prefixBytes) 362 } 363 364 func (i *inputBytes) context(pos int) syntax.EmptyOp { 365 r1, r2 := endOfText, endOfText 366 // 0 < pos && pos <= len(i.str) 367 if uint(pos-1) < uint(len(i.str)) { 368 r1 = rune(i.str[pos-1]) 369 if r1 >= utf8.RuneSelf { 370 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 371 } 372 } 373 // 0 <= pos && pos < len(i.str) 374 if uint(pos) < uint(len(i.str)) { 375 r2 = rune(i.str[pos]) 376 if r2 >= utf8.RuneSelf { 377 r2, _ = utf8.DecodeRune(i.str[pos:]) 378 } 379 } 380 return syntax.EmptyOpContext(r1, r2) 381 } 382 383 // inputReader scans a RuneReader. 384 type inputReader struct { 385 r io.RuneReader 386 atEOT bool 387 pos int 388 } 389 390 func (i *inputReader) step(pos int) (rune, int) { 391 if !i.atEOT && pos != i.pos { 392 return endOfText, 0 393 394 } 395 r, w, err := i.r.ReadRune() 396 if err != nil { 397 i.atEOT = true 398 return endOfText, 0 399 } 400 i.pos += w 401 return r, w 402 } 403 404 func (i *inputReader) canCheckPrefix() bool { 405 return false 406 } 407 408 func (i *inputReader) hasPrefix(re *Regexp) bool { 409 return false 410 } 411 412 func (i *inputReader) index(re *Regexp, pos int) int { 413 return -1 414 } 415 416 func (i *inputReader) context(pos int) syntax.EmptyOp { 417 return 0 418 } 419 420 // LiteralPrefix returns a literal string that must begin any match 421 // of the regular expression re. It returns the boolean true if the 422 // literal string comprises the entire regular expression. 423 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 424 return re.prefix, re.prefixComplete 425 } 426 427 // MatchReader reports whether the Regexp matches the text read by the 428 // RuneReader. 429 func (re *Regexp) MatchReader(r io.RuneReader) bool { 430 return re.doMatch(r, nil, "") 431 } 432 433 // MatchString reports whether the Regexp matches the string s. 434 func (re *Regexp) MatchString(s string) bool { 435 return re.doMatch(nil, nil, s) 436 } 437 438 // Match reports whether the Regexp matches the byte slice b. 439 func (re *Regexp) Match(b []byte) bool { 440 return re.doMatch(nil, b, "") 441 } 442 443 // MatchReader checks whether a textual regular expression matches the text 444 // read by the RuneReader. More complicated queries need to use Compile and 445 // the full Regexp interface. 446 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 447 re, err := Compile(pattern) 448 if err != nil { 449 return false, err 450 } 451 return re.MatchReader(r), nil 452 } 453 454 // MatchString checks whether a textual regular expression 455 // matches a string. More complicated queries need 456 // to use Compile and the full Regexp interface. 457 func MatchString(pattern string, s string) (matched bool, err error) { 458 re, err := Compile(pattern) 459 if err != nil { 460 return false, err 461 } 462 return re.MatchString(s), nil 463 } 464 465 // Match checks whether a textual regular expression 466 // matches a byte slice. More complicated queries need 467 // to use Compile and the full Regexp interface. 468 func Match(pattern string, b []byte) (matched bool, err error) { 469 re, err := Compile(pattern) 470 if err != nil { 471 return false, err 472 } 473 return re.Match(b), nil 474 } 475 476 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 477 // with the replacement string repl. Inside repl, $ signs are interpreted as 478 // in Expand, so for instance $1 represents the text of the first submatch. 479 func (re *Regexp) ReplaceAllString(src, repl string) string { 480 n := 2 481 if strings.Contains(repl, "$") { 482 n = 2 * (re.numSubexp + 1) 483 } 484 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 485 return re.expand(dst, repl, nil, src, match) 486 }) 487 return string(b) 488 } 489 490 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 491 // with the replacement string repl. The replacement repl is substituted directly, 492 // without using Expand. 493 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 494 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 495 return append(dst, repl...) 496 })) 497 } 498 499 // ReplaceAllStringFunc returns a copy of src in which all matches of the 500 // Regexp have been replaced by the return value of function repl applied 501 // to the matched substring. The replacement returned by repl is substituted 502 // directly, without using Expand. 503 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 504 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 505 return append(dst, repl(src[match[0]:match[1]])...) 506 }) 507 return string(b) 508 } 509 510 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 511 lastMatchEnd := 0 // end position of the most recent match 512 searchPos := 0 // position where we next look for a match 513 var buf []byte 514 var endPos int 515 if bsrc != nil { 516 endPos = len(bsrc) 517 } else { 518 endPos = len(src) 519 } 520 if nmatch > re.prog.NumCap { 521 nmatch = re.prog.NumCap 522 } 523 524 var dstCap [2]int 525 for searchPos <= endPos { 526 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 527 if len(a) == 0 { 528 break // no more matches 529 } 530 531 // Copy the unmatched characters before this match. 532 if bsrc != nil { 533 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 534 } else { 535 buf = append(buf, src[lastMatchEnd:a[0]]...) 536 } 537 538 // Now insert a copy of the replacement string, but not for a 539 // match of the empty string immediately after another match. 540 // (Otherwise, we get double replacement for patterns that 541 // match both empty and nonempty strings.) 542 if a[1] > lastMatchEnd || a[0] == 0 { 543 buf = repl(buf, a) 544 } 545 lastMatchEnd = a[1] 546 547 // Advance past this match; always advance at least one character. 548 var width int 549 if bsrc != nil { 550 _, width = utf8.DecodeRune(bsrc[searchPos:]) 551 } else { 552 _, width = utf8.DecodeRuneInString(src[searchPos:]) 553 } 554 if searchPos+width > a[1] { 555 searchPos += width 556 } else if searchPos+1 > a[1] { 557 // This clause is only needed at the end of the input 558 // string. In that case, DecodeRuneInString returns width=0. 559 searchPos++ 560 } else { 561 searchPos = a[1] 562 } 563 } 564 565 // Copy the unmatched characters after the last match. 566 if bsrc != nil { 567 buf = append(buf, bsrc[lastMatchEnd:]...) 568 } else { 569 buf = append(buf, src[lastMatchEnd:]...) 570 } 571 572 return buf 573 } 574 575 // ReplaceAll returns a copy of src, replacing matches of the Regexp 576 // with the replacement text repl. Inside repl, $ signs are interpreted as 577 // in Expand, so for instance $1 represents the text of the first submatch. 578 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 579 n := 2 580 if bytes.IndexByte(repl, '$') >= 0 { 581 n = 2 * (re.numSubexp + 1) 582 } 583 srepl := "" 584 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 585 if len(srepl) != len(repl) { 586 srepl = string(repl) 587 } 588 return re.expand(dst, srepl, src, "", match) 589 }) 590 return b 591 } 592 593 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 594 // with the replacement bytes repl. The replacement repl is substituted directly, 595 // without using Expand. 596 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 597 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 598 return append(dst, repl...) 599 }) 600 } 601 602 // ReplaceAllFunc returns a copy of src in which all matches of the 603 // Regexp have been replaced by the return value of function repl applied 604 // to the matched byte slice. The replacement returned by repl is substituted 605 // directly, without using Expand. 606 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 607 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 608 return append(dst, repl(src[match[0]:match[1]])...) 609 }) 610 } 611 612 // Bitmap used by func special to check whether a character needs to be escaped. 613 var specialBytes [16]byte 614 615 // special reports whether byte b needs to be escaped by QuoteMeta. 616 func special(b byte) bool { 617 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 618 } 619 620 func init() { 621 for _, b := range []byte(`\.+*?()|[]{}^$`) { 622 specialBytes[b%16] |= 1 << (b / 16) 623 } 624 } 625 626 // QuoteMeta returns a string that quotes all regular expression metacharacters 627 // inside the argument text; the returned string is a regular expression matching 628 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 629 func QuoteMeta(s string) string { 630 // A byte loop is correct because all metacharacters are ASCII. 631 var i int 632 for i = 0; i < len(s); i++ { 633 if special(s[i]) { 634 break 635 } 636 } 637 // No meta characters found, so return original string. 638 if i >= len(s) { 639 return s 640 } 641 642 b := make([]byte, 2*len(s)-i) 643 copy(b, s[:i]) 644 j := i 645 for ; i < len(s); i++ { 646 if special(s[i]) { 647 b[j] = '\\' 648 j++ 649 } 650 b[j] = s[i] 651 j++ 652 } 653 return string(b[:j]) 654 } 655 656 // The number of capture values in the program may correspond 657 // to fewer capturing expressions than are in the regexp. 658 // For example, "(a){0}" turns into an empty program, so the 659 // maximum capture in the program is 0 but we need to return 660 // an expression for \1. Pad appends -1s to the slice a as needed. 661 func (re *Regexp) pad(a []int) []int { 662 if a == nil { 663 // No match. 664 return nil 665 } 666 n := (1 + re.numSubexp) * 2 667 for len(a) < n { 668 a = append(a, -1) 669 } 670 return a 671 } 672 673 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 674 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 675 var end int 676 if b == nil { 677 end = len(s) 678 } else { 679 end = len(b) 680 } 681 682 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 683 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 684 if len(matches) == 0 { 685 break 686 } 687 688 accept := true 689 if matches[1] == pos { 690 // We've found an empty match. 691 if matches[0] == prevMatchEnd { 692 // We don't allow an empty match right 693 // after a previous match, so ignore it. 694 accept = false 695 } 696 var width int 697 // TODO: use step() 698 if b == nil { 699 _, width = utf8.DecodeRuneInString(s[pos:end]) 700 } else { 701 _, width = utf8.DecodeRune(b[pos:end]) 702 } 703 if width > 0 { 704 pos += width 705 } else { 706 pos = end + 1 707 } 708 } else { 709 pos = matches[1] 710 } 711 prevMatchEnd = matches[1] 712 713 if accept { 714 deliver(re.pad(matches)) 715 i++ 716 } 717 } 718 } 719 720 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 721 // A return value of nil indicates no match. 722 func (re *Regexp) Find(b []byte) []byte { 723 var dstCap [2]int 724 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 725 if a == nil { 726 return nil 727 } 728 return b[a[0]:a[1]] 729 } 730 731 // FindIndex returns a two-element slice of integers defining the location of 732 // the leftmost match in b of the regular expression. The match itself is at 733 // b[loc[0]:loc[1]]. 734 // A return value of nil indicates no match. 735 func (re *Regexp) FindIndex(b []byte) (loc []int) { 736 a := re.doExecute(nil, b, "", 0, 2, nil) 737 if a == nil { 738 return nil 739 } 740 return a[0:2] 741 } 742 743 // FindString returns a string holding the text of the leftmost match in s of the regular 744 // expression. If there is no match, the return value is an empty string, 745 // but it will also be empty if the regular expression successfully matches 746 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 747 // necessary to distinguish these cases. 748 func (re *Regexp) FindString(s string) string { 749 var dstCap [2]int 750 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 751 if a == nil { 752 return "" 753 } 754 return s[a[0]:a[1]] 755 } 756 757 // FindStringIndex returns a two-element slice of integers defining the 758 // location of the leftmost match in s of the regular expression. The match 759 // itself is at s[loc[0]:loc[1]]. 760 // A return value of nil indicates no match. 761 func (re *Regexp) FindStringIndex(s string) (loc []int) { 762 a := re.doExecute(nil, nil, s, 0, 2, nil) 763 if a == nil { 764 return nil 765 } 766 return a[0:2] 767 } 768 769 // FindReaderIndex returns a two-element slice of integers defining the 770 // location of the leftmost match of the regular expression in text read from 771 // the RuneReader. The match text was found in the input stream at 772 // byte offset loc[0] through loc[1]-1. 773 // A return value of nil indicates no match. 774 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 775 a := re.doExecute(r, nil, "", 0, 2, nil) 776 if a == nil { 777 return nil 778 } 779 return a[0:2] 780 } 781 782 // FindSubmatch returns a slice of slices holding the text of the leftmost 783 // match of the regular expression in b and the matches, if any, of its 784 // subexpressions, as defined by the 'Submatch' descriptions in the package 785 // comment. 786 // A return value of nil indicates no match. 787 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 788 var dstCap [4]int 789 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 790 if a == nil { 791 return nil 792 } 793 ret := make([][]byte, 1+re.numSubexp) 794 for i := range ret { 795 if 2*i < len(a) && a[2*i] >= 0 { 796 ret[i] = b[a[2*i]:a[2*i+1]] 797 } 798 } 799 return ret 800 } 801 802 // Expand appends template to dst and returns the result; during the 803 // append, Expand replaces variables in the template with corresponding 804 // matches drawn from src. The match slice should have been returned by 805 // FindSubmatchIndex. 806 // 807 // In the template, a variable is denoted by a substring of the form 808 // $name or ${name}, where name is a non-empty sequence of letters, 809 // digits, and underscores. A purely numeric name like $1 refers to 810 // the submatch with the corresponding index; other names refer to 811 // capturing parentheses named with the (?P<name>...) syntax. A 812 // reference to an out of range or unmatched index or a name that is not 813 // present in the regular expression is replaced with an empty slice. 814 // 815 // In the $name form, name is taken to be as long as possible: $1x is 816 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 817 // 818 // To insert a literal $ in the output, use $$ in the template. 819 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 820 return re.expand(dst, string(template), src, "", match) 821 } 822 823 // ExpandString is like Expand but the template and source are strings. 824 // It appends to and returns a byte slice in order to give the calling 825 // code control over allocation. 826 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 827 return re.expand(dst, template, nil, src, match) 828 } 829 830 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 831 for len(template) > 0 { 832 i := strings.Index(template, "$") 833 if i < 0 { 834 break 835 } 836 dst = append(dst, template[:i]...) 837 template = template[i:] 838 if len(template) > 1 && template[1] == '$' { 839 // Treat $$ as $. 840 dst = append(dst, '$') 841 template = template[2:] 842 continue 843 } 844 name, num, rest, ok := extract(template) 845 if !ok { 846 // Malformed; treat $ as raw text. 847 dst = append(dst, '$') 848 template = template[1:] 849 continue 850 } 851 template = rest 852 if num >= 0 { 853 if 2*num+1 < len(match) && match[2*num] >= 0 { 854 if bsrc != nil { 855 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 856 } else { 857 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 858 } 859 } 860 } else { 861 for i, namei := range re.subexpNames { 862 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 863 if bsrc != nil { 864 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 865 } else { 866 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 867 } 868 break 869 } 870 } 871 } 872 } 873 dst = append(dst, template...) 874 return dst 875 } 876 877 // extract returns the name from a leading "$name" or "${name}" in str. 878 // If it is a number, extract returns num set to that number; otherwise num = -1. 879 func extract(str string) (name string, num int, rest string, ok bool) { 880 if len(str) < 2 || str[0] != '$' { 881 return 882 } 883 brace := false 884 if str[1] == '{' { 885 brace = true 886 str = str[2:] 887 } else { 888 str = str[1:] 889 } 890 i := 0 891 for i < len(str) { 892 rune, size := utf8.DecodeRuneInString(str[i:]) 893 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 894 break 895 } 896 i += size 897 } 898 if i == 0 { 899 // empty name is not okay 900 return 901 } 902 name = str[:i] 903 if brace { 904 if i >= len(str) || str[i] != '}' { 905 // missing closing brace 906 return 907 } 908 i++ 909 } 910 911 // Parse number. 912 num = 0 913 for i := 0; i < len(name); i++ { 914 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 915 num = -1 916 break 917 } 918 num = num*10 + int(name[i]) - '0' 919 } 920 // Disallow leading zeros. 921 if name[0] == '0' && len(name) > 1 { 922 num = -1 923 } 924 925 rest = str[i:] 926 ok = true 927 return 928 } 929 930 // FindSubmatchIndex returns a slice holding the index pairs identifying the 931 // leftmost match of the regular expression in b and the matches, if any, of 932 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 933 // in the package comment. 934 // A return value of nil indicates no match. 935 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 936 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 937 } 938 939 // FindStringSubmatch returns a slice of strings holding the text of the 940 // leftmost match of the regular expression in s and the matches, if any, of 941 // its subexpressions, as defined by the 'Submatch' description in the 942 // package comment. 943 // A return value of nil indicates no match. 944 func (re *Regexp) FindStringSubmatch(s string) []string { 945 var dstCap [4]int 946 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 947 if a == nil { 948 return nil 949 } 950 ret := make([]string, 1+re.numSubexp) 951 for i := range ret { 952 if 2*i < len(a) && a[2*i] >= 0 { 953 ret[i] = s[a[2*i]:a[2*i+1]] 954 } 955 } 956 return ret 957 } 958 959 // FindStringSubmatchIndex returns a slice holding the index pairs 960 // identifying the leftmost match of the regular expression in s and the 961 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 962 // 'Index' descriptions in the package comment. 963 // A return value of nil indicates no match. 964 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 965 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 966 } 967 968 // FindReaderSubmatchIndex returns a slice holding the index pairs 969 // identifying the leftmost match of the regular expression of text read by 970 // the RuneReader, and the matches, if any, of its subexpressions, as defined 971 // by the 'Submatch' and 'Index' descriptions in the package comment. A 972 // return value of nil indicates no match. 973 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 974 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 975 } 976 977 const startSize = 10 // The size at which to start a slice in the 'All' routines. 978 979 // FindAll is the 'All' version of Find; it returns a slice of all successive 980 // matches of the expression, as defined by the 'All' description in the 981 // package comment. 982 // A return value of nil indicates no match. 983 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 984 if n < 0 { 985 n = len(b) + 1 986 } 987 result := make([][]byte, 0, startSize) 988 re.allMatches("", b, n, func(match []int) { 989 result = append(result, b[match[0]:match[1]]) 990 }) 991 if len(result) == 0 { 992 return nil 993 } 994 return result 995 } 996 997 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 998 // successive matches of the expression, as defined by the 'All' description 999 // in the package comment. 1000 // A return value of nil indicates no match. 1001 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1002 if n < 0 { 1003 n = len(b) + 1 1004 } 1005 result := make([][]int, 0, startSize) 1006 re.allMatches("", b, n, func(match []int) { 1007 result = append(result, match[0:2]) 1008 }) 1009 if len(result) == 0 { 1010 return nil 1011 } 1012 return result 1013 } 1014 1015 // FindAllString is the 'All' version of FindString; it returns a slice of all 1016 // successive matches of the expression, as defined by the 'All' description 1017 // in the package comment. 1018 // A return value of nil indicates no match. 1019 func (re *Regexp) FindAllString(s string, n int) []string { 1020 if n < 0 { 1021 n = len(s) + 1 1022 } 1023 result := make([]string, 0, startSize) 1024 re.allMatches(s, nil, n, func(match []int) { 1025 result = append(result, s[match[0]:match[1]]) 1026 }) 1027 if len(result) == 0 { 1028 return nil 1029 } 1030 return result 1031 } 1032 1033 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1034 // slice of all successive matches of the expression, as defined by the 'All' 1035 // description in the package comment. 1036 // A return value of nil indicates no match. 1037 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1038 if n < 0 { 1039 n = len(s) + 1 1040 } 1041 result := make([][]int, 0, startSize) 1042 re.allMatches(s, nil, n, func(match []int) { 1043 result = append(result, match[0:2]) 1044 }) 1045 if len(result) == 0 { 1046 return nil 1047 } 1048 return result 1049 } 1050 1051 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1052 // of all successive matches of the expression, as defined by the 'All' 1053 // description in the package comment. 1054 // A return value of nil indicates no match. 1055 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1056 if n < 0 { 1057 n = len(b) + 1 1058 } 1059 result := make([][][]byte, 0, startSize) 1060 re.allMatches("", b, n, func(match []int) { 1061 slice := make([][]byte, len(match)/2) 1062 for j := range slice { 1063 if match[2*j] >= 0 { 1064 slice[j] = b[match[2*j]:match[2*j+1]] 1065 } 1066 } 1067 result = append(result, slice) 1068 }) 1069 if len(result) == 0 { 1070 return nil 1071 } 1072 return result 1073 } 1074 1075 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1076 // a slice of all successive matches of the expression, as defined by the 1077 // 'All' description in the package comment. 1078 // A return value of nil indicates no match. 1079 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1080 if n < 0 { 1081 n = len(b) + 1 1082 } 1083 result := make([][]int, 0, startSize) 1084 re.allMatches("", b, n, func(match []int) { 1085 result = append(result, match) 1086 }) 1087 if len(result) == 0 { 1088 return nil 1089 } 1090 return result 1091 } 1092 1093 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1094 // returns a slice of all successive matches of the expression, as defined by 1095 // the 'All' description in the package comment. 1096 // A return value of nil indicates no match. 1097 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1098 if n < 0 { 1099 n = len(s) + 1 1100 } 1101 result := make([][]string, 0, startSize) 1102 re.allMatches(s, nil, n, func(match []int) { 1103 slice := make([]string, len(match)/2) 1104 for j := range slice { 1105 if match[2*j] >= 0 { 1106 slice[j] = s[match[2*j]:match[2*j+1]] 1107 } 1108 } 1109 result = append(result, slice) 1110 }) 1111 if len(result) == 0 { 1112 return nil 1113 } 1114 return result 1115 } 1116 1117 // FindAllStringSubmatchIndex is the 'All' version of 1118 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1119 // the expression, as defined by the 'All' description in the package 1120 // comment. 1121 // A return value of nil indicates no match. 1122 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1123 if n < 0 { 1124 n = len(s) + 1 1125 } 1126 result := make([][]int, 0, startSize) 1127 re.allMatches(s, nil, n, func(match []int) { 1128 result = append(result, match) 1129 }) 1130 if len(result) == 0 { 1131 return nil 1132 } 1133 return result 1134 } 1135 1136 // Split slices s into substrings separated by the expression and returns a slice of 1137 // the substrings between those expression matches. 1138 // 1139 // The slice returned by this method consists of all the substrings of s 1140 // not contained in the slice returned by FindAllString. When called on an expression 1141 // that contains no metacharacters, it is equivalent to strings.SplitN. 1142 // 1143 // Example: 1144 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1145 // // s: ["", "b", "b", "c", "cadaaae"] 1146 // 1147 // The count determines the number of substrings to return: 1148 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 1149 // n == 0: the result is nil (zero substrings) 1150 // n < 0: all substrings 1151 func (re *Regexp) Split(s string, n int) []string { 1152 1153 if n == 0 { 1154 return nil 1155 } 1156 1157 if len(re.expr) > 0 && len(s) == 0 { 1158 return []string{""} 1159 } 1160 1161 matches := re.FindAllStringIndex(s, n) 1162 strings := make([]string, 0, len(matches)) 1163 1164 beg := 0 1165 end := 0 1166 for _, match := range matches { 1167 if n > 0 && len(strings) >= n-1 { 1168 break 1169 } 1170 1171 end = match[0] 1172 if match[1] != 0 { 1173 strings = append(strings, s[beg:end]) 1174 } 1175 beg = match[1] 1176 } 1177 1178 if end != len(s) { 1179 strings = append(strings, s[beg:]) 1180 } 1181 1182 return strings 1183 } 1184