Home | History | Annotate | Download | only in strings
      1 // Copyright 2012 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package strings
      6 
      7 // stringFinder efficiently finds strings in a source text. It's implemented
      8 // using the Boyer-Moore string search algorithm:
      9 // http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
     10 // http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
     11 // document uses 1-based indexing)
     12 type stringFinder struct {
     13 	// pattern is the string that we are searching for in the text.
     14 	pattern string
     15 
     16 	// badCharSkip[b] contains the distance between the last byte of pattern
     17 	// and the rightmost occurrence of b in pattern. If b is not in pattern,
     18 	// badCharSkip[b] is len(pattern).
     19 	//
     20 	// Whenever a mismatch is found with byte b in the text, we can safely
     21 	// shift the matching frame at least badCharSkip[b] until the next time
     22 	// the matching char could be in alignment.
     23 	badCharSkip [256]int
     24 
     25 	// goodSuffixSkip[i] defines how far we can shift the matching frame given
     26 	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
     27 	// not. There are two cases to consider:
     28 	//
     29 	// 1. The matched suffix occurs elsewhere in pattern (with a different
     30 	// byte preceding it that we might possibly match). In this case, we can
     31 	// shift the matching frame to align with the next suffix chunk. For
     32 	// example, the pattern "mississi" has the suffix "issi" next occurring
     33 	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
     34 	// shift+len(suffix) == 3+4 == 7.
     35 	//
     36 	// 2. If the matched suffix does not occur elsewhere in pattern, then the
     37 	// matching frame may share part of its prefix with the end of the
     38 	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
     39 	// to shift the frame to align this portion of the prefix to the
     40 	// suffix. For example, in the pattern "abcxxxabc", when the first
     41 	// mismatch from the back is found to be in position 3, the matching
     42 	// suffix "xxabc" is not found elsewhere in the pattern. However, its
     43 	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
     44 	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
     45 	goodSuffixSkip []int
     46 }
     47 
     48 func makeStringFinder(pattern string) *stringFinder {
     49 	f := &stringFinder{
     50 		pattern:        pattern,
     51 		goodSuffixSkip: make([]int, len(pattern)),
     52 	}
     53 	// last is the index of the last character in the pattern.
     54 	last := len(pattern) - 1
     55 
     56 	// Build bad character table.
     57 	// Bytes not in the pattern can skip one pattern's length.
     58 	for i := range f.badCharSkip {
     59 		f.badCharSkip[i] = len(pattern)
     60 	}
     61 	// The loop condition is < instead of <= so that the last byte does not
     62 	// have a zero distance to itself. Finding this byte out of place implies
     63 	// that it is not in the last position.
     64 	for i := 0; i < last; i++ {
     65 		f.badCharSkip[pattern[i]] = last - i
     66 	}
     67 
     68 	// Build good suffix table.
     69 	// First pass: set each value to the next index which starts a prefix of
     70 	// pattern.
     71 	lastPrefix := last
     72 	for i := last; i >= 0; i-- {
     73 		if HasPrefix(pattern, pattern[i+1:]) {
     74 			lastPrefix = i + 1
     75 		}
     76 		// lastPrefix is the shift, and (last-i) is len(suffix).
     77 		f.goodSuffixSkip[i] = lastPrefix + last - i
     78 	}
     79 	// Second pass: find repeats of pattern's suffix starting from the front.
     80 	for i := 0; i < last; i++ {
     81 		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
     82 		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
     83 			// (last-i) is the shift, and lenSuffix is len(suffix).
     84 			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
     85 		}
     86 	}
     87 
     88 	return f
     89 }
     90 
     91 func longestCommonSuffix(a, b string) (i int) {
     92 	for ; i < len(a) && i < len(b); i++ {
     93 		if a[len(a)-1-i] != b[len(b)-1-i] {
     94 			break
     95 		}
     96 	}
     97 	return
     98 }
     99 
    100 // next returns the index in text of the first occurrence of the pattern. If
    101 // the pattern is not found, it returns -1.
    102 func (f *stringFinder) next(text string) int {
    103 	i := len(f.pattern) - 1
    104 	for i < len(text) {
    105 		// Compare backwards from the end until the first unmatching character.
    106 		j := len(f.pattern) - 1
    107 		for j >= 0 && text[i] == f.pattern[j] {
    108 			i--
    109 			j--
    110 		}
    111 		if j < 0 {
    112 			return i + 1 // match
    113 		}
    114 		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
    115 	}
    116 	return -1
    117 }
    118 
    119 func max(a, b int) int {
    120 	if a > b {
    121 		return a
    122 	}
    123 	return b
    124 }
    125