1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This algorithm is based on "Faster Suffix Sorting" 6 // by N. Jesper Larsson and Kunihiko Sadakane 7 // paper: http://www.larsson.dogma.net/ssrev-tr.pdf 8 // code: http://www.larsson.dogma.net/qsufsort.c 9 10 // This algorithm computes the suffix array sa by computing its inverse. 11 // Consecutive groups of suffixes in sa are labeled as sorted groups or 12 // unsorted groups. For a given pass of the sorter, all suffixes are ordered 13 // up to their first h characters, and sa is h-ordered. Suffixes in their 14 // final positions and unambiguously sorted in h-order are in a sorted group. 15 // Consecutive groups of suffixes with identical first h characters are an 16 // unsorted group. In each pass of the algorithm, unsorted groups are sorted 17 // according to the group number of their following suffix. 18 19 // In the implementation, if sa[i] is negative, it indicates that i is 20 // the first element of a sorted group of length -sa[i], and can be skipped. 21 // An unsorted group sa[i:k] is given the group number of the index of its 22 // last element, k-1. The group numbers are stored in the inverse slice (inv), 23 // and when all groups are sorted, this slice is the inverse suffix array. 24 25 package suffixarray 26 27 import "sort" 28 29 func qsufsort(data []byte) []int { 30 // initial sorting by first byte of suffix 31 sa := sortedByFirstByte(data) 32 if len(sa) < 2 { 33 return sa 34 } 35 // initialize the group lookup table 36 // this becomes the inverse of the suffix array when all groups are sorted 37 inv := initGroups(sa, data) 38 39 // the index starts 1-ordered 40 sufSortable := &suffixSortable{sa: sa, inv: inv, h: 1} 41 42 for sa[0] > -len(sa) { // until all suffixes are one big sorted group 43 // The suffixes are h-ordered, make them 2*h-ordered 44 pi := 0 // pi is first position of first group 45 sl := 0 // sl is negated length of sorted groups 46 for pi < len(sa) { 47 if s := sa[pi]; s < 0 { // if pi starts sorted group 48 pi -= s // skip over sorted group 49 sl += s // add negated length to sl 50 } else { // if pi starts unsorted group 51 if sl != 0 { 52 sa[pi+sl] = sl // combine sorted groups before pi 53 sl = 0 54 } 55 pk := inv[s] + 1 // pk-1 is last position of unsorted group 56 sufSortable.sa = sa[pi:pk] 57 sort.Sort(sufSortable) 58 sufSortable.updateGroups(pi) 59 pi = pk // next group 60 } 61 } 62 if sl != 0 { // if the array ends with a sorted group 63 sa[pi+sl] = sl // combine sorted groups at end of sa 64 } 65 66 sufSortable.h *= 2 // double sorted depth 67 } 68 69 for i := range sa { // reconstruct suffix array from inverse 70 sa[inv[i]] = i 71 } 72 return sa 73 } 74 75 func sortedByFirstByte(data []byte) []int { 76 // total byte counts 77 var count [256]int 78 for _, b := range data { 79 count[b]++ 80 } 81 // make count[b] equal index of first occurrence of b in sorted array 82 sum := 0 83 for b := range count { 84 count[b], sum = sum, count[b]+sum 85 } 86 // iterate through bytes, placing index into the correct spot in sa 87 sa := make([]int, len(data)) 88 for i, b := range data { 89 sa[count[b]] = i 90 count[b]++ 91 } 92 return sa 93 } 94 95 func initGroups(sa []int, data []byte) []int { 96 // label contiguous same-letter groups with the same group number 97 inv := make([]int, len(data)) 98 prevGroup := len(sa) - 1 99 groupByte := data[sa[prevGroup]] 100 for i := len(sa) - 1; i >= 0; i-- { 101 if b := data[sa[i]]; b < groupByte { 102 if prevGroup == i+1 { 103 sa[i+1] = -1 104 } 105 groupByte = b 106 prevGroup = i 107 } 108 inv[sa[i]] = prevGroup 109 if prevGroup == 0 { 110 sa[0] = -1 111 } 112 } 113 // Separate out the final suffix to the start of its group. 114 // This is necessary to ensure the suffix "a" is before "aba" 115 // when using a potentially unstable sort. 116 lastByte := data[len(data)-1] 117 s := -1 118 for i := range sa { 119 if sa[i] >= 0 { 120 if data[sa[i]] == lastByte && s == -1 { 121 s = i 122 } 123 if sa[i] == len(sa)-1 { 124 sa[i], sa[s] = sa[s], sa[i] 125 inv[sa[s]] = s 126 sa[s] = -1 // mark it as an isolated sorted group 127 break 128 } 129 } 130 } 131 return inv 132 } 133 134 type suffixSortable struct { 135 sa []int 136 inv []int 137 h int 138 buf []int // common scratch space 139 } 140 141 func (x *suffixSortable) Len() int { return len(x.sa) } 142 func (x *suffixSortable) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] } 143 func (x *suffixSortable) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } 144 145 func (x *suffixSortable) updateGroups(offset int) { 146 bounds := x.buf[0:0] 147 group := x.inv[x.sa[0]+x.h] 148 for i := 1; i < len(x.sa); i++ { 149 if g := x.inv[x.sa[i]+x.h]; g > group { 150 bounds = append(bounds, i) 151 group = g 152 } 153 } 154 bounds = append(bounds, len(x.sa)) 155 x.buf = bounds 156 157 // update the group numberings after all new groups are determined 158 prev := 0 159 for _, b := range bounds { 160 for i := prev; i < b; i++ { 161 x.inv[x.sa[i]] = offset + b - 1 162 } 163 if b-prev == 1 { 164 x.sa[prev] = -1 165 } 166 prev = b 167 } 168 } 169