1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package suffixarray implements substring search in logarithmic time using 6 // an in-memory suffix array. 7 // 8 // Example use: 9 // 10 // // create index for some data 11 // index := suffixarray.New(data) 12 // 13 // // lookup byte slice s 14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data 15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data 16 // 17 package suffixarray 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "io" 23 "regexp" 24 "sort" 25 ) 26 27 // Index implements a suffix array for fast substring search. 28 type Index struct { 29 data []byte 30 sa []int // suffix array for data; len(sa) == len(data) 31 } 32 33 // New creates a new Index for data. 34 // Index creation time is O(N*log(N)) for N = len(data). 35 func New(data []byte) *Index { 36 return &Index{data, qsufsort(data)} 37 } 38 39 // writeInt writes an int x to w using buf to buffer the write. 40 func writeInt(w io.Writer, buf []byte, x int) error { 41 binary.PutVarint(buf, int64(x)) 42 _, err := w.Write(buf[0:binary.MaxVarintLen64]) 43 return err 44 } 45 46 // readInt reads an int x from r using buf to buffer the read and returns x. 47 func readInt(r io.Reader, buf []byte) (int, error) { 48 _, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error 49 x, _ := binary.Varint(buf) 50 return int(x), err 51 } 52 53 // writeSlice writes data[:n] to w and returns n. 54 // It uses buf to buffer the write. 55 func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) { 56 // encode as many elements as fit into buf 57 p := binary.MaxVarintLen64 58 for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ { 59 p += binary.PutUvarint(buf[p:], uint64(data[n])) 60 } 61 62 // update buffer size 63 binary.PutVarint(buf, int64(p)) 64 65 // write buffer 66 _, err = w.Write(buf[0:p]) 67 return 68 } 69 70 // readSlice reads data[:n] from r and returns n. 71 // It uses buf to buffer the read. 72 func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) { 73 // read buffer size 74 var size int 75 size, err = readInt(r, buf) 76 if err != nil { 77 return 78 } 79 80 // read buffer w/o the size 81 if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil { 82 return 83 } 84 85 // decode as many elements as present in buf 86 for p := binary.MaxVarintLen64; p < size; n++ { 87 x, w := binary.Uvarint(buf[p:]) 88 data[n] = int(x) 89 p += w 90 } 91 92 return 93 } 94 95 const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore 96 97 // Read reads the index from r into x; x must not be nil. 98 func (x *Index) Read(r io.Reader) error { 99 // buffer for all reads 100 buf := make([]byte, bufSize) 101 102 // read length 103 n, err := readInt(r, buf) 104 if err != nil { 105 return err 106 } 107 108 // allocate space 109 if 2*n < cap(x.data) || cap(x.data) < n { 110 // new data is significantly smaller or larger then 111 // existing buffers - allocate new ones 112 x.data = make([]byte, n) 113 x.sa = make([]int, n) 114 } else { 115 // re-use existing buffers 116 x.data = x.data[0:n] 117 x.sa = x.sa[0:n] 118 } 119 120 // read data 121 if _, err := io.ReadFull(r, x.data); err != nil { 122 return err 123 } 124 125 // read index 126 for sa := x.sa; len(sa) > 0; { 127 n, err := readSlice(r, buf, sa) 128 if err != nil { 129 return err 130 } 131 sa = sa[n:] 132 } 133 return nil 134 } 135 136 // Write writes the index x to w. 137 func (x *Index) Write(w io.Writer) error { 138 // buffer for all writes 139 buf := make([]byte, bufSize) 140 141 // write length 142 if err := writeInt(w, buf, len(x.data)); err != nil { 143 return err 144 } 145 146 // write data 147 if _, err := w.Write(x.data); err != nil { 148 return err 149 } 150 151 // write index 152 for sa := x.sa; len(sa) > 0; { 153 n, err := writeSlice(w, buf, sa) 154 if err != nil { 155 return err 156 } 157 sa = sa[n:] 158 } 159 return nil 160 } 161 162 // Bytes returns the data over which the index was created. 163 // It must not be modified. 164 // 165 func (x *Index) Bytes() []byte { 166 return x.data 167 } 168 169 func (x *Index) at(i int) []byte { 170 return x.data[x.sa[i]:] 171 } 172 173 // lookupAll returns a slice into the matching region of the index. 174 // The runtime is O(log(N)*len(s)). 175 func (x *Index) lookupAll(s []byte) []int { 176 // find matching suffix index range [i:j] 177 // find the first index where s would be the prefix 178 i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 }) 179 // starting at i, find the first index at which s is not a prefix 180 j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) }) 181 return x.sa[i:j] 182 } 183 184 // Lookup returns an unsorted list of at most n indices where the byte string s 185 // occurs in the indexed data. If n < 0, all occurrences are returned. 186 // The result is nil if s is empty, s is not found, or n == 0. 187 // Lookup time is O(log(N)*len(s) + len(result)) where N is the 188 // size of the indexed data. 189 // 190 func (x *Index) Lookup(s []byte, n int) (result []int) { 191 if len(s) > 0 && n != 0 { 192 matches := x.lookupAll(s) 193 if n < 0 || len(matches) < n { 194 n = len(matches) 195 } 196 // 0 <= n <= len(matches) 197 if n > 0 { 198 result = make([]int, n) 199 copy(result, matches) 200 } 201 } 202 return 203 } 204 205 // FindAllIndex returns a sorted list of non-overlapping matches of the 206 // regular expression r, where a match is a pair of indices specifying 207 // the matched slice of x.Bytes(). If n < 0, all matches are returned 208 // in successive order. Otherwise, at most n matches are returned and 209 // they may not be successive. The result is nil if there are no matches, 210 // or if n == 0. 211 // 212 func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) { 213 // a non-empty literal prefix is used to determine possible 214 // match start indices with Lookup 215 prefix, complete := r.LiteralPrefix() 216 lit := []byte(prefix) 217 218 // worst-case scenario: no literal prefix 219 if prefix == "" { 220 return r.FindAllIndex(x.data, n) 221 } 222 223 // if regexp is a literal just use Lookup and convert its 224 // result into match pairs 225 if complete { 226 // Lookup returns indices that may belong to overlapping matches. 227 // After eliminating them, we may end up with fewer than n matches. 228 // If we don't have enough at the end, redo the search with an 229 // increased value n1, but only if Lookup returned all the requested 230 // indices in the first place (if it returned fewer than that then 231 // there cannot be more). 232 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { 233 indices := x.Lookup(lit, n1) 234 if len(indices) == 0 { 235 return 236 } 237 sort.Ints(indices) 238 pairs := make([]int, 2*len(indices)) 239 result = make([][]int, len(indices)) 240 count := 0 241 prev := 0 242 for _, i := range indices { 243 if count == n { 244 break 245 } 246 // ignore indices leading to overlapping matches 247 if prev <= i { 248 j := 2 * count 249 pairs[j+0] = i 250 pairs[j+1] = i + len(lit) 251 result[count] = pairs[j : j+2] 252 count++ 253 prev = i + len(lit) 254 } 255 } 256 result = result[0:count] 257 if len(result) >= n || len(indices) != n1 { 258 // found all matches or there's no chance to find more 259 // (n and n1 can be negative) 260 break 261 } 262 } 263 if len(result) == 0 { 264 result = nil 265 } 266 return 267 } 268 269 // regexp has a non-empty literal prefix; Lookup(lit) computes 270 // the indices of possible complete matches; use these as starting 271 // points for anchored searches 272 // (regexp "^" matches beginning of input, not beginning of line) 273 r = regexp.MustCompile("^" + r.String()) // compiles because r compiled 274 275 // same comment about Lookup applies here as in the loop above 276 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { 277 indices := x.Lookup(lit, n1) 278 if len(indices) == 0 { 279 return 280 } 281 sort.Ints(indices) 282 result = result[0:0] 283 prev := 0 284 for _, i := range indices { 285 if len(result) == n { 286 break 287 } 288 m := r.FindIndex(x.data[i:]) // anchored search - will not run off 289 // ignore indices leading to overlapping matches 290 if m != nil && prev <= i { 291 m[0] = i // correct m 292 m[1] += i 293 result = append(result, m) 294 prev = m[1] 295 } 296 } 297 if len(result) >= n || len(indices) != n1 { 298 // found all matches or there's no chance to find more 299 // (n and n1 can be negative) 300 break 301 } 302 } 303 if len(result) == 0 { 304 result = nil 305 } 306 return 307 } 308