1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package suffixarray 6 7 import ( 8 "bytes" 9 "math/rand" 10 "regexp" 11 "sort" 12 "strings" 13 "testing" 14 ) 15 16 type testCase struct { 17 name string // name of test case 18 source string // source to index 19 patterns []string // patterns to lookup 20 } 21 22 var testCases = []testCase{ 23 { 24 "empty string", 25 "", 26 []string{ 27 "", 28 "foo", 29 "(foo)", 30 ".*", 31 "a*", 32 }, 33 }, 34 35 { 36 "all a's", 37 "aaaaaaaaaa", // 10 a's 38 []string{ 39 "", 40 "a", 41 "aa", 42 "aaa", 43 "aaaa", 44 "aaaaa", 45 "aaaaaa", 46 "aaaaaaa", 47 "aaaaaaaa", 48 "aaaaaaaaa", 49 "aaaaaaaaaa", 50 "aaaaaaaaaaa", // 11 a's 51 ".", 52 ".*", 53 "a+", 54 "aa+", 55 "aaaa[b]?", 56 "aaa*", 57 }, 58 }, 59 60 { 61 "abc", 62 "abc", 63 []string{ 64 "a", 65 "b", 66 "c", 67 "ab", 68 "bc", 69 "abc", 70 "a.c", 71 "a(b|c)", 72 "abc?", 73 }, 74 }, 75 76 { 77 "barbara*3", 78 "barbarabarbarabarbara", 79 []string{ 80 "a", 81 "bar", 82 "rab", 83 "arab", 84 "barbar", 85 "bara?bar", 86 }, 87 }, 88 89 { 90 "typing drill", 91 "Now is the time for all good men to come to the aid of their country.", 92 []string{ 93 "Now", 94 "the time", 95 "to come the aid", 96 "is the time for all good men to come to the aid of their", 97 "to (come|the)?", 98 }, 99 }, 100 101 { 102 "godoc simulation", 103 "package main\n\nimport(\n \"rand\"\n ", 104 []string{}, 105 }, 106 } 107 108 // find all occurrences of s in source; report at most n occurrences 109 func find(src, s string, n int) []int { 110 var res []int 111 if s != "" && n != 0 { 112 // find at most n occurrences of s in src 113 for i := -1; n < 0 || len(res) < n; { 114 j := strings.Index(src[i+1:], s) 115 if j < 0 { 116 break 117 } 118 i += j + 1 119 res = append(res, i) 120 } 121 } 122 return res 123 } 124 125 func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) { 126 res := x.Lookup([]byte(s), n) 127 exp := find(tc.source, s, n) 128 129 // check that the lengths match 130 if len(res) != len(exp) { 131 t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res)) 132 } 133 134 // if n >= 0 the number of results is limited --- unless n >= all results, 135 // we may obtain different positions from the Index and from find (because 136 // Index may not find the results in the same order as find) => in general 137 // we cannot simply check that the res and exp lists are equal 138 139 // check that each result is in fact a correct match and there are no duplicates 140 sort.Ints(res) 141 for i, r := range res { 142 if r < 0 || len(tc.source) <= r { 143 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source)) 144 } else if !strings.HasPrefix(tc.source[r:], s) { 145 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r) 146 } 147 if i > 0 && res[i-1] == r { 148 t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r) 149 } 150 } 151 152 if n < 0 { 153 // all results computed - sorted res and exp must be equal 154 for i, r := range res { 155 e := exp[i] 156 if r != e { 157 t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r) 158 } 159 } 160 } 161 } 162 163 func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) { 164 res := x.FindAllIndex(rx, n) 165 exp := rx.FindAllStringIndex(tc.source, n) 166 167 // check that the lengths match 168 if len(res) != len(exp) { 169 t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res)) 170 } 171 172 // if n >= 0 the number of results is limited --- unless n >= all results, 173 // we may obtain different positions from the Index and from regexp (because 174 // Index may not find the results in the same order as regexp) => in general 175 // we cannot simply check that the res and exp lists are equal 176 177 // check that each result is in fact a correct match and the result is sorted 178 for i, r := range res { 179 if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] { 180 t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1]) 181 } else if !rx.MatchString(tc.source[r[0]:r[1]]) { 182 t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1]) 183 } 184 } 185 186 if n < 0 { 187 // all results computed - sorted res and exp must be equal 188 for i, r := range res { 189 e := exp[i] 190 if r[0] != e[0] || r[1] != e[1] { 191 t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]", 192 tc.name, rx, i, e[0], e[1], r[0], r[1]) 193 } 194 } 195 } 196 } 197 198 func testLookups(t *testing.T, tc *testCase, x *Index, n int) { 199 for _, pat := range tc.patterns { 200 testLookup(t, tc, x, pat, n) 201 if rx, err := regexp.Compile(pat); err == nil { 202 testFindAllIndex(t, tc, x, rx, n) 203 } 204 } 205 } 206 207 // index is used to hide the sort.Interface 208 type index Index 209 210 func (x *index) Len() int { return len(x.sa) } 211 func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } 212 func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } 213 func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } 214 215 func testConstruction(t *testing.T, tc *testCase, x *Index) { 216 if !sort.IsSorted((*index)(x)) { 217 t.Errorf("failed testConstruction %s", tc.name) 218 } 219 } 220 221 func equal(x, y *Index) bool { 222 if !bytes.Equal(x.data, y.data) { 223 return false 224 } 225 for i, j := range x.sa { 226 if j != y.sa[i] { 227 return false 228 } 229 } 230 return true 231 } 232 233 // returns the serialized index size 234 func testSaveRestore(t *testing.T, tc *testCase, x *Index) int { 235 var buf bytes.Buffer 236 if err := x.Write(&buf); err != nil { 237 t.Errorf("failed writing index %s (%s)", tc.name, err) 238 } 239 size := buf.Len() 240 var y Index 241 if err := y.Read(&buf); err != nil { 242 t.Errorf("failed reading index %s (%s)", tc.name, err) 243 } 244 if !equal(x, &y) { 245 t.Errorf("restored index doesn't match saved index %s", tc.name) 246 } 247 return size 248 } 249 250 func TestIndex(t *testing.T) { 251 for _, tc := range testCases { 252 x := New([]byte(tc.source)) 253 testConstruction(t, &tc, x) 254 testSaveRestore(t, &tc, x) 255 testLookups(t, &tc, x, 0) 256 testLookups(t, &tc, x, 1) 257 testLookups(t, &tc, x, 10) 258 testLookups(t, &tc, x, 2e9) 259 testLookups(t, &tc, x, -1) 260 } 261 } 262 263 // Of all possible inputs, the random bytes have the least amount of substring 264 // repetition, and the repeated bytes have the most. For most algorithms, 265 // the running time of every input will be between these two. 266 func benchmarkNew(b *testing.B, random bool) { 267 b.StopTimer() 268 data := make([]byte, 1e6) 269 if random { 270 for i := range data { 271 data[i] = byte(rand.Intn(256)) 272 } 273 } 274 b.StartTimer() 275 for i := 0; i < b.N; i++ { 276 New(data) 277 } 278 } 279 280 func BenchmarkNewIndexRandom(b *testing.B) { 281 benchmarkNew(b, true) 282 } 283 func BenchmarkNewIndexRepeat(b *testing.B) { 284 benchmarkNew(b, false) 285 } 286 287 func BenchmarkSaveRestore(b *testing.B) { 288 b.StopTimer() 289 r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence 290 data := make([]byte, 1<<20) // 1MB of data to index 291 for i := range data { 292 data[i] = byte(r.Intn(256)) 293 } 294 x := New(data) 295 size := testSaveRestore(nil, nil, x) // verify correctness 296 buf := bytes.NewBuffer(make([]byte, size)) // avoid growing 297 b.SetBytes(int64(size)) 298 b.StartTimer() 299 for i := 0; i < b.N; i++ { 300 x.Write(buf) 301 var y Index 302 y.Read(buf) 303 } 304 } 305