/*
 * divsufsort.c for libdivsufsort
 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "divsufsort_private.h"
#ifdef _OPENMP
# include <omp.h>
#endif


/*- Private Functions -*/

/* Sorts suffixes of type B*.
 *
 * Classifies every suffix of T while scanning the text from right to left,
 * counts the suffixes per bucket, sorts the type B* suffixes (via sssort and
 * trsort) and finally moves them to their bucket positions inside SA.
 *
 *   T         input text; T[0..n-1] is read (T[n - 1] unconditionally, so the
 *             caller must pass n >= 1).
 *   SA        work/output array indexed in the range 0..n-1.
 *   bucket_A  array of BUCKET_A_SIZE entries, cleared and filled here.
 *   bucket_B  array of BUCKET_B_SIZE entries, cleared and filled here.
 *   n         length of T.
 *
 * Returns m, the number of type B* suffixes found.
 *
 * NOTE(review): BUCKET_A / BUCKET_B / BUCKET_BSTAR, ALPHABET_SIZE, sssort and
 * trsort are not defined in this file (presumably in divsufsort_private.h);
 * remarks about them below describe only how they are used here.
 */
static
saidx_t
sort_typeBstar(const sauchar_t *T, saidx_t *SA,
               saidx_t *bucket_A, saidx_t *bucket_B,
               saidx_t n) {
  saidx_t *PAb, *ISAb, *buf;
#ifdef _OPENMP
  saidx_t *curbuf;
  saidx_t l;
#endif
  saidx_t i, j, k, t, m, bufsize;
  saint_t c0, c1;
#ifdef _OPENMP
  saint_t d0, d1;
  int tmp;
#endif

  /* Initialize bucket arrays. */
  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }

  /* Count the number of occurrences of the first one or two characters of each
     type A, B and B* suffix. Moreover, store the beginning position of all
     type B* suffixes into the array SA. */
  /* The scan runs from the end of T towards the front.  m starts at n and is
     decremented for every type B* suffix, so the positions are collected in
     SA[m..n-1] with the leftmost text position at the lowest index. */
  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
    /* type A suffix. */
    /* Keep counting as type A while the character is not smaller than the
       character to its right. */
    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
    if(0 <= i) {
      /* type B* suffix. */
      /* First character smaller than its right neighbour after a type A run. */
      ++BUCKET_BSTAR(c0, c1);
      SA[--m] = i;
      /* type B suffix. */
      /* Continue as type B while the character is not larger than the
         character to its right. */
      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
        ++BUCKET_B(c0, c1);
      }
    }
  }
  /* Convert the fill pointer into the number of type B* suffixes. */
  m = n - m;
/*
note:
  A type B* suffix is lexicographically smaller than a type B suffix that
  begins with the same first two characters.
*/

  /* Calculate the index of start/end point of each bucket. */
  /* i accumulates the type A and type B counts, j accumulates the type B*
     counts.  BUCKET_A(c0) becomes a start offset, BUCKET_BSTAR(c0, c1) becomes
     the end offset (exclusive) of that B* bucket inside SA[0..m-1]. */
  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
    t = i + BUCKET_A(c0);
    BUCKET_A(c0) = i + j; /* start point */
    i = t + BUCKET_B(c0, c0);
    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
      j += BUCKET_BSTAR(c0, c1);
      BUCKET_BSTAR(c0, c1) = j; /* end point */
      i += BUCKET_B(c0, c1);
    }
  }

  if(0 < m) {
    /* Sort the type B* suffixes by their first two characters. */
    /* PAb points at the collected B* text positions (SA[n-m..n-1]); ISAb is
       the m-entry area that starts right after SA[0..m-1].  SA[0..m-1] is
       filled with indices into PAb, each bucket from its end downwards. */
    PAb = SA + n - m; ISAb = SA + m;
    for(i = m - 2; 0 <= i; --i) {
      t = PAb[i], c0 = T[t], c1 = T[t + 1];
      SA[--BUCKET_BSTAR(c0, c1)] = i;
    }
    /* Index m - 1 (the rightmost B* position in T) is inserted last, so it
       ends up in the first slot of its bucket; the sssort calls below test
       for exactly that with `*(SA + i) == (m - 1)`. */
    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;

    /* Sort the type B* substrings using sssort. */
#ifdef _OPENMP
    /* The free area SA[m..n-m-1] is split into one scratch buffer of bufsize
       entries per thread (thread count taken from omp_get_max_threads()). */
    tmp = omp_get_max_threads();
    buf = SA + m, bufsize = (n - (2 * m)) / tmp;
    /* c0, c1 and j are shared: together they form the cursor to the next
       bucket that has not been handed out yet. */
    c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
    {
      tmp = omp_get_thread_num();
      curbuf = buf + tmp * bufsize;
      k = 0;
      for(;;) {
        /* Claim the next range [k, l) under the lock: walk the (d0, d1)
           bucket pairs downwards, skipping ranges with at most one element,
           then publish the advanced cursor. */
        #pragma omp critical(sssort_lock)
        {
          if(0 < (l = j)) {
            d0 = c0, d1 = c1;
            do {
              k = BUCKET_BSTAR(d0, d1);
              if(--d1 <= d0) {
                d1 = ALPHABET_SIZE - 1;
                if(--d0 < 0) { break; }
              }
            } while(((l - k) <= 1) && (0 < (l = k)));
            c0 = d0, c1 = d1, j = k;
          }
        }
        /* l == 0 means no work was left when this thread took the lock. */
        if(l == 0) { break; }
        sssort(T, PAb, SA + k, SA + l,
               curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
      }
    }
#else
    /* Single-threaded variant: the whole free area SA[m..n-m-1] is the
       scratch buffer, and buckets are visited from the last one downwards.
       j is the end (exclusive) and i the start of the current bucket. */
    buf = SA + m, bufsize = n - (2 * m);
    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
        i = BUCKET_BSTAR(c0, c1);
        /* Buckets with fewer than two entries need no sorting. */
        if(1 < (j - i)) {
          sssort(T, PAb, SA + i, SA + j,
                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
        }
      }
    }
#endif

    /* Compute ranks of type B* substrings. */
    /* SA[0..m-1] is walked from right to left.  A run of non-negative entries
       gets one distinct rank per entry (its own index), and the leftmost slot
       of that run is then overwritten with the negated run length (i - j).
       A run of negative entries is restored with ~ and, together with the
       non-negative entry that ends the run on the left, shares the rank j of
       the run's rightmost slot.
       NOTE(review): presumably sssort leaves entries complemented to mark
       substrings equal to their left neighbour, and trsort interprets the
       negative run lengths -- confirm against sssort/trsort. */
    for(i = m - 1; 0 <= i; --i) {
      if(0 <= SA[i]) {
        j = i;
        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
        SA[i + 1] = i - j;
        if(i <= 0) { break; }
      }
      j = i;
      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
      ISAb[SA[i]] = j;
    }

    /* Construct the inverse suffix array of type B* suffixes using trsort. */
    trsort(ISAb, SA, m, 1);

    /* Set the sorted order of type B* suffixes. */
    /* T is scanned right to left again with the same type A / type B loop
       conditions as the counting pass, so the B* positions t are met in the
       same order (j counts down from m).  SA[ISAb[j]] receives t; the value is
       stored complemented (~t) when t != 0 and the inner loop did not advance,
       i.e. when the suffix at t - 1 failed the type B test. */
    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
      if(0 <= i) {
        t = i;
        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
      }
    }

    /* Calculate the index of start/end point of each bucket. */
    /* Buckets are processed from the last character downwards.  k is the
       index of the last B* entry in SA[0..m-1] that has not been moved yet;
       i is the current write position inside the full array. */
    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
      i = BUCKET_A(c0 + 1) - 1;
      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
        t = i - BUCKET_B(c0, c1);
        BUCKET_B(c0, c1) = i; /* end point */

        /* Move all type B* suffixes to the correct position. */
        /* Copies SA[j..k] so that the block ends at index t. */
        for(i = t, j = BUCKET_BSTAR(c0, c1);
            j <= k;
            --i, --k) { SA[i] = SA[k]; }
      }
      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
      BUCKET_B(c0, c0) = i; /* end point */
    }
  }

  return m;
}

/* Constructs the suffix array by using the sorted order of type B* suffixes.
 *
 *   T         input text of length n; T[n - 2] is read, so n >= 2 is required.
 *   SA        array prepared by sort_typeBstar; completed in place.
 *   bucket_A  bucket offsets prepared by sort_typeBstar (updated here).
 *   bucket_B  bucket offsets prepared by sort_typeBstar (updated here).
 *   n         length of T.
 *   m         number of type B* suffixes; the first pass is skipped when 0.
 *
 * Encoding used during the two passes: a positive entry s still has to
 * contribute its predecessor s - 1; a bit-complemented (negative) entry is
 * skipped by the current pass and restored with ~.
 */
static
void
construct_SA(const sauchar_t *T, saidx_t *SA,
             saidx_t *bucket_A, saidx_t *bucket_B,
             saidx_t n, saidx_t m) {
  saidx_t *i, *j, *k;
  saidx_t s;
  saint_t c0, c1, c2;

  if(0 < m) {
    /* Construct the sorted order of type B suffixes by using
       the sorted order of type B* suffixes. */
    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
      /* Scan the suffix array from right to left. */
      /* i..j is the range scanned for first character c1; k is the write
         cursor of the BUCKET_B(c2, c1) bucket and c2 the first character
         that k currently belongs to (-1 = none selected yet). */
      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
          i <= j;
          --j) {
        if(0 < (s = *j)) {
          assert(T[s] == c1);
          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
          assert(T[s - 1] <= T[s]);
          /* Mark this slot as processed; the left-to-right pass flips it
             back to s. */
          *j = ~s;
          c0 = T[--s];
          /* s now names the preceding suffix.  If the character before it is
             larger, store it complemented so this pass skips it later. */
          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
          if(c0 != c2) {
            /* Switching buckets: save the old cursor, load the new one. */
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
          assert(k < j);
          *k-- = s;
        } else {
          assert(((s == 0) && (T[s] == c1)) || (s < 0));
          /* Complemented entries become positive again (and 0 becomes -1) for
             the left-to-right pass. */
          *j = ~s;
        }
      }
    }
  }

  /* Construct the suffix array by using
     the sorted order of type B suffixes. */
  /* Seed: suffix n - 1 goes to the start of its BUCKET_A bucket; it is stored
     complemented when T[n - 2] is smaller than T[n - 1]. */
  k = SA + BUCKET_A(c2 = T[n - 1]);
  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
  /* Scan the suffix array from left to right. */
  for(i = SA, j = SA + n; i < j; ++i) {
    if(0 < (s = *i)) {
      assert(T[s - 1] >= T[s]);
      c0 = T[--s];
      /* Store the predecessor complemented when it is suffix 0 or when the
         character before it is smaller, so that it is not expanded again. */
      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
      if(c0 != c2) {
        /* Switching buckets: save the old cursor, load the new one. */
        BUCKET_A(c2) = k - SA;
        k = SA + BUCKET_A(c2 = c0);
      }
      assert(i < k);
      *k++ = s;
    } else {
      assert(s < 0);
      /* Restore the real suffix index. */
      *i = ~s;
    }
  }
}

/* Constructs the burrows-wheeler transformed string directly
   by using the sorted order of type B* suffixes.
 *
 * Same two-pass scheme as construct_SA, but visited slots are overwritten
 * with the character preceding the suffix instead of the suffix index.
 *
 *   T         input text of length n; T[n - 2] is read, so n >= 2 is required.
 *   SA        array prepared by sort_typeBstar; overwritten in place.
 *   bucket_A  bucket offsets prepared by sort_typeBstar (updated here).
 *   bucket_B  bucket offsets prepared by sort_typeBstar (updated here).
 *   n         length of T.
 *   m         number of type B* suffixes; the first pass is skipped when 0.
 *
 * Returns the index (relative to SA) of the slot at which the left-to-right
 * pass read the value 0; that slot is left unchanged.  If no such slot is
 * met the initial value 0 is returned.
 */
static
saidx_t
construct_BWT(const sauchar_t *T, saidx_t *SA,
              saidx_t *bucket_A, saidx_t *bucket_B,
              saidx_t n, saidx_t m) {
  saidx_t *i, *j, *k, *orig;
  saidx_t s;
  saint_t c0, c1, c2;

  if(0 < m) {
    /* Construct the sorted order of type B suffixes by using
       the sorted order of type B* suffixes. */
    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
      /* Scan the suffix array from right to left. */
      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
          i <= j;
          --j) {
        if(0 < (s = *j)) {
          assert(T[s] == c1);
          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
          assert(T[s - 1] <= T[s]);
          c0 = T[--s];
          /* Replace the slot by the complemented preceding character; the
             left-to-right pass flips it back to the plain character. */
          *j = ~((saidx_t)c0);
          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
          if(c0 != c2) {
            /* Switching buckets: save the old cursor, load the new one. */
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
          assert(k < j);
          *k-- = s;
        } else if(s != 0) {
          /* Complemented entries become positive again; a 0 entry is left
             untouched. */
          *j = ~s;
#ifndef NDEBUG
        } else {
          assert(T[s] == c1);
#endif
        }
      }
    }
  }

  /* Construct the BWTed string by using
     the sorted order of type B suffixes. */
  /* Seed for suffix n - 1: either the complemented preceding character
     (when T[n - 2] < T[n - 1]) or the index n - 1 itself. */
  k = SA + BUCKET_A(c2 = T[n - 1]);
  *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
  /* Scan the suffix array from left to right. */
  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
    if(0 < (s = *i)) {
      assert(T[s - 1] >= T[s]);
      c0 = T[--s];
      /* The slot now holds the character preceding the suffix. */
      *i = c0;
      /* If the predecessor would not be expanded again, store its own
         preceding character in complemented form instead of its index. */
      if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
      if(c0 != c2) {
        /* Switching buckets: save the old cursor, load the new one. */
        BUCKET_A(c2) = k - SA;
        k = SA + BUCKET_A(c2 = c0);
      }
      assert(i < k);
      *k++ = s;
    } else if(s != 0) {
      /* Complemented character: restore it. */
      *i = ~s;
    } else {
      /* Value 0: remember this slot; it is what the function returns. */
      orig = i;
    }
  }

  return orig - SA;
}


/*---------------------------------------------------------------------------*/

/*- Function -*/

/* Constructs the suffix array of T[0..n-1] in SA[0..n-1].
 *
 *   T   input text (must not be NULL).
 *   SA  output array (must not be NULL).
 *   n   length of T (must not be negative).
 *
 * Returns 0 on success, -1 if an argument is invalid, and -2 if one of the
 * temporary bucket arrays could not be allocated.  Lengths 0, 1 and 2 are
 * answered directly without allocating anything.
 */
saint_t
divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
  saidx_t *bucket_A, *bucket_B;
  saidx_t m;
  saint_t err = 0;

  /* Check arguments. */
  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
  else if(n == 0) { return 0; }
  else if(n == 1) { SA[0] = 0; return 0; }
  /* n == 2: suffix 0 sorts first only when T[0] < T[1]. */
  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }

  bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
  bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));

  /* Suffixsort. */
  if((bucket_A != NULL) && (bucket_B != NULL)) {
    m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
    construct_SA(T, SA, bucket_A, bucket_B, n, m);
  } else {
    err = -2;
  }

  /* free(NULL) is a no-op, so both pointers can be released unconditionally. */
  free(bucket_B);
  free(bucket_A);

  return err;
}

/* Computes the Burrows-Wheeler transform of T[0..n-1] into U[0..n-1].
 *
 *   T  input text (must not be NULL).
 *   U  output string (must not be NULL).
 *   A  optional work array; when NULL a temporary array of n + 1 saidx_t
 *      entries is allocated and released before returning.
 *      NOTE(review): the capacity required of a caller-supplied A is not
 *      visible in this file -- confirm against the public header.
 *   n  length of T (must not be negative).
 *
 * Returns -1 if an argument is invalid and -2 if an allocation failed.  For
 * n <= 1 it returns n (copying T[0] to U[0] when n == 1).  Otherwise it
 * returns the index reported by construct_BWT plus one.
 */
saidx_t
divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
  saidx_t *B;
  saidx_t *bucket_A, *bucket_B;
  saidx_t m, pidx, i;

  /* Check arguments. */
  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }

  if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
  bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
  bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));

  /* Burrows-Wheeler Transform. */
  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
    m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
    pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);

    /* Copy to output string. */
    /* U[0] is the last character of T.  B[0..pidx-1] is shifted one position
       to the right, B[pidx] itself is skipped, and B[pidx+1..n-1] is copied
       to the same indices. */
    U[0] = T[n - 1];
    for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
    for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
    pidx += 1;
  } else {
    pidx = -2;
  }

  free(bucket_B);
  free(bucket_A);
  /* Only release the work array when it was allocated here. */
  if(A == NULL) { free(B); }

  return pidx;
}

/* Returns the library version string (the PROJECT_VERSION_FULL macro, which
   is defined outside this file). */
const char *
divsufsort_version(void) {
  return PROJECT_VERSION_FULL;
}