/*
 * divsufsort.c for libdivsufsort
 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
     26 
     27 #include "divsufsort_private.h"
     28 #ifdef _OPENMP
     29 # include <omp.h>
     30 #endif
     31 
     32 
/*- Private Functions -*/
     34 
     35 /* Sorts suffixes of type B*. */
     36 static
     37 saidx_t
     38 sort_typeBstar(const sauchar_t *T, saidx_t *SA,
     39                saidx_t *bucket_A, saidx_t *bucket_B,
     40                saidx_t n) {
     41   saidx_t *PAb, *ISAb, *buf;
     42 #ifdef _OPENMP
     43   saidx_t *curbuf;
     44   saidx_t l;
     45 #endif
     46   saidx_t i, j, k, t, m, bufsize;
     47   saint_t c0, c1;
     48 #ifdef _OPENMP
     49   saint_t d0, d1;
     50   int tmp;
     51 #endif
     52 
     53   /* Initialize bucket arrays. */
     54   for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
     55   for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
     56 
     57   /* Count the number of occurrences of the first one or two characters of each
     58      type A, B and B* suffix. Moreover, store the beginning position of all
     59      type B* suffixes into the array SA. */
     60   for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
     61     /* type A suffix. */
     62     do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
     63     if(0 <= i) {
     64       /* type B* suffix. */
     65       ++BUCKET_BSTAR(c0, c1);
     66       SA[--m] = i;
     67       /* type B suffix. */
     68       for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
     69         ++BUCKET_B(c0, c1);
     70       }
     71     }
     72   }
     73   m = n - m;
     74 /*
     75 note:
     76   A type B* suffix is lexicographically smaller than a type B suffix that
     77   begins with the same first two characters.
     78 */
     79 
     80   /* Calculate the index of start/end point of each bucket. */
     81   for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
     82     t = i + BUCKET_A(c0);
     83     BUCKET_A(c0) = i + j; /* start point */
     84     i = t + BUCKET_B(c0, c0);
     85     for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
     86       j += BUCKET_BSTAR(c0, c1);
     87       BUCKET_BSTAR(c0, c1) = j; /* end point */
     88       i += BUCKET_B(c0, c1);
     89     }
     90   }
     91 
     92   if(0 < m) {
     93     /* Sort the type B* suffixes by their first two characters. */
     94     PAb = SA + n - m; ISAb = SA + m;
     95     for(i = m - 2; 0 <= i; --i) {
     96       t = PAb[i], c0 = T[t], c1 = T[t + 1];
     97       SA[--BUCKET_BSTAR(c0, c1)] = i;
     98     }
     99     t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
    100     SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
    101 
    102     /* Sort the type B* substrings using sssort. */
    103 #ifdef _OPENMP
    104     tmp = omp_get_max_threads();
    105     buf = SA + m, bufsize = (n - (2 * m)) / tmp;
    106     c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
    107 #pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
    108     {
    109       tmp = omp_get_thread_num();
    110       curbuf = buf + tmp * bufsize;
    111       k = 0;
    112       for(;;) {
    113         #pragma omp critical(sssort_lock)
    114         {
    115           if(0 < (l = j)) {
    116             d0 = c0, d1 = c1;
    117             do {
    118               k = BUCKET_BSTAR(d0, d1);
    119               if(--d1 <= d0) {
    120                 d1 = ALPHABET_SIZE - 1;
    121                 if(--d0 < 0) { break; }
    122               }
    123             } while(((l - k) <= 1) && (0 < (l = k)));
    124             c0 = d0, c1 = d1, j = k;
    125           }
    126         }
    127         if(l == 0) { break; }
    128         sssort(T, PAb, SA + k, SA + l,
    129                curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
    130       }
    131     }
    132 #else
    133     buf = SA + m, bufsize = n - (2 * m);
    134     for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
    135       for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
    136         i = BUCKET_BSTAR(c0, c1);
    137         if(1 < (j - i)) {
    138           sssort(T, PAb, SA + i, SA + j,
    139                  buf, bufsize, 2, n, *(SA + i) == (m - 1));
    140         }
    141       }
    142     }
    143 #endif
    144 
    145     /* Compute ranks of type B* substrings. */
    146     for(i = m - 1; 0 <= i; --i) {
    147       if(0 <= SA[i]) {
    148         j = i;
    149         do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
    150         SA[i + 1] = i - j;
    151         if(i <= 0) { break; }
    152       }
    153       j = i;
    154       do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
    155       ISAb[SA[i]] = j;
    156     }
    157 
    158     /* Construct the inverse suffix array of type B* suffixes using trsort. */
    159     trsort(ISAb, SA, m, 1);
    160 
    161     /* Set the sorted order of tyoe B* suffixes. */
    162     for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
    163       for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
    164       if(0 <= i) {
    165         t = i;
    166         for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
    167         SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
    168       }
    169     }
    170 
    171     /* Calculate the index of start/end point of each bucket. */
    172     BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
    173     for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
    174       i = BUCKET_A(c0 + 1) - 1;
    175       for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
    176         t = i - BUCKET_B(c0, c1);
    177         BUCKET_B(c0, c1) = i; /* end point */
    178 
    179         /* Move all type B* suffixes to the correct position. */
    180         for(i = t, j = BUCKET_BSTAR(c0, c1);
    181             j <= k;
    182             --i, --k) { SA[i] = SA[k]; }
    183       }
    184       BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
    185       BUCKET_B(c0, c0) = i; /* end point */
    186     }
    187   }
    188 
    189   return m;
    190 }
    191 
    192 /* Constructs the suffix array by using the sorted order of type B* suffixes. */
    193 static
    194 void
    195 construct_SA(const sauchar_t *T, saidx_t *SA,
    196              saidx_t *bucket_A, saidx_t *bucket_B,
    197              saidx_t n, saidx_t m) {
    198   saidx_t *i, *j, *k;
    199   saidx_t s;
    200   saint_t c0, c1, c2;
    201 
    202   if(0 < m) {
    203     /* Construct the sorted order of type B suffixes by using
    204        the sorted order of type B* suffixes. */
    205     for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
    206       /* Scan the suffix array from right to left. */
    207       for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
    208           j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
    209           i <= j;
    210           --j) {
    211         if(0 < (s = *j)) {
    212           assert(T[s] == c1);
    213           assert(((s + 1) < n) && (T[s] <= T[s + 1]));
    214           assert(T[s - 1] <= T[s]);
    215           *j = ~s;
    216           c0 = T[--s];
    217           if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
    218           if(c0 != c2) {
    219             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
    220             k = SA + BUCKET_B(c2 = c0, c1);
    221           }
    222           assert(k < j);
    223           *k-- = s;
    224         } else {
    225           assert(((s == 0) && (T[s] == c1)) || (s < 0));
    226           *j = ~s;
    227         }
    228       }
    229     }
    230   }
    231 
    232   /* Construct the suffix array by using
    233      the sorted order of type B suffixes. */
    234   k = SA + BUCKET_A(c2 = T[n - 1]);
    235   *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
    236   /* Scan the suffix array from left to right. */
    237   for(i = SA, j = SA + n; i < j; ++i) {
    238     if(0 < (s = *i)) {
    239       assert(T[s - 1] >= T[s]);
    240       c0 = T[--s];
    241       if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
    242       if(c0 != c2) {
    243         BUCKET_A(c2) = k - SA;
    244         k = SA + BUCKET_A(c2 = c0);
    245       }
    246       assert(i < k);
    247       *k++ = s;
    248     } else {
    249       assert(s < 0);
    250       *i = ~s;
    251     }
    252   }
    253 }
    254 
    255 /* Constructs the burrows-wheeler transformed string directly
    256    by using the sorted order of type B* suffixes. */
    257 static
    258 saidx_t
    259 construct_BWT(const sauchar_t *T, saidx_t *SA,
    260               saidx_t *bucket_A, saidx_t *bucket_B,
    261               saidx_t n, saidx_t m) {
    262   saidx_t *i, *j, *k, *orig;
    263   saidx_t s;
    264   saint_t c0, c1, c2;
    265 
    266   if(0 < m) {
    267     /* Construct the sorted order of type B suffixes by using
    268        the sorted order of type B* suffixes. */
    269     for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
    270       /* Scan the suffix array from right to left. */
    271       for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
    272           j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
    273           i <= j;
    274           --j) {
    275         if(0 < (s = *j)) {
    276           assert(T[s] == c1);
    277           assert(((s + 1) < n) && (T[s] <= T[s + 1]));
    278           assert(T[s - 1] <= T[s]);
    279           c0 = T[--s];
    280           *j = ~((saidx_t)c0);
    281           if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
    282           if(c0 != c2) {
    283             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
    284             k = SA + BUCKET_B(c2 = c0, c1);
    285           }
    286           assert(k < j);
    287           *k-- = s;
    288         } else if(s != 0) {
    289           *j = ~s;
    290 #ifndef NDEBUG
    291         } else {
    292           assert(T[s] == c1);
    293 #endif
    294         }
    295       }
    296     }
    297   }
    298 
    299   /* Construct the BWTed string by using
    300      the sorted order of type B suffixes. */
    301   k = SA + BUCKET_A(c2 = T[n - 1]);
    302   *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
    303   /* Scan the suffix array from left to right. */
    304   for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
    305     if(0 < (s = *i)) {
    306       assert(T[s - 1] >= T[s]);
    307       c0 = T[--s];
    308       *i = c0;
    309       if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
    310       if(c0 != c2) {
    311         BUCKET_A(c2) = k - SA;
    312         k = SA + BUCKET_A(c2 = c0);
    313       }
    314       assert(i < k);
    315       *k++ = s;
    316     } else if(s != 0) {
    317       *i = ~s;
    318     } else {
    319       orig = i;
    320     }
    321   }
    322 
    323   return orig - SA;
    324 }
    325 
    326 
/*---------------------------------------------------------------------------*/

/*- Function -*/
    330 
    331 saint_t
    332 divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
    333   saidx_t *bucket_A, *bucket_B;
    334   saidx_t m;
    335   saint_t err = 0;
    336 
    337   /* Check arguments. */
    338   if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
    339   else if(n == 0) { return 0; }
    340   else if(n == 1) { SA[0] = 0; return 0; }
    341   else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
    342 
    343   bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
    344   bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
    345 
    346   /* Suffixsort. */
    347   if((bucket_A != NULL) && (bucket_B != NULL)) {
    348     m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
    349     construct_SA(T, SA, bucket_A, bucket_B, n, m);
    350   } else {
    351     err = -2;
    352   }
    353 
    354   free(bucket_B);
    355   free(bucket_A);
    356 
    357   return err;
    358 }
    359 
    360 saidx_t
    361 divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
    362   saidx_t *B;
    363   saidx_t *bucket_A, *bucket_B;
    364   saidx_t m, pidx, i;
    365 
    366   /* Check arguments. */
    367   if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
    368   else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
    369 
    370   if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
    371   bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
    372   bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
    373 
    374   /* Burrows-Wheeler Transform. */
    375   if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
    376     m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
    377     pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
    378 
    379     /* Copy to output string. */
    380     U[0] = T[n - 1];
    381     for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
    382     for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
    383     pidx += 1;
    384   } else {
    385     pidx = -2;
    386   }
    387 
    388   free(bucket_B);
    389   free(bucket_A);
    390   if(A == NULL) { free(B); }
    391 
    392   return pidx;
    393 }
    394 
    395 const char *
    396 divsufsort_version(void) {
    397   return PROJECT_VERSION_FULL;
    398 }
    399