Home | History | Annotate | Download | only in lib
      1 /*
      2  * utils.c for libdivsufsort
      3  * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
      4  *
      5  * Permission is hereby granted, free of charge, to any person
      6  * obtaining a copy of this software and associated documentation
      7  * files (the "Software"), to deal in the Software without
      8  * restriction, including without limitation the rights to use,
      9  * copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following
     12  * conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be
     15  * included in all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
     19  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
     21  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     22  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     24  * OTHER DEALINGS IN THE SOFTWARE.
     25  */
     26 
     27 #include "divsufsort_private.h"
     28 
     29 
     30 /*- Private Function -*/
     31 
     32 /* Binary search for inverse bwt. */
     33 static
     34 saidx_t
     35 binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
     36   saidx_t half, i;
     37   for(i = 0, half = size >> 1;
     38       0 < size;
     39       size = half, half >>= 1) {
     40     if(A[i + half] < value) {
     41       i += half + 1;
     42       half -= (size & 1) ^ 1;
     43     }
     44   }
     45   return i;
     46 }
     47 
     48 
     49 /*- Functions -*/
     50 
     51 /* Burrows-Wheeler transform. */
     52 saint_t
     53 bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
     54              saidx_t n, saidx_t *idx) {
     55   saidx_t *A, i, j, p, t;
     56   saint_t c;
     57 
     58   /* Check arguments. */
     59   if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
     60   if(n <= 1) {
     61     if(n == 1) { U[0] = T[0]; }
     62     *idx = n;
     63     return 0;
     64   }
     65 
     66   if((A = SA) == NULL) {
     67     i = divbwt(T, U, NULL, n);
     68     if(0 <= i) { *idx = i; i = 0; }
     69     return (saint_t)i;
     70   }
     71 
     72   /* BW transform. */
     73   if(T == U) {
     74     t = n;
     75     for(i = 0, j = 0; i < n; ++i) {
     76       p = t - 1;
     77       t = A[i];
     78       if(0 <= p) {
     79         c = T[j];
     80         U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
     81         A[j] = c;
     82         j++;
     83       } else {
     84         *idx = i;
     85       }
     86     }
     87     p = t - 1;
     88     if(0 <= p) {
     89       c = T[j];
     90       U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
     91       A[j] = c;
     92     } else {
     93       *idx = i;
     94     }
     95   } else {
     96     U[0] = T[n - 1];
     97     for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
     98     *idx = i + 1;
     99     for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
    100   }
    101 
    102   if(SA == NULL) {
    103     /* Deallocate memory. */
    104     free(A);
    105   }
    106 
    107   return 0;
    108 }
    109 
    110 /* Inverse Burrows-Wheeler transform. */
    111 saint_t
    112 inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
    113                      saidx_t n, saidx_t idx) {
    114   saidx_t C[ALPHABET_SIZE];
    115   sauchar_t D[ALPHABET_SIZE];
    116   saidx_t *B;
    117   saidx_t i, p;
    118   saint_t c, d;
    119 
    120   /* Check arguments. */
    121   if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
    122      (n < idx) || ((0 < n) && (idx == 0))) {
    123     return -1;
    124   }
    125   if(n <= 1) { return 0; }
    126 
    127   if((B = A) == NULL) {
    128     /* Allocate n*sizeof(saidx_t) bytes of memory. */
    129     if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
    130   }
    131 
    132   /* Inverse BW transform. */
    133   for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
    134   for(i = 0; i < n; ++i) { ++C[T[i]]; }
    135   for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
    136     p = C[c];
    137     if(0 < p) {
    138       C[c] = i;
    139       D[d++] = (sauchar_t)c;
    140       i += p;
    141     }
    142   }
    143   for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
    144   for( ; i < n; ++i)       { B[C[T[i]]++] = i + 1; }
    145   for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
    146   for(i = 0, p = idx; i < n; ++i) {
    147     U[i] = D[binarysearch_lower(C, d, p)];
    148     p = B[p - 1];
    149   }
    150 
    151   if(A == NULL) {
    152     /* Deallocate memory. */
    153     free(B);
    154   }
    155 
    156   return 0;
    157 }
    158 
    159 /* Checks the suffix array SA of the string T. */
    160 saint_t
    161 sufcheck(const sauchar_t *T, const saidx_t *SA,
    162          saidx_t n, saint_t verbose) {
    163   saidx_t C[ALPHABET_SIZE];
    164   saidx_t i, p, q, t;
    165   saint_t c;
    166 
    167   if(verbose) { fprintf(stderr, "sufcheck: "); }
    168 
    169   /* Check arguments. */
    170   if((T == NULL) || (SA == NULL) || (n < 0)) {
    171     if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
    172     return -1;
    173   }
    174   if(n == 0) {
    175     if(verbose) { fprintf(stderr, "Done.\n"); }
    176     return 0;
    177   }
    178 
    179   /* check range: [0..n-1] */
    180   for(i = 0; i < n; ++i) {
    181     if((SA[i] < 0) || (n <= SA[i])) {
    182       if(verbose) {
    183         fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
    184                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
    185                         n - 1, i, SA[i]);
    186       }
    187       return -2;
    188     }
    189   }
    190 
    191   /* check first characters. */
    192   for(i = 1; i < n; ++i) {
    193     if(T[SA[i - 1]] > T[SA[i]]) {
    194       if(verbose) {
    195         fprintf(stderr, "Suffixes in wrong order.\n"
    196                         "  T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
    197                         " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
    198                         i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
    199       }
    200       return -3;
    201     }
    202   }
    203 
    204   /* check suffixes. */
    205   for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
    206   for(i = 0; i < n; ++i) { ++C[T[i]]; }
    207   for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
    208     t = C[i];
    209     C[i] = p;
    210     p += t;
    211   }
    212 
    213   q = C[T[n - 1]];
    214   C[T[n - 1]] += 1;
    215   for(i = 0; i < n; ++i) {
    216     p = SA[i];
    217     if(0 < p) {
    218       c = T[--p];
    219       t = C[c];
    220     } else {
    221       c = T[p = n - 1];
    222       t = q;
    223     }
    224     if((t < 0) || (p != SA[t])) {
    225       if(verbose) {
    226         fprintf(stderr, "Suffix in wrong position.\n"
    227                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
    228                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
    229                         t, (0 <= t) ? SA[t] : -1, i, SA[i]);
    230       }
    231       return -4;
    232     }
    233     if(t != q) {
    234       ++C[c];
    235       if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
    236     }
    237   }
    238 
    239   if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
    240   return 0;
    241 }
    242 
    243 
    244 static
    245 int
    246 _compare(const sauchar_t *T, saidx_t Tsize,
    247          const sauchar_t *P, saidx_t Psize,
    248          saidx_t suf, saidx_t *match) {
    249   saidx_t i, j;
    250   saint_t r;
    251   for(i = suf + *match, j = *match, r = 0;
    252       (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
    253   *match = j;
    254   return (r == 0) ? -(j != Psize) : r;
    255 }
    256 
    257 /* Search for the pattern P in the string T. */
    258 saidx_t
    259 sa_search(const sauchar_t *T, saidx_t Tsize,
    260           const sauchar_t *P, saidx_t Psize,
    261           const saidx_t *SA, saidx_t SAsize,
    262           saidx_t *idx) {
    263   saidx_t size, lsize, rsize, half;
    264   saidx_t match, lmatch, rmatch;
    265   saidx_t llmatch, lrmatch, rlmatch, rrmatch;
    266   saidx_t i, j, k;
    267   saint_t r;
    268 
    269   if(idx != NULL) { *idx = -1; }
    270   if((T == NULL) || (P == NULL) || (SA == NULL) ||
    271      (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
    272   if((Tsize == 0) || (SAsize == 0)) { return 0; }
    273   if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
    274 
    275   for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
    276       0 < size;
    277       size = half, half >>= 1) {
    278     match = MIN(lmatch, rmatch);
    279     r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
    280     if(r < 0) {
    281       i += half + 1;
    282       half -= (size & 1) ^ 1;
    283       lmatch = match;
    284     } else if(r > 0) {
    285       rmatch = match;
    286     } else {
    287       lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
    288 
    289       /* left part */
    290       for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
    291           0 < lsize;
    292           lsize = half, half >>= 1) {
    293         lmatch = MIN(llmatch, lrmatch);
    294         r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
    295         if(r < 0) {
    296           j += half + 1;
    297           half -= (lsize & 1) ^ 1;
    298           llmatch = lmatch;
    299         } else {
    300           lrmatch = lmatch;
    301         }
    302       }
    303 
    304       /* right part */
    305       for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
    306           0 < rsize;
    307           rsize = half, half >>= 1) {
    308         rmatch = MIN(rlmatch, rrmatch);
    309         r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
    310         if(r <= 0) {
    311           k += half + 1;
    312           half -= (rsize & 1) ^ 1;
    313           rlmatch = rmatch;
    314         } else {
    315           rrmatch = rmatch;
    316         }
    317       }
    318 
    319       break;
    320     }
    321   }
    322 
    323   if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
    324   return k - j;
    325 }
    326 
    327 /* Search for the character c in the string T. */
    328 saidx_t
    329 sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
    330                 const saidx_t *SA, saidx_t SAsize,
    331                 saint_t c, saidx_t *idx) {
    332   saidx_t size, lsize, rsize, half;
    333   saidx_t i, j, k, p;
    334   saint_t r;
    335 
    336   if(idx != NULL) { *idx = -1; }
    337   if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
    338   if((Tsize == 0) || (SAsize == 0)) { return 0; }
    339 
    340   for(i = j = k = 0, size = SAsize, half = size >> 1;
    341       0 < size;
    342       size = half, half >>= 1) {
    343     p = SA[i + half];
    344     r = (p < Tsize) ? T[p] - c : -1;
    345     if(r < 0) {
    346       i += half + 1;
    347       half -= (size & 1) ^ 1;
    348     } else if(r == 0) {
    349       lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
    350 
    351       /* left part */
    352       for(half = lsize >> 1;
    353           0 < lsize;
    354           lsize = half, half >>= 1) {
    355         p = SA[j + half];
    356         r = (p < Tsize) ? T[p] - c : -1;
    357         if(r < 0) {
    358           j += half + 1;
    359           half -= (lsize & 1) ^ 1;
    360         }
    361       }
    362 
    363       /* right part */
    364       for(half = rsize >> 1;
    365           0 < rsize;
    366           rsize = half, half >>= 1) {
    367         p = SA[k + half];
    368         r = (p < Tsize) ? T[p] - c : -1;
    369         if(r <= 0) {
    370           k += half + 1;
    371           half -= (rsize & 1) ^ 1;
    372         }
    373       }
    374 
    375       break;
    376     }
    377   }
    378 
    379   if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
    380   return k - j;
    381 }
    382