Home | History | Annotate | Download | only in bzip2
      1 
      2 /*-------------------------------------------------------------*/
      3 /*--- Block sorting machinery                               ---*/
      4 /*---                                           blocksort.c ---*/
      5 /*-------------------------------------------------------------*/
      6 
      7 /* ------------------------------------------------------------------
      8    This file is part of bzip2/libbzip2, a program and library for
      9    lossless, block-sorting data compression.
     10 
     11    bzip2/libbzip2 version 1.0.6 of 6 September 2010
     12    Copyright (C) 1996-2010 Julian Seward <jseward (at) bzip.org>
     13 
     14    Please read the WARNING, DISCLAIMER and PATENTS sections in the
     15    README file.
     16 
     17    This program is released under the terms of the license contained
     18    in the file LICENSE.
     19    ------------------------------------------------------------------ */
     20 
     21 
     22 #include "bzlib_private.h"
     23 
     24 /*---------------------------------------------*/
     25 /*--- Fallback O(N log(N)^2) sorting        ---*/
     26 /*--- algorithm, for repetitive blocks      ---*/
     27 /*---------------------------------------------*/
     28 
     29 /*---------------------------------------------*/
     30 static
     31 __inline__
     32 void fallbackSimpleSort ( UInt32* fmap,
     33                           UInt32* eclass,
     34                           Int32   lo,
     35                           Int32   hi )
     36 {
     37    Int32 i, j, tmp;
     38    UInt32 ec_tmp;
     39 
     40    if (lo == hi) return;
     41 
     42    if (hi - lo > 3) {
     43       for ( i = hi-4; i >= lo; i-- ) {
     44          tmp = fmap[i];
     45          ec_tmp = eclass[tmp];
     46          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
     47             fmap[j-4] = fmap[j];
     48          fmap[j-4] = tmp;
     49       }
     50    }
     51 
     52    for ( i = hi-1; i >= lo; i-- ) {
     53       tmp = fmap[i];
     54       ec_tmp = eclass[tmp];
     55       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
     56          fmap[j-1] = fmap[j];
     57       fmap[j-1] = tmp;
     58    }
     59 }
     60 
     61 
     62 /*---------------------------------------------*/
     63 #define fswap(zz1, zz2) \
     64    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
     65 
     66 #define fvswap(zzp1, zzp2, zzn)       \
     67 {                                     \
     68    Int32 yyp1 = (zzp1);               \
     69    Int32 yyp2 = (zzp2);               \
     70    Int32 yyn  = (zzn);                \
     71    while (yyn > 0) {                  \
     72       fswap(fmap[yyp1], fmap[yyp2]);  \
     73       yyp1++; yyp2++; yyn--;          \
     74    }                                  \
     75 }
     76 
     77 
     78 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
     79 
     80 #define fpush(lz,hz) { stackLo[sp] = lz; \
     81                        stackHi[sp] = hz; \
     82                        sp++; }
     83 
     84 #define fpop(lz,hz) { sp--;              \
     85                       lz = stackLo[sp];  \
     86                       hz = stackHi[sp]; }
     87 
     88 #define FALLBACK_QSORT_SMALL_THRESH 10
     89 #define FALLBACK_QSORT_STACK_SIZE   100
     90 
     91 
     92 static
     93 void fallbackQSort3 ( UInt32* fmap,
     94                       UInt32* eclass,
     95                       Int32   loSt,
     96                       Int32   hiSt )
     97 {
     98    Int32 unLo, unHi, ltLo, gtHi, n, m;
     99    Int32 sp, lo, hi;
    100    UInt32 med, r, r3;
    101    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
    102    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
    103 
    104    r = 0;
    105 
    106    sp = 0;
    107    fpush ( loSt, hiSt );
    108 
    109    while (sp > 0) {
    110 
    111       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
    112 
    113       fpop ( lo, hi );
    114       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
    115          fallbackSimpleSort ( fmap, eclass, lo, hi );
    116          continue;
    117       }
    118 
    119       /* Random partitioning.  Median of 3 sometimes fails to
    120          avoid bad cases.  Median of 9 seems to help but
    121          looks rather expensive.  This too seems to work but
    122          is cheaper.  Guidance for the magic constants
    123          7621 and 32768 is taken from Sedgewick's algorithms
    124          book, chapter 35.
    125       */
    126       r = ((r * 7621) + 1) % 32768;
    127       r3 = r % 3;
    128       if (r3 == 0) med = eclass[fmap[lo]]; else
    129       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
    130                    med = eclass[fmap[hi]];
    131 
    132       unLo = ltLo = lo;
    133       unHi = gtHi = hi;
    134 
    135       while (1) {
    136          while (1) {
    137             if (unLo > unHi) break;
    138             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
    139             if (n == 0) {
    140                fswap(fmap[unLo], fmap[ltLo]);
    141                ltLo++; unLo++;
    142                continue;
    143             };
    144             if (n > 0) break;
    145             unLo++;
    146          }
    147          while (1) {
    148             if (unLo > unHi) break;
    149             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
    150             if (n == 0) {
    151                fswap(fmap[unHi], fmap[gtHi]);
    152                gtHi--; unHi--;
    153                continue;
    154             };
    155             if (n < 0) break;
    156             unHi--;
    157          }
    158          if (unLo > unHi) break;
    159          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
    160       }
    161 
    162       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
    163 
    164       if (gtHi < ltLo) continue;
    165 
    166       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
    167       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
    168 
    169       n = lo + unLo - ltLo - 1;
    170       m = hi - (gtHi - unHi) + 1;
    171 
    172       if (n - lo > hi - m) {
    173          fpush ( lo, n );
    174          fpush ( m, hi );
    175       } else {
    176          fpush ( m, hi );
    177          fpush ( lo, n );
    178       }
    179    }
    180 }
    181 
    182 #undef fmin
    183 #undef fpush
    184 #undef fpop
    185 #undef fswap
    186 #undef fvswap
    187 #undef FALLBACK_QSORT_SMALL_THRESH
    188 #undef FALLBACK_QSORT_STACK_SIZE
    189 
    190 
    191 /*---------------------------------------------*/
    192 /* Pre:
    193       nblock > 0
    194       eclass exists for [0 .. nblock-1]
    195       ((UChar*)eclass) [0 .. nblock-1] holds block
    196       ptr exists for [0 .. nblock-1]
    197 
    198    Post:
    199       ((UChar*)eclass) [0 .. nblock-1] holds block
    200       All other areas of eclass destroyed
    201       fmap [0 .. nblock-1] holds sorted order
    202       bhtab [ 0 .. 2+(nblock/32) ] destroyed
    203 */
    204 
    205 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
    206 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
    207 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
    208 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
    209 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
    210 
    211 static
    212 void fallbackSort ( UInt32* fmap,
    213                     UInt32* eclass,
    214                     UInt32* bhtab,
    215                     Int32   nblock,
    216                     Int32   verb )
    217 {
    218    Int32 ftab[257];
    219    Int32 ftabCopy[256];
    220    Int32 H, i, j, k, l, r, cc, cc1;
    221    Int32 nNotDone;
    222    Int32 nBhtab;
    223    UChar* eclass8 = (UChar*)eclass;
    224 
    225    /*--
    226       Initial 1-char radix sort to generate
    227       initial fmap and initial BH bits.
    228    --*/
    229    if (verb >= 4)
    230       VPrintf0 ( "        bucket sorting ...\n" );
    231    for (i = 0; i < 257;    i++) ftab[i] = 0;
    232    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
    233    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
    234    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
    235 
    236    for (i = 0; i < nblock; i++) {
    237       j = eclass8[i];
    238       k = ftab[j] - 1;
    239       ftab[j] = k;
    240       fmap[k] = i;
    241    }
    242 
    243    nBhtab = 2 + (nblock / 32);
    244    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
    245    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
    246 
    247    /*--
    248       Inductively refine the buckets.  Kind-of an
    249       "exponential radix sort" (!), inspired by the
    250       Manber-Myers suffix array construction algorithm.
    251    --*/
    252 
    253    /*-- set sentinel bits for block-end detection --*/
    254    for (i = 0; i < 32; i++) {
    255       SET_BH(nblock + 2*i);
    256       CLEAR_BH(nblock + 2*i + 1);
    257    }
    258 
    259    /*-- the log(N) loop --*/
    260    H = 1;
    261    while (1) {
    262 
    263       if (verb >= 4)
    264          VPrintf1 ( "        depth %6d has ", H );
    265 
    266       j = 0;
    267       for (i = 0; i < nblock; i++) {
    268          if (ISSET_BH(i)) j = i;
    269          k = fmap[i] - H; if (k < 0) k += nblock;
    270          eclass[k] = j;
    271       }
    272 
    273       nNotDone = 0;
    274       r = -1;
    275       while (1) {
    276 
    277 	 /*-- find the next non-singleton bucket --*/
    278          k = r + 1;
    279          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    280          if (ISSET_BH(k)) {
    281             while (WORD_BH(k) == 0xffffffff) k += 32;
    282             while (ISSET_BH(k)) k++;
    283          }
    284          l = k - 1;
    285          if (l >= nblock) break;
    286          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    287          if (!ISSET_BH(k)) {
    288             while (WORD_BH(k) == 0x00000000) k += 32;
    289             while (!ISSET_BH(k)) k++;
    290          }
    291          r = k - 1;
    292          if (r >= nblock) break;
    293 
    294          /*-- now [l, r] bracket current bucket --*/
    295          if (r > l) {
    296             nNotDone += (r - l + 1);
    297             fallbackQSort3 ( fmap, eclass, l, r );
    298 
    299             /*-- scan bucket and generate header bits-- */
    300             cc = -1;
    301             for (i = l; i <= r; i++) {
    302                cc1 = eclass[fmap[i]];
    303                if (cc != cc1) { SET_BH(i); cc = cc1; };
    304             }
    305          }
    306       }
    307 
    308       if (verb >= 4)
    309          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
    310 
    311       H *= 2;
    312       if (H > nblock || nNotDone == 0) break;
    313    }
    314 
    315    /*--
    316       Reconstruct the original block in
    317       eclass8 [0 .. nblock-1], since the
    318       previous phase destroyed it.
    319    --*/
    320    if (verb >= 4)
    321       VPrintf0 ( "        reconstructing block ...\n" );
    322    j = 0;
    323    for (i = 0; i < nblock; i++) {
    324       while (ftabCopy[j] == 0) j++;
    325       ftabCopy[j]--;
    326       eclass8[fmap[i]] = (UChar)j;
    327    }
    328    AssertH ( j < 256, 1005 );
    329 }
    330 
    331 #undef       SET_BH
    332 #undef     CLEAR_BH
    333 #undef     ISSET_BH
    334 #undef      WORD_BH
    335 #undef UNALIGNED_BH
    336 
    337 
    338 /*---------------------------------------------*/
    339 /*--- The main, O(N^2 log(N)) sorting       ---*/
    340 /*--- algorithm.  Faster for "normal"       ---*/
    341 /*--- non-repetitive blocks.                ---*/
    342 /*---------------------------------------------*/
    343 
    344 /*---------------------------------------------*/
    345 static
    346 __inline__
    347 Bool mainGtU ( UInt32  i1,
    348                UInt32  i2,
    349                UChar*  block,
    350                UInt16* quadrant,
    351                UInt32  nblock,
    352                Int32*  budget )
    353 {
    354    Int32  k;
    355    UChar  c1, c2;
    356    UInt16 s1, s2;
    357 
    358    AssertD ( i1 != i2, "mainGtU" );
    359    /* 1 */
    360    c1 = block[i1]; c2 = block[i2];
    361    if (c1 != c2) return (c1 > c2);
    362    i1++; i2++;
    363    /* 2 */
    364    c1 = block[i1]; c2 = block[i2];
    365    if (c1 != c2) return (c1 > c2);
    366    i1++; i2++;
    367    /* 3 */
    368    c1 = block[i1]; c2 = block[i2];
    369    if (c1 != c2) return (c1 > c2);
    370    i1++; i2++;
    371    /* 4 */
    372    c1 = block[i1]; c2 = block[i2];
    373    if (c1 != c2) return (c1 > c2);
    374    i1++; i2++;
    375    /* 5 */
    376    c1 = block[i1]; c2 = block[i2];
    377    if (c1 != c2) return (c1 > c2);
    378    i1++; i2++;
    379    /* 6 */
    380    c1 = block[i1]; c2 = block[i2];
    381    if (c1 != c2) return (c1 > c2);
    382    i1++; i2++;
    383    /* 7 */
    384    c1 = block[i1]; c2 = block[i2];
    385    if (c1 != c2) return (c1 > c2);
    386    i1++; i2++;
    387    /* 8 */
    388    c1 = block[i1]; c2 = block[i2];
    389    if (c1 != c2) return (c1 > c2);
    390    i1++; i2++;
    391    /* 9 */
    392    c1 = block[i1]; c2 = block[i2];
    393    if (c1 != c2) return (c1 > c2);
    394    i1++; i2++;
    395    /* 10 */
    396    c1 = block[i1]; c2 = block[i2];
    397    if (c1 != c2) return (c1 > c2);
    398    i1++; i2++;
    399    /* 11 */
    400    c1 = block[i1]; c2 = block[i2];
    401    if (c1 != c2) return (c1 > c2);
    402    i1++; i2++;
    403    /* 12 */
    404    c1 = block[i1]; c2 = block[i2];
    405    if (c1 != c2) return (c1 > c2);
    406    i1++; i2++;
    407 
    408    k = nblock + 8;
    409 
    410    do {
    411       /* 1 */
    412       c1 = block[i1]; c2 = block[i2];
    413       if (c1 != c2) return (c1 > c2);
    414       s1 = quadrant[i1]; s2 = quadrant[i2];
    415       if (s1 != s2) return (s1 > s2);
    416       i1++; i2++;
    417       /* 2 */
    418       c1 = block[i1]; c2 = block[i2];
    419       if (c1 != c2) return (c1 > c2);
    420       s1 = quadrant[i1]; s2 = quadrant[i2];
    421       if (s1 != s2) return (s1 > s2);
    422       i1++; i2++;
    423       /* 3 */
    424       c1 = block[i1]; c2 = block[i2];
    425       if (c1 != c2) return (c1 > c2);
    426       s1 = quadrant[i1]; s2 = quadrant[i2];
    427       if (s1 != s2) return (s1 > s2);
    428       i1++; i2++;
    429       /* 4 */
    430       c1 = block[i1]; c2 = block[i2];
    431       if (c1 != c2) return (c1 > c2);
    432       s1 = quadrant[i1]; s2 = quadrant[i2];
    433       if (s1 != s2) return (s1 > s2);
    434       i1++; i2++;
    435       /* 5 */
    436       c1 = block[i1]; c2 = block[i2];
    437       if (c1 != c2) return (c1 > c2);
    438       s1 = quadrant[i1]; s2 = quadrant[i2];
    439       if (s1 != s2) return (s1 > s2);
    440       i1++; i2++;
    441       /* 6 */
    442       c1 = block[i1]; c2 = block[i2];
    443       if (c1 != c2) return (c1 > c2);
    444       s1 = quadrant[i1]; s2 = quadrant[i2];
    445       if (s1 != s2) return (s1 > s2);
    446       i1++; i2++;
    447       /* 7 */
    448       c1 = block[i1]; c2 = block[i2];
    449       if (c1 != c2) return (c1 > c2);
    450       s1 = quadrant[i1]; s2 = quadrant[i2];
    451       if (s1 != s2) return (s1 > s2);
    452       i1++; i2++;
    453       /* 8 */
    454       c1 = block[i1]; c2 = block[i2];
    455       if (c1 != c2) return (c1 > c2);
    456       s1 = quadrant[i1]; s2 = quadrant[i2];
    457       if (s1 != s2) return (s1 > s2);
    458       i1++; i2++;
    459 
    460       if (i1 >= nblock) i1 -= nblock;
    461       if (i2 >= nblock) i2 -= nblock;
    462 
    463       k -= 8;
    464       (*budget)--;
    465    }
    466       while (k >= 0);
    467 
    468    return False;
    469 }
    470 
    471 
    472 /*---------------------------------------------*/
    473 /*--
    474    Knuth's increments seem to work better
    475    than Incerpi-Sedgewick here.  Possibly
    476    because the number of elems to sort is
    477    usually small, typically <= 20.
    478 --*/
    479 static
    480 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
    481                    9841, 29524, 88573, 265720,
    482                    797161, 2391484 };
    483 
    484 static
    485 void mainSimpleSort ( UInt32* ptr,
    486                       UChar*  block,
    487                       UInt16* quadrant,
    488                       Int32   nblock,
    489                       Int32   lo,
    490                       Int32   hi,
    491                       Int32   d,
    492                       Int32*  budget )
    493 {
    494    Int32 i, j, h, bigN, hp;
    495    UInt32 v;
    496 
    497    bigN = hi - lo + 1;
    498    if (bigN < 2) return;
    499 
    500    hp = 0;
    501    while (incs[hp] < bigN) hp++;
    502    hp--;
    503 
    504    for (; hp >= 0; hp--) {
    505       h = incs[hp];
    506 
    507       i = lo + h;
    508       while (True) {
    509 
    510          /*-- copy 1 --*/
    511          if (i > hi) break;
    512          v = ptr[i];
    513          j = i;
    514          while ( mainGtU (
    515                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    516                  ) ) {
    517             ptr[j] = ptr[j-h];
    518             j = j - h;
    519             if (j <= (lo + h - 1)) break;
    520          }
    521          ptr[j] = v;
    522          i++;
    523 
    524          /*-- copy 2 --*/
    525          if (i > hi) break;
    526          v = ptr[i];
    527          j = i;
    528          while ( mainGtU (
    529                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    530                  ) ) {
    531             ptr[j] = ptr[j-h];
    532             j = j - h;
    533             if (j <= (lo + h - 1)) break;
    534          }
    535          ptr[j] = v;
    536          i++;
    537 
    538          /*-- copy 3 --*/
    539          if (i > hi) break;
    540          v = ptr[i];
    541          j = i;
    542          while ( mainGtU (
    543                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    544                  ) ) {
    545             ptr[j] = ptr[j-h];
    546             j = j - h;
    547             if (j <= (lo + h - 1)) break;
    548          }
    549          ptr[j] = v;
    550          i++;
    551 
    552          if (*budget < 0) return;
    553       }
    554    }
    555 }
    556 
    557 
    558 /*---------------------------------------------*/
    559 /*--
    560    The following is an implementation of
    561    an elegant 3-way quicksort for strings,
    562    described in a paper "Fast Algorithms for
    563    Sorting and Searching Strings", by Robert
    564    Sedgewick and Jon L. Bentley.
    565 --*/
    566 
    567 #define mswap(zz1, zz2) \
    568    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
    569 
    570 #define mvswap(zzp1, zzp2, zzn)       \
    571 {                                     \
    572    Int32 yyp1 = (zzp1);               \
    573    Int32 yyp2 = (zzp2);               \
    574    Int32 yyn  = (zzn);                \
    575    while (yyn > 0) {                  \
    576       mswap(ptr[yyp1], ptr[yyp2]);    \
    577       yyp1++; yyp2++; yyn--;          \
    578    }                                  \
    579 }
    580 
    581 static
    582 __inline__
    583 UChar mmed3 ( UChar a, UChar b, UChar c )
    584 {
    585    UChar t;
    586    if (a > b) { t = a; a = b; b = t; };
    587    if (b > c) {
    588       b = c;
    589       if (a > b) b = a;
    590    }
    591    return b;
    592 }
    593 
    594 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
    595 
    596 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
    597                           stackHi[sp] = hz; \
    598                           stackD [sp] = dz; \
    599                           sp++; }
    600 
    601 #define mpop(lz,hz,dz) { sp--;             \
    602                          lz = stackLo[sp]; \
    603                          hz = stackHi[sp]; \
    604                          dz = stackD [sp]; }
    605 
    606 
    607 #define mnextsize(az) (nextHi[az]-nextLo[az])
    608 
    609 #define mnextswap(az,bz)                                        \
    610    { Int32 tz;                                                  \
    611      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
    612      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
    613      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
    614 
    615 
    616 #define MAIN_QSORT_SMALL_THRESH 20
    617 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
    618 #define MAIN_QSORT_STACK_SIZE 100
    619 
    620 static
    621 void mainQSort3 ( UInt32* ptr,
    622                   UChar*  block,
    623                   UInt16* quadrant,
    624                   Int32   nblock,
    625                   Int32   loSt,
    626                   Int32   hiSt,
    627                   Int32   dSt,
    628                   Int32*  budget )
    629 {
    630    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
    631    Int32 sp, lo, hi, d;
    632 
    633    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
    634    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
    635    Int32 stackD [MAIN_QSORT_STACK_SIZE];
    636 
    637    Int32 nextLo[3];
    638    Int32 nextHi[3];
    639    Int32 nextD [3];
    640 
    641    sp = 0;
    642    mpush ( loSt, hiSt, dSt );
    643 
    644    while (sp > 0) {
    645 
    646       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
    647 
    648       mpop ( lo, hi, d );
    649       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
    650           d > MAIN_QSORT_DEPTH_THRESH) {
    651          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
    652          if (*budget < 0) return;
    653          continue;
    654       }
    655 
    656       med = (Int32)
    657             mmed3 ( block[ptr[ lo         ]+d],
    658                     block[ptr[ hi         ]+d],
    659                     block[ptr[ (lo+hi)>>1 ]+d] );
    660 
    661       unLo = ltLo = lo;
    662       unHi = gtHi = hi;
    663 
    664       while (True) {
    665          while (True) {
    666             if (unLo > unHi) break;
    667             n = ((Int32)block[ptr[unLo]+d]) - med;
    668             if (n == 0) {
    669                mswap(ptr[unLo], ptr[ltLo]);
    670                ltLo++; unLo++; continue;
    671             };
    672             if (n >  0) break;
    673             unLo++;
    674          }
    675          while (True) {
    676             if (unLo > unHi) break;
    677             n = ((Int32)block[ptr[unHi]+d]) - med;
    678             if (n == 0) {
    679                mswap(ptr[unHi], ptr[gtHi]);
    680                gtHi--; unHi--; continue;
    681             };
    682             if (n <  0) break;
    683             unHi--;
    684          }
    685          if (unLo > unHi) break;
    686          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
    687       }
    688 
    689       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
    690 
    691       if (gtHi < ltLo) {
    692          mpush(lo, hi, d+1 );
    693          continue;
    694       }
    695 
    696       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
    697       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
    698 
    699       n = lo + unLo - ltLo - 1;
    700       m = hi - (gtHi - unHi) + 1;
    701 
    702       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
    703       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
    704       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
    705 
    706       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    707       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
    708       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    709 
    710       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
    711       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
    712 
    713       mpush (nextLo[0], nextHi[0], nextD[0]);
    714       mpush (nextLo[1], nextHi[1], nextD[1]);
    715       mpush (nextLo[2], nextHi[2], nextD[2]);
    716    }
    717 }
    718 
    719 #undef mswap
    720 #undef mvswap
    721 #undef mpush
    722 #undef mpop
    723 #undef mmin
    724 #undef mnextsize
    725 #undef mnextswap
    726 #undef MAIN_QSORT_SMALL_THRESH
    727 #undef MAIN_QSORT_DEPTH_THRESH
    728 #undef MAIN_QSORT_STACK_SIZE
    729 
    730 
    731 /*---------------------------------------------*/
    732 /* Pre:
    733       nblock > N_OVERSHOOT
    734       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
    735       ((UChar*)block32) [0 .. nblock-1] holds block
    736       ptr exists for [0 .. nblock-1]
    737 
    738    Post:
    739       ((UChar*)block32) [0 .. nblock-1] holds block
    740       All other areas of block32 destroyed
    741       ftab [0 .. 65536 ] destroyed
    742       ptr [0 .. nblock-1] holds sorted order
    743       if (*budget < 0), sorting was abandoned
    744 */
    745 
    746 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
    747 #define SETMASK (1 << 21)
    748 #define CLEARMASK (~(SETMASK))
    749 
    750 static
    751 void mainSort ( UInt32* ptr,
    752                 UChar*  block,
    753                 UInt16* quadrant,
    754                 UInt32* ftab,
    755                 Int32   nblock,
    756                 Int32   verb,
    757                 Int32*  budget )
    758 {
    759    Int32  i, j, k, ss, sb;
    760    Int32  runningOrder[256];
    761    Bool   bigDone[256];
    762    Int32  copyStart[256];
    763    Int32  copyEnd  [256];
    764    UChar  c1;
    765    Int32  numQSorted;
    766    UInt16 s;
    767    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
    768 
    769    /*-- set up the 2-byte frequency table --*/
    770    for (i = 65536; i >= 0; i--) ftab[i] = 0;
    771 
    772    j = block[0] << 8;
    773    i = nblock-1;
    774    for (; i >= 3; i -= 4) {
    775       quadrant[i] = 0;
    776       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    777       ftab[j]++;
    778       quadrant[i-1] = 0;
    779       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
    780       ftab[j]++;
    781       quadrant[i-2] = 0;
    782       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
    783       ftab[j]++;
    784       quadrant[i-3] = 0;
    785       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
    786       ftab[j]++;
    787    }
    788    for (; i >= 0; i--) {
    789       quadrant[i] = 0;
    790       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    791       ftab[j]++;
    792    }
    793 
    794    /*-- (emphasises close relationship of block & quadrant) --*/
    795    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
    796       block   [nblock+i] = block[i];
    797       quadrant[nblock+i] = 0;
    798    }
    799 
    800    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
    801 
    802    /*-- Complete the initial radix sort --*/
    803    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
    804 
    805    s = block[0] << 8;
    806    i = nblock-1;
    807    for (; i >= 3; i -= 4) {
    808       s = (s >> 8) | (block[i] << 8);
    809       j = ftab[s] -1;
    810       ftab[s] = j;
    811       ptr[j] = i;
    812       s = (s >> 8) | (block[i-1] << 8);
    813       j = ftab[s] -1;
    814       ftab[s] = j;
    815       ptr[j] = i-1;
    816       s = (s >> 8) | (block[i-2] << 8);
    817       j = ftab[s] -1;
    818       ftab[s] = j;
    819       ptr[j] = i-2;
    820       s = (s >> 8) | (block[i-3] << 8);
    821       j = ftab[s] -1;
    822       ftab[s] = j;
    823       ptr[j] = i-3;
    824    }
    825    for (; i >= 0; i--) {
    826       s = (s >> 8) | (block[i] << 8);
    827       j = ftab[s] -1;
    828       ftab[s] = j;
    829       ptr[j] = i;
    830    }
    831 
    832    /*--
    833       Now ftab contains the first loc of every small bucket.
    834       Calculate the running order, from smallest to largest
    835       big bucket.
    836    --*/
    837    for (i = 0; i <= 255; i++) {
    838       bigDone     [i] = False;
    839       runningOrder[i] = i;
    840    }
    841 
    842    {
    843       Int32 vv;
    844       Int32 h = 1;
    845       do h = 3 * h + 1; while (h <= 256);
    846       do {
    847          h = h / 3;
    848          for (i = h; i <= 255; i++) {
    849             vv = runningOrder[i];
    850             j = i;
    851             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
    852                runningOrder[j] = runningOrder[j-h];
    853                j = j - h;
    854                if (j <= (h - 1)) goto zero;
    855             }
    856             zero:
    857             runningOrder[j] = vv;
    858          }
    859       } while (h != 1);
    860    }
    861 
    862    /*--
    863       The main sorting loop.
    864    --*/
    865 
    866    numQSorted = 0;
    867 
    868    for (i = 0; i <= 255; i++) {
    869 
    870       /*--
    871          Process big buckets, starting with the least full.
    872          Basically this is a 3-step process in which we call
    873          mainQSort3 to sort the small buckets [ss, j], but
    874          also make a big effort to avoid the calls if we can.
    875       --*/
    876       ss = runningOrder[i];
    877 
    878       /*--
    879          Step 1:
    880          Complete the big bucket [ss] by quicksorting
    881          any unsorted small buckets [ss, j], for j != ss.
    882          Hopefully previous pointer-scanning phases have already
    883          completed many of the small buckets [ss, j], so
    884          we don't have to sort them at all.
    885       --*/
    886       for (j = 0; j <= 255; j++) {
    887          if (j != ss) {
    888             sb = (ss << 8) + j;
    889             if ( ! (ftab[sb] & SETMASK) ) {
    890                Int32 lo = ftab[sb]   & CLEARMASK;
    891                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
    892                if (hi > lo) {
    893                   if (verb >= 4)
    894                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
    895                                 "done %d   this %d\n",
    896                                 ss, j, numQSorted, hi - lo + 1 );
    897                   mainQSort3 (
    898                      ptr, block, quadrant, nblock,
    899                      lo, hi, BZ_N_RADIX, budget
    900                   );
    901                   numQSorted += (hi - lo + 1);
    902                   if (*budget < 0) return;
    903                }
    904             }
    905             ftab[sb] |= SETMASK;
    906          }
    907       }
    908 
    909       AssertH ( !bigDone[ss], 1006 );
    910 
    911       /*--
    912          Step 2:
    913          Now scan this big bucket [ss] so as to synthesise the
    914          sorted order for small buckets [t, ss] for all t,
    915          including, magically, the bucket [ss,ss] too.
    916          This will avoid doing Real Work in subsequent Step 1's.
    917       --*/
    918       {
    919          for (j = 0; j <= 255; j++) {
    920             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
    921             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
    922          }
    923          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
    924             k = ptr[j]-1; if (k < 0) k += nblock;
    925             c1 = block[k];
    926             if (!bigDone[c1])
    927                ptr[ copyStart[c1]++ ] = k;
    928          }
    929          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
    930             k = ptr[j]-1; if (k < 0) k += nblock;
    931             c1 = block[k];
    932             if (!bigDone[c1])
    933                ptr[ copyEnd[c1]-- ] = k;
    934          }
    935       }
    936 
    937       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
    938                 ||
    939                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
    940                    Necessity for this case is demonstrated by compressing
    941                    a sequence of approximately 48.5 million of character
    942                    251; 1.0.0/1.0.1 will then die here. */
    943                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
    944                 1007 )
    945 
    946       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
    947 
    948       /*--
    949          Step 3:
    950          The [ss] big bucket is now done.  Record this fact,
    951          and update the quadrant descriptors.  Remember to
    952          update quadrants in the overshoot area too, if
    953          necessary.  The "if (i < 255)" test merely skips
    954          this updating for the last bucket processed, since
    955          updating for the last bucket is pointless.
    956 
    957          The quadrant array provides a way to incrementally
    958          cache sort orderings, as they appear, so as to
    959          make subsequent comparisons in fullGtU() complete
    960          faster.  For repetitive blocks this makes a big
    961          difference (but not big enough to be able to avoid
    962          the fallback sorting mechanism, exponential radix sort).
    963 
    964          The precise meaning is: at all times:
    965 
    966             for 0 <= i < nblock and 0 <= j <= nblock
    967 
    968             if block[i] != block[j],
    969 
    970                then the relative values of quadrant[i] and
    971                     quadrant[j] are meaningless.
    972 
    973                else {
    974                   if quadrant[i] < quadrant[j]
    975                      then the string starting at i lexicographically
    976                      precedes the string starting at j
    977 
    978                   else if quadrant[i] > quadrant[j]
    979                      then the string starting at j lexicographically
    980                      precedes the string starting at i
    981 
    982                   else
    983                      the relative ordering of the strings starting
    984                      at i and j has not yet been determined.
    985                }
    986       --*/
    987       bigDone[ss] = True;
    988 
    989       if (i < 255) {
    990          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
    991          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
    992          Int32 shifts   = 0;
    993 
    994          while ((bbSize >> shifts) > 65534) shifts++;
    995 
    996          for (j = bbSize-1; j >= 0; j--) {
    997             Int32 a2update     = ptr[bbStart + j];
    998             UInt16 qVal        = (UInt16)(j >> shifts);
    999             quadrant[a2update] = qVal;
   1000             if (a2update < BZ_N_OVERSHOOT)
   1001                quadrant[a2update + nblock] = qVal;
   1002          }
   1003          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
   1004       }
   1005 
   1006    }
   1007 
   1008    if (verb >= 4)
   1009       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
   1010                  nblock, numQSorted, nblock - numQSorted );
   1011 }
   1012 
   1013 #undef BIGFREQ
   1014 #undef SETMASK
   1015 #undef CLEARMASK
   1016 
   1017 
   1018 /*---------------------------------------------*/
   1019 /* Pre:
   1020       nblock > 0
   1021       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
   1022       ((UChar*)arr2)  [0 .. nblock-1] holds block
   1023       arr1 exists for [0 .. nblock-1]
   1024 
   1025    Post:
   1026       ((UChar*)arr2) [0 .. nblock-1] holds block
   1027       All other areas of block destroyed
   1028       ftab [ 0 .. 65536 ] destroyed
   1029       arr1 [0 .. nblock-1] holds sorted order
   1030 */
   1031 void BZ2_blockSort ( EState* s )
   1032 {
   1033    UInt32* ptr    = s->ptr;
   1034    UChar*  block  = s->block;
   1035    UInt32* ftab   = s->ftab;
   1036    Int32   nblock = s->nblock;
   1037    Int32   verb   = s->verbosity;
   1038    Int32   wfact  = s->workFactor;
   1039    UInt16* quadrant;
   1040    Int32   budget;
   1041    Int32   budgetInit;
   1042    Int32   i;
   1043 
   1044    if (nblock < 10000) {
   1045       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1046    } else {
   1047       /* Calculate the location for quadrant, remembering to get
   1048          the alignment right.  Assumes that &(block[0]) is at least
   1049          2-byte aligned -- this should be ok since block is really
   1050          the first section of arr2.
   1051       */
   1052       i = nblock+BZ_N_OVERSHOOT;
   1053       if (i & 1) i++;
   1054       quadrant = (UInt16*)(&(block[i]));
   1055 
   1056       /* (wfact-1) / 3 puts the default-factor-30
   1057          transition point at very roughly the same place as
   1058          with v0.1 and v0.9.0.
   1059          Not that it particularly matters any more, since the
   1060          resulting compressed stream is now the same regardless
   1061          of whether or not we use the main sort or fallback sort.
   1062       */
   1063       if (wfact < 1  ) wfact = 1;
   1064       if (wfact > 100) wfact = 100;
   1065       budgetInit = nblock * ((wfact-1) / 3);
   1066       budget = budgetInit;
   1067 
   1068       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
   1069       if (verb >= 3)
   1070          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
   1071                     budgetInit - budget,
   1072                     nblock,
   1073                     (float)(budgetInit - budget) /
   1074                     (float)(nblock==0 ? 1 : nblock) );
   1075       if (budget < 0) {
   1076          if (verb >= 2)
   1077             VPrintf0 ( "    too repetitive; using fallback"
   1078                        " sorting algorithm\n" );
   1079          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1080       }
   1081    }
   1082 
   1083    s->origPtr = -1;
   1084    for (i = 0; i < s->nblock; i++)
   1085       if (ptr[i] == 0)
   1086          { s->origPtr = i; break; };
   1087 
   1088    AssertH( s->origPtr != -1, 1003 );
   1089 }
   1090 
   1091 
   1092 /*-------------------------------------------------------------*/
   1093 /*--- end                                       blocksort.c ---*/
   1094 /*-------------------------------------------------------------*/
   1095