Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Remember a subset of a sequence of values, using a modest amount of memory
      6 
      7 /***
      8   Design:
      9   Accumulate in powers of three, using 3-way median to collapse entries.
     10   At any given time, there is one most-dense (highest power of 3) range of
     11   entries and a series of less-dense ranges that hold 0..2 entries each. There
     12   is a bounded-size storage array of S cells for all the entries.
     13 
     14   The overflow detect is set up so that a new higher power of 3, K+1, is
     15   triggered precisely when range K has 3n entries and all ranges < K have
     16   zero entries.
     17 
     18   In general, think of the range sizes as a multi-digit base 3 number, except
     19   the highest digit may exceed 2:
     20 
     21   3**6  3**5  3**4  3**3  3**2  3**1  3**0  K=2
     22     0     0     0     0   3n-1    2     2   unused:1
     23 
     24   There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
     25   one more than that, and we add a new 3**0 entry and "carry" by performing
     26   medians on any group of 3 elements:
     27 
     28   3**6  3**5  3**4  3**3  3**2  3**1  3**0       K=2
     29     0     0     0     0   3n-1    2     3        unused:0
     30     0     0     0     0   3n-1    3     0 carry  unused:2
     31     0     0     0     0    3n     0     0 carry  unused:4
     32 
     33   To accumulate 2 entries at all levels < K and 3 just before the first carry at
     34   level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
     35   in this case. Since we only have 4 cells in the example above, we need to
     36   make room by starting a new power of three:
     37 
     38   3**6  3**5  3**4  3**3  3**2  3**1  3**0
     39     0     0     0     0    3n     0     0        K=2 unused:4
     40     0     0     0     n     0     0     0        K=3 unused:2n+4
     41 
     42   In the code below, we don't worry about overflow from the topmost place.
     43 
     44 
     45 ***/
     46 
     47 #include "encodings/compact_lang_det/subsetsequence.h"
     48 #include <stdio.h>
     49 
     50 #include "encodings/compact_lang_det/win/cld_logging.h"
     51 
     52 void DumpInts(const char* label, const int* v, int n) {
     53   printf("%s ", label);
     54   for (int i = 0; i < n; ++i) {
     55     printf("%d ", v[i]);
     56   }
     57   printf("\n");
     58 }
     59 
     60 void DumpUint8s(const char* label, const uint8* v, int n) {
     61   printf("%s ", label);
     62   for (int i = 0; i < n; ++i) {
     63     printf("%d ", v[i]);
     64   }
     65   printf("\n");
     66 }
     67 
     68 // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
     69 uint8 SubsetSequence::Median3(int sub) {
     70   if (seq_[sub] == seq_[sub + 1]) {
     71     return seq_[sub];
     72   }
     73   if (seq_[sub] == seq_[sub + 2]) {
     74     return seq_[sub];
     75   }
     76   return seq_[sub + 1];
     77 }
     78 
     79 void SubsetSequence::Init() {
     80   // printf("Init\n");
     81 
     82   k_ = 0;
     83   count_[0] = 0;
     84   next_e_ = 0;
     85   seq_[0] = 0;    // Default value if no calls to Add
     86 
     87   // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
     88   int reserve = (2 * k_ + 1);
     89   level_limit_e_ = kMaxSeq_ - reserve;
     90   level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
     91   limit_e_ = level_limit_e_;
     92 }
     93 
     94 // Compress level k by 3x, creating level k+1
     95 void SubsetSequence::NewLevel() {
     96   // printf("NewLevel 3 ** %d\n", k_ + 1);
     97   //DumpUint8s("count[k]", count_, k_ + 1);
     98   //DumpUint8s("seq[next]", seq_, next_e_);
     99 
    100   // Incoming level must be an exact multiple of three in size
    101   CHECK((count_[k_] % 3) == 0);
    102   int k_size = count_[k_];
    103   int new_size = k_size / 3;
    104 
    105   // Compress down by 3x, via median
    106   for (int j = 0; j < new_size; ++j) {
    107     seq_[j] = Median3(j * 3);
    108   }
    109 
    110   // Update counts
    111   count_[k_] = 0;
    112   // Else Overflow -- just continue with 3x dense Level K
    113   if (k_ < (kMaxLevel_ - 1)) {++k_;}
    114   count_[k_] = new_size;
    115 
    116   // Update limits
    117   next_e_ = new_size;
    118   limit_e_ = next_e_ + 3;
    119 
    120   // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
    121   int reserve = (2 * k_ + 1);
    122   level_limit_e_ = kMaxSeq_ - reserve;
    123   level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3
    124                                               //
    125   //DumpUint8s("after: count[k]", count_, k_ + 1);
    126   //DumpUint8s("after: seq[next]", seq_, next_e_);
    127 }
    128 
    129 void SubsetSequence::DoCarries() {
    130   CHECK(count_[k_] > 3);    // We depend on count_[k_] being > 3 to stop while
    131   // Make room by carrying
    132 
    133   //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
    134   //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
    135 
    136   int i = 0;
    137   while (count_[i] == 3) {
    138     next_e_ -= 3;
    139     seq_[next_e_] = Median3(next_e_);
    140     ++next_e_;
    141     count_[i] = 0;
    142     ++count_[i + 1];
    143     ++i;
    144   }
    145   limit_e_ = next_e_ + 3;
    146 
    147   //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
    148   //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
    149 
    150   // If we just fully carried into level K,
    151   // Make sure there is now enough room, else start level K + 1
    152   if (i >= k_) {
    153     CHECK(count_[k_] == next_e_);
    154     if (next_e_ >= level_limit_e_) {
    155       NewLevel();
    156     }
    157   }
    158 }
    159 
    160 void SubsetSequence::Add(uint8 e) {
    161   // Add an entry then carry as needed
    162   seq_[next_e_] = e;
    163   ++next_e_;
    164   ++count_[0];
    165 
    166   if (next_e_ >= limit_e_) {
    167     DoCarries();
    168   }
    169 }
    170 
    171 
    172 // Collapse tail end by simple median across disparate-weight values,
    173 // dropping or duplicating last value if need be.
    174 // This routine is idempotent.
    175 void SubsetSequence::Flush() {
    176   // printf("Flush %d\n", count_[k_]);
    177   int start_tail = count_[k_];
    178   int size_tail = next_e_ - start_tail;
    179   if ((size_tail % 3) == 2) {
    180     seq_[next_e_] = seq_[next_e_ - 1];      // Duplicate last value
    181     ++size_tail;
    182   }
    183 
    184   // Compress tail down by 3x, via median
    185   int new_size = size_tail / 3;             // May delete last value
    186   for (int j = 0; j < new_size; ++j) {
    187     seq_[start_tail + j] = Median3(start_tail + j * 3);
    188   }
    189 
    190   next_e_ = start_tail + new_size;
    191   count_[k_] = next_e_;
    192 }
    193 
    194 
    195 // Extract representative pattern of exactly N values into dst[0..n-1]
    196 // This routine may be called multiple times, but it may downsample as a
    197 // side effect, causing subsequent calls with larger N to get poor answers.
    198 void SubsetSequence::Extract(int to_n, uint8* dst) {
    199   // Collapse partial-carries in tail
    200   Flush();
    201 
    202   // Just use Bresenham to resample
    203   int from_n = next_e_;
    204   if (to_n >= from_n) {
    205     // Up-sample from_n => to_n
    206     int err = to_n - 1;          // bias toward no overshoot
    207     int j = 0;
    208     for (int i = 0; i < to_n; ++i) {
    209       dst[i] = seq_[j];
    210       err -= from_n;
    211       if (err < 0) {
    212         ++j;
    213         err += to_n;
    214       }
    215     }
    216   } else {
    217     // Get to the point that the number of samples is <= 3 * to_n
    218     while (next_e_ > (to_n * 3)) {
    219       // Compress down by 3x, via median
    220       // printf("Extract, median %d / 3\n", next_e_);
    221       if ((next_e_ % 3) == 2) {
    222         seq_[next_e_] = seq_[next_e_ - 1];    // Duplicate last value
    223         ++next_e_;
    224       }
    225       int new_size = next_e_ / 3;             // May delete last value
    226       for (int j = 0; j < new_size; ++j) {
    227         seq_[j] = Median3(j * 3);
    228       }
    229       next_e_ = new_size;
    230       count_[k_] = next_e_;
    231     }
    232     from_n = next_e_;
    233 
    234     if (to_n == from_n) {
    235       // Copy verbatim
    236       for (int i = 0; i < to_n; ++i) {
    237         dst[i] = seq_[i];
    238       }
    239       return;
    240     }
    241 
    242     // Down-sample from_n => to_n, using medians
    243     int err = 0;          // Bias to immediate median sample
    244     int j = 0;
    245     for (int i = 0; i < from_n; ++i) {
    246       err -= to_n;
    247       if (err < 0) {
    248         if (i <= (next_e_ - 2)) {
    249           dst[j] = Median3(i);
    250         } else {
    251           dst[j] = seq_[i];
    252         }
    253         ++j;
    254         err += from_n;
    255       }
    256     }
    257   }
    258 
    259 }
    260