1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Remember a subset of a sequence of values, using a modest amount of memory 6 7 /*** 8 Design: 9 Accumulate in powers of three, using 3-way median to collapse entries. 10 At any given time, there is one most-dense (highest power of 3) range of 11 entries and a series of less-dense ranges that hold 0..2 entries each. There 12 is a bounded-size storage array of S cells for all the entries. 13 14 The overflow detect is set up so that a new higher power of 3, K+1, is 15 triggered precisely when range K has 3n entries and all ranges < K have 16 zero entries. 17 18 In general, think of the range sizes as a multi-digit base 3 number, except 19 the highest digit may exceed 2: 20 21 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2 22 0 0 0 0 3n-1 2 2 unused:1 23 24 There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at 25 one more than that, and we add a new 3**0 entry and "carry" by performing 26 medians on any group of 3 elements: 27 28 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2 29 0 0 0 0 3n-1 2 3 unused:0 30 0 0 0 0 3n-1 3 0 carry unused:2 31 0 0 0 0 3n 0 0 carry unused:4 32 33 To accumulate 2 entries at all levels < K and 3 just before the first carry at 34 level 0, we need 2*K + 1 unused cells after doing all carries, or five cells 35 in this case. Since we only have 4 cells in the example above, we need to 36 make room by starting a new power of three: 37 38 3**6 3**5 3**4 3**3 3**2 3**1 3**0 39 0 0 0 0 3n 0 0 K=2 unused:4 40 0 0 0 n 0 0 0 K=3 unused:2n+4 41 42 In the code below, we don't worry about overflow from the topmost place. 43 44 45 ***/ 46 47 #include "encodings/compact_lang_det/subsetsequence.h" 48 #include <stdio.h> 49 50 #include "encodings/compact_lang_det/win/cld_logging.h" 51 52 void DumpInts(const char* label, const int* v, int n) { 53 printf("%s ", label); 54 for (int i = 0; i < n; ++i) { 55 printf("%d ", v[i]); 56 } 57 printf("\n"); 58 } 59 60 void DumpUint8s(const char* label, const uint8* v, int n) { 61 printf("%s ", label); 62 for (int i = 0; i < n; ++i) { 63 printf("%d ", v[i]); 64 } 65 printf("\n"); 66 } 67 68 // Return median of seq_[sub] .. seq_[sub+2], favoring middle element 69 uint8 SubsetSequence::Median3(int sub) { 70 if (seq_[sub] == seq_[sub + 1]) { 71 return seq_[sub]; 72 } 73 if (seq_[sub] == seq_[sub + 2]) { 74 return seq_[sub]; 75 } 76 return seq_[sub + 1]; 77 } 78 79 void SubsetSequence::Init() { 80 // printf("Init\n"); 81 82 k_ = 0; 83 count_[0] = 0; 84 next_e_ = 0; 85 seq_[0] = 0; // Default value if no calls to Add 86 87 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3 88 int reserve = (2 * k_ + 1); 89 level_limit_e_ = kMaxSeq_ - reserve; 90 level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3 91 limit_e_ = level_limit_e_; 92 } 93 94 // Compress level k by 3x, creating level k+1 95 void SubsetSequence::NewLevel() { 96 // printf("NewLevel 3 ** %d\n", k_ + 1); 97 //DumpUint8s("count[k]", count_, k_ + 1); 98 //DumpUint8s("seq[next]", seq_, next_e_); 99 100 // Incoming level must be an exact multiple of three in size 101 CHECK((count_[k_] % 3) == 0); 102 int k_size = count_[k_]; 103 int new_size = k_size / 3; 104 105 // Compress down by 3x, via median 106 for (int j = 0; j < new_size; ++j) { 107 seq_[j] = Median3(j * 3); 108 } 109 110 // Update counts 111 count_[k_] = 0; 112 // Else Overflow -- just continue with 3x dense Level K 113 if (k_ < (kMaxLevel_ - 1)) {++k_;} 114 count_[k_] = new_size; 115 116 // Update limits 117 next_e_ = new_size; 118 limit_e_ = next_e_ + 3; 119 120 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3 121 int reserve = (2 * k_ + 1); 122 level_limit_e_ = kMaxSeq_ - reserve; 123 level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3 124 // 125 //DumpUint8s("after: count[k]", count_, k_ + 1); 126 //DumpUint8s("after: seq[next]", seq_, next_e_); 127 } 128 129 void SubsetSequence::DoCarries() { 130 CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while 131 // Make room by carrying 132 133 //DumpUint8s("DoCarries count[k]", count_, k_ + 1); 134 //DumpUint8s("DoCarries seq[next]", seq_, next_e_); 135 136 int i = 0; 137 while (count_[i] == 3) { 138 next_e_ -= 3; 139 seq_[next_e_] = Median3(next_e_); 140 ++next_e_; 141 count_[i] = 0; 142 ++count_[i + 1]; 143 ++i; 144 } 145 limit_e_ = next_e_ + 3; 146 147 //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1); 148 //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_); 149 150 // If we just fully carried into level K, 151 // Make sure there is now enough room, else start level K + 1 152 if (i >= k_) { 153 CHECK(count_[k_] == next_e_); 154 if (next_e_ >= level_limit_e_) { 155 NewLevel(); 156 } 157 } 158 } 159 160 void SubsetSequence::Add(uint8 e) { 161 // Add an entry then carry as needed 162 seq_[next_e_] = e; 163 ++next_e_; 164 ++count_[0]; 165 166 if (next_e_ >= limit_e_) { 167 DoCarries(); 168 } 169 } 170 171 172 // Collapse tail end by simple median across disparate-weight values, 173 // dropping or duplicating last value if need be. 174 // This routine is idempotent. 175 void SubsetSequence::Flush() { 176 // printf("Flush %d\n", count_[k_]); 177 int start_tail = count_[k_]; 178 int size_tail = next_e_ - start_tail; 179 if ((size_tail % 3) == 2) { 180 seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value 181 ++size_tail; 182 } 183 184 // Compress tail down by 3x, via median 185 int new_size = size_tail / 3; // May delete last value 186 for (int j = 0; j < new_size; ++j) { 187 seq_[start_tail + j] = Median3(start_tail + j * 3); 188 } 189 190 next_e_ = start_tail + new_size; 191 count_[k_] = next_e_; 192 } 193 194 195 // Extract representative pattern of exactly N values into dst[0..n-1] 196 // This routine may be called multiple times, but it may downsample as a 197 // side effect, causing subsequent calls with larger N to get poor answers. 198 void SubsetSequence::Extract(int to_n, uint8* dst) { 199 // Collapse partial-carries in tail 200 Flush(); 201 202 // Just use Bresenham to resample 203 int from_n = next_e_; 204 if (to_n >= from_n) { 205 // Up-sample from_n => to_n 206 int err = to_n - 1; // bias toward no overshoot 207 int j = 0; 208 for (int i = 0; i < to_n; ++i) { 209 dst[i] = seq_[j]; 210 err -= from_n; 211 if (err < 0) { 212 ++j; 213 err += to_n; 214 } 215 } 216 } else { 217 // Get to the point that the number of samples is <= 3 * to_n 218 while (next_e_ > (to_n * 3)) { 219 // Compress down by 3x, via median 220 // printf("Extract, median %d / 3\n", next_e_); 221 if ((next_e_ % 3) == 2) { 222 seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value 223 ++next_e_; 224 } 225 int new_size = next_e_ / 3; // May delete last value 226 for (int j = 0; j < new_size; ++j) { 227 seq_[j] = Median3(j * 3); 228 } 229 next_e_ = new_size; 230 count_[k_] = next_e_; 231 } 232 from_n = next_e_; 233 234 if (to_n == from_n) { 235 // Copy verbatim 236 for (int i = 0; i < to_n; ++i) { 237 dst[i] = seq_[i]; 238 } 239 return; 240 } 241 242 // Down-sample from_n => to_n, using medians 243 int err = 0; // Bias to immediate median sample 244 int j = 0; 245 for (int i = 0; i < from_n; ++i) { 246 err -= to_n; 247 if (err < 0) { 248 if (i <= (next_e_ - 2)) { 249 dst[j] = Median3(i); 250 } else { 251 dst[j] = seq_[i]; 252 } 253 ++j; 254 err += from_n; 255 } 256 } 257 } 258 259 } 260