Home | History | Annotate | Download | only in enc
      1 /* Copyright 2013 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* Literal cost model to allow backward reference replacement to be efficient.
      8 */
      9 
     10 #include "./literal_cost.h"
     11 
     12 #include <brotli/types.h>
     13 #include "./fast_log.h"
     14 #include "./port.h"
     15 #include "./utf8_util.h"
     16 
     17 #if defined(__cplusplus) || defined(c_plusplus)
     18 extern "C" {
     19 #endif
     20 
     21 static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
     22   if (c < 128) {
     23     return 0;  /* Next one is the 'Byte 1' again. */
     24   } else if (c >= 192) {  /* Next one is the 'Byte 2' of utf-8 encoding. */
     25     return BROTLI_MIN(size_t, 1, clamp);
     26   } else {
     27     /* Let's decide over the last byte if this ends the sequence. */
     28     if (last < 0xe0) {
     29       return 0;  /* Completed two or three byte coding. */
     30     } else {  /* Next one is the 'Byte 3' of utf-8 encoding. */
     31       return BROTLI_MIN(size_t, 2, clamp);
     32     }
     33   }
     34 }
     35 
     36 static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
     37                                         const uint8_t *data) {
     38   size_t counts[3] = { 0 };
     39   size_t max_utf8 = 1;  /* should be 2, but 1 compresses better. */
     40   size_t last_c = 0;
     41   size_t i;
     42   for (i = 0; i < len; ++i) {
     43     size_t c = data[(pos + i) & mask];
     44     ++counts[UTF8Position(last_c, c, 2)];
     45     last_c = c;
     46   }
     47   if (counts[2] < 500) {
     48     max_utf8 = 1;
     49   }
     50   if (counts[1] + counts[2] < 25) {
     51     max_utf8 = 0;
     52   }
     53   return max_utf8;
     54 }
     55 
     56 static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
     57                                             const uint8_t *data, float *cost) {
     58   /* max_utf8 is 0 (normal ASCII single byte modeling),
     59      1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
     60   const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
     61   size_t histogram[3][256] = { { 0 } };
     62   size_t window_half = 495;
     63   size_t in_window = BROTLI_MIN(size_t, window_half, len);
     64   size_t in_window_utf8[3] = { 0 };
     65 
     66   size_t i;
     67   {  /* Bootstrap histograms. */
     68     size_t last_c = 0;
     69     size_t utf8_pos = 0;
     70     for (i = 0; i < in_window; ++i) {
     71       size_t c = data[(pos + i) & mask];
     72       ++histogram[utf8_pos][c];
     73       ++in_window_utf8[utf8_pos];
     74       utf8_pos = UTF8Position(last_c, c, max_utf8);
     75       last_c = c;
     76     }
     77   }
     78 
     79   /* Compute bit costs with sliding window. */
     80   for (i = 0; i < len; ++i) {
     81     if (i >= window_half) {
     82       /* Remove a byte in the past. */
     83       size_t c =
     84           i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask];
     85       size_t last_c =
     86           i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask];
     87       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
     88       --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
     89       --in_window_utf8[utf8_pos2];
     90     }
     91     if (i + window_half < len) {
     92       /* Add a byte in the future. */
     93       size_t c = data[(pos + i + window_half - 1) & mask];
     94       size_t last_c = data[(pos + i + window_half - 2) & mask];
     95       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
     96       ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
     97       ++in_window_utf8[utf8_pos2];
     98     }
     99     {
    100       size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
    101       size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
    102       size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
    103       size_t masked_pos = (pos + i) & mask;
    104       size_t histo = histogram[utf8_pos][data[masked_pos]];
    105       double lit_cost;
    106       if (histo == 0) {
    107         histo = 1;
    108       }
    109       lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
    110       lit_cost += 0.02905;
    111       if (lit_cost < 1.0) {
    112         lit_cost *= 0.5;
    113         lit_cost += 0.5;
    114       }
    115       /* Make the first bytes more expensive -- seems to help, not sure why.
    116          Perhaps because the entropy source is changing its properties
    117          rapidly in the beginning of the file, perhaps because the beginning
    118          of the data is a statistical "anomaly". */
    119       if (i < 2000) {
    120         lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35);
    121       }
    122       cost[i] = (float)lit_cost;
    123     }
    124   }
    125 }
    126 
    127 void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
    128                                        const uint8_t *data, float *cost) {
    129   if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
    130     EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
    131     return;
    132   } else {
    133     size_t histogram[256] = { 0 };
    134     size_t window_half = 2000;
    135     size_t in_window = BROTLI_MIN(size_t, window_half, len);
    136 
    137     /* Bootstrap histogram. */
    138     size_t i;
    139     for (i = 0; i < in_window; ++i) {
    140       ++histogram[data[(pos + i) & mask]];
    141     }
    142 
    143     /* Compute bit costs with sliding window. */
    144     for (i = 0; i < len; ++i) {
    145       size_t histo;
    146       if (i >= window_half) {
    147         /* Remove a byte in the past. */
    148         --histogram[data[(pos + i - window_half) & mask]];
    149         --in_window;
    150       }
    151       if (i + window_half < len) {
    152         /* Add a byte in the future. */
    153         ++histogram[data[(pos + i + window_half) & mask]];
    154         ++in_window;
    155       }
    156       histo = histogram[data[(pos + i) & mask]];
    157       if (histo == 0) {
    158         histo = 1;
    159       }
    160       {
    161         double lit_cost = FastLog2(in_window) - FastLog2(histo);
    162         lit_cost += 0.029;
    163         if (lit_cost < 1.0) {
    164           lit_cost *= 0.5;
    165           lit_cost += 0.5;
    166         }
    167         cost[i] = (float)lit_cost;
    168       }
    169     }
    170   }
    171 }
    172 
    173 #if defined(__cplusplus) || defined(c_plusplus)
    174 }  /* extern "C" */
    175 #endif
    176