Home | History | Annotate | Download | only in enc
      1 // Copyright 2013 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Literal cost model to allow backward reference replacement to be efficient.
     16 
     17 #include "./literal_cost.h"
     18 
     19 #include <math.h>
     20 #include <stdint.h>
     21 #include <algorithm>
     22 
     23 namespace brotli {
     24 
     25 static int UTF8Position(int last, int c, int clamp) {
     26   if (c < 128) {
     27     return 0;  // Next one is the 'Byte 1' again.
     28   } else if (c >= 192) {
     29     return std::min(1, clamp);  // Next one is the 'Byte 2' of utf-8 encoding.
     30   } else {
     31     // Let's decide over the last byte if this ends the sequence.
     32     if (last < 0xe0) {
     33       return 0;  // Completed two or three byte coding.
     34     } else {
     35       return std::min(2, clamp);  // Next one is the 'Byte 3' of utf-8 encoding.
     36     }
     37   }
     38 }
     39 
     40 static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
     41                                      const uint8_t *data) {
     42   int counts[3] = { 0 };
     43   int max_utf8 = 1;  // should be 2, but 1 compresses better.
     44   int last_c = 0;
     45   int utf8_pos = 0;
     46   for (int i = 0; i < len; ++i) {
     47     int c = data[(pos + i) & mask];
     48     utf8_pos = UTF8Position(last_c, c, 2);
     49     ++counts[utf8_pos];
     50     last_c = c;
     51   }
     52   if (counts[2] < 500) {
     53     max_utf8 = 1;
     54   }
     55   if (counts[1] + counts[2] < 25) {
     56     max_utf8 = 0;
     57   }
     58   return max_utf8;
     59 }
     60 
     61 void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
     62                                      size_t cost_mask, const uint8_t *data,
     63                                      float *cost) {
     64 
     65   // max_utf8 is 0 (normal ascii single byte modeling),
     66   // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
     67   const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
     68   int histogram[3][256] = { { 0 } };
     69   int window_half = 495;
     70   int in_window = std::min(static_cast<size_t>(window_half), len);
     71   int in_window_utf8[3] = { 0 };
     72 
     73   // Bootstrap histograms.
     74   int last_c = 0;
     75   int utf8_pos = 0;
     76   for (int i = 0; i < in_window; ++i) {
     77     int c = data[(pos + i) & mask];
     78     ++histogram[utf8_pos][c];
     79     ++in_window_utf8[utf8_pos];
     80     utf8_pos = UTF8Position(last_c, c, max_utf8);
     81     last_c = c;
     82   }
     83 
     84   // Compute bit costs with sliding window.
     85   for (int i = 0; i < len; ++i) {
     86     if (i - window_half >= 0) {
     87       // Remove a byte in the past.
     88       int c = (i - window_half - 1) < 0 ?
     89           0 : data[(pos + i - window_half - 1) & mask];
     90       int last_c = (i - window_half - 2) < 0 ?
     91           0 : data[(pos + i - window_half - 2) & mask];
     92       int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
     93       --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
     94       --in_window_utf8[utf8_pos2];
     95     }
     96     if (i + window_half < len) {
     97       // Add a byte in the future.
     98       int c = (i + window_half - 1) < 0 ?
     99           0 : data[(pos + i + window_half - 1) & mask];
    100       int last_c = (i + window_half - 2) < 0 ?
    101           0 : data[(pos + i + window_half - 2) & mask];
    102       int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
    103       ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
    104       ++in_window_utf8[utf8_pos2];
    105     }
    106     int c = i < 1 ? 0 : data[(pos + i - 1) & mask];
    107     int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
    108     int utf8_pos = UTF8Position(last_c, c, max_utf8);
    109     int masked_pos = (pos + i) & mask;
    110     int histo = histogram[utf8_pos][data[masked_pos]];
    111     if (histo == 0) {
    112       histo = 1;
    113     }
    114     float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos])
    115                           / histo);
    116     lit_cost += 0.02905;
    117     if (lit_cost < 1.0) {
    118       lit_cost *= 0.5;
    119       lit_cost += 0.5;
    120     }
    121     // Make the first bytes more expensive -- seems to help, not sure why.
    122     // Perhaps because the entropy source is changing its properties
    123     // rapidly in the beginning of the file, perhaps because the beginning
    124     // of the data is a statistical "anomaly".
    125     if (i < 2000) {
    126       lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
    127     }
    128     cost[(pos + i) & cost_mask] = lit_cost;
    129   }
    130 }
    131 
    132 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
    133                                  size_t cost_mask, const uint8_t *data,
    134                                  float *cost) {
    135   int histogram[256] = { 0 };
    136   int window_half = 2000;
    137   int in_window = std::min(static_cast<size_t>(window_half), len);
    138 
    139   // Bootstrap histogram.
    140   for (int i = 0; i < in_window; ++i) {
    141     ++histogram[data[(pos + i) & mask]];
    142   }
    143 
    144   // Compute bit costs with sliding window.
    145   for (int i = 0; i < len; ++i) {
    146     if (i - window_half >= 0) {
    147       // Remove a byte in the past.
    148       --histogram[data[(pos + i - window_half) & mask]];
    149       --in_window;
    150     }
    151     if (i + window_half < len) {
    152       // Add a byte in the future.
    153       ++histogram[data[(pos + i + window_half) & mask]];
    154       ++in_window;
    155     }
    156     int histo = histogram[data[(pos + i) & mask]];
    157     if (histo == 0) {
    158       histo = 1;
    159     }
    160     float lit_cost = log2(static_cast<double>(in_window) / histo);
    161     lit_cost += 0.029;
    162     if (lit_cost < 1.0) {
    163       lit_cost *= 0.5;
    164       lit_cost += 0.5;
    165     }
    166     cost[(pos + i) & cost_mask] = lit_cost;
    167   }
    168 }
    169 
    170 
    171 }  // namespace brotli
    172