Home | History | Annotate | Download | only in dsp
      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // Speed-critical encoding functions.
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include <stdlib.h>  // for abs()
     15 #include "./dsp.h"
     16 #include "../enc/vp8enci.h"
     17 
     18 #if defined(__cplusplus) || defined(c_plusplus)
     19 extern "C" {
     20 #endif
     21 
     22 static WEBP_INLINE uint8_t clip_8b(int v) {
     23   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
     24 }
     25 
     26 static WEBP_INLINE int clip_max(int v, int max) {
     27   return (v > max) ? max : v;
     28 }
     29 
     30 //------------------------------------------------------------------------------
     31 // Compute susceptibility based on DCT-coeff histograms:
     32 // the higher, the "easier" the macroblock is to compress.
     33 
// Offsets (in samples) of the top-left corner of each 4x4 block inside a
// work buffer whose rows are BPS samples apart. Each entry is x + y * BPS.
// Entries 0..15 are the luma blocks in raster order, entries 16..19 are the
// U blocks and entries 20..23 the V blocks (V sits 8 samples to the right
// of U within the same rows).
const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
     44 
     45 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
     46                              int start_block, int end_block,
     47                              VP8Histogram* const histo) {
     48   int j;
     49   for (j = start_block; j < end_block; ++j) {
     50     int k;
     51     int16_t out[16];
     52 
     53     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
     54 
     55     // Convert coefficients to bin.
     56     for (k = 0; k < 16; ++k) {
     57       const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
     58       const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
     59       histo->distribution[clipped_value]++;
     60     }
     61   }
     62 }
     63 
     64 //------------------------------------------------------------------------------
     65 // run-time tables (~4k)
     66 
// Lookup table filled by InitTables(): clip1[255 + i] holds i saturated to
// [0,255], for every i in [-255,510].
static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]

// Set to 1 by InitTables() once clip1[] has been filled. It is declared
// 'volatile' with the intent that it is written _last_.
// NOTE(review): 'volatile' is not a synchronization primitive in C; two
// threads making the first call concurrently may both fill the table (with
// identical values). Confirm that this is acceptable for all callers.
static volatile int tables_ok = 0;
     72 
     73 static void InitTables(void) {
     74   if (!tables_ok) {
     75     int i;
     76     for (i = -255; i <= 255 + 255; ++i) {
     77       clip1[255 + i] = clip_8b(i);
     78     }
     79     tables_ok = 1;
     80   }
     81 }
     82 
     83 
     84 //------------------------------------------------------------------------------
     85 // Transforms (Paragraph 14.4)
     86 
// Writes sample (x, y) of the reconstructed block: the residual 'v' is
// scaled down by 8, added to the matching 'ref' sample and saturated to
// 8 bits. Expects 'dst' and 'ref' to be in scope at the point of use.
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

// 16.16 fixed-point multipliers used by the inverse transform:
// kC1 / 65536 ~= 1.30656 and kC2 / 65536 ~= 0.54120.
// NOTE(review): these look like sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8);
// confirm against the VP8 specification.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
// Fixed-point product: multiplies, then drops the 16 fractional bits.
#define MUL(a, b) (((a) * (b)) >> 16)
     93 
// Inverse-transforms the 16 coefficients in 'in', adds the result to the 4x4
// block at 'ref' and stores the saturated samples in the 4x4 block at 'dst'
// (both blocks have rows BPS samples apart).
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  int C[4 * 4], *tmp;   // intermediate result of the first pass
  int i;
  tmp = C;
  for (i = 0; i < 4; ++i) {    // vertical pass
    // Iteration i combines in[i], in[4 + i], in[8 + i] and in[12 + i]
    // ('in' is advanced by one each time) and writes 4 consecutive values.
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4;
    in++;
  }

  tmp = C;
  for (i = 0; i < 4; ++i) {    // horizontal pass
    // The +4 is the rounding term for the '>> 3' applied inside STORE().
    const int dc = tmp[0] + 4;
    const int a =  dc +  tmp[8];
    const int b =  dc -  tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    // Iteration i produces output row i.
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
    STORE(3, i, a - d);
    tmp++;
  }
}
    126 
    127 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
    128                        int do_two) {
    129   ITransformOne(ref, in, dst);
    130   if (do_two) {
    131     ITransformOne(ref + 4, in + 16, dst + 4);
    132   }
    133 }
    134 
// Forward transform of the 4x4 difference block 'src' - 'ref' (rows BPS
// samples apart in both inputs). The 16 resulting coefficients are written
// to 'out'.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];   // first-pass output, one group of 4 values per input row
  // First pass: one iteration per row of the block.
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = (d0 + d3);         // 10b                      [-510,510]
    const int a1 = (d1 + d2);
    const int a2 = (d1 - d2);
    const int a3 = (d0 - d3);
    // 2217 / 4096 ~= 0.5413 and 5352 / 4096 ~= 1.3066; shifting by 9 instead
    // of 12 leaves the same x8 scale as the two other outputs.
    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  // Second pass: iteration i combines tmp[i], tmp[4 + i], tmp[8 + i] and
  // tmp[12 + i], and writes out[i], out[4 + i], out[8 + i] and out[12 + i].
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    // Note the extra '+ (a3 != 0)' correction on this output only.
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}
    163 
    164 static void ITransformWHT(const int16_t* in, int16_t* out) {
    165   int tmp[16];
    166   int i;
    167   for (i = 0; i < 4; ++i) {
    168     const int a0 = in[0 + i] + in[12 + i];
    169     const int a1 = in[4 + i] + in[ 8 + i];
    170     const int a2 = in[4 + i] - in[ 8 + i];
    171     const int a3 = in[0 + i] - in[12 + i];
    172     tmp[0  + i] = a0 + a1;
    173     tmp[8  + i] = a0 - a1;
    174     tmp[4  + i] = a3 + a2;
    175     tmp[12 + i] = a3 - a2;
    176   }
    177   for (i = 0; i < 4; ++i) {
    178     const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
    179     const int a0 = dc             + tmp[3 + i * 4];
    180     const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    181     const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    182     const int a3 = dc             - tmp[3 + i * 4];
    183     out[ 0] = (a0 + a1) >> 3;
    184     out[16] = (a3 + a2) >> 3;
    185     out[32] = (a0 - a1) >> 3;
    186     out[48] = (a3 - a2) >> 3;
    187     out += 64;
    188   }
    189 }
    190 
    191 static void FTransformWHT(const int16_t* in, int16_t* out) {
    192   // input is 12b signed
    193   int16_t tmp[16];
    194   int i;
    195   for (i = 0; i < 4; ++i, in += 64) {
    196     const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
    197     const int a1 = (in[1 * 16] + in[3 * 16]);
    198     const int a2 = (in[1 * 16] - in[3 * 16]);
    199     const int a3 = (in[0 * 16] - in[2 * 16]);
    200     tmp[0 + i * 4] = a0 + a1;   // 14b
    201     tmp[1 + i * 4] = a3 + a2;
    202     tmp[2 + i * 4] = a3 - a2;
    203     tmp[3 + i * 4] = a0 - a1;
    204   }
    205   for (i = 0; i < 4; ++i) {
    206     const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    207     const int a1 = (tmp[4 + i] + tmp[12+ i]);
    208     const int a2 = (tmp[4 + i] - tmp[12+ i]);
    209     const int a3 = (tmp[0 + i] - tmp[8 + i]);
    210     const int b0 = a0 + a1;    // 16b
    211     const int b1 = a3 + a2;
    212     const int b2 = a3 - a2;
    213     const int b3 = a0 - a1;
    214     out[ 0 + i] = b0 >> 1;     // 15b
    215     out[ 4 + i] = b1 >> 1;
    216     out[ 8 + i] = b2 >> 1;
    217     out[12 + i] = b3 >> 1;
    218   }
    219 }
    220 
    221 #undef MUL
    222 #undef STORE
    223 
    224 //------------------------------------------------------------------------------
    225 // Intra predictions
    226 
// Sample at column x, row y of the block starting at 'dst' (rows are BPS
// samples apart). Expects a variable named 'dst' in scope.
#define DST(x, y) dst[(x) + (y) * BPS]
    228 
    229 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
    230   int j;
    231   for (j = 0; j < size; ++j) {
    232     memset(dst + j * BPS, value, size);
    233   }
    234 }
    235 
    236 static WEBP_INLINE void VerticalPred(uint8_t* dst,
    237                                      const uint8_t* top, int size) {
    238   int j;
    239   if (top) {
    240     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
    241   } else {
    242     Fill(dst, 127, size);
    243   }
    244 }
    245 
    246 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
    247                                        const uint8_t* left, int size) {
    248   if (left) {
    249     int j;
    250     for (j = 0; j < size; ++j) {
    251       memset(dst + j * BPS, left[j], size);
    252     }
    253   } else {
    254     Fill(dst, 129, size);
    255   }
    256 }
    257 
    258 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
    259                                    const uint8_t* top, int size) {
    260   int y;
    261   if (left) {
    262     if (top) {
    263       const uint8_t* const clip = clip1 + 255 - left[-1];
    264       for (y = 0; y < size; ++y) {
    265         const uint8_t* const clip_table = clip + left[y];
    266         int x;
    267         for (x = 0; x < size; ++x) {
    268           dst[x] = clip_table[top[x]];
    269         }
    270         dst += BPS;
    271       }
    272     } else {
    273       HorizontalPred(dst, left, size);
    274     }
    275   } else {
    276     // true motion without left samples (hence: with default 129 value)
    277     // is equivalent to VE prediction where you just copy the top samples.
    278     // Note that if top samples are not available, the default value is
    279     // then 129, and not 127 as in the VerticalPred case.
    280     if (top) {
    281       VerticalPred(dst, top, size);
    282     } else {
    283       Fill(dst, 129, size);
    284     }
    285   }
    286 }
    287 
    288 static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
    289                                const uint8_t* top,
    290                                int size, int round, int shift) {
    291   int DC = 0;
    292   int j;
    293   if (top) {
    294     for (j = 0; j < size; ++j) DC += top[j];
    295     if (left) {   // top and left present
    296       for (j = 0; j < size; ++j) DC += left[j];
    297     } else {      // top, but no left
    298       DC += DC;
    299     }
    300     DC = (DC + round) >> shift;
    301   } else if (left) {   // left but no top
    302     for (j = 0; j < size; ++j) DC += left[j];
    303     DC += DC;
    304     DC = (DC + round) >> shift;
    305   } else {   // no top, no left, nothing.
    306     DC = 0x80;
    307   }
    308   Fill(dst, DC, size);
    309 }
    310 
    311 //------------------------------------------------------------------------------
    312 // Chroma 8x8 prediction (paragraph 12.2)
    313 
    314 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
    315                              const uint8_t* top) {
    316   // U block
    317   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
    318   VerticalPred(C8VE8 + dst, top, 8);
    319   HorizontalPred(C8HE8 + dst, left, 8);
    320   TrueMotion(C8TM8 + dst, left, top, 8);
    321   // V block
    322   dst += 8;
    323   if (top) top += 8;
    324   if (left) left += 16;
    325   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
    326   VerticalPred(C8VE8 + dst, top, 8);
    327   HorizontalPred(C8HE8 + dst, left, 8);
    328   TrueMotion(C8TM8 + dst, left, top, 8);
    329 }
    330 
    331 //------------------------------------------------------------------------------
    332 // luma 16x16 prediction (paragraph 12.3)
    333 
    334 static void Intra16Preds(uint8_t* dst,
    335                          const uint8_t* left, const uint8_t* top) {
    336   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
    337   VerticalPred(I16VE16 + dst, top, 16);
    338   HorizontalPred(I16HE16 + dst, left, 16);
    339   TrueMotion(I16TM16 + dst, left, top, 16);
    340 }
    341 
    342 //------------------------------------------------------------------------------
    343 // luma 4x4 prediction
    344 
// Rounded 3-tap average with weights (1, 2, 1) / 4.
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// Rounded average of two values.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
    347 
    348 static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
    349   const uint8_t vals[4] = {
    350     AVG3(top[-1], top[0], top[1]),
    351     AVG3(top[ 0], top[1], top[2]),
    352     AVG3(top[ 1], top[2], top[3]),
    353     AVG3(top[ 2], top[3], top[4])
    354   };
    355   int i;
    356   for (i = 0; i < 4; ++i) {
    357     memcpy(dst + i * BPS, vals, 4);
    358   }
    359 }
    360 
    361 static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
    362   const int X = top[-1];
    363   const int I = top[-2];
    364   const int J = top[-3];
    365   const int K = top[-4];
    366   const int L = top[-5];
    367   *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
    368   *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
    369   *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
    370   *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
    371 }
    372 
    373 static void DC4(uint8_t* dst, const uint8_t* top) {
    374   uint32_t dc = 4;
    375   int i;
    376   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
    377   Fill(dst, dc >> 3, 4);
    378 }
    379 
    380 static void RD4(uint8_t* dst, const uint8_t* top) {
    381   const int X = top[-1];
    382   const int I = top[-2];
    383   const int J = top[-3];
    384   const int K = top[-4];
    385   const int L = top[-5];
    386   const int A = top[0];
    387   const int B = top[1];
    388   const int C = top[2];
    389   const int D = top[3];
    390   DST(0, 3)                                     = AVG3(J, K, L);
    391   DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
    392   DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
    393   DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
    394   DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
    395   DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
    396   DST(3, 0)                                     = AVG3(D, C, B);
    397 }
    398 
    399 static void LD4(uint8_t* dst, const uint8_t* top) {
    400   const int A = top[0];
    401   const int B = top[1];
    402   const int C = top[2];
    403   const int D = top[3];
    404   const int E = top[4];
    405   const int F = top[5];
    406   const int G = top[6];
    407   const int H = top[7];
    408   DST(0, 0)                                     = AVG3(A, B, C);
    409   DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
    410   DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
    411   DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
    412   DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
    413   DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
    414   DST(3, 3)                                     = AVG3(G, H, H);
    415 }
    416 
// 4x4 directional prediction (presumably the "vertical-right" mode; confirm
// against the VP8 specification). Reads top[-4..3]: left samples K, J, I,
// the top-left corner X and the top samples A..D.
static void VR4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  // 2-tap averages: row 0, repeated in row 2 shifted right by one sample.
  DST(0, 0) = DST(1, 2) = AVG2(X, A);
  DST(1, 0) = DST(2, 2) = AVG2(A, B);
  DST(2, 0) = DST(3, 2) = AVG2(B, C);
  DST(3, 0)             = AVG2(C, D);

  // 3-tap averages: row 1 (repeated in row 3 shifted right by one sample),
  // plus the first samples of rows 2 and 3.
  DST(0, 3) =             AVG3(K, J, I);
  DST(0, 2) =             AVG3(J, I, X);
  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  DST(3, 1) =             AVG3(B, C, D);
}
    438 
// 4x4 directional prediction (presumably the "vertical-left" mode; confirm
// against the VP8 specification). Reads top[0..7]: the top samples A..D and
// the top-right samples E..H.
static void VL4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  // 2-tap averages: row 0, reused in row 2 shifted left by one sample.
  DST(0, 0) =             AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);

  // 3-tap averages: row 1, reused in row 3 shifted left by one sample; the
  // last two lines set the rightmost samples of rows 2 and 3.
  DST(0, 1) =             AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
              DST(3, 2) = AVG3(E, F, G);
              DST(3, 3) = AVG3(F, G, H);
}
    460 
// 4x4 directional prediction (presumably the "horizontal-up" mode; confirm
// against the VP8 specification). Reads only the four left samples
// I, J, K, L (top[-2] down to top[-5]).
static void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0) =             AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) =             AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  // The rest of the block (last two samples of row 2 and all of row 3)
  // replicates the last left sample.
  DST(3, 2) = DST(2, 2) =
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
    475 
// 4x4 directional prediction (presumably the "horizontal-down" mode; confirm
// against the VP8 specification). Reads top[-5..2]: left samples L, K, J, I,
// the top-left corner X and the top samples A, B, C.
static void HD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];

  // 2-tap averages: column 0, reused in column 2 one row lower.
  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3)             = AVG2(L, K);

  // 3-tap averages: column 1 (reused in column 3 one row lower), plus the
  // last two samples of row 0.
  DST(3, 0)             = AVG3(A, B, C);
  DST(2, 0)             = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3)             = AVG3(L, K, J);
}
    498 
    499 static void TM4(uint8_t* dst, const uint8_t* top) {
    500   int x, y;
    501   const uint8_t* const clip = clip1 + 255 - top[-1];
    502   for (y = 0; y < 4; ++y) {
    503     const uint8_t* const clip_table = clip + top[-2 - y];
    504     for (x = 0; x < 4; ++x) {
    505       dst[x] = clip_table[top[x]];
    506     }
    507     dst += BPS;
    508   }
    509 }
    510 
    511 #undef DST
    512 #undef AVG3
    513 #undef AVG2
    514 
    515 // Left samples are top[-5 .. -2], top_left is top[-1], top are
    516 // located at top[0..3], and top right is top[4..7]
    517 static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
    518   DC4(I4DC4 + dst, top);
    519   TM4(I4TM4 + dst, top);
    520   VE4(I4VE4 + dst, top);
    521   HE4(I4HE4 + dst, top);
    522   RD4(I4RD4 + dst, top);
    523   VR4(I4VR4 + dst, top);
    524   LD4(I4LD4 + dst, top);
    525   VL4(I4VL4 + dst, top);
    526   HD4(I4HD4 + dst, top);
    527   HU4(I4HU4 + dst, top);
    528 }
    529 
    530 //------------------------------------------------------------------------------
    531 // Metric
    532 
    533 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
    534                               int w, int h) {
    535   int count = 0;
    536   int y, x;
    537   for (y = 0; y < h; ++y) {
    538     for (x = 0; x < w; ++x) {
    539       const int diff = (int)a[x] - b[x];
    540       count += diff * diff;
    541     }
    542     a += BPS;
    543     b += BPS;
    544   }
    545   return count;
    546 }
    547 
// Fixed-size wrappers around GetSSE(); the suffix is width x height.

// Sum of squared differences over a 16x16 block.
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 16);
}
// Sum of squared differences over a block 16 samples wide and 8 rows high.
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 8);
}
// Sum of squared differences over an 8x8 block.
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 8, 8);
}
// Sum of squared differences over a 4x4 block.
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
}
    560 
    561 //------------------------------------------------------------------------------
    562 // Texture distortion
    563 //
    564 // We try to match the spectral content (weighted) between source and
    565 // reconstructed samples.
    566 
    567 // Hadamard transform
    568 // Returns the weighted sum of the absolute value of transformed coefficients.
    569 static int TTransform(const uint8_t* in, const uint16_t* w) {
    570   int sum = 0;
    571   int tmp[16];
    572   int i;
    573   // horizontal pass
    574   for (i = 0; i < 4; ++i, in += BPS) {
    575     const int a0 = in[0] + in[2];
    576     const int a1 = in[1] + in[3];
    577     const int a2 = in[1] - in[3];
    578     const int a3 = in[0] - in[2];
    579     tmp[0 + i * 4] = a0 + a1;
    580     tmp[1 + i * 4] = a3 + a2;
    581     tmp[2 + i * 4] = a3 - a2;
    582     tmp[3 + i * 4] = a0 - a1;
    583   }
    584   // vertical pass
    585   for (i = 0; i < 4; ++i, ++w) {
    586     const int a0 = tmp[0 + i] + tmp[8 + i];
    587     const int a1 = tmp[4 + i] + tmp[12+ i];
    588     const int a2 = tmp[4 + i] - tmp[12+ i];
    589     const int a3 = tmp[0 + i] - tmp[8 + i];
    590     const int b0 = a0 + a1;
    591     const int b1 = a3 + a2;
    592     const int b2 = a3 - a2;
    593     const int b3 = a0 - a1;
    594 
    595     sum += w[ 0] * abs(b0);
    596     sum += w[ 4] * abs(b1);
    597     sum += w[ 8] * abs(b2);
    598     sum += w[12] * abs(b3);
    599   }
    600   return sum;
    601 }
    602 
    603 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
    604                     const uint16_t* const w) {
    605   const int sum1 = TTransform(a, w);
    606   const int sum2 = TTransform(b, w);
    607   return abs(sum2 - sum1) >> 5;
    608 }
    609 
    610 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
    611                       const uint16_t* const w) {
    612   int D = 0;
    613   int x, y;
    614   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    615     for (x = 0; x < 16; x += 4) {
    616       D += Disto4x4(a + x + y, b + x + y, w);
    617     }
    618   }
    619   return D;
    620 }
    621 
    622 //------------------------------------------------------------------------------
    623 // Quantization
    624 //
    625 
// Zigzag scan order: kZigzag[n] is the index, within a 16-entry coefficient
// block, of the n-th coefficient visited by QuantizeBlock().
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
    629 
    630 // Simple quantization
    631 static int QuantizeBlock(int16_t in[16], int16_t out[16],
    632                          int n, const VP8Matrix* const mtx) {
    633   int last = -1;
    634   for (; n < 16; ++n) {
    635     const int j = kZigzag[n];
    636     const int sign = (in[j] < 0);
    637     const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    638     if (coeff > mtx->zthresh_[j]) {
    639       const int Q = mtx->q_[j];
    640       const int iQ = mtx->iq_[j];
    641       const int B = mtx->bias_[j];
    642       out[n] = QUANTDIV(coeff, iQ, B);
    643       if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
    644       if (sign) out[n] = -out[n];
    645       in[j] = out[n] * Q;
    646       if (out[n]) last = n;
    647     } else {
    648       out[n] = 0;
    649       in[j] = 0;
    650     }
    651   }
    652   return (last >= 0);
    653 }
    654 
    655 //------------------------------------------------------------------------------
    656 // Block copy
    657 
    658 static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
    659   int y;
    660   for (y = 0; y < size; ++y) {
    661     memcpy(dst, src, size);
    662     src += BPS;
    663     dst += BPS;
    664   }
    665 }
    666 
// Copies one 4x4 block from 'src' to 'dst'; default for VP8Copy4x4.
static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
    668 
    669 //------------------------------------------------------------------------------
    670 // Initialization
    671 
// Speed-critical function pointers. They are not initialized statically:
// VP8EncDspInit() must be called before any of them is used. It installs the
// default C implementations from this file and may then hand over to a
// platform-specific initializer to replace some of them.
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8WHT VP8ITransformWHT;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8BlockCopy VP8Copy4x4;
    690 
// Platform-specific initializers, defined in other translation units.
// VP8EncDspInit() calls at most one of them, after the C defaults are set.
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitNEON(void);
    693 
    694 void VP8EncDspInit(void) {
    695   InitTables();
    696 
    697   // default C implementations
    698   VP8CollectHistogram = CollectHistogram;
    699   VP8ITransform = ITransform;
    700   VP8FTransform = FTransform;
    701   VP8ITransformWHT = ITransformWHT;
    702   VP8FTransformWHT = FTransformWHT;
    703   VP8EncPredLuma4 = Intra4Preds;
    704   VP8EncPredLuma16 = Intra16Preds;
    705   VP8EncPredChroma8 = IntraChromaPreds;
    706   VP8SSE16x16 = SSE16x16;
    707   VP8SSE8x8 = SSE8x8;
    708   VP8SSE16x8 = SSE16x8;
    709   VP8SSE4x4 = SSE4x4;
    710   VP8TDisto4x4 = Disto4x4;
    711   VP8TDisto16x16 = Disto16x16;
    712   VP8EncQuantizeBlock = QuantizeBlock;
    713   VP8Copy4x4 = Copy4x4;
    714 
    715   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
    716   if (VP8GetCPUInfo) {
    717 #if defined(WEBP_USE_SSE2)
    718     if (VP8GetCPUInfo(kSSE2)) {
    719       VP8EncDspInitSSE2();
    720     }
    721 #elif defined(WEBP_USE_NEON)
    722     if (VP8GetCPUInfo(kNEON)) {
    723       VP8EncDspInitNEON();
    724     }
    725 #endif
    726   }
    727 }
    728 
    729 #if defined(__cplusplus) || defined(c_plusplus)
    730 }    // extern "C"
    731 #endif
    732