Home | History | Annotate | Download | only in dsp
      1 // Copyright 2010 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // Speed-critical decoding functions, default plain-C implementations.
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "./dsp.h"
     15 #include "../dec/vp8i_dec.h"
     16 #include "../utils/utils.h"
     17 
     18 //------------------------------------------------------------------------------
     19 
     20 static WEBP_INLINE uint8_t clip_8b(int v) {
     21   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
     22 }
     23 
     24 //------------------------------------------------------------------------------
     25 // Transforms (Paragraph 14.4)
     26 
     27 #define STORE(x, y, v) \
     28   dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
     29 
     30 #define STORE2(y, dc, d, c) do {    \
     31   const int DC = (dc);              \
     32   STORE(0, y, DC + (d));            \
     33   STORE(1, y, DC + (c));            \
     34   STORE(2, y, DC - (c));            \
     35   STORE(3, y, DC - (d));            \
     36 } while (0)
     37 
     38 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
     39 #define MUL2(a) (((a) * 35468) >> 16)
     40 
     41 static void TransformOne(const int16_t* in, uint8_t* dst) {
     42   int C[4 * 4], *tmp;
     43   int i;
     44   tmp = C;
     45   for (i = 0; i < 4; ++i) {    // vertical pass
     46     const int a = in[0] + in[8];    // [-4096, 4094]
     47     const int b = in[0] - in[8];    // [-4095, 4095]
     48     const int c = MUL2(in[4]) - MUL1(in[12]);   // [-3783, 3783]
     49     const int d = MUL1(in[4]) + MUL2(in[12]);   // [-3785, 3781]
     50     tmp[0] = a + d;   // [-7881, 7875]
     51     tmp[1] = b + c;   // [-7878, 7878]
     52     tmp[2] = b - c;   // [-7878, 7878]
     53     tmp[3] = a - d;   // [-7877, 7879]
     54     tmp += 4;
     55     in++;
     56   }
     57   // Each pass is expanding the dynamic range by ~3.85 (upper bound).
     58   // The exact value is (2. + (20091 + 35468) / 65536).
     59   // After the second pass, maximum interval is [-3794, 3794], assuming
     60   // an input in [-2048, 2047] interval. We then need to add a dst value
     61   // in the [0, 255] range.
     62   // In the worst case scenario, the input to clip_8b() can be as large as
     63   // [-60713, 60968].
     64   tmp = C;
     65   for (i = 0; i < 4; ++i) {    // horizontal pass
     66     const int dc = tmp[0] + 4;
     67     const int a =  dc +  tmp[8];
     68     const int b =  dc -  tmp[8];
     69     const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
     70     const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
     71     STORE(0, 0, a + d);
     72     STORE(1, 0, b + c);
     73     STORE(2, 0, b - c);
     74     STORE(3, 0, a - d);
     75     tmp++;
     76     dst += BPS;
     77   }
     78 }
     79 
     80 // Simplified transform when only in[0], in[1] and in[4] are non-zero
     81 static void TransformAC3(const int16_t* in, uint8_t* dst) {
     82   const int a = in[0] + 4;
     83   const int c4 = MUL2(in[4]);
     84   const int d4 = MUL1(in[4]);
     85   const int c1 = MUL2(in[1]);
     86   const int d1 = MUL1(in[1]);
     87   STORE2(0, a + d4, d1, c1);
     88   STORE2(1, a + c4, d1, c1);
     89   STORE2(2, a - c4, d1, c1);
     90   STORE2(3, a - d4, d1, c1);
     91 }
     92 #undef MUL1
     93 #undef MUL2
     94 #undef STORE2
     95 
     96 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
     97   TransformOne(in, dst);
     98   if (do_two) {
     99     TransformOne(in + 16, dst + 4);
    100   }
    101 }
    102 
    103 static void TransformUV(const int16_t* in, uint8_t* dst) {
    104   VP8Transform(in + 0 * 16, dst, 1);
    105   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
    106 }
    107 
    108 static void TransformDC(const int16_t* in, uint8_t* dst) {
    109   const int DC = in[0] + 4;
    110   int i, j;
    111   for (j = 0; j < 4; ++j) {
    112     for (i = 0; i < 4; ++i) {
    113       STORE(i, j, DC);
    114     }
    115   }
    116 }
    117 
    118 static void TransformDCUV(const int16_t* in, uint8_t* dst) {
    119   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
    120   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
    121   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
    122   if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
    123 }
    124 
    125 #undef STORE
    126 
    127 //------------------------------------------------------------------------------
    128 // Paragraph 14.3
    129 
    130 static void TransformWHT(const int16_t* in, int16_t* out) {
    131   int tmp[16];
    132   int i;
    133   for (i = 0; i < 4; ++i) {
    134     const int a0 = in[0 + i] + in[12 + i];
    135     const int a1 = in[4 + i] + in[ 8 + i];
    136     const int a2 = in[4 + i] - in[ 8 + i];
    137     const int a3 = in[0 + i] - in[12 + i];
    138     tmp[0  + i] = a0 + a1;
    139     tmp[8  + i] = a0 - a1;
    140     tmp[4  + i] = a3 + a2;
    141     tmp[12 + i] = a3 - a2;
    142   }
    143   for (i = 0; i < 4; ++i) {
    144     const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
    145     const int a0 = dc             + tmp[3 + i * 4];
    146     const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    147     const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    148     const int a3 = dc             - tmp[3 + i * 4];
    149     out[ 0] = (a0 + a1) >> 3;
    150     out[16] = (a3 + a2) >> 3;
    151     out[32] = (a0 - a1) >> 3;
    152     out[48] = (a3 - a2) >> 3;
    153     out += 64;
    154   }
    155 }
    156 
    157 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
    158 
    159 //------------------------------------------------------------------------------
    160 // Intra predictions
    161 
// Pixel at column 'x', row 'y' relative to 'dst', using a row stride of BPS.
#define DST(x, y) dst[(x) + (y) * BPS]
    163 
    164 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
    165   const uint8_t* top = dst - BPS;
    166   const uint8_t* const clip0 = VP8kclip1 - top[-1];
    167   int y;
    168   for (y = 0; y < size; ++y) {
    169     const uint8_t* const clip = clip0 + dst[-1];
    170     int x;
    171     for (x = 0; x < size; ++x) {
    172       dst[x] = clip[top[x]];
    173     }
    174     dst += BPS;
    175   }
    176 }
    177 static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
    178 static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
    179 static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
    180 
    181 //------------------------------------------------------------------------------
    182 // 16x16
    183 
    184 static void VE16(uint8_t* dst) {     // vertical
    185   int j;
    186   for (j = 0; j < 16; ++j) {
    187     memcpy(dst + j * BPS, dst - BPS, 16);
    188   }
    189 }
    190 
    191 static void HE16(uint8_t* dst) {     // horizontal
    192   int j;
    193   for (j = 16; j > 0; --j) {
    194     memset(dst, dst[-1], 16);
    195     dst += BPS;
    196   }
    197 }
    198 
    199 static WEBP_INLINE void Put16(int v, uint8_t* dst) {
    200   int j;
    201   for (j = 0; j < 16; ++j) {
    202     memset(dst + j * BPS, v, 16);
    203   }
    204 }
    205 
    206 static void DC16(uint8_t* dst) {    // DC
    207   int DC = 16;
    208   int j;
    209   for (j = 0; j < 16; ++j) {
    210     DC += dst[-1 + j * BPS] + dst[j - BPS];
    211   }
    212   Put16(DC >> 5, dst);
    213 }
    214 
    215 static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
    216   int DC = 8;
    217   int j;
    218   for (j = 0; j < 16; ++j) {
    219     DC += dst[-1 + j * BPS];
    220   }
    221   Put16(DC >> 4, dst);
    222 }
    223 
    224 static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
    225   int DC = 8;
    226   int i;
    227   for (i = 0; i < 16; ++i) {
    228     DC += dst[i - BPS];
    229   }
    230   Put16(DC >> 4, dst);
    231 }
    232 
    233 static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
    234   Put16(0x80, dst);
    235 }
    236 
// Table of 16x16 luma predictors; entries 0..6 are assigned in VP8DspInit().
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
    238 
    239 //------------------------------------------------------------------------------
    240 // 4x4
    241 
// Rounded weighted average (a + 2 * b + c) / 4, cast to uint8_t.
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
// Rounded average (a + b) / 2; unlike AVG3, the result is not cast.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
    244 
    245 static void VE4(uint8_t* dst) {    // vertical
    246   const uint8_t* top = dst - BPS;
    247   const uint8_t vals[4] = {
    248     AVG3(top[-1], top[0], top[1]),
    249     AVG3(top[ 0], top[1], top[2]),
    250     AVG3(top[ 1], top[2], top[3]),
    251     AVG3(top[ 2], top[3], top[4])
    252   };
    253   int i;
    254   for (i = 0; i < 4; ++i) {
    255     memcpy(dst + i * BPS, vals, sizeof(vals));
    256   }
    257 }
    258 
    259 static void HE4(uint8_t* dst) {    // horizontal
    260   const int A = dst[-1 - BPS];
    261   const int B = dst[-1];
    262   const int C = dst[-1 + BPS];
    263   const int D = dst[-1 + 2 * BPS];
    264   const int E = dst[-1 + 3 * BPS];
    265   WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C));
    266   WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D));
    267   WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E));
    268   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
    269 }
    270 
    271 static void DC4(uint8_t* dst) {   // DC
    272   uint32_t dc = 4;
    273   int i;
    274   for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
    275   dc >>= 3;
    276   for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
    277 }
    278 
    279 static void RD4(uint8_t* dst) {   // Down-right
    280   const int I = dst[-1 + 0 * BPS];
    281   const int J = dst[-1 + 1 * BPS];
    282   const int K = dst[-1 + 2 * BPS];
    283   const int L = dst[-1 + 3 * BPS];
    284   const int X = dst[-1 - BPS];
    285   const int A = dst[0 - BPS];
    286   const int B = dst[1 - BPS];
    287   const int C = dst[2 - BPS];
    288   const int D = dst[3 - BPS];
    289   DST(0, 3)                                     = AVG3(J, K, L);
    290   DST(1, 3) = DST(0, 2)                         = AVG3(I, J, K);
    291   DST(2, 3) = DST(1, 2) = DST(0, 1)             = AVG3(X, I, J);
    292   DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
    293               DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
    294                           DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
    295                                       DST(3, 0) = AVG3(D, C, B);
    296 }
    297 
    298 static void LD4(uint8_t* dst) {   // Down-Left
    299   const int A = dst[0 - BPS];
    300   const int B = dst[1 - BPS];
    301   const int C = dst[2 - BPS];
    302   const int D = dst[3 - BPS];
    303   const int E = dst[4 - BPS];
    304   const int F = dst[5 - BPS];
    305   const int G = dst[6 - BPS];
    306   const int H = dst[7 - BPS];
    307   DST(0, 0)                                     = AVG3(A, B, C);
    308   DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
    309   DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
    310   DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
    311               DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
    312                           DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
    313                                       DST(3, 3) = AVG3(G, H, H);
    314 }
    315 
    316 static void VR4(uint8_t* dst) {   // Vertical-Right
    317   const int I = dst[-1 + 0 * BPS];
    318   const int J = dst[-1 + 1 * BPS];
    319   const int K = dst[-1 + 2 * BPS];
    320   const int X = dst[-1 - BPS];
    321   const int A = dst[0 - BPS];
    322   const int B = dst[1 - BPS];
    323   const int C = dst[2 - BPS];
    324   const int D = dst[3 - BPS];
    325   DST(0, 0) = DST(1, 2) = AVG2(X, A);
    326   DST(1, 0) = DST(2, 2) = AVG2(A, B);
    327   DST(2, 0) = DST(3, 2) = AVG2(B, C);
    328   DST(3, 0)             = AVG2(C, D);
    329 
    330   DST(0, 3) =             AVG3(K, J, I);
    331   DST(0, 2) =             AVG3(J, I, X);
    332   DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
    333   DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
    334   DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
    335   DST(3, 1) =             AVG3(B, C, D);
    336 }
    337 
    338 static void VL4(uint8_t* dst) {   // Vertical-Left
    339   const int A = dst[0 - BPS];
    340   const int B = dst[1 - BPS];
    341   const int C = dst[2 - BPS];
    342   const int D = dst[3 - BPS];
    343   const int E = dst[4 - BPS];
    344   const int F = dst[5 - BPS];
    345   const int G = dst[6 - BPS];
    346   const int H = dst[7 - BPS];
    347   DST(0, 0) =             AVG2(A, B);
    348   DST(1, 0) = DST(0, 2) = AVG2(B, C);
    349   DST(2, 0) = DST(1, 2) = AVG2(C, D);
    350   DST(3, 0) = DST(2, 2) = AVG2(D, E);
    351 
    352   DST(0, 1) =             AVG3(A, B, C);
    353   DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
    354   DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
    355   DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
    356               DST(3, 2) = AVG3(E, F, G);
    357               DST(3, 3) = AVG3(F, G, H);
    358 }
    359 
    360 static void HU4(uint8_t* dst) {   // Horizontal-Up
    361   const int I = dst[-1 + 0 * BPS];
    362   const int J = dst[-1 + 1 * BPS];
    363   const int K = dst[-1 + 2 * BPS];
    364   const int L = dst[-1 + 3 * BPS];
    365   DST(0, 0) =             AVG2(I, J);
    366   DST(2, 0) = DST(0, 1) = AVG2(J, K);
    367   DST(2, 1) = DST(0, 2) = AVG2(K, L);
    368   DST(1, 0) =             AVG3(I, J, K);
    369   DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
    370   DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
    371   DST(3, 2) = DST(2, 2) =
    372     DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
    373 }
    374 
    375 static void HD4(uint8_t* dst) {  // Horizontal-Down
    376   const int I = dst[-1 + 0 * BPS];
    377   const int J = dst[-1 + 1 * BPS];
    378   const int K = dst[-1 + 2 * BPS];
    379   const int L = dst[-1 + 3 * BPS];
    380   const int X = dst[-1 - BPS];
    381   const int A = dst[0 - BPS];
    382   const int B = dst[1 - BPS];
    383   const int C = dst[2 - BPS];
    384 
    385   DST(0, 0) = DST(2, 1) = AVG2(I, X);
    386   DST(0, 1) = DST(2, 2) = AVG2(J, I);
    387   DST(0, 2) = DST(2, 3) = AVG2(K, J);
    388   DST(0, 3)             = AVG2(L, K);
    389 
    390   DST(3, 0)             = AVG3(A, B, C);
    391   DST(2, 0)             = AVG3(X, A, B);
    392   DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
    393   DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
    394   DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
    395   DST(1, 3)             = AVG3(L, K, J);
    396 }
    397 
    398 #undef DST
    399 #undef AVG3
    400 #undef AVG2
    401 
// Table of 4x4 luma predictors; entries 0..9 are assigned in VP8DspInit().
VP8PredFunc VP8PredLuma4[NUM_BMODES];
    403 
    404 //------------------------------------------------------------------------------
    405 // Chroma
    406 
    407 static void VE8uv(uint8_t* dst) {    // vertical
    408   int j;
    409   for (j = 0; j < 8; ++j) {
    410     memcpy(dst + j * BPS, dst - BPS, 8);
    411   }
    412 }
    413 
    414 static void HE8uv(uint8_t* dst) {    // horizontal
    415   int j;
    416   for (j = 0; j < 8; ++j) {
    417     memset(dst, dst[-1], 8);
    418     dst += BPS;
    419   }
    420 }
    421 
    422 // helper for chroma-DC predictions
    423 static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
    424   int j;
    425   for (j = 0; j < 8; ++j) {
    426     memset(dst + j * BPS, value, 8);
    427   }
    428 }
    429 
    430 static void DC8uv(uint8_t* dst) {     // DC
    431   int dc0 = 8;
    432   int i;
    433   for (i = 0; i < 8; ++i) {
    434     dc0 += dst[i - BPS] + dst[-1 + i * BPS];
    435   }
    436   Put8x8uv(dc0 >> 4, dst);
    437 }
    438 
    439 static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
    440   int dc0 = 4;
    441   int i;
    442   for (i = 0; i < 8; ++i) {
    443     dc0 += dst[i - BPS];
    444   }
    445   Put8x8uv(dc0 >> 3, dst);
    446 }
    447 
    448 static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
    449   int dc0 = 4;
    450   int i;
    451   for (i = 0; i < 8; ++i) {
    452     dc0 += dst[-1 + i * BPS];
    453   }
    454   Put8x8uv(dc0 >> 3, dst);
    455 }
    456 
    457 static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
    458   Put8x8uv(0x80, dst);
    459 }
    460 
// Table of 8x8 chroma predictors; entries 0..6 are assigned in VP8DspInit().
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
    462 
    463 //------------------------------------------------------------------------------
    464 // Edge filtering functions
    465 
    466 // 4 pixels in, 2 pixels out
    467 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
    468   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
    469   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
    470   const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
    471   const int a2 = VP8ksclip2[(a + 3) >> 3];
    472   p[-step] = VP8kclip1[p0 + a2];
    473   p[    0] = VP8kclip1[q0 - a1];
    474 }
    475 
    476 // 4 pixels in, 4 pixels out
    477 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
    478   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
    479   const int a = 3 * (q0 - p0);
    480   const int a1 = VP8ksclip2[(a + 4) >> 3];
    481   const int a2 = VP8ksclip2[(a + 3) >> 3];
    482   const int a3 = (a1 + 1) >> 1;
    483   p[-2*step] = VP8kclip1[p1 + a3];
    484   p[-  step] = VP8kclip1[p0 + a2];
    485   p[      0] = VP8kclip1[q0 - a1];
    486   p[   step] = VP8kclip1[q1 - a3];
    487 }
    488 
    489 // 6 pixels in, 6 pixels out
    490 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
    491   const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
    492   const int q0 = p[0], q1 = p[step], q2 = p[2*step];
    493   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
    494   // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
    495   const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
    496   const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
    497   const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
    498   p[-3*step] = VP8kclip1[p2 + a3];
    499   p[-2*step] = VP8kclip1[p1 + a2];
    500   p[-  step] = VP8kclip1[p0 + a1];
    501   p[      0] = VP8kclip1[q0 - a1];
    502   p[   step] = VP8kclip1[q1 - a2];
    503   p[ 2*step] = VP8kclip1[q2 - a3];
    504 }
    505 
    506 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
    507   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
    508   return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
    509 }
    510 
    511 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
    512   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
    513   return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
    514 }
    515 
    516 static WEBP_INLINE int needs_filter2(const uint8_t* p,
    517                                      int step, int t, int it) {
    518   const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
    519   const int p0 = p[-step], q0 = p[0];
    520   const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
    521   if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
    522   return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
    523          VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
    524          VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
    525 }
    526 
    527 //------------------------------------------------------------------------------
    528 // Simple In-loop filtering (Paragraph 15.2)
    529 
    530 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
    531   int i;
    532   const int thresh2 = 2 * thresh + 1;
    533   for (i = 0; i < 16; ++i) {
    534     if (needs_filter(p + i, stride, thresh2)) {
    535       do_filter2(p + i, stride);
    536     }
    537   }
    538 }
    539 
    540 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    541   int i;
    542   const int thresh2 = 2 * thresh + 1;
    543   for (i = 0; i < 16; ++i) {
    544     if (needs_filter(p + i * stride, 1, thresh2)) {
    545       do_filter2(p + i * stride, 1);
    546     }
    547   }
    548 }
    549 
    550 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
    551   int k;
    552   for (k = 3; k > 0; --k) {
    553     p += 4 * stride;
    554     SimpleVFilter16(p, stride, thresh);
    555   }
    556 }
    557 
    558 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
    559   int k;
    560   for (k = 3; k > 0; --k) {
    561     p += 4;
    562     SimpleHFilter16(p, stride, thresh);
    563   }
    564 }
    565 
    566 //------------------------------------------------------------------------------
    567 // Complex In-loop filtering (Paragraph 15.3)
    568 
    569 static WEBP_INLINE void FilterLoop26(uint8_t* p,
    570                                      int hstride, int vstride, int size,
    571                                      int thresh, int ithresh, int hev_thresh) {
    572   const int thresh2 = 2 * thresh + 1;
    573   while (size-- > 0) {
    574     if (needs_filter2(p, hstride, thresh2, ithresh)) {
    575       if (hev(p, hstride, hev_thresh)) {
    576         do_filter2(p, hstride);
    577       } else {
    578         do_filter6(p, hstride);
    579       }
    580     }
    581     p += vstride;
    582   }
    583 }
    584 
    585 static WEBP_INLINE void FilterLoop24(uint8_t* p,
    586                                      int hstride, int vstride, int size,
    587                                      int thresh, int ithresh, int hev_thresh) {
    588   const int thresh2 = 2 * thresh + 1;
    589   while (size-- > 0) {
    590     if (needs_filter2(p, hstride, thresh2, ithresh)) {
    591       if (hev(p, hstride, hev_thresh)) {
    592         do_filter2(p, hstride);
    593       } else {
    594         do_filter4(p, hstride);
    595       }
    596     }
    597     p += vstride;
    598   }
    599 }
    600 
    601 // on macroblock edges
    602 static void VFilter16(uint8_t* p, int stride,
    603                       int thresh, int ithresh, int hev_thresh) {
    604   FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    605 }
    606 
    607 static void HFilter16(uint8_t* p, int stride,
    608                       int thresh, int ithresh, int hev_thresh) {
    609   FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    610 }
    611 
    612 // on three inner edges
    613 static void VFilter16i(uint8_t* p, int stride,
    614                        int thresh, int ithresh, int hev_thresh) {
    615   int k;
    616   for (k = 3; k > 0; --k) {
    617     p += 4 * stride;
    618     FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    619   }
    620 }
    621 
    622 static void HFilter16i(uint8_t* p, int stride,
    623                        int thresh, int ithresh, int hev_thresh) {
    624   int k;
    625   for (k = 3; k > 0; --k) {
    626     p += 4;
    627     FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    628   }
    629 }
    630 
    631 // 8-pixels wide variant, for chroma filtering
    632 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
    633                      int thresh, int ithresh, int hev_thresh) {
    634   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
    635   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
    636 }
    637 
    638 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
    639                      int thresh, int ithresh, int hev_thresh) {
    640   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
    641   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
    642 }
    643 
    644 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
    645                       int thresh, int ithresh, int hev_thresh) {
    646   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    647   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    648 }
    649 
    650 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
    651                       int thresh, int ithresh, int hev_thresh) {
    652   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    653   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    654 }
    655 
    656 //------------------------------------------------------------------------------
    657 
    658 static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
    659                              int dst_stride) {
    660   int i, j;
    661   for (j = 0; j < 8; ++j) {
    662     for (i = 0; i < 8; ++i) {
    663       const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER;
    664       const int delta1 =
    665           (delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE;
    666       dst[i] = clip_8b((int)dst[i] + delta1);
    667     }
    668     dst += dst_stride;
    669     dither += 8;
    670   }
    671 }
    672 
    673 //------------------------------------------------------------------------------
    674 
// Dispatch pointers for the decoder DSP functions. They are defined here
// without initializers and assigned by VP8DspInit().

// Inverse transforms.
VP8DecIdct2 VP8Transform;
VP8DecIdct VP8TransformAC3;
VP8DecIdct VP8TransformUV;
VP8DecIdct VP8TransformDC;
VP8DecIdct VP8TransformDCUV;

// Complex loop filters ('i' suffix: inner edges) and simple loop filters.
VP8LumaFilterFunc VP8VFilter16;
VP8LumaFilterFunc VP8HFilter16;
VP8ChromaFilterFunc VP8VFilter8;
VP8ChromaFilterFunc VP8HFilter8;
VP8LumaFilterFunc VP8VFilter16i;
VP8LumaFilterFunc VP8HFilter16i;
VP8ChromaFilterFunc VP8VFilter8i;
VP8ChromaFilterFunc VP8HFilter8i;
VP8SimpleFilterFunc VP8SimpleVFilter16;
VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;

// Dithering of an 8x8 area.
void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
                            int dst_stride);
    696 
// CPU-specific initializers, defined in other translation units. VP8DspInit()
// calls each one only when the matching WEBP_USE_* macro is defined and
// VP8GetCPUInfo() reports the feature.
extern void VP8DspInitSSE2(void);
extern void VP8DspInitSSE41(void);
extern void VP8DspInitNEON(void);
extern void VP8DspInitMIPS32(void);
extern void VP8DspInitMIPSdspR2(void);
extern void VP8DspInitMSA(void);

// VP8GetCPUInfo value seen by the last completed VP8DspInit() call. It starts
// out as the variable's own address, presumably a value that can never equal
// a real VP8GetCPUInfo pointer, so that the first call does not return early.
static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
    706 
    707 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    708   if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
    709 
    710   VP8InitClipTables();
    711 
    712   VP8TransformWHT = TransformWHT;
    713   VP8Transform = TransformTwo;
    714   VP8TransformUV = TransformUV;
    715   VP8TransformDC = TransformDC;
    716   VP8TransformDCUV = TransformDCUV;
    717   VP8TransformAC3 = TransformAC3;
    718 
    719   VP8VFilter16 = VFilter16;
    720   VP8HFilter16 = HFilter16;
    721   VP8VFilter8 = VFilter8;
    722   VP8HFilter8 = HFilter8;
    723   VP8VFilter16i = VFilter16i;
    724   VP8HFilter16i = HFilter16i;
    725   VP8VFilter8i = VFilter8i;
    726   VP8HFilter8i = HFilter8i;
    727   VP8SimpleVFilter16 = SimpleVFilter16;
    728   VP8SimpleHFilter16 = SimpleHFilter16;
    729   VP8SimpleVFilter16i = SimpleVFilter16i;
    730   VP8SimpleHFilter16i = SimpleHFilter16i;
    731 
    732   VP8PredLuma4[0] = DC4;
    733   VP8PredLuma4[1] = TM4;
    734   VP8PredLuma4[2] = VE4;
    735   VP8PredLuma4[3] = HE4;
    736   VP8PredLuma4[4] = RD4;
    737   VP8PredLuma4[5] = VR4;
    738   VP8PredLuma4[6] = LD4;
    739   VP8PredLuma4[7] = VL4;
    740   VP8PredLuma4[8] = HD4;
    741   VP8PredLuma4[9] = HU4;
    742 
    743   VP8PredLuma16[0] = DC16;
    744   VP8PredLuma16[1] = TM16;
    745   VP8PredLuma16[2] = VE16;
    746   VP8PredLuma16[3] = HE16;
    747   VP8PredLuma16[4] = DC16NoTop;
    748   VP8PredLuma16[5] = DC16NoLeft;
    749   VP8PredLuma16[6] = DC16NoTopLeft;
    750 
    751   VP8PredChroma8[0] = DC8uv;
    752   VP8PredChroma8[1] = TM8uv;
    753   VP8PredChroma8[2] = VE8uv;
    754   VP8PredChroma8[3] = HE8uv;
    755   VP8PredChroma8[4] = DC8uvNoTop;
    756   VP8PredChroma8[5] = DC8uvNoLeft;
    757   VP8PredChroma8[6] = DC8uvNoTopLeft;
    758 
    759   VP8DitherCombine8x8 = DitherCombine8x8;
    760 
    761   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
    762   if (VP8GetCPUInfo != NULL) {
    763 #if defined(WEBP_USE_SSE2)
    764     if (VP8GetCPUInfo(kSSE2)) {
    765       VP8DspInitSSE2();
    766 #if defined(WEBP_USE_SSE41)
    767       if (VP8GetCPUInfo(kSSE4_1)) {
    768         VP8DspInitSSE41();
    769       }
    770 #endif
    771     }
    772 #endif
    773 #if defined(WEBP_USE_NEON)
    774     if (VP8GetCPUInfo(kNEON)) {
    775       VP8DspInitNEON();
    776     }
    777 #endif
    778 #if defined(WEBP_USE_MIPS32)
    779     if (VP8GetCPUInfo(kMIPS32)) {
    780       VP8DspInitMIPS32();
    781     }
    782 #endif
    783 #if defined(WEBP_USE_MIPS_DSP_R2)
    784     if (VP8GetCPUInfo(kMIPSdspR2)) {
    785       VP8DspInitMIPSdspR2();
    786     }
    787 #endif
    788 #if defined(WEBP_USE_MSA)
    789     if (VP8GetCPUInfo(kMSA)) {
    790       VP8DspInitMSA();
    791     }
    792 #endif
    793   }
    794   dec_last_cpuinfo_used = VP8GetCPUInfo;
    795 }
    796