Home | History | Annotate | Download | only in dsp
      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // SSE2 variant of methods for lossless decoder
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_SSE2)
     17 
     18 #include "src/dsp/common_sse2.h"
     19 #include "src/dsp/lossless.h"
     20 #include "src/dsp/lossless_common.h"
     21 #include <assert.h>
     22 #include <emmintrin.h>
     23 
     24 //------------------------------------------------------------------------------
     25 // Predictor Transform
     26 
     27 static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
     28                                                         uint32_t c1,
     29                                                         uint32_t c2) {
     30   const __m128i zero = _mm_setzero_si128();
     31   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
     32   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
     33   const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
     34   const __m128i V1 = _mm_add_epi16(C0, C1);
     35   const __m128i V2 = _mm_sub_epi16(V1, C2);
     36   const __m128i b = _mm_packus_epi16(V2, V2);
     37   const uint32_t output = _mm_cvtsi128_si32(b);
     38   return output;
     39 }
     40 
     41 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
     42                                                         uint32_t c1,
     43                                                         uint32_t c2) {
     44   const __m128i zero = _mm_setzero_si128();
     45   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
     46   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
     47   const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
     48   const __m128i avg = _mm_add_epi16(C1, C0);
     49   const __m128i A0 = _mm_srli_epi16(avg, 1);
     50   const __m128i A1 = _mm_sub_epi16(A0, B0);
     51   const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
     52   const __m128i A2 = _mm_sub_epi16(A1, BgtA);
     53   const __m128i A3 = _mm_srai_epi16(A2, 1);
     54   const __m128i A4 = _mm_add_epi16(A0, A3);
     55   const __m128i A5 = _mm_packus_epi16(A4, A4);
     56   const uint32_t output = _mm_cvtsi128_si32(A5);
     57   return output;
     58 }
     59 
     60 static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
     61   int pa_minus_pb;
     62   const __m128i zero = _mm_setzero_si128();
     63   const __m128i A0 = _mm_cvtsi32_si128(a);
     64   const __m128i B0 = _mm_cvtsi32_si128(b);
     65   const __m128i C0 = _mm_cvtsi32_si128(c);
     66   const __m128i AC0 = _mm_subs_epu8(A0, C0);
     67   const __m128i CA0 = _mm_subs_epu8(C0, A0);
     68   const __m128i BC0 = _mm_subs_epu8(B0, C0);
     69   const __m128i CB0 = _mm_subs_epu8(C0, B0);
     70   const __m128i AC = _mm_or_si128(AC0, CA0);
     71   const __m128i BC = _mm_or_si128(BC0, CB0);
     72   const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
     73   const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
     74   const __m128i diff = _mm_sub_epi16(pb, pa);
     75   {
     76     int16_t out[8];
     77     _mm_storeu_si128((__m128i*)out, diff);
     78     pa_minus_pb = out[0] + out[1] + out[2] + out[3];
     79   }
     80   return (pa_minus_pb <= 0) ? a : b;
     81 }
     82 
     83 static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
     84                                        const __m128i* const a1,
     85                                        __m128i* const avg) {
     86   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
     87   const __m128i ones = _mm_set1_epi8(1);
     88   const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
     89   const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
     90   *avg = _mm_sub_epi8(avg1, one);
     91 }
     92 
     93 static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
     94                                              const uint32_t a1,
     95                                              __m128i* const avg) {
     96   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
     97   const __m128i ones = _mm_set1_epi8(1);
     98   const __m128i A0 = _mm_cvtsi32_si128(a0);
     99   const __m128i A1 = _mm_cvtsi32_si128(a1);
    100   const __m128i avg1 = _mm_avg_epu8(A0, A1);
    101   const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
    102   *avg = _mm_sub_epi8(avg1, one);
    103 }
    104 
    105 static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
    106   const __m128i zero = _mm_setzero_si128();
    107   const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
    108   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
    109   const __m128i sum = _mm_add_epi16(A1, A0);
    110   return _mm_srli_epi16(sum, 1);
    111 }
    112 
    113 static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
    114   __m128i output;
    115   Average2_uint32_SSE2(a0, a1, &output);
    116   return _mm_cvtsi128_si32(output);
    117 }
    118 
    119 static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
    120                                           uint32_t a2) {
    121   const __m128i zero = _mm_setzero_si128();
    122   const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
    123   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
    124   const __m128i sum = _mm_add_epi16(avg1, A1);
    125   const __m128i avg2 = _mm_srli_epi16(sum, 1);
    126   const __m128i A2 = _mm_packus_epi16(avg2, avg2);
    127   const uint32_t output = _mm_cvtsi128_si32(A2);
    128   return output;
    129 }
    130 
    131 static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
    132                                           uint32_t a2, uint32_t a3) {
    133   const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
    134   const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
    135   const __m128i sum = _mm_add_epi16(avg2, avg1);
    136   const __m128i avg3 = _mm_srli_epi16(sum, 1);
    137   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
    138   const uint32_t output = _mm_cvtsi128_si32(A0);
    139   return output;
    140 }
    141 
    142 static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
    143   const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
    144   return pred;
    145 }
    146 static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
    147   const uint32_t pred = Average2_SSE2(left, top[-1]);
    148   return pred;
    149 }
    150 static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
    151   const uint32_t pred = Average2_SSE2(left, top[0]);
    152   return pred;
    153 }
    154 static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
    155   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
    156   (void)left;
    157   return pred;
    158 }
    159 static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
    160   const uint32_t pred = Average2_SSE2(top[0], top[1]);
    161   (void)left;
    162   return pred;
    163 }
    164 static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
    165   const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
    166   return pred;
    167 }
    168 static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
    169   const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
    170   return pred;
    171 }
    172 static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
    173   const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
    174   return pred;
    175 }
    176 static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
    177   const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
    178   return pred;
    179 }
    180 
    181 // Batch versions of those functions.
    182 
    183 // Predictor0: ARGB_BLACK.
    184 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
    185                                int num_pixels, uint32_t* out) {
    186   int i;
    187   const __m128i black = _mm_set1_epi32(ARGB_BLACK);
    188   for (i = 0; i + 4 <= num_pixels; i += 4) {
    189     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    190     const __m128i res = _mm_add_epi8(src, black);
    191     _mm_storeu_si128((__m128i*)&out[i], res);
    192   }
    193   if (i != num_pixels) {
    194     VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
    195   }
    196 }
    197 
    198 // Predictor1: left.
    199 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
    200                                int num_pixels, uint32_t* out) {
    201   int i;
    202   __m128i prev = _mm_set1_epi32(out[-1]);
    203   for (i = 0; i + 4 <= num_pixels; i += 4) {
    204     // a | b | c | d
    205     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    206     // 0 | a | b | c
    207     const __m128i shift0 = _mm_slli_si128(src, 4);
    208     // a | a + b | b + c | c + d
    209     const __m128i sum0 = _mm_add_epi8(src, shift0);
    210     // 0 | 0 | a | a + b
    211     const __m128i shift1 = _mm_slli_si128(sum0, 8);
    212     // a | a + b | a + b + c | a + b + c + d
    213     const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    214     const __m128i res = _mm_add_epi8(sum1, prev);
    215     _mm_storeu_si128((__m128i*)&out[i], res);
    216     // replicate prev output on the four lanes
    217     prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
    218   }
    219   if (i != num_pixels) {
    220     VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
    221   }
    222 }
    223 
    224 // Macro that adds 32-bit integers from IN using mod 256 arithmetic
    225 // per 8 bit channel.
    226 #define GENERATE_PREDICTOR_1(X, IN)                                           \
    227 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
    228                                   int num_pixels, uint32_t* out) {            \
    229   int i;                                                                      \
    230   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    231     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    232     const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
    233     const __m128i res = _mm_add_epi8(src, other);                             \
    234     _mm_storeu_si128((__m128i*)&out[i], res);                                 \
    235   }                                                                           \
    236   if (i != num_pixels) {                                                      \
    237     VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
    238   }                                                                           \
    239 }
    240 
    241 // Predictor2: Top.
    242 GENERATE_PREDICTOR_1(2, upper[i])
    243 // Predictor3: Top-right.
    244 GENERATE_PREDICTOR_1(3, upper[i + 1])
    245 // Predictor4: Top-left.
    246 GENERATE_PREDICTOR_1(4, upper[i - 1])
    247 #undef GENERATE_PREDICTOR_1
    248 
    249 // Due to averages with integers, values cannot be accumulated in parallel for
    250 // predictors 5 to 7.
    251 GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
    252 GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
    253 GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
    254 
    255 #define GENERATE_PREDICTOR_2(X, IN)                                           \
    256 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
    257                                    int num_pixels, uint32_t* out) {           \
    258   int i;                                                                      \
    259   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    260     const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
    261     const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
    262     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    263     __m128i avg, res;                                                         \
    264     Average2_m128i(&T, &Tother, &avg);                                        \
    265     res = _mm_add_epi8(avg, src);                                             \
    266     _mm_storeu_si128((__m128i*)&out[i], res);                                 \
    267   }                                                                           \
    268   if (i != num_pixels) {                                                      \
    269     VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
    270   }                                                                           \
    271 }
    272 // Predictor8: average TL T.
    273 GENERATE_PREDICTOR_2(8, upper[i - 1])
    274 // Predictor9: average T TR.
    275 GENERATE_PREDICTOR_2(9, upper[i + 1])
    276 #undef GENERATE_PREDICTOR_2
    277 
    278 // Predictor10: average of (average of (L,TL), average of (T, TR)).
    279 #define DO_PRED10(OUT) do {               \
    280   __m128i avgLTL, avg;                    \
    281   Average2_m128i(&L, &TL, &avgLTL);       \
    282   Average2_m128i(&avgTTR, &avgLTL, &avg); \
    283   L = _mm_add_epi8(avg, src);             \
    284   out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
    285 } while (0)
    286 
    287 #define DO_PRED10_SHIFT do {                                  \
    288   /* Rotate the pre-computed values for the next iteration.*/ \
    289   avgTTR = _mm_srli_si128(avgTTR, 4);                         \
    290   TL = _mm_srli_si128(TL, 4);                                 \
    291   src = _mm_srli_si128(src, 4);                               \
    292 } while (0)
    293 
    294 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
    295                                 int num_pixels, uint32_t* out) {
    296   int i;
    297   __m128i L = _mm_cvtsi32_si128(out[-1]);
    298   for (i = 0; i + 4 <= num_pixels; i += 4) {
    299     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    300     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    301     const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    302     const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    303     __m128i avgTTR;
    304     Average2_m128i(&T, &TR, &avgTTR);
    305     DO_PRED10(0);
    306     DO_PRED10_SHIFT;
    307     DO_PRED10(1);
    308     DO_PRED10_SHIFT;
    309     DO_PRED10(2);
    310     DO_PRED10_SHIFT;
    311     DO_PRED10(3);
    312   }
    313   if (i != num_pixels) {
    314     VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
    315   }
    316 }
    317 #undef DO_PRED10
    318 #undef DO_PRED10_SHIFT
    319 
    320 // Predictor11: select.
    321 #define DO_PRED11(OUT) do {                                            \
    322   const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
    323   const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
    324   const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
    325   const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
    326   const __m128i A = _mm_and_si128(mask, L);                            \
    327   const __m128i B = _mm_andnot_si128(mask, T);                         \
    328   const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
    329   L = _mm_add_epi8(src, pred);                                         \
    330   out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
    331 } while (0)
    332 
    333 #define DO_PRED11_SHIFT do {                                \
    334   /* Shift the pre-computed value for the next iteration.*/ \
    335   T = _mm_srli_si128(T, 4);                                 \
    336   TL = _mm_srli_si128(TL, 4);                               \
    337   src = _mm_srli_si128(src, 4);                             \
    338   pa = _mm_srli_si128(pa, 4);                               \
    339 } while (0)
    340 
    341 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
    342                                 int num_pixels, uint32_t* out) {
    343   int i;
    344   __m128i pa;
    345   __m128i L = _mm_cvtsi32_si128(out[-1]);
    346   for (i = 0; i + 4 <= num_pixels; i += 4) {
    347     __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    348     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    349     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    350     {
    351       // We can unpack with any value on the upper 32 bits, provided it's the
    352       // same on both operands (so that their sum of abs diff is zero). Here we
    353       // use T.
    354       const __m128i T_lo = _mm_unpacklo_epi32(T, T);
    355       const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
    356       const __m128i T_hi = _mm_unpackhi_epi32(T, T);
    357       const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
    358       const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
    359       const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
    360       pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    361     }
    362     DO_PRED11(0);
    363     DO_PRED11_SHIFT;
    364     DO_PRED11(1);
    365     DO_PRED11_SHIFT;
    366     DO_PRED11(2);
    367     DO_PRED11_SHIFT;
    368     DO_PRED11(3);
    369   }
    370   if (i != num_pixels) {
    371     VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
    372   }
    373 }
    374 #undef DO_PRED11
    375 #undef DO_PRED11_SHIFT
    376 
    377 // Predictor12: ClampedAddSubtractFull.
    378 #define DO_PRED12(DIFF, LANE, OUT) do {            \
    379   const __m128i all = _mm_add_epi16(L, (DIFF));    \
    380   const __m128i alls = _mm_packus_epi16(all, all); \
    381   const __m128i res = _mm_add_epi8(src, alls);     \
    382   out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
    383   L = _mm_unpacklo_epi8(res, zero);                \
    384 } while (0)
    385 
    386 #define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
    387   /* Shift the pre-computed value for the next iteration.*/ \
    388   if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
    389   src = _mm_srli_si128(src, 4);                             \
    390 } while (0)
    391 
    392 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
    393                                 int num_pixels, uint32_t* out) {
    394   int i;
    395   const __m128i zero = _mm_setzero_si128();
    396   const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
    397   __m128i L = _mm_unpacklo_epi8(L8, zero);
    398   for (i = 0; i + 4 <= num_pixels; i += 4) {
    399     // Load 4 pixels at a time.
    400     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    401     const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    402     const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    403     const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    404     const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    405     const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    406     const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    407     __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    408     __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    409     DO_PRED12(diff_lo, 0, 0);
    410     DO_PRED12_SHIFT(diff_lo, 0);
    411     DO_PRED12(diff_lo, 1, 1);
    412     DO_PRED12_SHIFT(diff_lo, 1);
    413     DO_PRED12(diff_hi, 0, 2);
    414     DO_PRED12_SHIFT(diff_hi, 0);
    415     DO_PRED12(diff_hi, 1, 3);
    416   }
    417   if (i != num_pixels) {
    418     VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
    419   }
    420 }
    421 #undef DO_PRED12
    422 #undef DO_PRED12_SHIFT
    423 
    424 // Due to averages with integers, values cannot be accumulated in parallel for
    425 // predictors 13.
    426 GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
    427 
    428 //------------------------------------------------------------------------------
    429 // Subtract-Green Transform
    430 
    431 static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
    432                                       uint32_t* dst) {
    433   int i;
    434   for (i = 0; i + 4 <= num_pixels; i += 4) {
    435     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    436     const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    437     const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    438     const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    439     const __m128i out = _mm_add_epi8(in, C);
    440     _mm_storeu_si128((__m128i*)&dst[i], out);
    441   }
    442   // fallthrough and finish off with plain-C
    443   if (i != num_pixels) {
    444     VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
    445   }
    446 }
    447 
    448 //------------------------------------------------------------------------------
    449 // Color Transform
    450 
    451 static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
    452                                        const uint32_t* const src,
    453                                        int num_pixels, uint32_t* dst) {
    454 // sign-extended multiplying constants, pre-shifted by 5.
    455 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
    456 #define MK_CST_16(HI, LO) \
    457   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
    458   const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
    459   const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
    460 #undef MK_CST_16
    461 #undef CST
    462   const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
    463   int i;
    464   for (i = 0; i + 4 <= num_pixels; i += 4) {
    465     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    466     const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    467     const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    468     const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    469     const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    470     const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    471     const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    472     const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    473     const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    474     const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    475     const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    476     const __m128i out = _mm_or_si128(J, A);
    477     _mm_storeu_si128((__m128i*)&dst[i], out);
    478   }
    479   // Fall-back to C-version for left-overs.
    480   if (i != num_pixels) {
    481     VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
    482   }
    483 }
    484 
    485 //------------------------------------------------------------------------------
    486 // Color-space conversion functions
    487 
    488 static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
    489                                   uint8_t* dst) {
    490   const __m128i* in = (const __m128i*)src;
    491   __m128i* out = (__m128i*)dst;
    492 
    493   while (num_pixels >= 32) {
    494     // Load the BGRA buffers.
    495     __m128i in0 = _mm_loadu_si128(in + 0);
    496     __m128i in1 = _mm_loadu_si128(in + 1);
    497     __m128i in2 = _mm_loadu_si128(in + 2);
    498     __m128i in3 = _mm_loadu_si128(in + 3);
    499     __m128i in4 = _mm_loadu_si128(in + 4);
    500     __m128i in5 = _mm_loadu_si128(in + 5);
    501     __m128i in6 = _mm_loadu_si128(in + 6);
    502     __m128i in7 = _mm_loadu_si128(in + 7);
    503     VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    504     VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    505     // At this points, in1/in5 contains red only, in2/in6 green only ...
    506     // Pack the colors in 24b RGB.
    507     VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    508     _mm_storeu_si128(out + 0, in1);
    509     _mm_storeu_si128(out + 1, in5);
    510     _mm_storeu_si128(out + 2, in2);
    511     _mm_storeu_si128(out + 3, in6);
    512     _mm_storeu_si128(out + 4, in3);
    513     _mm_storeu_si128(out + 5, in7);
    514     in += 8;
    515     out += 6;
    516     num_pixels -= 32;
    517   }
    518   // left-overs
    519   if (num_pixels > 0) {
    520     VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
    521   }
    522 }
    523 
    524 static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
    525                                    int num_pixels, uint8_t* dst) {
    526   const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
    527   const __m128i* in = (const __m128i*)src;
    528   __m128i* out = (__m128i*)dst;
    529   while (num_pixels >= 8) {
    530     const __m128i A1 = _mm_loadu_si128(in++);
    531     const __m128i A2 = _mm_loadu_si128(in++);
    532     const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    533     const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    534     const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    535     const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    536     const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    537     const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    538     const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    539     const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    540     const __m128i F1 = _mm_or_si128(E1, C1);
    541     const __m128i F2 = _mm_or_si128(E2, C2);
    542     _mm_storeu_si128(out++, F1);
    543     _mm_storeu_si128(out++, F2);
    544     num_pixels -= 8;
    545   }
    546   // left-overs
    547   if (num_pixels > 0) {
    548     VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
    549   }
    550 }
    551 
    552 static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
    553                                        int num_pixels, uint8_t* dst) {
    554   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
    555   const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
    556   const __m128i* in = (const __m128i*)src;
    557   __m128i* out = (__m128i*)dst;
    558   while (num_pixels >= 8) {
    559     const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    560     const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    561     const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    562     const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    563     const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    564     const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    565     const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    566     const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    567     const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    568     const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    569     const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    570     const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-a7
    571     const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    572     const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    573     const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
    574 #if (WEBP_SWAP_16BIT_CSP == 1)
    575     const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
    576 #else
    577     const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
    578 #endif
    579     _mm_storeu_si128(out++, rgba);
    580     num_pixels -= 8;
    581   }
    582   // left-overs
    583   if (num_pixels > 0) {
    584     VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
    585   }
    586 }
    587 
    588 static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
    589                                      int num_pixels, uint8_t* dst) {
    590   const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
    591   const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
    592   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
    593   const __m128i* in = (const __m128i*)src;
    594   __m128i* out = (__m128i*)dst;
    595   while (num_pixels >= 8) {
    596     const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    597     const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    598     const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    599     const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    600     const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    601     const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    602     const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    603     const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    604     const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    605     const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    606     const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    607     const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    608     const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    609     const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    610     const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    611     const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    612     const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    613     const __m128i b1 = _mm_srli_epi16(b0, 3);
    614     const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
    615 #if (WEBP_SWAP_16BIT_CSP == 1)
    616     const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
    617 #else
    618     const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
    619 #endif
    620     _mm_storeu_si128(out++, rgba);
    621     num_pixels -= 8;
    622   }
    623   // left-overs
    624   if (num_pixels > 0) {
    625     VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
    626   }
    627 }
    628 
    629 static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
    630                                   int num_pixels, uint8_t* dst) {
    631   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
    632   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
    633   const __m128i* in = (const __m128i*)src;
    634   const uint8_t* const end = dst + num_pixels * 3;
    635   // the last storel_epi64 below writes 8 bytes starting at offset 18
    636   while (dst + 26 <= end) {
    637     const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    638     const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    639     const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    640     const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    641     const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    642     const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    643     const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    644     const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    645     const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
    646     const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
    647     const __m128i c2 = _mm_srli_si128(c0, 8);
    648     const __m128i c6 = _mm_srli_si128(c4, 8);
    649     _mm_storel_epi64((__m128i*)(dst +   0), c0);
    650     _mm_storel_epi64((__m128i*)(dst +   6), c2);
    651     _mm_storel_epi64((__m128i*)(dst +  12), c4);
    652     _mm_storel_epi64((__m128i*)(dst +  18), c6);
    653     dst += 24;
    654     num_pixels -= 8;
    655   }
    656   // left-overs
    657   if (num_pixels > 0) {
    658     VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
    659   }
    660 }
    661 
    662 //------------------------------------------------------------------------------
    663 // Entry point
    664 
    665 extern void VP8LDspInitSSE2(void);
    666 
    667 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
    668   VP8LPredictors[5] = Predictor5_SSE2;
    669   VP8LPredictors[6] = Predictor6_SSE2;
    670   VP8LPredictors[7] = Predictor7_SSE2;
    671   VP8LPredictors[8] = Predictor8_SSE2;
    672   VP8LPredictors[9] = Predictor9_SSE2;
    673   VP8LPredictors[10] = Predictor10_SSE2;
    674   VP8LPredictors[11] = Predictor11_SSE2;
    675   VP8LPredictors[12] = Predictor12_SSE2;
    676   VP8LPredictors[13] = Predictor13_SSE2;
    677 
    678   VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
    679   VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
    680   VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
    681   VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
    682   VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
    683   VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
    684   VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
    685   VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
    686   VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
    687   VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
    688   VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
    689   VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
    690   VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
    691   VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
    692 
    693   VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
    694   VP8LTransformColorInverse = TransformColorInverse_SSE2;
    695 
    696   VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
    697   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
    698   VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
    699   VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
    700   VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
    701 }
    702 
    703 #else  // !WEBP_USE_SSE2
    704 
    705 WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)  // branch taken when WEBP_USE_SSE2 is not defined
        // NOTE(review): presumably expands to an empty VP8LDspInitSSE2() so the
        // symbol still exists in non-SSE2 builds; the macro is defined outside
        // this file -- confirm against its definition.
    706 
    707 #endif  // WEBP_USE_SSE2
    708