Home | History | Annotate | Download | only in dsp
      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // YUV->RGB conversion functions
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "./yuv.h"
     15 
     16 #if defined(WEBP_USE_SSE2)
     17 
#include "./common_sse2.h"
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
     21 
     22 //-----------------------------------------------------------------------------
     23 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
     24 
     25 // These constants are 14b fixed-point version of ITU-R BT.601 constants.
     26 // R = (19077 * y             + 26149 * v - 14234) >> 6
     27 // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
     28 // B = (19077 * y + 33050 * u             - 17685) >> 6
     29 static void ConvertYUV444ToRGB(const __m128i* const Y0,
     30                                const __m128i* const U0,
     31                                const __m128i* const V0,
     32                                __m128i* const R,
     33                                __m128i* const G,
     34                                __m128i* const B) {
     35   const __m128i k19077 = _mm_set1_epi16(19077);
     36   const __m128i k26149 = _mm_set1_epi16(26149);
     37   const __m128i k14234 = _mm_set1_epi16(14234);
     38   // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
     39   const __m128i k33050 = _mm_set1_epi16((short)33050);
     40   const __m128i k17685 = _mm_set1_epi16(17685);
     41   const __m128i k6419  = _mm_set1_epi16(6419);
     42   const __m128i k13320 = _mm_set1_epi16(13320);
     43   const __m128i k8708  = _mm_set1_epi16(8708);
     44 
     45   const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
     46 
     47   const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
     48   const __m128i R1 = _mm_sub_epi16(Y1, k14234);
     49   const __m128i R2 = _mm_add_epi16(R1, R0);
     50 
     51   const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
     52   const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
     53   const __m128i G2 = _mm_add_epi16(Y1, k8708);
     54   const __m128i G3 = _mm_add_epi16(G0, G1);
     55   const __m128i G4 = _mm_sub_epi16(G2, G3);
     56 
     57   // be careful with the saturated *unsigned* arithmetic here!
     58   const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
     59   const __m128i B1 = _mm_adds_epu16(B0, Y1);
     60   const __m128i B2 = _mm_subs_epu16(B1, k17685);
     61 
     62   // use logical shift for B2, which can be larger than 32767
     63   *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
     64   *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
     65   *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
     66 }
     67 
     68 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
     69 static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
     70   const __m128i zero = _mm_setzero_si128();
     71   return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
     72 }
     73 
     74 // Load and replicate the U/V samples
     75 static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
     76   const __m128i zero = _mm_setzero_si128();
     77   const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
     78   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
     79   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
     80 }
     81 
     82 // Convert 32 samples of YUV444 to R/G/B
     83 static void YUV444ToRGB(const uint8_t* const y,
     84                         const uint8_t* const u,
     85                         const uint8_t* const v,
     86                         __m128i* const R, __m128i* const G, __m128i* const B) {
     87   const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
     88   ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
     89 }
     90 
     91 // Convert 32 samples of YUV420 to R/G/B
     92 static void YUV420ToRGB(const uint8_t* const y,
     93                         const uint8_t* const u,
     94                         const uint8_t* const v,
     95                         __m128i* const R, __m128i* const G, __m128i* const B) {
     96   const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
     97   ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
     98 }
     99 
    100 // Pack R/G/B/A results into 32b output.
    101 static WEBP_INLINE void PackAndStore4(const __m128i* const R,
    102                                       const __m128i* const G,
    103                                       const __m128i* const B,
    104                                       const __m128i* const A,
    105                                       uint8_t* const dst) {
    106   const __m128i rb = _mm_packus_epi16(*R, *B);
    107   const __m128i ga = _mm_packus_epi16(*G, *A);
    108   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
    109   const __m128i ba = _mm_unpackhi_epi8(rb, ga);
    110   const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
    111   const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
    112   _mm_storeu_si128((__m128i*)(dst +  0), RGBA_lo);
    113   _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
    114 }
    115 
    116 // Pack R/G/B/A results into 16b output.
    117 static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
    118                                          const __m128i* const G,
    119                                          const __m128i* const B,
    120                                          const __m128i* const A,
    121                                          uint8_t* const dst) {
    122 #if !defined(WEBP_SWAP_16BIT_CSP)
    123   const __m128i rg0 = _mm_packus_epi16(*R, *G);
    124   const __m128i ba0 = _mm_packus_epi16(*B, *A);
    125 #else
    126   const __m128i rg0 = _mm_packus_epi16(*B, *A);
    127   const __m128i ba0 = _mm_packus_epi16(*R, *G);
    128 #endif
    129   const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
    130   const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
    131   const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
    132   const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
    133   const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
    134   const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
    135   _mm_storeu_si128((__m128i*)dst, rgba4444);
    136 }
    137 
    138 // Pack R/G/B results into 16b output.
    139 static WEBP_INLINE void PackAndStore565(const __m128i* const R,
    140                                         const __m128i* const G,
    141                                         const __m128i* const B,
    142                                         uint8_t* const dst) {
    143   const __m128i r0 = _mm_packus_epi16(*R, *R);
    144   const __m128i g0 = _mm_packus_epi16(*G, *G);
    145   const __m128i b0 = _mm_packus_epi16(*B, *B);
    146   const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
    147   const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
    148   const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
    149   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
    150   const __m128i rg = _mm_or_si128(r1, g1);
    151   const __m128i gb = _mm_or_si128(g2, b1);
    152 #if !defined(WEBP_SWAP_16BIT_CSP)
    153   const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
    154 #else
    155   const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
    156 #endif
    157   _mm_storeu_si128((__m128i*)dst, rgb565);
    158 }
    159 
    160 // Pack the planar buffers
    161 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    162 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
    163 static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
    164                                     __m128i* const in2, __m128i* const in3,
    165                                     __m128i* const in4, __m128i* const in5,
    166                                     uint8_t* const rgb) {
    167   // The input is 6 registers of sixteen 8b but for the sake of explanation,
    168   // let's take 6 registers of four 8b values.
    169   // To pack, we will keep taking one every two 8b integer and move it
    170   // around as follows:
    171   // Input:
    172   //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
    173   // Split the 6 registers in two sets of 3 registers: the first set as the even
    174   // 8b bytes, the second the odd ones:
    175   //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
    176   // Repeat the same permutations twice more:
    177   //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
    178   //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
    179   VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
    180 
    181   _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
    182   _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
    183   _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
    184   _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
    185   _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
    186   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
    187 }
    188 
    189 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    190                     uint8_t* dst) {
    191   const __m128i kAlpha = _mm_set1_epi16(255);
    192   int n;
    193   for (n = 0; n < 32; n += 8, dst += 32) {
    194     __m128i R, G, B;
    195     YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
    196     PackAndStore4(&R, &G, &B, &kAlpha, dst);
    197   }
    198 }
    199 
    200 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    201                     uint8_t* dst) {
    202   const __m128i kAlpha = _mm_set1_epi16(255);
    203   int n;
    204   for (n = 0; n < 32; n += 8, dst += 32) {
    205     __m128i R, G, B;
    206     YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
    207     PackAndStore4(&B, &G, &R, &kAlpha, dst);
    208   }
    209 }
    210 
    211 void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    212                     uint8_t* dst) {
    213   const __m128i kAlpha = _mm_set1_epi16(255);
    214   int n;
    215   for (n = 0; n < 32; n += 8, dst += 32) {
    216     __m128i R, G, B;
    217     YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
    218     PackAndStore4(&kAlpha, &R, &G, &B, dst);
    219   }
    220 }
    221 
    222 void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    223                         uint8_t* dst) {
    224   const __m128i kAlpha = _mm_set1_epi16(255);
    225   int n;
    226   for (n = 0; n < 32; n += 8, dst += 16) {
    227     __m128i R, G, B;
    228     YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
    229     PackAndStore4444(&R, &G, &B, &kAlpha, dst);
    230   }
    231 }
    232 
    233 void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    234                       uint8_t* dst) {
    235   int n;
    236   for (n = 0; n < 32; n += 8, dst += 16) {
    237     __m128i R, G, B;
    238     YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
    239     PackAndStore565(&R, &G, &B, dst);
    240   }
    241 }
    242 
    243 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    244                    uint8_t* dst) {
    245   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    246   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    247 
    248   YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    249   YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
    250   YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    251   YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    252 
    253   // Cast to 8b and store as RRRRGGGGBBBB.
    254   rgb0 = _mm_packus_epi16(R0, R1);
    255   rgb1 = _mm_packus_epi16(R2, R3);
    256   rgb2 = _mm_packus_epi16(G0, G1);
    257   rgb3 = _mm_packus_epi16(G2, G3);
    258   rgb4 = _mm_packus_epi16(B0, B1);
    259   rgb5 = _mm_packus_epi16(B2, B3);
    260 
    261   // Pack as RGBRGBRGBRGB.
    262   PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    263 }
    264 
    265 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    266                    uint8_t* dst) {
    267   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    268   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    269 
    270   YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    271   YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
    272   YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    273   YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    274 
    275   // Cast to 8b and store as BBBBGGGGRRRR.
    276   bgr0 = _mm_packus_epi16(B0, B1);
    277   bgr1 = _mm_packus_epi16(B2, B3);
    278   bgr2 = _mm_packus_epi16(G0, G1);
    279   bgr3 = _mm_packus_epi16(G2, G3);
    280   bgr4 = _mm_packus_epi16(R0, R1);
    281   bgr5= _mm_packus_epi16(R2, R3);
    282 
    283   // Pack as BGRBGRBGRBGR.
    284   PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    285 }
    286 
    287 //-----------------------------------------------------------------------------
    288 // Arbitrary-length row conversion functions
    289 
    290 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    291                          uint8_t* dst, int len) {
    292   const __m128i kAlpha = _mm_set1_epi16(255);
    293   int n;
    294   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    295     __m128i R, G, B;
    296     YUV420ToRGB(y, u, v, &R, &G, &B);
    297     PackAndStore4(&R, &G, &B, &kAlpha, dst);
    298     y += 8;
    299     u += 4;
    300     v += 4;
    301   }
    302   for (; n < len; ++n) {   // Finish off
    303     VP8YuvToRgba(y[0], u[0], v[0], dst);
    304     dst += 4;
    305     y += 1;
    306     u += (n & 1);
    307     v += (n & 1);
    308   }
    309 }
    310 
    311 static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    312                          uint8_t* dst, int len) {
    313   const __m128i kAlpha = _mm_set1_epi16(255);
    314   int n;
    315   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    316     __m128i R, G, B;
    317     YUV420ToRGB(y, u, v, &R, &G, &B);
    318     PackAndStore4(&B, &G, &R, &kAlpha, dst);
    319     y += 8;
    320     u += 4;
    321     v += 4;
    322   }
    323   for (; n < len; ++n) {   // Finish off
    324     VP8YuvToBgra(y[0], u[0], v[0], dst);
    325     dst += 4;
    326     y += 1;
    327     u += (n & 1);
    328     v += (n & 1);
    329   }
    330 }
    331 
    332 static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    333                          uint8_t* dst, int len) {
    334   const __m128i kAlpha = _mm_set1_epi16(255);
    335   int n;
    336   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    337     __m128i R, G, B;
    338     YUV420ToRGB(y, u, v, &R, &G, &B);
    339     PackAndStore4(&kAlpha, &R, &G, &B, dst);
    340     y += 8;
    341     u += 4;
    342     v += 4;
    343   }
    344   for (; n < len; ++n) {   // Finish off
    345     VP8YuvToArgb(y[0], u[0], v[0], dst);
    346     dst += 4;
    347     y += 1;
    348     u += (n & 1);
    349     v += (n & 1);
    350   }
    351 }
    352 
    353 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    354                         uint8_t* dst, int len) {
    355   int n;
    356   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    357     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    358     __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    359 
    360     YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    361     YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    362     YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    363     YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    364 
    365     // Cast to 8b and store as RRRRGGGGBBBB.
    366     rgb0 = _mm_packus_epi16(R0, R1);
    367     rgb1 = _mm_packus_epi16(R2, R3);
    368     rgb2 = _mm_packus_epi16(G0, G1);
    369     rgb3 = _mm_packus_epi16(G2, G3);
    370     rgb4 = _mm_packus_epi16(B0, B1);
    371     rgb5 = _mm_packus_epi16(B2, B3);
    372 
    373     // Pack as RGBRGBRGBRGB.
    374     PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    375 
    376     y += 32;
    377     u += 16;
    378     v += 16;
    379   }
    380   for (; n < len; ++n) {   // Finish off
    381     VP8YuvToRgb(y[0], u[0], v[0], dst);
    382     dst += 3;
    383     y += 1;
    384     u += (n & 1);
    385     v += (n & 1);
    386   }
    387 }
    388 
    389 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
    390                         uint8_t* dst, int len) {
    391   int n;
    392   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    393     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    394     __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    395 
    396     YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    397     YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    398     YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    399     YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    400 
    401     // Cast to 8b and store as BBBBGGGGRRRR.
    402     bgr0 = _mm_packus_epi16(B0, B1);
    403     bgr1 = _mm_packus_epi16(B2, B3);
    404     bgr2 = _mm_packus_epi16(G0, G1);
    405     bgr3 = _mm_packus_epi16(G2, G3);
    406     bgr4 = _mm_packus_epi16(R0, R1);
    407     bgr5 = _mm_packus_epi16(R2, R3);
    408 
    409     // Pack as BGRBGRBGRBGR.
    410     PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    411 
    412     y += 32;
    413     u += 16;
    414     v += 16;
    415   }
    416   for (; n < len; ++n) {   // Finish off
    417     VP8YuvToBgr(y[0], u[0], v[0], dst);
    418     dst += 3;
    419     y += 1;
    420     u += (n & 1);
    421     v += (n & 1);
    422   }
    423 }
    424 
    425 //------------------------------------------------------------------------------
    426 // Entry point
    427 
    428 extern void WebPInitSamplersSSE2(void);
    429 
    430 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
    431   WebPSamplers[MODE_RGB]  = YuvToRgbRow;
    432   WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
    433   WebPSamplers[MODE_BGR]  = YuvToBgrRow;
    434   WebPSamplers[MODE_BGRA] = YuvToBgraRow;
    435   WebPSamplers[MODE_ARGB] = YuvToArgbRow;
    436 }
    437 
    438 //------------------------------------------------------------------------------
    439 // RGB24/32 -> YUV converters
    440 
// Load eight 16b-words (128 bits, no alignment required) from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words (128 bits, no alignment required) into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
    445 
    446 // Function that inserts a value of the second half of the in buffer in between
    447 // every two char of the first half.
    448 static WEBP_INLINE void RGB24PackedToPlanarHelper(
    449     const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
    450   out[0] = _mm_unpacklo_epi8(in[0], in[3]);
    451   out[1] = _mm_unpackhi_epi8(in[0], in[3]);
    452   out[2] = _mm_unpacklo_epi8(in[1], in[4]);
    453   out[3] = _mm_unpackhi_epi8(in[1], in[4]);
    454   out[4] = _mm_unpacklo_epi8(in[2], in[5]);
    455   out[5] = _mm_unpackhi_epi8(in[2], in[5]);
    456 }
    457 
    458 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
    459 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    460 // Similar to PlanarTo24bHelper(), but in reverse order.
    461 static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
    462                                             __m128i* const out /*out[6]*/) {
    463   __m128i tmp[6];
    464   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
    465   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
    466   tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
    467   tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
    468   tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
    469   tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
    470 
    471   RGB24PackedToPlanarHelper(tmp, out);
    472   RGB24PackedToPlanarHelper(out, tmp);
    473   RGB24PackedToPlanarHelper(tmp, out);
    474   RGB24PackedToPlanarHelper(out, tmp);
    475   RGB24PackedToPlanarHelper(tmp, out);
    476 }
    477 
    478 // Convert 8 packed ARGB to r[], g[], b[]
    479 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
    480                                             __m128i* const rgb /*in[6]*/) {
    481   const __m128i zero = _mm_setzero_si128();
    482   __m128i a0 = LOAD_16(argb + 0);
    483   __m128i a1 = LOAD_16(argb + 4);
    484   __m128i a2 = LOAD_16(argb + 8);
    485   __m128i a3 = LOAD_16(argb + 12);
    486   VP8L32bToPlanar(&a0, &a1, &a2, &a3);
    487   rgb[0] = _mm_unpacklo_epi8(a1, zero);
    488   rgb[1] = _mm_unpackhi_epi8(a1, zero);
    489   rgb[2] = _mm_unpacklo_epi8(a2, zero);
    490   rgb[3] = _mm_unpackhi_epi8(a2, zero);
    491   rgb[4] = _mm_unpacklo_epi8(a3, zero);
    492   rgb[5] = _mm_unpackhi_epi8(a3, zero);
    493 }
    494 
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// on interleaved 16b inputs: the *_LO / *_HI arguments hold pairs of 16b
// values that _mm_madd_epi16() multiplies by the matching pairs of MULT_* and
// sums into 32b. The two 32b halves are shifted and then packed back into
// eight signed-saturated 16b values stored in OUT.
// It's a macro and not a function because _mm_srai_epi32() needs an immediate
// value for DESCALE_FIX.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, OUT) do {               \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
} while (0)
    512 
    513 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
    514 static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
    515                                       const __m128i* const G,
    516                                       const __m128i* const B,
    517                                       __m128i* const Y) {
    518   const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
    519   const __m128i kGB_y = MK_CST_16(16384, 6420);
    520   const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
    521 
    522   const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    523   const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    524   const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    525   const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    526   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
    527 }
    528 
    529 static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
    530                                        const __m128i* const G,
    531                                        const __m128i* const B,
    532                                        __m128i* const U, __m128i* const V) {
    533   const __m128i kRG_u = MK_CST_16(-9719, -19081);
    534   const __m128i kGB_u = MK_CST_16(0, 28800);
    535   const __m128i kRG_v = MK_CST_16(28800, 0);
    536   const __m128i kGB_v = MK_CST_16(-24116, -4684);
    537   const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
    538 
    539   const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    540   const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    541   const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    542   const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    543   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
    544             kHALF_UV, YUV_FIX + 2, *U);
    545   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
    546             kHALF_UV, YUV_FIX + 2, *V);
    547 }
    548 
    549 #undef MK_CST_16
    550 #undef TRANSFORM
    551 
    552 static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
    553   const int max_width = width & ~31;
    554   int i;
    555   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
    556     __m128i rgb_plane[6];
    557     int j;
    558 
    559     RGB24PackedToPlanar(rgb, rgb_plane);
    560 
    561     for (j = 0; j < 2; ++j, i += 16) {
    562       const __m128i zero = _mm_setzero_si128();
    563       __m128i r, g, b, Y0, Y1;
    564 
    565       // Convert to 16-bit Y.
    566       r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
    567       g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
    568       b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
    569       ConvertRGBToY(&r, &g, &b, &Y0);
    570 
    571       // Convert to 16-bit Y.
    572       r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
    573       g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
    574       b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
    575       ConvertRGBToY(&r, &g, &b, &Y1);
    576 
    577       // Cast to 8-bit and store.
    578       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    579     }
    580   }
    581   for (; i < width; ++i, rgb += 3) {   // left-over
    582     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
    583   }
    584 }
    585 
    586 static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
    587   const int max_width = width & ~31;
    588   int i;
    589   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    590     __m128i bgr_plane[6];
    591     int j;
    592 
    593     RGB24PackedToPlanar(bgr, bgr_plane);
    594 
    595     for (j = 0; j < 2; ++j, i += 16) {
    596       const __m128i zero = _mm_setzero_si128();
    597       __m128i r, g, b, Y0, Y1;
    598 
    599       // Convert to 16-bit Y.
    600       b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
    601       g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
    602       r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
    603       ConvertRGBToY(&r, &g, &b, &Y0);
    604 
    605       // Convert to 16-bit Y.
    606       b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
    607       g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
    608       r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
    609       ConvertRGBToY(&r, &g, &b, &Y1);
    610 
    611       // Cast to 8-bit and store.
    612       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    613     }
    614   }
    615   for (; i < width; ++i, bgr += 3) {  // left-over
    616     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
    617   }
    618 }
    619 
    620 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
    621   const int max_width = width & ~15;
    622   int i;
    623   for (i = 0; i < max_width; i += 16) {
    624     __m128i Y0, Y1, rgb[6];
    625     RGB32PackedToPlanar(&argb[i], rgb);
    626     ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
    627     ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
    628     STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    629   }
    630   for (; i < width; ++i) {   // left-over
    631     const uint32_t p = argb[i];
    632     y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
    633                      YUV_HALF);
    634   }
    635 }
    636 
    637 // Horizontal add (doubled) of two 16b values, result is 16b.
    638 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
    639 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
    640                               __m128i* const out) {
    641   const __m128i k2 = _mm_set1_epi16(2);
    642   const __m128i C = _mm_madd_epi16(*A, k2);
    643   const __m128i D = _mm_madd_epi16(*B, k2);
    644   *out = _mm_packs_epi32(C, D);
    645 }
    646 
    647 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
    648                             int src_width, int do_store) {
    649   const int max_width = src_width & ~31;
    650   int i;
    651   for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    652     __m128i rgb[6], U0, V0, U1, V1;
    653     RGB32PackedToPlanar(&argb[i], rgb);
    654     HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
    655     HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
    656     HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
    657     ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
    658 
    659     RGB32PackedToPlanar(&argb[i + 16], rgb);
    660     HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
    661     HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
    662     HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
    663     ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
    664 
    665     U0 = _mm_packus_epi16(U0, U1);
    666     V0 = _mm_packus_epi16(V0, V1);
    667     if (!do_store) {
    668       const __m128i prev_u = LOAD_16(u);
    669       const __m128i prev_v = LOAD_16(v);
    670       U0 = _mm_avg_epu8(U0, prev_u);
    671       V0 = _mm_avg_epu8(V0, prev_v);
    672     }
    673     STORE_16(U0, u);
    674     STORE_16(V0, v);
    675   }
    676   if (i < src_width) {  // left-over
    677     WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
    678   }
    679 }
    680 
    681 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
    682 static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
    683                                                  __m128i* const r,
    684                                                  __m128i* const g,
    685                                                  __m128i* const b) {
    686   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
    687   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
    688   const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
    689   const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
    690   // column-wise transpose
    691   const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
    692   const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
    693   const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
    694   const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
    695   const __m128i B0 = _mm_unpacklo_epi16(A0, A1);  // r0 r1 r2 r3 | g0 g1 ..
    696   const __m128i B1 = _mm_unpackhi_epi16(A0, A1);  // b0 b1 b2 b3 | x x x x
    697   const __m128i B2 = _mm_unpacklo_epi16(A2, A3);  // r4 r5 r6 r7 | g4 g5 ..
    698   const __m128i B3 = _mm_unpackhi_epi16(A2, A3);  // b4 b5 b6 b7 | x x x x
    699   *r = _mm_unpacklo_epi64(B0, B2);
    700   *g = _mm_unpackhi_epi64(B0, B2);
    701   *b = _mm_unpacklo_epi64(B1, B3);
    702 }
    703 
    704 static void ConvertRGBA32ToUV(const uint16_t* rgb,
    705                               uint8_t* u, uint8_t* v, int width) {
    706   const int max_width = width & ~15;
    707   const uint16_t* const last_rgb = rgb + 4 * max_width;
    708   while (rgb < last_rgb) {
    709     __m128i r, g, b, U0, V0, U1, V1;
    710     RGBA32PackedToPlanar_16b(rgb +  0, &r, &g, &b);
    711     ConvertRGBToUV(&r, &g, &b, &U0, &V0);
    712     RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
    713     ConvertRGBToUV(&r, &g, &b, &U1, &V1);
    714     STORE_16(_mm_packus_epi16(U0, U1), u);
    715     STORE_16(_mm_packus_epi16(V0, V1), v);
    716     u += 16;
    717     v += 16;
    718     rgb += 2 * 32;
    719   }
    720   if (max_width < width) {  // left-over
    721     WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
    722   }
    723 }
    724 
    725 //------------------------------------------------------------------------------
    726 
    727 extern void WebPInitConvertARGBToYUVSSE2(void);
    728 
    729 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
    730   WebPConvertARGBToY = ConvertARGBToY;
    731   WebPConvertARGBToUV = ConvertARGBToUV;
    732 
    733   WebPConvertRGB24ToY = ConvertRGB24ToY;
    734   WebPConvertBGR24ToY = ConvertBGR24ToY;
    735 
    736   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
    737 }
    738 
    739 //------------------------------------------------------------------------------
    740 
    741 #define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
    742 static uint16_t clip_y(int v) {
    743   return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
    744 }
    745 
    746 static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
    747                                      uint16_t* dst, int len) {
    748   uint64_t diff = 0;
    749   uint32_t tmp[4];
    750   int i;
    751   const __m128i zero = _mm_setzero_si128();
    752   const __m128i max = _mm_set1_epi16(MAX_Y);
    753   const __m128i one = _mm_set1_epi16(1);
    754   __m128i sum = zero;
    755 
    756   for (i = 0; i + 8 <= len; i += 8) {
    757     const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    758     const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    759     const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    760     const __m128i D = _mm_sub_epi16(A, B);       // diff_y
    761     const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
    762     const __m128i F = _mm_add_epi16(C, D);       // new_y
    763     const __m128i G = _mm_or_si128(E, one);      // -1 or 1
    764     const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
    765     const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
    766     _mm_storeu_si128((__m128i*)(dst + i), H);
    767     sum = _mm_add_epi32(sum, I);
    768   }
    769   _mm_storeu_si128((__m128i*)tmp, sum);
    770   diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
    771   for (; i < len; ++i) {
    772     const int diff_y = ref[i] - src[i];
    773     const int new_y = (int)dst[i] + diff_y;
    774     dst[i] = clip_y(new_y);
    775     diff += (uint64_t)abs(diff_y);
    776   }
    777   return diff;
    778 }
    779 
    780 static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
    781                                    int16_t* dst, int len) {
    782   int i = 0;
    783   for (i = 0; i + 8 <= len; i += 8) {
    784     const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    785     const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    786     const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    787     const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    788     const __m128i E = _mm_add_epi16(C, D);   // new_uv
    789     _mm_storeu_si128((__m128i*)(dst + i), E);
    790   }
    791   for (; i < len; ++i) {
    792     const int diff_uv = ref[i] - src[i];
    793     dst[i] += diff_uv;
    794   }
    795 }
    796 
    797 static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
    798                                    const uint16_t* best_y, uint16_t* out) {
    799   int i;
    800   const __m128i kCst8 = _mm_set1_epi16(8);
    801   const __m128i max = _mm_set1_epi16(MAX_Y);
    802   const __m128i zero = _mm_setzero_si128();
    803   for (i = 0; i + 8 <= len; i += 8) {
    804     const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
    805     const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
    806     const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
    807     const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
    808     const __m128i a0b1 = _mm_add_epi16(a0, b1);
    809     const __m128i a1b0 = _mm_add_epi16(a1, b0);
    810     const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
    811     const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
    812     const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
    813     const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
    814     const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
    815     const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
    816     const __m128i d0 = _mm_add_epi16(c1, a0);
    817     const __m128i d1 = _mm_add_epi16(c0, a1);
    818     const __m128i e0 = _mm_srai_epi16(d0, 1);
    819     const __m128i e1 = _mm_srai_epi16(d1, 1);
    820     const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
    821     const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
    822     const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    823     const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
    824     const __m128i h0 = _mm_add_epi16(g0, f0);
    825     const __m128i h1 = _mm_add_epi16(g1, f1);
    826     const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
    827     const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
    828     _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
    829     _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
    830   }
    831   for (; i < len; ++i) {
    832     //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    833     // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    834     // We reuse the common sub-expressions.
    835     const int a0b1 = A[i + 0] + B[i + 1];
    836     const int a1b0 = A[i + 1] + B[i + 0];
    837     const int a0a1b0b1 = a0b1 + a1b0 + 8;
    838     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    839     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    840     out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    841     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
    842   }
    843 }
    844 
    845 #undef MAX_Y
    846 
    847 //------------------------------------------------------------------------------
    848 
    849 extern void WebPInitSharpYUVSSE2(void);
    850 
    851 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
    852   WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
    853   WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
    854   WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
    855 }
    856 
    857 #else  // !WEBP_USE_SSE2
    858 
    // Branch used when WEBP_USE_SSE2 is not defined: the three SSE2 init entry
    // points are provided through WEBP_DSP_INIT_STUB instead of the definitions
    // in the WEBP_USE_SSE2 branch of this file.
    // NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably
    // it expands to an empty function with the given name -- confirm.
    859 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
    860 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
    861 WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
    862 
    863 #endif  // WEBP_USE_SSE2
    864