      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <stdint.h>
     18 #include <x86intrin.h>
     19 
      20 /* Zero-extend the packed 8-bit integers in the low 32 bits (LSBs) into packed 32-bit integers */
     21 static inline __m128i cvtepu8_epi32(__m128i x) {
     22 #if defined(__SSE4_1__)
     23     return _mm_cvtepu8_epi32(x);
     24 #elif defined(__SSSE3__)
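    /* SSSE3 fallback note: _mm_shuffle_epi8 writes zero to any result byte whose
     * index byte has its high bit set, so M8to32 below picks source bytes 0..3
     * into the low byte of each 32-bit lane and zeroes the remaining bytes. */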
     25     const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
     26     x = _mm_shuffle_epi8(x, M8to32);
     27     return x;
     28 #else
      29 #   error "Requires at least SSSE3"
     30 #endif
     31 }
     32 
     33 static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
     34 #if defined(__SSE4_1__)
     35     return _mm_packus_epi32(lo, hi);
     36 #elif defined(__SSSE3__)
     37     const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
     38     const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
     39     const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
     40     const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
     41     lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
     42     lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
     43     hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
     44     hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
     45     return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
     46                         _mm_shuffle_epi8(hi, M32to16H));
     47 #else
      48 #   error "Requires at least SSSE3"
     49 #endif
     50 }
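/* Rough scalar model of packus_epi32 above, added only as a reference sketch
 * (it is not called by the kernels below): each signed 32-bit lane is clamped
 * to [0, 65535] and narrowed to an unsigned 16-bit lane, matching
 * _mm_packus_epi32 and the SSSE3 compare/shuffle fallback. */
static inline unsigned short packus_epi32_scalar_ref(int32_t v) {
    if (v < 0) return 0;           /* negative lanes saturate to 0 */
    if (v > 65535) return 65535;   /* large lanes saturate to 0xffff */
    return (unsigned short)v;
}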
     51 
     52 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
     53 #if defined(__SSE4_1__)
     54     return _mm_mullo_epi32(x, y);
     55 #elif defined(__SSSE3__)
     56     const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
     57     __m128i even = _mm_mul_epu32(x, y);
     58     __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
     59                                 _mm_srli_si128(y, 4));
     60     even = _mm_and_si128(even, Meven);
     61     odd = _mm_and_si128(odd, Meven);
     62     return _mm_or_si128(even, _mm_slli_si128(odd, 4));
     63 #else
      64 #   error "Requires at least SSSE3"
     65 #endif
     66 }
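/* Note on the SSSE3 path above: the low 32 bits of a 32x32-bit product are the
 * same for signed and unsigned operands, so two _mm_mul_epu32 calls (even and
 * odd lanes) recombined with byte shifts and masking reproduce _mm_mullo_epi32. */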
     67 
      68 /* 'mask' must be packed 8-bit values of 0x00 or 0xff */
     69 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
     70 #if defined(__SSE4_1__)
     71     return _mm_blendv_epi8(x, y, mask);
     72 #elif defined(__SSSE3__)
     73     return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
     74 #else
      75 #   error "Requires at least SSSE3"
     76 #endif
     77 }
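/* Per byte this computes (x & ~mask) | (y & mask); with a 0x00/0xff mask both
 * the SSE4.1 and the SSSE3 paths select x where the mask is clear and y where
 * it is set. */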
     78 
     79 void rsdIntrinsicConvolve3x3_K(void *dst,
     80                                const void *y0, const void *y1, const void *y2,
     81                                const short *coef, uint32_t count) {
     82     __m128i x;
     83     __m128i c0, c2, c4, c6, c8;
     84     __m128i r0, r1, r2;
     85     __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
     86     __m128i o0, o1;
     87     uint32_t i;
     88 
     89     x = _mm_loadl_epi64((const __m128i *)(coef+0));
     90     c0 = _mm_shuffle_epi32(x, 0x00);
     91     c2 = _mm_shuffle_epi32(x, 0x55);
     92     x = _mm_loadl_epi64((const __m128i *)(coef+4));
     93     c4 = _mm_shuffle_epi32(x, 0x00);
     94     c6 = _mm_shuffle_epi32(x, 0x55);
     95     x = _mm_loadl_epi64((const __m128i *)(coef+8));
     96     c8 = _mm_shuffle_epi32(x, 0x00);
     97 
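    /* Each cN above holds one adjacent pair of 16-bit kernel taps broadcast
     * across the register, so a single _mm_madd_epi16 on two interleaved
     * pixels below applies both taps and accumulates them per channel. */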
     98     for (i = 0; i < count; ++i) {
     99 
    100         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
    101         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
    102         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
    103         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
    104         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
    105         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
    106         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
    107         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
    108         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
    109         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
    110         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
    111         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
    112 
    113         o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
    114         o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
    115 
    116         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
    117         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
    118 
    119         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
    120         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
    121 
    122         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
    123         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
    124 
    125         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
    126         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
    127 
    128         o0 = _mm_srai_epi32(o0, 8);
    129         o1 = _mm_srai_epi32(o1, 8);
    130 
    131         o0 = packus_epi32(o0, o1);
    132         o0 = _mm_packus_epi16(o0, o0);
    133         _mm_storel_epi64((__m128i *)dst, o0);
    134 
    135         y0 = (const char *)y0 + 8;
    136         y1 = (const char *)y1 + 8;
    137         y2 = (const char *)y2 + 8;
    138         dst = (char *)dst + 8;
    139     }
    140 }
    141 
    142 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
    143                                   const short *coef, uint32_t count) {
    144     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    145                                       14, 10, 6, 2,
    146                                       13,  9, 5, 1,
    147                                       12,  8, 4, 0);
    148 
    149     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    150     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    151     __m128i c0, c1, c2, c3;
    152     __m128i i4, o4;
    153     __m128i xy, zw;
    154     __m128i x2, y2, z2, w2;
    155     uint32_t i;
    156 
    157     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    158     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    159     c0 = _mm_unpacklo_epi16(c0, c1);
    160 
    161     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    162     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    163     c2 = _mm_unpacklo_epi16(c2, c3);
    164 
    165     for (i = 0; i < count; ++i) {
     166         i4 = _mm_loadu_si128((const __m128i *)src);
    167         xy = _mm_shuffle_epi8(i4, Mxy);
    168         zw = _mm_shuffle_epi8(i4, Mzw);
    169 
    170         x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
    171         y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
    172         z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
    173         w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
    174 
    175         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
    176         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
    177         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
    178         w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
    179 
    180         x2 = _mm_srai_epi32(x2, 8);
    181         y2 = _mm_srai_epi32(y2, 8);
    182         z2 = _mm_srai_epi32(z2, 8);
    183         w2 = _mm_srai_epi32(w2, 8);
    184 
    185         x2 = packus_epi32(x2, y2);
    186         z2 = packus_epi32(z2, w2);
    187         o4 = _mm_packus_epi16(x2, z2);
    188 
    189         o4 = _mm_shuffle_epi8(o4, T4x4);
    190         _mm_storeu_si128((__m128i *)dst, o4);
    191 
    192         src = (const char *)src + 16;
    193         dst = (char *)dst + 16;
    194     }
    195 }
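/* Sketch of the arithmetic in rsdIntrinsicColorMatrix4x4_K above, assuming the
 * coefficients are signed 8.8 fixed point: for each pixel,
 *     out.x = (coef[0]*in.x + coef[4]*in.y + coef[8]*in.z + coef[12]*in.w) >> 8
 * and similarly for out.y/z/w using columns 1, 2 and 3 of the matrix, with each
 * result then saturated to 8 bits. */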
    196 
    197 void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
    198                                   const short *coef, uint32_t count) {
    199     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    200                                       14, 10, 6, 2,
    201                                       13,  9, 5, 1,
    202                                       12,  8, 4, 0);
    203 
    204     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    205     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    206 
    207     __m128i c0, c1, c2, c3;
    208     __m128i i4, o4;
    209     __m128i xy, zw;
    210     __m128i x2, y2, z2, w2;
    211     uint32_t i;
    212 
    213     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    214     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    215     c0 = _mm_unpacklo_epi16(c0, c1);
    216 
    217     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    218     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    219     c2 = _mm_unpacklo_epi16(c2, c3);
    220 
    221     for (i = 0; i < count; ++i) {
    222         i4 = _mm_loadu_si128((const __m128i *)src);
    223         xy = _mm_shuffle_epi8(i4, Mxy);
    224         zw = _mm_shuffle_epi8(i4, Mzw);
    225 
    226         x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
    227         y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
    228         z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
    229 
    230         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
    231         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
    232         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
    233 
    234         x2 = _mm_srai_epi32(x2, 8);
    235         y2 = _mm_srai_epi32(y2, 8);
    236         z2 = _mm_srai_epi32(z2, 8);
    237         w2 = _mm_srli_epi32(zw, 16);
    238 
    239         x2 = packus_epi32(x2, y2);
    240         z2 = packus_epi32(z2, w2);
    241         o4 = _mm_packus_epi16(x2, z2);
    242 
    243         o4 = _mm_shuffle_epi8(o4, T4x4);
    244         _mm_storeu_si128((__m128i *)dst, o4);
    245 
    246         src = (const char *)src + 16;
    247         dst = (char *)dst + 16;
    248     }
    249 }
    250 
    251 void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
    252                                   const short *coef, uint32_t count) {
    253     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    254                                       14, 10, 6, 2,
    255                                       13,  9, 5, 1,
    256                                       12,  8, 4, 0);
    257     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    258     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    259     __m128i c0, c1, c2, c3;
    260     __m128i i4, o4;
    261     __m128i xy, zw;
    262     __m128i x2, y2, z2, w2;
    263     uint32_t i;
    264 
    265     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    266     c0 = _mm_shufflelo_epi16(c0, 0);
    267     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    268     c1 = _mm_shufflelo_epi16(c1, 0);
    269     c0 = _mm_unpacklo_epi16(c0, c1);
    270 
    271     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    272     c2 = _mm_shufflelo_epi16(c2, 0);
    273     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    274     c3 = _mm_shufflelo_epi16(c3, 0);
    275     c2 = _mm_unpacklo_epi16(c2, c3);
    276 
    277     for (i = 0; i < count; ++i) {
    278         i4 = _mm_loadu_si128((const __m128i *)src);
    279 
    280         xy = _mm_shuffle_epi8(i4, Mxy);
    281         zw = _mm_shuffle_epi8(i4, Mzw);
    282 
    283         x2 =  _mm_madd_epi16(xy, c0);
    284         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
    285 
    286         x2 = _mm_srai_epi32(x2, 8);
    287         y2 = x2;
    288         z2 = x2;
    289         w2 = _mm_srli_epi32(zw, 16);
    290 
    291         x2 = packus_epi32(x2, y2);
    292         z2 = packus_epi32(z2, w2);
    293         o4 = _mm_packus_epi16(x2, z2);
    294 
    295         o4 = _mm_shuffle_epi8(o4, T4x4);
    296         _mm_storeu_si128((__m128i *)dst, o4);
    297 
    298         src = (const char *)src + 16;
    299         dst = (char *)dst + 16;
    300     }
    301 }
    302 
    303 void rsdIntrinsicBlurVFU4_K(void *dst,
    304                           const void *pin, int stride, const void *gptr,
    305                           int rct, int x1, int x2) {
    306     const char *pi;
    307     __m128i pi0, pi1;
    308     __m128 pf0, pf1;
    309     __m128 bp0, bp1;
    310     __m128 x;
    311     int r;
    312 
    313     for (; x1 < x2; x1 += 2) {
    314         pi = (const char *)pin + (x1 << 2);
    315         bp0 = _mm_setzero_ps();
    316         bp1 = _mm_setzero_ps();
    317 
    318         for (r = 0; r < rct; ++r) {
    319             x = _mm_load_ss((const float *)gptr + r);
    320             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    321 
    322             pi0 = _mm_cvtsi32_si128(*(const int *)pi);
    323             pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
    324 
    325             pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
    326             pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
    327 
    328             bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
    329             bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
    330 
    331             pi += stride;
    332         }
    333 
    334         _mm_storeu_ps((float *)dst, bp0);
    335         _mm_storeu_ps((float *)dst + 4, bp1);
    336         dst = (char *)dst + 32;
    337     }
    338 }
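/* rsdIntrinsicBlurVFU4_K above writes two unclamped float4 sums (32 bytes) per
 * iteration; this float buffer is presumably consumed by the horizontal passes
 * below, which scale it and convert back to 8-bit pixels. */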
    339 
    340 void rsdIntrinsicBlurHFU4_K(void *dst,
    341                           const void *pin, const void *gptr,
    342                           int rct, int x1, int x2) {
    343     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    344     const float *pi;
    345     __m128 pf, x, y;
    346     __m128i o;
    347     int r;
    348 
    349     for (; x1 < x2; ++x1) {
     350         /* rct is defined as 2*r+1 by the caller */
    351         x = _mm_load_ss((const float *)gptr);
    352         x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    353 
    354         pi = (const float *)pin + (x1 << 2);
    355         pf = _mm_mul_ps(x, _mm_load_ps(pi));
    356 
    357         for (r = 1; r < rct; r += 2) {
    358             x = _mm_load_ss((const float *)gptr + r);
    359             y = _mm_load_ss((const float *)gptr + r + 1);
    360             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    361             y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
    362 
    363             pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
    364             pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
    365         }
    366 
    367         o = _mm_cvtps_epi32(pf);
    368         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
    369         dst = (char *)dst + 4;
    370     }
    371 }
    372 
    373 void rsdIntrinsicBlurHFU1_K(void *dst,
    374                           const void *pin, const void *gptr,
    375                           int rct, int x1, int x2) {
    376     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    377     const float *pi;
    378     __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    379     __m128i o;
    380     int r;
    381 
    382     for (; x1 < x2; x1+=4) {
    383         g0 = _mm_load_ss((const float *)gptr);
    384         g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
    385 
    386         pi = (const float *)pin + x1;
    387         pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
    388 
    389         for (r = 1; r < rct; r += 4) {
    390             gx = _mm_loadu_ps((const float *)gptr + r);
    391             p0 = _mm_loadu_ps(pi + r);
    392             p1 = _mm_loadu_ps(pi + r + 4);
    393 
    394             g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
    395             pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
    396             g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
     397             pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
     398             g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
     399             pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
     400             g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
     401             pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
    402         }
    403 
    404         o = _mm_cvtps_epi32(pf);
    405         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
    406         dst = (char *)dst + 4;
    407     }
    408 }
    409 
    410 void rsdIntrinsicYuv_K(void *dst,
    411                        const unsigned char *pY, const unsigned char *pUV,
    412                        uint32_t count, const short *param) {
    413     __m128i biasY, biasUV;
    414     __m128i c0, c1, c2, c3, c4;
    415 
    416     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    417     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    418 
    419     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    420     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    421     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    422     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    423     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    424 
    425     __m128i Y, UV, U, V, R, G, B, A;
    426 
    427     A = _mm_set1_epi32(255);
    428     uint32_t i;
    429 
    430     for (i = 0; i < (count << 1); ++i) {
    431         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    432         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
    433 
    434         Y = _mm_sub_epi32(Y, biasY);
    435         UV = _mm_sub_epi32(UV, biasUV);
    436 
    437         U = _mm_shuffle_epi32(UV, 0xf5);
    438         V = _mm_shuffle_epi32(UV, 0xa0);
    439 
    440         Y = mullo_epi32(Y, c0);
    441 
    442         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    443         R = _mm_add_epi32(R, biasUV);
    444         R = _mm_srai_epi32(R, 8);
    445 
    446         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    447         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    448         G = _mm_add_epi32(G, biasUV);
    449         G = _mm_srai_epi32(G, 8);
    450 
    451         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    452         B = _mm_add_epi32(B, biasUV);
    453         B = _mm_srai_epi32(B, 8);
    454 
    455         __m128i y1, y2, y3, y4;
    456 
    457         y1 = packus_epi32(R, G);
    458         y2 = packus_epi32(B, A);
    459         y3 = _mm_packus_epi16(y1, y2);
    460         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    461                                           14, 10, 6, 2,
    462                                           13,  9, 5, 1,
    463                                           12,  8, 4, 0);
    464         y4 = _mm_shuffle_epi8(y3, T4x4);
    465         _mm_storeu_si128((__m128i *)dst, y4);
    466         pY += 4;
    467         pUV += 4;
    468         dst = (__m128i *)dst + 1;
    469     }
    470 }
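/* Sketch of the fixed-point conversion in rsdIntrinsicYuv_K above, assuming the
 * parameter values noted in the comments (298, 409, -100, 516, -208, biases 16
 * and 128); the R variant below only swaps the U and V shuffles:
 *     R = (298*(Y-16) + 409*(V-128) + 128) >> 8
 *     G = (298*(Y-16) - 100*(U-128) - 208*(V-128) + 128) >> 8
 *     B = (298*(Y-16) + 516*(U-128) + 128) >> 8
 * with each channel saturated to [0, 255] and A forced to 255. */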
    471 
    472 void rsdIntrinsicYuvR_K(void *dst,
    473                        const unsigned char *pY, const unsigned char *pUV,
    474                        uint32_t count, const short *param) {
    475     __m128i biasY, biasUV;
    476     __m128i c0, c1, c2, c3, c4;
    477 
    478     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    479     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    480 
    481     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    482     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    483     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    484     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    485     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    486 
    487     __m128i Y, UV, U, V, R, G, B, A;
    488 
    489     A = _mm_set1_epi32(255);
    490     uint32_t i;
    491 
    492     for (i = 0; i < (count << 1); ++i) {
    493         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    494         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
    495 
    496         Y = _mm_sub_epi32(Y, biasY);
    497         UV = _mm_sub_epi32(UV, biasUV);
    498 
    499         V = _mm_shuffle_epi32(UV, 0xf5);
    500         U = _mm_shuffle_epi32(UV, 0xa0);
    501 
    502         Y = mullo_epi32(Y, c0);
    503 
    504         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    505         R = _mm_add_epi32(R, biasUV);
    506         R = _mm_srai_epi32(R, 8);
    507 
    508         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    509         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    510         G = _mm_add_epi32(G, biasUV);
    511         G = _mm_srai_epi32(G, 8);
    512 
    513         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    514         B = _mm_add_epi32(B, biasUV);
    515         B = _mm_srai_epi32(B, 8);
    516 
    517         __m128i y1, y2, y3, y4;
    518 
    519         y1 = packus_epi32(R, G);
    520         y2 = packus_epi32(B, A);
    521         y3 = _mm_packus_epi16(y1, y2);
    522         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    523                                           14, 10, 6, 2,
    524                                           13,  9, 5, 1,
    525                                           12,  8, 4, 0);
    526         y4 = _mm_shuffle_epi8(y3, T4x4);
    527         _mm_storeu_si128((__m128i *)dst, y4);
    528         pY += 4;
    529         pUV += 4;
    530         dst = (__m128i *)dst + 1;
    531     }
    532 }
    533 
    534 void rsdIntrinsicYuv2_K(void *dst,
    535                        const unsigned char *pY, const unsigned char *pU,
    536                        const unsigned char *pV, uint32_t count, const short *param) {
    537     __m128i biasY, biasUV;
    538     __m128i c0, c1, c2, c3, c4;
    539 
    540     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    541     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    542 
    543     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    544     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    545     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    546     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    547     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    548 
    549     __m128i Y, U, V, R, G, B, A;
    550 
    551     A = _mm_set1_epi32(255);
    552     uint32_t i;
    553 
    554     for (i = 0; i < (count << 1); ++i) {
    555         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    556         U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
     557         V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
    558 
    559         Y = _mm_sub_epi32(Y, biasY);
    560         U = _mm_sub_epi32(U, biasUV);
     561         V = _mm_sub_epi32(V, biasUV);
    562 
    563         Y = mullo_epi32(Y, c0);
    564 
    565         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    566         R = _mm_add_epi32(R, biasUV);
    567         R = _mm_srai_epi32(R, 8);
    568 
    569         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    570         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    571         G = _mm_add_epi32(G, biasUV);
    572         G = _mm_srai_epi32(G, 8);
    573 
    574         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    575         B = _mm_add_epi32(B, biasUV);
    576         B = _mm_srai_epi32(B, 8);
    577 
    578         __m128i y1, y2, y3, y4;
    579 
    580         y1 = packus_epi32(R, G);
    581         y2 = packus_epi32(B, A);
    582         y3 = _mm_packus_epi16(y1, y2);
    583         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    584                                           14, 10, 6, 2,
    585                                           13,  9, 5, 1,
    586                                           12,  8, 4, 0);
    587         y4 = _mm_shuffle_epi8(y3, T4x4);
    588         _mm_storeu_si128((__m128i *)dst, y4);
    589         pY += 4;
    590         pU += 4;
     591         pV += 4;
    592         dst = (__m128i *)dst + 1;
    593     }
    594 }
    595 
    596 void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
    597                                const void *y2, const void *y3, const void *y4,
    598                                const short *coef, uint32_t count) {
    599     __m128i x;
    600     __m128i c0, c2, c4, c6, c8, c10, c12;
    601     __m128i c14, c16, c18, c20, c22, c24;
    602     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    603     __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
    604     __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
    605     __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    606     __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    607     __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    608     __m128i o0, o1, o2, o3;
    609     uint32_t i;
    610 
    611     x = _mm_loadl_epi64((const __m128i *)(coef+0));
    612     c0  = _mm_shuffle_epi32(x, 0x00);
    613     c2  = _mm_shuffle_epi32(x, 0x55);
    614 
    615     x = _mm_loadl_epi64((const __m128i *)(coef+4));
    616     c4  = _mm_shuffle_epi32(x, 0x00);
    617     c6  = _mm_shuffle_epi32(x, 0x55);
    618 
    619     x = _mm_loadl_epi64((const __m128i *)(coef+8));
    620     c8  = _mm_shuffle_epi32(x, 0x00);
    621     c10  = _mm_shuffle_epi32(x, 0x55);
    622 
    623     x = _mm_loadl_epi64((const __m128i *)(coef+12));
    624     c12  = _mm_shuffle_epi32(x, 0x00);
    625     c14  = _mm_shuffle_epi32(x, 0x55);
    626 
    627     x = _mm_loadl_epi64((const __m128i *)(coef+16));
    628     c16  = _mm_shuffle_epi32(x, 0x00);
    629     c18  = _mm_shuffle_epi32(x, 0x55);
    630 
    631     x = _mm_loadl_epi64((const __m128i *)(coef+20));
    632     c20  = _mm_shuffle_epi32(x, 0x00);
    633     c22  = _mm_shuffle_epi32(x, 0x55);
    634 
    635     x = _mm_loadl_epi64((const __m128i *)(coef+24));
    636     c24  = _mm_shuffle_epi32(x, 0x00);
    637 
    638     for (i = 0; i < count; ++i) {
    639 
    640         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
    641         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
    642         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
    643         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
    644         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
    645         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
    646         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
    647         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
    648 
    649         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
    650         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
    651         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
    652         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
    653         p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
    654         p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
    655         p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
    656         p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
    657 
    658         p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
    659         p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
    660         p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
    661         p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
    662         p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
    663         p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
    664         p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
    665         p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
    666 
    667         p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
    668         p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
    669         p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
    670         p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
    671         p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
    672         p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
    673         p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
    674         p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
    675 
    676         p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
    677         p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
    678         p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
    679         p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
    680         p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
    681         p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
    682         p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
    683         p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
    684 
    685         o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
    686         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
    687         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
    688         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
    689         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
    690         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
    691         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
    692         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
    693         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
    694         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
    695         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
    696         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
    697         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
    698         o0 = _mm_srai_epi32(o0, 8);
    699 
    700         o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
    701         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
    702         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
    703         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
    704         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
    705         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
    706         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
    707         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
    708         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
    709         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
    710         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
    711         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
    712         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
    713         o1 = _mm_srai_epi32(o1, 8);
    714 
    715         o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
    716         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
    717         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
    718         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
    719         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
    720         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
    721         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
    722         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
    723         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
    724         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
    725         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
    726         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
    727         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
    728         o2 = _mm_srai_epi32(o2, 8);
    729 
    730         o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
    731         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
    732         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
    733         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
    734         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
    735         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
    736         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
    737         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
    738         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
    739         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
    740         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
    741         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
    742         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
    743         o3 = _mm_srai_epi32(o3, 8);
    744 
    745         o0 = packus_epi32(o0, o1);
    746         o2 = packus_epi32(o2, o3);
    747         o0 = _mm_packus_epi16(o0, o2);
    748         _mm_storeu_si128((__m128i *)dst, o0);
    749 
    750         y0 = (const char *)y0 + 16;
    751         y1 = (const char *)y1 + 16;
    752         y2 = (const char *)y2 + 16;
    753         y3 = (const char *)y3 + 16;
    754         y4 = (const char *)y4 + 16;
    755         dst = (char *)dst + 16;
    756     }
    757 }
    758 
    759 void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    760     __m128i all1s, ina, ins;
    761     __m128i in0, in1, out0, out1;
    762     __m128i t0, t1, t2, t3;
    763     uint32_t i;
    764 
    765     all1s = _mm_set1_epi16(255);
    766 
    767     for (i = 0; i < count8; ++i) {
    768         in0 = _mm_loadu_si128((const __m128i *)src);
    769         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    770         out0 = _mm_loadu_si128((const __m128i *)dst);
    771         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    772 
    773         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    774         ina = _mm_shufflelo_epi16(ins, 0xFF);
    775         ina = _mm_shufflehi_epi16(ina, 0xFF);
    776         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    777         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
     778         t0 = _mm_srli_epi16(t0, 8);
    779         t0 = _mm_add_epi16(t0, ins);
    780 
    781         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    782         ina = _mm_shufflelo_epi16(ins, 0xFF);
    783         ina = _mm_shufflehi_epi16(ina, 0xFF);
    784         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    785         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
     786         t1 = _mm_srli_epi16(t1, 8);
    787         t1 = _mm_add_epi16(t1, ins);
    788 
    789         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    790         ina = _mm_shufflelo_epi16(ins, 0xFF);
    791         ina = _mm_shufflehi_epi16(ina, 0xFF);
    792         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    793         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
     794         t2 = _mm_srli_epi16(t2, 8);
    795         t2 = _mm_add_epi16(t2, ins);
    796 
    797         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    798         ina = _mm_shufflelo_epi16(ins, 0xFF);
    799         ina = _mm_shufflehi_epi16(ina, 0xFF);
    800         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    801         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
     802         t3 = _mm_srli_epi16(t3, 8);
    803         t3 = _mm_add_epi16(t3, ins);
    804 
    805         t0 = _mm_packus_epi16(t0, t1);
    806         t2 = _mm_packus_epi16(t2, t3);
    807         _mm_storeu_si128((__m128i *)dst, t0);
    808         _mm_storeu_si128((__m128i *)dst + 1, t2);
    809 
    810         src = (const __m128i *)src + 2;
    811         dst = (__m128i *)dst + 2;
    812     }
    813 }
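/* Scalar sketch of rsdIntrinsicBlendSrcOver_K above (a model of the vector
 * math, not code that is called anywhere): per channel, in 16-bit intermediates,
 *     out = src + ((dst * (255 - src.a)) >> 8)
 * which approximates Porter-Duff src-over with >>8 standing in for /255,
 * followed by unsigned saturation back to 8 bits. */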
    814 
    815 void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    816     __m128i all1s, outa, outs;
    817     __m128i in0, in1, out0, out1;
    818     __m128i t0, t1, t2, t3;
    819     uint32_t i;
    820 
    821     all1s = _mm_set1_epi16(255);
    822 
    823     for (i = 0; i < count8; ++i) {
    824         in0 = _mm_loadu_si128((const __m128i *)src);
    825         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    826         out0 = _mm_loadu_si128((const __m128i *)dst);
    827         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    828 
    829 
    830         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    831         outa = _mm_shufflelo_epi16(outs, 0xFF);
    832         outa = _mm_shufflehi_epi16(outa, 0xFF);
    833         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    834         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
     835         t0 = _mm_srli_epi16(t0, 8);
    836         t0 = _mm_add_epi16(t0, outs);
    837 
    838         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    839         outa = _mm_shufflelo_epi16(outs, 0xFF);
    840         outa = _mm_shufflehi_epi16(outa, 0xFF);
    841         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    842         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
     843         t1 = _mm_srli_epi16(t1, 8);
    844         t1 = _mm_add_epi16(t1, outs);
    845 
    846         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    847         outa = _mm_shufflelo_epi16(outs, 0xFF);
    848         outa = _mm_shufflehi_epi16(outa, 0xFF);
    849         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    850         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
     851         t2 = _mm_srli_epi16(t2, 8);
    852         t2 = _mm_add_epi16(t2, outs);
    853 
    854         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    855         outa = _mm_shufflelo_epi16(outs, 0xFF);
    856         outa = _mm_shufflehi_epi16(outa, 0xFF);
    857         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    858         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
     859         t3 = _mm_srli_epi16(t3, 8);
    860         t3 = _mm_add_epi16(t3, outs);
    861 
    862         t0 = _mm_packus_epi16(t0, t1);
    863         t2 = _mm_packus_epi16(t2, t3);
    864         _mm_storeu_si128((__m128i *)dst, t0);
    865         _mm_storeu_si128((__m128i *)dst + 1, t2);
    866 
    867         src = (const __m128i *)src + 2;
    868         dst = (__m128i *)dst + 2;
    869     }
    870 }
    871 
    872 void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    873     __m128i outa;
    874     __m128i in0, in1, out0, out1;
    875     __m128i t0, t1, t2, t3;
    876     uint32_t i;
    877 
    878     for (i = 0; i < count8; ++i) {
    879         in0 = _mm_loadu_si128((const __m128i *)src);
    880         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    881         out0 = _mm_loadu_si128((const __m128i *)dst);
    882         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    883 
    884         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    885         outa = _mm_shufflelo_epi16(outa, 0xFF);
    886         outa = _mm_shufflehi_epi16(outa, 0xFF);
    887         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    888         t0 = _mm_mullo_epi16(t0, outa);
     889         t0 = _mm_srli_epi16(t0, 8);
    890 
    891         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    892         outa = _mm_shufflelo_epi16(outa, 0xFF);
    893         outa = _mm_shufflehi_epi16(outa, 0xFF);
    894         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    895         t1 = _mm_mullo_epi16(t1, outa);
     896         t1 = _mm_srli_epi16(t1, 8);
    897 
    898         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    899         outa = _mm_shufflelo_epi16(outa, 0xFF);
    900         outa = _mm_shufflehi_epi16(outa, 0xFF);
    901         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    902         t2 = _mm_mullo_epi16(t2, outa);
     903         t2 = _mm_srli_epi16(t2, 8);
    904 
    905         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    906         outa = _mm_shufflelo_epi16(outa, 0xFF);
    907         outa = _mm_shufflehi_epi16(outa, 0xFF);
    908         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    909         t3 = _mm_mullo_epi16(t3, outa);
     910         t3 = _mm_srli_epi16(t3, 8);
    911 
    912         t0 = _mm_packus_epi16(t0, t1);
    913         t2 = _mm_packus_epi16(t2, t3);
    914         _mm_storeu_si128((__m128i *)dst, t0);
    915         _mm_storeu_si128((__m128i *)dst + 1, t2);
    916 
    917         src = (const __m128i *)src + 2;
    918         dst = (__m128i *)dst + 2;
    919     }
    920 }
    921 
    922 void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    923     __m128i ina;
    924     __m128i in0, in1, out0, out1;
    925     __m128i t0, t1, t2, t3;
    926     uint32_t i;
    927 
    928     for (i = 0; i < count8; ++i) {
    929         in0 = _mm_loadu_si128((const __m128i *)src);
    930         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    931         out0 = _mm_loadu_si128((const __m128i *)dst);
    932         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    933 
    934         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    935         ina = _mm_shufflelo_epi16(ina, 0xFF);
    936         ina = _mm_shufflehi_epi16(ina, 0xFF);
    937         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    938         t0 = _mm_mullo_epi16(t0, ina);
     939         t0 = _mm_srli_epi16(t0, 8);
    940 
    941         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    942         ina = _mm_shufflelo_epi16(ina, 0xFF);
    943         ina = _mm_shufflehi_epi16(ina, 0xFF);
    944         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    945         t1 = _mm_mullo_epi16(t1, ina);
     946         t1 = _mm_srli_epi16(t1, 8);
    947 
    948         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    949         ina = _mm_shufflelo_epi16(ina, 0xFF);
    950         ina = _mm_shufflehi_epi16(ina, 0xFF);
    951         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    952         t2 = _mm_mullo_epi16(t2, ina);
     953         t2 = _mm_srli_epi16(t2, 8);
    954 
    955         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    956         ina = _mm_shufflelo_epi16(ina, 0xFF);
    957         ina = _mm_shufflehi_epi16(ina, 0xFF);
    958         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    959         t3 = _mm_mullo_epi16(t3, ina);
     960         t3 = _mm_srli_epi16(t3, 8);
    961 
    962         t0 = _mm_packus_epi16(t0, t1);
    963         t2 = _mm_packus_epi16(t2, t3);
    964         _mm_storeu_si128((__m128i *)dst, t0);
    965         _mm_storeu_si128((__m128i *)dst + 1, t2);
    966 
    967         src = (const __m128i *)src + 2;
    968         dst = (__m128i *)dst + 2;
    969     }
    970 }
    971 
    972 void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    973     __m128i all1s, outa;
    974     __m128i in0, in1, out0, out1;
    975     __m128i t0, t1, t2, t3;
    976     uint32_t i;
    977 
    978     all1s = _mm_set1_epi16(255);
    979 
    980     for (i = 0; i < count8; ++i) {
    981         in0 = _mm_loadu_si128((const __m128i *)src);
    982         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    983         out0 = _mm_loadu_si128((const __m128i *)dst);
    984         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    985 
    986         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    987         outa = _mm_shufflelo_epi16(outa, 0xFF);
    988         outa = _mm_shufflehi_epi16(outa, 0xFF);
    989         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    990         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
     991         t0 = _mm_srli_epi16(t0, 8);
    992 
    993         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    994         outa = _mm_shufflelo_epi16(outa, 0xFF);
    995         outa = _mm_shufflehi_epi16(outa, 0xFF);
    996         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    997         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
     998         t1 = _mm_srli_epi16(t1, 8);
    999 
   1000         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1001         outa = _mm_shufflelo_epi16(outa, 0xFF);
   1002         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1003         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1004         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
    1005         t2 = _mm_srli_epi16(t2, 8);
   1006 
   1007         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1008         outa = _mm_shufflelo_epi16(outa, 0xFF);
   1009         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1010         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1011         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
    1012         t3 = _mm_srli_epi16(t3, 8);
   1013 
   1014         t0 = _mm_packus_epi16(t0, t1);
   1015         t2 = _mm_packus_epi16(t2, t3);
   1016         _mm_storeu_si128((__m128i *)dst, t0);
   1017         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1018 
   1019         src = (const __m128i *)src + 2;
   1020         dst = (__m128i *)dst + 2;
   1021     }
   1022 }
   1023 
   1024 void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
   1025     __m128i all1s, ina;
   1026     __m128i in0, in1, out0, out1;
   1027     __m128i t0, t1, t2, t3;
   1028     uint32_t i;
   1029 
   1030     all1s = _mm_set1_epi16(255);
   1031 
   1032     for (i = 0; i < count8; ++i) {
   1033         in0 = _mm_loadu_si128((const __m128i *)src);
   1034         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1035         out0 = _mm_loadu_si128((const __m128i *)dst);
   1036         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1037 
   1038         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1039         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1040         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1041         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1042         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
    1043         t0 = _mm_srli_epi16(t0, 8);
   1044 
   1045         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1046         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1047         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1048         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1049         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
    1050         t1 = _mm_srli_epi16(t1, 8);
   1051 
   1052         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1053         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1054         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1055         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1056         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
    1057         t2 = _mm_srli_epi16(t2, 8);
   1058 
   1059         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1060         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1061         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1062         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1063         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
    1064         t3 = _mm_srli_epi16(t3, 8);
   1065 
   1066         t0 = _mm_packus_epi16(t0, t1);
   1067         t2 = _mm_packus_epi16(t2, t3);
   1068         _mm_storeu_si128((__m128i *)dst, t0);
   1069         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1070 
   1071         src = (const __m128i *)src + 2;
   1072         dst = (__m128i *)dst + 2;
   1073     }
   1074 }
   1075 
   1076 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
   1077     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
   1078     __m128i all1s, ina, outa, ins, outs;
   1079     __m128i in0, in1, out0, out1;
   1080     __m128i t0, t1, t2, t3;
   1081     uint32_t i;
   1082 
   1083     all1s = _mm_set1_epi16(255);
   1084 
   1085     for (i = 0; i < count8; ++i) {
   1086         in0 = _mm_loadu_si128((const __m128i *)src);
   1087         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1088         out0 = _mm_loadu_si128((const __m128i *)dst);
   1089         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1090 
   1091         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1092         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1093         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1094         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1095         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1096         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1097         t0 = _mm_sub_epi16(all1s, ina);
   1098         t0 = _mm_mullo_epi16(t0, outs);
   1099         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
   1100         t0 = _mm_srli_epi16(t0, 8);
   1101 
   1102         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1103         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1104         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1105         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1106         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1107         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1108         t1 = _mm_sub_epi16(all1s, ina);
   1109         t1 = _mm_mullo_epi16(t1, outs);
   1110         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
   1111         t1 = _mm_srli_epi16(t1, 8);
   1112 
   1113         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1114         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1115         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1116         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1117         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1118         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1119         t2 = _mm_sub_epi16(all1s, ina);
   1120         t2 = _mm_mullo_epi16(t2, outs);
   1121         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
   1122         t2 = _mm_srli_epi16(t2, 8);
   1123 
   1124         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1125         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1126         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1127         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1128         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1129         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1130         t3 = _mm_sub_epi16(all1s, ina);
   1131         t3 = _mm_mullo_epi16(t3, outs);
   1132         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
   1133         t3 = _mm_srli_epi16(t3, 8);
   1134 
   1135         t0 = _mm_packus_epi16(t0, t1);
   1136         t0 = blendv_epi8(t0, out0, M0001);
   1137         t2 = _mm_packus_epi16(t2, t3);
   1138         t2 = blendv_epi8(t2, out1, M0001);
   1139         _mm_storeu_si128((__m128i *)dst, t0);
   1140         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1141 
   1142         src = (const __m128i *)src + 2;
   1143         dst = (__m128i *)dst + 2;
   1144     }
   1145 }
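/* In rsdIntrinsicBlendSrcAtop_K above, blendv_epi8 with M0001 keeps the
 * destination's fourth (alpha) byte in each pixel, matching the src-atop rule
 * that the result alpha equals the destination alpha. */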
   1146 
   1147 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
   1148     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
   1149     __m128i all1s, ina, ins, outa, outs;
   1150     __m128i in0, in1, out0, out1;
   1151     __m128i t0, t1, t2, t3;
   1152     uint32_t i;
   1153 
   1154     all1s = _mm_set1_epi16(255);
   1155 
   1156     for (i = 0; i < count8; ++i) {
   1157         in0 = _mm_loadu_si128((const __m128i *)src);
   1158         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1159         out0 = _mm_loadu_si128((const __m128i *)dst);
   1160         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1161 
   1162         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1163         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1164         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1165         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1166         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1167         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1168         t0 = _mm_sub_epi16(all1s, outa);
   1169         t0 = _mm_mullo_epi16(t0, ins);
   1170         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
   1171         t0 = _mm_srli_epi16(t0, 8);
   1172 
   1173         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1174         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1175         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1176         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1177         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1178         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1179         t1 = _mm_sub_epi16(all1s, outa);
   1180         t1 = _mm_mullo_epi16(t1, ins);
   1181         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
   1182         t1 = _mm_srli_epi16(t1, 8);
   1183 
   1184         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1185         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1186         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1187         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1188         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1189         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1190         t2 = _mm_sub_epi16(all1s, outa);
   1191         t2 = _mm_mullo_epi16(t2, ins);
   1192         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
   1193         t2 = _mm_srli_epi16(t2, 8);
   1194 
   1195         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1196         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1197         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1198         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1199         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1200         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1201         t3 = _mm_sub_epi16(all1s, outa);
   1202         t3 = _mm_mullo_epi16(t3, ins);
   1203         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
   1204         t3 = _mm_srli_epi16(t3, 8);
   1205 
   1206         t0 = _mm_packus_epi16(t0, t1);
   1207         t0 = blendv_epi8(t0, out0, M0001);
   1208         t2 = _mm_packus_epi16(t2, t3);
   1209         t2 = blendv_epi8(t2, out1, M0001);
   1210         _mm_storeu_si128((__m128i *)dst, t0);
   1211         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1212 
   1213         src = (const __m128i *)src + 2;
   1214         dst = (__m128i *)dst + 2;
   1215     }
   1216 }
   1217 
   1218 void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
   1219     __m128i in0, in1, out0, out1;
   1220     uint32_t i;
   1221 
   1222     for (i = 0; i < count8; ++i) {
   1223         in0 = _mm_loadu_si128((const __m128i *)src);
   1224         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1225         out0 = _mm_loadu_si128((const __m128i *)dst);
   1226         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1227 
   1228         out0 = _mm_xor_si128(out0, in0);
   1229         out1 = _mm_xor_si128(out1, in1);
   1230 
   1231         _mm_storeu_si128((__m128i *)dst, out0);
   1232         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1233 
   1234         src = (const __m128i *)src + 2;
   1235         dst = (__m128i *)dst + 2;
   1236     }
   1237 }
   1238 
   1239 void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
   1240     __m128i in0, in1, out0, out1;
   1241     __m128i t0, t1, t2, t3;
   1242     uint32_t i;
   1243 
   1244     for (i = 0; i < count8; ++i) {
   1245         in0 = _mm_loadu_si128((const __m128i *)src);
   1246         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1247         out0 = _mm_loadu_si128((const __m128i *)dst);
   1248         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1249 
   1250         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1251         t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
   1252         t0 = _mm_srli_epi16(t0, 8);
   1253 
   1254         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1255         t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
   1256         t1 = _mm_srli_epi16(t1, 8);
   1257 
   1258         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1259         t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
   1260         t2 = _mm_srli_epi16(t2, 8);
   1261 
   1262         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1263         t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
   1264         t3 = _mm_srli_epi16(t3, 8);
   1265 
   1266         t0 = _mm_packus_epi16(t0, t1);
   1267         t2 = _mm_packus_epi16(t2, t3);
   1268         _mm_storeu_si128((__m128i *)dst, t0);
   1269         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1270 
   1271         src = (const __m128i *)src + 2;
   1272         dst = (__m128i *)dst + 2;
   1273     }
   1274 }
   1275 
   1276 void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
   1277     __m128i in0, in1, out0, out1;
   1278     uint32_t i;
   1279 
   1280     for (i = 0; i < count8; ++i) {
   1281         in0 = _mm_loadu_si128((const __m128i *)src);
   1282         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1283         out0 = _mm_loadu_si128((const __m128i *)dst);
   1284         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1285 
   1286         out0 = _mm_adds_epu8(out0, in0);
   1287         out1 = _mm_adds_epu8(out1, in1);
   1288 
   1289         _mm_storeu_si128((__m128i *)dst, out0);
   1290         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1291 
   1292         src = (const __m128i *)src + 2;
   1293         dst = (__m128i *)dst + 2;
   1294     }
   1295 }
   1296 
   1297 void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
   1298     __m128i in0, in1, out0, out1;
   1299     uint32_t i;
   1300 
   1301     for (i = 0; i < count8; ++i) {
   1302         in0 = _mm_loadu_si128((const __m128i *)src);
   1303         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1304         out0 = _mm_loadu_si128((const __m128i *)dst);
   1305         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1306 
   1307         out0 = _mm_subs_epu8(out0, in0);
   1308         out1 = _mm_subs_epu8(out1, in1);
   1309 
   1310         _mm_storeu_si128((__m128i *)dst, out0);
   1311         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1312 
   1313         src = (const __m128i *)src + 2;
   1314         dst = (__m128i *)dst + 2;
   1315     }
   1316 }
   1317