      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <stdint.h>
     18 #include <x86intrin.h>
     19 
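         /* The helpers below provide SSSE3 fallbacks for a few SSE4.1 intrinsics so these
            kernels can also be built when only SSSE3 is available. */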
      20 /* Zero-extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
     21 static inline __m128i cvtepu8_epi32(__m128i x) {
     22 #if defined(__SSE4_1__)
     23     return _mm_cvtepu8_epi32(x);
     24 #elif defined(__SSSE3__)
     25     const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
     26     x = _mm_shuffle_epi8(x, M8to32);
     27     return x;
     28 #else
     29 #   error "Require at least SSSE3"
     30 #endif
     31 }
     32 
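         /* Pack signed 32-bit lanes into unsigned 16-bit lanes with saturation: the SSSE3
            path clamps each lane to [0, 0xffff] via compare masks (e.g. -5 -> 0,
            0x12345 -> 0xffff) and then gathers the low halves of 'lo' and 'hi'. */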
     33 static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
     34 #if defined(__SSE4_1__)
     35     return _mm_packus_epi32(lo, hi);
     36 #elif defined(__SSSE3__)
     37     const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
     38     const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
     39     const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
     40     const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
     41     lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
     42     lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
     43     hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
     44     hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
     45     return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
     46                         _mm_shuffle_epi8(hi, M32to16H));
     47 #else
     48 #   error "Require at least SSSE3"
     49 #endif
     50 }
     51 
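         /* Low 32-bit multiply: the SSSE3 path multiplies the even and odd lanes separately
            with _mm_mul_epu32 and recombines the low 32 bits of each product. */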
     52 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
     53 #if defined(__SSE4_1__)
     54     return _mm_mullo_epi32(x, y);
     55 #elif defined(__SSSE3__)
     56     const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
     57     __m128i even = _mm_mul_epu32(x, y);
     58     __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
     59                                 _mm_srli_si128(y, 4));
     60     even = _mm_and_si128(even, Meven);
     61     odd = _mm_and_si128(odd, Meven);
     62     return _mm_or_si128(even, _mm_slli_si128(odd, 4));
     63 #else
     64 #   error "Require at least SSSE3"
     65 #endif
     66 }
     67 
      68 /* 'mask' must be packed 8-bit values of either 0x00 or 0xff */
     69 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
     70 #if defined(__SSE4_1__)
     71     return _mm_blendv_epi8(x, y, mask);
     72 #elif defined(__SSSE3__)
     73     return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
     74 #else
     75 #   error "Require at least SSSE3"
     76 #endif
     77 }
     78 
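         /* 3x3 convolution over RGBA8888 pixels. 'coef' holds nine 16-bit fixed-point
            coefficients read as pairs; each iteration reads four pixels from each of the
            three input rows, accumulates with _mm_madd_epi16, shifts right by 8 and writes
            two output pixels, so 'count' is the number of 2-pixel groups. */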
     79 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
     80                                           const void *y1, const void *y2,
     81                                           const short *coef, uint32_t count) {
     82     __m128i x;
     83     __m128i c0, c2, c4, c6, c8;
     85     __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
     86     __m128i o0, o1;
     87     uint32_t i;
     88 
     89     x = _mm_loadl_epi64((const __m128i *)(coef+0));
     90     c0 = _mm_shuffle_epi32(x, 0x00);
     91     c2 = _mm_shuffle_epi32(x, 0x55);
     92     x = _mm_loadl_epi64((const __m128i *)(coef+4));
     93     c4 = _mm_shuffle_epi32(x, 0x00);
     94     c6 = _mm_shuffle_epi32(x, 0x55);
     95     x = _mm_loadl_epi64((const __m128i *)(coef+8));
     96     c8 = _mm_shuffle_epi32(x, 0x00);
     97 
     98     for (i = 0; i < count; ++i) {
     99 
    100         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
    101         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
    102         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
    103         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
    104         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
    105         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
    106         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
    107         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
    108         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
    109         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
    110         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
    111         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
    112 
    113         o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
    114         o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
    115 
    116         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
    117         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
    118 
    119         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
    120         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
    121 
    122         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
    123         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
    124 
    125         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
    126         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
    127 
    128         o0 = _mm_srai_epi32(o0, 8);
    129         o1 = _mm_srai_epi32(o1, 8);
    130 
    131         o0 = packus_epi32(o0, o1);
    132         o0 = _mm_packus_epi16(o0, o0);
    133         _mm_storel_epi64((__m128i *)dst, o0);
    134 
    135         y0 = (const char *)y0 + 8;
    136         y1 = (const char *)y1 + 8;
    137         y2 = (const char *)y2 + 8;
    138         dst = (char *)dst + 8;
    139     }
    140 }
    141 
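         /* Multiply each RGBA pixel by a 4x4 matrix of 16-bit fixed-point coefficients
            (results shifted right by 8). Mxy/Mzw de-interleave the x/y and z/w channels
            into 16-bit lanes for _mm_madd_epi16, and T4x4 transposes the packed result
            back into interleaved RGBA. Four pixels are processed per iteration. */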
    142 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
    143                                   const short *coef, uint32_t count) {
    144     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    145                                       14, 10, 6, 2,
    146                                       13,  9, 5, 1,
    147                                       12,  8, 4, 0);
    148 
    149     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    150     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    151     __m128i c0, c1, c2, c3;
    152     __m128i i4, o4;
    153     __m128i xy, zw;
    154     __m128i x2, y2, z2, w2;
    155     uint32_t i;
    156 
    157     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    158     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    159     c0 = _mm_unpacklo_epi16(c0, c1);
    160 
    161     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    162     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    163     c2 = _mm_unpacklo_epi16(c2, c3);
    164 
    165     for (i = 0; i < count; ++i) {
     166         i4 = _mm_loadu_si128((const __m128i *)src); /* unaligned load: 'src' is not guaranteed to be 16-byte aligned */
    167         xy = _mm_shuffle_epi8(i4, Mxy);
    168         zw = _mm_shuffle_epi8(i4, Mzw);
    169 
    170         x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
    171         y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
    172         z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
    173         w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
    174 
    175         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
    176         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
    177         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
    178         w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
    179 
    180         x2 = _mm_srai_epi32(x2, 8);
    181         y2 = _mm_srai_epi32(y2, 8);
    182         z2 = _mm_srai_epi32(z2, 8);
    183         w2 = _mm_srai_epi32(w2, 8);
    184 
    185         x2 = packus_epi32(x2, y2);
    186         z2 = packus_epi32(z2, w2);
    187         o4 = _mm_packus_epi16(x2, z2);
    188 
    189         o4 = _mm_shuffle_epi8(o4, T4x4);
    190         _mm_storeu_si128((__m128i *)dst, o4);
    191 
    192         src = (const char *)src + 16;
    193         dst = (char *)dst + 16;
    194     }
    195 }
    196 
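         /* Same as the 4x4 variant, but only three matrix rows are applied; the output
            alpha is taken from the source pixel (recovered by w2 = _mm_srli_epi32(zw, 16)). */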
    197 void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
    198                                   const short *coef, uint32_t count) {
    199     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    200                                       14, 10, 6, 2,
    201                                       13,  9, 5, 1,
    202                                       12,  8, 4, 0);
    203 
    204     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    205     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    206 
    207     __m128i c0, c1, c2, c3;
    208     __m128i i4, o4;
    209     __m128i xy, zw;
    210     __m128i x2, y2, z2, w2;
    211     uint32_t i;
    212 
    213     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    214     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    215     c0 = _mm_unpacklo_epi16(c0, c1);
    216 
    217     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    218     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    219     c2 = _mm_unpacklo_epi16(c2, c3);
    220 
    221     for (i = 0; i < count; ++i) {
    222         i4 = _mm_loadu_si128((const __m128i *)src);
    223         xy = _mm_shuffle_epi8(i4, Mxy);
    224         zw = _mm_shuffle_epi8(i4, Mzw);
    225 
    226         x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
    227         y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
    228         z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
    229 
    230         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
    231         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
    232         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
    233 
    234         x2 = _mm_srai_epi32(x2, 8);
    235         y2 = _mm_srai_epi32(y2, 8);
    236         z2 = _mm_srai_epi32(z2, 8);
    237         w2 = _mm_srli_epi32(zw, 16);
    238 
    239         x2 = packus_epi32(x2, y2);
    240         z2 = packus_epi32(z2, w2);
    241         o4 = _mm_packus_epi16(x2, z2);
    242 
    243         o4 = _mm_shuffle_epi8(o4, T4x4);
    244         _mm_storeu_si128((__m128i *)dst, o4);
    245 
    246         src = (const char *)src + 16;
    247         dst = (char *)dst + 16;
    248     }
    249 }
    250 
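         /* Dot-product variant: each pixel is reduced to a single value using coef[0],
            coef[4], coef[8] and coef[12] (broadcast via _mm_shufflelo_epi16), which is
            written to R, G and B; alpha is passed through from the source. */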
    251 void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
    252                                   const short *coef, uint32_t count) {
    253     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    254                                       14, 10, 6, 2,
    255                                       13,  9, 5, 1,
    256                                       12,  8, 4, 0);
    257     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    258     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    259     __m128i c0, c1, c2, c3;
    260     __m128i i4, o4;
    261     __m128i xy, zw;
    262     __m128i x2, y2, z2, w2;
    263     uint32_t i;
    264 
    265     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    266     c0 = _mm_shufflelo_epi16(c0, 0);
    267     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    268     c1 = _mm_shufflelo_epi16(c1, 0);
    269     c0 = _mm_unpacklo_epi16(c0, c1);
    270 
    271     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    272     c2 = _mm_shufflelo_epi16(c2, 0);
    273     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    274     c3 = _mm_shufflelo_epi16(c3, 0);
    275     c2 = _mm_unpacklo_epi16(c2, c3);
    276 
    277     for (i = 0; i < count; ++i) {
    278         i4 = _mm_loadu_si128((const __m128i *)src);
    279 
    280         xy = _mm_shuffle_epi8(i4, Mxy);
    281         zw = _mm_shuffle_epi8(i4, Mzw);
    282 
    283         x2 =  _mm_madd_epi16(xy, c0);
    284         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
    285 
    286         x2 = _mm_srai_epi32(x2, 8);
    287         y2 = x2;
    288         z2 = x2;
    289         w2 = _mm_srli_epi32(zw, 16);
    290 
    291         x2 = packus_epi32(x2, y2);
    292         z2 = packus_epi32(z2, w2);
    293         o4 = _mm_packus_epi16(x2, z2);
    294 
    295         o4 = _mm_shuffle_epi8(o4, T4x4);
    296         _mm_storeu_si128((__m128i *)dst, o4);
    297 
    298         src = (const char *)src + 16;
    299         dst = (char *)dst + 16;
    300     }
    301 }
    302 
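         /* Vertical blur pass: for each pair of pixels in [x1, x2), accumulate 'rct' rows
            weighted by the float coefficients in 'gptr' and write the two unclamped float4
            sums to 'dst' (32 bytes per pair). 'stride' is the row pitch in bytes. */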
    303 void rsdIntrinsicBlurVFU4_K(void *dst,
    304                           const void *pin, int stride, const void *gptr,
    305                           int rct, int x1, int x2) {
    306     const char *pi;
    307     __m128i pi0, pi1;
    308     __m128 pf0, pf1;
    309     __m128 bp0, bp1;
    310     __m128 x;
    311     int r;
    312 
    313     for (; x1 < x2; x1 += 2) {
    314         pi = (const char *)pin + (x1 << 2);
    315         bp0 = _mm_setzero_ps();
    316         bp1 = _mm_setzero_ps();
    317 
    318         for (r = 0; r < rct; ++r) {
    319             x = _mm_load_ss((const float *)gptr + r);
    320             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    321 
    322             pi0 = _mm_cvtsi32_si128(*(const int *)pi);
    323             pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
    324 
    325             pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
    326             pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
    327 
    328             bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
    329             bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
    330 
    331             pi += stride;
    332         }
    333 
    334         _mm_storeu_ps((float *)dst, bp0);
    335         _mm_storeu_ps((float *)dst + 4, bp1);
    336         dst = (char *)dst + 32;
    337     }
    338 }
    339 
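         /* Horizontal blur pass over the float4 intermediate produced by the vertical pass:
            accumulate 'rct' taps per output pixel, convert to integers and pack the four
            channel bytes with the Mu8 shuffle, one RGBA8888 pixel per iteration. */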
    340 void rsdIntrinsicBlurHFU4_K(void *dst,
    341                           const void *pin, const void *gptr,
    342                           int rct, int x1, int x2) {
    343     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    344     const float *pi;
    345     __m128 pf, x, y;
    346     __m128i o;
    347     int r;
    348 
    349     for (; x1 < x2; ++x1) {
     350         /* rct is defined as 2*r+1 by the caller */
    351         x = _mm_load_ss((const float *)gptr);
    352         x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    353 
    354         pi = (const float *)pin + (x1 << 2);
    355         pf = _mm_mul_ps(x, _mm_load_ps(pi));
    356 
    357         for (r = 1; r < rct; r += 2) {
    358             x = _mm_load_ss((const float *)gptr + r);
    359             y = _mm_load_ss((const float *)gptr + r + 1);
    360             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
    361             y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
    362 
    363             pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
    364             pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
    365         }
    366 
    367         o = _mm_cvtps_epi32(pf);
    368         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
    369         dst = (char *)dst + 4;
    370     }
    371 }
    372 
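         /* Single-channel horizontal blur: processes four adjacent float samples per
            iteration, using _mm_alignr_epi8 to build the shifted windows for taps
            r+1, r+2 and r+3. */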
    373 void rsdIntrinsicBlurHFU1_K(void *dst,
    374                           const void *pin, const void *gptr,
    375                           int rct, int x1, int x2) {
    376     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    377     const float *pi;
    378     __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    379     __m128i o;
    380     int r;
    381 
    382     for (; x1 < x2; x1+=4) {
    383         g0 = _mm_load_ss((const float *)gptr);
    384         g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
    385 
    386         pi = (const float *)pin + x1;
    387         pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
    388 
    389         for (r = 1; r < rct; r += 4) {
    390             gx = _mm_loadu_ps((const float *)gptr + r);
    391             p0 = _mm_loadu_ps(pi + r);
    392             p1 = _mm_loadu_ps(pi + r + 4);
    393 
    394             g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
    395             pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
    396             g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
                         /* _mm_alignr_epi8 operates on integer vectors, so cast to __m128i and back for the byte shifts */
     397             pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
    398             g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
     399             pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
    400             g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
     401             pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
    402         }
    403 
    404         o = _mm_cvtps_epi32(pf);
    405         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
    406         dst = (char *)dst + 4;
    407     }
    408 }
    409 
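         /* YUV to RGBA conversion with interleaved chroma. 'param' supplies the fixed-point
            coefficients noted below; each iteration converts four pixels from four Y samples
            and two chroma byte pairs (each pair is shared by two adjacent pixels), with the
            >>8 and saturation applied per channel. */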
    410 void rsdIntrinsicYuv_K(void *dst,
    411                        const unsigned char *pY, const unsigned char *pUV,
    412                        uint32_t count, const short *param) {
    413     __m128i biasY, biasUV;
    414     __m128i c0, c1, c2, c3, c4;
    415 
    416     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    417     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    418 
    419     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    420     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    421     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    422     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    423     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    424 
    425     __m128i Y, UV, U, V, R, G, B, A;
    426 
    427     A = _mm_set1_epi32(255);
    428     uint32_t i;
    429 
    430     for (i = 0; i < (count << 1); ++i) {
    431         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    432         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
    433 
    434         Y = _mm_sub_epi32(Y, biasY);
    435         UV = _mm_sub_epi32(UV, biasUV);
    436 
    437         U = _mm_shuffle_epi32(UV, 0xf5);
    438         V = _mm_shuffle_epi32(UV, 0xa0);
    439 
    440         Y = mullo_epi32(Y, c0);
    441 
    442         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    443         R = _mm_add_epi32(R, biasUV);
    444         R = _mm_srai_epi32(R, 8);
    445 
    446         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    447         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    448         G = _mm_add_epi32(G, biasUV);
    449         G = _mm_srai_epi32(G, 8);
    450 
    451         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    452         B = _mm_add_epi32(B, biasUV);
    453         B = _mm_srai_epi32(B, 8);
    454 
    455         __m128i y1, y2, y3, y4;
    456 
    457         y1 = packus_epi32(R, G);
    458         y2 = packus_epi32(B, A);
    459         y3 = _mm_packus_epi16(y1, y2);
    460         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    461                                           14, 10, 6, 2,
    462                                           13,  9, 5, 1,
    463                                           12,  8, 4, 0);
    464         y4 = _mm_shuffle_epi8(y3, T4x4);
    465         _mm_storeu_si128((__m128i *)dst, y4);
    466         pY += 4;
    467         pUV += 4;
    468         dst = (__m128i *)dst + 1;
    469     }
    470 }
    471 
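         /* Same as rsdIntrinsicYuv_K, but with the U and V lane selection swapped to handle
            the opposite chroma interleaving order. */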
    472 void rsdIntrinsicYuvR_K(void *dst,
    473                        const unsigned char *pY, const unsigned char *pUV,
    474                        uint32_t count, const short *param) {
    475     __m128i biasY, biasUV;
    476     __m128i c0, c1, c2, c3, c4;
    477 
    478     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    479     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    480 
    481     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    482     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    483     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    484     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    485     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    486 
    487     __m128i Y, UV, U, V, R, G, B, A;
    488 
    489     A = _mm_set1_epi32(255);
    490     uint32_t i;
    491 
    492     for (i = 0; i < (count << 1); ++i) {
    493         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    494         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
    495 
    496         Y = _mm_sub_epi32(Y, biasY);
    497         UV = _mm_sub_epi32(UV, biasUV);
    498 
    499         V = _mm_shuffle_epi32(UV, 0xf5);
    500         U = _mm_shuffle_epi32(UV, 0xa0);
    501 
    502         Y = mullo_epi32(Y, c0);
    503 
    504         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    505         R = _mm_add_epi32(R, biasUV);
    506         R = _mm_srai_epi32(R, 8);
    507 
    508         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    509         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    510         G = _mm_add_epi32(G, biasUV);
    511         G = _mm_srai_epi32(G, 8);
    512 
    513         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    514         B = _mm_add_epi32(B, biasUV);
    515         B = _mm_srai_epi32(B, 8);
    516 
    517         __m128i y1, y2, y3, y4;
    518 
    519         y1 = packus_epi32(R, G);
    520         y2 = packus_epi32(B, A);
    521         y3 = _mm_packus_epi16(y1, y2);
    522         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    523                                           14, 10, 6, 2,
    524                                           13,  9, 5, 1,
    525                                           12,  8, 4, 0);
    526         y4 = _mm_shuffle_epi8(y3, T4x4);
    527         _mm_storeu_si128((__m128i *)dst, y4);
    528         pY += 4;
    529         pUV += 4;
    530         dst = (__m128i *)dst + 1;
    531     }
    532 }
    533 
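         /* Planar-chroma variant: Y, U and V samples are read from three separate planes. */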
    534 void rsdIntrinsicYuv2_K(void *dst,
    535                        const unsigned char *pY, const unsigned char *pU,
    536                        const unsigned char *pV, uint32_t count, const short *param) {
    537     __m128i biasY, biasUV;
    538     __m128i c0, c1, c2, c3, c4;
    539 
    540     biasY = _mm_set1_epi32(param[8]);   /*  16 */
    541     biasUV = _mm_set1_epi32(param[16]); /* 128 */
    542 
    543     c0 = _mm_set1_epi32(param[0]);  /*  298 */
    544     c1 = _mm_set1_epi32(param[1]);  /*  409 */
    545     c2 = _mm_set1_epi32(param[2]);  /* -100 */
    546     c3 = _mm_set1_epi32(param[3]);  /*  516 */
    547     c4 = _mm_set1_epi32(param[4]);  /* -208 */
    548 
    549     __m128i Y, U, V, R, G, B, A;
    550 
    551     A = _mm_set1_epi32(255);
    552     uint32_t i;
    553 
    554     for (i = 0; i < (count << 1); ++i) {
    555         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
    556         U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
     557         V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
    558 
    559         Y = _mm_sub_epi32(Y, biasY);
    560         U = _mm_sub_epi32(U, biasUV);
     561         V = _mm_sub_epi32(V, biasUV);
    562 
    563         Y = mullo_epi32(Y, c0);
    564 
    565         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
    566         R = _mm_add_epi32(R, biasUV);
    567         R = _mm_srai_epi32(R, 8);
    568 
    569         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
    570         G = _mm_add_epi32(G, mullo_epi32(V, c4));
    571         G = _mm_add_epi32(G, biasUV);
    572         G = _mm_srai_epi32(G, 8);
    573 
    574         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
    575         B = _mm_add_epi32(B, biasUV);
    576         B = _mm_srai_epi32(B, 8);
    577 
    578         __m128i y1, y2, y3, y4;
    579 
    580         y1 = packus_epi32(R, G);
    581         y2 = packus_epi32(B, A);
    582         y3 = _mm_packus_epi16(y1, y2);
    583         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
    584                                           14, 10, 6, 2,
    585                                           13,  9, 5, 1,
    586                                           12,  8, 4, 0);
    587         y4 = _mm_shuffle_epi8(y3, T4x4);
    588         _mm_storeu_si128((__m128i *)dst, y4);
    589         pY += 4;
    590         pU += 4;
     591         pV += 4;
    592         dst = (__m128i *)dst + 1;
    593     }
    594 }
    595 
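         /* 5x5 convolution over RGBA8888 pixels. The 25 coefficients are read as 16-bit
            pairs; each iteration reads eight pixels from each of the five input rows and
            produces four output pixels (16 bytes), so 'count' is the number of 4-pixel
            groups. */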
    596 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
    597                                           const void *y1, const void *y2,
    598                                           const void *y3, const void *y4,
    599                                           const short *coef, uint32_t count) {
    600     __m128i x;
    601     __m128i c0, c2, c4, c6, c8, c10, c12;
    602     __m128i c14, c16, c18, c20, c22, c24;
    604     __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
    605     __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
    606     __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    607     __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    608     __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    609     __m128i o0, o1, o2, o3;
    610     uint32_t i;
    611 
    612     x = _mm_loadl_epi64((const __m128i *)(coef+0));
    613     c0  = _mm_shuffle_epi32(x, 0x00);
    614     c2  = _mm_shuffle_epi32(x, 0x55);
    615 
    616     x = _mm_loadl_epi64((const __m128i *)(coef+4));
    617     c4  = _mm_shuffle_epi32(x, 0x00);
    618     c6  = _mm_shuffle_epi32(x, 0x55);
    619 
    620     x = _mm_loadl_epi64((const __m128i *)(coef+8));
    621     c8  = _mm_shuffle_epi32(x, 0x00);
    622     c10  = _mm_shuffle_epi32(x, 0x55);
    623 
    624     x = _mm_loadl_epi64((const __m128i *)(coef+12));
    625     c12  = _mm_shuffle_epi32(x, 0x00);
    626     c14  = _mm_shuffle_epi32(x, 0x55);
    627 
    628     x = _mm_loadl_epi64((const __m128i *)(coef+16));
    629     c16  = _mm_shuffle_epi32(x, 0x00);
    630     c18  = _mm_shuffle_epi32(x, 0x55);
    631 
    632     x = _mm_loadl_epi64((const __m128i *)(coef+20));
    633     c20  = _mm_shuffle_epi32(x, 0x00);
    634     c22  = _mm_shuffle_epi32(x, 0x55);
    635 
    636     x = _mm_loadl_epi64((const __m128i *)(coef+24));
    637     c24  = _mm_shuffle_epi32(x, 0x00);
    638 
    639     for (i = 0; i < count; ++i) {
    640 
    641         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
    642         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
    643         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
    644         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
    645         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
    646         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
    647         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
    648         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
    649 
    650         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
    651         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
    652         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
    653         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
    654         p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
    655         p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
    656         p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
    657         p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
    658 
    659         p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
    660         p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
    661         p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
    662         p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
    663         p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
    664         p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
    665         p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
    666         p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
    667 
    668         p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
    669         p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
    670         p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
    671         p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
    672         p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
    673         p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
    674         p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
    675         p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
    676 
    677         p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
    678         p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
    679         p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
    680         p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
    681         p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
    682         p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
    683         p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
    684         p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
    685 
    686         o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
    687         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
    688         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
    689         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
    690         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
    691         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
    692         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
    693         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
    694         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
    695         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
    696         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
    697         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
    698         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
    699         o0 = _mm_srai_epi32(o0, 8);
    700 
    701         o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
    702         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
    703         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
    704         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
    705         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
    706         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
    707         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
    708         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
    709         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
    710         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
    711         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
    712         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
    713         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
    714         o1 = _mm_srai_epi32(o1, 8);
    715 
    716         o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
    717         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
    718         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
    719         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
    720         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
    721         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
    722         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
    723         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
    724         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
    725         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
    726         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
    727         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
    728         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
    729         o2 = _mm_srai_epi32(o2, 8);
    730 
    731         o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
    732         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
    733         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
    734         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
    735         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
    736         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
    737         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
    738         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
    739         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
    740         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
    741         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
    742         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
    743         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
    744         o3 = _mm_srai_epi32(o3, 8);
    745 
    746         o0 = packus_epi32(o0, o1);
    747         o2 = packus_epi32(o2, o3);
    748         o0 = _mm_packus_epi16(o0, o2);
    749         _mm_storeu_si128((__m128i *)dst, o0);
    750 
    751         y0 = (const char *)y0 + 16;
    752         y1 = (const char *)y1 + 16;
    753         y2 = (const char *)y2 + 16;
    754         y3 = (const char *)y3 + 16;
    755         y4 = (const char *)y4 + 16;
    756         dst = (char *)dst + 16;
    757     }
    758 }
    759 
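         /* The Porter-Duff blends below work on 16-bit unpacked lanes; the >>8 after each
            multiply approximates division by 255, and 'count8' is the number of 8-pixel
            (two 16-byte vector) groups.
            "src over": result = src + (dst * (255 - src.a) >> 8). */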
    760 void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    761     __m128i all1s, ina, ins;
    762     __m128i in0, in1, out0, out1;
    763     __m128i t0, t1, t2, t3;
    764     uint32_t i;
    765 
    766     all1s = _mm_set1_epi16(255);
    767 
    768     for (i = 0; i < count8; ++i) {
    769         in0 = _mm_loadu_si128((const __m128i *)src);
    770         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    771         out0 = _mm_loadu_si128((const __m128i *)dst);
    772         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    773 
    774         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    775         ina = _mm_shufflelo_epi16(ins, 0xFF);
    776         ina = _mm_shufflehi_epi16(ina, 0xFF);
    777         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    778         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
    779         t0 = _mm_srli_epi16(t0, 8);
    780         t0 = _mm_add_epi16(t0, ins);
    781 
    782         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    783         ina = _mm_shufflelo_epi16(ins, 0xFF);
    784         ina = _mm_shufflehi_epi16(ina, 0xFF);
    785         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    786         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
    787         t1 = _mm_srli_epi16(t1, 8);
    788         t1 = _mm_add_epi16(t1, ins);
    789 
    790         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    791         ina = _mm_shufflelo_epi16(ins, 0xFF);
    792         ina = _mm_shufflehi_epi16(ina, 0xFF);
    793         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    794         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
    795         t2 = _mm_srli_epi16(t2, 8);
    796         t2 = _mm_add_epi16(t2, ins);
    797 
    798         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    799         ina = _mm_shufflelo_epi16(ins, 0xFF);
    800         ina = _mm_shufflehi_epi16(ina, 0xFF);
    801         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    802         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
    803         t3 = _mm_srli_epi16(t3, 8);
    804         t3 = _mm_add_epi16(t3, ins);
    805 
    806         t0 = _mm_packus_epi16(t0, t1);
    807         t2 = _mm_packus_epi16(t2, t3);
    808         _mm_storeu_si128((__m128i *)dst, t0);
    809         _mm_storeu_si128((__m128i *)dst + 1, t2);
    810 
    811         src = (const __m128i *)src + 2;
    812         dst = (__m128i *)dst + 2;
    813     }
    814 }
    815 
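         /* "dst over": result = dst + (src * (255 - dst.a) >> 8). */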
    816 void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    817     __m128i all1s, outa, outs;
    818     __m128i in0, in1, out0, out1;
    819     __m128i t0, t1, t2, t3;
    820     uint32_t i;
    821 
    822     all1s = _mm_set1_epi16(255);
    823 
    824     for (i = 0; i < count8; ++i) {
    825         in0 = _mm_loadu_si128((const __m128i *)src);
    826         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    827         out0 = _mm_loadu_si128((const __m128i *)dst);
    828         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    829 
    830 
    831         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    832         outa = _mm_shufflelo_epi16(outs, 0xFF);
    833         outa = _mm_shufflehi_epi16(outa, 0xFF);
    834         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    835         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
    836         t0 = _mm_srli_epi16(t0, 8);
    837         t0 = _mm_add_epi16(t0, outs);
    838 
    839         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    840         outa = _mm_shufflelo_epi16(outs, 0xFF);
    841         outa = _mm_shufflehi_epi16(outa, 0xFF);
    842         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    843         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
    844         t1 = _mm_srli_epi16(t1, 8);
    845         t1 = _mm_add_epi16(t1, outs);
    846 
    847         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    848         outa = _mm_shufflelo_epi16(outs, 0xFF);
    849         outa = _mm_shufflehi_epi16(outa, 0xFF);
    850         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    851         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
    852         t2 = _mm_srli_epi16(t2, 8);
    853         t2 = _mm_add_epi16(t2, outs);
    854 
    855         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    856         outa = _mm_shufflelo_epi16(outs, 0xFF);
    857         outa = _mm_shufflehi_epi16(outa, 0xFF);
    858         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    859         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
    860         t3 = _mm_srli_epi16(t3, 8);
    861         t3 = _mm_add_epi16(t3, outs);
    862 
    863         t0 = _mm_packus_epi16(t0, t1);
    864         t2 = _mm_packus_epi16(t2, t3);
    865         _mm_storeu_si128((__m128i *)dst, t0);
    866         _mm_storeu_si128((__m128i *)dst + 1, t2);
    867 
    868         src = (const __m128i *)src + 2;
    869         dst = (__m128i *)dst + 2;
    870     }
    871 }
    872 
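         /* "src in": result = src * dst.a >> 8. */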
    873 void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    874     __m128i outa;
    875     __m128i in0, in1, out0, out1;
    876     __m128i t0, t1, t2, t3;
    877     uint32_t i;
    878 
    879     for (i = 0; i < count8; ++i) {
    880         in0 = _mm_loadu_si128((const __m128i *)src);
    881         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    882         out0 = _mm_loadu_si128((const __m128i *)dst);
    883         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    884 
    885         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    886         outa = _mm_shufflelo_epi16(outa, 0xFF);
    887         outa = _mm_shufflehi_epi16(outa, 0xFF);
    888         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    889         t0 = _mm_mullo_epi16(t0, outa);
    890         t0 = _mm_srli_epi16(t0, 8);
    891 
    892         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    893         outa = _mm_shufflelo_epi16(outa, 0xFF);
    894         outa = _mm_shufflehi_epi16(outa, 0xFF);
    895         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    896         t1 = _mm_mullo_epi16(t1, outa);
    897         t1 = _mm_srli_epi16(t1, 8);
    898 
    899         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    900         outa = _mm_shufflelo_epi16(outa, 0xFF);
    901         outa = _mm_shufflehi_epi16(outa, 0xFF);
    902         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    903         t2 = _mm_mullo_epi16(t2, outa);
    904         t2 = _mm_srli_epi16(t2, 8);
    905 
    906         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    907         outa = _mm_shufflelo_epi16(outa, 0xFF);
    908         outa = _mm_shufflehi_epi16(outa, 0xFF);
    909         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    910         t3 = _mm_mullo_epi16(t3, outa);
    911         t3 = _mm_srli_epi16(t3, 8);
    912 
    913         t0 = _mm_packus_epi16(t0, t1);
    914         t2 = _mm_packus_epi16(t2, t3);
    915         _mm_storeu_si128((__m128i *)dst, t0);
    916         _mm_storeu_si128((__m128i *)dst + 1, t2);
    917 
    918         src = (const __m128i *)src + 2;
    919         dst = (__m128i *)dst + 2;
    920     }
    921 }
    922 
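         /* "dst in": result = dst * src.a >> 8. */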
    923 void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    924     __m128i ina;
    925     __m128i in0, in1, out0, out1;
    926     __m128i t0, t1, t2, t3;
    927     uint32_t i;
    928 
    929     for (i = 0; i < count8; ++i) {
    930         in0 = _mm_loadu_si128((const __m128i *)src);
    931         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    932         out0 = _mm_loadu_si128((const __m128i *)dst);
    933         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    934 
    935         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    936         ina = _mm_shufflelo_epi16(ina, 0xFF);
    937         ina = _mm_shufflehi_epi16(ina, 0xFF);
    938         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    939         t0 = _mm_mullo_epi16(t0, ina);
    940         t0 = _mm_srli_epi16(t0, 8);
    941 
    942         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    943         ina = _mm_shufflelo_epi16(ina, 0xFF);
    944         ina = _mm_shufflehi_epi16(ina, 0xFF);
    945         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    946         t1 = _mm_mullo_epi16(t1, ina);
    947         t1 = _mm_srli_epi16(t1, 8);
    948 
    949         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
    950         ina = _mm_shufflelo_epi16(ina, 0xFF);
    951         ina = _mm_shufflehi_epi16(ina, 0xFF);
    952         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
    953         t2 = _mm_mullo_epi16(t2, ina);
    954         t2 = _mm_srli_epi16(t2, 8);
    955 
    956         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
    957         ina = _mm_shufflelo_epi16(ina, 0xFF);
    958         ina = _mm_shufflehi_epi16(ina, 0xFF);
    959         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
    960         t3 = _mm_mullo_epi16(t3, ina);
    961         t3 = _mm_srli_epi16(t3, 8);
    962 
    963         t0 = _mm_packus_epi16(t0, t1);
    964         t2 = _mm_packus_epi16(t2, t3);
    965         _mm_storeu_si128((__m128i *)dst, t0);
    966         _mm_storeu_si128((__m128i *)dst + 1, t2);
    967 
    968         src = (const __m128i *)src + 2;
    969         dst = (__m128i *)dst + 2;
    970     }
    971 }
    972 
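         /* "src out": result = src * (255 - dst.a) >> 8. */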
    973 void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    974     __m128i all1s, outa;
    975     __m128i in0, in1, out0, out1;
    976     __m128i t0, t1, t2, t3;
    977     uint32_t i;
    978 
    979     all1s = _mm_set1_epi16(255);
    980 
    981     for (i = 0; i < count8; ++i) {
    982         in0 = _mm_loadu_si128((const __m128i *)src);
    983         in1 = _mm_loadu_si128((const __m128i *)src + 1);
    984         out0 = _mm_loadu_si128((const __m128i *)dst);
    985         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
    986 
    987         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
    988         outa = _mm_shufflelo_epi16(outa, 0xFF);
    989         outa = _mm_shufflehi_epi16(outa, 0xFF);
    990         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
    991         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
    992         t0 = _mm_srli_epi16(t0, 8);
    993 
    994         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
    995         outa = _mm_shufflelo_epi16(outa, 0xFF);
    996         outa = _mm_shufflehi_epi16(outa, 0xFF);
    997         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
    998         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
    999         t1 = _mm_srli_epi16(t1, 8);
   1000 
   1001         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1002         outa = _mm_shufflelo_epi16(outa, 0xFF);
   1003         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1004         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1005         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
   1006         t2 = _mm_srli_epi16(t2, 8);
   1007 
   1008         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1009         outa = _mm_shufflelo_epi16(outa, 0xFF);
   1010         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1011         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1012         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
   1013         t3 = _mm_srli_epi16(t3, 8);
   1014 
   1015         t0 = _mm_packus_epi16(t0, t1);
   1016         t2 = _mm_packus_epi16(t2, t3);
   1017         _mm_storeu_si128((__m128i *)dst, t0);
   1018         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1019 
   1020         src = (const __m128i *)src + 2;
   1021         dst = (__m128i *)dst + 2;
   1022     }
   1023 }
   1024 
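         /* "dst out": result = dst * (255 - src.a) >> 8. */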
   1025 void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
   1026     __m128i all1s, ina;
   1027     __m128i in0, in1, out0, out1;
   1028     __m128i t0, t1, t2, t3;
   1029     uint32_t i;
   1030 
   1031     all1s = _mm_set1_epi16(255);
   1032 
   1033     for (i = 0; i < count8; ++i) {
   1034         in0 = _mm_loadu_si128((const __m128i *)src);
   1035         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1036         out0 = _mm_loadu_si128((const __m128i *)dst);
   1037         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1038 
   1039         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1040         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1041         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1042         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1043         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
   1044         t0 = _mm_srli_epi16(t0, 8);
   1045 
   1046         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1047         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1048         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1049         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1050         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
   1051         t1 = _mm_srli_epi16(t1, 8);
   1052 
   1053         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1054         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1055         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1056         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1057         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
   1058         t2 = _mm_srli_epi16(t2, 8);
   1059 
   1060         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1061         ina = _mm_shufflelo_epi16(ina, 0xFF);
   1062         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1063         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1064         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
   1065         t3 = _mm_srli_epi16(t3, 8);
   1066 
   1067         t0 = _mm_packus_epi16(t0, t1);
   1068         t2 = _mm_packus_epi16(t2, t3);
   1069         _mm_storeu_si128((__m128i *)dst, t0);
   1070         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1071 
   1072         src = (const __m128i *)src + 2;
   1073         dst = (__m128i *)dst + 2;
   1074     }
   1075 }
   1076 
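         /* "src atop": result = (dst * (255 - src.a) + src * dst.a) >> 8, with the
            destination alpha byte preserved through blendv_epi8 and M0001. */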
   1077 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
   1078     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
   1079     __m128i all1s, ina, outa, ins, outs;
   1080     __m128i in0, in1, out0, out1;
   1081     __m128i t0, t1, t2, t3;
   1082     uint32_t i;
   1083 
   1084     all1s = _mm_set1_epi16(255);
   1085 
   1086     for (i = 0; i < count8; ++i) {
   1087         in0 = _mm_loadu_si128((const __m128i *)src);
   1088         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1089         out0 = _mm_loadu_si128((const __m128i *)dst);
   1090         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1091 
   1092         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1093         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1094         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1095         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1096         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1097         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1098         t0 = _mm_sub_epi16(all1s, ina);
   1099         t0 = _mm_mullo_epi16(t0, outs);
   1100         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
   1101         t0 = _mm_srli_epi16(t0, 8);
   1102 
   1103         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1104         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1105         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1106         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1107         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1108         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1109         t1 = _mm_sub_epi16(all1s, ina);
   1110         t1 = _mm_mullo_epi16(t1, outs);
   1111         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
   1112         t1 = _mm_srli_epi16(t1, 8);
   1113 
   1114         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1115         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1116         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1117         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1118         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1119         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1120         t2 = _mm_sub_epi16(all1s, ina);
   1121         t2 = _mm_mullo_epi16(t2, outs);
   1122         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
   1123         t2 = _mm_srli_epi16(t2, 8);
   1124 
   1125         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1126         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1127         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1128         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1129         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1130         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1131         t3 = _mm_sub_epi16(all1s, ina);
   1132         t3 = _mm_mullo_epi16(t3, outs);
   1133         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
   1134         t3 = _mm_srli_epi16(t3, 8);
   1135 
   1136         t0 = _mm_packus_epi16(t0, t1);
   1137         t0 = blendv_epi8(t0, out0, M0001);
   1138         t2 = _mm_packus_epi16(t2, t3);
   1139         t2 = blendv_epi8(t2, out1, M0001);
   1140         _mm_storeu_si128((__m128i *)dst, t0);
   1141         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1142 
   1143         src = (const __m128i *)src + 2;
   1144         dst = (__m128i *)dst + 2;
   1145     }
   1146 }
   1147 
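         /* "dst atop": result = (src * (255 - dst.a) + dst * src.a) >> 8, with the
            source alpha byte preserved. */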
   1148 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
   1149     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
   1150     __m128i all1s, ina, ins, outa, outs;
   1151     __m128i in0, in1, out0, out1;
   1152     __m128i t0, t1, t2, t3;
   1153     uint32_t i;
   1154 
   1155     all1s = _mm_set1_epi16(255);
   1156 
   1157     for (i = 0; i < count8; ++i) {
   1158         in0 = _mm_loadu_si128((const __m128i *)src);
   1159         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1160         out0 = _mm_loadu_si128((const __m128i *)dst);
   1161         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1162 
   1163         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1164         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1165         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1166         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
   1167         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1168         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1169         t0 = _mm_sub_epi16(all1s, outa);
   1170         t0 = _mm_mullo_epi16(t0, ins);
   1171         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
   1172         t0 = _mm_srli_epi16(t0, 8);
   1173 
   1174         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1175         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1176         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1177         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
   1178         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1179         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1180         t1 = _mm_sub_epi16(all1s, outa);
   1181         t1 = _mm_mullo_epi16(t1, ins);
   1182         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
   1183         t1 = _mm_srli_epi16(t1, 8);
   1184 
   1185         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1186         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1187         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1188         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
   1189         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1190         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1191         t2 = _mm_sub_epi16(all1s, outa);
   1192         t2 = _mm_mullo_epi16(t2, ins);
   1193         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
   1194         t2 = _mm_srli_epi16(t2, 8);
   1195 
   1196         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1197         ina = _mm_shufflelo_epi16(ins, 0xFF);
   1198         ina = _mm_shufflehi_epi16(ina, 0xFF);
   1199         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
   1200         outa = _mm_shufflelo_epi16(outs, 0xFF);
   1201         outa = _mm_shufflehi_epi16(outa, 0xFF);
   1202         t3 = _mm_sub_epi16(all1s, outa);
   1203         t3 = _mm_mullo_epi16(t3, ins);
   1204         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
   1205         t3 = _mm_srli_epi16(t3, 8);
   1206 
   1207         t0 = _mm_packus_epi16(t0, t1);
   1208         t0 = blendv_epi8(t0, in0, M0001);
   1209         t2 = _mm_packus_epi16(t2, t3);
   1210         t2 = blendv_epi8(t2, in1, M0001);
   1211         _mm_storeu_si128((__m128i *)dst, t0);
   1212         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1213 
   1214         src = (const __m128i *)src + 2;
   1215         dst = (__m128i *)dst + 2;
   1216     }
   1217 }
   1218 
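         /* XOR: bitwise exclusive-or of the source and destination bytes. */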
   1219 void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
   1220     __m128i in0, in1, out0, out1;
   1221     uint32_t i;
   1222 
   1223     for (i = 0; i < count8; ++i) {
   1224         in0 = _mm_loadu_si128((const __m128i *)src);
   1225         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1226         out0 = _mm_loadu_si128((const __m128i *)dst);
   1227         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1228 
   1229         out0 = _mm_xor_si128(out0, in0);
   1230         out1 = _mm_xor_si128(out1, in1);
   1231 
   1232         _mm_storeu_si128((__m128i *)dst, out0);
   1233         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1234 
   1235         src = (const __m128i *)src + 2;
   1236         dst = (__m128i *)dst + 2;
   1237     }
   1238 }
   1239 
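         /* Multiply: per-channel product, result = (src * dst) >> 8. */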
   1240 void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
   1241     __m128i in0, in1, out0, out1;
   1242     __m128i t0, t1, t2, t3;
   1243     uint32_t i;
   1244 
   1245     for (i = 0; i < count8; ++i) {
   1246         in0 = _mm_loadu_si128((const __m128i *)src);
   1247         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1248         out0 = _mm_loadu_si128((const __m128i *)dst);
   1249         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1250 
   1251         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
   1252         t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
   1253         t0 = _mm_srli_epi16(t0, 8);
   1254 
   1255         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
   1256         t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
   1257         t1 = _mm_srli_epi16(t1, 8);
   1258 
   1259         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
   1260         t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
   1261         t2 = _mm_srli_epi16(t2, 8);
   1262 
   1263         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
   1264         t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
   1265         t3 = _mm_srli_epi16(t3, 8);
   1266 
   1267         t0 = _mm_packus_epi16(t0, t1);
   1268         t2 = _mm_packus_epi16(t2, t3);
   1269         _mm_storeu_si128((__m128i *)dst, t0);
   1270         _mm_storeu_si128((__m128i *)dst + 1, t2);
   1271 
   1272         src = (const __m128i *)src + 2;
   1273         dst = (__m128i *)dst + 2;
   1274     }
   1275 }
   1276 
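         /* Add: saturating per-byte addition of the source onto the destination. */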
   1277 void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
   1278     __m128i in0, in1, out0, out1;
   1279     uint32_t i;
   1280 
   1281     for (i = 0; i < count8; ++i) {
   1282         in0 = _mm_loadu_si128((const __m128i *)src);
   1283         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1284         out0 = _mm_loadu_si128((const __m128i *)dst);
   1285         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1286 
   1287         out0 = _mm_adds_epu8(out0, in0);
   1288         out1 = _mm_adds_epu8(out1, in1);
   1289 
   1290         _mm_storeu_si128((__m128i *)dst, out0);
   1291         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1292 
   1293         src = (const __m128i *)src + 2;
   1294         dst = (__m128i *)dst + 2;
   1295     }
   1296 }
   1297 
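         /* Subtract: saturating per-byte subtraction of the source from the destination. */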
   1298 void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
   1299     __m128i in0, in1, out0, out1;
   1300     uint32_t i;
   1301 
   1302     for (i = 0; i < count8; ++i) {
   1303         in0 = _mm_loadu_si128((const __m128i *)src);
   1304         in1 = _mm_loadu_si128((const __m128i *)src + 1);
   1305         out0 = _mm_loadu_si128((const __m128i *)dst);
   1306         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
   1307 
   1308         out0 = _mm_subs_epu8(out0, in0);
   1309         out1 = _mm_subs_epu8(out1, in1);
   1310 
   1311         _mm_storeu_si128((__m128i *)dst, out0);
   1312         _mm_storeu_si128((__m128i *)dst + 1, out1);
   1313 
   1314         src = (const __m128i *)src + 2;
   1315         dst = (__m128i *)dst + 2;
   1316     }
   1317 }
   1318