Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2014 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkColorPriv.h"
      9 #include "SkColor_opts_SSE2.h"
     10 #include "SkMathPriv.h"
     11 #include "SkMath_opts_SSE2.h"
     12 #include "SkXfermode.h"
     13 #include "SkXfermode_opts_SSE2.h"
     14 #include "SkXfermode_proccoeff.h"
     15 
     16 ////////////////////////////////////////////////////////////////////////////////
     17 // 4 pixels SSE2 version functions
     18 ////////////////////////////////////////////////////////////////////////////////
     19 
     20 static inline __m128i SkDiv255Round_SSE2(const __m128i& a) {
     21     __m128i prod = _mm_add_epi32(a, _mm_set1_epi32(128)); // prod += 128;
     22     prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));  // prod + (prod >> 8)
     23     prod = _mm_srli_epi32(prod, 8);                       // >> 8
     24 
     25     return prod;
     26 }
     27 
     28 static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
     29     __m128i sum = _mm_add_epi32(a, b);
     30     __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));
     31 
     32     sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
     33                        _mm_andnot_si128(cmp, sum));
     34     return sum;
     35 }
     36 
     37 static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
     38     __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
     39     __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
     40     __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));
     41 
     42     __m128i cmp = _mm_or_si128(cmp1, cmp2);
     43     ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));
     44 
     45     return ret;
     46 }
     47 
     48 static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
     49     // test if > 0
     50     __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
     51     // test if < 255*255
     52     __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));
     53 
     54     __m128i ret = _mm_setzero_si128();
     55 
     56     // if value >= 255*255, value = 255
     57     ret = _mm_andnot_si128(cmp2,  _mm_set1_epi32(255));
     58 
     59     __m128i div = SkDiv255Round_SSE2(prod);
     60 
     61     // test if > 0 && < 255*255
     62     __m128i cmp = _mm_and_si128(cmp1, cmp2);
     63 
     64     ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));
     65 
     66     return ret;
     67 }
     68 
     69 static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     70     __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
     71     return _mm_add_epi32(src, SkAlphaMulQ_SSE2(dst, isa));
     72 }
     73 
     74 static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     75     __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
     76     return _mm_add_epi32(dst, SkAlphaMulQ_SSE2(src, ida));
     77 }
     78 
     79 static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     80     __m128i da = SkGetPackedA32_SSE2(dst);
     81     return SkAlphaMulQ_SSE2(src, SkAlpha255To256_SSE2(da));
     82 }
     83 
     84 static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     85     __m128i sa = SkGetPackedA32_SSE2(src);
     86     return SkAlphaMulQ_SSE2(dst, SkAlpha255To256_SSE2(sa));
     87 }
     88 
     89 static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     90     __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst));
     91     return SkAlphaMulQ_SSE2(src, ida);
     92 }
     93 
     94 static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
     95     __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src));
     96     return SkAlphaMulQ_SSE2(dst, isa);
     97 }
     98 
     99 static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    100     __m128i sa = SkGetPackedA32_SSE2(src);
    101     __m128i da = SkGetPackedA32_SSE2(dst);
    102     __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    103 
    104     __m128i a = da;
    105 
    106     __m128i r1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedR32_SSE2(src));
    107     __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
    108     __m128i r = _mm_add_epi32(r1, r2);
    109 
    110     __m128i g1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedG32_SSE2(src));
    111     __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
    112     __m128i g = _mm_add_epi32(g1, g2);
    113 
    114     __m128i b1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedB32_SSE2(src));
    115     __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
    116     __m128i b = _mm_add_epi32(b1, b2);
    117 
    118     return SkPackARGB32_SSE2(a, r, g, b);
    119 }
    120 
    121 static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    122     __m128i sa = SkGetPackedA32_SSE2(src);
    123     __m128i da = SkGetPackedA32_SSE2(dst);
    124     __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    125 
    126     __m128i a = sa;
    127 
    128     __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
    129     __m128i r2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedR32_SSE2(dst));
    130     __m128i r = _mm_add_epi32(r1, r2);
    131 
    132     __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
    133     __m128i g2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedG32_SSE2(dst));
    134     __m128i g = _mm_add_epi32(g1, g2);
    135 
    136     __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
    137     __m128i b2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedB32_SSE2(dst));
    138     __m128i b = _mm_add_epi32(b1, b2);
    139 
    140     return SkPackARGB32_SSE2(a, r, g, b);
    141 }
    142 
    143 static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    144     __m128i sa = SkGetPackedA32_SSE2(src);
    145     __m128i da = SkGetPackedA32_SSE2(dst);
    146     __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    147     __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    148 
    149     __m128i a1 = _mm_add_epi32(sa, da);
    150     __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da);
    151     a2 = _mm_slli_epi32(a2, 1);
    152     __m128i a = _mm_sub_epi32(a1, a2);
    153 
    154     __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
    155     __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
    156     __m128i r = _mm_add_epi32(r1, r2);
    157 
    158     __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
    159     __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
    160     __m128i g = _mm_add_epi32(g1, g2);
    161 
    162     __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
    163     __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
    164     __m128i b = _mm_add_epi32(b1, b2);
    165 
    166     return SkPackARGB32_SSE2(a, r, g, b);
    167 }
    168 
    169 static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    170     __m128i b = saturated_add_SSE2(SkGetPackedB32_SSE2(src),
    171                                    SkGetPackedB32_SSE2(dst));
    172     __m128i g = saturated_add_SSE2(SkGetPackedG32_SSE2(src),
    173                                    SkGetPackedG32_SSE2(dst));
    174     __m128i r = saturated_add_SSE2(SkGetPackedR32_SSE2(src),
    175                                    SkGetPackedR32_SSE2(dst));
    176     __m128i a = saturated_add_SSE2(SkGetPackedA32_SSE2(src),
    177                                    SkGetPackedA32_SSE2(dst));
    178     return SkPackARGB32_SSE2(a, r, g, b);
    179 }
    180 
    181 static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    182     __m128i a = SkAlphaMulAlpha_SSE2(SkGetPackedA32_SSE2(src),
    183                                      SkGetPackedA32_SSE2(dst));
    184     __m128i r = SkAlphaMulAlpha_SSE2(SkGetPackedR32_SSE2(src),
    185                                      SkGetPackedR32_SSE2(dst));
    186     __m128i g = SkAlphaMulAlpha_SSE2(SkGetPackedG32_SSE2(src),
    187                                      SkGetPackedG32_SSE2(dst));
    188     __m128i b = SkAlphaMulAlpha_SSE2(SkGetPackedB32_SSE2(src),
    189                                      SkGetPackedB32_SSE2(dst));
    190     return SkPackARGB32_SSE2(a, r, g, b);
    191 }
    192 
    193 static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
    194     __m128i cmp = _mm_cmplt_epi32(a, b);
    195     return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
    196 }
    197 
    198 static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) {
    199     // a + b - SkAlphaMulAlpha(a, b);
    200     return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b));
    201 
    202 }
    203 
    204 static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
    205                                                    const __m128i& sa, const __m128i& da) {
    206     // sc * (255 - da)
    207     __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
    208     ret1 = _mm_mullo_epi16(sc, ret1);
    209 
    210     // dc * (255 - sa)
    211     __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    212     ret2 = _mm_mullo_epi16(dc, ret2);
    213 
    214     // sc * dc
    215     __m128i ret3 = _mm_mullo_epi16(sc, dc);
    216 
    217     __m128i ret = _mm_add_epi32(ret1, ret2);
    218     ret = _mm_add_epi32(ret, ret3);
    219 
    220     return clamp_div255round_SSE2(ret);
    221 }
    222 
    223 static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    224     __m128i sa = SkGetPackedA32_SSE2(src);
    225     __m128i da = SkGetPackedA32_SSE2(dst);
    226     __m128i a = srcover_byte_SSE2(sa, da);
    227 
    228     __m128i sr = SkGetPackedR32_SSE2(src);
    229     __m128i dr = SkGetPackedR32_SSE2(dst);
    230     __m128i r = blendfunc_multiply_byte_SSE2(sr, dr, sa, da);
    231 
    232     __m128i sg = SkGetPackedG32_SSE2(src);
    233     __m128i dg = SkGetPackedG32_SSE2(dst);
    234     __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da);
    235 
    236 
    237     __m128i sb = SkGetPackedB32_SSE2(src);
    238     __m128i db = SkGetPackedB32_SSE2(dst);
    239     __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da);
    240 
    241     return SkPackARGB32_SSE2(a, r, g, b);
    242 }
    243 
    244 static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    245     __m128i a = srcover_byte_SSE2(SkGetPackedA32_SSE2(src),
    246                                   SkGetPackedA32_SSE2(dst));
    247     __m128i r = srcover_byte_SSE2(SkGetPackedR32_SSE2(src),
    248                                   SkGetPackedR32_SSE2(dst));
    249     __m128i g = srcover_byte_SSE2(SkGetPackedG32_SSE2(src),
    250                                   SkGetPackedG32_SSE2(dst));
    251     __m128i b = srcover_byte_SSE2(SkGetPackedB32_SSE2(src),
    252                                   SkGetPackedB32_SSE2(dst));
    253     return SkPackARGB32_SSE2(a, r, g, b);
    254 }
    255 
// Portable version overlay_byte() is in SkXfermode.cpp.
// Overlay for one channel of four pixels:
//   tmp = sc*(255 - da) + dc*(255 - sa)
//   rc  = 2*sc*dc                           when 2*dc <= da
//   rc  = sa*da - 2*(da - dc)*(sa - sc)     otherwise
//   result = clamp_div255round(rc + tmp)
static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    __m128i tmp2 = _mm_mullo_epi16(dc, isa);
    __m128i tmp = _mm_add_epi32(tmp1, tmp2);

    // Branch mask: true in lanes where 2*dc > da (those take rc2 below).
    __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);
    __m128i rc1 = _mm_slli_epi32(sc, 1);                        // 2 * sc
    rc1 = Multiply32_SSE2(rc1, dc);                             // *dc

    __m128i rc2 = _mm_mullo_epi16(sa, da);                      // sa * da
    __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);    // 2 * (da - dc)
    tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc));        // * (sa - sc)
    rc2 = _mm_sub_epi32(rc2, tmp3);

    // Per-lane select: rc1 where 2*dc <= da, rc2 elsewhere.
    __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
                              _mm_and_si128(cmp, rc2));
    return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
}
    278 
    279 static __m128i overlay_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    280     __m128i sa = SkGetPackedA32_SSE2(src);
    281     __m128i da = SkGetPackedA32_SSE2(dst);
    282 
    283     __m128i a = srcover_byte_SSE2(sa, da);
    284     __m128i r = overlay_byte_SSE2(SkGetPackedR32_SSE2(src),
    285                                   SkGetPackedR32_SSE2(dst), sa, da);
    286     __m128i g = overlay_byte_SSE2(SkGetPackedG32_SSE2(src),
    287                                   SkGetPackedG32_SSE2(dst), sa, da);
    288     __m128i b = overlay_byte_SSE2(SkGetPackedB32_SSE2(src),
    289                                   SkGetPackedB32_SSE2(dst), sa, da);
    290     return SkPackARGB32_SSE2(a, r, g, b);
    291 }
    292 
    293 static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
    294                                        const __m128i& sa, const __m128i& da) {
    295     __m128i sd = _mm_mullo_epi16(sc, da);
    296     __m128i ds = _mm_mullo_epi16(dc, sa);
    297 
    298     __m128i cmp = _mm_cmplt_epi32(sd, ds);
    299 
    300     __m128i tmp = _mm_add_epi32(sc, dc);
    301     __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    302     __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    303     __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
    304                                _mm_andnot_si128(cmp, ret2));
    305     return ret;
    306 }
    307 
    308 static __m128i darken_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    309     __m128i sa = SkGetPackedA32_SSE2(src);
    310     __m128i da = SkGetPackedA32_SSE2(dst);
    311 
    312     __m128i a = srcover_byte_SSE2(sa, da);
    313     __m128i r = darken_byte_SSE2(SkGetPackedR32_SSE2(src),
    314                                  SkGetPackedR32_SSE2(dst), sa, da);
    315     __m128i g = darken_byte_SSE2(SkGetPackedG32_SSE2(src),
    316                                  SkGetPackedG32_SSE2(dst), sa, da);
    317     __m128i b = darken_byte_SSE2(SkGetPackedB32_SSE2(src),
    318                                  SkGetPackedB32_SSE2(dst), sa, da);
    319     return SkPackARGB32_SSE2(a, r, g, b);
    320 }
    321 
    322 static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc,
    323                                         const __m128i& sa, const __m128i& da) {
    324     __m128i sd = _mm_mullo_epi16(sc, da);
    325     __m128i ds = _mm_mullo_epi16(dc, sa);
    326 
    327     __m128i cmp = _mm_cmpgt_epi32(sd, ds);
    328 
    329     __m128i tmp = _mm_add_epi32(sc, dc);
    330     __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    331     __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    332     __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
    333                                _mm_andnot_si128(cmp, ret2));
    334     return ret;
    335 }
    336 
    337 static __m128i lighten_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    338     __m128i sa = SkGetPackedA32_SSE2(src);
    339     __m128i da = SkGetPackedA32_SSE2(dst);
    340 
    341     __m128i a = srcover_byte_SSE2(sa, da);
    342     __m128i r = lighten_byte_SSE2(SkGetPackedR32_SSE2(src),
    343                                   SkGetPackedR32_SSE2(dst), sa, da);
    344     __m128i g = lighten_byte_SSE2(SkGetPackedG32_SSE2(src),
    345                                   SkGetPackedG32_SSE2(dst), sa, da);
    346     __m128i b = lighten_byte_SSE2(SkGetPackedB32_SSE2(src),
    347                                   SkGetPackedB32_SSE2(dst), sa, da);
    348     return SkPackARGB32_SSE2(a, r, g, b);
    349 }
    350 
// Color-dodge for one channel of four pixels. Three mutually exclusive cases
// are computed under per-lane masks and OR-merged at the end:
//   dc == 0      : sc * (255 - da) / 255
//   sc == sa     : (sa*da + sc*(255 - da) + dc*(255 - sa)) / 255, clamped
//   otherwise    : (sa*min(da, dc*sa/(sa - sc)) + sc*(255-da) + dc*(255-sa)) / 255
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i diff = _mm_sub_epi32(sa, sc);
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (0 == dc)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));

    // else if (0 == diff), i.e. sc == sa
    __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);   // cmp2 && !cmp1
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
    rc2 = _mm_add_epi32(rc2, tmp3);
    rc2 = clamp_div255round_SSE2(rc2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    // NOTE(review): the divide below runs on all lanes, including the
    // masked-out diff == 0 ones; assumes shim_mm_div_epi32 tolerates a zero
    // divisor in lanes whose result is discarded -- confirm.
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i value = _mm_mullo_epi16(dc, sa);
    diff = shim_mm_div_epi32(value, diff);

    __m128i tmp4 = SkMin32_SSE2(da, diff);
    tmp4 = Multiply32_SSE2(sa, tmp4);
    __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);            // !(cmp1 || cmp2)

    // Exactly one of rc1/rc2/rc3 is live per lane; merge with OR.
    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);

    return rc;
}
    389 
    390 static __m128i colordodge_modeproc_SSE2(const __m128i& src,
    391                                         const __m128i& dst) {
    392     __m128i sa = SkGetPackedA32_SSE2(src);
    393     __m128i da = SkGetPackedA32_SSE2(dst);
    394 
    395     __m128i a = srcover_byte_SSE2(sa, da);
    396     __m128i r = colordodge_byte_SSE2(SkGetPackedR32_SSE2(src),
    397                                      SkGetPackedR32_SSE2(dst), sa, da);
    398     __m128i g = colordodge_byte_SSE2(SkGetPackedG32_SSE2(src),
    399                                      SkGetPackedG32_SSE2(dst), sa, da);
    400     __m128i b = colordodge_byte_SSE2(SkGetPackedB32_SSE2(src),
    401                                      SkGetPackedB32_SSE2(dst), sa, da);
    402     return SkPackARGB32_SSE2(a, r, g, b);
    403 }
    404 
// Color-burn for one channel of four pixels. Three mutually exclusive cases
// are computed under per-lane masks and OR-merged at the end:
//   dc == da  : (sa*da + sc*(255 - da) + dc*(255 - sa)) / 255, clamped
//   sc == 0   : dc * (255 - sa) / 255
//   otherwise : (sa*(da - min(da, (da - dc)*sa/sc)) + sc*(255-da) + dc*(255-sa)) / 255
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (dc == da)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
    rc1 = _mm_add_epi32(rc1, tmp3);
    rc1 = clamp_div255round_SSE2(rc1);
    rc1 = _mm_and_si128(cmp1, rc1);

    // else if (0 == sc)
    __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);   // cmp2 && !cmp1
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    // NOTE(review): the divide below runs on all lanes, including the
    // masked-out sc == 0 ones; assumes shim_mm_div_epi32 tolerates a zero
    // divisor in lanes whose result is discarded -- confirm.
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i tmp4 = _mm_sub_epi32(da, dc);
    tmp4 = Multiply32_SSE2(tmp4, sa);
    tmp4 = shim_mm_div_epi32(tmp4, sc);

    __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4));
    tmp5 = Multiply32_SSE2(sa, tmp5);
    __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);            // !(cmp1 || cmp2)

    // Exactly one of rc1/rc2/rc3 is live per lane; merge with OR.
    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);

    return rc;
}
    444 
    445 static __m128i colorburn_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    446     __m128i sa = SkGetPackedA32_SSE2(src);
    447     __m128i da = SkGetPackedA32_SSE2(dst);
    448 
    449     __m128i a = srcover_byte_SSE2(sa, da);
    450     __m128i r = colorburn_byte_SSE2(SkGetPackedR32_SSE2(src),
    451                                     SkGetPackedR32_SSE2(dst), sa, da);
    452     __m128i g = colorburn_byte_SSE2(SkGetPackedG32_SSE2(src),
    453                                     SkGetPackedG32_SSE2(dst), sa, da);
    454     __m128i b = colorburn_byte_SSE2(SkGetPackedB32_SSE2(src),
    455                                     SkGetPackedB32_SSE2(dst), sa, da);
    456     return SkPackARGB32_SSE2(a, r, g, b);
    457 }
    458 
// Hard-light for one channel of four pixels:
//   rc = 2*sc*dc                          when 2*sc <= sa
//   rc = sa*da - 2*(da - dc)*(sa - sc)    otherwise
//   result = clamp_div255round(rc + sc*(255 - da) + dc*(255 - sa))
// tmp1/tmp2 are reused across stages; cmp1 is true where 2*sc > sa.
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);                // sc * dc;
    rc1 = _mm_slli_epi32(rc1, 1);                         // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);                    // keep where 2*sc <= sa

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc),
                                   _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);                       // keep where 2*sc > sa

    __m128i rc = _mm_or_si128(rc1, rc2);

    // Add the common sc*(255 - da) + dc*(255 - sa) terms, then divide by 255.
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
    486 
    487 static __m128i hardlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    488     __m128i sa = SkGetPackedA32_SSE2(src);
    489     __m128i da = SkGetPackedA32_SSE2(dst);
    490 
    491     __m128i a = srcover_byte_SSE2(sa, da);
    492     __m128i r = hardlight_byte_SSE2(SkGetPackedR32_SSE2(src),
    493                                     SkGetPackedR32_SSE2(dst), sa, da);
    494     __m128i g = hardlight_byte_SSE2(SkGetPackedG32_SSE2(src),
    495                                     SkGetPackedG32_SSE2(dst), sa, da);
    496     __m128i b = hardlight_byte_SSE2(SkGetPackedB32_SSE2(src),
    497                                     SkGetPackedB32_SSE2(dst), sa, da);
    498     return SkPackARGB32_SSE2(a, r, g, b);
    499 }
    500 
// Fixed-point square root used by the soft-light branch below; presumably the
// SIMD counterpart of the portable sqrt_unit_byte() (SkSqrtBits with the same
// 15+4 bias) -- the exact scaling contract lives in SkSqrtBits_SSE2.
static __m128i sqrt_unit_byte_SSE2(const __m128i& n) {
    return SkSqrtBits_SSE2(n, 15+4);
}
    504 
// Soft-light for one channel of four pixels. Three exclusive cases are
// computed under masks and OR-merged, with m = dc * 256 / da (0 when da == 0):
//   case 1: 2*sc <= sa
//   case 2: 2*sc > sa && 4*dc <= da
//   case 3: 2*sc > sa && 4*dc > da
// The common sc*(255 - da) + dc*(255 - sa) terms are added at the end before
// the rounded divide by 255. Registers i and j are deliberately reused
// between cases 2 and 3.
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    // The float divide runs on every lane; da == 0 lanes are zeroed after.
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa): rc = dc * (sa + ((2*sc - sa) * (256 - m) >> 8))
    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da):
    // rc = dc*sa + (da*(2*sc - sa) * (((4m*(4m + 256)*(m - 256)) >> 16) + 7m) >> 8)
    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                         // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    i = Multiply32_SSE2(dc, sa);                       // dc * sa
    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);                // 2*sc > sa && 4*dc <= da
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else: rc = dc*sa + (da*(2*sc - sa) * (sqrt(m) - m) >> 8)
    // i still holds dc*sa and j holds da*(2*sc - sa) from the case above.
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);                   // 2*sc > sa && 4*dc > da
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    // Common terms, then rounded divide by 255 with clamping.
    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
    568 
    569 static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    570     __m128i sa = SkGetPackedA32_SSE2(src);
    571     __m128i da = SkGetPackedA32_SSE2(dst);
    572 
    573     __m128i a = srcover_byte_SSE2(sa, da);
    574     __m128i r = softlight_byte_SSE2(SkGetPackedR32_SSE2(src),
    575                                     SkGetPackedR32_SSE2(dst), sa, da);
    576     __m128i g = softlight_byte_SSE2(SkGetPackedG32_SSE2(src),
    577                                     SkGetPackedG32_SSE2(dst), sa, da);
    578     __m128i b = softlight_byte_SSE2(SkGetPackedB32_SSE2(src),
    579                                     SkGetPackedB32_SSE2(dst), sa, da);
    580     return SkPackARGB32_SSE2(a, r, g, b);
    581 }
    582 
    583 static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc,
    584                                            const __m128i& sa, const __m128i& da) {
    585     __m128i tmp1 = _mm_mullo_epi16(sc, da);
    586     __m128i tmp2 = _mm_mullo_epi16(dc, sa);
    587     __m128i tmp = SkMin32_SSE2(tmp1, tmp2);
    588 
    589     __m128i ret1 = _mm_add_epi32(sc, dc);
    590     __m128i ret2 = _mm_slli_epi32(SkDiv255Round_SSE2(tmp), 1);
    591     __m128i ret = _mm_sub_epi32(ret1, ret2);
    592 
    593     ret = clamp_signed_byte_SSE2(ret);
    594     return ret;
    595 }
    596 
    597 static __m128i difference_modeproc_SSE2(const __m128i& src,
    598                                         const __m128i& dst) {
    599     __m128i sa = SkGetPackedA32_SSE2(src);
    600     __m128i da = SkGetPackedA32_SSE2(dst);
    601 
    602     __m128i a = srcover_byte_SSE2(sa, da);
    603     __m128i r = difference_byte_SSE2(SkGetPackedR32_SSE2(src),
    604                                      SkGetPackedR32_SSE2(dst), sa, da);
    605     __m128i g = difference_byte_SSE2(SkGetPackedG32_SSE2(src),
    606                                      SkGetPackedG32_SSE2(dst), sa, da);
    607     __m128i b = difference_byte_SSE2(SkGetPackedB32_SSE2(src),
    608                                      SkGetPackedB32_SSE2(dst), sa, da);
    609     return SkPackARGB32_SSE2(a, r, g, b);
    610 }
    611 
    612 static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
    613                                           const __m128i&, __m128i&) {
    614     __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc
    615     __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc
    616     tmp1 = _mm_add_epi32(tmp1, tmp2);
    617     tmp2 = _mm_mullo_epi16(sc, dc);                          // sc * dc
    618     tmp2 = _mm_slli_epi32(tmp2, 1);                          // 2 * sc * dc
    619 
    620     __m128i r = _mm_sub_epi32(tmp1, tmp2);
    621     return clamp_div255round_SSE2(r);
    622 }
    623 
    624 static __m128i exclusion_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    625     __m128i sa = SkGetPackedA32_SSE2(src);
    626     __m128i da = SkGetPackedA32_SSE2(dst);
    627 
    628     __m128i a = srcover_byte_SSE2(sa, da);
    629     __m128i r = exclusion_byte_SSE2(SkGetPackedR32_SSE2(src),
    630                                     SkGetPackedR32_SSE2(dst), sa, da);
    631     __m128i g = exclusion_byte_SSE2(SkGetPackedG32_SSE2(src),
    632                                     SkGetPackedG32_SSE2(dst), sa, da);
    633     __m128i b = exclusion_byte_SSE2(SkGetPackedB32_SSE2(src),
    634                                     SkGetPackedB32_SSE2(dst), sa, da);
    635     return SkPackARGB32_SSE2(a, r, g, b);
    636 }
    637 
    638 ////////////////////////////////////////////////////////////////////////////////
    639 
    640 typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst);
    641 
    642 extern SkXfermodeProcSIMD gSSE2XfermodeProcs[];
    643 
#ifdef SK_SUPPORT_LEGACY_DEEPFLATTENING
// Legacy deep-flattening deserialization: after INHERITED(buffer) restores the
// mode, look up the matching SIMD proc. validate() rejects the buffer when the
// table yields NULL for the restored mode.
SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) : INHERITED(buffer) {
    fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]);
    buffer.validate(fProcSIMD != NULL);
}
#endif
    650 
// Blend `count` 32-bit pixels of src into dst using this xfermode.
// With no coverage array (aa == NULL) the bulk of the run is processed four
// pixels at a time with the SIMD proc; with coverage, pixels go one at a time
// through the scalar proc and are interpolated toward dst by their coverage.
void SkSSE2ProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
                                     int count, const SkAlpha aa[]) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

    if (NULL == aa) {
        if (count >= 4) {
            // Scalar-process until dst is 16-byte aligned (at most 3 pixels),
            // so the vector loop can use aligned load/store on dst.
            while (((size_t)dst & 0x0F) != 0) {
                *dst = proc(*src, *dst);
                dst++;
                src++;
                count--;
            }

            const __m128i* s = reinterpret_cast<const __m128i*>(src);
            __m128i* d = reinterpret_cast<__m128i*>(dst);

            // Main loop: 4 pixels per iteration. src may still be unaligned
            // (hence loadu); dst was aligned above.
            while (count >= 4) {
                __m128i src_pixel = _mm_loadu_si128(s++);
                __m128i dst_pixel = _mm_load_si128(d);

                dst_pixel = procSIMD(src_pixel, dst_pixel);
                _mm_store_si128(d++, dst_pixel);
                count -= 4;
            }

            src = reinterpret_cast<const SkPMColor*>(s);
            dst = reinterpret_cast<SkPMColor*>(d);
        }

        // Scalar tail (also handles whole runs shorter than 4).
        for (int i = count - 1; i >= 0; --i) {
            *dst = proc(*src, *dst);
            dst++;
            src++;
        }
    } else {
        // Coverage path: blend, then lerp the blended color toward the
        // original dst by coverage; zero coverage leaves dst untouched.
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = dst[i];
                SkPMColor C = proc(src[i], dstC);
                if (a != 0xFF) {
                    C = SkFourByteInterp(C, dstC, a);
                }
                dst[i] = C;
            }
        }
    }
}
    703 
    704 void SkSSE2ProcCoeffXfermode::xfer16(uint16_t dst[], const SkPMColor src[],
    705                                      int count, const SkAlpha aa[]) const {
    706     SkASSERT(dst && src && count >= 0);
    707 
    708     SkXfermodeProc proc = this->getProc();
    709     SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    710     SkASSERT(procSIMD != NULL);
    711 
    712     if (NULL == aa) {
    713         if (count >= 8) {
    714             while (((size_t)dst & 0x0F) != 0) {
    715                 SkPMColor dstC = SkPixel16ToPixel32(*dst);
    716                 *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
    717                 dst++;
    718                 src++;
    719                 count--;
    720             }
    721 
    722             const __m128i* s = reinterpret_cast<const __m128i*>(src);
    723             __m128i* d = reinterpret_cast<__m128i*>(dst);
    724 
    725             while (count >= 8) {
    726                 __m128i src_pixel1 = _mm_loadu_si128(s++);
    727                 __m128i src_pixel2 = _mm_loadu_si128(s++);
    728                 __m128i dst_pixel = _mm_load_si128(d);
    729 
    730                 __m128i dst_pixel1 = _mm_unpacklo_epi16(dst_pixel, _mm_setzero_si128());
    731                 __m128i dst_pixel2 = _mm_unpackhi_epi16(dst_pixel, _mm_setzero_si128());
    732 
    733                 __m128i dstC1 = SkPixel16ToPixel32_SSE2(dst_pixel1);
    734                 __m128i dstC2 = SkPixel16ToPixel32_SSE2(dst_pixel2);
    735 
    736                 dst_pixel1 = procSIMD(src_pixel1, dstC1);
    737                 dst_pixel2 = procSIMD(src_pixel2, dstC2);
    738                 dst_pixel = SkPixel32ToPixel16_ToU16_SSE2(dst_pixel1, dst_pixel2);
    739 
    740                 _mm_store_si128(d++, dst_pixel);
    741                 count -= 8;
    742             }
    743 
    744             src = reinterpret_cast<const SkPMColor*>(s);
    745             dst = reinterpret_cast<uint16_t*>(d);
    746         }
    747 
    748         for (int i = count - 1; i >= 0; --i) {
    749             SkPMColor dstC = SkPixel16ToPixel32(*dst);
    750             *dst = SkPixel32ToPixel16_ToU16(proc(*src, dstC));
    751             dst++;
    752             src++;
    753         }
    754     } else {
    755         for (int i = count - 1; i >= 0; --i) {
    756             unsigned a = aa[i];
    757             if (0 != a) {
    758                 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
    759                 SkPMColor C = proc(src[i], dstC);
    760                 if (0xFF != a) {
    761                     C = SkFourByteInterp(C, dstC, a);
    762                 }
    763                 dst[i] = SkPixel32ToPixel16_ToU16(C);
    764             }
    765         }
    766     }
    767 }
    768 
#ifndef SK_IGNORE_TO_STRING
// Debug description; nothing SSE2-specific to report beyond the base class.
void SkSSE2ProcCoeffXfermode::toString(SkString* str) const {
    this->INHERITED::toString(str);
}
#endif
    774 
    775 ////////////////////////////////////////////////////////////////////////////////
    776 
    777 // 4 pixels modeprocs with SSE2
// Indexed by SkXfermode::Mode — entries must stay in enum order. A NULL entry
// means the mode has no SSE2 implementation and the caller falls back to the
// portable code path (see SkPlatformXfermodeFactory_impl_SSE2).
SkXfermodeProcSIMD gSSE2XfermodeProcs[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    srcover_modeproc_SSE2,    // kSrcOver_Mode
    dstover_modeproc_SSE2,    // kDstOver_Mode
    srcin_modeproc_SSE2,      // kSrcIn_Mode
    dstin_modeproc_SSE2,      // kDstIn_Mode
    srcout_modeproc_SSE2,     // kSrcOut_Mode
    dstout_modeproc_SSE2,     // kDstOut_Mode
    srcatop_modeproc_SSE2,    // kSrcATop_Mode
    dstatop_modeproc_SSE2,    // kDstATop_Mode
    xor_modeproc_SSE2,        // kXor_Mode
    plus_modeproc_SSE2,       // kPlus_Mode
    modulate_modeproc_SSE2,   // kModulate_Mode
    screen_modeproc_SSE2,     // kScreen_Mode

    overlay_modeproc_SSE2,    // kOverlay_Mode
    darken_modeproc_SSE2,     // kDarken_Mode
    lighten_modeproc_SSE2,    // kLighten_Mode
    colordodge_modeproc_SSE2, // kColorDodge_Mode
    colorburn_modeproc_SSE2,  // kColorBurn_Mode
    hardlight_modeproc_SSE2,  // kHardLight_Mode
    softlight_modeproc_SSE2,  // kSoftLight_Mode
    difference_modeproc_SSE2, // kDifference_Mode
    exclusion_modeproc_SSE2,  // kExclusion_Mode
    multiply_modeproc_SSE2,   // kMultiply_Mode

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};
    811 
    812 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
    813                                                          SkXfermode::Mode mode) {
    814     void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]);
    815 
    816     if (procSIMD != NULL) {
    817         return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD));
    818     }
    819     return NULL;
    820 }
    821