Home | History | Annotate | Download | only in opts
      1 #include "SkXfermode.h"
      2 #include "SkXfermode_proccoeff.h"
      3 #include "SkColorPriv.h"
      4 
      5 #include <arm_neon.h>
      6 #include "SkColor_opts_neon.h"
      7 #include "SkXfermode_opts_arm_neon.h"
      8 
      9 #define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)
     10 
     11 
     12 ////////////////////////////////////////////////////////////////////////////////
     13 // NEONized skia functions
     14 ////////////////////////////////////////////////////////////////////////////////
     15 
     16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
     17     uint16x8_t tmp;
     18     uint8x8_t ret;
     19 
     20     tmp = vmull_u8(color, alpha);
     21     tmp = vaddq_u16(tmp, vdupq_n_u16(128));
     22     tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));
     23 
     24     ret = vshrn_n_u16(tmp, 8);
     25 
     26     return ret;
     27 }
     28 
     29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
     30     uint16x8_t ret;
     31 
     32     ret = vmull_u8(color, alpha);
     33     ret = vaddq_u16(ret, vdupq_n_u16(128));
     34     ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));
     35 
     36     ret = vshrq_n_u16(ret, 8);
     37 
     38     return ret;
     39 }
     40 
     41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
     42     uint16x8_t tmp;
     43 
     44 #ifdef SK_CPU_ARM64
     45     tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
     46                          vreinterpretq_u32_s32(p2));
     47 #else
     48     tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
     49                        vmovn_u32(vreinterpretq_u32_s32(p2)));
     50 #endif
     51 
     52     tmp += vdupq_n_u16(128);
     53     tmp += vshrq_n_u16(tmp, 8);
     54 
     55     return vshrn_n_u16(tmp, 8);
     56 }
     57 
     58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
     59     prod += vdupq_n_u16(128);
     60     prod += vshrq_n_u16(prod, 8);
     61 
     62     return vshrq_n_u16(prod, 8);
     63 }
     64 
     65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
     66     uint8x8_t ret;
     67     uint32x4_t cmp1, cmp2;
     68     uint16x8_t cmp16;
     69     uint8x8_t cmp8, cmp8_1;
     70 
     71     // Test if <= 0
     72     cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
     73     cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
     74 #ifdef SK_CPU_ARM64
     75     cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
     76 #else
     77     cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
     78 #endif
     79     cmp8_1 = vmovn_u16(cmp16);
     80 
     81     // Init to zero
     82     ret = vdup_n_u8(0);
     83 
     84     // Test if >= 255*255
     85     cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
     86     cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
     87 #ifdef SK_CPU_ARM64
     88     cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
     89 #else
     90     cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
     91 #endif
     92     cmp8 = vmovn_u16(cmp16);
     93 
     94     // Insert 255 where true
     95     ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);
     96 
     97     // Calc SkDiv255Round
     98     uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);
     99 
    100     // Insert where false and previous test false
    101     cmp8 = cmp8 | cmp8_1;
    102     ret = vbsl_u8(cmp8, ret, div);
    103 
    104     // Return the final combination
    105     return ret;
    106 }
    107 
    108 ////////////////////////////////////////////////////////////////////////////////
    109 // 1 pixel modeprocs
    110 ////////////////////////////////////////////////////////////////////////////////
    111 
    112 //  kSrcATop_Mode,  //!< [Da, Sc * Da + (1 - Sa) * Dc]
    113 SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    114     unsigned sa = SkGetPackedA32(src);
    115     unsigned da = SkGetPackedA32(dst);
    116     unsigned isa = 255 - sa;
    117 
    118     uint8x8_t vda, visa, vsrc, vdst;
    119 
    120     vda = vdup_n_u8(da);
    121     visa = vdup_n_u8(isa);
    122 
    123     uint16x8_t vsrc_wide, vdst_wide;
    124     vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src)));
    125     vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst)));
    126 
    127     vsrc_wide += vdupq_n_u16(128);
    128     vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
    129 
    130     vdst_wide += vdupq_n_u16(128);
    131     vdst_wide += vshrq_n_u16(vdst_wide, 8);
    132 
    133     vsrc = vshrn_n_u16(vsrc_wide, 8);
    134     vdst = vshrn_n_u16(vdst_wide, 8);
    135 
    136     vsrc += vdst;
    137     vsrc = vset_lane_u8(da, vsrc, 3);
    138 
    139     return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
    140 }
    141 
    142 //  kDstATop_Mode,  //!< [Sa, Sa * Dc + Sc * (1 - Da)]
    143 SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    144     unsigned sa = SkGetPackedA32(src);
    145     unsigned da = SkGetPackedA32(dst);
    146     unsigned ida = 255 - da;
    147 
    148     uint8x8_t vsa, vida, vsrc, vdst;
    149 
    150     vsa = vdup_n_u8(sa);
    151     vida = vdup_n_u8(ida);
    152 
    153     uint16x8_t vsrc_wide, vdst_wide;
    154     vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src)));
    155     vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst)));
    156 
    157     vsrc_wide += vdupq_n_u16(128);
    158     vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
    159 
    160     vdst_wide += vdupq_n_u16(128);
    161     vdst_wide += vshrq_n_u16(vdst_wide, 8);
    162 
    163     vsrc = vshrn_n_u16(vsrc_wide, 8);
    164     vdst = vshrn_n_u16(vdst_wide, 8);
    165 
    166     vsrc += vdst;
    167     vsrc = vset_lane_u8(sa, vsrc, 3);
    168 
    169     return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
    170 }
    171 
    172 //  kXor_Mode   [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc]
    173 SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) {
    174     unsigned sa = SkGetPackedA32(src);
    175     unsigned da = SkGetPackedA32(dst);
    176     unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1);
    177     unsigned isa = 255 - sa;
    178     unsigned ida = 255 - da;
    179 
    180     uint8x8_t vsrc, vdst, visa, vida;
    181     uint16x8_t vsrc_wide, vdst_wide;
    182 
    183     visa = vdup_n_u8(isa);
    184     vida = vdup_n_u8(ida);
    185     vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    186     vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    187 
    188     vsrc_wide = vmull_u8(vsrc, vida);
    189     vdst_wide = vmull_u8(vdst, visa);
    190 
    191     vsrc_wide += vdupq_n_u16(128);
    192     vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
    193 
    194     vdst_wide += vdupq_n_u16(128);
    195     vdst_wide += vshrq_n_u16(vdst_wide, 8);
    196 
    197     vsrc = vshrn_n_u16(vsrc_wide, 8);
    198     vdst = vshrn_n_u16(vdst_wide, 8);
    199 
    200     vsrc += vdst;
    201 
    202     vsrc = vset_lane_u8(ret_alpha, vsrc, 3);
    203 
    204     return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
    205 }
    206 
    207 // kPlus_Mode
    208 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) {
    209     uint8x8_t vsrc, vdst;
    210     vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    211     vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    212     vsrc = vqadd_u8(vsrc, vdst);
    213 
    214     return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
    215 }
    216 
    217 // kModulate_Mode
    218 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) {
    219     uint8x8_t vsrc, vdst, vres;
    220     uint16x8_t vres_wide;
    221 
    222     vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    223     vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    224 
    225     vres_wide = vmull_u8(vsrc, vdst);
    226 
    227     vres_wide += vdupq_n_u16(128);
    228     vres_wide += vshrq_n_u16(vres_wide, 8);
    229 
    230     vres = vshrn_n_u16(vres_wide, 8);
    231 
    232     return vget_lane_u32(vreinterpret_u32_u8(vres), 0);
    233 }
    234 
    235 ////////////////////////////////////////////////////////////////////////////////
    236 // 8 pixels modeprocs
    237 ////////////////////////////////////////////////////////////////////////////////
    238 
    239 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    240     uint8x8x4_t ret;
    241     uint16x8_t src_scale;
    242 
    243     src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
    244 
    245     ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
    246     ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
    247     ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
    248     ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);
    249 
    250     return ret;
    251 }
    252 
    253 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    254     uint8x8x4_t ret;
    255     uint16x8_t scale;
    256 
    257     scale = SkAlpha255To256_neon8(dst.val[NEON_A]);
    258 
    259     ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
    260     ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
    261     ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
    262     ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);
    263 
    264     return ret;
    265 }
    266 
    267 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    268     uint8x8x4_t ret;
    269     uint16x8_t scale;
    270 
    271     scale = SkAlpha255To256_neon8(src.val[NEON_A]);
    272 
    273     ret = SkAlphaMulQ_neon8(dst, scale);
    274 
    275     return ret;
    276 }
    277 
    278 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    279     uint8x8x4_t ret;
    280     uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
    281 
    282     ret = SkAlphaMulQ_neon8(src, scale);
    283 
    284     return ret;
    285 }
    286 
    287 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    288     uint8x8x4_t ret;
    289     uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);
    290 
    291     ret = SkAlphaMulQ_neon8(dst, scale);
    292 
    293     return ret;
    294 }
    295 
    296 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    297     uint8x8x4_t ret;
    298     uint8x8_t isa;
    299 
    300     isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    301 
    302     ret.val[NEON_A] = dst.val[NEON_A];
    303     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
    304                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    305     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
    306                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    307     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
    308                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
    309 
    310     return ret;
    311 }
    312 
    313 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    314     uint8x8x4_t ret;
    315     uint8x8_t ida;
    316 
    317     ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
    318 
    319     ret.val[NEON_A] = src.val[NEON_A];
    320     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
    321                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
    322     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
    323                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
    324     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
    325                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);
    326 
    327     return ret;
    328 }
    329 
    330 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    331     uint8x8x4_t ret;
    332     uint8x8_t isa, ida;
    333     uint16x8_t tmp_wide, tmp_wide2;
    334 
    335     isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    336     ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
    337 
    338     // First calc alpha
    339     tmp_wide = vmovl_u8(src.val[NEON_A]);
    340     tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
    341     tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
    342     tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
    343     ret.val[NEON_A] = vmovn_u16(tmp_wide);
    344 
    345     // Then colors
    346     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
    347                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    348     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
    349                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    350     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
    351                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
    352 
    353     return ret;
    354 }
    355 
    356 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    357     uint8x8x4_t ret;
    358 
    359     ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
    360     ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
    361     ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
    362     ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);
    363 
    364     return ret;
    365 }
    366 
    367 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    368     uint8x8x4_t ret;
    369 
    370     ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
    371     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
    372     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
    373     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);
    374 
    375     return ret;
    376 }
    377 
    378 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
    379     uint16x8_t tmp;
    380 
    381     tmp = vaddl_u8(a, b);
    382     tmp -= SkAlphaMulAlpha_neon8_16(a, b);
    383 
    384     return vmovn_u16(tmp);
    385 }
    386 
    387 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    388     uint8x8x4_t ret;
    389 
    390     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    391     ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
    392     ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
    393     ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);
    394 
    395     return ret;
    396 }
    397 
    398 template <bool overlay>
    399 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
    400                                                uint8x8_t sa, uint8x8_t da) {
    401     /*
    402      * In the end we're gonna use (rc + tmp) with a different rc
    403      * coming from an alternative.
    404      * The whole value (rc + tmp) can always be expressed as
    405      * VAL = COM - SUB in the if case
    406      * VAL = COM + SUB - sa*da in the else case
    407      *
    408      * with COM = 255 * (sc + dc)
    409      * and  SUB = sc*da + dc*sa - 2*dc*sc
    410      */
    411 
    412     // Prepare common subexpressions
    413     uint16x8_t const255 = vdupq_n_u16(255);
    414     uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
    415     uint16x8_t scda = vmull_u8(sc, da);
    416     uint16x8_t dcsa = vmull_u8(dc, sa);
    417     uint16x8_t sada = vmull_u8(sa, da);
    418 
    419     // Prepare non common subexpressions
    420     uint16x8_t dc2, sc2;
    421     uint32x4_t scdc2_1, scdc2_2;
    422     if (overlay) {
    423         dc2 = vshll_n_u8(dc, 1);
    424         scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
    425 #ifdef SK_CPU_ARM64
    426         scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
    427 #else
    428         scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
    429 #endif
    430     } else {
    431         sc2 = vshll_n_u8(sc, 1);
    432         scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
    433 #ifdef SK_CPU_ARM64
    434         scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
    435 #else
    436         scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
    437 #endif
    438     }
    439 
    440     // Calc COM
    441     int32x4_t com1, com2;
    442     com1 = vreinterpretq_s32_u32(
    443                 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    444     com2 = vreinterpretq_s32_u32(
    445 #ifdef SK_CPU_ARM64
    446                 vmull_high_u16(const255, sc_plus_dc));
    447 #else
    448                 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
    449 #endif
    450 
    451     // Calc SUB
    452     int32x4_t sub1, sub2;
    453     sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
    454 #ifdef SK_CPU_ARM64
    455     sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
    456 #else
    457     sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
    458 #endif
    459     sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
    460     sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
    461 
    462     // Compare 2*dc <= da
    463     uint16x8_t cmp;
    464 
    465     if (overlay) {
    466         cmp = vcleq_u16(dc2, vmovl_u8(da));
    467     } else {
    468         cmp = vcleq_u16(sc2, vmovl_u8(sa));
    469     }
    470 
    471     // Prepare variables
    472     int32x4_t val1_1, val1_2;
    473     int32x4_t val2_1, val2_2;
    474     uint32x4_t cmp1, cmp2;
    475 
    476     // Doing a signed lengthening allows to save a few instructions
    477     // thanks to sign extension.
    478     cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
    479 #ifdef SK_CPU_ARM64
    480     cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
    481 #else
    482     cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
    483 #endif
    484 
    485     // Calc COM - SUB
    486     val1_1 = com1 - sub1;
    487     val1_2 = com2 - sub2;
    488 
    489     // Calc COM + SUB - sa*da
    490     val2_1 = com1 + sub1;
    491     val2_2 = com2 + sub2;
    492 
    493     val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
    494 #ifdef SK_CPU_ARM64
    495     val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
    496 #else
    497     val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
    498 #endif
    499 
    500     // Insert where needed
    501     val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
    502     val1_2 = vbslq_s32(cmp2, val1_2, val2_2);
    503 
    504     // Call the clamp_div255round function
    505     return clamp_div255round_simd8_32(val1_1, val1_2);
    506 }
    507 
    508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
    509                                       uint8x8_t sa, uint8x8_t da) {
    510     return overlay_hardlight_color<true>(sc, dc, sa, da);
    511 }
    512 
    513 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    514     uint8x8x4_t ret;
    515 
    516     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    517     ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
    518                                     src.val[NEON_A], dst.val[NEON_A]);
    519     ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
    520                                     src.val[NEON_A], dst.val[NEON_A]);
    521     ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
    522                                     src.val[NEON_A], dst.val[NEON_A]);
    523 
    524     return ret;
    525 }
    526 
    527 template <bool lighten>
    528 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
    529                                              uint8x8_t sa, uint8x8_t da) {
    530     uint16x8_t sd, ds, cmp, tmp, tmp2;
    531 
    532     // Prepare
    533     sd = vmull_u8(sc, da);
    534     ds = vmull_u8(dc, sa);
    535 
    536     // Do test
    537     if (lighten) {
    538         cmp = vcgtq_u16(sd, ds);
    539     } else {
    540         cmp = vcltq_u16(sd, ds);
    541     }
    542 
    543     // Assign if
    544     tmp = vaddl_u8(sc, dc);
    545     tmp2 = tmp;
    546     tmp -= SkDiv255Round_neon8_16_16(ds);
    547 
    548     // Calc else
    549     tmp2 -= SkDiv255Round_neon8_16_16(sd);
    550 
    551     // Insert where needed
    552     tmp = vbslq_u16(cmp, tmp, tmp2);
    553 
    554     return vmovn_u16(tmp);
    555 }
    556 
    557 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
    558                                      uint8x8_t sa, uint8x8_t da) {
    559     return lighten_darken_color<false>(sc, dc, sa, da);
    560 }
    561 
    562 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    563     uint8x8x4_t ret;
    564 
    565     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    566     ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
    567                                    src.val[NEON_A], dst.val[NEON_A]);
    568     ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
    569                                    src.val[NEON_A], dst.val[NEON_A]);
    570     ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
    571                                    src.val[NEON_A], dst.val[NEON_A]);
    572 
    573     return ret;
    574 }
    575 
    576 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
    577                                       uint8x8_t sa, uint8x8_t da) {
    578     return lighten_darken_color<true>(sc, dc, sa, da);
    579 }
    580 
    581 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    582     uint8x8x4_t ret;
    583 
    584     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    585     ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
    586                                     src.val[NEON_A], dst.val[NEON_A]);
    587     ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
    588                                     src.val[NEON_A], dst.val[NEON_A]);
    589     ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
    590                                     src.val[NEON_A], dst.val[NEON_A]);
    591 
    592     return ret;
    593 }
    594 
    595 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
    596                                         uint8x8_t sa, uint8x8_t da) {
    597     return overlay_hardlight_color<false>(sc, dc, sa, da);
    598 }
    599 
    600 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    601     uint8x8x4_t ret;
    602 
    603     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    604     ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
    605                                       src.val[NEON_A], dst.val[NEON_A]);
    606     ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
    607                                       src.val[NEON_A], dst.val[NEON_A]);
    608     ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
    609                                       src.val[NEON_A], dst.val[NEON_A]);
    610 
    611     return ret;
    612 }
    613 
    614 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
    615                                          uint8x8_t sa, uint8x8_t da) {
    616     uint16x8_t sd, ds, tmp;
    617     int16x8_t val;
    618 
    619     sd = vmull_u8(sc, da);
    620     ds = vmull_u8(dc, sa);
    621 
    622     tmp = vminq_u16(sd, ds);
    623     tmp = SkDiv255Round_neon8_16_16(tmp);
    624     tmp = vshlq_n_u16(tmp, 1);
    625 
    626     val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));
    627 
    628     val -= vreinterpretq_s16_u16(tmp);
    629 
    630     val = vmaxq_s16(val, vdupq_n_s16(0));
    631     val = vminq_s16(val, vdupq_n_s16(255));
    632 
    633     return vmovn_u16(vreinterpretq_u16_s16(val));
    634 }
    635 
    636 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    637     uint8x8x4_t ret;
    638 
    639     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    640     ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
    641                                        src.val[NEON_A], dst.val[NEON_A]);
    642     ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
    643                                        src.val[NEON_A], dst.val[NEON_A]);
    644     ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
    645                                        src.val[NEON_A], dst.val[NEON_A]);
    646 
    647     return ret;
    648 }
    649 
    650 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
    651                                         uint8x8_t sa, uint8x8_t da) {
    652     /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */
    653 
    654     uint16x8_t sc_plus_dc, scdc, const255;
    655     int32x4_t term1_1, term1_2, term2_1, term2_2;
    656 
    657     /* Calc (sc + dc) and (sc * dc) */
    658     sc_plus_dc = vaddl_u8(sc, dc);
    659     scdc = vmull_u8(sc, dc);
    660 
    661     /* Prepare constants */
    662     const255 = vdupq_n_u16(255);
    663 
    664     /* Calc the first term */
    665     term1_1 = vreinterpretq_s32_u32(
    666                 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    667     term1_2 = vreinterpretq_s32_u32(
    668 #ifdef SK_CPU_ARM64
    669                 vmull_high_u16(const255, sc_plus_dc));
    670 #else
    671                 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
    672 #endif
    673 
    674     /* Calc the second term */
    675     term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
    676 #ifdef SK_CPU_ARM64
    677     term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
    678 #else
    679     term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
    680 #endif
    681 
    682     return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
    683 }
    684 
    685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    686     uint8x8x4_t ret;
    687 
    688     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    689     ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
    690                                       src.val[NEON_A], dst.val[NEON_A]);
    691     ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
    692                                       src.val[NEON_A], dst.val[NEON_A]);
    693     ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
    694                                       src.val[NEON_A], dst.val[NEON_A]);
    695 
    696     return ret;
    697 }
    698 
    699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
    700                                                  uint8x8_t sa, uint8x8_t da) {
    701     uint32x4_t val1, val2;
    702     uint16x8_t scdc, t1, t2;
    703 
    704     t1 = vmull_u8(sc, vdup_n_u8(255) - da);
    705     t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
    706     scdc = vmull_u8(sc, dc);
    707 
    708     val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
    709 #ifdef SK_CPU_ARM64
    710     val2 = vaddl_high_u16(t1, t2);
    711 #else
    712     val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
    713 #endif
    714 
    715     val1 = vaddw_u16(val1, vget_low_u16(scdc));
    716 #ifdef SK_CPU_ARM64
    717     val2 = vaddw_high_u16(val2, scdc);
    718 #else
    719     val2 = vaddw_u16(val2, vget_high_u16(scdc));
    720 #endif
    721 
    722     return clamp_div255round_simd8_32(
    723                 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
    724 }
    725 
    726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    727     uint8x8x4_t ret;
    728 
    729     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    730     ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
    731                                                src.val[NEON_A], dst.val[NEON_A]);
    732     ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
    733                                                src.val[NEON_A], dst.val[NEON_A]);
    734     ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
    735                                                src.val[NEON_A], dst.val[NEON_A]);
    736 
    737     return ret;
    738 }
    739 
    740 ////////////////////////////////////////////////////////////////////////////////
    741 
    742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
    743 
    744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
    745 
    746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
    747         : INHERITED(buffer) {
    748     fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
    749 }
    750 
    751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
    752                                      int count, const SkAlpha aa[]) const {
    753     SkASSERT(dst && src && count >= 0);
    754 
    755     SkXfermodeProc proc = this->getProc();
    756     SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    757     SkASSERT(procSIMD != NULL);
    758 
    759     if (NULL == aa) {
    760         // Unrolled NEON code
    761         while (count >= 8) {
    762             uint8x8x4_t vsrc, vdst, vres;
    763 
    764 #ifdef SK_CPU_ARM64
    765             vsrc = vld4_u8((uint8_t*)src);
    766             vdst = vld4_u8((uint8_t*)dst);
    767 #else
    768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
    769             asm volatile (
    770                 "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
    771                 "vld4.u8    %h[vdst], [%[dst]]   \t\n"
    772                 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
    773                 : [dst] "r" (dst)
    774                 :
    775             );
    776 #else
    777             register uint8x8_t d0 asm("d0");
    778             register uint8x8_t d1 asm("d1");
    779             register uint8x8_t d2 asm("d2");
    780             register uint8x8_t d3 asm("d3");
    781             register uint8x8_t d4 asm("d4");
    782             register uint8x8_t d5 asm("d5");
    783             register uint8x8_t d6 asm("d6");
    784             register uint8x8_t d7 asm("d7");
    785 
    786             asm volatile (
    787                 "vld4.u8    {d0-d3},[%[src]]!;"
    788                 "vld4.u8    {d4-d7},[%[dst]];"
    789                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
    790                   "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
    791                   [src] "+&r" (src)
    792                 : [dst] "r" (dst)
    793                 :
    794             );
    795             vsrc.val[0] = d0; vdst.val[0] = d4;
    796             vsrc.val[1] = d1; vdst.val[1] = d5;
    797             vsrc.val[2] = d2; vdst.val[2] = d6;
    798             vsrc.val[3] = d3; vdst.val[3] = d7;
    799 #endif
    800 #endif // #ifdef SK_CPU_ARM64
    801 
    802             vres = procSIMD(vsrc, vdst);
    803 
    804             vst4_u8((uint8_t*)dst, vres);
    805 
    806             count -= 8;
    807             dst += 8;
    808 #ifdef SK_CPU_ARM64
    809             src += 8;
    810 #endif
    811         }
    812         // Leftovers
    813         for (int i = 0; i < count; i++) {
    814             dst[i] = proc(src[i], dst[i]);
    815         }
    816     } else {
    817         for (int i = count - 1; i >= 0; --i) {
    818             unsigned a = aa[i];
    819             if (0 != a) {
    820                 SkPMColor dstC = dst[i];
    821                 SkPMColor C = proc(src[i], dstC);
    822                 if (a != 0xFF) {
    823                     C = SkFourByteInterp_neon(C, dstC, a);
    824                 }
    825                 dst[i] = C;
    826             }
    827         }
    828     }
    829 }
    830 
    831 void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
    832                                      const SkPMColor* SK_RESTRICT src, int count,
    833                                      const SkAlpha* SK_RESTRICT aa) const {
    834     SkASSERT(dst && src && count >= 0);
    835 
    836     SkXfermodeProc proc = this->getProc();
    837     SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    838     SkASSERT(procSIMD != NULL);
    839 
    840     if (NULL == aa) {
    841         while(count >= 8) {
    842             uint16x8_t vdst, vres16;
    843             uint8x8x4_t vdst32, vsrc, vres;
    844 
    845             vdst = vld1q_u16(dst);
    846 
    847 #ifdef SK_CPU_ARM64
    848             vsrc = vld4_u8((uint8_t*)src);
    849 #else
    850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
    851             asm volatile (
    852                 "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
    853                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
    854                 : :
    855             );
    856 #else
    857             register uint8x8_t d0 asm("d0");
    858             register uint8x8_t d1 asm("d1");
    859             register uint8x8_t d2 asm("d2");
    860             register uint8x8_t d3 asm("d3");
    861 
    862             asm volatile (
    863                 "vld4.u8    {d0-d3},[%[src]]!;"
    864                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
    865                   [src] "+&r" (src)
    866                 : :
    867             );
    868             vsrc.val[0] = d0;
    869             vsrc.val[1] = d1;
    870             vsrc.val[2] = d2;
    871             vsrc.val[3] = d3;
    872 #endif
    873 #endif // #ifdef SK_CPU_ARM64
    874 
    875             vdst32 = SkPixel16ToPixel32_neon8(vdst);
    876             vres = procSIMD(vsrc, vdst32);
    877             vres16 = SkPixel32ToPixel16_neon8(vres);
    878 
    879             vst1q_u16(dst, vres16);
    880 
    881             count -= 8;
    882             dst += 8;
    883 #ifdef SK_CPU_ARM64
    884             src += 8;
    885 #endif
    886         }
    887         for (int i = 0; i < count; i++) {
    888             SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
    889             dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
    890         }
    891     } else {
    892         for (int i = count - 1; i >= 0; --i) {
    893             unsigned a = aa[i];
    894             if (0 != a) {
    895                 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
    896                 SkPMColor C = proc(src[i], dstC);
    897                 if (0xFF != a) {
    898                     C = SkFourByteInterp_neon(C, dstC, a);
    899                 }
    900                 dst[i] = SkPixel32ToPixel16_ToU16(C);
    901             }
    902         }
    903     }
    904 }
    905 
    906 #ifndef SK_IGNORE_TO_STRING
    907 void SkNEONProcCoeffXfermode::toString(SkString* str) const {
    908     this->INHERITED::toString(str);
    909 }
    910 #endif
    911 
    912 ////////////////////////////////////////////////////////////////////////////////
    913 
    914 SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
    915     NULL, // kClear_Mode
    916     NULL, // kSrc_Mode
    917     NULL, // kDst_Mode
    918     NULL, // kSrcOver_Mode
    919     dstover_modeproc_neon8,
    920     srcin_modeproc_neon8,
    921     dstin_modeproc_neon8,
    922     srcout_modeproc_neon8,
    923     dstout_modeproc_neon8,
    924     srcatop_modeproc_neon8,
    925     dstatop_modeproc_neon8,
    926     xor_modeproc_neon8,
    927     plus_modeproc_neon8,
    928     modulate_modeproc_neon8,
    929     screen_modeproc_neon8,
    930 
    931     overlay_modeproc_neon8,
    932     darken_modeproc_neon8,
    933     lighten_modeproc_neon8,
    934     NULL, // kColorDodge_Mode
    935     NULL, // kColorBurn_Mode
    936     hardlight_modeproc_neon8,
    937     NULL, // kSoftLight_Mode
    938     difference_modeproc_neon8,
    939     exclusion_modeproc_neon8,
    940     multiply_modeproc_neon8,
    941 
    942     NULL, // kHue_Mode
    943     NULL, // kSaturation_Mode
    944     NULL, // kColor_Mode
    945     NULL, // kLuminosity_Mode
    946 };
    947 
    948 SK_COMPILE_ASSERT(
    949     SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
    950     mode_count_arm
    951 );
    952 
    953 SkXfermodeProc gNEONXfermodeProcs1[] = {
    954     NULL, // kClear_Mode
    955     NULL, // kSrc_Mode
    956     NULL, // kDst_Mode
    957     NULL, // kSrcOver_Mode
    958     NULL, // kDstOver_Mode
    959     NULL, // kSrcIn_Mode
    960     NULL, // kDstIn_Mode
    961     NULL, // kSrcOut_Mode
    962     NULL, // kDstOut_Mode
    963     srcatop_modeproc_neon,
    964     dstatop_modeproc_neon,
    965     xor_modeproc_neon,
    966     plus_modeproc_neon,
    967     modulate_modeproc_neon,
    968     NULL, // kScreen_Mode
    969 
    970     NULL, // kOverlay_Mode
    971     NULL, // kDarken_Mode
    972     NULL, // kLighten_Mode
    973     NULL, // kColorDodge_Mode
    974     NULL, // kColorBurn_Mode
    975     NULL, // kHardLight_Mode
    976     NULL, // kSoftLight_Mode
    977     NULL, // kDifference_Mode
    978     NULL, // kExclusion_Mode
    979     NULL, // kMultiply_Mode
    980 
    981     NULL, // kHue_Mode
    982     NULL, // kSaturation_Mode
    983     NULL, // kColor_Mode
    984     NULL, // kLuminosity_Mode
    985 };
    986 
    987 SK_COMPILE_ASSERT(
    988     SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
    989     mode1_count_arm
    990 );
    991 
    992 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
    993                                                          SkXfermode::Mode mode) {
    994 
    995     void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);
    996 
    997     if (procSIMD != NULL) {
    998         return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
    999     }
   1000     return NULL;
   1001 }
   1002 
   1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
   1004     return gNEONXfermodeProcs1[mode];
   1005 }
   1006