Home | History | Annotate | Download | only in opts
      1 #include "SkXfermode.h"
      2 #include "SkXfermode_proccoeff.h"
      3 #include "SkColorPriv.h"
      4 
      5 #include <arm_neon.h>
      6 #include "SkColor_opts_neon.h"
      7 #include "SkXfermode_opts_arm_neon.h"
      8 
      9 #define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)
     10 
     11 
     12 ////////////////////////////////////////////////////////////////////////////////
     13 // NEONized skia functions
     14 ////////////////////////////////////////////////////////////////////////////////
     15 
     16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
     17     uint16x8_t tmp;
     18     uint8x8_t ret;
     19 
     20     tmp = vmull_u8(color, alpha);
     21     tmp = vaddq_u16(tmp, vdupq_n_u16(128));
     22     tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));
     23 
     24     ret = vshrn_n_u16(tmp, 8);
     25 
     26     return ret;
     27 }
     28 
     29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
     30     uint16x8_t ret;
     31 
     32     ret = vmull_u8(color, alpha);
     33     ret = vaddq_u16(ret, vdupq_n_u16(128));
     34     ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));
     35 
     36     ret = vshrq_n_u16(ret, 8);
     37 
     38     return ret;
     39 }
     40 
// Narrow two int32x4 halves (p1 = low four lanes, p2 = high four) to a single
// uint16x8, then apply the rounded divide-by-255 and narrow to 8 bits.
// NOTE(review): the narrowing to 16 bits assumes each lane value fits in
// 16 bits (i.e. callers pre-clamp to [0, 255*255]) — confirm at call sites.
static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
    uint16x8_t tmp;

#ifdef SK_CPU_ARM64
    // vmovn_high narrows p2 directly into the upper half, saving a vcombine.
    tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
                         vreinterpretq_u32_s32(p2));
#else
    tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
                       vmovn_u32(vreinterpretq_u32_s32(p2)));
#endif

    // Rounded div255: (x + 128 + ((x + 128) >> 8)) >> 8.
    tmp += vdupq_n_u16(128);
    tmp += vshrq_n_u16(tmp, 8);

    return vshrn_n_u16(tmp, 8);
}
     57 
     58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
     59     prod += vdupq_n_u16(128);
     60     prod += vshrq_n_u16(prod, 8);
     61 
     62     return vshrq_n_u16(prod, 8);
     63 }
     64 
// Clamp eight 32-bit lane values (low four in val1, high four in val2) into
// [0, 255*255] and then divide by 255 with rounding, producing 8-bit lanes:
//   value <= 0        -> 0
//   value >= 255*255  -> 255
//   otherwise         -> SkDiv255Round(value)
static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
    uint8x8_t ret;
    uint32x4_t cmp1, cmp2;
    uint16x8_t cmp16;
    uint8x8_t cmp8, cmp8_1;

    // Test if <= 0
    cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
    cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
#ifdef SK_CPU_ARM64
    cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
#else
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
#endif
    // Narrowing an all-ones/all-zeros compare mask preserves it as a mask.
    cmp8_1 = vmovn_u16(cmp16);

    // Init to zero
    ret = vdup_n_u8(0);

    // Test if >= 255*255
    cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
    cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
#ifdef SK_CPU_ARM64
    cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
#else
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
#endif
    cmp8 = vmovn_u16(cmp16);

    // Insert 255 where true
    ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);

    // Calc SkDiv255Round
    uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);

    // Insert where false and previous test false: where either compare
    // fired keep the clamped 0/255 already in ret, else take the divide.
    cmp8 = cmp8 | cmp8_1;
    ret = vbsl_u8(cmp8, ret, div);

    // Return the final combination
    return ret;
}
    107 
    108 ////////////////////////////////////////////////////////////////////////////////
    109 // 1 pixel modeprocs
    110 ////////////////////////////////////////////////////////////////////////////////
    111 
//  kSrcATop_Mode,  //!< [Da, Sc * Da + (1 - Sa) * Dc]
// Single-pixel src-atop: all four components of the packed pixel are blended
// in parallel in one 8-byte NEON register, then alpha is overwritten with Da.
SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    unsigned isa = 255 - sa;

    uint8x8_t vda, visa, vsrc, vdst;

    // Broadcast Da and (255 - Sa) across all byte lanes.
    vda = vdup_n_u8(da);
    visa = vdup_n_u8(isa);

    // Widening multiplies: src * Da and dst * (255 - Sa).
    uint16x8_t vsrc_wide, vdst_wide;
    vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src)));
    vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst)));

    // Rounded div255 on each 16-bit product: (x + 128 + ((x+128) >> 8)) >> 8.
    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    // Sum both terms, then force the alpha component to Da.
    // NOTE(review): lane 3 assumes the alpha byte sits at bits 24..31 of the
    // packed pixel — confirm against SK_A32_SHIFT.
    vsrc += vdst;
    vsrc = vset_lane_u8(da, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}
    141 
//  kDstATop_Mode,  //!< [Sa, Sa * Dc + Sc * (1 - Da)]
// Single-pixel dst-atop: mirror image of srcatop with src/dst roles swapped;
// the result alpha is forced to Sa.
SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    unsigned ida = 255 - da;

    uint8x8_t vsa, vida, vsrc, vdst;

    // Broadcast Sa and (255 - Da) across all byte lanes.
    vsa = vdup_n_u8(sa);
    vida = vdup_n_u8(ida);

    // Widening multiplies: src * (255 - Da) and dst * Sa.
    uint16x8_t vsrc_wide, vdst_wide;
    vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src)));
    vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst)));

    // Rounded div255 on each 16-bit product.
    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    // Sum both terms and overwrite alpha with Sa.
    // NOTE(review): lane 3 assumes alpha at bits 24..31 — confirm SK_A32_SHIFT.
    vsrc += vdst;
    vsrc = vset_lane_u8(sa, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}
    171 
//  kXor_Mode   [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc]
// Single-pixel xor: alpha is computed in scalar code, the color components
// in a single 8-byte NEON register; alpha is then patched into lane 3.
SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    // Scalar alpha: Sa + Da - 2*SkAlphaMulAlpha(Sa, Da).
    unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1);
    unsigned isa = 255 - sa;
    unsigned ida = 255 - da;

    uint8x8_t vsrc, vdst, visa, vida;
    uint16x8_t vsrc_wide, vdst_wide;

    // Broadcast the inverse alphas; load both pixels into byte vectors.
    visa = vdup_n_u8(isa);
    vida = vdup_n_u8(ida);
    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));

    // Widening multiplies: src * (255 - Da) and dst * (255 - Sa).
    vsrc_wide = vmull_u8(vsrc, vida);
    vdst_wide = vmull_u8(vdst, visa);

    // Rounded div255 on each 16-bit product.
    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    vsrc += vdst;

    // NOTE(review): lane 3 assumes alpha at bits 24..31 — confirm SK_A32_SHIFT.
    vsrc = vset_lane_u8(ret_alpha, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}
    206 
    207 // kPlus_Mode
    208 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) {
    209     uint8x8_t vsrc, vdst;
    210     vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    211     vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    212     vsrc = vqadd_u8(vsrc, vdst);
    213 
    214     return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
    215 }
    216 
    217 // kModulate_Mode
    218 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) {
    219     uint8x8_t vsrc, vdst, vres;
    220     uint16x8_t vres_wide;
    221 
    222     vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    223     vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    224 
    225     vres_wide = vmull_u8(vsrc, vdst);
    226 
    227     vres_wide += vdupq_n_u16(128);
    228     vres_wide += vshrq_n_u16(vres_wide, 8);
    229 
    230     vres = vshrn_n_u16(vres_wide, 8);
    231 
    232     return vget_lane_u32(vreinterpret_u32_u8(vres), 0);
    233 }
    234 
    235 ////////////////////////////////////////////////////////////////////////////////
    236 // 8 pixels modeprocs
    237 ////////////////////////////////////////////////////////////////////////////////
    238 
    239 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    240     uint8x8x4_t ret;
    241     uint16x8_t src_scale;
    242 
    243     src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
    244 
    245     ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
    246     ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
    247     ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
    248     ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);
    249 
    250     return ret;
    251 }
    252 
    253 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    254     uint8x8x4_t ret;
    255     uint16x8_t scale;
    256 
    257     scale = SkAlpha255To256_neon8(dst.val[NEON_A]);
    258 
    259     ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
    260     ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
    261     ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
    262     ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);
    263 
    264     return ret;
    265 }
    266 
    267 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    268     uint8x8x4_t ret;
    269     uint16x8_t scale;
    270 
    271     scale = SkAlpha255To256_neon8(src.val[NEON_A]);
    272 
    273     ret = SkAlphaMulQ_neon8(dst, scale);
    274 
    275     return ret;
    276 }
    277 
    278 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    279     uint8x8x4_t ret;
    280     uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
    281 
    282     ret = SkAlphaMulQ_neon8(src, scale);
    283 
    284     return ret;
    285 }
    286 
    287 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    288     uint8x8x4_t ret;
    289     uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);
    290 
    291     ret = SkAlphaMulQ_neon8(dst, scale);
    292 
    293     return ret;
    294 }
    295 
    296 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    297     uint8x8x4_t ret;
    298     uint8x8_t isa;
    299 
    300     isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    301 
    302     ret.val[NEON_A] = dst.val[NEON_A];
    303     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
    304                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    305     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
    306                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    307     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
    308                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
    309 
    310     return ret;
    311 }
    312 
    313 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    314     uint8x8x4_t ret;
    315     uint8x8_t ida;
    316 
    317     ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
    318 
    319     ret.val[NEON_A] = src.val[NEON_A];
    320     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
    321                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
    322     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
    323                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
    324     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
    325                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);
    326 
    327     return ret;
    328 }
    329 
    330 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    331     uint8x8x4_t ret;
    332     uint8x8_t isa, ida;
    333     uint16x8_t tmp_wide, tmp_wide2;
    334 
    335     isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    336     ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
    337 
    338     // First calc alpha
    339     tmp_wide = vmovl_u8(src.val[NEON_A]);
    340     tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
    341     tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
    342     tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
    343     ret.val[NEON_A] = vmovn_u16(tmp_wide);
    344 
    345     // Then colors
    346     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
    347                       + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    348     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
    349                       + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    350     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
    351                       + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
    352 
    353     return ret;
    354 }
    355 
    356 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    357     uint8x8x4_t ret;
    358 
    359     ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
    360     ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
    361     ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
    362     ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);
    363 
    364     return ret;
    365 }
    366 
    367 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    368     uint8x8x4_t ret;
    369 
    370     ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
    371     ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
    372     ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
    373     ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);
    374 
    375     return ret;
    376 }
    377 
    378 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
    379     uint16x8_t tmp;
    380 
    381     tmp = vaddl_u8(a, b);
    382     tmp -= SkAlphaMulAlpha_neon8_16(a, b);
    383 
    384     return vmovn_u16(tmp);
    385 }
    386 
    387 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    388     uint8x8x4_t ret;
    389 
    390     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    391     ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
    392     ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
    393     ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);
    394 
    395     return ret;
    396 }
    397 
// Shared per-channel implementation of overlay (overlay == true) and
// hardlight (overlay == false). The two modes use the same formula with
// src/dst roles swapped in the "2*x <= a" test and in the 2*sc*dc term.
// All math is widened to 32 bits; the result is clamped and div255-rounded.
template <bool overlay>
static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                               uint8x8_t sa, uint8x8_t da) {
    /*
     * In the end we're gonna use (rc + tmp) with a different rc
     * coming from an alternative.
     * The whole value (rc + tmp) can always be expressed as
     * VAL = COM - SUB in the if case
     * VAL = COM + SUB - sa*da in the else case
     *
     * with COM = 255 * (sc + dc)
     * and  SUB = sc*da + dc*sa - 2*dc*sc
     */

    // Prepare common subexpressions
    uint16x8_t const255 = vdupq_n_u16(255);
    uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
    uint16x8_t scda = vmull_u8(sc, da);
    uint16x8_t dcsa = vmull_u8(dc, sa);
    uint16x8_t sada = vmull_u8(sa, da);

    // Prepare non common subexpressions: 2*dc*sc split into low/high
    // 32-bit halves (scdc2_1 low four lanes, scdc2_2 high four).
    uint16x8_t dc2, sc2;
    uint32x4_t scdc2_1, scdc2_2;
    if (overlay) {
        dc2 = vshll_n_u8(dc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
#ifdef SK_CPU_ARM64
        scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
#else
        scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
#endif
    } else {
        sc2 = vshll_n_u8(sc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
#ifdef SK_CPU_ARM64
        scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
#else
        scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
#endif
    }

    // Calc COM = 255 * (sc + dc), low/high 32-bit halves
    int32x4_t com1, com2;
    com1 = vreinterpretq_s32_u32(
                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    com2 = vreinterpretq_s32_u32(
#ifdef SK_CPU_ARM64
                vmull_high_u16(const255, sc_plus_dc));
#else
                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
#endif

    // Calc SUB = sc*da + dc*sa - 2*dc*sc
    int32x4_t sub1, sub2;
    sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
#ifdef SK_CPU_ARM64
    sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
#else
    sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
#endif
    sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
    sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));

    // Compare 2*dc <= da (overlay) or 2*sc <= sa (hardlight)
    uint16x8_t cmp;

    if (overlay) {
        cmp = vcleq_u16(dc2, vmovl_u8(da));
    } else {
        cmp = vcleq_u16(sc2, vmovl_u8(sa));
    }

    // Prepare variables
    int32x4_t val1_1, val1_2;
    int32x4_t val2_1, val2_2;
    uint32x4_t cmp1, cmp2;

    // Doing a signed lengthening allows to save a few instructions
    // thanks to sign extension (the 16-bit masks widen to 32-bit masks).
    cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
#ifdef SK_CPU_ARM64
    cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
#else
    cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
#endif

    // Calc COM - SUB
    val1_1 = com1 - sub1;
    val1_2 = com2 - sub2;

    // Calc COM + SUB - sa*da
    val2_1 = com1 + sub1;
    val2_2 = com2 + sub2;

    val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
#ifdef SK_CPU_ARM64
    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
#else
    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
#endif

    // Insert where needed: take COM - SUB where the compare fired.
    val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
    val1_2 = vbslq_s32(cmp2, val1_2, val2_2);

    // Call the clamp_div255round function
    return clamp_div255round_simd8_32(val1_1, val1_2);
}
    507 
// Per-channel overlay: the overlay == true instantiation of the shared
// overlay/hardlight implementation.
static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<true>(sc, dc, sa, da);
}
    512 
    513 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    514     uint8x8x4_t ret;
    515 
    516     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    517     ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
    518                                     src.val[NEON_A], dst.val[NEON_A]);
    519     ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
    520                                     src.val[NEON_A], dst.val[NEON_A]);
    521     ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
    522                                     src.val[NEON_A], dst.val[NEON_A]);
    523 
    524     return ret;
    525 }
    526 
// Shared per-channel implementation of lighten (lighten == true) and darken
// (lighten == false). Both compute sc + dc - SkDiv255Round(x) where x is the
// smaller/larger of sc*da and dc*sa, selected per 16-bit lane by a mask.
template <bool lighten>
static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
                                             uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, cmp, tmp, tmp2;

    // Prepare: sd = sc*da, ds = dc*sa (16-bit widening products).
    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    // Do test: lighten selects where sc*da > dc*sa, darken where <.
    if (lighten) {
        cmp = vcgtq_u16(sd, ds);
    } else {
        cmp = vcltq_u16(sd, ds);
    }

    // Assign if: sc + dc - SkDiv255Round(dc*sa).
    tmp = vaddl_u8(sc, dc);
    tmp2 = tmp;
    tmp -= SkDiv255Round_neon8_16_16(ds);

    // Calc else: sc + dc - SkDiv255Round(sc*da).
    tmp2 -= SkDiv255Round_neon8_16_16(sd);

    // Insert where needed, per lane, driven by the compare mask.
    tmp = vbslq_u16(cmp, tmp, tmp2);

    return vmovn_u16(tmp);
}
    556 
// Per-channel darken: the lighten == false instantiation.
static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
                                     uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<false>(sc, dc, sa, da);
}
    561 
    562 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    563     uint8x8x4_t ret;
    564 
    565     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    566     ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
    567                                    src.val[NEON_A], dst.val[NEON_A]);
    568     ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
    569                                    src.val[NEON_A], dst.val[NEON_A]);
    570     ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
    571                                    src.val[NEON_A], dst.val[NEON_A]);
    572 
    573     return ret;
    574 }
    575 
// Per-channel lighten: the lighten == true instantiation.
static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<true>(sc, dc, sa, da);
}
    580 
    581 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    582     uint8x8x4_t ret;
    583 
    584     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    585     ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
    586                                     src.val[NEON_A], dst.val[NEON_A]);
    587     ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
    588                                     src.val[NEON_A], dst.val[NEON_A]);
    589     ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
    590                                     src.val[NEON_A], dst.val[NEON_A]);
    591 
    592     return ret;
    593 }
    594 
// Per-channel hardlight: the overlay == false instantiation of the shared
// overlay/hardlight implementation.
static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<false>(sc, dc, sa, da);
}
    599 
    600 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    601     uint8x8x4_t ret;
    602 
    603     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    604     ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
    605                                       src.val[NEON_A], dst.val[NEON_A]);
    606     ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
    607                                       src.val[NEON_A], dst.val[NEON_A]);
    608     ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
    609                                       src.val[NEON_A], dst.val[NEON_A]);
    610 
    611     return ret;
    612 }
    613 
// Per-channel difference: sc + dc - 2*SkDiv255Round(min(sc*da, dc*sa)),
// computed signed so the subtraction can underflow, then clamped to [0, 255].
static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
                                         uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, tmp;
    int16x8_t val;

    // sd = sc*da, ds = dc*sa.
    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    // tmp = 2 * SkDiv255Round(min(sd, ds)).
    tmp = vminq_u16(sd, ds);
    tmp = SkDiv255Round_neon8_16_16(tmp);
    tmp = vshlq_n_u16(tmp, 1);

    // val = sc + dc - tmp, as signed 16-bit lanes.
    val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));

    val -= vreinterpretq_s16_u16(tmp);

    // Clamp to [0, 255] before narrowing back to bytes.
    val = vmaxq_s16(val, vdupq_n_s16(0));
    val = vminq_s16(val, vdupq_n_s16(255));

    return vmovn_u16(vreinterpretq_u16_s16(val));
}
    635 
    636 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    637     uint8x8x4_t ret;
    638 
    639     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    640     ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
    641                                        src.val[NEON_A], dst.val[NEON_A]);
    642     ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
    643                                        src.val[NEON_A], dst.val[NEON_A]);
    644     ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
    645                                        src.val[NEON_A], dst.val[NEON_A]);
    646 
    647     return ret;
    648 }
    649 
// Per-channel exclusion. Note sa/da are unused: the formula reduces as below.
static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */

    uint16x8_t sc_plus_dc, scdc, const255;
    int32x4_t term1_1, term1_2, term2_1, term2_2;

    /* Calc (sc + dc) and (sc * dc) */
    sc_plus_dc = vaddl_u8(sc, dc);
    scdc = vmull_u8(sc, dc);

    /* Prepare constants */
    const255 = vdupq_n_u16(255);

    /* Calc the first term: 255 * (sc + dc), low/high 32-bit halves */
    term1_1 = vreinterpretq_s32_u32(
                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    term1_2 = vreinterpretq_s32_u32(
#ifdef SK_CPU_ARM64
                vmull_high_u16(const255, sc_plus_dc));
#else
                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
#endif

    /* Calc the second term: 2 * sc * dc, widened to 32 bits */
    term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
#ifdef SK_CPU_ARM64
    term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
#else
    term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
#endif

    /* Clamp the difference to [0, 255*255] and divide by 255 with rounding */
    return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
}
    684 
    685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    686     uint8x8x4_t ret;
    687 
    688     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    689     ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
    690                                       src.val[NEON_A], dst.val[NEON_A]);
    691     ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
    692                                       src.val[NEON_A], dst.val[NEON_A]);
    693     ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
    694                                       src.val[NEON_A], dst.val[NEON_A]);
    695 
    696     return ret;
    697 }
    698 
// Per-channel multiply blend:
//   clamp_div255round(sc*(255-da) + dc*(255-sa) + sc*dc)
static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
                                                 uint8x8_t sa, uint8x8_t da) {
    uint32x4_t val1, val2;
    uint16x8_t scdc, t1, t2;

    // t1 = sc*(255-da), t2 = dc*(255-sa), scdc = sc*dc (16-bit products).
    t1 = vmull_u8(sc, vdup_n_u8(255) - da);
    t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
    scdc = vmull_u8(sc, dc);

    // Widen and sum t1 + t2 into low/high 32-bit halves.
    val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
#ifdef SK_CPU_ARM64
    val2 = vaddl_high_u16(t1, t2);
#else
    val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
#endif

    // Add the sc*dc term.
    val1 = vaddw_u16(val1, vget_low_u16(scdc));
#ifdef SK_CPU_ARM64
    val2 = vaddw_high_u16(val2, scdc);
#else
    val2 = vaddw_u16(val2, vget_high_u16(scdc));
#endif

    // Clamp to [0, 255*255] and divide by 255 with rounding.
    return clamp_div255round_simd8_32(
                vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
}
    725 
    726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    727     uint8x8x4_t ret;
    728 
    729     ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    730     ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
    731                                                src.val[NEON_A], dst.val[NEON_A]);
    732     ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
    733                                                src.val[NEON_A], dst.val[NEON_A]);
    734     ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
    735                                                src.val[NEON_A], dst.val[NEON_A]);
    736 
    737     return ret;
    738 }
    739 
    740 ////////////////////////////////////////////////////////////////////////////////
    741 
    742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
    743 
    744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
    745 
// Blends 'count' premultiplied 32-bit pixels in place: dst[i] = mode(src[i], dst[i]).
// When there is no coverage array (aa == NULL), pixels are processed 8 at a
// time with the NEON proc and any remainder with the scalar proc.  With
// coverage, every pixel goes through the scalar proc and the result is lerped
// toward the original dst by the coverage value.
void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src, int count,
                                     const SkAlpha* SK_RESTRICT aa) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();  // scalar one-pixel fallback
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

    if (NULL == aa) {
        // Unrolled NEON code
        // We'd like to just do this (modulo a few casts):
        // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst)));
        // src += 8;
        // dst += 8;
        // but that tends to generate miserable code. Here are a bunch of faster
        // workarounds for different architectures and compilers.
        while (count >= 8) {

#ifdef SK_CPU_ARM32
            uint8x8x4_t vsrc, vdst, vres;
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            // GCC > 4.6 supports the %h operand modifier, so the compiler can
            // choose the register quads itself.  vld4.u8 de-interleaves 8
            // pixels into 4 planar d-registers; the '!' post-increments src.
            // NOTE(review): this asm reads the memory at dst but declares no
            // "memory" clobber -- presumably safe because the matching store
            // below is an intrinsic; confirm against upstream Skia.
            asm volatile (
                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
                "vld4.u8    %h[vdst], [%[dst]]   \t\n"
                : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
#else
            // Older GCC lacks %h: pin the loads to d0-d7 explicitly and copy
            // the registers into the uint8x8x4_t structs afterwards.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");
            register uint8x8_t d4 asm("d4");
            register uint8x8_t d5 asm("d5");
            register uint8x8_t d6 asm("d6");
            register uint8x8_t d7 asm("d7");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                "vld4.u8    {d4-d7},[%[dst]];"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
                  [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
            vsrc.val[0] = d0; vdst.val[0] = d4;
            vsrc.val[1] = d1; vdst.val[1] = d5;
            vsrc.val[2] = d2; vdst.val[2] = d6;
            vsrc.val[3] = d3; vdst.val[3] = d7;
#endif

            vres = procSIMD(vsrc, vdst);

            vst4_u8((uint8_t*)dst, vres);

            dst += 8;

#else // #ifdef SK_CPU_ARM32

            // AArch64: load 8 src and 8 dst pixels planar into v0-v3 / v4-v7,
            // call the proc indirectly (it takes/returns its arguments in
            // v0-v7 per the AAPCS64), and store the result back to dst.
            // src and dst are both post-incremented by the asm itself.
            asm volatile (
                "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
                "ld4    {v4.8b - v7.8b}, [%[dst]]      \t\n"
                "blr    %[proc]                        \t\n"
                "st4    {v0.8b - v3.8b}, [%[dst]], #32 \t\n"
                : [src] "+&r" (src), [dst] "+&r" (dst)
                : [proc] "r" (procSIMD)
                : "cc", "memory",
                  /* We don't know what proc is going to clobber so we must
                   * add everything that is not callee-saved.
                   */
                  // x19-x28 and v8-v15 are callee-saved under AAPCS64 and so
                  // are deliberately absent from this list.
                  "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
                  "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18",
                  "x30", /* x30 implicitly clobbered by blr */
                  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
                  "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
                  "v27", "v28", "v29", "v30", "v31"
            );

#endif // #ifdef SK_CPU_ARM32

            count -= 8;
        }
        // Leftovers
        for (int i = 0; i < count; i++) {
            dst[i] = proc(src[i], dst[i]);
        }
    } else {
        // Coverage path: scalar blend, then lerp result toward original dst
        // by the per-pixel coverage a (skipping fully transparent coverage).
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = dst[i];
                SkPMColor C = proc(src[i], dstC);
                if (a != 0xFF) {
                    C = SkFourByteInterp_neon(C, dstC, a);
                }
                dst[i] = C;
            }
        }
    }
}
    849 
// Blends 'count' premultiplied 32-bit src pixels into a 16-bit (565) dst:
// each dst pixel is expanded to 8888, blended, and compressed back to 565.
// 8 pixels at a time via NEON when there is no coverage array; scalar
// otherwise (and for the remainder).
void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src, int count,
                                     const SkAlpha* SK_RESTRICT aa) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();  // scalar one-pixel fallback
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

    if (NULL == aa) {
        while(count >= 8) {
            uint16x8_t vdst, vres16;
            uint8x8x4_t vdst32, vsrc, vres;

            vdst = vld1q_u16(dst);  // 8 565 dst pixels

            // Load 8 src pixels planar.  On ARM32 the asm variants below
            // post-increment src themselves; on ARM64 the intrinsic does not,
            // so src is advanced in C at the bottom of the loop.
#ifdef SK_CPU_ARM64
            vsrc = vld4_u8((uint8_t*)src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            // GCC > 4.6: %h modifier lets the compiler allocate the quad.
            asm volatile (
                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            // Older GCC: pin the load to d0-d3 and copy into the struct.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // 565 -> 8888, blend, 8888 -> 565.
            vdst32 = SkPixel16ToPixel32_neon8(vdst);
            vres = procSIMD(vsrc, vdst32);
            vres16 = SkPixel32ToPixel16_neon8(vres);

            vst1q_u16(dst, vres16);

            count -= 8;
            dst += 8;
#ifdef SK_CPU_ARM64
            src += 8;
#endif
        }
        // Leftovers: scalar expand/blend/compress.
        for (int i = 0; i < count; i++) {
            SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
            dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
        }
    } else {
        // Coverage path: scalar blend in 8888, lerp toward original dst by
        // coverage, then compress back to 565.
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
                SkPMColor C = proc(src[i], dstC);
                if (0xFF != a) {
                    C = SkFourByteInterp_neon(C, dstC, a);
                }
                dst[i] = SkPixel32ToPixel16_ToU16(C);
            }
        }
    }
}
    924 
    925 #ifndef SK_IGNORE_TO_STRING
    926 void SkNEONProcCoeffXfermode::toString(SkString* str) const {
    927     this->INHERITED::toString(str);
    928 }
    929 #endif
    930 
    931 ////////////////////////////////////////////////////////////////////////////////
    932 
// 8-pixel NEON procs, indexed by SkXfermode::Mode.  A NULL entry means the
// mode has no SIMD implementation here and the generic path is used instead
// (see SkPlatformXfermodeFactory_impl_neon below).
SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    dstover_modeproc_neon8,   // kDstOver_Mode
    srcin_modeproc_neon8,     // kSrcIn_Mode
    dstin_modeproc_neon8,     // kDstIn_Mode
    srcout_modeproc_neon8,    // kSrcOut_Mode
    dstout_modeproc_neon8,    // kDstOut_Mode
    srcatop_modeproc_neon8,   // kSrcATop_Mode
    dstatop_modeproc_neon8,   // kDstATop_Mode
    xor_modeproc_neon8,       // kXor_Mode
    plus_modeproc_neon8,      // kPlus_Mode
    modulate_modeproc_neon8,  // kModulate_Mode
    screen_modeproc_neon8,    // kScreen_Mode

    overlay_modeproc_neon8,   // kOverlay_Mode
    darken_modeproc_neon8,    // kDarken_Mode
    lighten_modeproc_neon8,   // kLighten_Mode
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    hardlight_modeproc_neon8, // kHardLight_Mode
    NULL, // kSoftLight_Mode
    difference_modeproc_neon8, // kDifference_Mode
    exclusion_modeproc_neon8,  // kExclusion_Mode
    multiply_modeproc_neon8,   // kMultiply_Mode

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

// Keep the table in lockstep with the SkXfermode::Mode enum.
SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
    mode_count_arm
);
    971 
// Single-pixel NEON procs, indexed by SkXfermode::Mode.  A NULL entry means
// there is no one-pixel NEON specialization for that mode (see
// SkPlatformXfermodeProcFactory_impl_neon below).
SkXfermodeProc gNEONXfermodeProcs1[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    NULL, // kDstOver_Mode
    NULL, // kSrcIn_Mode
    NULL, // kDstIn_Mode
    NULL, // kSrcOut_Mode
    NULL, // kDstOut_Mode
    srcatop_modeproc_neon,  // kSrcATop_Mode
    dstatop_modeproc_neon,  // kDstATop_Mode
    xor_modeproc_neon,      // kXor_Mode
    plus_modeproc_neon,     // kPlus_Mode
    modulate_modeproc_neon, // kModulate_Mode
    NULL, // kScreen_Mode

    NULL, // kOverlay_Mode
    NULL, // kDarken_Mode
    NULL, // kLighten_Mode
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    NULL, // kHardLight_Mode
    NULL, // kSoftLight_Mode
    NULL, // kDifference_Mode
    NULL, // kExclusion_Mode
    NULL, // kMultiply_Mode

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

// Keep the table in lockstep with the SkXfermode::Mode enum.
SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
    mode1_count_arm
);
   1010 
   1011 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
   1012                                                          SkXfermode::Mode mode) {
   1013 
   1014     void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);
   1015 
   1016     if (procSIMD != NULL) {
   1017         return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
   1018     }
   1019     return NULL;
   1020 }
   1021 
   1022 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
   1023     return gNEONXfermodeProcs1[mode];
   1024 }
   1025