1 #include "SkXfermode.h" 2 #include "SkXfermode_proccoeff.h" 3 #include "SkColorPriv.h" 4 5 #include <arm_neon.h> 6 #include "SkColor_opts_neon.h" 7 #include "SkXfermode_opts_arm_neon.h" 8 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) 10 11 12 //////////////////////////////////////////////////////////////////////////////// 13 // NEONized skia functions 14 //////////////////////////////////////////////////////////////////////////////// 15 16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) { 17 uint16x8_t tmp; 18 uint8x8_t ret; 19 20 tmp = vmull_u8(color, alpha); 21 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); 22 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); 23 24 ret = vshrn_n_u16(tmp, 8); 25 26 return ret; 27 } 28 29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) { 30 uint16x8_t ret; 31 32 ret = vmull_u8(color, alpha); 33 ret = vaddq_u16(ret, vdupq_n_u16(128)); 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); 35 36 ret = vshrq_n_u16(ret, 8); 37 38 return ret; 39 } 40 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { 42 uint16x8_t tmp; 43 44 #ifdef SK_CPU_ARM64 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), 46 vreinterpretq_u32_s32(p2)); 47 #else 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), 49 vmovn_u32(vreinterpretq_u32_s32(p2))); 50 #endif 51 52 tmp += vdupq_n_u16(128); 53 tmp += vshrq_n_u16(tmp, 8); 54 55 return vshrn_n_u16(tmp, 8); 56 } 57 58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { 59 prod += vdupq_n_u16(128); 60 prod += vshrq_n_u16(prod, 8); 61 62 return vshrq_n_u16(prod, 8); 63 } 64 65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { 66 uint8x8_t ret; 67 uint32x4_t cmp1, cmp2; 68 uint16x8_t cmp16; 69 uint8x8_t cmp8, cmp8_1; 70 71 // Test if <= 0 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); 74 #ifdef SK_CPU_ARM64 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); 76 #else 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); 78 #endif 79 cmp8_1 = vmovn_u16(cmp16); 80 81 // Init to zero 82 ret = vdup_n_u8(0); 83 84 // Test if >= 255*255 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); 87 #ifdef SK_CPU_ARM64 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); 89 #else 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); 91 #endif 92 cmp8 = vmovn_u16(cmp16); 93 94 // Insert 255 where true 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); 96 97 // Calc SkDiv255Round 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); 99 100 // Insert where false and previous test false 101 cmp8 = cmp8 | cmp8_1; 102 ret = vbsl_u8(cmp8, ret, div); 103 104 // Return the final combination 105 return ret; 106 } 107 108 //////////////////////////////////////////////////////////////////////////////// 109 // 1 pixel modeprocs 110 //////////////////////////////////////////////////////////////////////////////// 111 112 // kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc] 113 SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) { 114 unsigned sa = SkGetPackedA32(src); 115 unsigned da = SkGetPackedA32(dst); 116 unsigned isa = 255 - sa; 117 118 uint8x8_t vda, visa, vsrc, vdst; 119 120 vda = vdup_n_u8(da); 121 visa = vdup_n_u8(isa); 122 123 uint16x8_t vsrc_wide, vdst_wide; 124 vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src))); 125 vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst))); 126 127 vsrc_wide += vdupq_n_u16(128); 128 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 129 130 vdst_wide += vdupq_n_u16(128); 131 vdst_wide += vshrq_n_u16(vdst_wide, 8); 132 133 vsrc = vshrn_n_u16(vsrc_wide, 8); 134 vdst = vshrn_n_u16(vdst_wide, 8); 135 136 vsrc += vdst; 137 vsrc = vset_lane_u8(da, vsrc, 3); 138 139 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 140 } 141 142 // kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] 143 SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) { 144 unsigned sa = SkGetPackedA32(src); 145 unsigned da = SkGetPackedA32(dst); 146 unsigned ida = 255 - da; 147 148 uint8x8_t vsa, vida, vsrc, vdst; 149 150 vsa = vdup_n_u8(sa); 151 vida = vdup_n_u8(ida); 152 153 uint16x8_t vsrc_wide, vdst_wide; 154 vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src))); 155 vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst))); 156 157 vsrc_wide += vdupq_n_u16(128); 158 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 159 160 vdst_wide += vdupq_n_u16(128); 161 vdst_wide += vshrq_n_u16(vdst_wide, 8); 162 163 vsrc = vshrn_n_u16(vsrc_wide, 8); 164 vdst = vshrn_n_u16(vdst_wide, 8); 165 166 vsrc += vdst; 167 vsrc = vset_lane_u8(sa, vsrc, 3); 168 169 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 170 } 171 172 // kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc] 173 SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) { 174 unsigned sa = SkGetPackedA32(src); 175 unsigned da = SkGetPackedA32(dst); 176 unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1); 177 unsigned isa = 255 - sa; 178 unsigned ida = 255 - da; 179 180 uint8x8_t vsrc, vdst, visa, vida; 181 uint16x8_t vsrc_wide, vdst_wide; 182 183 visa = vdup_n_u8(isa); 184 vida = vdup_n_u8(ida); 185 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 186 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 187 188 vsrc_wide = vmull_u8(vsrc, vida); 189 vdst_wide = vmull_u8(vdst, visa); 190 191 vsrc_wide += vdupq_n_u16(128); 192 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 193 194 vdst_wide += vdupq_n_u16(128); 195 vdst_wide += vshrq_n_u16(vdst_wide, 8); 196 197 vsrc = vshrn_n_u16(vsrc_wide, 8); 198 vdst = vshrn_n_u16(vdst_wide, 8); 199 200 vsrc += vdst; 201 202 vsrc = vset_lane_u8(ret_alpha, vsrc, 3); 203 204 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 205 } 206 207 // kPlus_Mode 208 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) { 209 uint8x8_t vsrc, vdst; 210 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 211 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 212 vsrc = vqadd_u8(vsrc, vdst); 213 214 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 215 } 216 217 // kModulate_Mode 218 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) { 219 uint8x8_t vsrc, vdst, vres; 220 uint16x8_t vres_wide; 221 222 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 223 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 224 225 vres_wide = vmull_u8(vsrc, vdst); 226 227 vres_wide += vdupq_n_u16(128); 228 vres_wide += vshrq_n_u16(vres_wide, 8); 229 230 vres = vshrn_n_u16(vres_wide, 8); 231 232 return vget_lane_u32(vreinterpret_u32_u8(vres), 0); 233 } 234 235 //////////////////////////////////////////////////////////////////////////////// 236 // 8 pixels modeprocs 237 //////////////////////////////////////////////////////////////////////////////// 238 239 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 240 uint8x8x4_t ret; 241 uint16x8_t src_scale; 242 243 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); 244 245 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale); 246 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale); 247 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale); 248 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale); 249 250 return ret; 251 } 252 253 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 254 uint8x8x4_t ret; 255 uint16x8_t scale; 256 257 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); 258 259 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); 260 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); 261 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); 262 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); 263 264 return ret; 265 } 266 267 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 268 uint8x8x4_t ret; 269 uint16x8_t scale; 270 271 scale = SkAlpha255To256_neon8(src.val[NEON_A]); 272 273 ret = SkAlphaMulQ_neon8(dst, scale); 274 275 return ret; 276 } 277 278 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 279 uint8x8x4_t ret; 280 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); 281 282 ret = SkAlphaMulQ_neon8(src, scale); 283 284 return ret; 285 } 286 287 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 288 uint8x8x4_t ret; 289 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); 290 291 ret = SkAlphaMulQ_neon8(dst, scale); 292 293 return ret; 294 } 295 296 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 297 uint8x8x4_t ret; 298 uint8x8_t isa; 299 300 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); 301 302 ret.val[NEON_A] = dst.val[NEON_A]; 303 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) 304 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); 305 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) 306 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); 307 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) 308 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); 309 310 return ret; 311 } 312 313 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 314 uint8x8x4_t ret; 315 uint8x8_t ida; 316 317 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); 318 319 ret.val[NEON_A] = src.val[NEON_A]; 320 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) 321 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); 322 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) 323 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); 324 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) 325 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); 326 327 return ret; 328 } 329 330 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 331 uint8x8x4_t ret; 332 uint8x8_t isa, ida; 333 uint16x8_t tmp_wide, tmp_wide2; 334 335 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); 336 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); 337 338 // First calc alpha 339 tmp_wide = vmovl_u8(src.val[NEON_A]); 340 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); 341 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1); 342 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); 343 ret.val[NEON_A] = vmovn_u16(tmp_wide); 344 345 // Then colors 346 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) 347 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); 348 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) 349 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); 350 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) 351 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); 352 353 return ret; 354 } 355 356 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 357 uint8x8x4_t ret; 358 359 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); 360 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); 361 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); 362 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); 363 364 return ret; 365 } 366 367 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 368 uint8x8x4_t ret; 369 370 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); 371 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); 372 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); 373 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); 374 375 return ret; 376 } 377 378 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { 379 uint16x8_t tmp; 380 381 tmp = vaddl_u8(a, b); 382 tmp -= SkAlphaMulAlpha_neon8_16(a, b); 383 384 return vmovn_u16(tmp); 385 } 386 387 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 388 uint8x8x4_t ret; 389 390 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 391 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); 392 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); 393 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); 394 395 return ret; 396 } 397 398 template <bool overlay> 399 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, 400 uint8x8_t sa, uint8x8_t da) { 401 /* 402 * In the end we're gonna use (rc + tmp) with a different rc 403 * coming from an alternative. 404 * The whole value (rc + tmp) can always be expressed as 405 * VAL = COM - SUB in the if case 406 * VAL = COM + SUB - sa*da in the else case 407 * 408 * with COM = 255 * (sc + dc) 409 * and SUB = sc*da + dc*sa - 2*dc*sc 410 */ 411 412 // Prepare common subexpressions 413 uint16x8_t const255 = vdupq_n_u16(255); 414 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); 415 uint16x8_t scda = vmull_u8(sc, da); 416 uint16x8_t dcsa = vmull_u8(dc, sa); 417 uint16x8_t sada = vmull_u8(sa, da); 418 419 // Prepare non common subexpressions 420 uint16x8_t dc2, sc2; 421 uint32x4_t scdc2_1, scdc2_2; 422 if (overlay) { 423 dc2 = vshll_n_u8(dc, 1); 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); 425 #ifdef SK_CPU_ARM64 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); 427 #else 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); 429 #endif 430 } else { 431 sc2 = vshll_n_u8(sc, 1); 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); 433 #ifdef SK_CPU_ARM64 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); 435 #else 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); 437 #endif 438 } 439 440 // Calc COM 441 int32x4_t com1, com2; 442 com1 = vreinterpretq_s32_u32( 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); 444 com2 = vreinterpretq_s32_u32( 445 #ifdef SK_CPU_ARM64 446 vmull_high_u16(const255, sc_plus_dc)); 447 #else 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); 449 #endif 450 451 // Calc SUB 452 int32x4_t sub1, sub2; 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); 454 #ifdef SK_CPU_ARM64 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); 456 #else 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); 458 #endif 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); 461 462 // Compare 2*dc <= da 463 uint16x8_t cmp; 464 465 if (overlay) { 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); 467 } else { 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); 469 } 470 471 // Prepare variables 472 int32x4_t val1_1, val1_2; 473 int32x4_t val2_1, val2_2; 474 uint32x4_t cmp1, cmp2; 475 476 // Doing a signed lengthening allows to save a few instructions 477 // thanks to sign extension. 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); 479 #ifdef SK_CPU_ARM64 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); 481 #else 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); 483 #endif 484 485 // Calc COM - SUB 486 val1_1 = com1 - sub1; 487 val1_2 = com2 - sub2; 488 489 // Calc COM + SUB - sa*da 490 val2_1 = com1 + sub1; 491 val2_2 = com2 + sub2; 492 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); 494 #ifdef SK_CPU_ARM64 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); 496 #else 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); 498 #endif 499 500 // Insert where needed 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); 503 504 // Call the clamp_div255round function 505 return clamp_div255round_simd8_32(val1_1, val1_2); 506 } 507 508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, 509 uint8x8_t sa, uint8x8_t da) { 510 return overlay_hardlight_color<true>(sc, dc, sa, da); 511 } 512 513 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 514 uint8x8x4_t ret; 515 516 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 517 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], 518 src.val[NEON_A], dst.val[NEON_A]); 519 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], 520 src.val[NEON_A], dst.val[NEON_A]); 521 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], 522 src.val[NEON_A], dst.val[NEON_A]); 523 524 return ret; 525 } 526 527 template <bool lighten> 528 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, 529 uint8x8_t sa, uint8x8_t da) { 530 uint16x8_t sd, ds, cmp, tmp, tmp2; 531 532 // Prepare 533 sd = vmull_u8(sc, da); 534 ds = vmull_u8(dc, sa); 535 536 // Do test 537 if (lighten) { 538 cmp = vcgtq_u16(sd, ds); 539 } else { 540 cmp = vcltq_u16(sd, ds); 541 } 542 543 // Assign if 544 tmp = vaddl_u8(sc, dc); 545 tmp2 = tmp; 546 tmp -= SkDiv255Round_neon8_16_16(ds); 547 548 // Calc else 549 tmp2 -= SkDiv255Round_neon8_16_16(sd); 550 551 // Insert where needed 552 tmp = vbslq_u16(cmp, tmp, tmp2); 553 554 return vmovn_u16(tmp); 555 } 556 557 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, 558 uint8x8_t sa, uint8x8_t da) { 559 return lighten_darken_color<false>(sc, dc, sa, da); 560 } 561 562 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 563 uint8x8x4_t ret; 564 565 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 566 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], 567 src.val[NEON_A], dst.val[NEON_A]); 568 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], 569 src.val[NEON_A], dst.val[NEON_A]); 570 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], 571 src.val[NEON_A], dst.val[NEON_A]); 572 573 return ret; 574 } 575 576 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, 577 uint8x8_t sa, uint8x8_t da) { 578 return lighten_darken_color<true>(sc, dc, sa, da); 579 } 580 581 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 582 uint8x8x4_t ret; 583 584 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 585 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], 586 src.val[NEON_A], dst.val[NEON_A]); 587 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], 588 src.val[NEON_A], dst.val[NEON_A]); 589 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], 590 src.val[NEON_A], dst.val[NEON_A]); 591 592 return ret; 593 } 594 595 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, 596 uint8x8_t sa, uint8x8_t da) { 597 return overlay_hardlight_color<false>(sc, dc, sa, da); 598 } 599 600 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 601 uint8x8x4_t ret; 602 603 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 604 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], 605 src.val[NEON_A], dst.val[NEON_A]); 606 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], 607 src.val[NEON_A], dst.val[NEON_A]); 608 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], 609 src.val[NEON_A], dst.val[NEON_A]); 610 611 return ret; 612 } 613 614 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, 615 uint8x8_t sa, uint8x8_t da) { 616 uint16x8_t sd, ds, tmp; 617 int16x8_t val; 618 619 sd = vmull_u8(sc, da); 620 ds = vmull_u8(dc, sa); 621 622 tmp = vminq_u16(sd, ds); 623 tmp = SkDiv255Round_neon8_16_16(tmp); 624 tmp = vshlq_n_u16(tmp, 1); 625 626 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); 627 628 val -= vreinterpretq_s16_u16(tmp); 629 630 val = vmaxq_s16(val, vdupq_n_s16(0)); 631 val = vminq_s16(val, vdupq_n_s16(255)); 632 633 return vmovn_u16(vreinterpretq_u16_s16(val)); 634 } 635 636 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 637 uint8x8x4_t ret; 638 639 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 640 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], 641 src.val[NEON_A], dst.val[NEON_A]); 642 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], 643 src.val[NEON_A], dst.val[NEON_A]); 644 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], 645 src.val[NEON_A], dst.val[NEON_A]); 646 647 return ret; 648 } 649 650 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, 651 uint8x8_t sa, uint8x8_t da) { 652 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ 653 654 uint16x8_t sc_plus_dc, scdc, const255; 655 int32x4_t term1_1, term1_2, term2_1, term2_2; 656 657 /* Calc (sc + dc) and (sc * dc) */ 658 sc_plus_dc = vaddl_u8(sc, dc); 659 scdc = vmull_u8(sc, dc); 660 661 /* Prepare constants */ 662 const255 = vdupq_n_u16(255); 663 664 /* Calc the first term */ 665 term1_1 = vreinterpretq_s32_u32( 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); 667 term1_2 = vreinterpretq_s32_u32( 668 #ifdef SK_CPU_ARM64 669 vmull_high_u16(const255, sc_plus_dc)); 670 #else 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); 672 #endif 673 674 /* Calc the second term */ 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); 676 #ifdef SK_CPU_ARM64 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); 678 #else 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); 680 #endif 681 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); 683 } 684 685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 686 uint8x8x4_t ret; 687 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], 690 src.val[NEON_A], dst.val[NEON_A]); 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], 692 src.val[NEON_A], dst.val[NEON_A]); 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], 694 src.val[NEON_A], dst.val[NEON_A]); 695 696 return ret; 697 } 698 699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, 700 uint8x8_t sa, uint8x8_t da) { 701 uint32x4_t val1, val2; 702 uint16x8_t scdc, t1, t2; 703 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); 706 scdc = vmull_u8(sc, dc); 707 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); 709 #ifdef SK_CPU_ARM64 710 val2 = vaddl_high_u16(t1, t2); 711 #else 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); 713 #endif 714 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); 716 #ifdef SK_CPU_ARM64 717 val2 = vaddw_high_u16(val2, scdc); 718 #else 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); 720 #endif 721 722 return clamp_div255round_simd8_32( 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); 724 } 725 726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 727 uint8x8x4_t ret; 728 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], 731 src.val[NEON_A], dst.val[NEON_A]); 732 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], 733 src.val[NEON_A], dst.val[NEON_A]); 734 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], 735 src.val[NEON_A], dst.val[NEON_A]); 736 737 return ret; 738 } 739 740 //////////////////////////////////////////////////////////////////////////////// 741 742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); 743 744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; 745 746 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst, 747 const SkPMColor* SK_RESTRICT src, int count, 748 const SkAlpha* SK_RESTRICT aa) const { 749 SkASSERT(dst && src && count >= 0); 750 751 SkXfermodeProc proc = this->getProc(); 752 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); 753 SkASSERT(procSIMD != NULL); 754 755 if (NULL == aa) { 756 // Unrolled NEON code 757 // We'd like to just do this (modulo a few casts): 758 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst))); 759 // src += 8; 760 // dst += 8; 761 // but that tends to generate miserable code. Here are a bunch of faster 762 // workarounds for different architectures and compilers. 763 while (count >= 8) { 764 765 #ifdef SK_CPU_ARM32 766 uint8x8x4_t vsrc, vdst, vres; 767 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 768 asm volatile ( 769 "vld4.u8 %h[vsrc], [%[src]]! \t\n" 770 "vld4.u8 %h[vdst], [%[dst]] \t\n" 771 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) 772 : [dst] "r" (dst) 773 : 774 ); 775 #else 776 register uint8x8_t d0 asm("d0"); 777 register uint8x8_t d1 asm("d1"); 778 register uint8x8_t d2 asm("d2"); 779 register uint8x8_t d3 asm("d3"); 780 register uint8x8_t d4 asm("d4"); 781 register uint8x8_t d5 asm("d5"); 782 register uint8x8_t d6 asm("d6"); 783 register uint8x8_t d7 asm("d7"); 784 785 asm volatile ( 786 "vld4.u8 {d0-d3},[%[src]]!;" 787 "vld4.u8 {d4-d7},[%[dst]];" 788 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 789 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), 790 [src] "+&r" (src) 791 : [dst] "r" (dst) 792 : 793 ); 794 vsrc.val[0] = d0; vdst.val[0] = d4; 795 vsrc.val[1] = d1; vdst.val[1] = d5; 796 vsrc.val[2] = d2; vdst.val[2] = d6; 797 vsrc.val[3] = d3; vdst.val[3] = d7; 798 #endif 799 800 vres = procSIMD(vsrc, vdst); 801 802 vst4_u8((uint8_t*)dst, vres); 803 804 dst += 8; 805 806 #else // #ifdef SK_CPU_ARM32 807 808 asm volatile ( 809 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" 810 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n" 811 "blr %[proc] \t\n" 812 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n" 813 : [src] "+&r" (src), [dst] "+&r" (dst) 814 : [proc] "r" (procSIMD) 815 : "cc", "memory", 816 /* We don't know what proc is going to clobber so we must 817 * add everything that is not callee-saved. 818 */ 819 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", 820 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", 821 "x30", /* x30 implicitly clobbered by blr */ 822 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 823 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 824 "v27", "v28", "v29", "v30", "v31" 825 ); 826 827 #endif // #ifdef SK_CPU_ARM32 828 829 count -= 8; 830 } 831 // Leftovers 832 for (int i = 0; i < count; i++) { 833 dst[i] = proc(src[i], dst[i]); 834 } 835 } else { 836 for (int i = count - 1; i >= 0; --i) { 837 unsigned a = aa[i]; 838 if (0 != a) { 839 SkPMColor dstC = dst[i]; 840 SkPMColor C = proc(src[i], dstC); 841 if (a != 0xFF) { 842 C = SkFourByteInterp_neon(C, dstC, a); 843 } 844 dst[i] = C; 845 } 846 } 847 } 848 } 849 850 void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, 851 const SkPMColor* SK_RESTRICT src, int count, 852 const SkAlpha* SK_RESTRICT aa) const { 853 SkASSERT(dst && src && count >= 0); 854 855 SkXfermodeProc proc = this->getProc(); 856 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); 857 SkASSERT(procSIMD != NULL); 858 859 if (NULL == aa) { 860 while(count >= 8) { 861 uint16x8_t vdst, vres16; 862 uint8x8x4_t vdst32, vsrc, vres; 863 864 vdst = vld1q_u16(dst); 865 866 #ifdef SK_CPU_ARM64 867 vsrc = vld4_u8((uint8_t*)src); 868 #else 869 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 870 asm volatile ( 871 "vld4.u8 %h[vsrc], [%[src]]! \t\n" 872 : [vsrc] "=w" (vsrc), [src] "+&r" (src) 873 : : 874 ); 875 #else 876 register uint8x8_t d0 asm("d0"); 877 register uint8x8_t d1 asm("d1"); 878 register uint8x8_t d2 asm("d2"); 879 register uint8x8_t d3 asm("d3"); 880 881 asm volatile ( 882 "vld4.u8 {d0-d3},[%[src]]!;" 883 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 884 [src] "+&r" (src) 885 : : 886 ); 887 vsrc.val[0] = d0; 888 vsrc.val[1] = d1; 889 vsrc.val[2] = d2; 890 vsrc.val[3] = d3; 891 #endif 892 #endif // #ifdef SK_CPU_ARM64 893 894 vdst32 = SkPixel16ToPixel32_neon8(vdst); 895 vres = procSIMD(vsrc, vdst32); 896 vres16 = SkPixel32ToPixel16_neon8(vres); 897 898 vst1q_u16(dst, vres16); 899 900 count -= 8; 901 dst += 8; 902 #ifdef SK_CPU_ARM64 903 src += 8; 904 #endif 905 } 906 for (int i = 0; i < count; i++) { 907 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); 908 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); 909 } 910 } else { 911 for (int i = count - 1; i >= 0; --i) { 912 unsigned a = aa[i]; 913 if (0 != a) { 914 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); 915 SkPMColor C = proc(src[i], dstC); 916 if (0xFF != a) { 917 C = SkFourByteInterp_neon(C, dstC, a); 918 } 919 dst[i] = SkPixel32ToPixel16_ToU16(C); 920 } 921 } 922 } 923 } 924 925 #ifndef SK_IGNORE_TO_STRING 926 void SkNEONProcCoeffXfermode::toString(SkString* str) const { 927 this->INHERITED::toString(str); 928 } 929 #endif 930 931 //////////////////////////////////////////////////////////////////////////////// 932 933 SkXfermodeProcSIMD gNEONXfermodeProcs[] = { 934 NULL, // kClear_Mode 935 NULL, // kSrc_Mode 936 NULL, // kDst_Mode 937 NULL, // kSrcOver_Mode 938 dstover_modeproc_neon8, 939 srcin_modeproc_neon8, 940 dstin_modeproc_neon8, 941 srcout_modeproc_neon8, 942 dstout_modeproc_neon8, 943 srcatop_modeproc_neon8, 944 dstatop_modeproc_neon8, 945 xor_modeproc_neon8, 946 plus_modeproc_neon8, 947 modulate_modeproc_neon8, 948 screen_modeproc_neon8, 949 950 overlay_modeproc_neon8, 951 darken_modeproc_neon8, 952 lighten_modeproc_neon8, 953 NULL, // kColorDodge_Mode 954 NULL, // kColorBurn_Mode 955 hardlight_modeproc_neon8, 956 NULL, // kSoftLight_Mode 957 difference_modeproc_neon8, 958 exclusion_modeproc_neon8, 959 multiply_modeproc_neon8, 960 961 NULL, // kHue_Mode 962 NULL, // kSaturation_Mode 963 NULL, // kColor_Mode 964 NULL, // kLuminosity_Mode 965 }; 966 967 SK_COMPILE_ASSERT( 968 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, 969 mode_count_arm 970 ); 971 972 SkXfermodeProc gNEONXfermodeProcs1[] = { 973 NULL, // kClear_Mode 974 NULL, // kSrc_Mode 975 NULL, // kDst_Mode 976 NULL, // kSrcOver_Mode 977 NULL, // kDstOver_Mode 978 NULL, // kSrcIn_Mode 979 NULL, // kDstIn_Mode 980 NULL, // kSrcOut_Mode 981 NULL, // kDstOut_Mode 982 srcatop_modeproc_neon, 983 dstatop_modeproc_neon, 984 xor_modeproc_neon, 985 plus_modeproc_neon, 986 modulate_modeproc_neon, 987 NULL, // kScreen_Mode 988 989 NULL, // kOverlay_Mode 990 NULL, // kDarken_Mode 991 NULL, // kLighten_Mode 992 NULL, // kColorDodge_Mode 993 NULL, // kColorBurn_Mode 994 NULL, // kHardLight_Mode 995 NULL, // kSoftLight_Mode 996 NULL, // kDifference_Mode 997 NULL, // kExclusion_Mode 998 NULL, // kMultiply_Mode 999 1000 NULL, // kHue_Mode 1001 NULL, // kSaturation_Mode 1002 NULL, // kColor_Mode 1003 NULL, // kLuminosity_Mode 1004 }; 1005 1006 SK_COMPILE_ASSERT( 1007 SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1, 1008 mode1_count_arm 1009 ); 1010 1011 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, 1012 SkXfermode::Mode mode) { 1013 1014 void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]); 1015 1016 if (procSIMD != NULL) { 1017 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); 1018 } 1019 return NULL; 1020 } 1021 1022 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { 1023 return gNEONXfermodeProcs1[mode]; 1024 } 1025