/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    };

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8 {d0-d3},[%[src]]!"
133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 134 : 135 ); 136 vsrc.val[0] = d0; 137 vsrc.val[1] = d1; 138 vsrc.val[2] = d2; 139 } 140 #endif 141 142 // Load and unpack dst 143 vdst = vld1q_u16(dst); 144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes 145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue 146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red 147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green 148 149 // Shift src to 565 range 150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); 151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); 152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); 153 154 // Scale src - dst 155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; 156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; 157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; 158 159 vres_r = vshrq_n_u16(vres_r * vscale, 8); 160 vres_g = vshrq_n_u16(vres_g * vscale, 8); 161 vres_b = vshrq_n_u16(vres_b * vscale, 8); 162 163 vres_r += vdst_r; 164 vres_g += vdst_g; 165 vres_b += vdst_b; 166 167 // Combine 168 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue 169 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue 170 171 // Store 172 vst1q_u16(dst, vres_b); 173 dst += 8; 174 count -= 8; 175 } 176 if (count > 0) { 177 int scale = SkAlpha255To256(alpha); 178 do { 179 SkPMColor c = *src++; 180 SkPMColorAssert(c); 181 uint16_t d = *dst; 182 *dst++ = SkPackRGB16( 183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), 184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), 185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); 186 } while (--count != 0); 187 } 188 } 189 190 #ifdef SK_CPU_ARM32 191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 192 const SkPMColor* SK_RESTRICT src, int count, 193 U8CPU alpha, int /*x*/, int /*y*/) { 194 SkASSERT(255 == alpha); 195 196 if (count >= 8) { 197 uint16_t* SK_RESTRICT keep_dst = 0; 198 199 asm volatile ( 200 "ands ip, %[count], #7 \n\t" 201 "vmov.u8 d31, #1<<7 \n\t" 202 "vld1.16 {q12}, [%[dst]] \n\t" 203 "vld4.8 {d0-d3}, [%[src]] \n\t" 204 // Thumb does not support the standard ARM conditional 205 // instructions but instead requires the 'it' instruction 206 // to signal conditional execution 207 "it eq \n\t" 208 "moveq ip, #8 \n\t" 209 "mov %[keep_dst], %[dst] \n\t" 210 211 "add %[src], %[src], ip, LSL#2 \n\t" 212 "add %[dst], %[dst], ip, LSL#1 \n\t" 213 "subs %[count], %[count], ip \n\t" 214 "b 9f \n\t" 215 // LOOP 216 "2: \n\t" 217 218 "vld1.16 {q12}, [%[dst]]! \n\t" 219 "vld4.8 {d0-d3}, [%[src]]! 
\n\t" 220 "vst1.16 {q10}, [%[keep_dst]] \n\t" 221 "sub %[keep_dst], %[dst], #8*2 \n\t" 222 "subs %[count], %[count], #8 \n\t" 223 "9: \n\t" 224 "pld [%[dst],#32] \n\t" 225 // expand 0565 q12 to 8888 {d4-d7} 226 "vmovn.u16 d4, q12 \n\t" 227 "vshr.u16 q11, q12, #5 \n\t" 228 "vshr.u16 q10, q12, #6+5 \n\t" 229 "vmovn.u16 d5, q11 \n\t" 230 "vmovn.u16 d6, q10 \n\t" 231 "vshl.u8 d4, d4, #3 \n\t" 232 "vshl.u8 d5, d5, #2 \n\t" 233 "vshl.u8 d6, d6, #3 \n\t" 234 235 "vmovl.u8 q14, d31 \n\t" 236 "vmovl.u8 q13, d31 \n\t" 237 "vmovl.u8 q12, d31 \n\t" 238 239 // duplicate in 4/2/1 & 8pix vsns 240 "vmvn.8 d30, d3 \n\t" 241 "vmlal.u8 q14, d30, d6 \n\t" 242 "vmlal.u8 q13, d30, d5 \n\t" 243 "vmlal.u8 q12, d30, d4 \n\t" 244 "vshr.u16 q8, q14, #5 \n\t" 245 "vshr.u16 q9, q13, #6 \n\t" 246 "vaddhn.u16 d6, q14, q8 \n\t" 247 "vshr.u16 q8, q12, #5 \n\t" 248 "vaddhn.u16 d5, q13, q9 \n\t" 249 "vqadd.u8 d6, d6, d0 \n\t" // moved up 250 "vaddhn.u16 d4, q12, q8 \n\t" 251 // intentionally don't calculate alpha 252 // result in d4-d6 253 254 "vqadd.u8 d5, d5, d1 \n\t" 255 "vqadd.u8 d4, d4, d2 \n\t" 256 257 // pack 8888 {d4-d6} to 0565 q10 258 "vshll.u8 q10, d6, #8 \n\t" 259 "vshll.u8 q3, d5, #8 \n\t" 260 "vshll.u8 q2, d4, #8 \n\t" 261 "vsri.u16 q10, q3, #5 \n\t" 262 "vsri.u16 q10, q2, #11 \n\t" 263 264 "bne 2b \n\t" 265 266 "1: \n\t" 267 "vst1.16 {q10}, [%[keep_dst]] \n\t" 268 : [count] "+r" (count) 269 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 270 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 271 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 272 "d30","d31" 273 ); 274 } 275 else 276 { // handle count < 8 277 uint16_t* SK_RESTRICT keep_dst = 0; 278 279 asm volatile ( 280 "vmov.u8 d31, #1<<7 \n\t" 281 "mov %[keep_dst], %[dst] \n\t" 282 283 "tst %[count], #4 \n\t" 284 "beq 14f \n\t" 285 "vld1.16 {d25}, [%[dst]]! \n\t" 286 "vld1.32 {q1}, [%[src]]! \n\t" 287 288 "14: \n\t" 289 "tst %[count], #2 \n\t" 290 "beq 12f \n\t" 291 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 292 "vld1.32 {d1}, [%[src]]! \n\t" 293 294 "12: \n\t" 295 "tst %[count], #1 \n\t" 296 "beq 11f \n\t" 297 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 298 "vld1.32 {d0[1]}, [%[src]]! 
\n\t" 299 300 "11: \n\t" 301 // unzips achieve the same as a vld4 operation 302 "vuzp.u16 q0, q1 \n\t" 303 "vuzp.u8 d0, d1 \n\t" 304 "vuzp.u8 d2, d3 \n\t" 305 // expand 0565 q12 to 8888 {d4-d7} 306 "vmovn.u16 d4, q12 \n\t" 307 "vshr.u16 q11, q12, #5 \n\t" 308 "vshr.u16 q10, q12, #6+5 \n\t" 309 "vmovn.u16 d5, q11 \n\t" 310 "vmovn.u16 d6, q10 \n\t" 311 "vshl.u8 d4, d4, #3 \n\t" 312 "vshl.u8 d5, d5, #2 \n\t" 313 "vshl.u8 d6, d6, #3 \n\t" 314 315 "vmovl.u8 q14, d31 \n\t" 316 "vmovl.u8 q13, d31 \n\t" 317 "vmovl.u8 q12, d31 \n\t" 318 319 // duplicate in 4/2/1 & 8pix vsns 320 "vmvn.8 d30, d3 \n\t" 321 "vmlal.u8 q14, d30, d6 \n\t" 322 "vmlal.u8 q13, d30, d5 \n\t" 323 "vmlal.u8 q12, d30, d4 \n\t" 324 "vshr.u16 q8, q14, #5 \n\t" 325 "vshr.u16 q9, q13, #6 \n\t" 326 "vaddhn.u16 d6, q14, q8 \n\t" 327 "vshr.u16 q8, q12, #5 \n\t" 328 "vaddhn.u16 d5, q13, q9 \n\t" 329 "vqadd.u8 d6, d6, d0 \n\t" // moved up 330 "vaddhn.u16 d4, q12, q8 \n\t" 331 // intentionally don't calculate alpha 332 // result in d4-d6 333 334 "vqadd.u8 d5, d5, d1 \n\t" 335 "vqadd.u8 d4, d4, d2 \n\t" 336 337 // pack 8888 {d4-d6} to 0565 q10 338 "vshll.u8 q10, d6, #8 \n\t" 339 "vshll.u8 q3, d5, #8 \n\t" 340 "vshll.u8 q2, d4, #8 \n\t" 341 "vsri.u16 q10, q3, #5 \n\t" 342 "vsri.u16 q10, q2, #11 \n\t" 343 344 // store 345 "tst %[count], #4 \n\t" 346 "beq 24f \n\t" 347 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 348 349 "24: \n\t" 350 "tst %[count], #2 \n\t" 351 "beq 22f \n\t" 352 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 353 354 "22: \n\t" 355 "tst %[count], #1 \n\t" 356 "beq 21f \n\t" 357 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 358 359 "21: \n\t" 360 : [count] "+r" (count) 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 364 "d30","d31" 365 ); 366 } 367 } 368 369 #else // #ifdef SK_CPU_ARM32 370 371 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 372 const SkPMColor* SK_RESTRICT src, int count, 373 U8CPU alpha, int /*x*/, int /*y*/) { 374 SkASSERT(255 == alpha); 375 376 if (count >= 16) { 377 asm ( 378 "movi v4.8h, #0x80 \t\n" 379 380 "1: \t\n" 381 "sub %[count], %[count], #16 \t\n" 382 "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n" 383 "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n" 384 "prfm pldl1keep, [%[src],#512] \t\n" 385 "prfm pldl1keep, [%[dst],#256] \t\n" 386 "ushr v20.8h, v17.8h, #5 \t\n" 387 "ushr v31.8h, v16.8h, #5 \t\n" 388 "xtn v6.8b, v31.8h \t\n" 389 "xtn2 v6.16b, v20.8h \t\n" 390 "ushr v20.8h, v17.8h, #11 \t\n" 391 "shl v19.16b, v6.16b, #2 \t\n" 392 "ushr v31.8h, v16.8h, #11 \t\n" 393 "xtn v22.8b, v31.8h \t\n" 394 "xtn2 v22.16b, v20.8h \t\n" 395 "shl v18.16b, v22.16b, #3 \t\n" 396 "mvn v3.16b, v3.16b \t\n" 397 "xtn v16.8b, v16.8h \t\n" 398 "mov v7.16b, v4.16b \t\n" 399 "xtn2 v16.16b, v17.8h \t\n" 400 "umlal v7.8h, v3.8b, v19.8b \t\n" 401 "shl v16.16b, v16.16b, #3 \t\n" 402 "mov v22.16b, v4.16b \t\n" 403 "ushr v24.8h, v7.8h, #6 \t\n" 404 "umlal v22.8h, v3.8b, v18.8b \t\n" 405 "ushr v20.8h, v22.8h, #5 \t\n" 406 "addhn v20.8b, v22.8h, v20.8h \t\n" 407 "cmp %[count], #16 \t\n" 408 "mov v6.16b, v4.16b \t\n" 409 "mov v5.16b, v4.16b \t\n" 410 "umlal v6.8h, v3.8b, v16.8b \t\n" 411 "umlal2 v5.8h, v3.16b, v19.16b \t\n" 412 "mov v17.16b, v4.16b \t\n" 413 "ushr v19.8h, v6.8h, #5 \t\n" 414 "umlal2 v17.8h, v3.16b, v18.16b \t\n" 415 "addhn v7.8b, v7.8h, v24.8h \t\n" 416 "ushr v18.8h, v5.8h, #6 \t\n" 417 "ushr v21.8h, v17.8h, #5 \t\n" 418 "addhn2 v7.16b, v5.8h, v18.8h \t\n" 419 "addhn2 
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
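 * (each 12-byte row is the 4-value dither pattern repeated three times, so an
 * 8-byte load from any of those offsets stays within the row.)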
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g);     // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);       // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);       // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
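    // The nested ifs below copy up to three more consecutive opaque pixels one
    // at a time, dropping out of the chain at the first non-opaque pixel.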
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
                vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8); // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad=0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad=1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
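                    // src[i-8] therefore refers back to the i-th pixel of the batch just written.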
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL; // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
        return;
    }

    unsigned scale = 256 - SkAlpha255To256(colorA);

    if (count >= 8) {
        uint32x4_t vcolor;
        uint8x8_t vscale;

        vcolor = vdupq_n_u32(color);

        // scale numerical interval [0-255], so load as 8 bits
        vscale = vdup_n_u8(scale);

        do {
            // load src color, 8 pixels, 4 64 bit registers
            // (and increment src).
            uint32x2x4_t vsrc;
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vld1.32 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+r" (src)
                : :
            );
#else // 64bit targets and Clang
            vsrc.val[0] = vld1_u32(src);
            vsrc.val[1] = vld1_u32(src+2);
            vsrc.val[2] = vld1_u32(src+4);
            vsrc.val[3] = vld1_u32(src+6);
            src += 8;
#endif

            // multiply long by scale, 64 bits at a time,
            // destination into a 128 bit register.
            uint16x8x4_t vtmp;
            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

            // shift the 128 bit registers, containing the 16
            // bit scaled values back to 8 bits, narrowing the
            // results to 64 bit registers.
            uint8x16x2_t vres;
            vres.val[0] = vcombine_u8(
                              vshrn_n_u16(vtmp.val[0], 8),
                              vshrn_n_u16(vtmp.val[1], 8));
            vres.val[1] = vcombine_u8(
                              vshrn_n_u16(vtmp.val[2], 8),
                              vshrn_n_u16(vtmp.val[3], 8));

            // adding back the color, using 128 bit registers.
            uint32x4x2_t vdst;
            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
                                               vreinterpretq_u8_u32(vcolor));
            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
                                               vreinterpretq_u8_u32(vcolor));

            // store back the 8 calculated pixels (2 128 bit
            // registers), and increment dst.
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vst1.32 %h[vdst], [%[dst]]!"
                : [dst] "+r" (dst)
                : [vdst] "w" (vdst)
                : "memory"
            );
#else // 64bit targets and Clang
            vst1q_u32(dst, vdst.val[0]);
            vst1q_u32(dst+4, vdst.val[1]);
            dst += 8;
#endif
            count -= 8;

        } while (count >= 8);
    }

    while (count > 0) {
        *dst = color + SkAlphaMulQ(*src, scale);
        src += 1;
        dst += 1;
        count--;
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    NULL,   // https://code.google.com/p/skia/issues/detail?id=2845
            // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,   // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon   // S32A_Blend
#else
    NULL
#endif
};