/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov %[vsrc0].8b, v0.8b \t\n"
        "mov %[vsrc1].8b, v1.8b \t\n"
        "mov %[vsrc2].8b, v2.8b \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov %[vsrc0].8b, v0.8b \t\n"
        "mov %[vsrc1].8b, v1.8b \t\n"
        "mov %[vsrc2].8b, v2.8b \t\n"
        "mov %[vsrc3].8b, v3.8b \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    };

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

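            // Note (assumption about intent): the vld4_u8() intrinsic is avoided
            // here; with the explicit d0-d3 bindings above, the structure load
            // stays in consecutive d-registers, something older compilers did not
            // always manage for a uint8x8x4_t result.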
            asm (
                "vld4.8 {d0-d3},[%[src]]!"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);   // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "ands ip, %[count], #7 \n\t"
            "vmov.u8 d31, #1<<7 \n\t"
            "vld1.16 {q12}, [%[dst]] \n\t"
            "vld4.8 {d0-d3}, [%[src]] \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it eq \n\t"
            "moveq ip, #8 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "add %[src], %[src], ip, LSL#2 \n\t"
            "add %[dst], %[dst], ip, LSL#1 \n\t"
            "subs %[count], %[count], ip \n\t"
            "b 9f \n\t"
            // LOOP
            "2: \n\t"

            "vld1.16 {q12}, [%[dst]]! \n\t"
            "vld4.8 {d0-d3}, [%[src]]! \n\t"
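            // Note: the loop is software-pipelined. The loads above already fetch
            // the next eight pixels, while the store below writes out the result
            // (q10) computed for the previous eight; keep_dst therefore trails dst
            // by one 8-pixel group.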
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            "sub %[keep_dst], %[dst], #8*2 \n\t"
            "subs %[count], %[count], #8 \n\t"
            "9: \n\t"
            "pld [%[dst],#32] \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

#ifdef SK_PMCOLOR_IS_RGBA
            "vqadd.u8 d6, d6, d0 \n\t"
            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"
#else
            "vqadd.u8 d6, d6, d2 \n\t"
            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d0 \n\t"
#endif

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            "bne 2b \n\t"

            "1: \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "vmov.u8 d31, #1<<7 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "tst %[count], #4 \n\t"
            "beq 14f \n\t"
            "vld1.16 {d25}, [%[dst]]! \n\t"
            "vld1.32 {q1}, [%[src]]! \n\t"

            "14: \n\t"
            "tst %[count], #2 \n\t"
            "beq 12f \n\t"
            "vld1.32 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d1}, [%[src]]! \n\t"

            "12: \n\t"
            "tst %[count], #1 \n\t"
            "beq 11f \n\t"
            "vld1.16 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d0[1]}, [%[src]]! \n\t"
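            // The tst/beq ladder above loads 4, then 2, then 1 pixels according to
            // the low bits of count, so up to seven leftover pixels are gathered
            // into q0/q1 (src) and q12 (dst) without a scalar loop.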

            "11: \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzp.u16 q0, q1 \n\t"
            "vuzp.u8 d0, d1 \n\t"
            "vuzp.u8 d2, d3 \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

#ifdef SK_PMCOLOR_IS_RGBA
            "vqadd.u8 d6, d6, d0 \n\t"
            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"
#else
            "vqadd.u8 d6, d6, d2 \n\t"
            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d0 \n\t"
#endif

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            // store
            "tst %[count], #4 \n\t"
            "beq 24f \n\t"
            "vst1.16 {d21}, [%[keep_dst]]! \n\t"

            "24: \n\t"
            "tst %[count], #2 \n\t"
            "beq 22f \n\t"
            "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"

            "22: \n\t"
            "tst %[count], #1 \n\t"
            "beq 21f \n\t"
            "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"

            "21: \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi v4.8h, #0x80 \t\n"

            "1: \t\n"
            "sub %[count], %[count], #16 \t\n"
            "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
            "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm pldl1keep, [%[src],#512] \t\n"
            "prfm pldl1keep, [%[dst],#256] \t\n"
            "ushr v20.8h, v17.8h, #5 \t\n"
            "ushr v31.8h, v16.8h, #5 \t\n"
            "xtn v6.8b, v31.8h \t\n"
            "xtn2 v6.16b, v20.8h \t\n"
            "ushr v20.8h, v17.8h, #11 \t\n"
            "shl v19.16b, v6.16b, #2 \t\n"
            "ushr v31.8h, v16.8h, #11 \t\n"
            "xtn v22.8b, v31.8h \t\n"
            "xtn2 v22.16b, v20.8h \t\n"
            "shl v18.16b, v22.16b, #3 \t\n"
            "mvn v3.16b, v3.16b \t\n"
            "xtn v16.8b, v16.8h \t\n"
            "mov v7.16b, v4.16b \t\n"
            "xtn2 v16.16b, v17.8h \t\n"
            "umlal v7.8h, v3.8b, v19.8b \t\n"
            "shl v16.16b, v16.16b, #3 \t\n"
            "mov v22.16b, v4.16b \t\n"
            "ushr v24.8h, v7.8h, #6 \t\n"
            "umlal v22.8h, v3.8b, v18.8b \t\n"
            "ushr v20.8h, v22.8h, #5 \t\n"
            "addhn v20.8b, v22.8h, v20.8h \t\n"
            "cmp %[count], #16 \t\n"
            "mov v6.16b, v4.16b \t\n"
            "mov v5.16b, v4.16b \t\n"
            "umlal v6.8h, v3.8b, v16.8b \t\n"
            "umlal2 v5.8h, v3.16b, v19.16b \t\n"
            "mov v17.16b, v4.16b \t\n"
            "ushr v19.8h, v6.8h, #5 \t\n"
            "umlal2 v17.8h, v3.16b, v18.16b \t\n"
            "addhn v7.8b, v7.8h, v24.8h \t\n"
            "ushr v18.8h, v5.8h, #6 \t\n"
            "ushr v21.8h, v17.8h, #5 \t\n"
            "addhn2 v7.16b, v5.8h, v18.8h \t\n"
            "addhn2 v20.16b, v17.8h, v21.8h \t\n"
            "mov v22.16b, v4.16b \t\n"
            "addhn v6.8b, v6.8h, v19.8h \t\n"
            "umlal2 v22.8h, v3.16b, v16.16b \t\n"
            "ushr v5.8h, v22.8h, #5 \t\n"
            "addhn2 v6.16b, v22.8h, v5.8h \t\n"
            "uqadd v7.16b, v1.16b, v7.16b \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd v20.16b, v2.16b, v20.16b \t\n"
            "uqadd v6.16b, v0.16b, v6.16b \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd v20.16b, v0.16b, v20.16b \t\n"
            "uqadd v6.16b, v2.16b, v6.16b \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll v22.8h, v20.8b, #8 \t\n"
            "shll v5.8h, v7.8b, #8 \t\n"
            "sri v22.8h, v5.8h, #5 \t\n"
            "shll v17.8h, v6.8b, #8 \t\n"
            "shll2 v23.8h, v20.16b, #8 \t\n"
            "shll2 v7.8h, v7.16b, #8 \t\n"
            "sri v22.8h, v17.8h, #11 \t\n"
            "sri v23.8h, v7.8h, #5 \t\n"
            "shll2 v6.8h, v6.16b, #8 \t\n"
            "st1 {v22.8h}, [%[dst]], #16 \t\n"
            "sri v23.8h, v6.8h, #11 \t\n"
            "st1 {v23.8h}, [%[dst]], #16 \t\n"
            "b.ge 1b \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble aligns dst to 8 bytes so that the vector loads and stores
     * that follow are aligned.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in bytes; the preamble loop below advances 2 bytes
     * (one 565 pixel) at a time.
     */
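    // A minimal worked example of the expression below: with DST_ALIGN == 8,
    // (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1) is the byte distance to the
    // next 8-byte boundary, e.g. a dst address ending in ...6 gives 2, i.e. one
    // 16-bit pixel to blend before the NEON loop starts.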
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    for (int i = 0; i < preamble_size; i+=2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = 0;
    count16 = count >> 4;
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr is 8-bit and dr is 5-bit; dr x ((256-sa)>>3) is effectively
        // left-shifted by 5, so shift sr left by 2 to line up the MSBs:
        // 8 + 2 = 5 + 5.
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg is 8-bit and dg is 6-bit; dg x ((256-sa)>>3) is left-shifted by 5,
        // so shift sg left by 3 to line up the MSBs: 8 + 3 = 6 + 5.
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb is 8-bit and db is 5-bit; db x ((256-sa)>>3) is left-shifted by 5,
        // so shift sb left by 2 to line up the MSBs: 8 + 2 = 5 + 5.
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8; // rewind so the stores below start at the first vector
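            // Per channel the code below computes, in 16-bit lanes, roughly
            //   result5 = ((src8 << 2) + ((256 - sa) >> 3) * dst5) >> 5
            // (green uses a 3-bit shift and 6-bit dst), i.e. the fixed-point form
            // of src + (1 - alpha) * dst quantized to 565.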

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);   // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);   // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
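            // With GCC newer than 4.6 a uint8x8x4_t can be handed straight to the
            // asm operand (presumably because those compilers keep it in
            // consecutive d-registers); older compilers fall through to the
            // explicit d0-d3 binding below.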
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
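            // The widened adds/subs below compute, per lane, the scalar
            // SkDITHER_*32To565 recipe, e.g. for red roughly
            //   r5 = (r + d - (r >> 5)) >> 3
            // and for green (6 bits, so half the dither)
            //   g6 = (g + (d >> 1) - (g >> 6)) >> 2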

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

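            /* Note on alpha_mask above: {3,3,3,3,7,7,7,7} is a vtbl index table.
             * vtbl1_u8(src_raw, alpha_mask) broadcasts byte 3 of the first pixel
             * (its alpha, when SK_A32_SHIFT is 24) across that pixel's four lanes
             * and byte 7 across the second pixel's lanes, giving a per-channel
             * alpha vector for the widening multiply below.
             */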
            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
       we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
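    // Up to three of the next pixels may still be opaque even though the group
    // of four above failed the combined test; copy them individually before
    // re-checking which state to jump to.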
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
                vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);  // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow. d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
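            // A worked example of the worst case: sr = 255 gives
            // 255 - (255>>5) + 7 = 255 - 7 + 7 = 255, so the u8 lanes cannot
            // wrap as long as the subtraction is performed first.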
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);  // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad=0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad=1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;  // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    NULL,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon               // S32A_Blend
#else
    NULL
#endif
};