/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
        "ands       ip, %[count], #7            \n\t"
        "vmov.u8    d31, #1<<7                  \n\t"
        "vld1.16    {q12}, [%[dst]]             \n\t"
        "vld4.8     {d0-d3}, [%[src]]           \n\t"
        // Thumb does not support the standard ARM conditional
        // instructions but instead requires the 'it' instruction
        // to signal conditional execution
        "it eq                                  \n\t"
        "moveq      ip, #8                      \n\t"
        "mov        %[keep_dst], %[dst]         \n\t"

        "add        %[src], %[src], ip, LSL#2   \n\t"
        "add        %[dst], %[dst], ip, LSL#1   \n\t"
        "subs       %[count], %[count], ip      \n\t"
        "b          9f                          \n\t"
        // LOOP
        "2:                                     \n\t"

        "vld1.16    {q12}, [%[dst]]!            \n\t"
        "vld4.8     {d0-d3}, [%[src]]!          \n\t"
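        // Store the result computed on the previous pass while the loads above
        // are still in flight (the loop is software-pipelined; on the first pass
        // the "b 9f" above skips this store).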
        "vst1.16    {q10}, [%[keep_dst]]        \n\t"
        "sub        %[keep_dst], %[dst], #8*2   \n\t"
        "subs       %[count], %[count], #8      \n\t"
        "9:                                     \n\t"
        "pld        [%[dst],#32]                \n\t"
        // expand 0565 q12 to 8888 {d4-d7}
        "vmovn.u16  d4, q12                     \n\t"
        "vshr.u16   q11, q12, #5                \n\t"
        "vshr.u16   q10, q12, #6+5              \n\t"
        "vmovn.u16  d5, q11                     \n\t"
        "vmovn.u16  d6, q10                     \n\t"
        "vshl.u8    d4, d4, #3                  \n\t"
        "vshl.u8    d5, d5, #2                  \n\t"
        "vshl.u8    d6, d6, #3                  \n\t"

        "vmovl.u8   q14, d31                    \n\t"
        "vmovl.u8   q13, d31                    \n\t"
        "vmovl.u8   q12, d31                    \n\t"

        // duplicate in 4/2/1 & 8pix vsns
        "vmvn.8     d30, d3                     \n\t"
        "vmlal.u8   q14, d30, d6                \n\t"
        "vmlal.u8   q13, d30, d5                \n\t"
        "vmlal.u8   q12, d30, d4                \n\t"
        "vshr.u16   q8, q14, #5                 \n\t"
        "vshr.u16   q9, q13, #6                 \n\t"
        "vaddhn.u16 d6, q14, q8                 \n\t"
        "vshr.u16   q8, q12, #5                 \n\t"
        "vaddhn.u16 d5, q13, q9                 \n\t"
        "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
        "vaddhn.u16 d4, q12, q8                 \n\t"
        // intentionally don't calculate alpha
        // result in d4-d6

        "vqadd.u8   d5, d5, d1                  \n\t"
        "vqadd.u8   d4, d4, d2                  \n\t"

        // pack 8888 {d4-d6} to 0565 q10
        "vshll.u8   q10, d6, #8                 \n\t"
        "vshll.u8   q3, d5, #8                  \n\t"
        "vshll.u8   q2, d4, #8                  \n\t"
        "vsri.u16   q10, q3, #5                 \n\t"
        "vsri.u16   q10, q2, #11                \n\t"

        "bne        2b                          \n\t"

        "1:                                     \n\t"
        "vst1.16    {q10}, [%[keep_dst]]        \n\t"
        : [count] "+r" (count)
        : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
          "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
        "vmov.u8    d31, #1<<7                  \n\t"
        "mov        %[keep_dst], %[dst]         \n\t"

        "tst        %[count], #4                \n\t"
        "beq        14f                         \n\t"
        "vld1.16    {d25}, [%[dst]]!            \n\t"
        "vld1.32    {q1}, [%[src]]!             \n\t"

        "14:                                    \n\t"
        "tst        %[count], #2                \n\t"
        "beq        12f                         \n\t"
        "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
        "vld1.32    {d1}, [%[src]]!             \n\t"

        "12:                                    \n\t"
        "tst        %[count], #1                \n\t"
        "beq        11f                         \n\t"
        "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
        "vld1.32    {d0[1]}, [%[src]]!          \n\t"

        "11:                                    \n\t"
        // unzips achieve the same as a vld4 operation
        "vuzpq.u16  q0, q1                      \n\t"
        "vuzp.u8    d0, d1                      \n\t"
        "vuzp.u8    d2, d3                      \n\t"
        // expand 0565 q12 to 8888 {d4-d7}
        "vmovn.u16  d4, q12                     \n\t"
        "vshr.u16   q11, q12, #5                \n\t"
        "vshr.u16   q10, q12, #6+5              \n\t"
        "vmovn.u16  d5, q11                     \n\t"
        "vmovn.u16  d6, q10                     \n\t"
        "vshl.u8    d4, d4, #3                  \n\t"
        "vshl.u8    d5, d5, #2                  \n\t"
        "vshl.u8    d6, d6, #3                  \n\t"

        "vmovl.u8   q14, d31                    \n\t"
        "vmovl.u8   q13, d31                    \n\t"
        "vmovl.u8   q12, d31                    \n\t"

        // duplicate in 4/2/1 & 8pix vsns
        "vmvn.8     d30, d3                     \n\t"
        "vmlal.u8   q14, d30, d6                \n\t"
        "vmlal.u8   q13, d30, d5                \n\t"
        "vmlal.u8   q12, d30, d4                \n\t"
        "vshr.u16   q8, q14, #5                 \n\t"
        "vshr.u16   q9, q13, #6                 \n\t"
        "vaddhn.u16 d6, q14, q8                 \n\t"
        "vshr.u16   q8, q12, #5                 \n\t"
        "vaddhn.u16 d5, q13, q9                 \n\t"
        "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
        "vaddhn.u16 d4, q12, q8                 \n\t"
        // intentionally don't calculate alpha
        // result in d4-d6

        "vqadd.u8   d5, d5, d1                  \n\t"
        "vqadd.u8   d4, d4, d2                  \n\t"

        // pack 8888 {d4-d6} to 0565 q10
        "vshll.u8   q10, d6, #8                 \n\t"
        "vshll.u8   q3, d5, #8                  \n\t"
        "vshll.u8   q2, d4, #8                  \n\t"
        "vsri.u16   q10, q3, #5                 \n\t"
        "vsri.u16   q10, q2, #11                \n\t"

        // store
        "tst        %[count], #4                \n\t"
        "beq        24f                         \n\t"
        "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

        "24:                                    \n\t"
        "tst        %[count], #2                \n\t"
        "beq        22f                         \n\t"
        "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

        "22:                                    \n\t"
        "tst        %[count], #1                \n\t"
        "beq        21f                         \n\t"
        "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

        "21:                                    \n\t"
        : [count] "+r" (count)
        : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
          "d30","d31"
        );
    }
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
    /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
    "add        %[alpha], %[alpha], #1          \n\t"   // adjust range of alpha 0-256
#else
    "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
    "vmov.u16   q3, #255                        \n\t"   // set up constant
    "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
    "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
    "beq        2f                              \n\t"   // if count8 == 0, exit
    "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

    "1:                                         \n\t"
    "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
    "subs       r4, r4, #1                      \n\t"   // decrement loop counter
    "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                                                        //  and deinterleave

    "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
    "vand       q10, q0, q15                    \n\t"   // extract blue
    "vshr.u16   q8, q0, #11                     \n\t"   // extract red
    "vshr.u16   q9, q9, #10                     \n\t"   // extract green
    // dstrgb = {q8, q9, q10}

    "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
    "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
    "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

    "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
    "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
    "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
    "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
    // srcrgba = {q11, q12, q13, q14}

    "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
    "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
    "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
    "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

    "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
    "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
    // dst_scale = q2

    "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
    "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
    "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
    "vrshr.u16  q11, q11, #8                    \n\t"   // shift down red
    "vrshr.u16  q12, q12, #8                    \n\t"   // shift down green
    "vrshr.u16  q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
    "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
    "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
    "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

    "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
    "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
    "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

    "bne        1b                              \n\t"   // if counter != 0, loop
    "2:                                         \n\t"   // exit

    : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
    :
    : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
      "d28", "d29", "d30", "d31"
    );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
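            // The shift/add/sub/shift sequence around this point dithers each
            // channel as (c + d - (c >> 5)) >> 3 for red/blue and
            // (c + (d >> 1) - (c >> 6)) >> 2 for green (d = dither value),
            // mirroring the scalar SkDITHER_*32To565() path used for the
            // leftovers at the bottom of this function.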

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
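        /* net effect of the next few steps, per byte lane:
         *   dst = src + ((dst * (256 - src_alpha)) >> 8) */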
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        /* run them N at a time through the NEON unit */
        /* note that each 1 is 4 bytes, each treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift
         * we're limited at 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL  2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop it's constantly calculating src+bias, dst+bias and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...
         */

#if UNROLL == 2
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i=0;i<UNROLL;i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;

                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            /* source is in ABGR */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }

            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);  /* narrowing too */

            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8,11);  /* clearing hi bits */

            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* combine */
            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad=0;
                static int invocation;

                for (i=0;i<UNROLL;i++)
                    if (tmpbuf[i] != dst[i]) bad=1;
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i=0;i<UNROLL;i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* cop out */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sb = d2;
#endif
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;  // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,           // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,                           // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,                           // S32_Opaque,
    S32_Blend_BlitRow32_neon,       // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon               // S32A_Blend
};