1 /* 2 * Copyright 2009 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9 #include "SkBlitRow.h" 10 #include "SkBlitMask.h" 11 #include "SkColorPriv.h" 12 #include "SkDither.h" 13 14 #if defined(__ARM_HAVE_NEON) 15 #include <arm_neon.h> 16 #endif 17 18 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 19 static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 20 const SkPMColor* SK_RESTRICT src, int count, 21 U8CPU alpha, int /*x*/, int /*y*/) { 22 SkASSERT(255 == alpha); 23 24 if (count >= 8) { 25 uint16_t* SK_RESTRICT keep_dst; 26 27 asm volatile ( 28 "ands ip, %[count], #7 \n\t" 29 "vmov.u8 d31, #1<<7 \n\t" 30 "vld1.16 {q12}, [%[dst]] \n\t" 31 "vld4.8 {d0-d3}, [%[src]] \n\t" 32 "moveq ip, #8 \n\t" 33 "mov %[keep_dst], %[dst] \n\t" 34 35 "add %[src], %[src], ip, LSL#2 \n\t" 36 "add %[dst], %[dst], ip, LSL#1 \n\t" 37 "subs %[count], %[count], ip \n\t" 38 "b 9f \n\t" 39 // LOOP 40 "2: \n\t" 41 42 "vld1.16 {q12}, [%[dst]]! \n\t" 43 "vld4.8 {d0-d3}, [%[src]]! 
\n\t" 44 "vst1.16 {q10}, [%[keep_dst]] \n\t" 45 "sub %[keep_dst], %[dst], #8*2 \n\t" 46 "subs %[count], %[count], #8 \n\t" 47 "9: \n\t" 48 "pld [%[dst],#32] \n\t" 49 // expand 0565 q12 to 8888 {d4-d7} 50 "vmovn.u16 d4, q12 \n\t" 51 "vshr.u16 q11, q12, #5 \n\t" 52 "vshr.u16 q10, q12, #6+5 \n\t" 53 "vmovn.u16 d5, q11 \n\t" 54 "vmovn.u16 d6, q10 \n\t" 55 "vshl.u8 d4, d4, #3 \n\t" 56 "vshl.u8 d5, d5, #2 \n\t" 57 "vshl.u8 d6, d6, #3 \n\t" 58 59 "vmovl.u8 q14, d31 \n\t" 60 "vmovl.u8 q13, d31 \n\t" 61 "vmovl.u8 q12, d31 \n\t" 62 63 // duplicate in 4/2/1 & 8pix vsns 64 "vmvn.8 d30, d3 \n\t" 65 "vmlal.u8 q14, d30, d6 \n\t" 66 "vmlal.u8 q13, d30, d5 \n\t" 67 "vmlal.u8 q12, d30, d4 \n\t" 68 "vshr.u16 q8, q14, #5 \n\t" 69 "vshr.u16 q9, q13, #6 \n\t" 70 "vaddhn.u16 d6, q14, q8 \n\t" 71 "vshr.u16 q8, q12, #5 \n\t" 72 "vaddhn.u16 d5, q13, q9 \n\t" 73 "vqadd.u8 d6, d6, d0 \n\t" // moved up 74 "vaddhn.u16 d4, q12, q8 \n\t" 75 // intentionally don't calculate alpha 76 // result in d4-d6 77 78 "vqadd.u8 d5, d5, d1 \n\t" 79 "vqadd.u8 d4, d4, d2 \n\t" 80 81 // pack 8888 {d4-d6} to 0565 q10 82 "vshll.u8 q10, d6, #8 \n\t" 83 "vshll.u8 q3, d5, #8 \n\t" 84 "vshll.u8 q2, d4, #8 \n\t" 85 "vsri.u16 q10, q3, #5 \n\t" 86 "vsri.u16 q10, q2, #11 \n\t" 87 88 "bne 2b \n\t" 89 90 "1: \n\t" 91 "vst1.16 {q10}, [%[keep_dst]] \n\t" 92 : [count] "+r" (count) 93 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 94 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 95 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 96 "d30","d31" 97 ); 98 } 99 else 100 { // handle count < 8 101 uint16_t* SK_RESTRICT keep_dst; 102 103 asm volatile ( 104 "vmov.u8 d31, #1<<7 \n\t" 105 "mov %[keep_dst], %[dst] \n\t" 106 107 "tst %[count], #4 \n\t" 108 "beq 14f \n\t" 109 "vld1.16 {d25}, [%[dst]]! \n\t" 110 "vld1.32 {q1}, [%[src]]! \n\t" 111 112 "14: \n\t" 113 "tst %[count], #2 \n\t" 114 "beq 12f \n\t" 115 "vld1.32 {d24[1]}, [%[dst]]! 
\n\t" 116 "vld1.32 {d1}, [%[src]]! \n\t" 117 118 "12: \n\t" 119 "tst %[count], #1 \n\t" 120 "beq 11f \n\t" 121 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 122 "vld1.32 {d0[1]}, [%[src]]! \n\t" 123 124 "11: \n\t" 125 // unzips achieve the same as a vld4 operation 126 "vuzpq.u16 q0, q1 \n\t" 127 "vuzp.u8 d0, d1 \n\t" 128 "vuzp.u8 d2, d3 \n\t" 129 // expand 0565 q12 to 8888 {d4-d7} 130 "vmovn.u16 d4, q12 \n\t" 131 "vshr.u16 q11, q12, #5 \n\t" 132 "vshr.u16 q10, q12, #6+5 \n\t" 133 "vmovn.u16 d5, q11 \n\t" 134 "vmovn.u16 d6, q10 \n\t" 135 "vshl.u8 d4, d4, #3 \n\t" 136 "vshl.u8 d5, d5, #2 \n\t" 137 "vshl.u8 d6, d6, #3 \n\t" 138 139 "vmovl.u8 q14, d31 \n\t" 140 "vmovl.u8 q13, d31 \n\t" 141 "vmovl.u8 q12, d31 \n\t" 142 143 // duplicate in 4/2/1 & 8pix vsns 144 "vmvn.8 d30, d3 \n\t" 145 "vmlal.u8 q14, d30, d6 \n\t" 146 "vmlal.u8 q13, d30, d5 \n\t" 147 "vmlal.u8 q12, d30, d4 \n\t" 148 "vshr.u16 q8, q14, #5 \n\t" 149 "vshr.u16 q9, q13, #6 \n\t" 150 "vaddhn.u16 d6, q14, q8 \n\t" 151 "vshr.u16 q8, q12, #5 \n\t" 152 "vaddhn.u16 d5, q13, q9 \n\t" 153 "vqadd.u8 d6, d6, d0 \n\t" // moved up 154 "vaddhn.u16 d4, q12, q8 \n\t" 155 // intentionally don't calculate alpha 156 // result in d4-d6 157 158 "vqadd.u8 d5, d5, d1 \n\t" 159 "vqadd.u8 d4, d4, d2 \n\t" 160 161 // pack 8888 {d4-d6} to 0565 q10 162 "vshll.u8 q10, d6, #8 \n\t" 163 "vshll.u8 q3, d5, #8 \n\t" 164 "vshll.u8 q2, d4, #8 \n\t" 165 "vsri.u16 q10, q3, #5 \n\t" 166 "vsri.u16 q10, q2, #11 \n\t" 167 168 // store 169 "tst %[count], #4 \n\t" 170 "beq 24f \n\t" 171 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 172 173 "24: \n\t" 174 "tst %[count], #2 \n\t" 175 "beq 22f \n\t" 176 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 177 178 "22: \n\t" 179 "tst %[count], #1 \n\t" 180 "beq 21f \n\t" 181 "vst1.16 {d20[1]}, [%[keep_dst]]! 
\n\t" 182 183 "21: \n\t" 184 : [count] "+r" (count) 185 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 186 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 187 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 188 "d30","d31" 189 ); 190 } 191 } 192 193 static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 194 const SkPMColor* SK_RESTRICT src, int count, 195 U8CPU alpha, int /*x*/, int /*y*/) { 196 197 U8CPU alpha_for_asm = alpha; 198 199 asm volatile ( 200 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 201 * the original in two respects: 202 * 1. The results have a few mismatches compared to the original code. These mismatches 203 * never exceed 1. It's possible to improve accuracy vs. a floating point 204 * implementation by introducing rounding right shifts (vrshr) for the final stage. 205 * Rounding is not present in the code below, because although results would be closer 206 * to a floating point implementation, the number of mismatches compared to the 207 * original code would be far greater. 208 * 2. On certain inputs, the original code can overflow, causing colour channels to 209 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 210 * to affect another. 211 */ 212 213 #if 1 214 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 215 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 216 #else 217 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 218 #endif 219 "vmov.u16 q3, #255 \n\t" // set up constant 220 "movs r4, %[count], lsr #3 \n\t" // calc. 
count>>3 221 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 222 "beq 2f \n\t" // if count8 == 0, exit 223 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 224 225 "1: \n\t" 226 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 227 "subs r4, r4, #1 \n\t" // decrement loop counter 228 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 229 // and deinterleave 230 231 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 232 "vand q10, q0, q15 \n\t" // extract blue 233 "vshr.u16 q8, q0, #11 \n\t" // extract red 234 "vshr.u16 q9, q9, #10 \n\t" // extract green 235 // dstrgb = {q8, q9, q10} 236 237 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 238 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 239 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 240 241 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 242 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 243 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 244 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 245 // srcrgba = {q11, q12, q13, q14} 246 247 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 248 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 249 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 250 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 251 252 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 253 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 254 // dst_scale = q2 255 256 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 257 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 258 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 259 260 #if 1 261 // trying for a better match with SkDiv255Round(a) 262 // C alg is: a+=128; (a+a>>8)>>8 263 // we'll use just a rounding shift [q2 is available for scratch] 264 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 265 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 266 
"vrshr.u16 q13, q13, #8 \n\t" // shift down blue 267 #else 268 // arm's original "truncating divide by 256" 269 "vshr.u16 q11, q11, #8 \n\t" // shift down red 270 "vshr.u16 q12, q12, #8 \n\t" // shift down green 271 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 272 #endif 273 274 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 275 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 276 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 277 278 "bne 1b \n\t" // if counter != 0, loop 279 "2: \n\t" // exit 280 281 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 282 : 283 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 284 ); 285 286 count &= 7; 287 if (count > 0) { 288 do { 289 SkPMColor sc = *src++; 290 if (sc) { 291 uint16_t dc = *dst; 292 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 293 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 294 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 295 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 296 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 297 } 298 dst += 1; 299 } while (--count != 0); 300 } 301 } 302 303 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 304 * each dither value is spaced out into byte lanes, and repeated 305 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 306 * start of each row. 
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

/* Blend a row of 32-bit source pixels onto RGB565 with a global alpha,
 * applying an ordered 4x4 dither selected by (x, y).
 *
 * NOTE(review): the NEON loop below has no count >= 8 guard and loops with
 * "subs #8 / bgt", so it always processes a multiple of 8 pixels (rounding
 * count UP).  For count values that are not a multiple of 8 this reads and
 * writes past the row before the scalar tail runs -- confirm callers only
 * pass 8-pixel-multiple widths, otherwise the asm needs a count >= 8 guard
 * and a count >> 3 loop counter.
 */
static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // subtract red src from dst
                  "vsubl.s8       q1, d23, d17                    \n\t"   // subtract green src from dst
                  "vsubl.s8       q2, d24, d18                    \n\t"   // subtract blue src from dst
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply blue by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply green by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    // Scalar tail: after the asm, count is <= 0 and (count & 7) recovers the
    // number of leftover pixels; src/dst were advanced by the asm block.
    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN)
/* ARMv7 (non-NEON) scalar path for S32A over RGB565: one pixel per
 * iteration, with fast branches for fully-opaque (>= 0xff000000) and fully
 * transparent (alpha byte == 0) source pixels.
 *
 * NOTE(review): the "pld [r1]" / "pld [r0]" prefetches assume src/dst are
 * still in r1/r0 (AAPCS argument registers); the %[src]/%[dst] operands are
 * not guaranteed those registers, so the prefetch address may be wrong
 * (harmless but ineffective) -- confirm, or use %[src]/%[dst] in the pld.
 * NOTE(review): the loop decrements before testing, so count == 0 would
 * wrap -- presumably callers guarantee count > 0; verify.
 */
static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src, int count,
                                U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    asm volatile (
                  "1:                                     \n\t"
                  "ldr     r3, [%[src]], #4               \n\t"
                  "cmp     r3, #0xff000000                \n\t"
                  "blo     2f                             \n\t"
                  "and     r4, r3, #0x0000f8              \n\t"
                  "and     r5, r3, #0x00fc00              \n\t"
                  "and     r6, r3, #0xf80000              \n\t"
                  "pld     [r1, #32]                      \n\t"
                  "lsl     r3, r4, #8                     \n\t"
                  "orr     r3, r3, r5, lsr #5             \n\t"
                  "orr     r3, r3, r6, lsr #19            \n\t"
                  "subs    %[count], %[count], #1         \n\t"
                  "strh    r3, [%[dst]], #2               \n\t"
                  "bne     1b                             \n\t"
                  "b       4f                             \n\t"
                  "2:                                     \n\t"
                  "lsrs    r7, r3, #24                    \n\t"
                  "beq     3f                             \n\t"
                  "ldrh    r4, [%[dst]]                   \n\t"
                  "rsb     r7, r7, #255                   \n\t"
                  "and     r6, r4, #0x001f                \n\t"
                  "ubfx    r5, r4, #5, #6                 \n\t"
                  "pld     [r0, #16]                      \n\t"
                  "lsr     r4, r4, #11                    \n\t"
                  "smulbb  r6, r6, r7                     \n\t"
                  "smulbb  r5, r5, r7                     \n\t"
                  "smulbb  r4, r4, r7                     \n\t"
                  "ubfx    r7, r3, #16, #8                \n\t"
                  "ubfx    ip, r3, #8, #8                 \n\t"
                  "and     r3, r3, #0xff                  \n\t"
                  "add     r6, r6, #16                    \n\t"
                  "add     r5, r5, #32                    \n\t"
                  "add     r4, r4, #16                    \n\t"
                  "add     r6, r6, r6, lsr #5             \n\t"
                  "add     r5, r5, r5, lsr #6             \n\t"
                  "add     r4, r4, r4, lsr #5             \n\t"
                  "add     r6, r7, r6, lsr #5             \n\t"
                  "add     r5, ip, r5, lsr #6             \n\t"
                  "add     r4, r3, r4, lsr #5             \n\t"
                  "lsr     r6, r6, #3                     \n\t"
                  "and     r5, r5, #0xfc                  \n\t"
                  "and     r4, r4, #0xf8                  \n\t"
                  "orr     r6, r6, r5, lsl #3             \n\t"
                  "orr     r4, r6, r4, lsl #8             \n\t"
                  "strh    r4, [%[dst]], #2               \n\t"
                  "pld     [r1, #32]                      \n\t"
                  "subs    %[count], %[count], #1         \n\t"
                  "bne     1b                             \n\t"
                  "b       4f                             \n\t"
                  "3:                                     \n\t"
                  "subs    %[count], %[count], #1         \n\t"
                  "add     %[dst], %[dst], #2             \n\t"
                  "bne     1b                             \n\t"
                  "4:                                     \n\t"
                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
                  :
                  : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
                  );
}
#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_v7
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* Don't have a special version that assumes each src is opaque, but our S32A
   is still faster than the default, so use it here
 */
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)

/* S32A-over-S32 blit with per-pixel alpha prediction: a goto-based state
 * machine that stays in a specialized loop (all-transparent copy-skip,
 * all-opaque memcpy-style, or general NEON blend) for as long as the source
 * pixels keep matching that state, switching states when the prediction
 * fails.  Processes UNROLL (4) pixels at a time; the TAIL loop handles the
 * last few pixels one at a time.
 */
static void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
                                                  const SkPMColor* SK_RESTRICT src,
                                                  int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    // src_end is (UNROLL + 1) short of the true end so the 4-way loops can
    // safely peek at src[0..3]; TAIL adds it back before the scalar loop.
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    // General src-over blend, 2 pixels per NEON op, 4 per iteration.
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
      we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    // All four alphas == 255: straight copy, 4 pixels at a time.
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }
    return;
}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon_test_alpha

#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

/* Straightforward NEON src-over blit: 4 pixels per iteration (2 per 64-bit
 * NEON op), scalar SkPMSrcOver tail for the remainder.  The vtbl with
 * {3,3,3,3,7,7,7,7} broadcasts each pixel's alpha byte across its 4 lanes.
 */
static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

        /* 1st and 2nd bits of the unrolling */
        {
            uint8x8_t dst_cooked;
            uint16x8_t dst_wide;
            uint8x8_t alpha_narrow;
            uint16x8_t alpha_wide;

            /* get the alphas spread out properly */
            alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
            /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
            /* we collapsed (255-a)+1 ... */
            alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
            alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
            alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

            /* spread the dest */
            dst_wide = vmovl_u8(dst_raw);

            /* alpha mul the dest */
            dst_wide = vmulq_u16 (dst_wide, alpha_wide);
            dst_cooked = vshrn_n_u16(dst_wide, 8);

            /* sum -- ignoring any byte lane overflows */
            dst_final = vadd_u8(src_raw, dst_cooked);
        }

#if UNROLL > 2
        /* the 3rd and 4th bits of our unrolling */
        {
            uint8x8_t dst_cooked;
            uint16x8_t dst_wide;
            uint8x8_t alpha_narrow;
            uint16x8_t alpha_wide;

            alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
            /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
            /* we collapsed (255-a)+1 ... */
            alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
            alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
            alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

            /* spread the dest */
            dst_wide = vmovl_u8(dst_raw_2);

            /* alpha mul the dest */
            dst_wide = vmulq_u16 (dst_wide, alpha_wide);
            dst_cooked = vshrn_n_u16(dst_wide, 8);

            /* sum -- ignoring any byte lane overflows */
            dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
        }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef  UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon

#elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... 
*/

#if defined(TEST_SRC_ALPHA)

/* Naked-function ARM (non-NEON) src-over blit with alpha prediction: a
 * dispatcher classifies each 4-pixel block as all-255, all-0, or mixed, and
 * control stays inside the matching specialized loop while the prediction
 * holds.  Being __attribute__((naked)), the compiler emits no prologue or
 * epilogue: the asm saves/restores r4-r12, lr itself and returns with BX lr.
 *
 * NOTE(review): since the function is naked, the operand constraints and
 * clobber list are largely moot -- the code relies on dst/src/count arriving
 * in r0/r1/r2 per the AAPCS; confirm this holds for all build configurations.
 */
static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha
                                        (SkPMColor* SK_RESTRICT dst,
                                         const SkPMColor* SK_RESTRICT src,
                                         int count, U8CPU alpha) {

    /* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
    /* Predicts that the next pixel will have the same alpha type as the current pixel */

    asm volatile (

        "\tSTMDB  r13!, {r4-r12, r14}        \n" /* saving r4-r12, lr on the stack */
                                                 /* we should not save r0-r3 according to ABI */

        "\tCMP    r2, #0                     \n" /* if (count == 0) */
        "\tBEQ    9f                         \n" /* go to EXIT */

        "\tMOV    r12, #0xff                 \n" /* load the 0xff mask in r12 */
        "\tORR    r12, r12, r12, LSL #16     \n" /* convert it to 0xff00ff in r12 */

        "\tMOV    r14, #255                  \n" /* r14 = 255 */
                                                 /* will be used later for left-side comparison */

        "\tADD    r2, %[src], r2, LSL #2     \n" /* r2 points to last array element which can be used */
        "\tSUB    r2, r2, #16                \n" /* as a base for 4-way processing algorithm */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer is bigger than */
        "\tBGT    8f                         \n" /* calculated marker for 4-way -> */
                                                 /* use simple one-by-one processing */

        /* START OF DISPATCHING BLOCK */

        "\t0:                                \n"

        "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

        "\tLSR    r7, r3, #24                \n" /* if not all src alphas of 4-way block are equal -> */
        "\tCMP    r7, r4, LSR #24            \n"
        "\tCMPEQ  r7, r5, LSR #24            \n"
        "\tCMPEQ  r7, r6, LSR #24            \n"
        "\tBNE    1f                         \n" /* -> go to general 4-way processing routine */

        "\tCMP    r14, r7                    \n" /* if all src alphas are equal to 255 */
        "\tBEQ    3f                         \n" /* go to alpha == 255 optimized routine */

        "\tCMP    r7,  #0                    \n" /* if all src alphas are equal to 0 */
        "\tBEQ    6f                         \n" /* go to alpha == 0 optimized routine */

        /* END OF DISPATCHING BLOCK */

        /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */

        "\t1:                                \n"
                                                 /* we do not have enough registers to make */
                                                 /* 4-way [dst] loading -> we are using 2 * 2-way */

        "\tLDM    %[dst], {r7, r8}           \n" /* 1st 2-way loading of dst values to r7-r8 */

        /* PROCESSING BLOCK 1 */
        /* r3 = src, r7 = dst */

        "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source and storing to r11 */
        "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
        "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
        "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
        "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
        "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
        "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
        "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
        "\tORR    r7,  r9,  r10              \n" /* br | ag */
        "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */

        /* PROCESSING BLOCK 2 */
        /* r4 = src, r8 = dst */

        "\tLSR    r11, r4,  #24              \n" /* see PROCESSING BLOCK 1 */
        "\tAND    r9,  r12, r8               \n"
        "\tRSB    r11, r11, #256             \n"
        "\tAND    r10, r12, r8, LSR #8       \n"
        "\tMUL    r9,  r9,  r11              \n"
        "\tAND    r9,  r12, r9, LSR #8       \n"
        "\tMUL    r10, r10, r11              \n"
        "\tAND    r10, r10, r12, LSL #8      \n"
        "\tORR    r8,  r9,  r10              \n"
        "\tADD    r8,  r4,  r8               \n"

        "\tSTM    %[dst]!, {r7, r8}          \n" /* 1st 2-way storing of processed dst values */

        "\tLDM    %[dst], {r9, r10}          \n" /* 2nd 2-way loading of dst values to r9-r10 */

        /* PROCESSING BLOCK 3 */
        /* r5 = src, r9 = dst */

        "\tLSR    r11, r5,  #24              \n" /* see PROCESSING BLOCK 1 */
        "\tAND    r7,  r12, r9               \n"
        "\tRSB    r11, r11, #256             \n"
        "\tAND    r8,  r12, r9, LSR #8       \n"
        "\tMUL    r7,  r7,  r11              \n"
        "\tAND    r7,  r12, r7, LSR #8       \n"
        "\tMUL    r8,  r8,  r11              \n"
        "\tAND    r8,  r8,  r12, LSL #8      \n"
        "\tORR    r9,  r7,  r8               \n"
        "\tADD    r9,  r5,  r9               \n"

        /* PROCESSING BLOCK 4 */
        /* r6 = src, r10 = dst */

        "\tLSR    r11, r6,  #24              \n" /* see PROCESSING BLOCK 1 */
        "\tAND    r7,  r12, r10              \n"
        "\tRSB    r11, r11, #256             \n"
        "\tAND    r8,  r12, r10, LSR #8      \n"
        "\tMUL    r7,  r7,  r11              \n"
        "\tAND    r7,  r12, r7, LSR #8       \n"
        "\tMUL    r8,  r8,  r11              \n"
        "\tAND    r8,  r8,  r12, LSL #8      \n"
        "\tORR    r10, r7,  r8               \n"
        "\tADD    r10, r6,  r10              \n"

        "\tSTM    %[dst]!, {r9, r10}         \n" /* 2nd 2-way storing of processed dst values */

        "\tCMP    %[src], r2                 \n" /* if our current [src] pointer <= calculated marker */
        "\tBLE    0b                         \n" /* we could run 4-way processing -> go to dispatcher */
        "\tBGT    8f                         \n" /* else -> use simple one-by-one processing */

        /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */

        /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */

        "\t2:                                \n" /* ENTRY 1: LOADING [src] to registers */

        "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

        "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
        "\tAND    r8, r5, r6                 \n"
        "\tAND    r9, r7, r8                 \n"
        "\tCMP    r14, r9, LSR #24           \n"
        "\tBNE    4f                         \n" /* -> go to alpha == 0 check */

        "\t3:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */

        "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
        "\tBLE    2b                         \n" /* we could run 4-way processing */
                                                 /* because now we're in ALPHA == 255 state */
                                                 /* run next cycle with priority alpha == 255 checks */

        "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                                 /* use simple one-by-one processing */

        "\t4:                                \n"

        "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
        "\tORR    r8, r5, r6                 \n"
        "\tORR    r9, r7, r8                 \n"
        "\tLSRS   r9, #24                    \n"
        "\tBNE    1b                         \n" /* -> go to general processing mode */
                                                 /* (we already checked for alpha == 255) */

        "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
        "\tBLE    5f                         \n" /* we could run 4-way processing one more time */
                                                 /* because now we're in ALPHA == 0 state */
                                                 /* run next cycle with priority alpha == 0 checks */

        "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                                 /* use simple one-by-one processing */

        /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */

        /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */

        "\t5:                                \n" /* ENTRY 1: LOADING [src] to registers */

        "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

        "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
        "\tORR    r8, r5, r6                 \n"
        "\tORR    r9, r7, r8                 \n"
        "\tLSRS   r9, #24                    \n"
        "\tBNE    7f                         \n" /* -> go to alpha == 255 check */

        "\t6:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */

        "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
        "\tBLE    5b                         \n" /* we could run 4-way processing one more time */
                                                 /* because now we're in ALPHA == 0 state */
                                                 /* run next cycle with priority alpha == 0 checks */

        "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                                 /* use simple one-by-one processing */
        "\t7:                                \n"

        "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
        "\tAND    r8, r5, r6                 \n"
        "\tAND    r9, r7, r8                 \n"
        "\tCMP    r14, r9, LSR #24           \n"
        "\tBNE    1b                         \n" /* -> go to general processing mode */
                                                 /* (we already checked for alpha == 0) */

        "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
        "\tBLE    2b                         \n" /* we could run 4-way processing one more time */
                                                 /* because now we're in ALPHA == 255 state */
                                                 /* run next cycle with priority alpha == 255 checks */

        "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                                 /* use simple one-by-one processing */

        /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */

        /* START OF TAIL BLOCK */
        /* (used when array is too small to be processed with 4-way algorithm)*/

        "\t8:                                \n"

        "\tADD    r2, r2, #16                \n" /* now r2 points to the element just after array */
                                                 /* we've done r2 = r2 - 16 at procedure start */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
        "\tBEQ    9f                         \n" /* goto EXIT */

        /* TAIL PROCESSING BLOCK 1 */

        "\tLDR    r3, [%[src]], #4           \n" /* r3 = *src, src++ */
        "\tLDR    r7, [%[dst]]               \n" /* r7 = *dst */

        "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source */
        "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
        "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
        "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
        "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
        "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
        "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
        "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
        "\tORR    r7,  r9,  r10              \n" /* br | ag */
        "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */

        "\tSTR    r7, [%[dst]], #4           \n" /* *dst = r7; dst++ */

        "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
        "\tBEQ    9f                         \n" /* goto EXIT */

        /* TAIL PROCESSING BLOCK 2 */

        "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
        "\tLDR    r7, [%[dst]]               \n"

        "\tLSR    r11, r3,  #24              \n"
        "\tAND    r9,  r12, r7               \n"
        "\tRSB    r11, r11, #256             \n"
        "\tAND    r10, r12, r7, LSR #8       \n"
        "\tMUL    r9,  r9,  r11              \n"
        "\tAND    r9,  r12, r9, LSR #8       \n"
        "\tMUL    r10, r10, r11              \n"
        "\tAND    r10, r10, r12, LSL #8      \n"
        "\tORR    r7,  r9,  r10              \n"
        "\tADD    r7,  r3,  r7               \n"

        "\tSTR    r7, [%[dst]], #4           \n"

        "\tCMP    %[src], r2                 \n"
        "\tBEQ    9f                         \n"

        /* TAIL PROCESSING BLOCK 3 */

        "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
        "\tLDR    r7, [%[dst]]               \n"

        "\tLSR    r11, r3,  #24              \n"
        "\tAND    r9,  r12, r7               \n"
        "\tRSB    r11, r11, #256             \n"
        "\tAND    r10, r12, r7, LSR #8       \n"
        "\tMUL    r9,  r9,  r11              \n"
        "\tAND    r9,  r12, r9, LSR #8       \n"
        "\tMUL    r10, r10, r11              \n"
        "\tAND    r10, r10, r12, LSL #8      \n"
        "\tORR    r7,  r9,  r10              \n"
        "\tADD    r7,  r3,  r7               \n"

        "\tSTR    r7, [%[dst]], #4           \n"

        /* END OF TAIL BLOCK */

        "\t9:                                \n" /* EXIT */

        "\tLDMIA  r13!, {r4-r12, r14}        \n" /* restoring r4-r12, lr from stack */
        "\tBX     lr                         \n" /* return */

        : [dst] "+r" (dst), [src] "+r" (src)
        :
        : "cc", "r2", "r3", "memory"

        );

}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_arm_test_alpha
#else /* !defined(TEST_SRC_ALPHA) */

/* Plain ARM src-over blit (no per-pixel alpha fast paths): double loop for
 * pairs of pixels, single loop for the odd remainder.
 */
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    /* Does not support the TEST_SRC_ALPHA case */
    asm volatile (
                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
                  "beq    3f                         \n\t" /* if zero exit */

                  "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
                  "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */

                  "cmp    %[count], #2               \n\t" /* compare count with 2 */
                  "blt    2f                         \n\t" /* if less than 2 -> single loop */

                  /* Double Loop */
                  "1:                                \n\t" /* <double loop> */
                  "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
                  "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */

                  /* ----------- */
                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */

                  "mul    r9, r9, r4
\n\t" /* br = br * scale */ 1126 "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1127 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 1128 1129 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 1130 "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 1131 "orr r7, r9, r10 \n\t" /* br | ag*/ 1132 1133 "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */ 1134 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */ 1135 1136 /* ----------- */ 1137 "and r9, ip, r8 \n\t" /* r9 = br masked by ip */ 1138 1139 "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */ 1140 "mul r9, r9, r4 \n\t" /* br = br * scale */ 1141 "sub %[count], %[count], #2 \n\t" 1142 "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1143 1144 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 1145 "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 1146 "cmp %[count], #1 \n\t" /* comparing count with 1 */ 1147 "orr r8, r9, r10 \n\t" /* br | ag */ 1148 1149 "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */ 1150 1151 /* ----------------- */ 1152 "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */ 1153 /* ----------------- */ 1154 1155 "bgt 1b \n\t" /* if greater than 1 -> reloop */ 1156 "blt 3f \n\t" /* if less than 1 -> exit */ 1157 1158 /* Single Loop */ 1159 "2: \n\t" /* <single loop> */ 1160 "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */ 1161 "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */ 1162 "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 1163 1164 /* ----------- */ 1165 "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ 1166 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ 1167 1168 "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ 1169 "mul r9, r9, r4 \n\t" /* br = br * scale */ 1170 "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1171 "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 
and mask it */ 1172 1173 "and r10, r10, ip, lsl #8 \n\t" /* mask ag */ 1174 "orr r7, r9, r10 \n\t" /* br | ag */ 1175 1176 "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */ 1177 1178 /* ----------------- */ 1179 "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */ 1180 /* ----------------- */ 1181 1182 "3: \n\t" /* <exit> */ 1183 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count) 1184 : 1185 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory" 1186 ); 1187 } 1188 #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm 1189 #endif /* !defined(TEST_SRC_ALPHA) */ 1190 #else /* ... #elif defined (__ARM_ARCH__) */ 1191 #define S32A_Opaque_BlitRow32_PROC NULL 1192 #endif 1193 1194 /* 1195 * ARM asm version of S32A_Blend_BlitRow32 1196 */ 1197 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, 1198 const SkPMColor* SK_RESTRICT src, 1199 int count, U8CPU alpha) { 1200 asm volatile ( 1201 "cmp %[count], #0 \n\t" /* comparing count with 0 */ 1202 "beq 3f \n\t" /* if zero exit */ 1203 1204 "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */ 1205 "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */ 1206 1207 /* src1,2_scale */ 1208 "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */ 1209 1210 "cmp %[count], #2 \n\t" /* comparing count with 2 */ 1211 "blt 2f \n\t" /* if less than 2 -> single loop */ 1212 1213 /* Double Loop */ 1214 "1: \n\t" /* <double loop> */ 1215 "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */ 1216 "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */ 1217 1218 /* dst1_scale and dst2_scale*/ 1219 "lsr r9, r5, #24 \n\t" /* src >> 24 */ 1220 "lsr r10, r6, #24 \n\t" /* src >> 24 */ 1221 "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */ 1222 "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */ 1223 "lsr r9, r9, #8 \n\t" /* r9 >> 8 */ 1224 "lsr r10, r10, #8 \n\t" /* r10 >> 8 */ 
1225 "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */ 1226 "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */ 1227 1228 /* ---------------------- */ 1229 1230 /* src1, src1_scale */ 1231 "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */ 1232 "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */ 1233 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 1234 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 1235 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1236 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 1237 "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */ 1238 1239 /* dst1, dst1_scale */ 1240 "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */ 1241 "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */ 1242 "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */ 1243 "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */ 1244 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1245 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 1246 "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */ 1247 1248 /* ---------------------- */ 1249 "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */ 1250 /* ---------------------- */ 1251 1252 /* ====================== */ 1253 1254 /* src2, src2_scale */ 1255 "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */ 1256 "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */ 1257 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 1258 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 1259 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1260 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 1261 "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */ 1262 1263 /* dst2, dst2_scale */ 1264 "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */ 1265 "and r4, r12, r8 
\n\t" /* rb = r4 = r8 masked by r12 */ 1266 "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */ 1267 "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */ 1268 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1269 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 1270 "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */ 1271 1272 "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */ 1273 /* ---------------------- */ 1274 "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */ 1275 /* ---------------------- */ 1276 "cmp %[count], #1 \n\t" /* compare count with 1 */ 1277 /* ----------------- */ 1278 "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */ 1279 /* ----------------- */ 1280 1281 "bgt 1b \n\t" /* if %[count] greater than 1 reloop */ 1282 "blt 3f \n\t" /* if %[count] less than 1 exit */ 1283 /* else get into the single loop */ 1284 /* Single Loop */ 1285 "2: \n\t" /* <single loop> */ 1286 "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */ 1287 "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */ 1288 1289 "lsr r6, r5, #24 \n\t" /* src >> 24 */ 1290 "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */ 1291 "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */ 1292 "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */ 1293 "lsr r6, r6, #8 \n\t" /* r6 >> 8 */ 1294 "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */ 1295 "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */ 1296 1297 /* src, src_scale */ 1298 "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */ 1299 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1300 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ 1301 "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */ 1302 1303 /* dst, dst_scale */ 1304 "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */ 1305 "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by 
r12 */ 1306 "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */ 1307 "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */ 1308 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 1309 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ 1310 "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */ 1311 1312 "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */ 1313 1314 /* ----------------- */ 1315 "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */ 1316 /* ----------------- */ 1317 1318 "3: \n\t" /* <exit> */ 1319 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha) 1320 : 1321 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory" 1322 ); 1323 1324 } 1325 #define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm 1326 1327 /* Neon version of S32_Blend_BlitRow32() 1328 * portable version is in src/core/SkBlitRow_D32.cpp 1329 */ 1330 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 1331 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 1332 const SkPMColor* SK_RESTRICT src, 1333 int count, U8CPU alpha) { 1334 SkASSERT(alpha <= 255); 1335 if (count > 0) { 1336 uint16_t src_scale = SkAlpha255To256(alpha); 1337 uint16_t dst_scale = 256 - src_scale; 1338 1339 /* run them N at a time through the NEON unit */ 1340 /* note that each 1 is 4 bytes, each treated exactly the same, 1341 * so we can work under that guise. We *do* know that the src&dst 1342 * will be 32-bit aligned quantities, so we can specify that on 1343 * the load/store ops and do a neon 'reinterpret' to get us to 1344 * byte-sized (pun intended) pieces that we widen/multiply/shift 1345 * we're limited at 128 bits in the wide ops, which is 8x16bits 1346 * or a pair of 32 bit src/dsts. 1347 */ 1348 /* we *could* manually unroll this loop so that we load 128 bits 1349 * (as a pair of 64s) from each of src and dst, processing them 1350 * in pieces. 
This might give us a little better management of 1351 * the memory latency, but my initial attempts here did not 1352 * produce an instruction stream that looked all that nice. 1353 */ 1354 #define UNROLL 2 1355 while (count >= UNROLL) { 1356 uint8x8_t src_raw, dst_raw, dst_final; 1357 uint16x8_t src_wide, dst_wide; 1358 1359 /* get 64 bits of src, widen it, multiply by src_scale */ 1360 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 1361 src_wide = vmovl_u8(src_raw); 1362 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 1363 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 1364 1365 /* ditto with dst */ 1366 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 1367 dst_wide = vmovl_u8(dst_raw); 1368 1369 /* combine add with dst multiply into mul-accumulate */ 1370 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 1371 1372 dst_final = vshrn_n_u16(dst_wide, 8); 1373 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 1374 1375 src += UNROLL; 1376 dst += UNROLL; 1377 count -= UNROLL; 1378 } 1379 /* RBE: well, i don't like how gcc manages src/dst across the above 1380 * loop it's constantly calculating src+bias, dst+bias and it only 1381 * adjusts the real ones when we leave the loop. Not sure why 1382 * it's "hoisting down" (hoisting implies above in my lexicon ;)) 1383 * the adjustments to src/dst/count, but it does... 1384 * (might be SSA-style internal logic... 
1385 */ 1386 1387 #if UNROLL == 2 1388 if (count == 1) { 1389 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1390 } 1391 #else 1392 if (count > 0) { 1393 do { 1394 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1395 src += 1; 1396 dst += 1; 1397 } while (--count > 0); 1398 } 1399 #endif 1400 1401 #undef UNROLL 1402 } 1403 } 1404 1405 #define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon 1406 #else 1407 #define S32_Blend_BlitRow32_PROC NULL 1408 #endif 1409 1410 /////////////////////////////////////////////////////////////////////////////// 1411 1412 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 1413 1414 #undef DEBUG_OPAQUE_DITHER 1415 1416 #if defined(DEBUG_OPAQUE_DITHER) 1417 static void showme8(char *str, void *p, int len) 1418 { 1419 static char buf[256]; 1420 char tbuf[32]; 1421 int i; 1422 char *pc = (char*) p; 1423 sprintf(buf,"%8s:", str); 1424 for(i=0;i<len;i++) { 1425 sprintf(tbuf, " %02x", pc[i]); 1426 strcat(buf, tbuf); 1427 } 1428 SkDebugf("%s\n", buf); 1429 } 1430 static void showme16(char *str, void *p, int len) 1431 { 1432 static char buf[256]; 1433 char tbuf[32]; 1434 int i; 1435 uint16_t *pc = (uint16_t*) p; 1436 sprintf(buf,"%8s:", str); 1437 len = (len / sizeof(uint16_t)); /* passed as bytes */ 1438 for(i=0;i<len;i++) { 1439 sprintf(tbuf, " %04x", pc[i]); 1440 strcat(buf, tbuf); 1441 } 1442 SkDebugf("%s\n", buf); 1443 } 1444 #endif 1445 1446 static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 1447 const SkPMColor* SK_RESTRICT src, 1448 int count, U8CPU alpha, int x, int y) { 1449 SkASSERT(255 == alpha); 1450 1451 #define UNROLL 8 1452 1453 if (count >= UNROLL) { 1454 uint8x8_t dbase; 1455 1456 #if defined(DEBUG_OPAQUE_DITHER) 1457 uint16_t tmpbuf[UNROLL]; 1458 int td[UNROLL]; 1459 int tdv[UNROLL]; 1460 int ta[UNROLL]; 1461 int tap[UNROLL]; 1462 uint16_t in_dst[UNROLL]; 1463 int offset = 0; 1464 int noisy = 0; 1465 #endif 1466 1467 const uint8_t *dstart = 
&gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1468 dbase = vld1_u8(dstart); 1469 1470 do { 1471 uint8x8_t sr, sg, sb, sa, d; 1472 uint16x8_t dst8, scale8, alpha8; 1473 uint16x8_t dst_r, dst_g, dst_b; 1474 1475 #if defined(DEBUG_OPAQUE_DITHER) 1476 /* calculate 8 elements worth into a temp buffer */ 1477 { 1478 int my_y = y; 1479 int my_x = x; 1480 SkPMColor* my_src = (SkPMColor*)src; 1481 uint16_t* my_dst = dst; 1482 int i; 1483 1484 DITHER_565_SCAN(my_y); 1485 for(i=0;i<UNROLL;i++) { 1486 SkPMColor c = *my_src++; 1487 SkPMColorAssert(c); 1488 if (c) { 1489 unsigned a = SkGetPackedA32(c); 1490 1491 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1492 tdv[i] = DITHER_VALUE(my_x); 1493 ta[i] = a; 1494 tap[i] = SkAlpha255To256(a); 1495 td[i] = d; 1496 1497 unsigned sr = SkGetPackedR32(c); 1498 unsigned sg = SkGetPackedG32(c); 1499 unsigned sb = SkGetPackedB32(c); 1500 sr = SkDITHER_R32_FOR_565(sr, d); 1501 sg = SkDITHER_G32_FOR_565(sg, d); 1502 sb = SkDITHER_B32_FOR_565(sb, d); 1503 1504 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1505 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 1506 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1507 // now src and dst expanded are in g:11 r:10 x:1 b:10 1508 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1509 td[i] = d; 1510 1511 } else { 1512 tmpbuf[i] = *my_dst; 1513 ta[i] = tdv[i] = td[i] = 0xbeef; 1514 } 1515 in_dst[i] = *my_dst; 1516 my_dst += 1; 1517 DITHER_INC_X(my_x); 1518 } 1519 } 1520 #endif 1521 1522 /* source is in ABGR */ 1523 { 1524 register uint8x8_t d0 asm("d0"); 1525 register uint8x8_t d1 asm("d1"); 1526 register uint8x8_t d2 asm("d2"); 1527 register uint8x8_t d3 asm("d3"); 1528 1529 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1530 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1531 : "r" (src) 1532 ); 1533 sr = d0; sg = d1; sb = d2; sa = d3; 1534 } 1535 1536 /* calculate 'd', which will be 0..7 */ 1537 /* dbase[] is 0..7; alpha is 0..256; 16 
bits suffice */ 1538 #if defined(SK_BUILD_FOR_ANDROID) 1539 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1540 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1541 #else 1542 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 1543 #endif 1544 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 1545 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 1546 1547 /* sr = sr - (sr>>5) + d */ 1548 /* watching for 8-bit overflow. d is 0..7; risky range of 1549 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1550 * safe as long as we do ((sr-sr>>5) + d) */ 1551 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1552 sr = vadd_u8(sr, d); 1553 1554 /* sb = sb - (sb>>5) + d */ 1555 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1556 sb = vadd_u8(sb, d); 1557 1558 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1559 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1560 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1561 1562 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1563 dst8 = vld1q_u16(dst); 1564 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1565 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1566 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1567 1568 /* blend */ 1569 #if 1 1570 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1571 /* originally 255-sa + 1 */ 1572 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1573 #else 1574 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 1575 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1576 #endif 1577 1578 #if 1 1579 /* combine the addq and mul, save 3 insns */ 1580 scale8 = vshrq_n_u16(scale8, 3); 1581 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1582 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1583 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1584 #else 1585 /* known correct, but +3 insns over above */ 1586 scale8 = vshrq_n_u16(scale8, 3); 1587 dst_b = vmulq_u16(dst_b, scale8); 1588 dst_g = vmulq_u16(dst_g, scale8); 1589 dst_r = vmulq_u16(dst_r, scale8); 1590 1591 /* combine */ 1592 /* NB: vshll widens, need to preserve those 
bits */ 1593 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 1594 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 1595 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1596 #endif 1597 1598 /* repack to store */ 1599 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 1600 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1601 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1602 1603 vst1q_u16(dst, dst8); 1604 1605 #if defined(DEBUG_OPAQUE_DITHER) 1606 /* verify my 8 elements match the temp buffer */ 1607 { 1608 int i, bad=0; 1609 static int invocation; 1610 1611 for (i=0;i<UNROLL;i++) 1612 if (tmpbuf[i] != dst[i]) bad=1; 1613 if (bad) { 1614 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1615 invocation, offset); 1616 SkDebugf(" alpha 0x%x\n", alpha); 1617 for (i=0;i<UNROLL;i++) 1618 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1619 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1620 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1621 1622 showme16("alpha8", &alpha8, sizeof(alpha8)); 1623 showme16("scale8", &scale8, sizeof(scale8)); 1624 showme8("d", &d, sizeof(d)); 1625 showme16("dst8", &dst8, sizeof(dst8)); 1626 showme16("dst_b", &dst_b, sizeof(dst_b)); 1627 showme16("dst_g", &dst_g, sizeof(dst_g)); 1628 showme16("dst_r", &dst_r, sizeof(dst_r)); 1629 showme8("sb", &sb, sizeof(sb)); 1630 showme8("sg", &sg, sizeof(sg)); 1631 showme8("sr", &sr, sizeof(sr)); 1632 1633 /* cop out */ 1634 return; 1635 } 1636 offset += UNROLL; 1637 invocation++; 1638 } 1639 #endif 1640 1641 dst += UNROLL; 1642 src += UNROLL; 1643 count -= UNROLL; 1644 /* skip x += UNROLL, since it's unchanged mod-4 */ 1645 } while (count >= UNROLL); 1646 } 1647 #undef UNROLL 1648 1649 /* residuals */ 1650 if (count > 0) { 1651 DITHER_565_SCAN(y); 1652 do { 1653 SkPMColor c = *src++; 1654 SkPMColorAssert(c); 1655 if (c) { 1656 unsigned a = SkGetPackedA32(c); 1657 1658 // dither and alpha are just temporary variables to work-around 
1659 // an ICE in debug. 1660 unsigned dither = DITHER_VALUE(x); 1661 unsigned alpha = SkAlpha255To256(a); 1662 int d = SkAlphaMul(dither, alpha); 1663 1664 unsigned sr = SkGetPackedR32(c); 1665 unsigned sg = SkGetPackedG32(c); 1666 unsigned sb = SkGetPackedB32(c); 1667 sr = SkDITHER_R32_FOR_565(sr, d); 1668 sg = SkDITHER_G32_FOR_565(sg, d); 1669 sb = SkDITHER_B32_FOR_565(sb, d); 1670 1671 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1672 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1673 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1674 // now src and dst expanded are in g:11 r:10 x:1 b:10 1675 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1676 } 1677 dst += 1; 1678 DITHER_INC_X(x); 1679 } while (--count != 0); 1680 } 1681 } 1682 1683 #define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon 1684 #else 1685 #define S32A_D565_Opaque_Dither_PROC NULL 1686 #endif 1687 1688 /////////////////////////////////////////////////////////////////////////////// 1689 1690 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 1691 /* 2009/10/27: RBE says "a work in progress"; debugging says ok; 1692 * speedup untested, but ARM version is 26 insns/iteration and 1693 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) 1694 * which is 10x the native version; that's pure instruction counts, 1695 * not accounting for any instruction or memory latencies. 
1696 */ 1697 1698 #undef DEBUG_S32_OPAQUE_DITHER 1699 1700 static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1701 const SkPMColor* SK_RESTRICT src, 1702 int count, U8CPU alpha, int x, int y) { 1703 SkASSERT(255 == alpha); 1704 1705 #define UNROLL 8 1706 if (count >= UNROLL) { 1707 uint8x8_t d; 1708 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1709 d = vld1_u8(dstart); 1710 1711 while (count >= UNROLL) { 1712 uint8x8_t sr, sg, sb, sa; 1713 uint16x8_t dr, dg, db, da; 1714 uint16x8_t dst8; 1715 1716 /* source is in ABGR ordering (R == lsb) */ 1717 { 1718 register uint8x8_t d0 asm("d0"); 1719 register uint8x8_t d1 asm("d1"); 1720 register uint8x8_t d2 asm("d2"); 1721 register uint8x8_t d3 asm("d3"); 1722 1723 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1724 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1725 : "r" (src) 1726 ); 1727 sr = d0; sg = d1; sb = d2; sa = d3; 1728 } 1729 /* XXX: if we want to prefetch, hide it in the above asm() 1730 * using the gcc __builtin_prefetch(), the prefetch will 1731 * fall to the bottom of the loop -- it won't stick up 1732 * at the top of the loop, just after the vld4. 
1733 */ 1734 1735 /* sr = sr - (sr>>5) + d */ 1736 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1737 dr = vaddl_u8(sr, d); 1738 1739 /* sb = sb - (sb>>5) + d */ 1740 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1741 db = vaddl_u8(sb, d); 1742 1743 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1744 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1745 dg = vaddl_u8(sg, vshr_n_u8(d,1)); 1746 /* XXX: check that the "d>>1" here is hoisted */ 1747 1748 /* pack high bits of each into 565 format (rgb, b is lsb) */ 1749 dst8 = vshrq_n_u16(db, 3); 1750 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1751 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); 1752 1753 /* store it */ 1754 vst1q_u16(dst, dst8); 1755 1756 #if defined(DEBUG_S32_OPAQUE_DITHER) 1757 /* always good to know if we generated good results */ 1758 { 1759 int i, myx = x, myy = y; 1760 DITHER_565_SCAN(myy); 1761 for (i=0;i<UNROLL;i++) { 1762 SkPMColor c = src[i]; 1763 unsigned dither = DITHER_VALUE(myx); 1764 uint16_t val = SkDitherRGB32To565(c, dither); 1765 if (val != dst[i]) { 1766 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 1767 c, dither, val, dst[i], dstart[i]); 1768 } 1769 DITHER_INC_X(myx); 1770 } 1771 } 1772 #endif 1773 1774 dst += UNROLL; 1775 src += UNROLL; 1776 count -= UNROLL; 1777 x += UNROLL; /* probably superfluous */ 1778 } 1779 } 1780 #undef UNROLL 1781 1782 /* residuals */ 1783 if (count > 0) { 1784 DITHER_565_SCAN(y); 1785 do { 1786 SkPMColor c = *src++; 1787 SkPMColorAssert(c); 1788 SkASSERT(SkGetPackedA32(c) == 255); 1789 1790 unsigned dither = DITHER_VALUE(x); 1791 *dst++ = SkDitherRGB32To565(c, dither); 1792 DITHER_INC_X(x); 1793 } while (--count != 0); 1794 } 1795 } 1796 1797 #define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon 1798 #else 1799 #define S32_D565_Opaque_Dither_PROC NULL 1800 #endif 1801 1802 /////////////////////////////////////////////////////////////////////////////// 1803 1804 static const SkBlitRow::Proc platform_565_procs[] = { 1805 // no 
dither 1806 S32_D565_Opaque_PROC, 1807 S32_D565_Blend_PROC, 1808 S32A_D565_Opaque_PROC, 1809 S32A_D565_Blend_PROC, 1810 1811 // dither 1812 S32_D565_Opaque_Dither_PROC, 1813 S32_D565_Blend_Dither_PROC, 1814 S32A_D565_Opaque_Dither_PROC, 1815 NULL, // S32A_D565_Blend_Dither 1816 }; 1817 1818 static const SkBlitRow::Proc platform_4444_procs[] = { 1819 // no dither 1820 NULL, // S32_D4444_Opaque, 1821 NULL, // S32_D4444_Blend, 1822 NULL, // S32A_D4444_Opaque, 1823 NULL, // S32A_D4444_Blend, 1824 1825 // dither 1826 NULL, // S32_D4444_Opaque_Dither, 1827 NULL, // S32_D4444_Blend_Dither, 1828 NULL, // S32A_D4444_Opaque_Dither, 1829 NULL, // S32A_D4444_Blend_Dither 1830 }; 1831 1832 static const SkBlitRow::Proc32 platform_32_procs[] = { 1833 NULL, // S32_Opaque, 1834 S32_Blend_BlitRow32_PROC, // S32_Blend, 1835 S32A_Opaque_BlitRow32_PROC, // S32A_Opaque, 1836 S32A_Blend_BlitRow32_PROC // S32A_Blend 1837 }; 1838 1839 SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { 1840 return platform_4444_procs[flags]; 1841 } 1842 1843 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { 1844 return platform_565_procs[flags]; 1845 } 1846 1847 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { 1848 return platform_32_procs[flags]; 1849 } 1850 1851 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { 1852 return NULL; 1853 } 1854 1855 /////////////////////////////////////////////////////////////////////////////// 1856 1857 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, 1858 SkMask::Format maskFormat, 1859 SkColor color) { 1860 return NULL; 1861 } 1862 1863 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { 1864 return NULL; 1865 } 1866 1867 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, 1868 SkMask::Format maskFormat, 1869 RowFlags flags) { 1870 return NULL; 1871 } 1872