1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7 /* 8 * Modifications done in-house at Motorola 9 * 10 * this is a clone of SkBitmapProcState_matrix.h 11 * and has been tuned to work with the NEON unit. 12 * 13 * Still going back and forth between whether this approach 14 * (clone the entire SkBitmapProcState_matrix.h file or 15 * if I should put just the modified routines in here and 16 * then use a construct like #define DONT_DO_THIS_FUNCTION or 17 * something like that... 18 * 19 * This is for the ClampX_ClampY instance 20 * 21 */ 22 23 24 #include <arm_neon.h> 25 26 /* 27 * This has been modified on the knowledge that (at the time) 28 * we had the following macro definitions in the parent file 29 * 30 * #define MAKENAME(suffix) ClampX_ClampY ## suffix 31 * #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 32 * #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 33 * #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 34 * #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 35 * #define CHECK_FOR_DECAL 36 */ 37 38 /* SkClampMax(val,max) -- bound to 0..max */ 39 40 #define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale_neon) 41 #define SCALE_FILTER_NAME MAKENAME(_filter_scale_neon) 42 #define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine_neon) 43 #define AFFINE_FILTER_NAME MAKENAME(_filter_affine_neon) 44 #define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp_neon) 45 #define PERSP_FILTER_NAME MAKENAME(_filter_persp_neon) 46 47 #define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) 48 #define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) 49 50 #ifndef PREAMBLE 51 #define PREAMBLE(state) 52 #define PREAMBLE_PARAM_X 53 #define PREAMBLE_PARAM_Y 54 #define PREAMBLE_ARG_X 55 #define PREAMBLE_ARG_Y 56 #endif 57 58 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, 59 uint32_t xy[], int count, int x, int y) { 60 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 61 SkMatrix::kScale_Mask)) == 0); 62 63 PREAMBLE(s); 64 // we store y, x, x, x, x, x 65 66 const unsigned maxX = s.fBitmap->width() - 1; 67 SkFixed fx; 68 { 69 SkPoint pt; 70 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 71 SkIntToScalar(y) + SK_ScalarHalf, &pt); 72 fx = SkScalarToFixed(pt.fY); 73 const unsigned maxY = s.fBitmap->height() - 1; 74 *xy++ = TILEY_PROCF(fx, maxY); 75 fx = SkScalarToFixed(pt.fX); 76 } 77 78 if (0 == maxX) { 79 // all of the following X values must be 0 80 memset(xy, 0, count * sizeof(uint16_t)); 81 return; 82 } 83 84 const SkFixed dx = s.fInvSx; 85 86 #ifdef CHECK_FOR_DECAL 87 // test if we don't need to apply the tile proc 88 if ((unsigned)(fx >> 16) <= maxX && 89 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 90 decal_nofilter_scale_neon(xy, fx, dx, count); 91 return; 92 } 93 #endif 94 95 int i; 96 97 /* very much like done in decal_nofilter, but with 98 * an extra clamping function applied. 99 * TILEX_PROCF(fx,max) SkClampMax((fx)>>16, max) 100 */ 101 if (count >= 8) { 102 /* SkFixed is 16.16 fixed point */ 103 SkFixed dx2 = dx+dx; 104 SkFixed dx4 = dx2+dx2; 105 SkFixed dx8 = dx4+dx4; 106 107 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 108 SkFixed fx1, fx2, fx3; 109 int32x2_t lower, upper; 110 int32x4_t lbase, hbase; 111 int16_t *dst16 = (int16_t *)xy; 112 113 fx1 = fx+dx; 114 fx2 = fx1+dx; 115 fx3 = fx2+dx; 116 117 /* build my template(s) */ 118 /* avoid the 'lbase unitialized' warning */ 119 lbase = vdupq_n_s32(fx); 120 lbase = vsetq_lane_s32(fx1, lbase, 1); 121 lbase = vsetq_lane_s32(fx2, lbase, 2); 122 lbase = vsetq_lane_s32(fx3, lbase, 3); 123 124 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 125 126 /* store & bump */ 127 do { 128 int32x4_t lout; 129 int32x4_t hout; 130 int16x8_t hi16; 131 132 /* get the hi 16s of all those 32s */ 133 lout = lbase; 134 hout = hbase; 135 /* this sets up all lout's then all hout's in hout */ 136 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 137 hi16 = vreinterpretq_s16_s32(hout); 138 139 /* clamp & output */ 140 hi16 = vmaxq_s16(hi16, vdupq_n_s16(0)); 141 hi16 = vminq_s16(hi16, vdupq_n_s16(maxX)); 142 vst1q_s16(dst16, hi16); 143 144 /* but preserving base & on to the next */ 145 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 146 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 147 dst16 += 8; 148 count -= 8; 149 fx += dx8; 150 } while (count >= 8); 151 xy = (uint32_t *) dst16; 152 } 153 154 uint16_t* xx = (uint16_t*)xy; 155 for (i = count; i > 0; --i) { 156 *xx++ = TILEX_PROCF(fx, maxX); fx += dx; 157 } 158 } 159 160 // note: we could special-case on a matrix which is skewed in X but not Y. 161 // this would require a more general setup thatn SCALE does, but could use 162 // SCALE's inner loop that only looks at dx 163 164 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, 165 uint32_t xy[], int count, int x, int y) { 166 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 167 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 168 SkMatrix::kScale_Mask | 169 SkMatrix::kAffine_Mask)) == 0); 170 171 PREAMBLE(s); 172 SkPoint srcPt; 173 s.fInvProc(*s.fInvMatrix, 174 SkIntToScalar(x) + SK_ScalarHalf, 175 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 176 177 SkFixed fx = SkScalarToFixed(srcPt.fX); 178 SkFixed fy = SkScalarToFixed(srcPt.fY); 179 SkFixed dx = s.fInvSx; 180 SkFixed dy = s.fInvKy; 181 int maxX = s.fBitmap->width() - 1; 182 int maxY = s.fBitmap->height() - 1; 183 184 /* NEON lets us do an 8x unrolling */ 185 if (count >= 8) { 186 /* SkFixed is 16.16 fixed point */ 187 SkFixed dx4 = dx * 4; 188 SkFixed dy4 = dy * 4; 189 SkFixed dx8 = dx * 8; 190 SkFixed dy8 = dy * 8; 191 192 int32x4_t xbase, ybase; 193 int32x4_t x2base, y2base; 194 int16_t *dst16 = (int16_t *) xy; 195 196 /* my sets of maxx/maxy for clamping */ 197 int32_t maxpair = (maxX&0xffff) | ((maxY&0xffff)<<16); 198 int16x8_t maxXY = vreinterpretq_s16_s32(vdupq_n_s32(maxpair)); 199 200 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 201 /* avoid the 'xbase unitialized' warning...*/ 202 xbase = vdupq_n_s32(fx); 203 xbase = vsetq_lane_s32(fx+dx, xbase, 1); 204 xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); 205 xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); 206 207 /* same for fy */ 208 /* avoid the 'ybase unitialized' warning...*/ 209 ybase = vdupq_n_s32(fy); 210 ybase = vsetq_lane_s32(fy+dy, ybase, 1); 211 ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); 212 ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); 213 214 x2base = vaddq_s32(xbase, vdupq_n_s32(dx4)); 215 y2base = vaddq_s32(ybase, vdupq_n_s32(dy4)); 216 217 /* store & bump */ 218 do { 219 int32x4_t xout, yout; 220 int32x4_t x2out, y2out; 221 int16x8_t hi16, hi16_2; 222 223 xout = xbase; 224 yout = ybase; 225 226 /* overlay y's low16 with hi16 from x */ 227 /* so we properly shifted xyxyxyxy */ 228 yout = vsriq_n_s32(yout, xout, 16); 229 hi16 = vreinterpretq_s16_s32 (yout); 230 231 /* do the clamping; both guys get 0's */ 232 hi16 = vmaxq_s16 (hi16, vdupq_n_s16(0)); 233 hi16 = vminq_s16 (hi16, maxXY); 234 235 vst1q_s16 (dst16, hi16); 236 237 /* and for the other 4 pieces of this iteration */ 238 x2out = x2base; 239 y2out = y2base; 240 241 /* overlay y's low16 with hi16 from x */ 242 /* so we properly shifted xyxyxyxy */ 243 y2out = vsriq_n_s32(y2out, x2out, 16); 244 hi16_2 = vreinterpretq_s16_s32 (y2out); 245 246 /* do the clamping; both guys get 0's */ 247 hi16_2 = vmaxq_s16 (hi16_2, vdupq_n_s16(0)); 248 hi16_2 = vminq_s16 (hi16_2, maxXY); 249 250 /* RBE: gcc regenerates dst16+8 all the time instead 251 * of folding it into an addressing mode. *sigh* */ 252 vst1q_s16 (dst16+8, hi16_2); 253 254 /* moving base and on to the next */ 255 xbase = vaddq_s32 (xbase, vdupq_n_s32 (dx8)); 256 ybase = vaddq_s32 (ybase, vdupq_n_s32 (dy8)); 257 x2base = vaddq_s32 (x2base, vdupq_n_s32 (dx8)); 258 y2base = vaddq_s32 (y2base, vdupq_n_s32 (dy8)); 259 260 dst16 += 16; /* 8x32 aka 16x16 */ 261 count -= 8; 262 fx += dx8; 263 fy += dy8; 264 } while (count >= 8); 265 xy = (uint32_t *) dst16; 266 } 267 268 for (int i = count; i > 0; --i) { 269 *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); 270 fx += dx; fy += dy; 271 } 272 } 273 274 #undef DEBUG_PERSP_NOFILTER 275 276 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, 277 uint32_t* SK_RESTRICT xy, 278 int count, int x, int y) { 279 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 280 281 PREAMBLE(s); 282 /* max{X,Y} are int here, but later shown/assumed to fit in 16 bits */ 283 int maxX = s.fBitmap->width() - 1; 284 int maxY = s.fBitmap->height() - 1; 285 286 SkPerspIter iter(*s.fInvMatrix, 287 SkIntToScalar(x) + SK_ScalarHalf, 288 SkIntToScalar(y) + SK_ScalarHalf, count); 289 290 while ((count = iter.next()) != 0) { 291 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 292 293 #if defined(DEBUG_PERSP_NOFILTER) 294 /* debugging stuff */ 295 const SkFixed *end_srcXY = srcXY + (count*2); 296 uint32_t *end_xy = xy + (count); 297 const SkFixed *base_srcXY = srcXY; 298 uint32_t *base_xy = xy; 299 int base_count = count; 300 #endif 301 302 #if 1 303 // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition 304 // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn 305 306 /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... 307 * but we immediately discard the low 16 bits... 308 * so what we're going to do is vld4, which will give us 309 * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo' 310 * parts.... 311 */ 312 if (count >= 8) { 313 int16_t *mysrc = (int16_t *) srcXY; 314 int16_t *mydst = (int16_t *) xy; 315 int16x4_t maxX4 = vdup_n_s16((int16_t)maxX); 316 int16x4_t maxY4 = vdup_n_s16((int16_t)maxY); 317 int16x4_t zero4 = vdup_n_s16(0); 318 319 /* The constructs with local blocks for register assignments 320 * and asm() instructions is to make keep any hard register 321 * assignments to as small a scope as possible. and to avoid 322 * burning call-preserved hard registers on the vld/vst 323 * instructions. 324 */ 325 326 do { 327 int16x4_t xlo, xhi, ylo, yhi; 328 int16x4_t x2lo, x2hi, y2lo, y2hi; 329 330 /* vld4 does the de-interleaving for us */ 331 { 332 register int16x4_t t_xlo asm("d0"); 333 register int16x4_t t_xhi asm("d1"); 334 register int16x4_t t_ylo asm("d2"); 335 register int16x4_t t_yhi asm("d3"); 336 337 asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" 338 : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) 339 : "r" (mysrc) 340 ); 341 xlo = t_xlo; 342 xhi = t_xhi; 343 ylo = t_ylo; 344 yhi = t_yhi; 345 } 346 347 /* clamp X>>16 (aka xhi) to 0..maxX */ 348 xhi = vmax_s16(xhi, zero4); /* now 0.. */ 349 xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */ 350 351 /* clamp Y>>16 (aka yhi) to 0..maxY */ 352 yhi = vmax_s16(yhi, zero4); /* now 0.. */ 353 yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */ 354 355 /* deal with the second set of numbers */ 356 { 357 register int16x4_t t_xlo asm("d4"); 358 register int16x4_t t_xhi asm("d5"); 359 register int16x4_t t_ylo asm("d6"); 360 register int16x4_t t_yhi asm("d7"); 361 362 /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */ 363 asm ("vld4.16 {d4-d7},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" 364 : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) 365 : "r" (mysrc+16) 366 ); 367 x2lo = t_xlo; 368 x2hi = t_xhi; 369 y2lo = t_ylo; 370 y2hi = t_yhi; 371 } 372 373 /* clamp the second 4 here */ 374 375 if (0) { extern void rbe(void); rbe(); } 376 377 /* clamp X>>16 (aka xhi) to 0..maxX */ 378 x2hi = vmax_s16(x2hi, zero4); /* now 0.. */ 379 x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */ 380 381 /* clamp Y>>16 (aka yhi) to 0..maxY */ 382 y2hi = vmax_s16(y2hi, zero4); /* now 0.. */ 383 y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */ 384 385 /* we're storing as {x,y}s: x is [0], y is [1] */ 386 /* we'll use vst2 to make this happen */ 387 388 { 389 register int16x4_t out_x asm("d16") = xhi; 390 register int16x4_t out_y asm("d17") = yhi; 391 392 asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */" 393 : 394 : "w" (out_x), "w" (out_y), "r" (mydst) 395 ); 396 } 397 { 398 register int16x4_t out_x asm("d18") = x2hi; 399 register int16x4_t out_y asm("d19") = y2hi; 400 401 asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */" 402 : 403 : "w" (out_x), "w" (out_y), "r" (mydst+8) 404 ); 405 } 406 407 /* XXX: gcc isn't interleaving these with the NEON ops 408 * but i think that all the scoreboarding works out */ 409 count -= 8; /* 8 iterations */ 410 mysrc += 32; /* 16 longs, aka 32 shorts */ 411 mydst += 16; /* 16 shorts, aka 8 longs */ 412 } while (count >= 8); 413 /* get xy and srcXY fixed up */ 414 srcXY = (const SkFixed *) mysrc; 415 xy = (uint32_t *) mydst; 416 } 417 #endif 418 419 while (--count >= 0) { 420 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | 421 TILEX_PROCF(srcXY[0], maxX); 422 srcXY += 2; 423 } 424 425 #if defined(DEBUG_PERSP_NOFILTER) 426 /* for checking our NEON-produced results against vanilla code */ 427 { 428 int bad = (-1); 429 for (int i = 0; i < base_count; i++) { 430 uint32_t val; 431 val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | 432 TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); 433 434 if (val != base_xy[i]) { 435 bad = i; 436 break; 437 } 438 } 439 if (bad >= 0) { 440 SkDebugf("clamp-nofilter-persp failed piece %d\n", bad); 441 SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY); 442 bad -= (bad & 0x7); /* align */ 443 for (int i = bad; i < bad + 8; i++) { 444 uint32_t val; 445 val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | 446 TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); 447 448 SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", 449 i, base_xy[i], val, base_srcXY[i * 2 + 0], 450 base_srcXY[i * 2 + 1]); 451 } 452 SkDebugf ("---\n"); 453 } 454 455 if (end_xy != xy) { 456 SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy); 457 } 458 if (end_srcXY != srcXY) { 459 SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY, 460 end_srcXY); 461 } 462 } 463 #endif 464 } 465 } 466 467 #undef DEBUG_PERSP_NOFILTER 468 469 ////////////////////////////////////////////////////////////////////////////// 470 471 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, 472 SkFixed one PREAMBLE_PARAM_Y) { 473 unsigned i = TILEY_PROCF(f, max); 474 i = (i << 4) | TILEY_LOW_BITS(f, max); 475 return (i << 14) | (TILEY_PROCF((f + one), max)); 476 } 477 478 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, 479 SkFixed one PREAMBLE_PARAM_X) { 480 unsigned i = TILEX_PROCF(f, max); 481 i = (i << 4) | TILEX_LOW_BITS(f, max); 482 return (i << 14) | (TILEX_PROCF((f + one), max)); 483 } 484 485 static void SCALE_FILTER_NAME(const SkBitmapProcState& s, 486 uint32_t xy[], int count, int x, int y) { 487 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 488 SkMatrix::kScale_Mask)) == 0); 489 SkASSERT(s.fInvKy == 0); 490 491 PREAMBLE(s); 492 493 const unsigned maxX = s.fBitmap->width() - 1; 494 const SkFixed one = s.fFilterOneX; 495 const SkFixed dx = s.fInvSx; 496 SkFixed fx; 497 498 { 499 SkPoint pt; 500 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 501 SkIntToScalar(y) + SK_ScalarHalf, &pt); 502 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 503 const unsigned maxY = s.fBitmap->height() - 1; 504 // compute our two Y values up front 505 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); 506 // now initialize fx 507 fx = SkScalarToFixed(pt.fX) - (one >> 1); 508 } 509 510 #ifdef CHECK_FOR_DECAL 511 // test if we don't need to apply the tile proc 512 if (dx > 0 && 513 (unsigned)(fx >> 16) <= maxX && 514 (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 515 decal_filter_scale_neon(xy, fx, dx, count); 516 } else 517 #endif 518 519 if (count >= 4) { 520 int32x4_t wide_dx, wide_one; 521 int32x4_t wide_fx, wide_fx1, wide_i, wide_lo; 522 #if 0 523 /* verification hooks -- see below */ 524 SkFixed debug_fx = fx; 525 int count_done = 0; 526 #endif 527 528 wide_fx = vdupq_n_s32(fx); 529 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 530 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 531 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 532 533 wide_dx = vdupq_n_s32(dx); 534 wide_one = vdupq_n_s32(one); 535 536 while (count >= 4) { 537 /* original expands to: 538 * unsigned i = SkClampMax((f) >> 16, max); 539 * i = (i << 4) | (((f) >> 12) & 0xF); 540 * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); 541 */ 542 543 /* i = SkClampMax(f>>16, maxX) */ 544 wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); 545 wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); 546 547 /* i<<4 | TILEX_LOW_BITS(fx) */ 548 wide_lo = vshrq_n_s32(wide_fx, 12); 549 wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 550 551 /* i<<14 */ 552 wide_i = vshlq_n_s32(wide_i, 14); 553 554 /* SkClampMax(((f + one)) >> 16, max) */ 555 wide_fx1 = vaddq_s32(wide_fx, wide_one); 556 wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); 557 wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); 558 559 /* final combination */ 560 wide_i = vorrq_s32(wide_i, wide_fx1); 561 562 vst1q_u32(xy, vreinterpretq_u32_s32(wide_i)); 563 564 #if 0 565 /* having a verification hook is a good idea */ 566 /* use debug_fx, debug_fx+dx, etc. */ 567 568 for (int i=0;i<4;i++) { 569 uint32_t want = PACK_FILTER_X_NAME(debug_fx, maxX, one PREAMBLE_ARG_X); 570 if (xy[i] != want) 571 { 572 /* print a nastygram */ 573 SkDebugf("clamp-filter-scale fails\n"); 574 SkDebugf("got %08x want %08x\n", xy[i], want); 575 SkDebugf("fx %08x debug_fx %08x dx %08x done %d\n", 576 fx, debug_fx, dx, count_done); 577 SkDebugf(" maxX %08x one %08x\n", maxX, one); 578 579 } 580 debug_fx += dx; 581 count_done++; 582 } 583 #endif 584 wide_fx += vdupq_n_s32(dx+dx+dx+dx); 585 fx += dx+dx+dx+dx; 586 xy += 4; 587 count -= 4; 588 } 589 } 590 591 while (--count >= 0) { 592 *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X); 593 fx += dx; 594 } 595 } 596 597 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, 598 uint32_t xy[], int count, int x, int y) { 599 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 600 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 601 SkMatrix::kScale_Mask | 602 SkMatrix::kAffine_Mask)) == 0); 603 604 PREAMBLE(s); 605 SkPoint srcPt; 606 s.fInvProc(*s.fInvMatrix, 607 SkIntToScalar(x) + SK_ScalarHalf, 608 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 609 610 SkFixed oneX = s.fFilterOneX; 611 SkFixed oneY = s.fFilterOneY; 612 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 613 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 614 SkFixed dx = s.fInvSx; 615 SkFixed dy = s.fInvKy; 616 unsigned maxX = s.fBitmap->width() - 1; 617 unsigned maxY = s.fBitmap->height() - 1; 618 619 if (count >= 4) { 620 int32x4_t wide_one, wide_i, wide_lo; 621 int32x4_t wide_dx, wide_fx, wide_onex, wide_fx1; 622 int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1; 623 624 #undef AFFINE_DEBUG 625 #if defined(AFFINE_DEBUG) 626 SkFixed fyp = fy; 627 SkFixed fxp = fx; 628 uint32_t *xyp = xy; 629 int count_done = 0; 630 #endif 631 632 wide_fx = vdupq_n_s32(fx); 633 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 634 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 635 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 636 wide_dx = vdupq_n_s32(dx); 637 638 wide_fy = vdupq_n_s32(fy); 639 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); 640 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); 641 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); 642 wide_dy = vdupq_n_s32(dy); 643 644 wide_onex = vdupq_n_s32(oneX); 645 wide_oney = vdupq_n_s32(oneY); 646 647 while (count >= 4) { 648 int32x4_t wide_x; 649 int32x4_t wide_y; 650 651 /* do the X side, then the Y side, then interleave them */ 652 653 /* original expands to: 654 * unsigned i = SkClampMax((f) >> 16, max); 655 * i = (i << 4) | (((f) >> 12) & 0xF); 656 * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); 657 */ 658 659 /* i = SkClampMax(f>>16, maxX) */ 660 wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); 661 wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); 662 663 /* i<<4 | TILEX_LOW_BITS(fx) */ 664 wide_lo = vshrq_n_s32(wide_fx, 12); 665 wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 666 667 /* i<<14 */ 668 wide_i = vshlq_n_s32(wide_i, 14); 669 670 /* SkClampMax(((f + one)) >> 16, max) */ 671 wide_fx1 = vaddq_s32(wide_fx, wide_onex); 672 wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); 673 wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); 674 675 /* final combination */ 676 wide_x = vorrq_s32(wide_i, wide_fx1); 677 678 /* And now the Y side */ 679 680 /* i = SkClampMax(f>>16, maxX) */ 681 wide_i = vmaxq_s32(vshrq_n_s32(wide_fy,16), vdupq_n_s32(0)); 682 wide_i = vminq_s32(wide_i, vdupq_n_s32(maxY)); 683 684 /* i<<4 | TILEX_LOW_BITS(fx) */ 685 wide_lo = vshrq_n_s32(wide_fy, 12); 686 wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 687 688 /* i<<14 */ 689 wide_i = vshlq_n_s32(wide_i, 14); 690 691 /* SkClampMax(((f + one)) >> 16, max) */ 692 wide_fy1 = vaddq_s32(wide_fy, wide_oney); 693 wide_fy1 = vmaxq_s32(vshrq_n_s32(wide_fy1,16), vdupq_n_s32(0)); 694 wide_fy1 = vminq_s32(wide_fy1, vdupq_n_s32(maxY)); 695 696 /* final combination */ 697 wide_y = vorrq_s32(wide_i, wide_fy1); 698 699 /* interleave as YXYXYXYX as part of the storing */ 700 { 701 /* vst2.32 needs side-by-side registers */ 702 register int32x4_t t_x asm("q1"); 703 register int32x4_t t_y asm("q0"); 704 705 t_x = wide_x; t_y = wide_y; 706 asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" 707 : 708 : "w" (t_y), "w" (t_x), "r" (xy) 709 ); 710 } 711 712 #if defined(AFFINE_DEBUG) 713 /* make sure we're good here -- check the 4 we just output */ 714 for (int i = 0; i<4;i++) { 715 uint32_t val; 716 val = PACK_FILTER_Y_NAME(fyp, maxY, oneY PREAMBLE_ARG_Y); 717 if (val != xy[i*2+0]) { 718 /* print a nastygram */ 719 SkDebugf("clamp-filter-affine fails\n"); 720 SkDebugf("[bad-y] got %08x want %08x\n", xy[i*2+0], val); 721 SkDebugf("fy %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", 722 fy, fxp, fyp, dx, dy, count_done); 723 SkDebugf(" maxY %08x oneY %08x\n", maxY, oneY); 724 } 725 val = PACK_FILTER_X_NAME(fxp, maxX, oneX PREAMBLE_ARG_X); 726 if (val != xy[i*2+1]) { 727 /* print a nastygram */ 728 SkDebugf("clamp-filter-affine fails\n"); 729 SkDebugf("[bad-x] got %08x want %08x\n", xy[i*2+1], val); 730 SkDebugf("fx %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", 731 fx, fxp, fyp, dx, dy, count_done); 732 SkDebugf(" maxX %08x one %08x\n", maxX, oneX); 733 } 734 fyp += dy; 735 fxp += dx; 736 count_done++; 737 } 738 #endif 739 740 wide_fx += vdupq_n_s32(dx+dx+dx+dx); 741 fx += dx+dx+dx+dx; 742 wide_fy += vdupq_n_s32(dy+dy+dy+dy); 743 fy += dy+dy+dy+dy; 744 xy += 8; /* 4 x's, 4 y's */ 745 count -= 4; 746 } 747 } 748 749 while (--count >= 0) { 750 /* NB: writing Y/X */ 751 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); 752 fy += dy; 753 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); 754 fx += dx; 755 } 756 } 757 758 static void PERSP_FILTER_NAME(const SkBitmapProcState& s, 759 uint32_t* SK_RESTRICT xy, int count, 760 int x, int y) { 761 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 762 763 PREAMBLE(s); 764 unsigned maxX = s.fBitmap->width() - 1; 765 unsigned maxY = s.fBitmap->height() - 1; 766 SkFixed oneX = s.fFilterOneX; 767 SkFixed oneY = s.fFilterOneY; 768 769 SkPerspIter iter(*s.fInvMatrix, 770 SkIntToScalar(x) + SK_ScalarHalf, 771 SkIntToScalar(y) + SK_ScalarHalf, count); 772 773 while ((count = iter.next()) != 0) { 774 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 775 776 if (count >= 4) { 777 int32x4_t wide_one, wide_i, wide_lo; 778 int32x4_t wide_fx1; 779 int32x4_t wide_fy1; 780 int32x4_t wide_x, wide_y; 781 782 while (count >= 4) { 783 /* RBE: it's good, but: 784 * -- we spill a constant that could be easily regnerated 785 * [perhaps tweak gcc's NEON constant costs?] 786 */ 787 788 /* load src: x-y-x-y-x-y-x-y */ 789 { 790 register int32x4_t q0 asm ("q0"); 791 register int32x4_t q1 asm ("q1"); 792 asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" 793 : "=w" (q0), "=w" (q1) 794 : "r" (srcXY)); 795 wide_x = q0; wide_y = q1; 796 } 797 798 /* do the X side, then the Y side, then interleave them */ 799 800 wide_x = vsubq_s32(wide_x, vdupq_n_s32 (oneX>>1)); 801 802 /* original expands to: 803 * unsigned i = SkClampMax((f) >> 16, max); 804 * i = (i << 4) | (((f) >> 12) & 0xF); 805 * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); 806 */ 807 808 /* i = SkClampMax(f>>16, maxX) */ 809 wide_i = vmaxq_s32 (vshrq_n_s32 (wide_x, 16), vdupq_n_s32 (0)); 810 wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxX)); 811 812 /* i<<4 | TILEX_LOW_BITS(fx) */ 813 wide_lo = vshrq_n_s32 (wide_x, 12); 814 wide_i = vsliq_n_s32 (wide_lo, wide_i, 4); 815 816 /* i<<14 */ 817 wide_i = vshlq_n_s32 (wide_i, 14); 818 819 /* SkClampMax(((f + one)) >> 16, max) */ 820 wide_fx1 = vaddq_s32 (wide_x, vdupq_n_s32(oneX)); 821 wide_fx1 = vmaxq_s32 (vshrq_n_s32 (wide_fx1, 16), vdupq_n_s32 (0)); 822 wide_fx1 = vminq_s32 (wide_fx1, vdupq_n_s32 (maxX)); 823 824 /* final combination */ 825 wide_x = vorrq_s32 (wide_i, wide_fx1); 826 827 828 /* And now the Y side */ 829 830 wide_y = vsubq_s32(wide_y, vdupq_n_s32 (oneY>>1)); 831 832 /* i = SkClampMax(f>>16, maxX) */ 833 wide_i = vmaxq_s32 (vshrq_n_s32 (wide_y, 16), vdupq_n_s32 (0)); 834 wide_i = vminq_s32 (wide_i, vdupq_n_s32 (maxY)); 835 836 /* i<<4 | TILEX_LOW_BITS(fx) */ 837 wide_lo = vshrq_n_s32 (wide_y, 12); 838 wide_i = vsliq_n_s32 (wide_lo, wide_i, 4); 839 840 /* i<<14 */ 841 wide_i = vshlq_n_s32 (wide_i, 14); 842 843 /* SkClampMax(((f + one)) >> 16, max) */ 844 845 /* wide_fy1_1 and wide_fy1_2 are just temporary variables to 846 * work-around an ICE in debug */ 847 int32x4_t wide_fy1_1 = vaddq_s32 (wide_y, vdupq_n_s32(oneY)); 848 int32x4_t wide_fy1_2 = vmaxq_s32 (vshrq_n_s32 (wide_fy1_1, 16), 849 vdupq_n_s32 (0)); 850 wide_fy1 = vminq_s32 (wide_fy1_2, vdupq_n_s32 (maxY)); 851 852 /* final combination */ 853 wide_y = vorrq_s32 (wide_i, wide_fy1); 854 855 /* switch them around; have to do it this way to get them 856 * in the proper registers to match our instruction */ 857 858 /* iteration bookkeeping, ahead of the asm() for scheduling */ 859 srcXY += 2*4; 860 count -= 4; 861 862 /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */ 863 { 864 register int32x4_t q0 asm ("q0") = wide_y; 865 register int32x4_t q1 asm ("q1") = wide_x; 866 867 asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" 868 : 869 : "w" (q0), "w" (q1), "r" (xy)); 870 } 871 872 /* on to the next iteration */ 873 /* count, srcXY are handled above */ 874 xy += 2*4; 875 } 876 } 877 878 /* was do-while; NEON code invalidates original count>0 assumption */ 879 while (--count >= 0) { 880 /* NB: we read x/y, we write y/x */ 881 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, 882 oneY PREAMBLE_ARG_Y); 883 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, 884 oneX PREAMBLE_ARG_X); 885 srcXY += 2; 886 } 887 } 888 } 889 890 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { 891 SCALE_NOFILTER_NAME, 892 SCALE_FILTER_NAME, 893 AFFINE_NOFILTER_NAME, 894 AFFINE_FILTER_NAME, 895 PERSP_NOFILTER_NAME, 896 PERSP_FILTER_NAME 897 }; 898 899 #undef MAKENAME 900 #undef TILEX_PROCF 901 #undef TILEY_PROCF 902 #ifdef CHECK_FOR_DECAL 903 #undef CHECK_FOR_DECAL 904 #endif 905 906 #undef SCALE_NOFILTER_NAME 907 #undef SCALE_FILTER_NAME 908 #undef AFFINE_NOFILTER_NAME 909 #undef AFFINE_FILTER_NAME 910 #undef PERSP_NOFILTER_NAME 911 #undef PERSP_FILTER_NAME 912 913 #undef PREAMBLE 914 #undef PREAMBLE_PARAM_X 915 #undef PREAMBLE_PARAM_Y 916 #undef PREAMBLE_ARG_X 917 #undef PREAMBLE_ARG_Y 918 919 #undef TILEX_LOW_BITS 920 #undef TILEY_LOW_BITS 921