1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7 /* 8 * Modifications done in-house at Motorola 9 * 10 * this is a clone of SkBitmapProcState_matrix.h 11 * and has been tuned to work with the NEON unit. 12 * 13 * Still going back and forth between whether this approach 14 * (clone the entire SkBitmapProcState_matrix.h file or 15 * if I should put just the modified routines in here and 16 * then use a construct like #define DONT_DO_THIS_FUNCTION or 17 * something like that... 18 * 19 * This is for the RepeatX_RepeatY part of the world 20 */ 21 22 23 #include <arm_neon.h> 24 25 /* 26 * This has been modified on the knowledge that (at the time) 27 * we had the following macro definitions in the parent file 28 * 29 * #define MAKENAME(suffix) RepeatX_RepeatY ## suffix 30 * #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) 31 * #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16) 32 * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 33 * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 34 */ 35 36 /* SkClampMax(val,max) -- bound to 0..max */ 37 38 #define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) 39 #define SCALE_FILTER_NAME MAKENAME(_filter_scale) 40 #define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) 41 #define AFFINE_FILTER_NAME MAKENAME(_filter_affine) 42 #define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) 43 #define PERSP_FILTER_NAME MAKENAME(_filter_persp) 44 45 #define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) 46 #define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) 47 48 #ifndef PREAMBLE 49 #define PREAMBLE(state) 50 #define PREAMBLE_PARAM_X 51 #define PREAMBLE_PARAM_Y 52 #define PREAMBLE_ARG_X 53 #define PREAMBLE_ARG_Y 54 #endif 55 56 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, 57 uint32_t xy[], int count, int x, int y) { 58 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 59 SkMatrix::kScale_Mask)) == 0); 60 61 PREAMBLE(s); 62 // we store y, x, x, x, x, x 63 64 const unsigned maxX = s.fBitmap->width() - 1; 65 SkFixed fx; 66 { 67 SkPoint pt; 68 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 69 SkIntToScalar(y) + SK_ScalarHalf, &pt); 70 fx = SkScalarToFixed(pt.fY); 71 const unsigned maxY = s.fBitmap->height() - 1; 72 *xy++ = TILEY_PROCF(fx, maxY); 73 fx = SkScalarToFixed(pt.fX); 74 } 75 76 if (0 == maxX) { 77 // all of the following X values must be 0 78 memset(xy, 0, count * sizeof(uint16_t)); 79 return; 80 } 81 82 const SkFixed dx = s.fInvSx; 83 84 #ifdef CHECK_FOR_DECAL 85 // test if we don't need to apply the tile proc 86 if ((unsigned)(fx >> 16) <= maxX && 87 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 88 decal_nofilter_scale_neon(xy, fx, dx, count); 89 } else 90 #endif 91 { 92 int i; 93 94 /* RBE: very much like done in decal_nofilter , 95 * but some processing of the 'fx' information 96 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) 97 */ 98 if (count >= 8) { 99 /* SkFixed is 16.16 fixed point */ 100 SkFixed dx2 = dx+dx; 101 SkFixed dx4 = dx2+dx2; 102 SkFixed dx8 = dx4+dx4; 103 104 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 105 SkFixed fx1, fx2, fx3; 106 int32x2_t lower, upper; 107 int32x4_t lbase, hbase; 108 int16_t *dst16 = (int16_t *)xy; 109 110 fx1 = fx+dx; 111 fx2 = fx1+dx; 112 fx3 = fx2+dx; 113 114 lbase = vdupq_n_s32(fx); 115 lbase = vsetq_lane_s32(fx1, lbase, 1); 116 lbase = vsetq_lane_s32(fx2, lbase, 2); 117 lbase = vsetq_lane_s32(fx3, lbase, 3); 118 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 119 120 /* store & bump */ 121 do 122 { 123 int32x4_t lout; 124 int32x4_t hout; 125 int16x8_t hi16; 126 127 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ 128 /* mask to low 16 [would like to use uzp tricks) */ 129 lout = vandq_s32(lbase, vdupq_n_s32(0xffff)); 130 hout = vandq_s32(hbase, vdupq_n_s32(0xffff)); 131 /* bare multiplication, not SkFixedMul */ 132 lout = vmulq_s32(lout, vdupq_n_s32(maxX+1)); 133 hout = vmulq_s32(hout, vdupq_n_s32(maxX+1)); 134 135 /* extraction, using uzp */ 136 /* this is ok -- we want all hi(lout)s then all hi(hout)s */ 137 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 138 hi16 = vreinterpretq_s16_s32(hout); 139 vst1q_s16(dst16, hi16); 140 141 /* bump our base on to the next */ 142 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 143 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 144 dst16 += 8; 145 count -= 8; 146 fx += dx8; 147 } while (count >= 8); 148 xy = (uint32_t *) dst16; 149 } 150 uint16_t* xx = (uint16_t*)xy; 151 for (i = count; i > 0; --i) { 152 *xx++ = TILEX_PROCF(fx, maxX); fx += dx; 153 } 154 } 155 } 156 157 // note: we could special-case on a matrix which is skewed in X but not Y. 158 // this would require a more general setup thatn SCALE does, but could use 159 // SCALE's inner loop that only looks at dx 160 161 162 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, 163 uint32_t xy[], int count, int x, int y) { 164 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 165 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 166 SkMatrix::kScale_Mask | 167 SkMatrix::kAffine_Mask)) == 0); 168 169 PREAMBLE(s); 170 SkPoint srcPt; 171 s.fInvProc(*s.fInvMatrix, 172 SkIntToScalar(x) + SK_ScalarHalf, 173 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 174 175 SkFixed fx = SkScalarToFixed(srcPt.fX); 176 SkFixed fy = SkScalarToFixed(srcPt.fY); 177 SkFixed dx = s.fInvSx; 178 SkFixed dy = s.fInvKy; 179 int maxX = s.fBitmap->width() - 1; 180 int maxY = s.fBitmap->height() - 1; 181 182 #if 1 183 int ocount = count; 184 uint32_t *oxy = xy; 185 SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy; 186 #endif 187 188 189 if (0) { extern void rbe(void); rbe(); } 190 191 /* RBE: benchmarks show this eats up time; can we neonize it? */ 192 /* RBE: very much like done in decal_nofilter , 193 * but some processing of the 'fx' information 194 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) 195 */ 196 if (count >= 4) { 197 /* SkFixed is 16.16 fixed point */ 198 SkFixed dx4 = dx*4; 199 SkFixed dy4 = dy*4; 200 201 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 202 int32x2_t lower, upper; 203 int32x4_t xbase, ybase; 204 int16_t *dst16 = (int16_t *)xy; 205 206 /* synthesize 4x for both X and Y */ 207 xbase = vdupq_n_s32(fx); 208 xbase = vsetq_lane_s32(fx+dx, xbase, 1); 209 xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); 210 xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); 211 212 ybase = vdupq_n_s32(fy); 213 ybase = vsetq_lane_s32(fy+dy, ybase, 1); 214 ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); 215 ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); 216 217 /* store & bump */ 218 do { 219 int32x4_t xout; 220 int32x4_t yout; 221 int16x8_t hi16; 222 223 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ 224 /* mask to low 16 [would like to use uzp tricks) */ 225 xout = vandq_s32(xbase, vdupq_n_s32(0xffff)); 226 yout = vandq_s32(ybase, vdupq_n_s32(0xffff)); 227 /* bare multiplication, not SkFixedMul */ 228 xout = vmulq_s32(xout, vdupq_n_s32(maxX+1)); 229 yout = vmulq_s32(yout, vdupq_n_s32(maxY+1)); 230 231 /* put hi16 from xout over low16 from yout */ 232 yout = vsriq_n_s32(yout, xout, 16); 233 234 /* and then yout has the interleaved upper 16's */ 235 hi16 = vreinterpretq_s16_s32(yout); 236 vst1q_s16(dst16, hi16); 237 238 /* bump preserved base & on to the next */ 239 xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4)); 240 ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4)); 241 dst16 += 8; /* 8 x16 aka 4x32 */ 242 count -= 4; 243 fx += dx4; 244 fy += dy4; 245 } while (count >= 4); 246 xy = (uint32_t *) dst16; 247 } 248 249 #if 0 250 /* diagnostics... see whether we agree with the NEON code */ 251 int bad = 0; 252 uint32_t *myxy = oxy; 253 int myi = (-1); 254 SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy; 255 for (myi = ocount; myi > 0; --myi) { 256 uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) | TILEX_PROCF(ofx, maxX); 257 if (val != *myxy++) { 258 bad++; 259 break; 260 } 261 ofx += odx; ofy += ody; 262 } 263 if (bad) { 264 SkDebugf("repeat-nofilter-affine fails\n"); 265 SkDebugf("count %d myi %d\n", ocount, myi); 266 SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", 267 bfx, bdx, bfy, bdy); 268 SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); 269 } 270 #endif 271 272 for (int i = count; i > 0; --i) { 273 /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */ 274 /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ 275 *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); 276 fx += dx; fy += dy; 277 } 278 } 279 280 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, 281 uint32_t* SK_RESTRICT xy, 282 int count, int x, int y) { 283 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 284 285 PREAMBLE(s); 286 int maxX = s.fBitmap->width() - 1; 287 int maxY = s.fBitmap->height() - 1; 288 289 SkPerspIter iter(*s.fInvMatrix, 290 SkIntToScalar(x) + SK_ScalarHalf, 291 SkIntToScalar(y) + SK_ScalarHalf, count); 292 293 while ((count = iter.next()) != 0) { 294 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 295 296 /* RBE: */ 297 /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ 298 /* it's a little more complicated than what I did for the 299 * clamp case -- where I could immediately snip to the top 300 * 16 bits and do my min/max games there. 301 * ... might only be able to get 4x unrolling here 302 */ 303 304 /* vld2 to get a set of 32x4's ... */ 305 /* do the tile[xy]_procf operations */ 306 /* which includes doing vuzp to get hi16's */ 307 /* store it */ 308 /* -- inner loop (other than vld2) can be had from above */ 309 310 /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... 311 * but we immediately discard the low 16 bits... 312 * so what we're going to do is vld4, which will give us 313 * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo' 314 * parts.... 315 */ 316 if (0) { extern void rbe(void); rbe(); } 317 if (count >= 8) { 318 int32_t *mysrc = (int32_t *) srcXY; 319 int16_t *mydst = (int16_t *) xy; 320 do { 321 int32x4_t x, y, x2, y2; 322 int16x8_t hi, hi2; 323 324 /* read array of x,y,x,y,x,y */ 325 /* vld2 does the de-interleaving for us */ 326 /* isolate reg-bound scopes; gcc will minimize register 327 * motion if possible; this ensures that we don't lose 328 * a register across a debugging call because it happens 329 * to be bound into a call-clobbered register 330 */ 331 { 332 register int32x4_t q0 asm("q0"); 333 register int32x4_t q1 asm("q1"); 334 asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" 335 : "=w" (q0), "=w" (q1) 336 : "r" (mysrc) 337 ); 338 x = q0; y = q1; 339 } 340 341 /* offset == 256 bits == 32 bytes == 8 longs */ 342 { 343 register int32x4_t q2 asm("q2"); 344 register int32x4_t q3 asm("q3"); 345 asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */" 346 : "=w" (q2), "=w" (q3) 347 : "r" (mysrc+8) 348 ); 349 x2 = q2; y2 = q3; 350 } 351 352 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ 353 /* mask to low 16 [would like to use uzp tricks) */ 354 /* bare multiplication, not SkFixedMul */ 355 x = vandq_s32(x, vdupq_n_s32(0xffff)); 356 x = vmulq_s32(x, vdupq_n_s32(maxX+1)); 357 y = vandq_s32(y, vdupq_n_s32(0xffff)); 358 y = vmulq_s32(y, vdupq_n_s32(maxY+1)); 359 360 x2 = vandq_s32(x2, vdupq_n_s32(0xffff)); 361 x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1)); 362 y2 = vandq_s32(y2, vdupq_n_s32(0xffff)); 363 y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1)); 364 365 /* now collect interleaved high 16's */ 366 /* (hi-x, hi-y)4 (hi-x2; hi-y2)4 */ 367 368 /* extraction, using uzp, leaves hi16's in y */ 369 y = vsriq_n_s32(y, x, 16); 370 hi = vreinterpretq_s16_s32(y); 371 vst1q_s16(mydst, hi); 372 373 /* and likewise for the second 8 entries */ 374 y2 = vsriq_n_s32(y2, x2, 16); 375 hi2 = vreinterpretq_s16_s32(y2); 376 vst1q_s16(mydst+8, hi2); 377 378 /* XXX: gcc isn't interleaving these with the NEON ops 379 * but i think that all the scoreboarding works out */ 380 count -= 8; /* 8 iterations */ 381 mysrc += 16; /* 16 longs */ 382 mydst += 16; /* 16 shorts, aka 8 longs */ 383 } while (count >= 8); 384 /* get xy and srcXY fixed up */ 385 srcXY = (const SkFixed *) mysrc; 386 xy = (uint32_t *) mydst; 387 } 388 while (--count >= 0) { 389 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | 390 TILEX_PROCF(srcXY[0], maxX); 391 srcXY += 2; 392 } 393 } 394 } 395 396 ////////////////////////////////////////////////////////////////////////////// 397 398 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, 399 SkFixed one PREAMBLE_PARAM_Y) { 400 unsigned i = TILEY_PROCF(f, max); 401 i = (i << 4) | TILEY_LOW_BITS(f, max); 402 return (i << 14) | (TILEY_PROCF((f + one), max)); 403 } 404 405 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, 406 SkFixed one PREAMBLE_PARAM_X) { 407 unsigned i = TILEX_PROCF(f, max); 408 i = (i << 4) | TILEX_LOW_BITS(f, max); 409 return (i << 14) | (TILEX_PROCF((f + one), max)); 410 } 411 412 static void SCALE_FILTER_NAME(const SkBitmapProcState& s, 413 uint32_t xy[], int count, int x, int y) { 414 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 415 SkMatrix::kScale_Mask)) == 0); 416 SkASSERT(s.fInvKy == 0); 417 418 PREAMBLE(s); 419 420 const unsigned maxX = s.fBitmap->width() - 1; 421 const SkFixed one = s.fFilterOneX; 422 const SkFixed dx = s.fInvSx; 423 SkFixed fx; 424 425 { 426 SkPoint pt; 427 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 428 SkIntToScalar(y) + SK_ScalarHalf, &pt); 429 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 430 const unsigned maxY = s.fBitmap->height() - 1; 431 // compute our two Y values up front 432 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); 433 // now initialize fx 434 fx = SkScalarToFixed(pt.fX) - (one >> 1); 435 } 436 437 #ifdef CHECK_FOR_DECAL 438 // test if we don't need to apply the tile proc 439 if (dx > 0 && 440 (unsigned)(fx >> 16) <= maxX && 441 (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 442 decal_filter_scale_neon(xy, fx, dx, count); 443 } else 444 #endif 445 { 446 do { 447 *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X); 448 fx += dx; 449 } while (--count != 0); 450 } 451 } 452 453 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, 454 uint32_t xy[], int count, int x, int y) { 455 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 456 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 457 SkMatrix::kScale_Mask | 458 SkMatrix::kAffine_Mask)) == 0); 459 460 PREAMBLE(s); 461 SkPoint srcPt; 462 s.fInvProc(*s.fInvMatrix, 463 SkIntToScalar(x) + SK_ScalarHalf, 464 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 465 466 SkFixed oneX = s.fFilterOneX; 467 SkFixed oneY = s.fFilterOneY; 468 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 469 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 470 SkFixed dx = s.fInvSx; 471 SkFixed dy = s.fInvKy; 472 unsigned maxX = s.fBitmap->width() - 1; 473 unsigned maxY = s.fBitmap->height() - 1; 474 475 do { 476 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); 477 fy += dy; 478 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); 479 fx += dx; 480 } while (--count != 0); 481 } 482 483 static void PERSP_FILTER_NAME(const SkBitmapProcState& s, 484 uint32_t* SK_RESTRICT xy, int count, 485 int x, int y) { 486 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 487 488 extern void rbe(void); 489 490 PREAMBLE(s); 491 unsigned maxX = s.fBitmap->width() - 1; 492 unsigned maxY = s.fBitmap->height() - 1; 493 SkFixed oneX = s.fFilterOneX; 494 SkFixed oneY = s.fFilterOneY; 495 496 497 498 SkPerspIter iter(*s.fInvMatrix, 499 SkIntToScalar(x) + SK_ScalarHalf, 500 SkIntToScalar(y) + SK_ScalarHalf, count); 501 502 while ((count = iter.next()) != 0) { 503 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 504 do { 505 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, 506 oneY PREAMBLE_ARG_Y); 507 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, 508 oneX PREAMBLE_ARG_X); 509 srcXY += 2; 510 } while (--count != 0); 511 } 512 } 513 514 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { 515 SCALE_NOFILTER_NAME, 516 SCALE_FILTER_NAME, 517 AFFINE_NOFILTER_NAME, 518 AFFINE_FILTER_NAME, 519 PERSP_NOFILTER_NAME, 520 PERSP_FILTER_NAME 521 }; 522 523 #undef MAKENAME 524 #undef TILEX_PROCF 525 #undef TILEY_PROCF 526 #ifdef CHECK_FOR_DECAL 527 #undef CHECK_FOR_DECAL 528 #endif 529 530 #undef SCALE_NOFILTER_NAME 531 #undef SCALE_FILTER_NAME 532 #undef AFFINE_NOFILTER_NAME 533 #undef AFFINE_FILTER_NAME 534 #undef PERSP_NOFILTER_NAME 535 #undef PERSP_FILTER_NAME 536 537 #undef PREAMBLE 538 #undef PREAMBLE_PARAM_X 539 #undef PREAMBLE_PARAM_Y 540 #undef PREAMBLE_ARG_X 541 #undef PREAMBLE_ARG_Y 542 543 #undef TILEX_LOW_BITS 544 #undef TILEY_LOW_BITS 545