/* NEON optimized code (C) COPYRIGHT 2009 Motorola */

#include "SkBitmapProcState.h"
#include "SkPerspIter.h"
#include "SkShader.h"
#include "SkUtils.h"

/*  returns 0...(n-1) given any x (positive or negative).

    As an example, if n (which is always positive) is 5...

          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
 */
static inline int sk_int_mod(int x, int n) {
    SkASSERT(n > 0);
    if ((unsigned)x >= (unsigned)n) {
        if (x < 0) {
            x = n + ~(~x % n);
        } else {
            x = x % n;
        }
    }
    return x;
}

void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);

#define MAKENAME(suffix)        ClampX_ClampY ## suffix
#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
#define CHECK_FOR_DECAL
#if defined(__ARM_HAVE_NEON)
    #include "SkBitmapProcState_matrix_clamp.h"
#else
    #include "SkBitmapProcState_matrix.h"
#endif

#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
#define TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
#define TILEY_PROCF(fy, max)    (((fy) & 0xFFFF) * ((max) + 1) >> 16)
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#if defined(__ARM_HAVE_NEON)
    #include "SkBitmapProcState_matrix_repeat.h"
#else
    #include "SkBitmapProcState_matrix.h"
#endif

#define MAKENAME(suffix)        GeneralXY ## suffix
#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY
#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX
#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY
#define PREAMBLE_ARG_X          , tileProcX
#define PREAMBLE_ARG_Y          , tileProcY
#define TILEX_PROCF(fx, max)    (tileProcX(fx) * ((max) + 1) >> 16)
#define TILEY_PROCF(fy, max)    (tileProcY(fy) * ((max) + 1) >> 16)
#define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF)
#include "SkBitmapProcState_matrix.h"

static inline U16CPU fixed_clamp(SkFixed x)
{
#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
    if (x >> 16)
        x = 0xFFFF;
    if (x < 0)
        x = 0;
#else
    if (x >> 16)
    {
        if (x < 0)
            x = 0;
        else
            x = 0xFFFF;
    }
#endif
    return x;
}

static inline U16CPU fixed_repeat(SkFixed x)
{
    return x & 0xFFFF;
}

static inline U16CPU fixed_mirror(SkFixed x)
{
    SkFixed s = x << 15 >> 31;
    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
    return (x ^ s) & 0xFFFF;
}
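/*  A note on the shift trick in fixed_mirror(): bit 16 of x says whether we
    are in an even or odd 0x10000-wide interval, and (x << 15 >> 31) smears
    that bit into a 0 or 0xFFFFFFFF mask, so the XOR reflects the fraction on
    odd intervals. The disabled helper below is a hypothetical sanity check
    (in the spirit of test_int_tileprocs() further down, not part of the
    build) that compares the trick against a straightforward reference.
*/
#if 0
static void test_fixed_mirror() {
    for (SkFixed x = -3 * SK_Fixed1; x <= 3 * SK_Fixed1; x += SK_Fixed1 >> 2) {
        unsigned frac = (unsigned)x & 0xFFFF;
        unsigned expected = (x & 0x10000) ? (0xFFFF - frac) : frac;
        SkASSERT(fixed_mirror(x) == expected);
    }
}
#endif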
static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
{
    if (SkShader::kClamp_TileMode == m)
        return fixed_clamp;
    if (SkShader::kRepeat_TileMode == m)
        return fixed_repeat;
    SkASSERT(SkShader::kMirror_TileMode == m);
    return fixed_mirror;
}

static inline U16CPU int_clamp(int x, int n) {
#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
    if (x >= n)
        x = n - 1;
    if (x < 0)
        x = 0;
#else
    if ((unsigned)x >= (unsigned)n) {
        if (x < 0) {
            x = 0;
        } else {
            x = n - 1;
        }
    }
#endif
    return x;
}

static inline U16CPU int_repeat(int x, int n) {
    return sk_int_mod(x, n);
}

static inline U16CPU int_mirror(int x, int n) {
    x = sk_int_mod(x, 2 * n);
    if (x >= n) {
        x = n + ~(x - n);
    }
    return x;
}

#if 0
static void test_int_tileprocs() {
    for (int i = -8; i <= 8; i++) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
    }
}
#endif

static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
    if (SkShader::kClamp_TileMode == tm)
        return int_clamp;
    if (SkShader::kRepeat_TileMode == tm)
        return int_repeat;
    SkASSERT(SkShader::kMirror_TileMode == tm);
    return int_mirror;
}

//////////////////////////////////////////////////////////////////////////////

void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
    int i;

#if defined(__ARM_HAVE_NEON)
    if (count >= 8) {
        /* SkFixed is 16.16 fixed point */
        SkFixed dx2 = dx+dx;
        SkFixed dx4 = dx2+dx2;
        SkFixed dx8 = dx4+dx4;

        /* now build fx/fx+dx/fx+2dx/fx+3dx */
        SkFixed fx1, fx2, fx3;
        int32x2_t lower, upper;
        int32x4_t lbase, hbase;
        uint16_t *dst16 = (uint16_t *)dst;

        fx1 = fx+dx;
        fx2 = fx1+dx;
        fx3 = fx2+dx;

        /* avoid an 'lbase uninitialized' warning */
        lbase = vdupq_n_s32(fx);
        lbase = vsetq_lane_s32(fx1, lbase, 1);
        lbase = vsetq_lane_s32(fx2, lbase, 2);
        lbase = vsetq_lane_s32(fx3, lbase, 3);
        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));

        /* take upper 16 of each, store, and bump everything */
        do {
            int32x4_t lout, hout;
            uint16x8_t hi16;

            lout = lbase;
            hout = hbase;
            /* gets hi's of all louts then hi's of all houts */
            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
            hi16 = vreinterpretq_u16_s32(hout);
            vst1q_u16(dst16, hi16);

            /* on to the next */
            lbase = vaddq_s32(lbase, vdupq_n_s32(dx8));
            hbase = vaddq_s32(hbase, vdupq_n_s32(dx8));
            dst16 += 8;
            count -= 8;
            fx += dx8;
        } while (count >= 8);
        dst = (uint32_t *) dst16;
    }
#else
    for (i = (count >> 2); i > 0; --i)
    {
        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
        fx += dx+dx;
        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
        fx += dx+dx;
    }
    count &= 3;
#endif

    uint16_t* xx = (uint16_t*)dst;
    for (i = count; i > 0; --i) {
        *xx++ = SkToU16(fx >> 16); fx += dx;
    }
}
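/*  decal_filter_scale() below packs one filter pair per 32-bit entry:
    bits 31:18 hold x0 (fx >> 16), bits 17:14 hold the 4-bit filter fraction
    ((fx >> 12) & 0xF), and bits 13:0 hold x1 = x0 + 1; the SkASSERTs keep x0
    within 14 bits. The disabled helper is a hypothetical decoder (not part of
    the build) that may be handy when inspecting the buffer in a debugger.
*/
#if 0
static void unpack_filter_x(uint32_t packed, int* x0, int* frac, int* x1) {
    *x0   = packed >> 18;           // integer coordinate of the first sample
    *frac = (packed >> 14) & 0xF;   // 4-bit blend weight between x0 and x1
    *x1   = packed & 0x3FFF;        // integer coordinate of the second sample
}
#endif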
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{

#if defined(__ARM_HAVE_NEON)
    if (count >= 8) {
        int32x4_t wide_fx;
        int32x4_t wide_fx2;
        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);

        wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));

        while (count >= 8) {
            int32x4_t wide_out;
            int32x4_t wide_out2;

            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
            wide_out = vorrq_s32(wide_out,
                           vaddq_s32(vshrq_n_s32(wide_fx, 16), vdupq_n_s32(1)));

            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
            wide_out2 = vorrq_s32(wide_out2,
                           vaddq_s32(vshrq_n_s32(wide_fx2, 16), vdupq_n_s32(1)));

            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));

            dst += 8;
            fx += dx*8;
            wide_fx = vaddq_s32(wide_fx, wide_dx8);
            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
            count -= 8;
        }
    }
#endif

    if (count & 1)
    {
        SkASSERT((fx >> (16 + 14)) == 0);
        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;
    }
    while ((count -= 2) >= 0)
    {
        SkASSERT((fx >> (16 + 14)) == 0);
        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;

        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;
    }
}

///////////////////////////////////////////////////////////////////////////////
// stores the same values as the SCALE procs, but is cheaper to compute. Also,
// since there is no scale, we don't need (or have) a FILTER version.

static void fill_sequential(uint16_t xptr[], int start, int count) {
#if 1
    // align to a 32-bit boundary so we can store two indices per write
    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
        *xptr++ = start++;
        count -= 1;
    }
    if (count > 3) {
        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
        start += count & ~3;
        int qcount = count >> 2;
        do {
            *xxptr++ = pattern0;
            pattern0 += 0x40004;    // advance both packed 16-bit values by 4
            *xxptr++ = pattern1;
            pattern1 += 0x40004;
        } while (--qcount != 0);
        xptr = reinterpret_cast<uint16_t*>(xxptr);
        count &= 3;
    }
    while (--count >= 0) {
        *xptr++ = start++;
    }
#else
    for (int i = 0; i < count; i++) {
        *xptr++ = start++;
    }
#endif
}
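/*  Layout produced by the translate-only procs below: nofilter_trans_preamble()
    stores the (tiled) Y coordinate once in the first 32-bit slot and returns
    the starting X; clampx/repeatx/mirrorx_nofilter_trans then fill the rest of
    the buffer with packed 16-bit X indices. The disabled sketch below is a
    hypothetical consumer (not part of the build) showing how such a buffer
    would be read back.
*/
#if 0
static void read_trans_buffer(const uint32_t xy[], int count, int* y,
                              uint16_t xOut[]) {
    *y = (int)xy[0];                                 // Y, stored once
    const uint16_t* xx = (const uint16_t*)(xy + 1);  // then count X values
    for (int i = 0; i < count; i++) {
        xOut[i] = xx[i];
    }
}
#endif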
static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
                                   int x, int y) {
    SkPoint pt;
    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &pt);
    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
                           s.fBitmap->height());
    *xy += 1;   // bump the ptr
    // return our starting X position
    return SkScalarToFixed(pt.fX) >> 16;
}

static void clampx_nofilter_trans(const SkBitmapProcState& s,
                                  uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos = nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    int n;

    // fill before 0 as needed
    if (xpos < 0) {
        n = -xpos;
        if (n > count) {
            n = count;
        }
        memset(xptr, 0, n * sizeof(uint16_t));
        count -= n;
        if (0 == count) {
            return;
        }
        xptr += n;
        xpos = 0;
    }

    // fill in 0..width-1 if needed
    if (xpos < width) {
        n = width - xpos;
        if (n > count) {
            n = count;
        }
        fill_sequential(xptr, xpos, n);
        count -= n;
        if (0 == count) {
            return;
        }
        xptr += n;
    }

    // fill the remaining with the max value
    sk_memset16(xptr, width - 1, count);
}

static void repeatx_nofilter_trans(const SkBitmapProcState& s,
                                   uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos = nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    int start = sk_int_mod(xpos, width);
    int n = width - start;
    if (n > count) {
        n = count;
    }
    fill_sequential(xptr, start, n);
    xptr += n;
    count -= n;

    while (count >= width) {
        fill_sequential(xptr, 0, width);
        xptr += width;
        count -= width;
    }

    if (count > 0) {
        fill_sequential(xptr, 0, count);
    }
}

static void fill_backwards(uint16_t xptr[], int pos, int count) {
    for (int i = 0; i < count; i++) {
        SkASSERT(pos >= 0);
        xptr[i] = pos--;
    }
}

static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
                                   uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos = nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    // need to know our start, and our initial phase (forward or backward)
    bool forward;
    int n;
    int start = sk_int_mod(xpos, 2 * width);
    if (start >= width) {
        start = width + ~(start - width);
        forward = false;
        n = start + 1;  // [start .. 0]
    } else {
        forward = true;
        n = width - start;  // [start .. width)
    }
    if (n > count) {
        n = count;
    }
    if (forward) {
        fill_sequential(xptr, start, n);
    } else {
        fill_backwards(xptr, start, n);
    }
    forward = !forward;
    xptr += n;
    count -= n;

    while (count >= width) {
        if (forward) {
            fill_sequential(xptr, 0, width);
        } else {
            fill_backwards(xptr, width - 1, width);
        }
        forward = !forward;
        xptr += width;
        count -= width;
    }

    if (count > 0) {
        if (forward) {
            fill_sequential(xptr, 0, count);
        } else {
            fill_backwards(xptr, width - 1, count);
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

SkBitmapProcState::MatrixProc
SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
    // test_int_tileprocs();
    // check for our special case when there is no scale/affine/perspective
    if (trivial_matrix) {
        SkASSERT(!fDoFilter);
        fIntTileProcY = choose_int_tile_proc(fTileModeY);
        switch (fTileModeX) {
            case SkShader::kClamp_TileMode:
                return clampx_nofilter_trans;
            case SkShader::kRepeat_TileMode:
                return repeatx_nofilter_trans;
            case SkShader::kMirror_TileMode:
                return mirrorx_nofilter_trans;
        }
    }

    int index = 0;
    if (fDoFilter) {
        index = 1;
    }
    if (fInvType & SkMatrix::kPerspective_Mask) {
        index += 4;
    } else if (fInvType & SkMatrix::kAffine_Mask) {
        index += 2;
    }

    if (SkShader::kClamp_TileMode == fTileModeX &&
        SkShader::kClamp_TileMode == fTileModeY)
    {
        // clamp gets special version of filterOne
        fFilterOneX = SK_Fixed1;
        fFilterOneY = SK_Fixed1;
        return ClampX_ClampY_Procs[index];
    }

    // all remaining procs use this form for filterOne
    fFilterOneX = SK_Fixed1 / fBitmap->width();
    fFilterOneY = SK_Fixed1 / fBitmap->height();

    if (SkShader::kRepeat_TileMode == fTileModeX &&
        SkShader::kRepeat_TileMode == fTileModeY)
    {
        return RepeatX_RepeatY_Procs[index];
    }

    fTileProcX = choose_tile_proc(fTileModeX);
    fTileProcY = choose_tile_proc(fTileModeY);
    return GeneralXY_Procs[index];
}
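/*  Note on the proc-table index computed in chooseMatrixProc() above: bit 0
    selects filtering, +2 selects the affine variants and +4 the perspective
    variants, so index runs 0..5. The ClampX_ClampY_Procs, RepeatX_RepeatY_Procs
    and GeneralXY_Procs tables come from the SkBitmapProcState_matrix*.h
    includes at the top of this file, and are presumably laid out in that same
    order: { scale/nofilter, scale/filter, affine/nofilter, affine/filter,
    persp/nofilter, persp/filter }.
*/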