1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7 #include "SkBitmapProcState.h" 8 #include "SkPerspIter.h" 9 #include "SkShader.h" 10 #include "SkUtils.h" 11 12 /* returns 0...(n-1) given any x (positive or negative). 13 14 As an example, if n (which is always positive) is 5... 15 16 x: -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 17 returns: 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 18 */ 19 static inline int sk_int_mod(int x, int n) { 20 SkASSERT(n > 0); 21 if ((unsigned)x >= (unsigned)n) { 22 if (x < 0) { 23 x = n + ~(~x % n); 24 } else { 25 x = x % n; 26 } 27 } 28 return x; 29 } 30 31 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 32 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 33 34 #define MAKENAME(suffix) ClampX_ClampY ## suffix 35 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 36 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 37 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 38 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 39 #define CHECK_FOR_DECAL 40 #if defined(__ARM_HAVE_NEON) 41 #include "SkBitmapProcState_matrix_clamp.h" 42 #else 43 #include "SkBitmapProcState_matrix.h" 44 #endif 45 46 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix 47 #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) 48 #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16) 49 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 50 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 51 #if defined(__ARM_HAVE_NEON) 52 #include "SkBitmapProcState_matrix_repeat.h" 53 #else 54 #include "SkBitmapProcState_matrix.h" 55 #endif 56 57 #define MAKENAME(suffix) GeneralXY ## suffix 58 #define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \ 59 SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY 60 #define PREAMBLE_PARAM_X , SkBitmapProcState::FixedTileProc tileProcX 61 #define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY 62 #define PREAMBLE_ARG_X , tileProcX 63 #define PREAMBLE_ARG_Y , tileProcY 64 #define TILEX_PROCF(fx, max) (tileProcX(fx) * ((max) + 1) >> 16) 65 #define TILEY_PROCF(fy, max) (tileProcY(fy) * ((max) + 1) >> 16) 66 #define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF) 67 #define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF) 68 #include "SkBitmapProcState_matrix.h" 69 70 static inline U16CPU fixed_clamp(SkFixed x) 71 { 72 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR 73 if (x >> 16) 74 x = 0xFFFF; 75 if (x < 0) 76 x = 0; 77 #else 78 if (x >> 16) 79 { 80 if (x < 0) 81 x = 0; 82 else 83 x = 0xFFFF; 84 } 85 #endif 86 return x; 87 } 88 89 static inline U16CPU fixed_repeat(SkFixed x) 90 { 91 return x & 0xFFFF; 92 } 93 94 static inline U16CPU fixed_mirror(SkFixed x) 95 { 96 SkFixed s = x << 15 >> 31; 97 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval 98 return (x ^ s) & 0xFFFF; 99 } 100 101 static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m) 102 { 103 if (SkShader::kClamp_TileMode == m) 104 return fixed_clamp; 105 if (SkShader::kRepeat_TileMode == m) 106 return fixed_repeat; 107 SkASSERT(SkShader::kMirror_TileMode == m); 108 return fixed_mirror; 109 } 110 111 static inline U16CPU int_clamp(int x, int n) { 112 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR 113 if (x >= n) 114 x = n - 1; 115 if (x < 0) 116 x = 0; 117 #else 118 if ((unsigned)x >= (unsigned)n) { 119 if (x < 0) { 120 x = 0; 121 } else { 122 x = n - 1; 123 } 124 } 125 #endif 126 return x; 127 } 128 129 static inline U16CPU int_repeat(int x, int n) { 130 return sk_int_mod(x, n); 131 } 132 133 static inline U16CPU int_mirror(int x, int n) { 134 x = sk_int_mod(x, 2 * n); 135 if (x >= n) { 136 x = n + ~(x - n); 137 } 138 return x; 139 } 140 141 #if 0 142 static void test_int_tileprocs() { 143 for (int i = -8; i <= 8; i++) { 144 SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3)); 145 } 146 } 147 #endif 148 149 static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) { 150 if (SkShader::kClamp_TileMode == tm) 151 return int_clamp; 152 if (SkShader::kRepeat_TileMode == tm) 153 return int_repeat; 154 SkASSERT(SkShader::kMirror_TileMode == tm); 155 return int_mirror; 156 } 157 158 ////////////////////////////////////////////////////////////////////////////// 159 160 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 161 { 162 int i; 163 164 #if defined(__ARM_HAVE_NEON) 165 if (count >= 8) { 166 /* SkFixed is 16.16 fixed point */ 167 SkFixed dx2 = dx+dx; 168 SkFixed dx4 = dx2+dx2; 169 SkFixed dx8 = dx4+dx4; 170 171 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 172 SkFixed fx1, fx2, fx3; 173 int32x2_t lower, upper; 174 int32x4_t lbase, hbase; 175 uint16_t *dst16 = (uint16_t *)dst; 176 177 fx1 = fx+dx; 178 fx2 = fx1+dx; 179 fx3 = fx2+dx; 180 181 /* avoid an 'lbase unitialized' warning */ 182 lbase = vdupq_n_s32(fx); 183 lbase = vsetq_lane_s32(fx1, lbase, 1); 184 lbase = vsetq_lane_s32(fx2, lbase, 2); 185 lbase = vsetq_lane_s32(fx3, lbase, 3); 186 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 187 188 /* take upper 16 of each, store, and bump everything */ 189 do { 190 int32x4_t lout, hout; 191 uint16x8_t hi16; 192 193 lout = lbase; 194 hout = hbase; 195 /* gets hi's of all louts then hi's of all houts */ 196 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 197 hi16 = vreinterpretq_u16_s32(hout); 198 vst1q_u16(dst16, hi16); 199 200 /* on to the next */ 201 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 202 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 203 dst16 += 8; 204 count -= 8; 205 fx += dx8; 206 } while (count >= 8); 207 dst = (uint32_t *) dst16; 208 } 209 #else 210 for (i = (count >> 2); i > 0; --i) 211 { 212 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 213 fx += dx+dx; 214 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 215 fx += dx+dx; 216 } 217 count &= 3; 218 #endif 219 220 uint16_t* xx = (uint16_t*)dst; 221 for (i = count; i > 0; --i) { 222 *xx++ = SkToU16(fx >> 16); fx += dx; 223 } 224 } 225 226 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 227 { 228 229 #if defined(__ARM_HAVE_NEON) 230 if (count >= 8) { 231 int32x4_t wide_fx; 232 int32x4_t wide_fx2; 233 int32x4_t wide_dx8 = vdupq_n_s32(dx*8); 234 235 wide_fx = vdupq_n_s32(fx); 236 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 237 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 238 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 239 240 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); 241 242 while (count >= 8) { 243 int32x4_t wide_out; 244 int32x4_t wide_out2; 245 246 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 247 wide_out = vorrq_s32(wide_out, 248 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); 249 250 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 251 wide_out2 = vorrq_s32(wide_out2, 252 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); 253 254 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 255 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 256 257 dst += 8; 258 fx += dx*8; 259 wide_fx = vaddq_s32(wide_fx, wide_dx8); 260 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); 261 count -= 8; 262 } 263 } 264 #endif 265 266 if (count & 1) 267 { 268 SkASSERT((fx >> (16 + 14)) == 0); 269 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 270 fx += dx; 271 } 272 while ((count -= 2) >= 0) 273 { 274 SkASSERT((fx >> (16 + 14)) == 0); 275 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 276 fx += dx; 277 278 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 279 fx += dx; 280 } 281 } 282 283 /////////////////////////////////////////////////////////////////////////////// 284 // stores the same as SCALE, but is cheaper to compute. Also since there is no 285 // scale, we don't need/have a FILTER version 286 287 static void fill_sequential(uint16_t xptr[], int start, int count) { 288 #if 1 289 if (reinterpret_cast<intptr_t>(xptr) & 0x2) { 290 *xptr++ = start++; 291 count -= 1; 292 } 293 if (count > 3) { 294 uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr); 295 uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1); 296 uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3); 297 start += count & ~3; 298 int qcount = count >> 2; 299 do { 300 *xxptr++ = pattern0; 301 pattern0 += 0x40004; 302 *xxptr++ = pattern1; 303 pattern1 += 0x40004; 304 } while (--qcount != 0); 305 xptr = reinterpret_cast<uint16_t*>(xxptr); 306 count &= 3; 307 } 308 while (--count >= 0) { 309 *xptr++ = start++; 310 } 311 #else 312 for (int i = 0; i < count; i++) { 313 *xptr++ = start++; 314 } 315 #endif 316 } 317 318 static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy, 319 int x, int y) { 320 SkPoint pt; 321 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 322 SkIntToScalar(y) + SK_ScalarHalf, &pt); 323 **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16, 324 s.fBitmap->height()); 325 *xy += 1; // bump the ptr 326 // return our starting X position 327 return SkScalarToFixed(pt.fX) >> 16; 328 } 329 330 static void clampx_nofilter_trans(const SkBitmapProcState& s, 331 uint32_t xy[], int count, int x, int y) { 332 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 333 334 int xpos = nofilter_trans_preamble(s, &xy, x, y); 335 const int width = s.fBitmap->width(); 336 if (1 == width) { 337 // all of the following X values must be 0 338 memset(xy, 0, count * sizeof(uint16_t)); 339 return; 340 } 341 342 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 343 int n; 344 345 // fill before 0 as needed 346 if (xpos < 0) { 347 n = -xpos; 348 if (n > count) { 349 n = count; 350 } 351 memset(xptr, 0, n * sizeof(uint16_t)); 352 count -= n; 353 if (0 == count) { 354 return; 355 } 356 xptr += n; 357 xpos = 0; 358 } 359 360 // fill in 0..width-1 if needed 361 if (xpos < width) { 362 n = width - xpos; 363 if (n > count) { 364 n = count; 365 } 366 fill_sequential(xptr, xpos, n); 367 count -= n; 368 if (0 == count) { 369 return; 370 } 371 xptr += n; 372 } 373 374 // fill the remaining with the max value 375 sk_memset16(xptr, width - 1, count); 376 } 377 378 static void repeatx_nofilter_trans(const SkBitmapProcState& s, 379 uint32_t xy[], int count, int x, int y) { 380 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 381 382 int xpos = nofilter_trans_preamble(s, &xy, x, y); 383 const int width = s.fBitmap->width(); 384 if (1 == width) { 385 // all of the following X values must be 0 386 memset(xy, 0, count * sizeof(uint16_t)); 387 return; 388 } 389 390 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 391 int start = sk_int_mod(xpos, width); 392 int n = width - start; 393 if (n > count) { 394 n = count; 395 } 396 fill_sequential(xptr, start, n); 397 xptr += n; 398 count -= n; 399 400 while (count >= width) { 401 fill_sequential(xptr, 0, width); 402 xptr += width; 403 count -= width; 404 } 405 406 if (count > 0) { 407 fill_sequential(xptr, 0, count); 408 } 409 } 410 411 static void fill_backwards(uint16_t xptr[], int pos, int count) { 412 for (int i = 0; i < count; i++) { 413 SkASSERT(pos >= 0); 414 xptr[i] = pos--; 415 } 416 } 417 418 static void mirrorx_nofilter_trans(const SkBitmapProcState& s, 419 uint32_t xy[], int count, int x, int y) { 420 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 421 422 int xpos = nofilter_trans_preamble(s, &xy, x, y); 423 const int width = s.fBitmap->width(); 424 if (1 == width) { 425 // all of the following X values must be 0 426 memset(xy, 0, count * sizeof(uint16_t)); 427 return; 428 } 429 430 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 431 // need to know our start, and our initial phase (forward or backward) 432 bool forward; 433 int n; 434 int start = sk_int_mod(xpos, 2 * width); 435 if (start >= width) { 436 start = width + ~(start - width); 437 forward = false; 438 n = start + 1; // [start .. 0] 439 } else { 440 forward = true; 441 n = width - start; // [start .. width) 442 } 443 if (n > count) { 444 n = count; 445 } 446 if (forward) { 447 fill_sequential(xptr, start, n); 448 } else { 449 fill_backwards(xptr, start, n); 450 } 451 forward = !forward; 452 xptr += n; 453 count -= n; 454 455 while (count >= width) { 456 if (forward) { 457 fill_sequential(xptr, 0, width); 458 } else { 459 fill_backwards(xptr, width - 1, width); 460 } 461 forward = !forward; 462 xptr += width; 463 count -= width; 464 } 465 466 if (count > 0) { 467 if (forward) { 468 fill_sequential(xptr, 0, count); 469 } else { 470 fill_backwards(xptr, width - 1, count); 471 } 472 } 473 } 474 475 /////////////////////////////////////////////////////////////////////////////// 476 477 SkBitmapProcState::MatrixProc 478 SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { 479 // test_int_tileprocs(); 480 // check for our special case when there is no scale/affine/perspective 481 if (trivial_matrix) { 482 SkASSERT(!fDoFilter); 483 fIntTileProcY = choose_int_tile_proc(fTileModeY); 484 switch (fTileModeX) { 485 case SkShader::kClamp_TileMode: 486 return clampx_nofilter_trans; 487 case SkShader::kRepeat_TileMode: 488 return repeatx_nofilter_trans; 489 case SkShader::kMirror_TileMode: 490 return mirrorx_nofilter_trans; 491 } 492 } 493 494 int index = 0; 495 if (fDoFilter) { 496 index = 1; 497 } 498 if (fInvType & SkMatrix::kPerspective_Mask) { 499 index += 4; 500 } else if (fInvType & SkMatrix::kAffine_Mask) { 501 index += 2; 502 } 503 504 if (SkShader::kClamp_TileMode == fTileModeX && 505 SkShader::kClamp_TileMode == fTileModeY) 506 { 507 // clamp gets special version of filterOne 508 fFilterOneX = SK_Fixed1; 509 fFilterOneY = SK_Fixed1; 510 return ClampX_ClampY_Procs[index]; 511 } 512 513 // all remaining procs use this form for filterOne 514 fFilterOneX = SK_Fixed1 / fBitmap->width(); 515 fFilterOneY = SK_Fixed1 / fBitmap->height(); 516 517 if (SkShader::kRepeat_TileMode == fTileModeX && 518 SkShader::kRepeat_TileMode == fTileModeY) 519 { 520 return RepeatX_RepeatY_Procs[index]; 521 } 522 523 fTileProcX = choose_tile_proc(fTileModeX); 524 fTileProcY = choose_tile_proc(fTileModeY); 525 return GeneralXY_Procs[index]; 526 } 527 528