/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__SSE2__)
    #include <immintrin.h>
#else
    #include <math.h>
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 = int16_t  __attribute__((ext_vector_type(16)));
    using I32 = int32_t  __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 = int16_t  __attribute__((ext_vector_type(8)));
    using I32 = int32_t  __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
                                                 const size_t y0,
                                                 const size_t xlimit,
                                                 const size_t ylimit,
                                                 void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
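// (Roughly: `program` is a flat array of void* holding each stage's function pointer,
// interleaved with any context pointers the stages need.  load_and_inc() walks that
// array, so Ctx{program} below pulls a stage's context and each macro pulls the next
// stage to call.  See SkJumper.h / SkJumper_misc.h for the real plumbing.)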
#define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
        split(x, &r,&g);                                                               \
        split(y, &b,&a);                                                               \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}
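// (For example, with v = 381 the exact form gives (381+127)/255 == 1 while the
// approximation gives (381+255)/256 == 2.  Over the whole range of 8-bit products
// the two differ by at most one, which is why the cheaper divide-by-256 is used.)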
SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}
SI F sqrt_(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_sqrt_ps);
#elif defined(__SSE__)
    return map(x, _mm_sqrt_ps);
#elif defined(__aarch64__)
    return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est;                // sqrt(v) == v*rsqrt(v).
    });
#else
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}

SI F floor_(F x) {
#if defined(__aarch64__)
    return map(x, vrndmq_f32);
#elif defined(__AVX2__)
    return map(x, +[](__m256 v){ return _mm256_floor_ps(v); });  // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
    return map(x, +[](__m128 v){ return _mm_floor_ps(v); });     // _mm_floor_ps() is a macro too.
#else
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
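// (The fallback works because float-to-int conversion truncates toward zero: for a
// negative non-integer like -1.5 the roundtrip value is -1.0, which is greater than x,
// so we subtract 1 to land on the true floor of -2.)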
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }

// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(clamp_a, Ctx::None) {
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
STAGE_PP(clamp_a_dst, Ctx::None) {
    dr = min(dr, da);
    dg = min(dg, da);
    db = min(db, da);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}
STAGE_PP(premul_dst, Ctx::None) {
    dr = div255(dr * da);
    dg = div255(dg * da);
    db = div255(db * da);
}

STAGE_PP(force_opaque    , Ctx::None) {  a = 255; }
STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                  \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);  \
    STAGE_PP(name, Ctx::None) {                           \
        r = name##_channel(r,dr,a,da);                    \
        g = name##_channel(g,dg,a,da);                    \
        b = name##_channel(b,db,a,da);                    \
        a = name##_channel(a,da,a,da);                    \
    }                                                     \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
BLEND_MODE(srcin)    { return div255( s*da ); }
BLEND_MODE(dstin)    { return div255( d*sa ); }
BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
BLEND_MODE(modulate) { return div255( s*d ); }
BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
BLEND_MODE(plus_)    { return min(s+d, 255); }
BLEND_MODE(screen)   { return s + d - div255( s*d ); }
BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE
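// (These are the usual premultiplied Porter-Duff formulas with alpha mapped to [0,255]:
// a factor like (1-da) becomes inv(da), and each product picks up a div255.  E.g.
// srcover's s + d*(1-sa) becomes s + div255(d*inv(sa)), which reduces to just s when
// the source is opaque, sa == 255.)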
// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name)                                  \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);  \
    STAGE_PP(name, Ctx::None) {                           \
        r = name##_channel(r,dr,a,da);                    \
        g = name##_channel(g,dg,a,da);                    \
        b = name##_channel(b,db,a,da);                    \
        a = a + div255( da*inv(a) );                      \
    }                                                     \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

BLEND_MODE(hardlight) {
    return div255( s*inv(da) + d*inv(sa) +
                    if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
}
BLEND_MODE(overlay) {
    return div255( s*inv(da) + d*inv(sa) +
                    if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
}
#undef BLEND_MODE

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr, 8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr, 4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr, 2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v, 8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v, 4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v, 2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}
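// (load() and store() handle the partial vector at the end of a row: tail == 0 means a
// full N-pixel chunk, otherwise only `tail` lanes are touched.  The cases fall through
// deliberately, copying one lane at a time down to the next power-of-two memcpy.)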
#if defined(__AVX2__)
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }

    template<>
    F gather(const float* p, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<F>(_mm256_i32gather_ps(p, lo, 4),
                       _mm256_i32gather_ps(p, hi, 4));
    }

    template<>
    U32 gather(const uint32_t* p, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
                         _mm256_i32gather_epi32(p, hi, 4));
    }
#else
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
    }
#endif


// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >> 16) & 255;
    *a = cast_U16(rgba >> 16) >>  8;
}

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
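// (E.g. a full 5-bit red of 31 becomes (31<<3)|(31>>2) == 248|7 == 255, and 0 stays 0,
// so the 5-bit range [0,31] spreads evenly over the 8-bit range [0,255].)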
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
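// (Same trick as 565: (R<<4)|R multiplies by 17, so 0 -> 0 and 15 -> 255 exactly.)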
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
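// (The integer weights 54+183+19 sum to exactly 256, so a white input of r=g=b=255
// maps to a == 255 and the result never overflows 8 bits.)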
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
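// (A quick worked example for mirror_x_1: x = 1.25 gives
// abs(0.25 - 2*floor(0.125) - 1) == abs(-0.75) == 0.75, i.e. 1.25 reflected back into
// [0,1], and x = 2.5 starts the next ascending period at 0.5.)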
SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {

    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(__AVX2__)
    if (c->stopCount <=8) {
        __m256i lo, hi;
        split(idx, &lo, &hi);

        fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
        br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
        fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
        bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
        fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
        bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
        fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
        ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
    } else
#endif
    {
        fr = gather<F>(c->fs[0], idx);
        fg = gather<F>(c->fs[1], idx);
        fb = gather<F>(c->fs[2], idx);
        fa = gather<F>(c->fs[3], idx);
        br = gather<F>(c->bs[0], idx);
        bg = gather<F>(c->bs[1], idx);
        bb = gather<F>(c->bs[2], idx);
        ba = gather<F>(c->bs[3], idx);
    }
    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)