/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__SSE2__)
    #include <immintrin.h>
#else
    #include <math.h>
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 =  int16_t __attribute__((ext_vector_type(16)));
    using I32 =  int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 =  int16_t __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);
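// With the vector types above, N is 16 pixels per pipeline pass under AVX2 and 8 otherwise.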

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

extern "C" MAYBE_MSABI void WRAP(start_pipeline)(const size_t x0,
                                                 const size_t y0,
                                                 const size_t xlimit,
                                                 const size_t ylimit,
                                                 void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

extern "C" ABI void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
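//
// For example, a hypothetical stage written as
//     STAGE_PP(noop, Ctx::None) {}
// expands to an extern "C" WRAP(noop) entry point that runs the (empty) body on the pixel
// registers and then chains into the next stage loaded from program.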

#define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
        split(x, &r,&g);                                                               \
        split(y, &b,&a);                                                               \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    extern "C" ABI void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}
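// Sanity check on the approximation: both sides of the #if agree at the extremes,
// e.g. v = 255*255 = 65025 gives (65025+255)/256 = 255 and (65025+127)/255 = 255,
// and across the full product range [0, 65025] the two differ by at most 1.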

SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
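// Note lerp() hits its endpoints exactly even with the approximate div255():
// t = 0 returns from, and t = 255 returns to.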

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}
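
// split()/join()/map() let the wide 8- or 16-lane vectors be processed as two native-width
// halves, so 128- or 256-bit platform intrinsics can be applied half by half.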

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}
SI F sqrt_(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_sqrt_ps);
#elif defined(__SSE__)
    return map(x, _mm_sqrt_ps);
#elif defined(__aarch64__)
    return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est;                // sqrt(v) == v*rsqrt(v).
    });
#else
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}

SI F floor_(F x) {
#if defined(__aarch64__)
    return map(x, vrndmq_f32);
#elif defined(__AVX2__)
    return map(x, +[](__m256 v){ return _mm256_floor_ps(v); });  // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
    return map(x, +[](__m128 v){ return    _mm_floor_ps(v); });  // _mm_floor_ps() is a macro too.
#else
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
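// In floor_()'s portable fallback above, cast<I32>() truncates toward zero, so for negative
// non-integral x the roundtrip value lands one above floor(x); the comparison subtracts 1 there.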
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }

// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}
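// (The iota buffer is expected to hold per-lane x offsets at pixel centers,
//  i.e. 0.5, 1.5, 2.5, ..., mirroring the +0.5f applied to y above.)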

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(clamp_a, Ctx::None) {
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
STAGE_PP(clamp_a_dst, Ctx::None) {
    dr = min(dr, da);
    dg = min(dg, da);
    db = min(db, da);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}
STAGE_PP(premul_dst, Ctx::None) {
    dr = div255(dr * da);
    dg = div255(dg * da);
    db = div255(db * da);
}

STAGE_PP(force_opaque    , Ctx::None) {  a = 255; }
STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = name##_channel(a,da,a,da);                   \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(clear)    { return 0; }
    BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin)    { return div255( s*da ); }
    BLEND_MODE(dstin)    { return div255( d*sa ); }
    BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
    BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_)    { return min(s+d, 255); }
    BLEND_MODE(screen)   { return s + d - div255( s*d ); }
    BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE
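// These are the usual Porter-Duff formulas in 8-bit fixed point: wherever the float version
// multiplies two [0,1] values, the product of two [0,255] values is scaled back down with
// div255(), e.g. srcover is s + d*(255-sa)/255.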

// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = a + div255( da*inv(a) );                     \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE
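// (Overlay is hardlight with source and destination swapped, which is why only the
//  2*s <= sa / 2*d <= da test differs between the two.)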

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}
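// Subtracting 1 from the bit pattern of a positive float yields the next representable
// value below it, so the clamp above maps the exclusive width/height bound to the largest
// in-bounds coordinate before truncation.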

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}
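// Both load() and store() rely on deliberate case fall-through: tail == 0 means a full run
// of N pixels, while tail == k copies exactly the first k lanes, handling leftover lanes one
// at a time until a memcpy of the remaining power-of-two prefix finishes the job.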

#if defined(__AVX2__)
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
    }

    template<>
    F gather(const float* p, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<F>(_mm256_i32gather_ps(p, lo, 4),
                       _mm256_i32gather_ps(p, hi, 4));
    }

    template<>
    U32 gather(const uint32_t* p, U32 ix) {
        __m256i lo, hi;
        split(ix, &lo, &hi);

        return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
                         _mm256_i32gather_epi32(p, hi, 4));
    }
#else
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
    }
#endif


// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>   16) & 255;
    *a = cast_U16(rgba >>   16) >>  8;
}
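// Lane layout: each 32-bit pixel is 0xAABBGGRR, so (rgba & 65535) holds g<<8 | r and
// (rgba >> 16) holds a<<8 | b; the final & 255 / >> 8 pick out the individual bytes.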

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
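// E.g. for the replication above, R = 31 becomes (31<<3)|(31>>2) = 248|7 = 255, and
// R = 16 becomes 128|4 = 132, close to the exact 16*255/31 = 131.6.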
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
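// ((R << 4) | R above is R*17, which is exactly R*255/15, so this scaling is lossless.)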
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
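// (The weights above, 54/256, 183/256, and 19/256, approximate the Rec. 709 luma
//  coefficients, and 54+183+19 = 256, so pure white maps to exactly 255.)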
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}
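// (Worked example: x = 2.5 gives abs(1.5 - 2*floor(0.75) - 1) = 0.5, matching the
//  mirrored tiling 0..1 up, 1..2 down, 2..3 up.)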

SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {

    F fr, fg, fb, fa, br, bg, bb, ba;
#if defined(__AVX2__)
    if (c->stopCount <=8) {
        __m256i lo, hi;
        split(idx, &lo, &hi);

        fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
        br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
        fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
        bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
        fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
        bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
        fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
        ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
                     _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
    } else
#endif
    {
        fr = gather<F>(c->fs[0], idx);
        fg = gather<F>(c->fs[1], idx);
        fb = gather<F>(c->fs[2], idx);
        fa = gather<F>(c->fs[3], idx);
        br = gather<F>(c->bs[0], idx);
        bg = gather<F>(c->bs[1], idx);
        bb = gather<F>(c->bs[2], idx);
        ba = gather<F>(c->bs[3], idx);
    }
    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}
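// Each gradient interval stores its color channels as a linear function of t,
// channel = f*t + b, which is why the lookup ends in a mad() per channel.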

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx to SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float-optimized polynomial was generated using the following command:
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
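// (Sanity check: (x,y) = (1,1) gives slope = 1 and the polynomial above evaluates to
//  ~0.125, i.e. 45 degrees expressed as a fraction of a full turn.)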
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)