/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkJumper_vectors_DEFINED
#define SkJumper_vectors_DEFINED

#include "SkJumper.h"
#include "SkJumper_misc.h"

// This file contains vector types that SkJumper_stages.cpp uses to define stages.

// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).

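// A note on the load3/load4/store4 helpers below: `tail` is how many pixels remain at the
// ragged end of a row.  tail == 0 means a full vector's worth of pixels; otherwise only the
// first `tail` lanes are read or written (that's how the tail branches index memory).
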
#if !defined(JUMPER)
    // This path should lead to portable code that can be compiled directly into Skia.
    // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
    #include <math.h>

    using F   = float   ;
    using I32 =  int32_t;
    using U64 = uint64_t;
    using U32 = uint32_t;
    using U16 = uint16_t;
    using U8  = uint8_t ;

    SI F   mad(F f, F m, F a)   { return f*m+a; }
    SI F   min(F a, F b)        { return fminf(a,b); }
    SI F   max(F a, F b)        { return fmaxf(a,b); }
    SI F   abs_  (F v)          { return fabsf(v); }
    SI F   floor_(F v)          { return floorf(v); }
    SI F   rcp   (F v)          { return 1.0f / v; }
    SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
    SI F    sqrt_(F v)          { return sqrtf(v); }
    SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
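    // (round() adds 0.5 and truncates, which assumes v*scale is non-negative,
    //  as the unsigned result type suggests.)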
    SI U16 pack(U32 v)          { return (U16)v; }
    SI U8  pack(U16 v)          { return  (U8)v; }

    SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    template <typename T>
    SI T gather(const T* p, U32 ix) { return p[ix]; }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

#elif defined(__aarch64__)
    #include <arm_neon.h>

    // Since we know we're using Clang, we can use its vector extensions.
    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
    SI F   mad(F f, F m, F a)                    { return vfmaq_f32(a,f,m);        }
    SI F   min(F a, F b)                         { return vminq_f32(a,b);          }
    SI F   max(F a, F b)                         { return vmaxq_f32(a,b);          }
    SI F   abs_  (F v)                           { return vabsq_f32(v);            }
    SI F   floor_(F v)                           { return vrndmq_f32(v);           }
    SI F   rcp   (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
    SI F   rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
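    // (vrecpeq/vrsqrteq return only rough estimates; the vrecpsq/vrsqrtsq multiply above
    //  applies one Newton-Raphson refinement step.)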
    SI F    sqrt_(F v)                           { return vsqrtq_f32(v); }
    SI U32 round (F v, F scale)                  { return vcvtnq_u32_f32(v*scale); }
    SI U16 pack(U32 v)                           { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)                           { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
            if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
            if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
        } else {
            rgb = vld3_u16(ptr);
        }
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4_u16(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
        }
    }
    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4q_f32(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
        }
    }

#elif defined(__arm__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
    template <typename T> using V = T __attribute__((ext_vector_type(2)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)                  { return vfma_f32(a,f,m);        }
    SI F   min(F a, F b)                       { return vmin_f32(a,b);          }
    SI F   max(F a, F b)                       { return vmax_f32(a,b);          }
    SI F   abs_ (F v)                          { return vabs_f32(v);            }
    SI F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
    SI F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
    SI U32 round(F v, F scale)                 { return vcvt_u32_f32(mad(v,scale,0.5f)); }
    SI U16 pack(U32 v)                         { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)                         { return __builtin_convertvector(v,  U8); }

    SI F sqrt_(F v) {
        auto e = vrsqrte_f32(v);  // Estimate and two refinement steps for e = rsqrt(v).
        e *= vrsqrts_f32(v,e*e);
        e *= vrsqrts_f32(v,e*e);
        return v*e;               // sqrt(v) == v*rsqrt(v).
    }

    SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

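    // ARMv7 NEON has no vector floor instruction, so round-trip through int (which
    // truncates toward zero) and subtract 1 wherever that overshot, i.e. for negative
    // non-integer values.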
    SI F floor_(F v) {
        F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]]};
    }

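    // These vectors are only 2 pixels wide, so when there is a tail it's exactly 1 pixel:
    // load or store lane 0, and zero or skip lane 1.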
    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        rgb = vld3_lane_u16(ptr + 0, rgb, 0);
        if (__builtin_expect(tail, 0)) {
            rgb.val[0] = vset_lane_u16(0, rgb.val[0], 1);
            rgb.val[1] = vset_lane_u16(0, rgb.val[1], 1);
            rgb.val[2] = vset_lane_u16(0, rgb.val[2], 1);
        } else {
            rgb = vld3_lane_u16(ptr + 3, rgb, 1);
        }
        *r = unaligned_load<U16>(rgb.val+0);
        *g = unaligned_load<U16>(rgb.val+1);
        *b = unaligned_load<U16>(rgb.val+2);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        rgba = vld4_lane_u16(ptr + 0, rgba, 0);
        if (__builtin_expect(tail, 0)) {
            rgba.val[0] = vset_lane_u16(0, rgba.val[0], 1);
            rgba.val[1] = vset_lane_u16(0, rgba.val[1], 1);
            rgba.val[2] = vset_lane_u16(0, rgba.val[2], 1);
            rgba.val[3] = vset_lane_u16(0, rgba.val[3], 1);
        } else {
            rgba = vld4_lane_u16(ptr + 4, rgba, 1);
        }
        *r = unaligned_load<U16>(rgba.val+0);
        *g = unaligned_load<U16>(rgba.val+1);
        *b = unaligned_load<U16>(rgba.val+2);
        *a = unaligned_load<U16>(rgba.val+3);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        uint16x4x4_t rgba = {{
            widen_cast<uint16x4_t>(r),
            widen_cast<uint16x4_t>(g),
            widen_cast<uint16x4_t>(b),
            widen_cast<uint16x4_t>(a),
        }};
        vst4_lane_u16(ptr + 0, rgba, 0);
        if (__builtin_expect(tail == 0, true)) {
            vst4_lane_u16(ptr + 4, rgba, 1);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x2x4_t rgba;
        if (__builtin_expect(tail, 0)) {
            rgba = vld4_dup_f32(ptr);
        } else {
            rgba = vld4_f32(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        if (__builtin_expect(tail, 0)) {
            vst4_lane_f32(ptr, (float32x2x4_t{{r,g,b,a}}), 0);
        } else {
            vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
        }
    }


#elif defined(__AVX__)
    #include <immintrin.h>

    // These are __m256 and __m256i, but friendlier and strongly-typed.
    template <typename T> using V = T __attribute__((ext_vector_type(8)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F mad(F f, F m, F a)  {
    #if defined(__FMA__)
        return _mm256_fmadd_ps(f,m,a);
    #else
        return f*m+a;
    #endif
    }

    SI F   min(F a, F b)        { return _mm256_min_ps(a,b);    }
    SI F   max(F a, F b)        { return _mm256_max_ps(a,b);    }
    SI F   abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
    SI F   floor_(F v)          { return _mm256_floor_ps(v);    }
    SI F   rcp   (F v)          { return _mm256_rcp_ps  (v);    }
    SI F   rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
    SI F    sqrt_(F v)          { return _mm256_sqrt_ps (v);    }
    SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

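    // _mm_packus_epi32/_mm_packus_epi16 saturate to [0,65535] and [0,255] respectively,
    // so values already in range pass through unchanged.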
    SI U16 pack(U32 v) {
        return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
                                _mm256_extractf128_si256(v, 1));
    }
    SI U8 pack(U16 v) {
        auto r = _mm_packus_epi16(v,v);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    }
    #if defined(__AVX2__)
        SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
        SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
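        // 64-bit gathers take a 128-bit (4-lane) index vector, so feed the 8 indices in halves.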
        SI U64 gather(const uint64_t* p, U32 ix) {
            __m256i parts[] = {
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
            };
            return bit_cast<U64>(parts);
        }
    #endif

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0,_1,_2,_3,_4,_5,_6,_7;
        if (__builtin_expect(tail,0)) {
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if (tail > 0) { _0 = load_rgb(ptr +  0); }
            if (tail > 1) { _1 = load_rgb(ptr +  3); }
            if (tail > 2) { _2 = load_rgb(ptr +  6); }
            if (tail > 3) { _3 = load_rgb(ptr +  9); }
            if (tail > 4) { _4 = load_rgb(ptr + 12); }
            if (tail > 5) { _5 = load_rgb(ptr + 15); }
            if (tail > 6) { _6 = load_rgb(ptr + 18); }
        } else {
            // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr +  0))    ;
            auto _23 =                _mm_loadu_si128((const __m128i*)(ptr +  6))    ;
            auto _45 =                _mm_loadu_si128((const __m128i*)(ptr + 12))    ;
            auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
            _0 = _01; _1 = _mm_srli_si128(_01, 6);
            _2 = _23; _3 = _mm_srli_si128(_23, 6);
            _4 = _45; _5 = _mm_srli_si128(_45, 6);
            _6 = _67; _7 = _mm_srli_si128(_67, 6);
        }

        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3),
             _46 = _mm_unpacklo_epi16(_4, _6),
             _57 = _mm_unpacklo_epi16(_5, _7);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             bx0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 xx xx xx xx
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             bx4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(bx0123, bx4567);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23, _45, _67;
        if (__builtin_expect(tail,0)) {
            auto src = (const double*)ptr;
            _01 = _23 = _45 = _67 = _mm_setzero_si128();
            if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
            if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
            if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
            if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
            if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
            _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
            _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
             _46 = _mm_unpacklo_epi16(_45, _67),
             _57 = _mm_unpackhi_epi16(_45, _67);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
             rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
             ba0123 = _mm_unpacklo_epi16(b, a),
             ba4567 = _mm_unpackhi_epi16(b, a);

        auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
             _23 = _mm_unpackhi_epi32(rg0123, ba0123),
             _45 = _mm_unpacklo_epi32(rg4567, ba4567),
             _67 = _mm_unpackhi_epi32(rg4567, ba4567);

        if (__builtin_expect(tail,0)) {
            auto dst = (double*)ptr;
            if (tail > 0) { _mm_storel_pd(dst+0, _01); }
            if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
            if (tail > 2) { _mm_storel_pd(dst+2, _23); }
            if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
            if (tail > 4) { _mm_storel_pd(dst+4, _45); }
            if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
            if (tail > 6) { _mm_storel_pd(dst+6, _67); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _01);
            _mm_storeu_si128((__m128i*)ptr + 1, _23);
            _mm_storeu_si128((__m128i*)ptr + 2, _45);
            _mm_storeu_si128((__m128i*)ptr + 3, _67);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _04, _15, _26, _37;

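        // The cases fall through deliberately: starting at `tail` (0 meaning all 8 pixels),
        // this loads pixels from the highest valid index down to pixel 0, four floats at a time.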
        switch (tail) {
            case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
            case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
            case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
            case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
            case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
            case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
            case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
            case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
        }

        F rg0145 = _mm256_unpacklo_ps(_04,_15),  // r0 r1 g0 g1 | r4 r5 g4 g5
          ba0145 = _mm256_unpackhi_ps(_04,_15),
          rg2367 = _mm256_unpacklo_ps(_26,_37),
          ba2367 = _mm256_unpackhi_ps(_26,_37);

        *r = _mm256_unpacklo_pd(rg0145, rg2367);
        *g = _mm256_unpackhi_pd(rg0145, rg2367);
        *b = _mm256_unpacklo_pd(ba0145, ba2367);
        *a = _mm256_unpackhi_pd(ba0145, ba2367);
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
          rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
          ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
          ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...

        F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
          _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
          _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
          _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...

        if (__builtin_expect(tail, 0)) {
            if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
            if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
            if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
            if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
            if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
            if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
            if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
        } else {
            F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
              _23 = _mm256_permute2f128_ps(_26, _37, 32),
              _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
              _67 = _mm256_permute2f128_ps(_26, _37, 49);
            _mm256_storeu_ps(ptr+ 0, _01);
            _mm256_storeu_ps(ptr+ 8, _23);
            _mm256_storeu_ps(ptr+16, _45);
            _mm256_storeu_ps(ptr+24, _67);
        }
    }

#elif defined(__SSE2__)
    #include <immintrin.h>

    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)  { return f*m+a;              }
    SI F   min(F a, F b)       { return _mm_min_ps(a,b);    }
    SI F   max(F a, F b)       { return _mm_max_ps(a,b);    }
    SI F   abs_(F v)           { return _mm_and_ps(v, 0-v); }
    SI F   rcp   (F v)         { return _mm_rcp_ps  (v);    }
    SI F   rsqrt (F v)         { return _mm_rsqrt_ps(v);    }
    SI F    sqrt_(F v)         { return _mm_sqrt_ps (v);    }
    SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
    #if defined(__SSE4_1__)
        auto p = _mm_packus_epi32(v,v);
    #else
        // Sign extend so that _mm_packs_epi32() does the pack we want.
        auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
        p = _mm_packs_epi32(p,p);
    #endif
        return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
    }
    SI U8 pack(U16 v) {
        auto r = widen_cast<__m128i>(v);
        r = _mm_packus_epi16(r,r);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) {
        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
    }

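    // Same trick as the ARMv7 floor_(): truncation rounds toward zero, so subtract 1
    // wherever the round-trip overshot (negative non-integers).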
    SI F floor_(F v) {
    #if defined(__SSE4_1__)
        return _mm_floor_ps(v);
    #else
        F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    #endif
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0, _1, _2, _3;
        if (__builtin_expect(tail,0)) {
            _1 = _2 = _3 = _mm_setzero_si128();
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if (  true  ) { _0 = load_rgb(ptr + 0); }
            if (tail > 1) { _1 = load_rgb(ptr + 3); }
            if (tail > 2) { _2 = load_rgb(ptr + 6); }
        } else {
            // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr + 0))    ,
                 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);

            // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
            _0 = _01;
            _1 = _mm_srli_si128(_01, 6);
            _2 = _23;
            _3 = _mm_srli_si128(_23, 6);
        }

        // De-interlace to R,G,B.
        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3);  // r1 r3 g1 g3 b1 b3 xx xx

        auto R = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             G = _mm_srli_si128(R, 8),
             B = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 xx xx xx xx

        *r = unaligned_load<U16>(&R);
        *g = unaligned_load<U16>(&G);
        *b = unaligned_load<U16>(&B);
    }

    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23;
        if (__builtin_expect(tail,0)) {
            _01 = _23 = _mm_setzero_si128();
            auto src = (const double*)ptr;
            if (  true  ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3

        auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3

        *r = unaligned_load<U16>((uint16_t*)&rg + 0);
        *g = unaligned_load<U16>((uint16_t*)&rg + 4);
        *b = unaligned_load<U16>((uint16_t*)&ba + 0);
        *a = unaligned_load<U16>((uint16_t*)&ba + 4);
    }

    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
             ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));

        if (__builtin_expect(tail, 0)) {
            auto dst = (double*)ptr;
            if (  true  ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
            _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _0, _1, _2, _3;
        if (__builtin_expect(tail, 0)) {
            _1 = _2 = _3 = _mm_setzero_ps();
            if (  true  ) { _0 = _mm_loadu_ps(ptr + 0); }
            if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
            if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
        } else {
            _0 = _mm_loadu_ps(ptr + 0);
            _1 = _mm_loadu_ps(ptr + 4);
            _2 = _mm_loadu_ps(ptr + 8);
            _3 = _mm_loadu_ps(ptr +12);
        }
        _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
        *r = _0;
        *g = _1;
        *b = _2;
        *a = _3;
    }

    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        _MM_TRANSPOSE4_PS(r,g,b,a);
        if (__builtin_expect(tail, 0)) {
            if (  true  ) { _mm_storeu_ps(ptr + 0, r); }
            if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
            if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
        } else {
            _mm_storeu_ps(ptr + 0, r);
            _mm_storeu_ps(ptr + 4, g);
            _mm_storeu_ps(ptr + 8, b);
            _mm_storeu_ps(ptr +12, a);
        }
    }
#endif

// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#else
    SI F   cast  (U32 v) { return   (F)v; }
    SI U32 trunc_(F   v) { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#endif

template <typename V>
SI V if_then_else(I32 c, V t, V e) {
    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
}

SI U16 bswap(U16 x) {
#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
    // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
    // when generating code for SSE2 and SSE4.1.  We'll do it manually...
    auto v = widen_cast<__m128i>(x);
    v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
    return unaligned_load<U16>(&v);
#else
    return (x<<8) | (x>>8);
#endif
}

SI F fract(F v) { return v - floor_(v); }

// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
SI F approx_log2(F x) {
    // e - 127 is a fair approximation of log2(x) in its own right...
    F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));

    // ... but using the mantissa to refine its error is _much_ better.
    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
    return e
         - 124.225514990f
         -   1.498030302f * m
         -   1.725879990f / (0.3520887068f + m);
}
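// approx_pow2() runs the same trick in reverse: scale by 2^23 and round to build the
// float's bit pattern directly, correcting the fractional part with a small rational term.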
SI F approx_pow2(F x) {
    F f = fract(x);
    return bit_cast<F>(round(1.0f * (1<<23),
                             x + 121.274057500f
                               -   1.490129070f * f
                               +  27.728023300f / (4.84252568f - f)));
}

SI F approx_powf(F x, F y) {
    return approx_pow2(approx_log2(x) * y);
}

SI F from_half(U16 h) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f32_f16(h);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<uint16x4_t>(h);
    return vget_low_f32(vcvt_f32_f16(v));

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtph_ps(h);

#else
    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
    U32 sem = expand(h),
        s   = sem & 0x8000,
         em = sem ^ s;

    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x0400;      // I32 comparison is often quicker, and always safe here.
    return if_then_else(denorm, F(0)
                              , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
#endif
}

SI U16 to_half(F f) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f16_f32(f);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<float32x4_t>(f);
    uint16x4_t h = vcvt_f16_f32(v);
    return unaligned_load<U16>(&h);

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#else
    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
    U32 sem = bit_cast<U32>(f),
        s   = sem & 0x80000000,
         em = sem ^ s;

    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
    return pack(if_then_else(denorm, U32(0)
                                   , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}


#endif//SkJumper_vectors_DEFINED