/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_neon_DEFINED
#define SkNx_neon_DEFINED

#include <arm_neon.h>  // for the NEON intrinsics used throughout this header

#define SKNX_IS_FAST

// ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) {
    auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
    auto too_big = vcgtq_f32(roundtrip, v);
    return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)));
}
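
// A worked example of the fixup above (values for illustration only):
//   v         = { -0.5f,  1.5f, -2.0f,  3.7f }
//   roundtrip = {  0.0f,  1.0f, -2.0f,  3.0f }   (truncation rounds toward zero)
//   too_big   = {   ~0,     0,     0,     0  }   (only lane 0 overshot)
//   result    = { -1.0f,  1.0f, -2.0f,  3.0f }   == floor(v) in every lane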

// Well, this is absurd.  The shifts require compile-time constant arguments.

#define SHIFT8(op, v, bits) switch(bits) { \
    case  1: return op(v,  1);  case  2: return op(v,  2);  case  3: return op(v,  3); \
    case  4: return op(v,  4);  case  5: return op(v,  5);  case  6: return op(v,  6); \
    case  7: return op(v,  7); \
    } return fVec

#define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \
                                case  8: return op(v,  8);  case  9: return op(v,  9); \
    case 10: return op(v, 10);  case 11: return op(v, 11);  case 12: return op(v, 12); \
    case 13: return op(v, 13);  case 14: return op(v, 14);  case 15: return op(v, 15); \
    } return fVec

#define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \
    case 16: return op(v, 16);  case 17: return op(v, 17);  case 18: return op(v, 18); \
    case 19: return op(v, 19);  case 20: return op(v, 20);  case 21: return op(v, 21); \
    case 22: return op(v, 22);  case 23: return op(v, 23);  case 24: return op(v, 24); \
    case 25: return op(v, 25);  case 26: return op(v, 26);  case 27: return op(v, 27); \
    case 28: return op(v, 28);  case 29: return op(v, 29);  case 30: return op(v, 30); \
    case 31: return op(v, 31); } return fVec
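
// E.g. Sk4h::operator<< below uses SHIFT16(vshl_n_u16, fVec, bits), which
// expands (via SHIFT8) to switches whose every case passes a literal count:
//     switch (bits) { case 1: return vshl_n_u16(fVec, 1); /* ... */ }
// so the intrinsic always sees the compile-time constant it requires.  A
// `bits` of 0 (or out of range) falls through to `return fVec`, i.e. no shift.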

template <>
class SkNx<2, float> {
public:
    SkNx(float32x2_t vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val)           : fVec(vdup_n_f32(val)) {}
    static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
    SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }

    void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }

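    // vrecpe_f32 returns a low-precision estimate of 1/x; each vrecps_f32 step
    // below is one Newton-Raphson refinement, roughly doubling the accurate bits.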
    SkNx approxInvert() const {
        float32x2_t est0 = vrecpe_f32(fVec),
                    est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
        return est1;
    }
    SkNx invert() const {
        float32x2_t est1 = this->approxInvert().fVec,
                    est2 = vmul_f32(vrecps_f32(est1, fVec), est1);
        return est2;
    }

    SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const {
    #if defined(SK_CPU_ARM64)
        return vdiv_f32(fVec, o.fVec);
    #else
        return vmul_f32(fVec, o.invert().fVec);
    #endif
    }

    SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
    SkNx operator  < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
    SkNx operator  > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
    SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
    SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
    SkNx operator != (const SkNx& o) const {
        return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
    }

    static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }

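    // rsqrtN() approximates 1/sqrt(x) with N Newton-Raphson refinements:
    // rsqrt0() is the raw vrsqrte_f32 hardware estimate, and each vrsqrts_f32
    // step roughly doubles the number of accurate bits.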
    SkNx rsqrt0() const { return vrsqrte_f32(fVec); }
    SkNx rsqrt1() const {
        float32x2_t est0 = this->rsqrt0().fVec;
        return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
    }
    SkNx rsqrt2() const {
        float32x2_t est1 = this->rsqrt1().fVec;
        return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
    }

    SkNx sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrt_f32(fVec);
    #else
        return *this * this->rsqrt2();  // sqrt(x) ~= x * (1/sqrt(x))
    #endif
    }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { float32x2_t v; float fs[2]; } pun = {fVec};
        return pun.fs[k&1];
    }

    bool allTrue() const {
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) && vget_lane_u32(v,1);
    }
    bool anyTrue() const {
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) || vget_lane_u32(v,1);
    }

    float32x2_t fVec;
};

template <>
class SkNx<4, float> {
public:
    SkNx(float32x4_t vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val)           : fVec(vdupq_n_f32(val)) {}
    static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
    SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }

    void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }
    SkNx approxInvert() const {
        float32x4_t est0 = vrecpeq_f32(fVec),
                    est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
        return est1;
    }
    SkNx invert() const {
        float32x4_t est1 = this->approxInvert().fVec,
                    est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1);
        return est2;
    }

    SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const {
    #if defined(SK_CPU_ARM64)
        return vdivq_f32(fVec, o.fVec);
    #else
        return vmulq_f32(fVec, o.invert().fVec);
    #endif
    }

    SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
    SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
    SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
    SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
    SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
    SkNx operator!=(const SkNx& o) const {
        return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
    }

    static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }

    SkNx abs() const { return vabsq_f32(fVec); }
    SkNx floor() const {
    #if defined(SK_CPU_ARM64)
        return vrndmq_f32(fVec);
    #else
        return armv7_vrndmq_f32(fVec);
    #endif
    }

    SkNx rsqrt0() const { return vrsqrteq_f32(fVec); }
    SkNx rsqrt1() const {
        float32x4_t est0 = this->rsqrt0().fVec;
        return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
    }
    SkNx rsqrt2() const {
        float32x4_t est1 = this->rsqrt1().fVec;
        return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
    }

    SkNx sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrtq_f32(fVec);
    #else
        return *this * this->rsqrt2();
    #endif
    }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { float32x4_t v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    bool allTrue() const {
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
            && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
    }
    bool anyTrue() const {
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
            || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
    }

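    // vbslq_f32 selects bitwise: (mask & t) | (~mask & e).  Each lane of *this
    // is expected to be all-1s or all-0s, as produced by the comparisons above.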
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
    }

    float32x4_t fVec;
};
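
// A hypothetical caller, to illustrate the API above (not part of this header):
//   float dst[4];
//   Sk4f x(1,2,3,4), y(4,3,2,1);
//   (x < y).thenElse(x, y).store(dst);   // per-lane min: dst == {1,2,2,1}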

// It's possible that for our current use cases, representing this as
// half a uint16x8_t might be better than representing it as a uint16x4_t.
// It'd make conversion to Sk4b one step simpler.
template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const uint16x4_t& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
    static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }

    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
        fVec = (uint16x4_t) { a,b,c,d };
    }

    void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }

    SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); }
    SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
        return pun.us[k&3];
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbsl_u16(fVec, t.fVec, e.fVec);
    }

    uint16x4_t fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const uint16x8_t& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
    static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); }

    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
        fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
    }

    void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }

    SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
    SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u16(fVec, t.fVec, e.fVec);
    }

    uint16x8_t fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const uint8x8_t& vec) : fVec(vec) {}

    SkNx() {}
    static SkNx Load(const void* ptr) {
        return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr);
    }
    void store(void* ptr) const {
        return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0);
    }

    // TODO as needed

    uint8x8_t fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const uint8x16_t& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
    static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }

    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
        fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
    }

    void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }

    uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u8(fVec, t.fVec, e.fVec);
    }

    uint8x16_t fVec;
};

#undef SHIFT32
#undef SHIFT16
#undef SHIFT8

template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    return vqmovn_u32(vcvtq_u32_f32(src.fVec));
}

template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    return vcvtq_f32_u32(vmovl_u16(src.fVec));
}

template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
    uint16x4_t _16 = vqmovn_u32(_32);
    return vqmovn_u16(vcombine_u16(_16, _16));
}

template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    uint16x8_t _16 = vmovl_u8(src.fVec);
    uint32x4_t _32 = vmovl_u16(vget_low_u16(_16));
    return vcvtq_f32_u32(_32);
}

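// Sk4f_ToBytes packs four Sk4f into 16 bytes.  Each float converts to a
// uint32, and vuzpq_u8(...).val[0] keeps the even-numbered bytes; doing that
// twice keeps byte 0 of each 32-bit lane (little-endian), i.e. the low byte
// of each converted value.  Values are taken mod 256, so inputs are presumably
// already clamped to [0,255].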
static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    vst1q_u8(bytes, vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
                                      (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
                             vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
                                      (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]);
}

template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return vget_low_u16(vmovl_u8(src.fVec));
}

template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
}

#endif  // SkNx_neon_DEFINED