/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

#include <immintrin.h>

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.

namespace {

template <>
class SkNx<2, float> {
public:
    AI SkNx(const __m128& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(float val) : fVec(_mm_set1_ps(val)) {}
    AI static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    AI SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    AI void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

    AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
        auto lo = _mm_setr_ps(a[0], b[0], c[0], a[1]),
             hi = _mm_setr_ps(b[1], c[1],    0,    0);
        _mm_storeu_ps((float*)dst, lo);
        _mm_storel_pi(((__m64*)dst) + 2, hi);
    }

    AI SkNx operator - () const { return _mm_xor_ps(_mm_set1_ps(-0.0f), fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    AI SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    AI SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    AI SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    AI SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    AI SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
    AI SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
    AI SkNx rsqrt() const { return _mm_rsqrt_ps(fVec); }
    AI SkNx invert() const { return _mm_rcp_ps(fVec); }

    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }

    AI bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    AI bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_ps(e.fVec, t.fVec, fVec);
#else
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
#endif
    }

    __m128 fVec;
};
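
// Note: the comparison operators here (and in the wider types below) return a vector whose
// lanes are all-ones where the comparison holds and all-zeros where it doesn't.  That
// convention is what lets thenElse() fall back to a branchless (mask & t) | (~mask & e)
// select when _mm_blendv_ps / _mm_blendv_epi8 (SSE4.1) aren't available.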

template <>
class SkNx<4, float> {
public:
    AI SkNx(const __m128& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
    AI SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    AI static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }
    AI void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        SkNx lo = SkNx::Load((const float*)ptr+0),
             hi = SkNx::Load((const float*)ptr+4);
        *x = SkNx{lo[0], lo[2], hi[0], hi[2]};
        *y = SkNx{lo[1], lo[3], hi[1], hi[3]};
    }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        __m128 v0 = _mm_loadu_ps(((float*)ptr) +  0),
               v1 = _mm_loadu_ps(((float*)ptr) +  4),
               v2 = _mm_loadu_ps(((float*)ptr) +  8),
               v3 = _mm_loadu_ps(((float*)ptr) + 12);
        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
        *r = v0;
        *g = v1;
        *b = v2;
        *a = v3;
    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128 v0 = r.fVec,
               v1 = g.fVec,
               v2 = b.fVec,
               v3 = a.fVec;
        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
        _mm_storeu_ps(((float*) dst) +  0, v0);
        _mm_storeu_ps(((float*) dst) +  4, v1);
        _mm_storeu_ps(((float*) dst) +  8, v2);
        _mm_storeu_ps(((float*) dst) + 12, v3);
    }

    AI SkNx operator - () const { return _mm_xor_ps(_mm_set1_ps(-0.0f), fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    AI SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    AI SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    AI SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    AI SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    AI SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    AI SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
    AI SkNx floor() const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_floor_ps(fVec);
#else
        // Emulate _mm_floor_ps() with SSE2:
        //   - roundtrip through integers via truncation
        //   - subtract 1 if that's too big (possible for negative values).
        // This restricts the domain of our inputs to a maximum somewhere around 2^31.
        // Seems plenty big.
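        // For example, floor(-1.5f): truncation gives -1.0f, which is greater than -1.5f,
        // so 1.0f is subtracted, yielding -2.0f.  For +1.5f, truncation gives 1.0f, the
        // comparison fails, and nothing is subtracted.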
        __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec));
        __m128 too_big = _mm_cmpgt_ps(roundtrip, fVec);
        return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
#endif
    }

    AI SkNx sqrt() const { return _mm_sqrt_ps (fVec); }
    AI SkNx rsqrt() const { return _mm_rsqrt_ps(fVec); }
    AI SkNx invert() const { return _mm_rcp_ps(fVec); }

    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    AI bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    AI bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_ps(e.fVec, t.fVec, fVec);
#else
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
#endif
    }

    __m128 fVec;
};
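
// SSE2 has no 32-bit low multiply (_mm_mullo_epi32 is SSE4.1), so the fallback below forms
// 64-bit products of the even lanes and of the odd lanes with _mm_mul_epu32, then shuffles
// the low 32 bits of each product back together into one vector.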
AI static __m128i mullo32(__m128i a, __m128i b) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    return _mm_mullo_epi32(a, b);
#else
    __m128i mul20 = _mm_mul_epu32(a, b),
            mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                              _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
#endif
}

template <>
class SkNx<4, int32_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
    AI SkNx operator  < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
    AI SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }

    AI int32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; int32_t is[4]; } pun = {fVec};
        return pun.is[k&3];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
#else
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
#endif
    }

    AI SkNx abs() const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
        return _mm_abs_epi32(fVec);
#else
        SkNx mask = (*this) >> 31;
        return (mask ^ (*this)) - mask;
#endif
    }

    AI static SkNx Min(const SkNx& x, const SkNx& y) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_min_epi32(x.fVec, y.fVec);
#else
        return (x < y).thenElse(x, y);
#endif
    }

    AI static SkNx Max(const SkNx& x, const SkNx& y) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_max_epi32(x.fVec, y.fVec);
#else
        return (x > y).thenElse(x, y);
#endif
    }

    __m128i fVec;
};
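
// The unsigned 32-bit type mirrors SkNx<4, int32_t> but uses a logical shift for operator>>
// and, as noted below, omits ordered comparisons (they'd need the sign-flip trick the 8-bit
// classes use).  mulHi() returns the high 32 bits of each unsigned 32x32 -> 64-bit product.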
template <>
class SkNx<4, uint32_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }

    AI SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
    // operator < and > take a little extra fiddling to make work for unsigned ints.

    AI uint32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint32_t us[4]; } pun = {fVec};
        return pun.us[k&3];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
#else
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
#endif
    }

    AI SkNx mulHi(SkNx m) const {
        SkNx v20{_mm_mul_epu32(m.fVec, fVec)};
        SkNx v31{_mm_mul_epu32(_mm_srli_si128(m.fVec, 4), _mm_srli_si128(fVec, 4))};

        return SkNx{v20[1], v31[1], v20[3], v31[3]};
    }

    __m128i fVec;
};
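
// The 4x uint16_t type only uses the low 64 bits of the register, hence the
// _mm_loadl_epi64 / _mm_storel_epi64 load and store below.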
template <>
class SkNx<4, uint16_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
        : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
                hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
        __m128i even = _mm_unpacklo_epi16(lo, hi),   // r0 r2 g0 g2 b0 b2 a0 a2
                 odd = _mm_unpackhi_epi16(lo, hi);   // r1 r3 ...
        __m128i rg = _mm_unpacklo_epi16(even, odd),  // r0 r1 r2 r3 g0 g1 g2 g3
                ba = _mm_unpackhi_epi16(even, odd);  // b0 b1 ...  a0 a1 ...
        *r = rg;
        *g = _mm_srli_si128(rg, 8);
        *b = ba;
        *a = _mm_srli_si128(ba, 8);
    }
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        // The idea here is to get 4 vectors that are R G B _ _ _ _ _.
        // The second load is at a funny location to make sure we don't read past
        // the bounds of memory.  This is fine, we just need to shift it a little bit.
        const uint8_t* ptr8 = (const uint8_t*) ptr;
        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0));
        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
        __m128i rgb2 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 4*2)), 2*2);
        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);

        __m128i rrggbb01 = _mm_unpacklo_epi16(rgb0, rgb1);
        __m128i rrggbb23 = _mm_unpacklo_epi16(rgb2, rgb3);
        *r = _mm_unpacklo_epi32(rrggbb01, rrggbb23);
        *g = _mm_srli_si128(r->fVec, 4*2);
        *b = _mm_unpackhi_epi32(rrggbb01, rrggbb23);
    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
        __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
        __m128i lo = _mm_unpacklo_epi32(rg, ba);
        __m128i hi = _mm_unpackhi_epi32(rg, ba);
        _mm_storeu_si128(((__m128i*) dst) + 0, lo);
        _mm_storeu_si128(((__m128i*) dst) + 1, hi);
    }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }

    __m128i fVec;
};
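
// The Load4/Load3/Store4 routines in this 8-wide class (like the 4-wide ones above) are
// transposes built from rounds of unpacklo/unpackhi that interleave progressively wider
// lanes (16-bit, then 32-bit or 64-bit) until the color planes are separated or re-interleaved.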
template <>
class SkNx<8, uint16_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
            uint16_t e, uint16_t f, uint16_t g, uint16_t h)
        : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        __m128i _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
                _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
                _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
                _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);

        __m128i _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
                _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
                _46 = _mm_unpacklo_epi16(_45, _67),
                _57 = _mm_unpackhi_epi16(_45, _67);

        __m128i rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
                ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
                rg4567 = _mm_unpacklo_epi16(_46, _57),
                ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        const uint8_t* ptr8 = (const uint8_t*) ptr;
        __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 +  0*2));
        __m128i rgb1 = _mm_srli_si128(rgb0, 3*2);
        __m128i rgb2 = _mm_loadu_si128((const __m128i*) (ptr8 +  6*2));
        __m128i rgb3 = _mm_srli_si128(rgb2, 3*2);
        __m128i rgb4 = _mm_loadu_si128((const __m128i*) (ptr8 + 12*2));
        __m128i rgb5 = _mm_srli_si128(rgb4, 3*2);
        __m128i rgb6 = _mm_srli_si128(_mm_loadu_si128((const __m128i*) (ptr8 + 16*2)), 2*2);
        __m128i rgb7 = _mm_srli_si128(rgb6, 3*2);

        __m128i rgb01 = _mm_unpacklo_epi16(rgb0, rgb1);
        __m128i rgb23 = _mm_unpacklo_epi16(rgb2, rgb3);
        __m128i rgb45 = _mm_unpacklo_epi16(rgb4, rgb5);
        __m128i rgb67 = _mm_unpacklo_epi16(rgb6, rgb7);

        __m128i rg03 = _mm_unpacklo_epi32(rgb01, rgb23);
        __m128i bx03 = _mm_unpackhi_epi32(rgb01, rgb23);
        __m128i rg47 = _mm_unpacklo_epi32(rgb45, rgb67);
        __m128i bx47 = _mm_unpackhi_epi32(rgb45, rgb67);

        *r = _mm_unpacklo_epi64(rg03, rg47);
        *g = _mm_unpackhi_epi64(rg03, rg47);
        *b = _mm_unpacklo_epi64(bx03, bx47);
    }
    AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec),  // r0 g0 r1 g1 r2 g2 r3 g3
                rg4567 = _mm_unpackhi_epi16(r.fVec, g.fVec),  // r4 g4 r5 g5 r6 g6 r7 g7
                ba0123 = _mm_unpacklo_epi16(b.fVec, a.fVec),
                ba4567 = _mm_unpackhi_epi16(b.fVec, a.fVec);

        _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
        _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
        _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
        _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
    }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    AI SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
        // (Byte-wise add/sub is safe here: the low byte of 0x8000 is zero, so no carries
        // or borrows cross into the next lane.)
        const uint16_t top = 0x8000;  // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    AI SkNx mulHi(const SkNx& m) const {
        return _mm_mulhi_epu16(fVec, m.fVec);
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    __m128i fVec;
};
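
// The 4-byte vector keeps its data in the low 32 bits of the register, so Load/store are a
// single 32-bit scalar move (_mm_cvtsi32_si128 / _mm_cvtsi128_si32).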
template <>
class SkNx<4, uint8_t> {
public:
    AI SkNx() {}
    AI SkNx(const __m128i& vec) : fVec(vec) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
        : fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {}

    AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&3];
    }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<8, uint8_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, 0,0,0,0, 0,0,0,0)) {}

    AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    AI SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    AI SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&7];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    AI SkNx(const __m128i& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    AI static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
            uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    AI SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    AI SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};
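
// SkNx_cast specializations: widening casts zero-extend (unpack with zero, or shuffle bytes
// into place on SSSE3); narrowing casts pack, generally assuming the values already fit the
// destination type.  A minimal usage sketch (px is a hypothetical uint8_t[4]):
//
//     Sk4b in = Sk4b::Load(px);                     // load 4 bytes
//     Sk4f f  = SkNx_cast<float>(in) * Sk4f(0.5f);  // widen to float, scale
//     SkNx_cast<uint8_t>(f).store(px);              // truncate and pack back to 4 bytes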
template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
    return _mm_cvtepi32_ps(src.fVec);
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
    return SkNx_cast<float>(Sk4i::Load(&src));
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
    return _mm_cvttps_epi32(src.fVec);
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
#if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    // TODO: This seems to be causing code generation problems.  Investigate?
    return _mm_packus_epi32(src.fVec, src.fVec);
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
    const int _ = ~0;
    return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.
    __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
    return _mm_packs_epi32(x,x);
#endif
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src));
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return _mm_packus_epi16(_16, _16);
#endif
}

template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
    return _mm_unpacklo_epi16(_16, _mm_setzero_si128());
#endif
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
    return SkNx_cast<uint32_t>(src).fVec;
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    return _mm_cvtepi32_ps(SkNx_cast<int32_t>(src).fVec);
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
    Sk4i lo, hi;
    SkNx_split(src, &lo, &hi);

    auto t = _mm_packs_epi32(lo.fVec, hi.fVec);
    return _mm_packus_epi16(t, t);
}

template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
    Sk8f ab, cd;
    SkNx_split(src, &ab, &cd);

    Sk4f a,b,c,d;
    SkNx_split(ab, &a, &b);
    SkNx_split(cd, &c, &d);

    return _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                             _mm_cvttps_epi32(b.fVec)),
                            _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                             _mm_cvttps_epi32(d.fVec)));
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
    return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
    return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
    return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
    return src.fVec;
}

AI static Sk4i Sk4f_round(const Sk4f& x) {
    return _mm_cvtps_epi32(x.fVec);
}

}  // namespace

#endif  // SkNx_sse_DEFINED