/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkJumper_vectors_DEFINED
#define SkJumper_vectors_DEFINED

#include "SkJumper.h"
#include "SkJumper_misc.h"

// This file contains vector types that SkJumper_stages.cpp uses to define stages.

// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).

#if !defined(JUMPER)
    // This path should lead to portable code that can be compiled directly into Skia.
    // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
    #include <math.h>

    using F   = float   ;
    using I32 =  int32_t;
    using U64 = uint64_t;
    using U32 = uint32_t;
    using U16 = uint16_t;
    using U8  = uint8_t ;

    SI F   mad(F f, F m, F a)   { return f*m+a; }
    SI F   min(F a, F b)        { return fminf(a,b); }
    SI F   max(F a, F b)        { return fmaxf(a,b); }
    SI F   abs_  (F v)          { return fabsf(v); }
    SI F   floor_(F v)          { return floorf(v); }
    SI F   rcp   (F v)          { return 1.0f / v; }
    SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
    SI F   sqrt_ (F v)          { return sqrtf(v); }
    SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
    SI U16 pack(U32 v)          { return (U16)v; }
    SI U8  pack(U16 v)          { return  (U8)v; }

    SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    template <typename T>
    SI T gather(const T* p, U32 ix) { return p[ix]; }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

#elif defined(__aarch64__)
    #include <arm_neon.h>

    // Since we know we're using Clang, we can use its vector extensions.
    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
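    // rcp() and rsqrt() refine the hardware estimate with one Newton-Raphson step
    // (vrecpsq/vrsqrtsq) rather than paying for a full divide or sqrt.
    //
    // In the load/store routines here (and in every backend), tail == 0 means a full
    // vector's worth of pixels; otherwise tail is how many pixels remain (1..N-1).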
    SI F   mad(F f, F m, F a)   { return vfmaq_f32(a,f,m); }
    SI F   min(F a, F b)        { return vminq_f32(a,b);   }
    SI F   max(F a, F b)        { return vmaxq_f32(a,b);   }
    SI F   abs_  (F v)          { return vabsq_f32(v);     }
    SI F   floor_(F v)          { return vrndmq_f32(v);    }
    SI F   rcp   (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
    SI F   rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
    SI F   sqrt_ (F v)          { return vsqrtq_f32(v); }
    SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
    SI U16 pack(U32 v)          { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)          { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        if (__builtin_expect(tail,0)) {
            if ( true   ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
            if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
            if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
        } else {
            rgb = vld3_u16(ptr);
        }
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if ( true   ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4_u16(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        if (__builtin_expect(tail,0)) {
            if ( true   ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
        }
    }
    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if ( true   ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4q_f32(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        if (__builtin_expect(tail,0)) {
            if ( true   ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
        }
    }

#elif defined(__arm__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
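    // Each d-register holds one 2-lane vector, so this path works on 2 pixels at a time.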
    template <typename T> using V = T __attribute__((ext_vector_type(2)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)  { return vfma_f32(a,f,m); }
    SI F   min(F a, F b)       { return vmin_f32(a,b);   }
    SI F   max(F a, F b)       { return vmax_f32(a,b);   }
    SI F   abs_ (F v)          { return vabs_f32(v);     }
    SI F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
    SI F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
    SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
    SI U16 pack(U32 v)         { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)         { return __builtin_convertvector(v,  U8); }

    SI F sqrt_(F v) {
        auto e = vrsqrte_f32(v);  // Estimate and two refinement steps for e = rsqrt(v).
        e *= vrsqrts_f32(v,e*e);
        e *= vrsqrts_f32(v,e*e);
        return v*e;               // sqrt(v) == v*rsqrt(v).
    }

    SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

    SI F floor_(F v) {
        F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        rgb = vld3_lane_u16(ptr + 0, rgb, 0);
        if (__builtin_expect(tail, 0)) {
            rgb.val[0] = vset_lane_u16(0, rgb.val[0], 1);
            rgb.val[1] = vset_lane_u16(0, rgb.val[1], 1);
            rgb.val[2] = vset_lane_u16(0, rgb.val[2], 1);
        } else {
            rgb = vld3_lane_u16(ptr + 3, rgb, 1);
        }
        *r = unaligned_load<U16>(rgb.val+0);
        *g = unaligned_load<U16>(rgb.val+1);
        *b = unaligned_load<U16>(rgb.val+2);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        rgba = vld4_lane_u16(ptr + 0, rgba, 0);
        if (__builtin_expect(tail, 0)) {
            rgba.val[0] = vset_lane_u16(0, rgba.val[0], 1);
            rgba.val[1] = vset_lane_u16(0, rgba.val[1], 1);
            rgba.val[2] = vset_lane_u16(0, rgba.val[2], 1);
            rgba.val[3] = vset_lane_u16(0, rgba.val[3], 1);
        } else {
            rgba = vld4_lane_u16(ptr + 4, rgba, 1);
        }
        *r = unaligned_load<U16>(rgba.val+0);
        *g = unaligned_load<U16>(rgba.val+1);
        *b = unaligned_load<U16>(rgba.val+2);
        *a = unaligned_load<U16>(rgba.val+3);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        uint16x4x4_t rgba = {{
            widen_cast<uint16x4_t>(r),
            widen_cast<uint16x4_t>(g),
            widen_cast<uint16x4_t>(b),
            widen_cast<uint16x4_t>(a),
        }};
        vst4_lane_u16(ptr + 0, rgba, 0);
        if (__builtin_expect(tail == 0, true)) {
            vst4_lane_u16(ptr + 4, rgba, 1);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x2x4_t rgba;
        if (__builtin_expect(tail, 0)) {
            rgba = vld4_dup_f32(ptr);
        } else {
            rgba = vld4_f32(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        if (__builtin_expect(tail, 0)) {
            vst4_lane_f32(ptr, (float32x2x4_t{{r,g,b,a}}), 0);
        } else {
            vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
        }
    }

#elif defined(__AVX__)
    #include <immintrin.h>

    // These are __m256 and __m256i, but friendlier and strongly-typed.
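    // (Clang lets same-sized vector types convert, so the intrinsics below can be
    //  called directly on F, I32, U32, and friends.)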
    template <typename T> using V = T __attribute__((ext_vector_type(8)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F mad(F f, F m, F a)  {
    #if defined(__FMA__)
        return _mm256_fmadd_ps(f,m,a);
    #else
        return f*m+a;
    #endif
    }

    SI F   min(F a, F b)        { return _mm256_min_ps(a,b);    }
    SI F   max(F a, F b)        { return _mm256_max_ps(a,b);    }
    SI F   abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
    SI F   floor_(F v)          { return _mm256_floor_ps(v);    }
    SI F   rcp   (F v)          { return _mm256_rcp_ps  (v);    }
    SI F   rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
    SI F   sqrt_ (F v)          { return _mm256_sqrt_ps (v);    }
    SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
        return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
                                _mm256_extractf128_si256(v, 1));
    }
    SI U8 pack(U16 v) {
        auto r = _mm_packus_epi16(v,v);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    }
    #if defined(__AVX2__)
        SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
        SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
        SI U64 gather(const uint64_t* p, U32 ix) {
            __m256i parts[] = {
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
            };
            return bit_cast<U64>(parts);
        }
    #endif

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0,_1,_2,_3,_4,_5,_6,_7;
        if (__builtin_expect(tail,0)) {
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if (tail > 0) { _0 = load_rgb(ptr +  0); }
            if (tail > 1) { _1 = load_rgb(ptr +  3); }
            if (tail > 2) { _2 = load_rgb(ptr +  6); }
            if (tail > 3) { _3 = load_rgb(ptr +  9); }
            if (tail > 4) { _4 = load_rgb(ptr + 12); }
            if (tail > 5) { _5 = load_rgb(ptr + 15); }
            if (tail > 6) { _6 = load_rgb(ptr + 18); }
        } else {
            // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
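            // (8 pixels * 3 channels * 2 bytes = 48 bytes.  A 16-byte load of pixels 6+7
            //  at ptr+18 would read 4 bytes past that, so we load at ptr+16 and shift the
            //  4 extra bytes away.)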
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr +  0))    ;
            auto _23 =                _mm_loadu_si128((const __m128i*)(ptr +  6))    ;
            auto _45 =                _mm_loadu_si128((const __m128i*)(ptr + 12))    ;
            auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
            _0 = _01; _1 = _mm_srli_si128(_01, 6);
            _2 = _23; _3 = _mm_srli_si128(_23, 6);
            _4 = _45; _5 = _mm_srli_si128(_45, 6);
            _6 = _67; _7 = _mm_srli_si128(_67, 6);
        }

        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3),
             _46 = _mm_unpacklo_epi16(_4, _6),
             _57 = _mm_unpacklo_epi16(_5, _7);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             bx0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 xx xx xx xx
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             bx4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(bx0123, bx4567);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23, _45, _67;
        if (__builtin_expect(tail,0)) {
            auto src = (const double*)ptr;
            _01 = _23 = _45 = _67 = _mm_setzero_si128();
            if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
            if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
            if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
            if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
            if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
            _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
            _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
             _46 = _mm_unpacklo_epi16(_45, _67),
             _57 = _mm_unpackhi_epi16(_45, _67);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
             rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
             ba0123 = _mm_unpacklo_epi16(b, a),
             ba4567 = _mm_unpackhi_epi16(b, a);

        auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
             _23 = _mm_unpackhi_epi32(rg0123, ba0123),
             _45 = _mm_unpacklo_epi32(rg4567, ba4567),
             _67 = _mm_unpackhi_epi32(rg4567, ba4567);

        if (__builtin_expect(tail,0)) {
            auto dst = (double*)ptr;
            if (tail > 0) { _mm_storel_pd(dst+0, _01); }
            if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
            if (tail > 2) { _mm_storel_pd(dst+2, _23); }
            if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
            if (tail > 4) { _mm_storel_pd(dst+4, _45); }
            if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
            if (tail > 6) { _mm_storel_pd(dst+6, _67); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _01);
            _mm_storeu_si128((__m128i*)ptr + 1, _23);
            _mm_storeu_si128((__m128i*)ptr + 2, _45);
            _mm_storeu_si128((__m128i*)ptr + 3, _67);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _04, _15, _26, _37;

        switch (tail) {
            case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
            case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
            case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
            case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
            case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
            case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
            case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
            case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
        }

        F rg0145 = _mm256_unpacklo_ps(_04,_15),  // r0 r1 g0 g1 | r4 r5 g4 g5
          ba0145 = _mm256_unpackhi_ps(_04,_15),
          rg2367 = _mm256_unpacklo_ps(_26,_37),
          ba2367 = _mm256_unpackhi_ps(_26,_37);

        *r = _mm256_unpacklo_pd(rg0145, rg2367);
        *g = _mm256_unpackhi_pd(rg0145, rg2367);
        *b = _mm256_unpacklo_pd(ba0145, ba2367);
        *a = _mm256_unpackhi_pd(ba0145, ba2367);
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
          rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
          ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
          ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...

        F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
          _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
          _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
          _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...

        if (__builtin_expect(tail, 0)) {
            if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
            if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
            if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
            if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
            if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
            if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
            if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
        } else {
            F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
              _23 = _mm256_permute2f128_ps(_26, _37, 32),
              _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
              _67 = _mm256_permute2f128_ps(_26, _37, 49);
            _mm256_storeu_ps(ptr+ 0, _01);
            _mm256_storeu_ps(ptr+ 8, _23);
            _mm256_storeu_ps(ptr+16, _45);
            _mm256_storeu_ps(ptr+24, _67);
        }
    }

#elif defined(__SSE2__)
    #include <immintrin.h>

    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)  { return f*m+a;              }
    SI F   min(F a, F b)       { return _mm_min_ps(a,b);    }
    SI F   max(F a, F b)       { return _mm_max_ps(a,b);    }
    SI F   abs_(F v)           { return _mm_and_ps(v, 0-v); }
    SI F   rcp  (F v)          { return _mm_rcp_ps  (v);    }
    SI F   rsqrt(F v)          { return _mm_rsqrt_ps(v);    }
    SI F   sqrt_(F v)          { return _mm_sqrt_ps (v);    }
    SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
    #if defined(__SSE4_1__)
        auto p = _mm_packus_epi32(v,v);
    #else
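        // SSE2 has no unsigned 32->16 saturating pack (_mm_packus_epi32 is SSE4.1), so fake one.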
        // Sign extend so that _mm_packs_epi32() does the pack we want.
        auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
        p = _mm_packs_epi32(p,p);
    #endif
        return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
    }
    SI U8 pack(U16 v) {
        auto r = widen_cast<__m128i>(v);
        r = _mm_packus_epi16(r,r);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) {
        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
    }

    SI F floor_(F v) {
    #if defined(__SSE4_1__)
        return _mm_floor_ps(v);
    #else
        F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    #endif
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0, _1, _2, _3;
        if (__builtin_expect(tail,0)) {
            _1 = _2 = _3 = _mm_setzero_si128();
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if ( true   ) { _0 = load_rgb(ptr + 0); }
            if (tail > 1) { _1 = load_rgb(ptr + 3); }
            if (tail > 2) { _2 = load_rgb(ptr + 6); }
        } else {
            // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr + 0))    ,
                 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);

            // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
            _0 = _01;
            _1 = _mm_srli_si128(_01, 6);
            _2 = _23;
            _3 = _mm_srli_si128(_23, 6);
        }

        // De-interlace to R,G,B.
        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3);  // r1 r3 g1 g3 b1 b3 xx xx

        auto R = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             G = _mm_srli_si128(R, 8),
             B = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 xx xx xx xx

        *r = unaligned_load<U16>(&R);
        *g = unaligned_load<U16>(&G);
        *b = unaligned_load<U16>(&B);
    }

    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23;
        if (__builtin_expect(tail,0)) {
            _01 = _23 = _mm_setzero_si128();
            auto src = (const double*)ptr;
            if ( true   ) { _01 = _mm_loadl_pd(_01, src + 0); }  // r0 g0 b0 a0 00 00 00 00
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); }  // r0 g0 b0 a0 r1 g1 b1 a1
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); }  // r2 g2 b2 a2 00 00 00 00
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);  // r0 g0 b0 a0 r1 g1 b1 a1
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);  // r2 g2 b2 a2 r3 g3 b3 a3
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3

        auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3

        *r = unaligned_load<U16>((uint16_t*)&rg + 0);
        *g = unaligned_load<U16>((uint16_t*)&rg + 4);
        *b = unaligned_load<U16>((uint16_t*)&ba + 0);
        *a = unaligned_load<U16>((uint16_t*)&ba + 4);
    }

    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
             ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));

        if (__builtin_expect(tail, 0)) {
            auto dst = (double*)ptr;
            if ( true   ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
            _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _0, _1, _2, _3;
        if (__builtin_expect(tail, 0)) {
            _1 = _2 = _3 = _mm_setzero_si128();
            if ( true   ) { _0 = _mm_loadu_ps(ptr + 0); }
            if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
            if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
        } else {
            _0 = _mm_loadu_ps(ptr + 0);
            _1 = _mm_loadu_ps(ptr + 4);
            _2 = _mm_loadu_ps(ptr + 8);
            _3 = _mm_loadu_ps(ptr +12);
        }
        _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
        *r = _0;
        *g = _1;
        *b = _2;
        *a = _3;
    }

    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        _MM_TRANSPOSE4_PS(r,g,b,a);
        if (__builtin_expect(tail, 0)) {
            if ( true   ) { _mm_storeu_ps(ptr + 0, r); }
            if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
            if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
        } else {
            _mm_storeu_ps(ptr + 0, r);
            _mm_storeu_ps(ptr + 4, g);
            _mm_storeu_ps(ptr + 8, b);
            _mm_storeu_ps(ptr +12, a);
        }
    }
#endif

// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#else
    SI F   cast  (U32 v) { return   (F)v; }
    SI U32 trunc_(F   v) { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#endif

template <typename V>
SI V if_then_else(I32 c, V t, V e) {
    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
}

SI U16 bswap(U16 x) {
#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
    // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
    // when generating code for SSE2 and SSE4.1.  We'll do it manually...
    auto v = widen_cast<__m128i>(x);
    v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
    return unaligned_load<U16>(&v);
#else
    return (x<<8) | (x>>8);
#endif
}

SI F fract(F v) { return v - floor_(v); }

// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
SI F approx_log2(F x) {
    // e - 127 is a fair approximation of log2(x) in its own right...
    F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));

    // ... but using the mantissa to refine its error is _much_ better.
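    // (m keeps x's mantissa bits but pins the exponent to 0.5f's, so m is in [0.5, 1.0).)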
    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
    return e
         - 124.225514990f
         -   1.498030302f * m
         -   1.725879990f / (0.3520887068f + m);
}
SI F approx_pow2(F x) {
    F f = fract(x);
    return bit_cast<F>(round(1.0f * (1<<23),
                             x + 121.274057500f
                               -   1.490129070f * f
                               +  27.728023300f / (4.84252568f - f)));
}

SI F approx_powf(F x, F y) {
    return approx_pow2(approx_log2(x) * y);
}

SI F from_half(U16 h) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f32_f16(h);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<uint16x4_t>(h);
    return vget_low_f32(vcvt_f32_f16(v));

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtph_ps(h);

#else
    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
    U32 sem = expand(h),
        s   = sem & 0x8000,
        em  = sem ^ s;

    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x0400;  // I32 comparison is often quicker, and always safe here.
    return if_then_else(denorm, F(0)
                              , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
#endif
}

SI U16 to_half(F f) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f16_f32(f);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<float32x4_t>(f);
    uint16x4_t h = vcvt_f16_f32(v);
    return unaligned_load<U16>(&h);

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#else
    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
    U32 sem = bit_cast<U32>(f),
        s   = sem & 0x80000000,
        em  = sem ^ s;

    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
    return pack(if_then_else(denorm, U32(0)
                                   , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}

#endif//SkJumper_vectors_DEFINED