/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)   \
  inline __device__ int __FnName(int __in, int __offset,                     \
                                 int __width = warpSize) {                   \
    return __IntIntrinsic(__in, __offset,                                    \
                          ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                           \
  inline __device__ float __FnName(float __in, int __offset,                 \
                                   int __width = warpSize) {                 \
    return __FloatIntrinsic(__in, __offset,                                  \
                            ((warpSize - __width) << 8) | (__Mask));         \
  }                                                                           \
  inline __device__ unsigned int __FnName(unsigned int __in, int __offset,   \
                                          int __width = warpSize) {          \
    return static_cast<unsigned int>(                                        \
        ::__FnName(static_cast<int>(__in), __offset, __width));              \
  }                                                                           \
  inline __device__ long long __FnName(long long __in, int __offset,         \
                                       int __width = warpSize) {             \
    struct __Bits {                                                           \
      int __a, __b;                                                           \
    };                                                                        \
    _Static_assert(sizeof(__in) == sizeof(__Bits));                          \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                       \
    __Bits __tmp;                                                             \
    memcpy(&__tmp, &__in, sizeof(__in));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                    \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                    \
    long long __out;                                                          \
    memcpy(&__out, &__tmp, sizeof(__tmp));                                   \
    return __out;                                                             \
  }                                                                           \
  inline __device__ unsigned long long __FnName(                             \
      unsigned long long __in, int __offset, int __width = warpSize) {       \
    return static_cast<unsigned long long>(                                  \
        ::__FnName(static_cast<long long>(__in), __offset, __width));        \
  }                                                                           \
  inline __device__ double __FnName(double __in, int __offset,               \
                                    int __width = warpSize) {                \
    long long __tmp;                                                          \
    _Static_assert(sizeof(__tmp) == sizeof(__in));                           \
    memcpy(&__tmp, &__in, sizeof(__in));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                            \
    double __out;                                                             \
    memcpy(&__out, &__tmp, sizeof(__out));                                   \
    return __out;                                                             \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
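// Usage sketch (illustrative only; `broadcast0` is an arbitrary example name):
// with the default width, __shfl(val, i) returns lane i's value in every lane
// of the warp, so broadcasting lane 0's value is simply
//
//   __device__ int broadcast0(int val) { return __shfl(val, 0); }
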
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);

#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
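  // A sketch of that invariant as a compile-time check (both types are
  // expected to have 2-byte alignment, so this is expected to hold):
  _Static_assert(__alignof(c2) <= __alignof(char2), "");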
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
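// Usage sketch for the __ldg overloads above (illustrative only; `in` and `i`
// are arbitrary names): __ldg loads through the read-only data cache, which
// helps for data that is never written during the kernel's lifetime, e.g.
//
//   __device__ float4 read_only_load(const float4 *in, int i) {
//     return __ldg(&in[i]);
//   }
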
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)