// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP64 %s

// Clang IRGen test for the NVPTX builtins: each __nvvm_* builtin call below
// must lower to the LLVM NVVM intrinsic named in the adjacent CHECK line.
// The file is compiled twice — 32-bit (LP32) and 64-bit (LP64) NVPTX — so
// `long`-typed cases carry target-specific LP32/LP64 check lines.
// NOTE: comments beginning with CHECK/LP32/LP64 are FileCheck directives and
// must stay in emission order next to the calls they verify; plain comments
// like this one are ignored by FileCheck.

// Minimal stand-ins for the CUDA execution-space qualifiers so the test does
// not depend on a CUDA SDK installation.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

// %tid special register (thread index), all four vector components.
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  // The sum exists only to keep all four reads live in the emitted IR.
  return x + y + z + w;

}

// %ntid special register (CTA dimensions).
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

// %ctaid special register (CTA index within the grid).
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

// %nctaid special register (grid dimensions).
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

// Scalar id special registers: lane, warp, SM, and grid ids plus their counts.
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  return a + b + c + d + e + f;

}

// %lanemask_* special registers (per-lane comparison masks).
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

// %clock (i32) and %clock64 (i64) cycle counters.
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  return a + b;
}

// %pm0-%pm3 performance-monitor special registers.
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

// bar.sync with an immediate barrier id.
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}


// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
// CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

// CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
// CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
// CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
// CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

// CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

// Module-scope device/shared globals used as atomic targets below, covering
// int, long (target-dependent width), and long long element types.
__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
// Note: the lp/llp parameters are intentionally unused; the long/long long
// cases operate on the module-scope globals &dl and &sll instead.
__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
                          long long *llp, long long ll) {
// CHECK: atomicrmw add
  __nvvm_atom_add_gen_i(ip, i);
// CHECK: atomicrmw add
  __nvvm_atom_add_gen_l(&dl, l);
// CHECK: atomicrmw add
  __nvvm_atom_add_gen_ll(&sll, ll);

// CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_i(ip, i);
// CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_l(&dl, l);
// CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_ll(&sll, ll);

// CHECK: atomicrmw and
  __nvvm_atom_and_gen_i(ip, i);
// CHECK: atomicrmw and
  __nvvm_atom_and_gen_l(&dl, l);
// CHECK: atomicrmw and
  __nvvm_atom_and_gen_ll(&sll, ll);

// CHECK: atomicrmw or
  __nvvm_atom_or_gen_i(ip, i);
// CHECK: atomicrmw or
  __nvvm_atom_or_gen_l(&dl, l);
// CHECK: atomicrmw or
  __nvvm_atom_or_gen_ll(&sll, ll);

// CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_i(ip, i);
// CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_l(&dl, l);
// CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_ll(&sll, ll);

// CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_i(ip, i);
// CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_l(&dl, l);
// CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_ll(&sll, ll);

// CHECK: atomicrmw max i32*
  __nvvm_atom_max_gen_i(ip, i);
// CHECK: atomicrmw umax i32*
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
// CHECK: atomicrmw max
  __nvvm_atom_max_gen_l(&dl, l);
// CHECK: atomicrmw umax
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
// CHECK: atomicrmw max i64*
  __nvvm_atom_max_gen_ll(&sll, ll);
// CHECK: atomicrmw umax i64*
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

// CHECK: atomicrmw min i32*
  __nvvm_atom_min_gen_i(ip, i);
// CHECK: atomicrmw umin i32*
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
// CHECK: atomicrmw min
  __nvvm_atom_min_gen_l(&dl, l);
// CHECK: atomicrmw umin
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
// CHECK: atomicrmw min i64*
  __nvvm_atom_min_gen_ll(&sll, ll);
// CHECK: atomicrmw umin i64*
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

// The CAS builtins return the old value, hence the extractvalue of field 0
// from cmpxchg's {value, success} result pair.
// CHECK: cmpxchg
// CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
// CHECK: cmpxchg
// CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
// CHECK: cmpxchg
// CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

// Float add and unsigned inc/dec have no atomicrmw form here; they lower to
// target intrinsics instead.
// CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
  __nvvm_atom_add_gen_f(fp, f);

// CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
  __nvvm_atom_inc_gen_ui(uip, ui);

// CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
  __nvvm_atom_dec_gen_ui(uip, ui);

// CHECK: ret
}

// __nvvm_ldg_* builtins: read-only global loads.  The final i32 argument of
// each intrinsic call is the alignment Clang computed for the loaded type.
// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // `long` is 32-bit on LP32 and 64-bit on LP64, so the lowering differs.
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types.  However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."

  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}