1 // REQUIRES: nvptx-registered-target 2 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s 3 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s 4 5 #define __device__ __attribute__((device)) 6 #define __global__ __attribute__((global)) 7 #define __shared__ __attribute__((shared)) 8 #define __constant__ __attribute__((constant)) 9 10 __device__ int read_tid() { 11 12 // CHECK: call i32 @llvm.ptx.read.tid.x() 13 // CHECK: call i32 @llvm.ptx.read.tid.y() 14 // CHECK: call i32 @llvm.ptx.read.tid.z() 15 // CHECK: call i32 @llvm.ptx.read.tid.w() 16 17 int x = __builtin_ptx_read_tid_x(); 18 int y = __builtin_ptx_read_tid_y(); 19 int z = __builtin_ptx_read_tid_z(); 20 int w = __builtin_ptx_read_tid_w(); 21 22 return x + y + z + w; 23 24 } 25 26 __device__ int read_ntid() { 27 28 // CHECK: call i32 @llvm.ptx.read.ntid.x() 29 // CHECK: call i32 @llvm.ptx.read.ntid.y() 30 // CHECK: call i32 @llvm.ptx.read.ntid.z() 31 // CHECK: call i32 @llvm.ptx.read.ntid.w() 32 33 int x = __builtin_ptx_read_ntid_x(); 34 int y = __builtin_ptx_read_ntid_y(); 35 int z = __builtin_ptx_read_ntid_z(); 36 int w = __builtin_ptx_read_ntid_w(); 37 38 return x + y + z + w; 39 40 } 41 42 __device__ int read_ctaid() { 43 44 // CHECK: call i32 @llvm.ptx.read.ctaid.x() 45 // CHECK: call i32 @llvm.ptx.read.ctaid.y() 46 // CHECK: call i32 @llvm.ptx.read.ctaid.z() 47 // CHECK: call i32 @llvm.ptx.read.ctaid.w() 48 49 int x = __builtin_ptx_read_ctaid_x(); 50 int y = __builtin_ptx_read_ctaid_y(); 51 int z = __builtin_ptx_read_ctaid_z(); 52 int w = __builtin_ptx_read_ctaid_w(); 53 54 return x + y + z + w; 55 56 } 57 58 __device__ int read_nctaid() { 59 60 // CHECK: call i32 @llvm.ptx.read.nctaid.x() 61 // CHECK: call i32 @llvm.ptx.read.nctaid.y() 62 // CHECK: call i32 @llvm.ptx.read.nctaid.z() 63 // CHECK: call i32 @llvm.ptx.read.nctaid.w() 64 65 int x = __builtin_ptx_read_nctaid_x(); 66 int y = __builtin_ptx_read_nctaid_y(); 67 int z = __builtin_ptx_read_nctaid_z(); 68 int w = __builtin_ptx_read_nctaid_w(); 69 70 return x + y + z + w; 71 72 } 73 74 __device__ int read_ids() { 75 76 // CHECK: call i32 @llvm.ptx.read.laneid() 77 // CHECK: call i32 @llvm.ptx.read.warpid() 78 // CHECK: call i32 @llvm.ptx.read.nwarpid() 79 // CHECK: call i32 @llvm.ptx.read.smid() 80 // CHECK: call i32 @llvm.ptx.read.nsmid() 81 // CHECK: call i32 @llvm.ptx.read.gridid() 82 83 int a = __builtin_ptx_read_laneid(); 84 int b = __builtin_ptx_read_warpid(); 85 int c = __builtin_ptx_read_nwarpid(); 86 int d = __builtin_ptx_read_smid(); 87 int e = __builtin_ptx_read_nsmid(); 88 int f = __builtin_ptx_read_gridid(); 89 90 return a + b + c + d + e + f; 91 92 } 93 94 __device__ int read_lanemasks() { 95 96 // CHECK: call i32 @llvm.ptx.read.lanemask.eq() 97 // CHECK: call i32 @llvm.ptx.read.lanemask.le() 98 // CHECK: call i32 @llvm.ptx.read.lanemask.lt() 99 // CHECK: call i32 @llvm.ptx.read.lanemask.ge() 100 // CHECK: call i32 @llvm.ptx.read.lanemask.gt() 101 102 int a = __builtin_ptx_read_lanemask_eq(); 103 int b = __builtin_ptx_read_lanemask_le(); 104 int c = __builtin_ptx_read_lanemask_lt(); 105 int d = __builtin_ptx_read_lanemask_ge(); 106 int e = __builtin_ptx_read_lanemask_gt(); 107 108 return a + b + c + d + e; 109 110 } 111 112 __device__ long long read_clocks() { 113 114 // CHECK: call i32 @llvm.ptx.read.clock() 115 // CHECK: call i64 @llvm.ptx.read.clock64() 116 117 int a = __builtin_ptx_read_clock(); 118 long long b = __builtin_ptx_read_clock64(); 119 120 return a + b; 121 } 122 123 __device__ int read_pms() { 124 125 // CHECK: call i32 @llvm.ptx.read.pm0() 126 // CHECK: call i32 @llvm.ptx.read.pm1() 127 // CHECK: call i32 @llvm.ptx.read.pm2() 128 // CHECK: call i32 @llvm.ptx.read.pm3() 129 130 int a = __builtin_ptx_read_pm0(); 131 int b = __builtin_ptx_read_pm1(); 132 int c = __builtin_ptx_read_pm2(); 133 int d = __builtin_ptx_read_pm3(); 134 135 return a + b + c + d; 136 137 } 138 139 __device__ void sync() { 140 141 // CHECK: call void @llvm.ptx.bar.sync(i32 0) 142 143 __builtin_ptx_bar_sync(0); 144 145 } 146 147 148 // NVVM intrinsics 149 150 // The idea is not to test all intrinsics, just that Clang is recognizing the 151 // builtins defined in BuiltinsNVPTX.def 152 __device__ void nvvm_math(float f1, float f2, double d1, double d2) { 153 // CHECK: call float @llvm.nvvm.fmax.f 154 float t1 = __nvvm_fmax_f(f1, f2); 155 // CHECK: call float @llvm.nvvm.fmin.f 156 float t2 = __nvvm_fmin_f(f1, f2); 157 // CHECK: call float @llvm.nvvm.sqrt.rn.f 158 float t3 = __nvvm_sqrt_rn_f(f1); 159 // CHECK: call float @llvm.nvvm.rcp.rn.f 160 float t4 = __nvvm_rcp_rn_f(f2); 161 // CHECK: call float @llvm.nvvm.add.rn.f 162 float t5 = __nvvm_add_rn_f(f1, f2); 163 164 // CHECK: call double @llvm.nvvm.fmax.d 165 double td1 = __nvvm_fmax_d(d1, d2); 166 // CHECK: call double @llvm.nvvm.fmin.d 167 double td2 = __nvvm_fmin_d(d1, d2); 168 // CHECK: call double @llvm.nvvm.sqrt.rn.d 169 double td3 = __nvvm_sqrt_rn_d(d1); 170 // CHECK: call double @llvm.nvvm.rcp.rn.d 171 double td4 = __nvvm_rcp_rn_d(d2); 172 173 // CHECK: call void @llvm.nvvm.membar.cta() 174 __nvvm_membar_cta(); 175 // CHECK: call void @llvm.nvvm.membar.gl() 176 __nvvm_membar_gl(); 177 // CHECK: call void @llvm.nvvm.membar.sys() 178 __nvvm_membar_sys(); 179 // CHECK: call void @llvm.nvvm.barrier0() 180 __nvvm_bar0(); 181 } 182 183 __device__ int di; 184 __shared__ int si; 185 __device__ long dl; 186 __shared__ long sl; 187 __device__ long long dll; 188 __shared__ long long sll; 189 190 // Check for atomic intrinsics 191 // CHECK-LABEL: nvvm_atom 192 __device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l, 193 long long *llp, long long ll) { 194 // CHECK: atomicrmw add 195 __nvvm_atom_add_gen_i(ip, i); 196 // CHECK: atomicrmw add 197 __nvvm_atom_add_gen_l(&dl, l); 198 // CHECK: atomicrmw add 199 __nvvm_atom_add_gen_ll(&sll, ll); 200 201 // CHECK: atomicrmw sub 202 __nvvm_atom_sub_gen_i(ip, i); 203 // CHECK: atomicrmw sub 204 __nvvm_atom_sub_gen_l(&dl, l); 205 // CHECK: atomicrmw sub 206 __nvvm_atom_sub_gen_ll(&sll, ll); 207 208 // CHECK: atomicrmw and 209 __nvvm_atom_and_gen_i(ip, i); 210 // CHECK: atomicrmw and 211 __nvvm_atom_and_gen_l(&dl, l); 212 // CHECK: atomicrmw and 213 __nvvm_atom_and_gen_ll(&sll, ll); 214 215 // CHECK: atomicrmw or 216 __nvvm_atom_or_gen_i(ip, i); 217 // CHECK: atomicrmw or 218 __nvvm_atom_or_gen_l(&dl, l); 219 // CHECK: atomicrmw or 220 __nvvm_atom_or_gen_ll(&sll, ll); 221 222 // CHECK: atomicrmw xor 223 __nvvm_atom_xor_gen_i(ip, i); 224 // CHECK: atomicrmw xor 225 __nvvm_atom_xor_gen_l(&dl, l); 226 // CHECK: atomicrmw xor 227 __nvvm_atom_xor_gen_ll(&sll, ll); 228 229 // CHECK: atomicrmw xchg 230 __nvvm_atom_xchg_gen_i(ip, i); 231 // CHECK: atomicrmw xchg 232 __nvvm_atom_xchg_gen_l(&dl, l); 233 // CHECK: atomicrmw xchg 234 __nvvm_atom_xchg_gen_ll(&sll, ll); 235 236 // CHECK: atomicrmw max i32* 237 __nvvm_atom_max_gen_i(ip, i); 238 // CHECK: atomicrmw umax i32* 239 __nvvm_atom_max_gen_ui((unsigned int *)ip, i); 240 // CHECK: atomicrmw max 241 __nvvm_atom_max_gen_l(&dl, l); 242 // CHECK: atomicrmw umax 243 __nvvm_atom_max_gen_ul((unsigned long *)&dl, l); 244 // CHECK: atomicrmw max i64* 245 __nvvm_atom_max_gen_ll(&sll, ll); 246 // CHECK: atomicrmw umax i64* 247 __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll); 248 249 // CHECK: atomicrmw min i32* 250 __nvvm_atom_min_gen_i(ip, i); 251 // CHECK: atomicrmw umin i32* 252 __nvvm_atom_min_gen_ui((unsigned int *)ip, i); 253 // CHECK: atomicrmw min 254 __nvvm_atom_min_gen_l(&dl, l); 255 // CHECK: atomicrmw umin 256 __nvvm_atom_min_gen_ul((unsigned long *)&dl, l); 257 // CHECK: atomicrmw min i64* 258 __nvvm_atom_min_gen_ll(&sll, ll); 259 // CHECK: atomicrmw umin i64* 260 __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll); 261 262 // CHECK: cmpxchg 263 // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0 264 __nvvm_atom_cas_gen_i(ip, 0, i); 265 // CHECK: cmpxchg 266 // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0 267 __nvvm_atom_cas_gen_l(&dl, 0, l); 268 // CHECK: cmpxchg 269 // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0 270 __nvvm_atom_cas_gen_ll(&sll, 0, ll); 271 272 // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32 273 __nvvm_atom_add_gen_f(fp, f); 274 275 // CHECK: ret 276 } 277