Home | History | Annotate | Download | only in CodeGen
      1 // REQUIRES: nvptx-registered-target
      2 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
      3 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
      4 
      5 #define __device__ __attribute__((device))
      6 #define __global__ __attribute__((global))
      7 #define __shared__ __attribute__((shared))
      8 #define __constant__ __attribute__((constant))
      9 
     10 __device__ int read_tid() {
     11 
     12 // CHECK: call i32 @llvm.ptx.read.tid.x()
     13 // CHECK: call i32 @llvm.ptx.read.tid.y()
     14 // CHECK: call i32 @llvm.ptx.read.tid.z()
     15 // CHECK: call i32 @llvm.ptx.read.tid.w()
     16 
     17   int x = __builtin_ptx_read_tid_x();
     18   int y = __builtin_ptx_read_tid_y();
     19   int z = __builtin_ptx_read_tid_z();
     20   int w = __builtin_ptx_read_tid_w();
     21 
     22   return x + y + z + w;
     23 
     24 }
     25 
     26 __device__ int read_ntid() {
     27 
     28 // CHECK: call i32 @llvm.ptx.read.ntid.x()
     29 // CHECK: call i32 @llvm.ptx.read.ntid.y()
     30 // CHECK: call i32 @llvm.ptx.read.ntid.z()
     31 // CHECK: call i32 @llvm.ptx.read.ntid.w()
     32 
     33   int x = __builtin_ptx_read_ntid_x();
     34   int y = __builtin_ptx_read_ntid_y();
     35   int z = __builtin_ptx_read_ntid_z();
     36   int w = __builtin_ptx_read_ntid_w();
     37 
     38   return x + y + z + w;
     39 
     40 }
     41 
     42 __device__ int read_ctaid() {
     43 
     44 // CHECK: call i32 @llvm.ptx.read.ctaid.x()
     45 // CHECK: call i32 @llvm.ptx.read.ctaid.y()
     46 // CHECK: call i32 @llvm.ptx.read.ctaid.z()
     47 // CHECK: call i32 @llvm.ptx.read.ctaid.w()
     48 
     49   int x = __builtin_ptx_read_ctaid_x();
     50   int y = __builtin_ptx_read_ctaid_y();
     51   int z = __builtin_ptx_read_ctaid_z();
     52   int w = __builtin_ptx_read_ctaid_w();
     53 
     54   return x + y + z + w;
     55 
     56 }
     57 
     58 __device__ int read_nctaid() {
     59 
     60 // CHECK: call i32 @llvm.ptx.read.nctaid.x()
     61 // CHECK: call i32 @llvm.ptx.read.nctaid.y()
     62 // CHECK: call i32 @llvm.ptx.read.nctaid.z()
     63 // CHECK: call i32 @llvm.ptx.read.nctaid.w()
     64 
     65   int x = __builtin_ptx_read_nctaid_x();
     66   int y = __builtin_ptx_read_nctaid_y();
     67   int z = __builtin_ptx_read_nctaid_z();
     68   int w = __builtin_ptx_read_nctaid_w();
     69 
     70   return x + y + z + w;
     71 
     72 }
     73 
     74 __device__ int read_ids() {
     75 
     76 // CHECK: call i32 @llvm.ptx.read.laneid()
     77 // CHECK: call i32 @llvm.ptx.read.warpid()
     78 // CHECK: call i32 @llvm.ptx.read.nwarpid()
     79 // CHECK: call i32 @llvm.ptx.read.smid()
     80 // CHECK: call i32 @llvm.ptx.read.nsmid()
     81 // CHECK: call i32 @llvm.ptx.read.gridid()
     82 
     83   int a = __builtin_ptx_read_laneid();
     84   int b = __builtin_ptx_read_warpid();
     85   int c = __builtin_ptx_read_nwarpid();
     86   int d = __builtin_ptx_read_smid();
     87   int e = __builtin_ptx_read_nsmid();
     88   int f = __builtin_ptx_read_gridid();
     89 
     90   return a + b + c + d + e + f;
     91 
     92 }
     93 
     94 __device__ int read_lanemasks() {
     95 
     96 // CHECK: call i32 @llvm.ptx.read.lanemask.eq()
     97 // CHECK: call i32 @llvm.ptx.read.lanemask.le()
     98 // CHECK: call i32 @llvm.ptx.read.lanemask.lt()
     99 // CHECK: call i32 @llvm.ptx.read.lanemask.ge()
    100 // CHECK: call i32 @llvm.ptx.read.lanemask.gt()
    101 
    102   int a = __builtin_ptx_read_lanemask_eq();
    103   int b = __builtin_ptx_read_lanemask_le();
    104   int c = __builtin_ptx_read_lanemask_lt();
    105   int d = __builtin_ptx_read_lanemask_ge();
    106   int e = __builtin_ptx_read_lanemask_gt();
    107 
    108   return a + b + c + d + e;
    109 
    110 }
    111 
    112 __device__ long long read_clocks() {
    113 
    114 // CHECK: call i32 @llvm.ptx.read.clock()
    115 // CHECK: call i64 @llvm.ptx.read.clock64()
    116 
    117   int a = __builtin_ptx_read_clock();
    118   long long b = __builtin_ptx_read_clock64();
    119 
    120   return a + b;
    121 }
    122 
    123 __device__ int read_pms() {
    124 
    125 // CHECK: call i32 @llvm.ptx.read.pm0()
    126 // CHECK: call i32 @llvm.ptx.read.pm1()
    127 // CHECK: call i32 @llvm.ptx.read.pm2()
    128 // CHECK: call i32 @llvm.ptx.read.pm3()
    129 
    130   int a = __builtin_ptx_read_pm0();
    131   int b = __builtin_ptx_read_pm1();
    132   int c = __builtin_ptx_read_pm2();
    133   int d = __builtin_ptx_read_pm3();
    134 
    135   return a + b + c + d;
    136 
    137 }
    138 
    139 __device__ void sync() {
    140 
    141 // CHECK: call void @llvm.ptx.bar.sync(i32 0)
    142 
    143   __builtin_ptx_bar_sync(0);
    144 
    145 }
    146 
    147 
    148 // NVVM intrinsics
    149 
    150 // The idea is not to test all intrinsics, just that Clang is recognizing the
    151 // builtins defined in BuiltinsNVPTX.def
    152 __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
    153 // CHECK: call float @llvm.nvvm.fmax.f
    154   float t1 = __nvvm_fmax_f(f1, f2);
    155 // CHECK: call float @llvm.nvvm.fmin.f
    156   float t2 = __nvvm_fmin_f(f1, f2);
    157 // CHECK: call float @llvm.nvvm.sqrt.rn.f
    158   float t3 = __nvvm_sqrt_rn_f(f1);
    159 // CHECK: call float @llvm.nvvm.rcp.rn.f
    160   float t4 = __nvvm_rcp_rn_f(f2);
    161 // CHECK: call float @llvm.nvvm.add.rn.f
    162   float t5 = __nvvm_add_rn_f(f1, f2);
    163 
    164 // CHECK: call double @llvm.nvvm.fmax.d
    165   double td1 = __nvvm_fmax_d(d1, d2);
    166 // CHECK: call double @llvm.nvvm.fmin.d
    167   double td2 = __nvvm_fmin_d(d1, d2);
    168 // CHECK: call double @llvm.nvvm.sqrt.rn.d
    169   double td3 = __nvvm_sqrt_rn_d(d1);
    170 // CHECK: call double @llvm.nvvm.rcp.rn.d
    171   double td4 = __nvvm_rcp_rn_d(d2);
    172 
    173 // CHECK: call void @llvm.nvvm.membar.cta()
    174   __nvvm_membar_cta();
    175 // CHECK: call void @llvm.nvvm.membar.gl()
    176   __nvvm_membar_gl();
    177 // CHECK: call void @llvm.nvvm.membar.sys()
    178   __nvvm_membar_sys();
    179 // CHECK: call void @llvm.nvvm.barrier0()
    180   __nvvm_bar0();
    181 }
    182 
    183 __device__ int di;
    184 __shared__ int si;
    185 __device__ long dl;
    186 __shared__ long sl;
    187 __device__ long long dll;
    188 __shared__ long long sll;
    189 
    190 // Check for atomic intrinsics
    191 // CHECK-LABEL: nvvm_atom
    192 __device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
    193                           long long *llp, long long ll) {
    194   // CHECK: atomicrmw add
    195   __nvvm_atom_add_gen_i(ip, i);
    196   // CHECK: atomicrmw add
    197   __nvvm_atom_add_gen_l(&dl, l);
    198   // CHECK: atomicrmw add
    199   __nvvm_atom_add_gen_ll(&sll, ll);
    200 
    201   // CHECK: atomicrmw sub
    202   __nvvm_atom_sub_gen_i(ip, i);
    203   // CHECK: atomicrmw sub
    204   __nvvm_atom_sub_gen_l(&dl, l);
    205   // CHECK: atomicrmw sub
    206   __nvvm_atom_sub_gen_ll(&sll, ll);
    207 
    208   // CHECK: atomicrmw and
    209   __nvvm_atom_and_gen_i(ip, i);
    210   // CHECK: atomicrmw and
    211   __nvvm_atom_and_gen_l(&dl, l);
    212   // CHECK: atomicrmw and
    213   __nvvm_atom_and_gen_ll(&sll, ll);
    214 
    215   // CHECK: atomicrmw or
    216   __nvvm_atom_or_gen_i(ip, i);
    217   // CHECK: atomicrmw or
    218   __nvvm_atom_or_gen_l(&dl, l);
    219   // CHECK: atomicrmw or
    220   __nvvm_atom_or_gen_ll(&sll, ll);
    221 
    222   // CHECK: atomicrmw xor
    223   __nvvm_atom_xor_gen_i(ip, i);
    224   // CHECK: atomicrmw xor
    225   __nvvm_atom_xor_gen_l(&dl, l);
    226   // CHECK: atomicrmw xor
    227   __nvvm_atom_xor_gen_ll(&sll, ll);
    228 
    229   // CHECK: atomicrmw xchg
    230   __nvvm_atom_xchg_gen_i(ip, i);
    231   // CHECK: atomicrmw xchg
    232   __nvvm_atom_xchg_gen_l(&dl, l);
    233   // CHECK: atomicrmw xchg
    234   __nvvm_atom_xchg_gen_ll(&sll, ll);
    235 
    236   // CHECK: atomicrmw max i32*
    237   __nvvm_atom_max_gen_i(ip, i);
    238   // CHECK: atomicrmw umax i32*
    239   __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
    240   // CHECK: atomicrmw max
    241   __nvvm_atom_max_gen_l(&dl, l);
    242   // CHECK: atomicrmw umax
    243   __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
    244   // CHECK: atomicrmw max i64*
    245   __nvvm_atom_max_gen_ll(&sll, ll);
    246   // CHECK: atomicrmw umax i64*
    247   __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
    248 
    249   // CHECK: atomicrmw min i32*
    250   __nvvm_atom_min_gen_i(ip, i);
    251   // CHECK: atomicrmw umin i32*
    252   __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
    253   // CHECK: atomicrmw min
    254   __nvvm_atom_min_gen_l(&dl, l);
    255   // CHECK: atomicrmw umin
    256   __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
    257   // CHECK: atomicrmw min i64*
    258   __nvvm_atom_min_gen_ll(&sll, ll);
    259   // CHECK: atomicrmw umin i64*
    260   __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
    261 
    262   // CHECK: cmpxchg
    263   // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
    264   __nvvm_atom_cas_gen_i(ip, 0, i);
    265   // CHECK: cmpxchg
    266   // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
    267   __nvvm_atom_cas_gen_l(&dl, 0, l);
    268   // CHECK: cmpxchg
    269   // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
    270   __nvvm_atom_cas_gen_ll(&sll, 0, ll);
    271 
    272   // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
    273   __nvvm_atom_add_gen_f(fp, f);
    274 
    275   // CHECK: ret
    276 }
    277