// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP64 %s

      7 #define __device__ __attribute__((device))
      8 #define __global__ __attribute__((global))
      9 #define __shared__ __attribute__((shared))
     10 #define __constant__ __attribute__((constant))
     11 
     12 __device__ int read_tid() {
     13 
     14 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
     15 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
     16 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
     17 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()
     18 
     19   int x = __nvvm_read_ptx_sreg_tid_x();
     20   int y = __nvvm_read_ptx_sreg_tid_y();
     21   int z = __nvvm_read_ptx_sreg_tid_z();
     22   int w = __nvvm_read_ptx_sreg_tid_w();
     23 
     24   return x + y + z + w;
     25 
     26 }
     27 
     28 __device__ int read_ntid() {
     29 
     30 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
     31 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
     32 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
     33 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
     34 
     35   int x = __nvvm_read_ptx_sreg_ntid_x();
     36   int y = __nvvm_read_ptx_sreg_ntid_y();
     37   int z = __nvvm_read_ptx_sreg_ntid_z();
     38   int w = __nvvm_read_ptx_sreg_ntid_w();
     39 
     40   return x + y + z + w;
     41 
     42 }
     43 
     44 __device__ int read_ctaid() {
     45 
     46 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
     47 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
     48 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
     49 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
     50 
     51   int x = __nvvm_read_ptx_sreg_ctaid_x();
     52   int y = __nvvm_read_ptx_sreg_ctaid_y();
     53   int z = __nvvm_read_ptx_sreg_ctaid_z();
     54   int w = __nvvm_read_ptx_sreg_ctaid_w();
     55 
     56   return x + y + z + w;
     57 
     58 }
     59 
     60 __device__ int read_nctaid() {
     61 
     62 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
     63 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
     64 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
     65 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
     66 
     67   int x = __nvvm_read_ptx_sreg_nctaid_x();
     68   int y = __nvvm_read_ptx_sreg_nctaid_y();
     69   int z = __nvvm_read_ptx_sreg_nctaid_z();
     70   int w = __nvvm_read_ptx_sreg_nctaid_w();
     71 
     72   return x + y + z + w;
     73 
     74 }
     75 
     76 __device__ int read_ids() {
     77 
     78 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
     79 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
     80 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
     81 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
     82 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
     83 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
     84 
     85   int a = __nvvm_read_ptx_sreg_laneid();
     86   int b = __nvvm_read_ptx_sreg_warpid();
     87   int c = __nvvm_read_ptx_sreg_nwarpid();
     88   int d = __nvvm_read_ptx_sreg_smid();
     89   int e = __nvvm_read_ptx_sreg_nsmid();
     90   int f = __nvvm_read_ptx_sreg_gridid();
     91 
     92   return a + b + c + d + e + f;
     93 
     94 }
     95 
     96 __device__ int read_lanemasks() {
     97 
     98 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
     99 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
    100 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
    101 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
    102 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()
    103 
    104   int a = __nvvm_read_ptx_sreg_lanemask_eq();
    105   int b = __nvvm_read_ptx_sreg_lanemask_le();
    106   int c = __nvvm_read_ptx_sreg_lanemask_lt();
    107   int d = __nvvm_read_ptx_sreg_lanemask_ge();
    108   int e = __nvvm_read_ptx_sreg_lanemask_gt();
    109 
    110   return a + b + c + d + e;
    111 
    112 }
    113 
    114 __device__ long long read_clocks() {
    115 
    116 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
    117 // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
    118 
    119   int a = __nvvm_read_ptx_sreg_clock();
    120   long long b = __nvvm_read_ptx_sreg_clock64();
    121 
    122   return a + b;
    123 }
    124 
    125 __device__ int read_pms() {
    126 
    127 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
    128 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
    129 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
    130 // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()
    131 
    132   int a = __nvvm_read_ptx_sreg_pm0();
    133   int b = __nvvm_read_ptx_sreg_pm1();
    134   int c = __nvvm_read_ptx_sreg_pm2();
    135   int d = __nvvm_read_ptx_sreg_pm3();
    136 
    137   return a + b + c + d;
    138 
    139 }
    140 
    141 __device__ void sync() {
    142 
    143 // CHECK: call void @llvm.nvvm.bar.sync(i32 0)
    144 
    145   __nvvm_bar_sync(0);
    146 
    147 }
    148 
    149 
    150 // NVVM intrinsics
    151 
    152 // The idea is not to test all intrinsics, just that Clang is recognizing the
    153 // builtins defined in BuiltinsNVPTX.def
    154 __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
    155 // CHECK: call float @llvm.nvvm.fmax.f
    156   float t1 = __nvvm_fmax_f(f1, f2);
    157 // CHECK: call float @llvm.nvvm.fmin.f
    158   float t2 = __nvvm_fmin_f(f1, f2);
    159 // CHECK: call float @llvm.nvvm.sqrt.rn.f
    160   float t3 = __nvvm_sqrt_rn_f(f1);
    161 // CHECK: call float @llvm.nvvm.rcp.rn.f
    162   float t4 = __nvvm_rcp_rn_f(f2);
    163 // CHECK: call float @llvm.nvvm.add.rn.f
    164   float t5 = __nvvm_add_rn_f(f1, f2);
    165 
    166 // CHECK: call double @llvm.nvvm.fmax.d
    167   double td1 = __nvvm_fmax_d(d1, d2);
    168 // CHECK: call double @llvm.nvvm.fmin.d
    169   double td2 = __nvvm_fmin_d(d1, d2);
    170 // CHECK: call double @llvm.nvvm.sqrt.rn.d
    171   double td3 = __nvvm_sqrt_rn_d(d1);
    172 // CHECK: call double @llvm.nvvm.rcp.rn.d
    173   double td4 = __nvvm_rcp_rn_d(d2);
    174 
    175 // CHECK: call void @llvm.nvvm.membar.cta()
    176   __nvvm_membar_cta();
    177 // CHECK: call void @llvm.nvvm.membar.gl()
    178   __nvvm_membar_gl();
    179 // CHECK: call void @llvm.nvvm.membar.sys()
    180   __nvvm_membar_sys();
    181 // CHECK: call void @llvm.nvvm.barrier0()
    182   __syncthreads();
    183 }
    184 
    185 __device__ int di;
    186 __shared__ int si;
    187 __device__ long dl;
    188 __shared__ long sl;
    189 __device__ long long dll;
    190 __shared__ long long sll;
    191 
    192 // Check for atomic intrinsics
    193 // CHECK-LABEL: nvvm_atom
    194 __device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
    195                           long long *llp, long long ll) {
    196   // CHECK: atomicrmw add
    197   __nvvm_atom_add_gen_i(ip, i);
    198   // CHECK: atomicrmw add
    199   __nvvm_atom_add_gen_l(&dl, l);
    200   // CHECK: atomicrmw add
    201   __nvvm_atom_add_gen_ll(&sll, ll);
    202 
    203   // CHECK: atomicrmw sub
    204   __nvvm_atom_sub_gen_i(ip, i);
    205   // CHECK: atomicrmw sub
    206   __nvvm_atom_sub_gen_l(&dl, l);
    207   // CHECK: atomicrmw sub
    208   __nvvm_atom_sub_gen_ll(&sll, ll);
    209 
    210   // CHECK: atomicrmw and
    211   __nvvm_atom_and_gen_i(ip, i);
    212   // CHECK: atomicrmw and
    213   __nvvm_atom_and_gen_l(&dl, l);
    214   // CHECK: atomicrmw and
    215   __nvvm_atom_and_gen_ll(&sll, ll);
    216 
    217   // CHECK: atomicrmw or
    218   __nvvm_atom_or_gen_i(ip, i);
    219   // CHECK: atomicrmw or
    220   __nvvm_atom_or_gen_l(&dl, l);
    221   // CHECK: atomicrmw or
    222   __nvvm_atom_or_gen_ll(&sll, ll);
    223 
    224   // CHECK: atomicrmw xor
    225   __nvvm_atom_xor_gen_i(ip, i);
    226   // CHECK: atomicrmw xor
    227   __nvvm_atom_xor_gen_l(&dl, l);
    228   // CHECK: atomicrmw xor
    229   __nvvm_atom_xor_gen_ll(&sll, ll);
    230 
    231   // CHECK: atomicrmw xchg
    232   __nvvm_atom_xchg_gen_i(ip, i);
    233   // CHECK: atomicrmw xchg
    234   __nvvm_atom_xchg_gen_l(&dl, l);
    235   // CHECK: atomicrmw xchg
    236   __nvvm_atom_xchg_gen_ll(&sll, ll);
    237 
    238   // CHECK: atomicrmw max i32*
    239   __nvvm_atom_max_gen_i(ip, i);
    240   // CHECK: atomicrmw umax i32*
    241   __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
    242   // CHECK: atomicrmw max
    243   __nvvm_atom_max_gen_l(&dl, l);
    244   // CHECK: atomicrmw umax
    245   __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
    246   // CHECK: atomicrmw max i64*
    247   __nvvm_atom_max_gen_ll(&sll, ll);
    248   // CHECK: atomicrmw umax i64*
    249   __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
    250 
    251   // CHECK: atomicrmw min i32*
    252   __nvvm_atom_min_gen_i(ip, i);
    253   // CHECK: atomicrmw umin i32*
    254   __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
    255   // CHECK: atomicrmw min
    256   __nvvm_atom_min_gen_l(&dl, l);
    257   // CHECK: atomicrmw umin
    258   __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
    259   // CHECK: atomicrmw min i64*
    260   __nvvm_atom_min_gen_ll(&sll, ll);
    261   // CHECK: atomicrmw umin i64*
    262   __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
    263 
    264   // CHECK: cmpxchg
    265   // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
    266   __nvvm_atom_cas_gen_i(ip, 0, i);
    267   // CHECK: cmpxchg
    268   // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
    269   __nvvm_atom_cas_gen_l(&dl, 0, l);
    270   // CHECK: cmpxchg
    271   // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
    272   __nvvm_atom_cas_gen_ll(&sll, 0, ll);
    273 
    274   // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
    275   __nvvm_atom_add_gen_f(fp, f);
    276 
    277   // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
    278   __nvvm_atom_inc_gen_ui(uip, ui);
    279 
    280   // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
    281   __nvvm_atom_dec_gen_ui(uip, ui);
    282 
    283   // CHECK: ret
    284 }
    285 
    286 // CHECK-LABEL: nvvm_ldg
    287 __device__ void nvvm_ldg(const void *p) {
    288   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
    289   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
    290   __nvvm_ldg_c((const char *)p);
    291   __nvvm_ldg_uc((const unsigned char *)p);
    292 
    293   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
    294   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
    295   __nvvm_ldg_s((const short *)p);
    296   __nvvm_ldg_us((const unsigned short *)p);
    297 
    298   // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
    299   // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
    300   __nvvm_ldg_i((const int *)p);
    301   __nvvm_ldg_ui((const unsigned int *)p);
    302 
    303   // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
    304   // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
    305   // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
    306   // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
    307   __nvvm_ldg_l((const long *)p);
    308   __nvvm_ldg_ul((const unsigned long *)p);
    309 
    310   // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
    311   __nvvm_ldg_f((const float *)p);
    312   // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
    313   __nvvm_ldg_d((const double *)p);
    314 
    315   // In practice, the pointers we pass to __ldg will be aligned as appropriate
    316   // for the CUDA <type>N vector types (e.g. short4), which are not the same as
    317   // the LLVM vector types.  However, each LLVM vector type has an alignment
    318   // less than or equal to its corresponding CUDA type, so we're OK.
    319   //
    320   // PTX Interoperability section 2.2: "For a vector with an even number of
    321   // elements, its alignment is set to number of elements times the alignment of
    322   // its member: n*alignof(t)."
    323 
    324   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
    325   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
    326   typedef char char2 __attribute__((ext_vector_type(2)));
    327   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
    328   __nvvm_ldg_c2((const char2 *)p);
    329   __nvvm_ldg_uc2((const uchar2 *)p);
    330 
    331   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
    332   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
    333   typedef char char4 __attribute__((ext_vector_type(4)));
    334   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
    335   __nvvm_ldg_c4((const char4 *)p);
    336   __nvvm_ldg_uc4((const uchar4 *)p);
    337 
    338   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
    339   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
    340   typedef short short2 __attribute__((ext_vector_type(2)));
    341   typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
    342   __nvvm_ldg_s2((const short2 *)p);
    343   __nvvm_ldg_us2((const ushort2 *)p);
    344 
    345   // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
    346   // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
    347   typedef short short4 __attribute__((ext_vector_type(4)));
    348   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
    349   __nvvm_ldg_s4((const short4 *)p);
    350   __nvvm_ldg_us4((const ushort4 *)p);
    351 
    352   // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
    353   // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
    354   typedef int int2 __attribute__((ext_vector_type(2)));
    355   typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
    356   __nvvm_ldg_i2((const int2 *)p);
    357   __nvvm_ldg_ui2((const uint2 *)p);
    358 
    359   // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
    360   // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
    361   typedef int int4 __attribute__((ext_vector_type(4)));
    362   typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
    363   __nvvm_ldg_i4((const int4 *)p);
    364   __nvvm_ldg_ui4((const uint4 *)p);
    365 
    366   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
    367   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
    368   typedef long long longlong2 __attribute__((ext_vector_type(2)));
    369   typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
    370   __nvvm_ldg_ll2((const longlong2 *)p);
    371   __nvvm_ldg_ull2((const ulonglong2 *)p);
    372 
    373   // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
    374   typedef float float2 __attribute__((ext_vector_type(2)));
    375   __nvvm_ldg_f2((const float2 *)p);
    376 
    377   // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
    378   typedef float float4 __attribute__((ext_vector_type(4)));
    379   __nvvm_ldg_f4((const float4 *)p);
    380 
    381   // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
    382   typedef double double2 __attribute__((ext_vector_type(2)));
    383   __nvvm_ldg_d2((const double2 *)p);
    384 }
    385