; Exercises lowering of private (alloca) memory on R600 and on amdgcn
; (SI, kaveri with the amdhsa OS, and tonga), with alloca promotion both
; enabled and disabled.
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
      8 
      9 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
     10 
     11 ; FUNC-LABEL: {{^}}mova_same_clause:
     12 
     13 ; R600: LDS_WRITE
     14 ; R600: LDS_WRITE
     15 ; R600: LDS_READ
     16 ; R600: LDS_READ
     17 
     18 ; HSA-PROMOTE: .amd_kernel_code_t
     19 ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
     20 ; HSA-PROMOTE: .end_amd_kernel_code_t
     21 
     22 ; SI-PROMOTE: ds_write_b32
     23 ; SI-PROMOTE: ds_write_b32
     24 ; SI-PROMOTE: ds_read_b32
     25 ; SI-PROMOTE: ds_read_b32
     26 
     27 ; HSA-ALLOCA: .amd_kernel_code_t
     28 ; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
     29 ; by 4 bytes.
     30 ; HSA-ALLOCA: workitem_private_segment_byte_size = 24
     31 ; HSA-ALLOCA: .end_amd_kernel_code_t
     32 
     33 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
     34 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
     35 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
     36 entry:
     37   %stack = alloca [5 x i32], align 4
     38   %0 = load i32, i32 addrspace(1)* %in, align 4
     39   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
     40   store i32 4, i32* %arrayidx1, align 4
     41   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
     42   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
     43   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
     44   store i32 5, i32* %arrayidx3, align 4
     45   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
     46   %2 = load i32, i32* %arrayidx10, align 4
     47   store i32 %2, i32 addrspace(1)* %out, align 4
     48   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
     49   %3 = load i32, i32* %arrayidx12
     50   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
     51   store i32 %3, i32 addrspace(1)* %arrayidx13
     52   ret void
     53 }
     54 
     55 ; This test checks that the stack offset is calculated correctly for structs.
     56 ; All register loads/stores should be optimized away, so there shouldn't be
     57 ; any MOVA instructions.
     58 ;
     59 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
     60 ; this.
     61 
     62 ; FUNC-LABEL: {{^}}multiple_structs:
     63 ; R600-NOT: MOVA_INT
     64 ; SI-NOT: v_movrel
     65 ; SI-NOT: v_movrel
     66 %struct.point = type { i32, i32 }
     67 
     68 define void @multiple_structs(i32 addrspace(1)* %out) {
     69 entry:
     70   %a = alloca %struct.point
     71   %b = alloca %struct.point
     72   %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
     73   %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
     74   %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
     75   %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
     76   store i32 0, i32* %a.x.ptr
     77   store i32 1, i32* %a.y.ptr
     78   store i32 2, i32* %b.x.ptr
     79   store i32 3, i32* %b.y.ptr
     80   %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
     81   %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
     82   %a.indirect = load i32, i32* %a.indirect.ptr
     83   %b.indirect = load i32, i32* %b.indirect.ptr
     84   %0 = add i32 %a.indirect, %b.indirect
     85   store i32 %0, i32 addrspace(1)* %out
     86   ret void
     87 }
     88 
     89 ; Test direct access of a private array inside a loop.  The private array
     90 ; loads and stores should be lowered to copies, so there shouldn't be any
     91 ; MOVA instructions.
     92 
     93 ; FUNC-LABEL: {{^}}direct_loop:
     94 ; R600-NOT: MOVA_INT
     95 ; SI-NOT: v_movrel
     96 
     97 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
     98 entry:
     99   %prv_array_const = alloca [2 x i32]
    100   %prv_array = alloca [2 x i32]
    101   %a = load i32, i32 addrspace(1)* %in
    102   %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
    103   %b = load i32, i32 addrspace(1)* %b_src_ptr
    104   %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
    105   store i32 %a, i32* %a_dst_ptr
    106   %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
    107   store i32 %b, i32* %b_dst_ptr
    108   br label %for.body
    109 
    110 for.body:
    111   %inc = phi i32 [0, %entry], [%count, %for.body]
    112   %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
    113   %x = load i32, i32* %x_ptr
    114   %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
    115   %y = load i32, i32* %y_ptr
    116   %xy = add i32 %x, %y
    117   store i32 %xy, i32* %y_ptr
    118   %count = add i32 %inc, 1
    119   %done = icmp eq i32 %count, 4095
    120   br i1 %done, label %for.end, label %for.body
    121 
    122 for.end:
    123   %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
    124   %value = load i32, i32* %value_ptr
    125   store i32 %value, i32 addrspace(1)* %out
    126   ret void
    127 }
    128 
    129 ; FUNC-LABEL: {{^}}short_array:
    130 
    131 ; R600: MOVA_INT
    132 
    133 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
    134 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
    135 ; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
    136 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
    137 entry:
    138   %0 = alloca [2 x i16]
    139   %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0
    140   %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1
    141   store i16 0, i16* %1
    142   store i16 1, i16* %2
    143   %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index
    144   %4 = load i16, i16* %3
    145   %5 = sext i16 %4 to i32
    146   store i32 %5, i32 addrspace(1)* %out
    147   ret void
    148 }
    149 
    150 ; FUNC-LABEL: {{^}}char_array:
    151 
    152 ; R600: MOVA_INT
    153 
    154 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
    155 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
    156 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
    157 entry:
    158   %0 = alloca [2 x i8]
    159   %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0
    160   %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1
    161   store i8 0, i8* %1
    162   store i8 1, i8* %2
    163   %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index
    164   %4 = load i8, i8* %3
    165   %5 = sext i8 %4 to i32
    166   store i32 %5, i32 addrspace(1)* %out
    167   ret void
    168 
    169 }
    170 
    171 ; Make sure we don't overwrite workitem information with private memory
    172 
    173 ; FUNC-LABEL: {{^}}work_item_info:
    174 ; R600-NOT: MOV T0.X
    175 ; Additional check in case the move ends up in the last slot
    176 ; R600-NOT: MOV * TO.X
    177 
    178 ; SI-NOT: v_mov_b32_e{{(32|64)}} v0
    179 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
    180 entry:
    181   %0 = alloca [2 x i32]
    182   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
    183   %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
    184   store i32 0, i32* %1
    185   store i32 1, i32* %2
    186   %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
    187   %4 = load i32, i32* %3
    188   %5 = call i32 @llvm.r600.read.tidig.x()
    189   %6 = add i32 %4, %5
    190   store i32 %6, i32 addrspace(1)* %out
    191   ret void
    192 }
    193 
    194 ; Test that two stack objects are not stored in the same register
    195 ; The second stack object should be in T3.X
    196 ; FUNC-LABEL: {{^}}no_overlap:
    197 ; R600_CHECK: MOV
    198 ; R600_CHECK: [[CHAN:[XYZW]]]+
    199 ; R600-NOT: [[CHAN]]+
    200 ; SI: v_mov_b32_e32 v3
    201 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
    202 entry:
    203   %0 = alloca [3 x i8], align 1
    204   %1 = alloca [2 x i8], align 1
    205   %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
    206   %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
    207   %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
    208   %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
    209   %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
    210   store i8 0, i8* %2
    211   store i8 1, i8* %3
    212   store i8 2, i8* %4
    213   store i8 1, i8* %5
    214   store i8 0, i8* %6
    215   %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
    216   %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
    217   %9 = load i8, i8* %7
    218   %10 = load i8, i8* %8
    219   %11 = add i8 %9, %10
    220   %12 = sext i8 %11 to i32
    221   store i32 %12, i32 addrspace(1)* %out
    222   ret void
    223 }
    224 
    225 define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
    226 entry:
    227   %alloca = alloca [2 x [2 x i8]]
    228   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
    229   %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
    230   store i8 0, i8* %gep0
    231   store i8 1, i8* %gep1
    232   %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
    233   %load = load i8, i8* %gep2
    234   %sext = sext i8 %load to i32
    235   store i32 %sext, i32 addrspace(1)* %out
    236   ret void
    237 }
    238 
    239 define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
    240 entry:
    241   %alloca = alloca [2 x [2 x i32]]
    242   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
    243   %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
    244   store i32 0, i32* %gep0
    245   store i32 1, i32* %gep1
    246   %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
    247   %load = load i32, i32* %gep2
    248   store i32 %load, i32 addrspace(1)* %out
    249   ret void
    250 }
    251 
    252 define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
    253 entry:
    254   %alloca = alloca [2 x [2 x i64]]
    255   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
    256   %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
    257   store i64 0, i64* %gep0
    258   store i64 1, i64* %gep1
    259   %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
    260   %load = load i64, i64* %gep2
    261   store i64 %load, i64 addrspace(1)* %out
    262   ret void
    263 }
    264 
    265 %struct.pair32 = type { i32, i32 }
    266 
    267 define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
    268 entry:
    269   %alloca = alloca [2 x [2 x %struct.pair32]]
    270   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
    271   %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
    272   store i32 0, i32* %gep0
    273   store i32 1, i32* %gep1
    274   %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
    275   %load = load i32, i32* %gep2
    276   store i32 %load, i32 addrspace(1)* %out
    277   ret void
    278 }
    279 
    280 define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
    281 entry:
    282   %alloca = alloca [2 x %struct.pair32]
    283   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
    284   %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
    285   store i32 0, i32* %gep0
    286   store i32 1, i32* %gep1
    287   %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
    288   %load = load i32, i32* %gep2
    289   store i32 %load, i32 addrspace(1)* %out
    290   ret void
    291 }
    292 
    293 define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
    294 entry:
    295   %tmp = alloca [2 x i32]
    296   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
    297   %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
    298   store i32 0, i32* %tmp1
    299   store i32 1, i32* %tmp2
    300   %cmp = icmp eq i32 %in, 0
    301   %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
    302   %load = load i32, i32* %sel
    303   store i32 %load, i32 addrspace(1)* %out
    304   ret void
    305 }
    306 
    307 ; AMDGPUPromoteAlloca does not know how to handle ptrtoint.  When it
    308 ; finds one, it should stop trying to promote.
    309 
    310 ; FUNC-LABEL: ptrtoint:
    311 ; SI-NOT: ds_write
    312 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
    313 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
    314 define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
    315   %alloca = alloca [16 x i32]
    316   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
    317   store i32 5, i32* %tmp0
    318   %tmp1 = ptrtoint [16 x i32]* %alloca to i32
    319   %tmp2 = add i32 %tmp1, 5
    320   %tmp3 = inttoptr i32 %tmp2 to i32*
    321   %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
    322   %tmp5 = load i32, i32* %tmp4
    323   store i32 %tmp5, i32 addrspace(1)* %out
    324   ret void
    325 }
    326