Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s
      2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s
      3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s
        ; The three llc invocations above compile this file for fiji, hawaii and
        ; gfx900. Checks shared by all three use the GCN prefix; the CIVI prefix is
        ; enabled for fiji and hawaii only, and the GFX9 prefix for gfx900 only.
        ;
        ; The datalayout below selects address space 5 for allocas, which is why
        ; every alloca in this file is written with addrspace(5).
      4 target datalayout = "A5"
      5 
      6 ; FIXME: Why is the add commuted only sometimes (v1, v0 on CI/VI but v0, v1 on GFX9)?
      7 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
      8 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
      9 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
     10 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
     11 ; GCN-NEXT: s_setpc_b64
        ; Leaf fastcc callee that returns %arg0 + %arg1. Several functions later
        ; in this file use it as the target of their tail call. The checks expect
        ; the add to be followed directly by the s_setpc_b64 return.
     12 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
     13   %add0 = add i32 %arg0, %arg1
     14   ret i32 %add0
     15 }
     16 
     17 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
     18 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
     19 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
     20 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
     21 ; GCN: s_mov_b32 s5, s32
     22 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24
     23 ; GCN: s_waitcnt vmcnt(0)
     24 ; GCN: s_setpc_b64
     25 ; GCN: ; ScratchSize: 68
        ; Same addition as @i32_fastcc_i32_i32, but the function also owns a stack
        ; object: a [16 x i32] alloca whose element 5 receives a volatile store of
        ; 9. The store is volatile so that it is not removed even though nothing
        ; reads it back. The checks expect that store and a ScratchSize of 68.
     26 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
     27   %alloca = alloca [16 x i32], align 4, addrspace(5)
     28   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
     29   store volatile i32 9, i32 addrspace(5)* %gep
     30   %add0 = add i32 %arg0, %arg1
     31   ret i32 %add0
     32 }
     33 
     34 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
        ; Minimal sibling-call case: forwards %a and %b to @i32_fastcc_i32_i32 and
        ; returns its result. %c is accepted but unused. Only the function label
        ; is checked; no instruction sequence is pinned for this function.
     35 define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
     36 entry:
     37   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
     38   ret i32 %ret
     39 }
     40 
     41 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
     42 ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
     43 ; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
     44 ; GCN: s_setpc_b64
     45 ; GCN: ; ScratchSize: 68
        ; The caller owns a stack object (a [16 x i32] alloca with a volatile store
        ; of 9 to element 5) and then tail-calls a callee that has none. The
        ; checks expect the store of 9, then s_setpc_b64, and a ScratchSize of 68.
     46 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
     47 entry:
     48   %alloca = alloca [16 x i32], align 4, addrspace(5)
     49   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
     50   store volatile i32 9, i32 addrspace(5)* %gep
     51   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
     52   ret i32 %ret
     53 }
     54 
     55 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
     56 ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
     57 ; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
     58 ; GCN: s_setpc_b64
     59 ; GCN: ; ScratchSize: 136
        ; Both sides own a stack object here: the caller has its own [16 x i32]
        ; alloca and tail-calls @i32_fastcc_i32_i32_stack_object, which has one
        ; too. The expected ScratchSize of 136 equals 68 + 68, the values expected
        ; for the caller-only and callee-only variants elsewhere in this file.
     60 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
     61 entry:
     62   %alloca = alloca [16 x i32], align 4, addrspace(5)
     63   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
     64   store volatile i32 9, i32 addrspace(5)* %gep
     65   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
     66   ret i32 %ret
     67 }
     68 
     69 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
        ; The caller returns void, so the i32 produced by the tail call is simply
        ; dropped. Only the function label is checked.
     70 define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
     71 entry:
     72   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
     73   ret void
     74 }
     75 
     76 ; It doesn't make sense to do a tail call from a kernel
     77 ; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
     78 ;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
        ; Kernel entry point (amdgpu_kernel) whose only call is marked `tail`; the
        ; result is discarded. Only the function label is checked, so this test
        ; shows that the input compiles but does not pin how the call is lowered.
        ; NOTE(review): the commented-out line above duplicates the define below.
     79 define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
     80 entry:
     81   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
     82   ret void
     83 }
     84 
     85 ; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
     86 ; GCN: s_waitcnt
     87 ; GCN-NEXT: s_mov_b32 s5, s32
     88 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
     89 ; GCN-NEXT: s_waitcnt vmcnt(0)
     90 
     91 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
     92 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
     93 
     94 ; GCN-NEXT: s_setpc_b64 s[30:31]
        ; Callee with a byval parameter: %arg1 is a byval pointer in address space
        ; 5; the function loads the i32 it points to and returns %arg0 plus that
        ; value. The checks expect the load to read offset 4 relative to s5.
     95 define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 {
     96   %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
     97   %add0 = add i32 %arg0, %arg1.load
     98   ret i32 %add0
     99 }
    100 
    101 ; Tail call disallowed with byval in parent.
    102 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
    103 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
    104 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
    105 ; GCN: s_swappc_b64
    106 ; GCN-NOT: v_readlane_b32 s32
    107 ; GCN: s_setpc_b64
        ; The parent itself receives %b.byval as a byval argument and forwards that
        ; pointer to the callee's byval parameter. Although the call is marked
        ; `tail`, the checks expect an ordinary call (s_swappc_b64) followed by a
        ; separate return (s_setpc_b64), with s32 neither saved to nor restored
        ; from a VGPR lane.
    108 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval %b.byval, i32 %c) #1 {
    109 entry:
    110   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* %b.byval)
    111   ret i32 %ret
    112 }
    113 
    114 ; Tail calls are disallowed when the parent has a byval argument, not when only the callee does.
    115 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
    116 ; GCN-NOT: v0
    117 ; GCN-NOT: s32
    118 ; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
    119 ; GCN: s_mov_b32 s5, s32
    120 ; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
    121 ; GCN-NEXT: s_setpc_b64
        ; Here the parent has no byval parameter (it takes a plain [16 x i32]
        ; %large); only the callee's second parameter is byval. The byval operand
        ; is the constant address 16 cast to an addrspace(5) pointer. The checks
        ; expect the value to be loaded from offset 16, stored to the outgoing
        ; slot at offset 4, and then s_setpc_b64 on the very next line, i.e. a
        ; tail call with no intervening s_swappc_b64.
    122 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
    123 entry:
    124   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
    125   ret i32 %ret
    126 }
    127 
    128 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
    129 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
    130 ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
    131 ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
    132 
    133 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
    134 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_0]], v0
    135 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_1]], v0
    136 
    137 
    138 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
    139 ; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]]
    140 ; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]]
    141 
    142 ; GCN-NEXT: s_setpc_b64
        ; Callee with a large aggregate argument. It returns
        ; %arg0 + %arg1 + %large[30] + %large[31]. The checks expect two loads,
        ; from offsets 4 and 8 relative to s5, feeding the last two adds; the
        ; value names suggest elements 30 and 31 are the ones passed on the stack
        ; (NOTE(review): confirm against the calling convention).
    143 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
    144   %val_firststack = extractvalue [32 x i32] %large, 30
    145   %val_laststack = extractvalue [32 x i32] %large, 31
    146   %add0 = add i32 %arg0, %arg1
    147   %add1 = add i32 %add0, %val_firststack
    148   %add2 = add i32 %add1, %val_laststack
    149   ret i32 %add2
    150 }
    151 
    152 ; FIXME: Why load and store same location for stack args?
    153 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
    154 ; GCN: s_mov_b32 s5, s32
    155 
    156 ; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
    157 ; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
    158 
    159 ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
    160 ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
    161 
    162 ; GCN-NOT: s32
    163 
    164 ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
    165 ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8
    166 
    167 ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
    168 ; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
    169 
    170 ; GCN-NOT: s32
    171 ; GCN: s_setpc_b64
        ; Forwards %a, %b and the whole [32 x i32] %c unchanged to a callee with
        ; the same signature. The checks expect the values at offsets 4 and 8 to
        ; be loaded and then stored back to those same offsets (the redundancy
        ; the FIXME at the top of this test asks about), with v32 and v33 spilled
        ; and reloaded, and no mention of s32 before the s_setpc_b64.
    172 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
    173 entry:
    174   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
    175   ret i32 %ret
    176 }
    177 
    178 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
    179 ; GCN-DAG: s_mov_b32 s5, s32
    180 ; GCN-NOT: s32
    181 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
    182 ; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44
    183 
    184 ; GCN-NOT: s32
    185 ; GCN: s_setpc_b64
        ; Combines a caller-owned stack object (a [16 x i32] alloca with a volatile
        ; store of 9 to element 5) with a callee that takes a [32 x i32] argument.
        ; The checks expect the store of 9 at offset 44 relative to s5 and no
        ; further mention of s32 before the s_setpc_b64.
    186 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
    187 entry:
    188   %alloca = alloca [16 x i32], align 4, addrspace(5)
    189   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
    190   store volatile i32 9, i32 addrspace(5)* %gep
    191   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
    192   ret i32 %ret
    193 }
    194 
    195 ; If the callee requires more stack argument space than the caller,
    196 ; don't do a tail call.
    197 ; TODO: Do we really need this restriction?
    198 
    199 ; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
    200 ; GCN: s_swappc_b64
    201 ; GCN: s_setpc_b64
        ; The caller takes only two i32 parameters but passes the callee an extra
        ; [32 x i32] zeroinitializer. Although the call is marked `tail`, the
        ; checks expect an ordinary call (s_swappc_b64) followed by a separate
        ; return (s_setpc_b64).
    202 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
    203 entry:
    204   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
    205   ret i32 %ret
    206 }
    207 
    208 ; Have another non-tail call in the function
    209 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
    210 ; GCN: s_mov_b32 s5, s32
    211 ; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
    212 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
    213 ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
    214 ; GCN-DAG: v_writelane_b32 v34, s33, 0
    215 ; GCN-DAG: v_writelane_b32 v34, s34, 1
    216 ; GCN-DAG: v_writelane_b32 v34, s35, 2
    217 ; GCN-DAG: s_add_u32 s32, s32, 0x400
    218 
    219 ; GCN-DAG: s_getpc_b64
    220 ; GCN: s_swappc_b64
    221 
    222 ; GCN: s_getpc_b64 s[6:7]
    223 ; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
    224 ; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
    225 
    226 ; GCN-DAG: v_readlane_b32 s33, v34, 0
    227 ; GCN-DAG: v_readlane_b32 s34, v34, 1
    228 ; GCN-DAG: v_readlane_b32 s35, v34, 2
    229 
    230 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
    231 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
    232 ; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
    233 ; GCN: s_sub_u32 s32, s32, 0x400
    234 ; GCN: s_setpc_b64 s[6:7]
        ; Both calls carry the `tail` marker, but only the second is in tail
        ; position: the first call's result is passed as the third argument of the
        ; second call, whose result is returned. The checks expect s_swappc_b64
        ; for the first call, then the address of @sibling_call_i32_fastcc_i32_i32
        ; materialised in s[6:7], the saved SGPRs and VGPRs restored, s32 moved
        ; back by the same 0x400 it was advanced by, and finally a jump through
        ; s[6:7] (s_setpc_b64) for the second call.
    235 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
    236 entry:
    237   %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
    238   %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
    239   ret i32 %ret
    240 }
    241 
    242 ; Have stack object in caller and stack passed arguments. SP should be
    243 ; in same place at function exit.
    244 
    245 ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
    246 ; GCN: s_mov_b32 s5, s32
    247 ; GCN-NOT: s32
    248 ; GCN: s_setpc_b64 s[6:7]
        ; The IR body matches
        ; @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object statement for
        ; statement; what differs is the focus of the checks. After s32 is copied
        ; into s5, no other mention of s32 is allowed before the jump through
        ; s[6:7], so the stack pointer must not be adjusted in this function.
    249 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
    250 entry:
    251   %alloca = alloca [16 x i32], align 4, addrspace(5)
    252   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
    253   store volatile i32 9, i32 addrspace(5)* %gep
    254   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
    255   ret i32 %ret
    256 }
    257 
    258 ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
    259 ; GCN: s_mov_b32 s5, s32
    260 ; GCN-NOT: s32
    261 ; GCN: s_setpc_b64 s[6:7]
        ; The caller's own aggregate parameter ([36 x i32]) is larger than the
        ; [32 x i32] it passes to the callee, and the value passed is
        ; zeroinitializer rather than %c. The caller also owns a stack object.
        ; As in the preceding test, the checks allow no mention of s32 between
        ; the copy into s5 and the jump through s[6:7].
    262 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
    263 entry:
    264   %alloca = alloca [16 x i32], align 4, addrspace(5)
    265   %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
    266   store volatile i32 9, i32 addrspace(5)* %gep
    267   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
    268   ret i32 %ret
    269 }
    270 
    271 attributes #0 = { nounwind }
        ; Attribute group #1 is the one attached to every function defined in this
        ; file; noinline keeps each callee a separate function so that the calls
        ; under test remain calls. Group #0 is not referenced by any definition
        ; in this file.
    272 attributes #1 = { nounwind noinline }
    273