; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s

; Sibling-call (tail call) lowering tests for fastcc functions on amdgcn.
; The FileCheck directives for each function are written directly above its
; definition.

; "A5" makes address space 5 the alloca address space; every alloca in this
; section is created in addrspace(5).
target datalayout = "A5"

; Callee used by the tests in this section: returns %arg0 + %arg1 and makes no
; calls of its own.
; FIXME: Why is this commuted only sometimes?
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Same sum as above, plus a volatile store of 9 into element 5 of a 16 x i32
; alloca, so the callee itself has a stack object.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Simplest case: forwards %a and %b to i32_fastcc_i32_i32 in a tail call and
; returns its result; %c is unused. Only the label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; The caller has a stack object (volatile store of 9 into a 16 x i32 alloca)
; before the tail call; the callee has none.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Both the caller and the tail-called callee (i32_fastcc_i32_i32_stack_object)
; have a stack object; the expected ScratchSize is 136 here versus 68 for the
; caller-only variant above.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
  ret i32 %ret
}

; A void caller tail-calls the i32 callee and discards the result. Only the
; label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; It doesn't make sense to do a tail call from a kernel. Only the label is
; checked for this function.
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; Callee whose second argument is passed byval: loads the i32 behind %arg1 and
; adds it to %arg0.
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1

; GCN-NEXT: s_setpc_b64 s[30:31]
define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 {
  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
  %add0 = add i32 %arg0, %arg1.load
  ret i32 %add0
}

; Tail call disallowed with byval in parent. The checks expect s_swappc_b64
; followed by s_setpc_b64 rather than a lone s_setpc_b64.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval %b.byval, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* %b.byval)
  ret i32 %ret
}

; A byval argument in the parent disallows the tail call (above); a byval
; argument only in the callee, as in the next function, does not.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
entry:
  ; The parent has no byval parameter of its own; the pointer handed to the
  ; callee's byval parameter is the constant address 16 (via inttoptr).
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
  ret i32 %ret
}

; Callee with a large aggregate parameter: returns
; %arg0 + %arg1 + %large[30] + %large[31].
; NOTE(review): the value names suggest elements 30 and 31 are the first and
; last pieces of %large passed on the stack -- confirm against the fastcc
; argument assignment.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_0]], v0
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_1]], v0


; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]]
; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]]

; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
  %val_firststack = extractvalue [32 x i32] %large, 30
  %val_laststack = extractvalue [32 x i32] %large, 31
  %add0 = add i32 %arg0, %arg1
  %add1 = add i32 %add0, %val_firststack
  %add2 = add i32 %add1, %val_laststack
  ret i32 %add2
}

; FIXME: Why load and store same location for stack args?
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32

; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill

; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8

; GCN-NOT: s32

; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8

; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  ; All three arguments, including the [32 x i32] aggregate, are forwarded
  ; unchanged to a callee with the same parameter list.
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Same forwarding tail call, but the caller also has a stack object (volatile
; store of 9 into element 5 of a 16 x i32 alloca).
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; If the callee requires more stack argument space than the caller,
; don't do a tail call.
; TODO: Do we really need this restriction?
; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
entry:
  ; The caller takes only two i32 parameters but passes an extra
  ; zero-initialized [32 x i32] aggregate to the callee.
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Have another non-tail call in the function. The first call is marked "tail"
; but its result is an operand of the second call, so only the second call is
; in tail position; the checks expect s_swappc_b64 for the first and a final
; s_setpc_b64 s[6:7] to sibling_call_i32_fastcc_i32_i32 for the second.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
; GCN-DAG: v_writelane_b32 v34, s35, 2
; GCN-DAG: s_add_u32 s32, s32, 0x400

; GCN-DAG: s_getpc_b64
; GCN: s_swappc_b64

; GCN: s_getpc_b64 s[6:7]
; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4

; GCN-DAG: v_readlane_b32 s33, v34, 0
; GCN-DAG: v_readlane_b32 s34, v34, 1
; GCN-DAG: v_readlane_b32 s35, v34, 2

; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
  ret i32 %ret
}

; Have stack object in caller and stack passed arguments. SP should be
; in same place at function exit.
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  ; Caller-owned stack object: volatile store of 9 into element 5 of a
  ; 16 x i32 alloca.
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  ; The incoming aggregate is forwarded unchanged to the callee.
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Variant where the caller's aggregate parameter ([36 x i32]) is larger than
; the callee's ([32 x i32]); the callee is passed a zero-initialized aggregate
; instead of %c.
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Every function in this test uses #1; noinline keeps the callees as separate
; functions. #0 appears to be unreferenced.
attributes #0 = { nounwind }
attributes #1 = { nounwind noinline }