; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s

; If spilling to smem, additional registers are used for the resource
; descriptor.

; FIXME: Vectorization can increase required SGPR count beyond limit.
; FIXME: SGPR-to-SMEM spilling always requires an additional SGPR to scavenge m0.

; ALL-LABEL: {{^}}max_9_sgprs:

; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 9
define amdgpu_kernel void @max_9_sgprs() #0 {
  %one = load volatile i32, i32 addrspace(4)* undef
  %two = load volatile i32, i32 addrspace(4)* undef
  %three = load volatile i32, i32 addrspace(4)* undef
  %four = load volatile i32, i32 addrspace(4)* undef
  %five = load volatile i32, i32 addrspace(4)* undef
  %six = load volatile i32, i32 addrspace(4)* undef
  %seven = load volatile i32, i32 addrspace(4)* undef
  %eight = load volatile i32, i32 addrspace(4)* undef
  %nine = load volatile i32, i32 addrspace(4)* undef
  %ten = load volatile i32, i32 addrspace(4)* undef
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
  store volatile i32 %one, i32 addrspace(1)* undef
  store volatile i32 %two, i32 addrspace(1)* undef
  store volatile i32 %three, i32 addrspace(1)* undef
  store volatile i32 %four, i32 addrspace(1)* undef
  store volatile i32 %five, i32 addrspace(1)* undef
  store volatile i32 %six, i32 addrspace(1)* undef
  store volatile i32 %seven, i32 addrspace(1)* undef
  store volatile i32 %eight, i32 addrspace(1)* undef
  store volatile i32 %nine, i32 addrspace(1)* undef
  store volatile i32 %ten, i32 addrspace(1)* undef
  ret void
}

; private resource:       4
; scratch wave offset:    1
; workgroup ids:          3
; dispatch id:            2
; queue ptr:              2
; flat scratch init:      2
; ---------------------
; total:                 14

; + reserved vcc (2 SGPRs) = 16

; Because we can't handle re-using the last few input registers as the
; special vcc etc. registers (and cannot decide to drop the unused
; features once the number of registers is frozen), this ends up using
; more SGPRs than expected.

; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; XTOSGPR: SGPRBlocks: 1
; XTOSGPR: NumSGPRsForWavesPerEU: 16

; XTOSMEM: s_mov_b64 s[10:11], s[2:3]
; XTOSMEM: s_mov_b64 s[8:9], s[0:1]
; XTOSMEM: s_mov_b32 s7, s13

; XTOSMEM: SGPRBlocks: 1
; XTOSMEM: NumSGPRsForWavesPerEU: 16
;
; This test case is disabled: when calculating the spill slot addresses, AMDGPU
; creates an extra vreg to save/restore m0, which at a point of maximum register
; pressure would trigger an endless loop; in practice the compiler aborts earlier
; with "Incomplete scavenging after 2nd pass".
;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
;                                                       i32 addrspace(1)* %out2,
;                                                       i32 addrspace(1)* %out3,
;                                                       i32 addrspace(1)* %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
;  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
;  store volatile i32 0, i32* undef
;  br label %stores
;
;stores:
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
;  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

; The following test is commented out for now; http://llvm.org/PR31230
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure copies for input buffer are not clobbered. This requires
; ; swapping the order the registers are copied from what normally
; ; happens.

; XTOSMEM: s_mov_b32 s5, s11
; XTOSMEM: s_add_u32 m0, s5,
; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;                                                       i32 addrspace(1)* %out2,
;                                                       i32 addrspace(1)* %out3,
;                                                       i32 addrspace(1)* %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, i32* undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1

attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }