; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s

; Exercises -amdgpu-promote-alloca on kernels that differ only in array size
; and in their work-group-size / waves-per-eu attributes. Every kernel stores
; to a small private array at two indices loaded from %in and then copies
; elements 0 and 1 of that array to %out. The directives in front of each
; kernel state, per invocation, whether a "<kernel>.stack" addrspace(3) global
; is expected or whether the alloca is expected to remain.

; Work-group size limited to 63 (attribute group #0). The addrspace(3) global
; is expected only in the invocation that passes -mcpu=tonga.
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Flat work-group size 256,256 with waves-per-eu 1,3 (attribute group #1).
; A 256-entry addrspace(3) global is expected in both invocations.
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Flat work-group size 1600,1600 with waves-per-eu 1,1 (attribute group #2).
; A 1600-entry addrspace(3) global is expected in both invocations.
; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; waves-per-eu 1,10 and no explicit work-group size (attribute group #3).
; The alloca is expected to disappear with -mcpu=tonga and to remain otherwise.
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Same body and same expectations as occupancy_0, using attribute group #4.
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; [42 x i8] with waves-per-eu 1,6 and flat work-group size 64,64 (group #5).
; The alloca is expected to disappear with -mcpu=tonga and to remain otherwise.
; ALL-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [42 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; One byte larger than occupancy_6 under the same attributes (group #5).
; The alloca is expected to remain in both invocations.
; ALL-LABEL: @occupancy_6_over(
; ALL: alloca [43 x i8]
define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [43 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; [32 x i8] with waves-per-eu 1,8 and flat work-group size 64,64 (group #6).
; The alloca is expected to disappear with -mcpu=tonga and to remain otherwise.
; ALL-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [32 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; One byte larger than occupancy_8 under the same attributes (group #6).
; The alloca is expected to remain in both invocations.
; ALL-LABEL: @occupancy_8_over(
; ALL: alloca [33 x i8]
define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [33 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; [28 x i8] with waves-per-eu 1,9 and flat work-group size 64,64 (group #7).
; The alloca is expected to disappear with -mcpu=tonga and to remain otherwise.
; ALL-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [28 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; One byte larger than occupancy_9 under the same attributes (group #7).
; The alloca is expected to remain in both invocations.
; ALL-LABEL: @occupancy_9_over(
; ALL: alloca [29 x i8]
define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [29 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; NOTE(review) - group #0 spells its limit "amdgpu-max-work-group-size" while
; every other group uses "amdgpu-flat-work-group-size"; confirm the mixed
; spelling is intentional before normalising it.
; NOTE(review) - groups #3 and #4 are identical although they belong to
; occupancy_0 and occupancy_max respectively; confirm that is intended.
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }