; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; The opt invocations above run only -codegenprepare and are verified with the
; OPT family of prefixes. The llc invocations run the full backend and are
; verified with GCN plus one per-target prefix. Targets covered are tahiti,
; bonaire, tonga and gfx900.
;
; Most kernels in this file share one skeleton. A getelementptr is formed in
; the entry block, entry branches on whether the mbcnt.lo result equals zero,
; and the computed address is only dereferenced in a successor block. The
; check lines then state whether the address computation is expected to stay
; in entry or to be rebuilt next to its user.

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"

; Global i32 load at constant element index 7. For bonaire the i32 GEP of %in
; must be gone before the branch and an i8 GEP must follow it. For tonga the
; i32 GEP of %in is still expected ahead of the branch.
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
; OPT: br i1
; OPT-CI: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 65535. The GEP is expected to remain in entry
; ahead of the branch, and the selected load carries no immediate offset.
; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 4095. The selected load is expected to encode
; the offset as an immediate (offset:4095) on every checked target.
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset 4096, one past the offset used by the previous
; kernel. The selected load is expected to carry no immediate offset.
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^BB[0-9]+}}_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_scratch_small_offset_i32(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:
; Volatile store and load through a GEP at element index 1022 of a
; [512 x i32] addrspace(5) alloca. The address is formed in entry and used in
; both %if and %endif.
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Same kernel shape with element index 1023. The expected accesses use a
; vector base register holding 4 together with offen and offset:4092.
; This ends up not fitting due to the reserved 4 bytes at offset 0
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:

define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Same kernel shape with element index 1024. The original GEP is expected to
; stay in entry, no ptrtoint may appear after the branch, and the selected
; accesses use offen without an immediate offset.
; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
; OPT: br i1
; OPT-NOT: ptrtoint

; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, i32 addrspace(5)* %alloca.gep
  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep.0
  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
  store i32 %load, i32 addrspace(1)* %out.gep.1
  br label %done

done:
  ret void
}

; Global i32 load whose element index is a zero-extended kernel argument
; rather than a constant. Only the bonaire and tonga llc runs check the load.
; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
  %offset.ext = zext i32 %offset to i64
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(1)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 7. No i32 GEP may remain ahead of the
; branch, and tahiti is expected to select s_load_dword with immediate 0x7.
; OPT-LABEL: @test_sink_constant_small_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 255. tahiti is expected to encode it
; as immediate 0xff.
; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32(
; OPT-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 256. tahiti keeps the i32 GEP ahead
; of the branch and is expected to materialize 0x400 in a scalar register,
; while bonaire and tonga must not keep the i32 GEP there.
; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400

; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 4294967295. The expected code adds
; the offset to the base with an s_add_u32 / s_addc_u32 pair.
; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 17179869181. Every opt run is
; expected to keep the i32 GEP ahead of the branch.
; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32(
; OPT: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 262143. tahiti is expected to move
; 0xffffc into a scalar register, while bonaire and tonga encode an immediate
; (0x3ffff and 0xffffc respectively).
; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; addrspace(4) i32 load at element index 262144. tahiti and tonga are expected
; to move 0x100000 into a scalar register, while bonaire still encodes an
; immediate (0x40000).
; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32(
; OPT-SI: getelementptr i32, i32 addrspace(4)*
; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
; OPT-VI: getelementptr i32, i32 addrspace(4)*
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}

; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, i32 addrspace(4)* %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

%struct.foo = type { [3 x float], [3 x float] }

; Two inbounds GEPs into the second array of %struct.foo (elements 0 and 2)
; are formed in entry and loaded in bb32. The expected output is one
; ds_read2_b32 with offset0:3 and offset1:5 on the incoming pointer.
; OPT-LABEL: @sink_ds_address(
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}sink_ds_address:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
  br label %bb32

bb32:
  %a = load float, float addrspace(3)* %x, align 4
  %b = load float, float addrspace(3)* %y, align 4
  %cmp = fcmp one float %a, %b
  br i1 %cmp, label %bb34, label %bb33

bb33:
  unreachable

bb34:
  unreachable
}

; Address offset is not a multiple of 4. This is a valid mubuf offset,
; but not smrd.
; An i8 GEP at byte offset 4095 on an addrspace(4) pointer is bitcast to an
; i32 pointer and loaded with align 1 inside %if. The checks expect an i8 GEP
; with offset 4095 to appear inside the if block, after the branch.
; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
; OPT: br i1 %tmp0,
; OPT: if:
; OPT: getelementptr i8, {{.*}} 4095
define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; The addrspace(3) GEP at element index 7 feeds the pointer operand of an
; atomicrmw in %if. The checks expect it to be rebuilt as an i8 GEP of 28
; bytes named %sunkaddr directly feeding the atomicrmw.
; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Same as the atomicrmw kernel, with the GEP feeding the pointer operand of a
; cmpxchg.
; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Here the GEP result is the compare-value operand of the cmpxchg, not its
; pointer operand, so it is not an address use. The GEP is expected to stay
; in entry and the cmpxchg to keep referring to %in.gep.
; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
; OPT: br i1
; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; The GEP feeds the pointer argument of the llvm.amdgcn.atomic.inc intrinsic.
; It is expected to be rebuilt as %sunkaddr next to the call.
; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Same as the atomic.inc kernel, using the llvm.amdgcn.atomic.dec intrinsic.
; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(3)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset -4096. tahiti, bonaire and tonga are expected
; to leave the GEP in entry, while gfx900 rebuilds it after the branch and
; selects a global load with offset:-4096.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-SICIVI: br
; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep

; OPT-GFX9: br
; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

; Global i8 load at byte offset -4097, one below the offset used by the
; previous kernel. Every opt run is expected to leave the GEP in entry.
; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
; OPT: br
; OPT: load i8, i8 addrspace(1)* %in.gep

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, i8 addrspace(1)* %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, i32 addrspace(1)* %out.gep
  br label %done

done:
  ret void
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind argmemonly }