Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
      2 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
      3 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
      4 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
      5 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
      6 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
      7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
      8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
      9 
     10 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
     11 
     12 ; OPT-LABEL: @test_sink_global_small_offset_i32(
     13 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
     14 ; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
     15 ; OPT: br i1
     16 ; OPT-CI: getelementptr i8,
     17 
     18 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
        ; Loads an i32 from global memory (addrspace 1) at %in + 7 elements. The
        ; address is formed in %entry, but its only user is the load in %if. Per the
        ; checks above, the bonaire opt run must show no i32 GEP on %in before the
        ; branch and an i8 GEP after it, while the tonga opt run keeps the i32 GEP.
     19 define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
     20 entry:
     21   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
     22   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 ; 7 x i32 = 28 bytes
     23   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 ; NOTE(review): presumably keeps the branch condition non-constant -- confirm
     24   %tmp0 = icmp eq i32 %tid, 0
     25   br i1 %tmp0, label %endif, label %if
     26 
     27 if:
     28   %tmp1 = load i32, i32 addrspace(1)* %in.gep ; only user of %in.gep
     29   br label %endif
     30 
     31 endif:
     32   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
     33   store i32 %x, i32 addrspace(1)* %out.gep
     34   br label %done
     35 
     36 done:
     37   ret void
     38 }
     39 
     40 ; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
     41 ; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
     42 ; OPT: br i1
     43 
     44 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
     45 ; GCN: s_and_saveexec_b64
     46 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
     47 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
     48 ; GCN: {{^BB[0-9]+}}_2:
     49 ; GCN: s_or_b64 exec
        ; Sign-extending i8 load from global memory at a constant byte offset of
        ; 65535. The opt checks expect %in.gep to stay in %entry, ahead of the
        ; branch. In the llc checks the tahiti/bonaire/tonga runs take the offset
        ; from an SGPR and the gfx900 run uses no immediate offset. The block label
        ; is matched with a regex so it does not depend on this function's position
        ; in the file.
     50 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
     51 entry:
     52   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
     53   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
     54   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
     55   %tmp0 = icmp eq i32 %tid, 0
     56   br i1 %tmp0, label %endif, label %if
     57 
     58 if:
     59   %tmp1 = load i8, i8 addrspace(1)* %in.gep ; only user of %in.gep
     60   %tmp2 = sext i8 %tmp1 to i32
     61   br label %endif
     62 
     63 endif:
     64   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] ; 0 when the load was skipped
     65   store i32 %x, i32 addrspace(1)* %out.gep
     66   br label %done
     67 
     68 done:
     69   ret void
     70 }
     71 
     72 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
     73 ; GCN: s_and_saveexec_b64
     74 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
     75 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
     76 ; GCN: {{^BB[0-9]+}}_2:
     77 ; GCN: s_or_b64 exec
        ; Sign-extending i8 load from global memory at a constant byte offset of
        ; 4095. Only llc output is checked: every run expects 4095 to appear in the
        ; load's immediate offset field. The block label is matched with a regex so
        ; it does not depend on this function's position in the file.
     78 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
     79 entry:
     80   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
     81   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
     82   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
     83   %tmp0 = icmp eq i32 %tid, 0
     84   br i1 %tmp0, label %endif, label %if
     85 
     86 if:
     87   %tmp1 = load i8, i8 addrspace(1)* %in.gep ; only user of %in.gep
     88   %tmp2 = sext i8 %tmp1 to i32
     89   br label %endif
     90 
     91 endif:
     92   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] ; 0 when the load was skipped
     93   store i32 %x, i32 addrspace(1)* %out.gep
     94   br label %done
     95 
     96 done:
     97   ret void
     98 }
     99 
    100 ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
    101 ; GCN: s_and_saveexec_b64
    102 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
    103 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
    104 ; GCN: {{^BB[0-9]+}}_2:
    105 ; GCN: s_or_b64 exec
        ; Sign-extending i8 load from global memory at a constant byte offset of
        ; 4096, one more than the 4095 case. Only llc output is checked: the
        ; tahiti/bonaire/tonga runs expect the offset in an SGPR and the gfx900 run
        ; expects no immediate offset. The block label is matched with a regex so it
        ; does not depend on this function's position in the file.
    106 define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
    107 entry:
    108   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
    109   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
    110   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    111   %tmp0 = icmp eq i32 %tid, 0
    112   br i1 %tmp0, label %endif, label %if
    113 
    114 if:
    115   %tmp1 = load i8, i8 addrspace(1)* %in.gep ; only user of %in.gep
    116   %tmp2 = sext i8 %tmp1 to i32
    117   br label %endif
    118 
    119 endif:
    120   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] ; 0 when the load was skipped
    121   store i32 %x, i32 addrspace(1)* %out.gep
    122   br label %done
    123 
    124 done:
    125   ret void
    126 }
    127 
    128 ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
    129 ; OPT-NOT:  getelementptr [512 x i32]
    130 ; OPT: br i1
    131 ; OPT: getelementptr i8,
    132 
    133 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
    134 ; GCN: s_and_saveexec_b64
    135 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
    136 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
    137 ; GCN: {{^BB[0-9]+}}_2:
        ; Volatile store and load through a GEP into a private (addrspace 5) alloca
        ; at element index 1022. The opt checks expect no [512 x i32] GEP before the
        ; branch and an i8 GEP after it. The llc checks expect both accesses to use
        ; an immediate offset of 4092. NOTE(review): index 1022 is byte 4088 of the
        ; alloca; the extra 4 presumably comes from the reserved bytes mentioned for
        ; the _reserved variant of this test -- confirm. The block label is matched
        ; with a regex so it does not depend on this function's position in the file.
    138 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
    139 entry:
    140   %alloca = alloca [512 x i32], align 4, addrspace(5)
    141   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
    142   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    143   %add.arg = add i32 %arg, 8 ; result is not used in this function
    144   %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
    145   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    146   %tmp0 = icmp eq i32 %tid, 0
    147   br i1 %tmp0, label %endif, label %if
    148 
    149 if:
    150   store volatile i32 123, i32 addrspace(5)* %alloca.gep
    151   %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
    152   br label %endif
    153 
    154 endif:
    155   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
    156   store i32 %x, i32 addrspace(1)* %out.gep.0
    157   %load = load volatile i32, i32 addrspace(5)* %alloca.gep ; second user of the address, outside %if
    158   store i32 %load, i32 addrspace(1)* %out.gep.1
    159   br label %done
    160 
    161 done:
    162   ret void
    163 }
    164 
    165 ; This ends up not fitting due to the reserved 4 bytes at offset 0
    166 ; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
    167 ; OPT-NOT:  getelementptr [512 x i32]
    168 ; OPT: br i1
    169 ; OPT: getelementptr i8,
    170 
    171 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
    172 ; GCN: s_and_saveexec_b64
    173 ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
    174 ; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
    175 ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
    176 ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
    177 ; GCN: {{^BB[0-9]+}}_2:
    178 
        ; Same kernel as the index-1022 scratch test, but at element index 1023
        ; (byte 4092 of the alloca). The opt checks still expect the GEP to be
        ; re-formed as an i8 GEP after the branch. The llc checks expect the
        ; accesses to use a VGPR base holding 4 plus an immediate offset of 4092.
    179 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
    180 entry:
    181   %alloca = alloca [512 x i32], align 4, addrspace(5)
    182   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
    183   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    184   %add.arg = add i32 %arg, 8 ; result is not used in this function
    185   %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
    186   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    187   %tmp0 = icmp eq i32 %tid, 0
    188   br i1 %tmp0, label %endif, label %if
    189 
    190 if:
    191   store volatile i32 123, i32 addrspace(5)* %alloca.gep
    192   %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
    193   br label %endif
    194 
    195 endif:
    196   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
    197   store i32 %x, i32 addrspace(1)* %out.gep.0
    198   %load = load volatile i32, i32 addrspace(5)* %alloca.gep ; second user of the address, outside %if
    199   store i32 %load, i32 addrspace(1)* %out.gep.1
    200   br label %done
    201 
    202 done:
    203   ret void
    204 }
    205 
    206 ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
    207 ; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
    208 ; OPT: br i1
    209 ; OPT-NOT: ptrtoint
    210 
    211 ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
    212 ; GCN: s_and_saveexec_b64
    213 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    214 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    215 ; GCN: {{^BB[0-9]+}}_2:
        ; Negative case for the scratch tests: element index 1024 (byte 4096 of the
        ; alloca). The opt checks expect the original GEP to stay in %entry and no
        ; ptrtoint to appear after the branch. The llc checks expect the accesses to
        ; take the whole offset from a VGPR, with no immediate offset.
    216 define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
    217 entry:
    218   %alloca = alloca [512 x i32], align 4, addrspace(5)
    219   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
    220   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    221   %add.arg = add i32 %arg, 8 ; result is not used in this function
    222   %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
    223   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    224   %tmp0 = icmp eq i32 %tid, 0
    225   br i1 %tmp0, label %endif, label %if
    226 
    227 if:
    228   store volatile i32 123, i32 addrspace(5)* %alloca.gep
    229   %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
    230   br label %endif
    231 
    232 endif:
    233   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
    234   store i32 %x, i32 addrspace(1)* %out.gep.0
    235   %load = load volatile i32, i32 addrspace(5)* %alloca.gep ; second user of the address, outside %if
    236   store i32 %load, i32 addrspace(1)* %out.gep.1
    237   br label %done
    238 
    239 done:
    240   ret void
    241 }
    242 
    243 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
    244 ; GCN: s_and_saveexec_b64
    245 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    246 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
    247 ; GCN: {{^BB[0-9]+}}_2:
        ; Global i32 load whose index is a kernel argument (%offset, zero-extended
        ; to i64) rather than a constant. Only llc output is checked: the bonaire
        ; run expects an addr64 buffer load and the tonga run expects a flat load.
    248 define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
    249 entry:
    250   %offset.ext = zext i32 %offset to i64
    251   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    252   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext ; variable index
    253   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    254   %tmp0 = icmp eq i32 %tid, 0
    255   br i1 %tmp0, label %endif, label %if
    256 
    257 if:
    258   %tmp1 = load i32, i32 addrspace(1)* %in.gep ; only user of %in.gep
    259   br label %endif
    260 
    261 endif:
    262   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    263   store i32 %x, i32 addrspace(1)* %out.gep
    264   br label %done
    265 
    266 done:
    267   ret void
    268 }
    269 
    270 ; OPT-LABEL: @test_sink_constant_small_offset_i32(
    271 ; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
    272 ; OPT: br i1
    273 
    274 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
    275 ; GCN: s_and_saveexec_b64
    276 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
    277 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space (addrspace 4) at %in + 7
        ; elements. The opt checks expect no i32 GEP on an addrspace(4) pointer
        ; before the branch. The tahiti llc check expects the scalar load to carry
        ; 0x7 as its immediate offset.
    278 define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    279 entry:
    280   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    281   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
    282   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    283   %tmp0 = icmp eq i32 %tid, 0
    284   br i1 %tmp0, label %endif, label %if
    285 
    286 if:
    287   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    288   br label %endif
    289 
    290 endif:
    291   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    292   store i32 %x, i32 addrspace(1)* %out.gep
    293   br label %done
    294 
    295 done:
    296   ret void
    297 }
    298 
    299 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32(
    300 ; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
    301 ; OPT: br i1
    302 
    303 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
    304 ; GCN: s_and_saveexec_b64
    305 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
    306 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 255 elements. The opt
        ; checks expect no i32 GEP on an addrspace(4) pointer before the branch.
        ; The tahiti llc check expects the scalar load to carry 0xff as its
        ; immediate offset.
    307 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    308 entry:
    309   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    310   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
    311   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    312   %tmp0 = icmp eq i32 %tid, 0
    313   br i1 %tmp0, label %endif, label %if
    314 
    315 if:
    316   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    317   br label %endif
    318 
    319 endif:
    320   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    321   store i32 %x, i32 addrspace(1)* %out.gep
    322   br label %done
    323 
    324 done:
    325   ret void
    326 }
    327 
    328 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32(
    329 ; OPT-SI:  getelementptr i32, i32 addrspace(4)*
    330 ; OPT-CI-NOT:  getelementptr i32, i32 addrspace(4)*
    331 ; OPT-VI-NOT:  getelementptr i32, i32 addrspace(4)*
    332 ; OPT: br i1
    333 
    334 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
    335 ; GCN: s_and_saveexec_b64
    336 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
    337 
    338 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
    339 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 256 elements, one more
        ; than the 255 case. The opt checks expect the tahiti run to keep the i32
        ; GEP before the branch, and the bonaire and tonga runs to have none there.
        ; The tahiti llc checks expect the offset 0x400 (256 x 4 bytes) to be
        ; materialised in an SGPR and used as the scalar load's offset.
    340 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    341 entry:
    342   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    343   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
    344   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    345   %tmp0 = icmp eq i32 %tid, 0
    346   br i1 %tmp0, label %endif, label %if
    347 
    348 if:
    349   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    350   br label %endif
    351 
    352 endif:
    353   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    354   store i32 %x, i32 addrspace(1)* %out.gep
    355   br label %done
    356 
    357 done:
    358   ret void
    359 }
    360 
    361 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32(
    362 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
    363 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
    364 ; OPT: br i1
    365 
    366 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
    367 ; GCN: s_and_saveexec_b64
    368 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
    369 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
    370 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
    371 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 4294967295 elements.
        ; That is 0x3FFFFFFFC bytes, which matches the expected 64-bit add of low
        ; word -4 and high word 3. The opt checks expect the tahiti run to keep the
        ; i32 GEP before the branch and the bonaire run to have none there.
    372 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    373 entry:
    374   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    375   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
    376   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    377   %tmp0 = icmp eq i32 %tid, 0
    378   br i1 %tmp0, label %endif, label %if
    379 
    380 if:
    381   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    382   br label %endif
    383 
    384 endif:
    385   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    386   store i32 %x, i32 addrspace(1)* %out.gep
    387   br label %done
    388 
    389 done:
    390   ret void
    391 }
    392 
    393 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32(
    394 ; OPT: getelementptr i32, i32 addrspace(4)*
    395 ; OPT: br i1
    396 
    397 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
    398 ; GCN: s_and_saveexec_b64
    399 ; GCN: s_add_u32
    400 ; GCN: s_addc_u32
    401 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
    402 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 17179869181 elements.
        ; The opt checks expect every run to keep the i32 GEP before the branch.
        ; The llc checks expect the address to be formed with a 64-bit scalar add.
    403 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    404 entry:
    405   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    406   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
    407   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    408   %tmp0 = icmp eq i32 %tid, 0
    409   br i1 %tmp0, label %endif, label %if
    410 
    411 if:
    412   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    413   br label %endif
    414 
    415 endif:
    416   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    417   store i32 %x, i32 addrspace(1)* %out.gep
    418   br label %done
    419 
    420 done:
    421   ret void
    422 }
    423 
    424 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
    425 ; GCN: s_and_saveexec_b64
    426 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
    427 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
    428 
    429 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
    430 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
    431 
    432 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 262143 elements, which
        ; is 0xffffc bytes. Only llc output is checked. The tahiti run expects the
        ; byte offset in an SGPR. The bonaire run expects the immediate 0x3ffff
        ; (the element index) and the tonga run expects the immediate 0xffffc (the
        ; byte offset).
    433 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    434 entry:
    435   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    436   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
    437   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    438   %tmp0 = icmp eq i32 %tid, 0
    439   br i1 %tmp0, label %endif, label %if
    440 
    441 if:
    442   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    443   br label %endif
    444 
    445 endif:
    446   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    447   store i32 %x, i32 addrspace(1)* %out.gep
    448   br label %done
    449 
    450 done:
    451   ret void
    452 }
    453 
    454 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32(
    455 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
    456 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
    457 ; OPT-VI: getelementptr i32, i32 addrspace(4)*
    458 ; OPT: br i1
    459 
    460 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
    461 ; GCN: s_and_saveexec_b64
    462 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
    463 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
    464 
    465 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}
    466 
    467 ; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
    468 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
    469 
    470 ; GCN: s_or_b64 exec, exec
        ; i32 load from the constant address space at %in + 262144 elements, which
        ; is 0x100000 bytes (one element past the 262143 case). The opt checks
        ; expect the tahiti and tonga runs to keep the i32 GEP before the branch and
        ; the bonaire run to have none there. In the llc checks, tahiti and tonga
        ; put 0x100000 in an SGPR while bonaire uses the immediate 0x40000.
    471 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
    472 entry:
    473   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
    474   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
    475   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    476   %tmp0 = icmp eq i32 %tid, 0
    477   br i1 %tmp0, label %endif, label %if
    478 
    479 if:
    480   %tmp1 = load i32, i32 addrspace(4)* %in.gep ; only user of %in.gep
    481   br label %endif
    482 
    483 endif:
    484   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    485   store i32 %x, i32 addrspace(1)* %out.gep
    486   br label %done
    487 
    488 done:
    489   ret void
    490 }
    491 
    492 %struct.foo = type { [3 x float], [3 x float] }
    493 
    494 ; OPT-LABEL: @sink_ds_address(
    495 ; OPT: getelementptr i8,
    496 
    497 ; GCN-LABEL: {{^}}sink_ds_address:
    498 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
    499 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
    500 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
        ; Two float loads from a local (addrspace 3) %struct.foo, at elements 0 and
        ; 2 of the second array member. Those are floats 3 and 5 counted from the
        ; start of the struct, matching the offset0:3 / offset1:5 pair expected on
        ; the single ds_read2_b32. The GEPs are formed in %entry and used in %bb32;
        ; the opt check expects an i8 GEP in the output.
    501 define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
    502 entry:
    503   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
    504   %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
    505   br label %bb32
    506 
    507 bb32:
    508   %a = load float, float addrspace(3)* %x, align 4
    509   %b = load float, float addrspace(3)* %y, align 4
    510   %cmp = fcmp one float %a, %b
    511   br i1 %cmp, label %bb34, label %bb33
    512 
    513 bb33:
    514   unreachable
    515 
    516 bb34:
    517   unreachable
    518 }
    519 
    520 ; Address offset is not a multiple of 4. This is a valid mubuf offset,
    521 ; but not smrd.
    522 
    523 ; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
    524 ; OPT: br i1 %tmp0,
    525 ; OPT: if:
    526 ; OPT: getelementptr i8, {{.*}} 4095
        ; i32 load with align 1 from the constant address space, through an i8 GEP
        ; at byte offset 4095 that is bitcast to an i32 pointer. Only opt output is
        ; checked: an i8 GEP with offset 4095 is expected inside %if, after the
        ; branch.
    527 define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
    528 entry:
    529   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
    530   %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
    531   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    532   %tmp0 = icmp eq i32 %tid, 0
    533   br i1 %tmp0, label %endif, label %if
    534 
    535 if:
    536   %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
    537   %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1 ; under-aligned i32 load at an odd byte offset
    538   br label %endif
    539 
    540 endif:
    541   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the load was skipped
    542   store i32 %x, i32 addrspace(1)* %out.gep
    543   br label %done
    544 
    545 done:
    546   ret void
    547 }
    548 
    549 ; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
    550 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
    551 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
    552 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
    553 ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
        ; atomicrmw add on local memory (addrspace 3) at %in + 7 elements. Only opt
        ; output is checked: the address is expected to be rebuilt next to the
        ; atomic as an i8 GEP of 28 bytes (%sunkaddr) between two bitcasts.
    554 define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
    555 entry:
    556   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
    557   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; 7 x i32 = 28 bytes
    558   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    559   %tmp0 = icmp eq i32 %tid, 0
    560   br i1 %tmp0, label %endif, label %if
    561 
    562 if:
    563   %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst ; %in.gep is the pointer operand
    564   br label %endif
    565 
    566 endif:
    567   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the atomic was skipped
    568   store i32 %x, i32 addrspace(3)* %out.gep
    569   br label %done
    570 
    571 done:
    572   ret void
    573 }
    574 
    575 ; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
    576 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
    577 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
    578 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
    579 ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
        ; cmpxchg on local memory (addrspace 3) whose pointer operand is %in + 7
        ; elements. Only opt output is checked: the address is expected to be
        ; rebuilt next to the cmpxchg as an i8 GEP of 28 bytes (%sunkaddr) between
        ; two bitcasts.
    580 define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
    581 entry:
    582   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
    583   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; 7 x i32 = 28 bytes
    584   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    585   %tmp0 = icmp eq i32 %tid, 0
    586   br i1 %tmp0, label %endif, label %if
    587 
    588 if:
    589   %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic ; %in.gep is the pointer operand
    590   %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0 ; field 0 of the cmpxchg result
    591   br label %endif
    592 
    593 endif:
    594   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the cmpxchg was skipped
    595   store i32 %x, i32 addrspace(3)* %out.gep
    596   br label %done
    597 
    598 done:
    599   ret void
    600 }
    601 
    602 ; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
    603 ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
    604 ; OPT: br i1
    605 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
        ; Negative case: %in.gep is passed to cmpxchg as the compare value, not as
        ; the pointer operand (the pointer is undef). The opt checks expect the GEP
        ; to stay in %entry, ahead of the branch, and the cmpxchg to keep using
        ; %in.gep unchanged.
    606 define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
    607 entry:
    608   %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
    609   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
    610   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    611   %tmp0 = icmp eq i32 %tid, 0
    612   br i1 %tmp0, label %endif, label %if
    613 
    614 if:
    615   %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic ; %in.gep is a value operand here
    616   %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
    617   br label %endif
    618 
    619 endif:
    620   %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ] ; null when the cmpxchg was skipped
    621   store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
    622   br label %done
    623 
    624 done:
    625   ret void
    626 }
    627 
    628 ; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
    629 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
    630 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
    631 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
    632 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
        ; Call to the llvm.amdgcn.atomic.inc intrinsic on local memory (addrspace
        ; 3) with %in + 7 elements as its first argument. Only opt output is
        ; checked: the address is expected to be rebuilt next to the call as an i8
        ; GEP of 28 bytes (%sunkaddr) between two bitcasts.
    633 define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
    634 entry:
    635   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
    636   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; 7 x i32 = 28 bytes
    637   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    638   %tmp0 = icmp eq i32 %tid, 0
    639   br i1 %tmp0, label %endif, label %if
    640 
    641 if:
    642   %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
    643   br label %endif
    644 
    645 endif:
    646   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the call was skipped
    647   store i32 %x, i32 addrspace(3)* %out.gep
    648   br label %done
    649 
    650 done:
    651   ret void
    652 }
    653 
    654 ; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
    655 ; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
    656 ; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
    657 ; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
    658 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
        ; Call to the llvm.amdgcn.atomic.dec intrinsic on local memory (addrspace
        ; 3) with %in + 7 elements as its first argument. Only opt output is
        ; checked: the address is expected to be rebuilt next to the call as an i8
        ; GEP of 28 bytes (%sunkaddr) between two bitcasts.
    659 define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
    660 entry:
    661   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
    662   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; 7 x i32 = 28 bytes
    663   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    664   %tmp0 = icmp eq i32 %tid, 0
    665   br i1 %tmp0, label %endif, label %if
    666 
    667 if:
    668   %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
    669   br label %endif
    670 
    671 endif:
    672   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] ; 0 when the call was skipped
    673   store i32 %x, i32 addrspace(3)* %out.gep
    674   br label %done
    675 
    676 done:
    677   ret void
    678 }
    679 
    680 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
    681 ; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
    682 ; OPT-SICIVI: br
    683 ; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
    684 
    685 ; OPT-GFX9: br
    686 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
    687 ; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
    688 
    689 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
    690 ; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
        ; Sign-extending i8 load from global memory at a constant byte offset of
        ; -4096. For the tahiti/bonaire/tonga opt runs the GEP must stay in %entry,
        ; ahead of the branch, with the load still using %in.gep. For the gfx900 opt
        ; run the address must be rebuilt after the branch as %sunkaddr, and the llc
        ; check expects -4096 as the load's immediate offset.
    691 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
    692 entry:
    693   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
    694   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
    695   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    696   %tmp0 = icmp eq i32 %tid, 0
    697   br i1 %tmp0, label %endif, label %if
    698 
    699 if:
    700   %tmp1 = load i8, i8 addrspace(1)* %in.gep ; only user of %in.gep
    701   %tmp2 = sext i8 %tmp1 to i32
    702   br label %endif
    703 
    704 endif:
    705   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] ; 0 when the load was skipped
    706   store i32 %x, i32 addrspace(1)* %out.gep
    707   br label %done
    708 
    709 done:
    710   ret void
    711 }
    712 
    713 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
    714 ; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
    715 ; OPT: br
    716 ; OPT: load i8, i8 addrspace(1)* %in.gep
    717 
    718 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
        ; Sign-extending i8 load from global memory at a constant byte offset of
        ; -4097, one below the -4096 case. The opt checks expect every run to keep
        ; the GEP in %entry, ahead of the branch, with the load still using
        ; %in.gep. No llc output is checked beyond the function label.
    719 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
    720 entry:
    721   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
    722   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
    723   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
    724   %tmp0 = icmp eq i32 %tid, 0
    725   br i1 %tmp0, label %endif, label %if
    726 
    727 if:
    728   %tmp1 = load i8, i8 addrspace(1)* %in.gep ; only user of %in.gep
    729   %tmp2 = sext i8 %tmp1 to i32
    730   br label %endif
    731 
    732 endif:
    733   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] ; 0 when the load was skipped
    734   store i32 %x, i32 addrspace(1)* %out.gep
    735   br label %done
    736 
    737 done:
    738   ret void
    739 }
    740 
    741 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 ; produces %tid, the branch condition input in the kernels above
    742 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 ; called by the atomic_inc kernel
    743 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 ; called by the atomic_dec kernel
    744 
    745 attributes #0 = { nounwind readnone } ; used by the mbcnt.lo declaration and its call sites
    746 attributes #1 = { nounwind } ; not referenced by any line visible in this file
    747 attributes #2 = { nounwind argmemonly } ; used by the two atomic intrinsic declarations
    748