Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
      2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
      3 
      4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
      5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
      6 
      7 ; Run on subtargets with different unaligned load restrictions.
      8 
      9 ; TODO: Vector element tests
     10 ; TODO: Non-zero base offset for load and store combinations
     11 ; TODO: Same base addrspacecasted
     12 
     13 
     14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
     15 ; GCN: buffer_store_byte
     16 ; GCN: buffer_store_byte
     17 ; GCN: s_endpgm
     18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
          ; Stores two i8 constants to adjacent bytes in addrspace(1): first to
          ; %out + 1, then to %out with an explicit align 2. The checks preceding
          ; this function expect two separate byte stores.
     19   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
     20 
     21   store i8 123, i8 addrspace(1)* %out.gep.1
          ; NOTE(review): 456 does not fit in i8; presumably the IR parser keeps
          ; only the low 8 bits (0xc8) -- confirm against the LLVM version in use.
     22   store i8 456, i8 addrspace(1)* %out, align 2
     23   ret void
     24 }
     25 
     26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
     27 ; GCN: buffer_store_byte
     28 ; GCN: buffer_store_byte
     29 ; GCN: s_endpgm
     30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
          ; Same store pattern as merge_global_store_2_constants_i8 but with no
          ; explicit alignment on either store. The checks preceding this function
          ; expect two separate byte stores.
     31   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
     32 
     33   store i8 123, i8 addrspace(1)* %out.gep.1
          ; NOTE(review): 456 does not fit in i8; presumably the IR parser keeps
          ; only the low 8 bits (0xc8) -- confirm against the LLVM version in use.
     34   store i8 456, i8 addrspace(1)* %out
     35   ret void
     36 }
     37 
     38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
     39 ; GCN: buffer_store_dword v
     40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
          ; Stores two i16 constants to adjacent elements in addrspace(1): element 1
          ; first, then element 0 with an explicit align 4. The check preceding this
          ; function expects a single dword store.
     41   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
     42 
     43   store i16 123, i16 addrspace(1)* %out.gep.1
     44   store i16 456, i16 addrspace(1)* %out, align 4
     45   ret void
     46 }
     47 
     48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
     49 ; GCN: buffer_store_dword v
     50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
          ; Variant of merge_global_store_2_constants_i16 in which both stored
          ; constants are zero. The check preceding this function expects a single
          ; dword store.
     51   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
     52 
     53   store i16 0, i16 addrspace(1)* %out.gep.1
     54   store i16 0, i16 addrspace(1)* %out, align 4
     55   ret void
     56 }
     57 
     58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
     59 ; GCN: buffer_store_short
     60 ; GCN: buffer_store_short
     61 ; GCN: s_endpgm
     62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
          ; Stores two i16 constants to adjacent elements with no explicit alignment
          ; on either store. The checks preceding this function expect two separate
          ; short stores.
     63   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
     64 
     65   store i16 123, i16 addrspace(1)* %out.gep.1
     66   store i16 456, i16 addrspace(1)* %out
     67   ret void
     68 }
     69 
     70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
     71 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
     72 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
     73 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
     74 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
          ; Stores 123 (0x7b) to element 1 and then 456 (0x1c8) to element 0. The
          ; checks preceding this function expect one dwordx2 store whose low
          ; register holds 0x1c8 and whose high register holds 0x7b.
     75   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
     76 
     77   store i32 123, i32 addrspace(1)* %out.gep.1
     78   store i32 456, i32 addrspace(1)* %out
     79   ret void
     80 }
     81 
     82 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
     83 ; GCN: buffer_store_dwordx2
     84 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
          ; Mixed-type pair: a float constant is stored to element 1 (through a
          ; bitcast of the i32 pointer) and an i32 constant to element 0. The check
          ; preceding this function expects a single dwordx2 store.
     85   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
     86   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
     87   store float 1.0, float addrspace(1)* %out.gep.1.bc
     88   store i32 456, i32 addrspace(1)* %out
     89   ret void
     90 }
     91 
     92 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
     93 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
     94 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
     95 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
     96 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
          ; Mirror of merge_global_store_2_constants_i32_f32: the base pointer is
          ; float, an i32 constant goes to element 1 through a bitcast, and a float
          ; constant goes to element 0. The checks preceding this function expect
          ; one dwordx2 store with 4.0 in the low register and 0x7b in the high one.
     97   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
     98   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
     99   store i32 123, i32 addrspace(1)* %out.gep.1.bc
    100   store float 4.0, float addrspace(1)* %out
    101   ret void
    102 }
    103 
    104 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
    105 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
    106 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
    107 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
    108 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
    109 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
    110 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
          ; Stores four i32 constants to elements 1, 2, 3 and finally 0 of %out.
          ; The checks preceding this function expect a single dwordx4 store whose
          ; first register holds 0x4d2 (1234) and whose last holds 0x14d (333).
    111   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    112   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    113   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    114 
    115   store i32 123, i32 addrspace(1)* %out.gep.1
    116   store i32 456, i32 addrspace(1)* %out.gep.2
    117   store i32 333, i32 addrspace(1)* %out.gep.3
    118   store i32 1234, i32 addrspace(1)* %out
    119   ret void
    120 }
    121 
    122 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
    123 ; GCN: buffer_store_dwordx4
    124 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
          ; Stores four float constants to elements 0, 1, 2, 3 in ascending address
          ; order. The check preceding this function expects a single dwordx4 store.
    125   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
    126   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
    127   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
    128 
    129   store float 8.0, float addrspace(1)* %out
    130   store float 1.0, float addrspace(1)* %out.gep.1
    131   store float 2.0, float addrspace(1)* %out.gep.2
    132   store float 4.0, float addrspace(1)* %out.gep.3
    133   ret void
    134 }
    135 
    136 ; First store is out of order.
    137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
    138 ; GCN: buffer_store_dwordx4
    139 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
          ; Stores the same four float constants as the _f32_order variant, but the
          ; store to element 0 is issued last instead of first. The check preceding
          ; this function still expects a single dwordx4 store.
    140   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
    141   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
    142   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
    143 
    144   store float 1.0, float addrspace(1)* %out.gep.1
    145   store float 2.0, float addrspace(1)* %out.gep.2
    146   store float 4.0, float addrspace(1)* %out.gep.3
    147   store float 8.0, float addrspace(1)* %out
    148   ret void
    149 }
    150 
    151 ; FIXME: Should be able to merge this
    152 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
    153 ; GCN-NOAA: buffer_store_dword v
    154 ; GCN-NOAA: buffer_store_dword v
    155 ; GCN-NOAA: buffer_store_dword v
    156 ; GCN-NOAA: buffer_store_dword v
    157 
    158 ; GCN-AA: buffer_store_dwordx2
    159 ; GCN-AA: buffer_store_dword v
    160 ; GCN-AA: buffer_store_dword v
    161 
    162 ; GCN: s_endpgm
    163 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
          ; Stores four constants to adjacent dwords, alternating types: i32 to
          ; elements 1 and 3 (through bitcasts) and float to elements 2 and 0.
          ; The checks preceding this function differ between the RUN lines with
          ; and without -combiner-alias-analysis; neither expects a full dwordx4.
    164   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
    165   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
    166   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
    167 
    168   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
    169   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
    170 
    171   store i32 11, i32 addrspace(1)* %out.gep.1.bc
    172   store float 2.0, float addrspace(1)* %out.gep.2
    173   store i32 17, i32 addrspace(1)* %out.gep.3.bc
    174   store float 8.0, float addrspace(1)* %out
    175   ret void
    176 }
    177 
    178 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
    179 ; SI-DAG: buffer_store_dwordx2
    180 ; SI-DAG: buffer_store_dword
    181 ; SI-NOT: buffer_store_dword
    182 ; GCN: s_endpgm
    183 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
          ; Stores three i32 constants to elements 1, 2 and then 0 of %out. The
          ; checks preceding this function expect one dwordx2 store plus one dword
          ; store and no further dword stores.
    184   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    185   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    186 
    187   store i32 123, i32 addrspace(1)* %out.gep.1
    188   store i32 456, i32 addrspace(1)* %out.gep.2
    189   store i32 1234, i32 addrspace(1)* %out
    190   ret void
    191 }
    192 
    193 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
    194 ; GCN: buffer_store_dwordx4
    195 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
          ; Stores two i64 constants to adjacent elements, element 1 first and then
          ; element 0. The check preceding this function expects a single dwordx4
          ; store.
    196   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
    197 
    198   store i64 123, i64 addrspace(1)* %out.gep.1
    199   store i64 456, i64 addrspace(1)* %out
    200   ret void
    201 }
    202 
    203 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
    204 ; GCN: buffer_store_dwordx4
    205 ; GCN: buffer_store_dwordx4
    206 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
          ; Stores four i64 constants to elements 1, 2, 3 and finally 0 of %out.
          ; The checks preceding this function expect two dwordx4 stores.
    207   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
    208   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
    209   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
    210 
    211   store i64 123, i64 addrspace(1)* %out.gep.1
    212   store i64 456, i64 addrspace(1)* %out.gep.2
    213   store i64 333, i64 addrspace(1)* %out.gep.3
    214   store i64 1234, i64 addrspace(1)* %out
    215   ret void
    216 }
    217 
    218 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
    219 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    220 ; GCN: buffer_store_dwordx2 [[LOAD]]
    221 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Copies two adjacent i32 values from %in to the same positions in %out.
          ; The checks preceding this function expect one dwordx2 load whose result
          ; register pair feeds one dwordx2 store.
    222   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    223   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    224 
    225   %lo = load i32, i32 addrspace(1)* %in
    226   %hi = load i32, i32 addrspace(1)* %in.gep.1
    227 
    228   store i32 %lo, i32 addrspace(1)* %out
    229   store i32 %hi, i32 addrspace(1)* %out.gep.1
    230   ret void
    231 }
    232 
    233 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
    234 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    235 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    236 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out. Element
          ; 2 of an i32 array is byte offset 8, which is the offset the checks
          ; preceding this function expect on both the dwordx2 load and store.
    237   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
    238   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
    239 
    240   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    241   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    242   %lo = load i32, i32 addrspace(1)* %in.gep.0
    243   %hi = load i32, i32 addrspace(1)* %in.gep.1
    244 
    245   store i32 %lo, i32 addrspace(1)* %out.gep.0
    246   store i32 %hi, i32 addrspace(1)* %out.gep.1
    247   ret void
    248 }
    249 
    250 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
    251 ; GCN: buffer_load_dword v
    252 ; GCN: buffer_load_dword v
    253 ; GCN: buffer_store_dword v
    254 ; GCN: buffer_store_dword v
    255 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Loads two adjacent i32 values from %in and stores them to %out swapped:
          ; %hi goes to element 0 and %lo to element 1. The checks preceding this
          ; function expect two separate dword loads and two separate dword stores.
    256   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    257   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    258 
    259   %lo = load i32, i32 addrspace(1)* %in
    260   %hi = load i32, i32 addrspace(1)* %in.gep.1
    261 
    262   store i32 %hi, i32 addrspace(1)* %out
    263   store i32 %lo, i32 addrspace(1)* %out.gep.1
    264   ret void
    265 }
    266 
    267 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
    268 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    269 ; GCN: buffer_store_dwordx4 [[LOAD]]
    270 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Copies four adjacent i32 values from %in to the same positions in %out,
          ; in ascending order. The checks preceding this function expect one
          ; dwordx4 load whose result registers feed one dwordx4 store.
    271   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    272   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    273   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    274   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    275   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
    276   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
    277 
    278   %x = load i32, i32 addrspace(1)* %in
    279   %y = load i32, i32 addrspace(1)* %in.gep.1
    280   %z = load i32, i32 addrspace(1)* %in.gep.2
    281   %w = load i32, i32 addrspace(1)* %in.gep.3
    282 
    283   store i32 %x, i32 addrspace(1)* %out
    284   store i32 %y, i32 addrspace(1)* %out.gep.1
    285   store i32 %z, i32 addrspace(1)* %out.gep.2
    286   store i32 %w, i32 addrspace(1)* %out.gep.3
    287   ret void
    288 }
    289 
    290 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
    291 ; SI-DAG: buffer_load_dwordx2
    292 ; SI-DAG: buffer_load_dword v
    293 ; GCN: s_waitcnt
    294 ; SI-DAG: buffer_store_dword v
    295 ; SI-DAG: buffer_store_dwordx2 v
    296 ; GCN: s_endpgm
    297 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Copies three adjacent i32 values from %in to the same positions in %out.
          ; The checks preceding this function expect a dwordx2 plus a dword on
          ; both the load side and the store side.
    298   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    299   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    300   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    301   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
    302 
    303   %x = load i32, i32 addrspace(1)* %in
    304   %y = load i32, i32 addrspace(1)* %in.gep.1
    305   %z = load i32, i32 addrspace(1)* %in.gep.2
    306 
    307   store i32 %x, i32 addrspace(1)* %out
    308   store i32 %y, i32 addrspace(1)* %out.gep.1
    309   store i32 %z, i32 addrspace(1)* %out.gep.2
    310   ret void
    311 }
    312 
    313 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
    314 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    315 ; GCN: buffer_store_dwordx4 [[LOAD]]
    316 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
          ; Float counterpart of merge_global_store_4_adjacent_loads_i32: copies
          ; four adjacent floats from %in to %out. The checks preceding this
          ; function expect one dwordx4 load feeding one dwordx4 store.
    317   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
    318   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
    319   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
    320   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
    321   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
    322   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
    323 
    324   %x = load float, float addrspace(1)* %in
    325   %y = load float, float addrspace(1)* %in.gep.1
    326   %z = load float, float addrspace(1)* %in.gep.2
    327   %w = load float, float addrspace(1)* %in.gep.3
    328 
    329   store float %x, float addrspace(1)* %out
    330   store float %y, float addrspace(1)* %out.gep.1
    331   store float %z, float addrspace(1)* %out.gep.2
    332   store float %w, float addrspace(1)* %out.gep.3
    333   ret void
    334 }
    335 
    336 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
    337 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
    338 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
    339 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Copies i32 elements 11..14 of %in to elements 7..10 of %out. Those
          ; starting indices are byte offsets 44 and 28, which are the offsets the
          ; checks preceding this function expect on the dwordx4 load and store.
    340   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
    341   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
    342   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
    343   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
    344   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
    345   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
    346   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
    347   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
    348 
    349   %x = load i32, i32 addrspace(1)* %in.gep.0
    350   %y = load i32, i32 addrspace(1)* %in.gep.1
    351   %z = load i32, i32 addrspace(1)* %in.gep.2
    352   %w = load i32, i32 addrspace(1)* %in.gep.3
    353 
    354   store i32 %x, i32 addrspace(1)* %out.gep.0
    355   store i32 %y, i32 addrspace(1)* %out.gep.1
    356   store i32 %z, i32 addrspace(1)* %out.gep.2
    357   store i32 %w, i32 addrspace(1)* %out.gep.3
    358   ret void
    359 }
    360 
    361 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
    362 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    363 ; GCN: s_barrier
    364 ; GCN: buffer_store_dwordx4 [[LOAD]]
    365 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Loads four adjacent i32 values, calls the barrier intrinsic, then stores
          ; each value to its matching position in %out, issuing the stores from
          ; the highest address down to the lowest. The checks preceding this
          ; function expect a dwordx4 load, the barrier, then a dwordx4 store.
    366   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    367   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    368   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    369   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    370   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
    371   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
    372 
    373   %x = load i32, i32 addrspace(1)* %in
    374   %y = load i32, i32 addrspace(1)* %in.gep.1
    375   %z = load i32, i32 addrspace(1)* %in.gep.2
    376   %w = load i32, i32 addrspace(1)* %in.gep.3
    377 
    378   ; Make sure the barrier doesn't stop this
    379   tail call void @llvm.AMDGPU.barrier.local() #1
    380 
    381   store i32 %w, i32 addrspace(1)* %out.gep.3
    382   store i32 %z, i32 addrspace(1)* %out.gep.2
    383   store i32 %y, i32 addrspace(1)* %out.gep.1
    384   store i32 %x, i32 addrspace(1)* %out
    385 
    386   ret void
    387 }
    388 
    389 ; TODO: Re-packing of loaded register required. Maybe an IR pass
    390 ; should catch this?
    391 
    392 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
    393 ; GCN: buffer_load_dword v
    394 ; GCN: buffer_load_dword v
    395 ; GCN: buffer_load_dword v
    396 ; GCN: buffer_load_dword v
    397 ; GCN: s_barrier
    398 ; GCN: buffer_store_dword v
    399 ; GCN: buffer_store_dword v
    400 ; GCN: buffer_store_dword v
    401 ; GCN: buffer_store_dword v
    402 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
          ; Loads four adjacent i32 values, calls the barrier intrinsic, then stores
          ; them to %out in reversed element order (%w to element 0 ... %x to
          ; element 3). The checks preceding this function expect four separate
          ; dword loads, the barrier, and four separate dword stores.
    403   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    404   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    405   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    406   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    407   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
    408   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
    409 
    410   %x = load i32, i32 addrspace(1)* %in
    411   %y = load i32, i32 addrspace(1)* %in.gep.1
    412   %z = load i32, i32 addrspace(1)* %in.gep.2
    413   %w = load i32, i32 addrspace(1)* %in.gep.3
    414 
    415   ; Make sure the barrier doesn't stop this
    416   tail call void @llvm.AMDGPU.barrier.local() #1
    417 
    418   store i32 %w, i32 addrspace(1)* %out
    419   store i32 %z, i32 addrspace(1)* %out.gep.1
    420   store i32 %y, i32 addrspace(1)* %out.gep.2
    421   store i32 %x, i32 addrspace(1)* %out.gep.3
    422 
    423   ret void
    424 }
    425 
    426 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
    427 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
    428 ; GCN: buffer_store_dword [[LOAD]]
    429 ; GCN: s_endpgm
    430 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
          ; Copies four adjacent bytes from %in to %out. Only the first load and
          ; the first store carry an explicit align 4. The checks preceding this
          ; function expect one dword load feeding one dword store.
    431   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
    432   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
    433   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
    434   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
    435   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
    436   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
    437 
    438   %x = load i8, i8 addrspace(1)* %in, align 4
    439   %y = load i8, i8 addrspace(1)* %in.gep.1
    440   %z = load i8, i8 addrspace(1)* %in.gep.2
    441   %w = load i8, i8 addrspace(1)* %in.gep.3
    442 
    443   store i8 %x, i8 addrspace(1)* %out, align 4
    444   store i8 %y, i8 addrspace(1)* %out.gep.1
    445   store i8 %z, i8 addrspace(1)* %out.gep.2
    446   store i8 %w, i8 addrspace(1)* %out.gep.3
    447   ret void
    448 }
    449 
    450 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
    451 ; GCN: buffer_load_ubyte
    452 ; GCN: buffer_load_ubyte
    453 ; GCN: buffer_load_ubyte
    454 ; GCN: buffer_load_ubyte
    455 ; GCN: buffer_store_byte
    456 ; GCN: buffer_store_byte
    457 ; GCN: buffer_store_byte
    458 ; GCN: buffer_store_byte
    459 ; GCN: s_endpgm
    460 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
          ; Same four-byte copy as merge_global_store_4_adjacent_loads_i8 but with
          ; no explicit alignment on any access. The checks preceding this function
          ; expect four separate byte loads and four separate byte stores.
    461   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
    462   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
    463   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
    464   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
    465   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
    466   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
    467 
    468   %x = load i8, i8 addrspace(1)* %in
    469   %y = load i8, i8 addrspace(1)* %in.gep.1
    470   %z = load i8, i8 addrspace(1)* %in.gep.2
    471   %w = load i8, i8 addrspace(1)* %in.gep.3
    472 
    473   store i8 %x, i8 addrspace(1)* %out
    474   store i8 %y, i8 addrspace(1)* %out.gep.1
    475   store i8 %z, i8 addrspace(1)* %out.gep.2
    476   store i8 %w, i8 addrspace(1)* %out.gep.3
    477   ret void
    478 }
    479 
    480 ; This works once AA is enabled on the subtarget
    481 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
    482 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
    483 
    484 ; GCN-NOAA: buffer_store_dword v
    485 ; GCN-NOAA: buffer_store_dword v
    486 ; GCN-NOAA: buffer_store_dword v
    487 ; GCN-NOAA: buffer_store_dword v
    488 
    489 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
    490 
    491 ; GCN: s_endpgm
    492 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
          ; Loads one <4 x i32> vector, extracts its four elements, and stores them
          ; as scalars to elements 0..3 of %out. The checks preceding this function
          ; expect a dwordx4 load; the expected stores differ between the RUN lines
          ; with and without -combiner-alias-analysis.
    493   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
    494   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
    495   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
    496   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
    497 
    498   %x = extractelement <4 x i32> %vec, i32 0
    499   %y = extractelement <4 x i32> %vec, i32 1
    500   %z = extractelement <4 x i32> %vec, i32 2
    501   %w = extractelement <4 x i32> %vec, i32 3
    502 
    503   store i32 %x, i32 addrspace(1)* %out
    504   store i32 %y, i32 addrspace(1)* %out.gep.1
    505   store i32 %z, i32 addrspace(1)* %out.gep.2
    506   store i32 %w, i32 addrspace(1)* %out.gep.3
    507   ret void
    508 }
    509 
    510 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
    511 ; GCN: ds_write_b8
    512 ; GCN: ds_write_b8
    513 ; GCN: s_endpgm
    514 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
          ; addrspace(3) counterpart of merge_global_store_2_constants_i8: two i8
          ; constants stored to adjacent bytes, the second with align 2. The checks
          ; preceding this function expect two separate ds_write_b8 instructions.
    515   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
    516 
    517   store i8 123, i8 addrspace(3)* %out.gep.1
          ; NOTE(review): 456 does not fit in i8; presumably the IR parser keeps
          ; only the low 8 bits (0xc8) -- confirm against the LLVM version in use.
    518   store i8 456, i8 addrspace(3)* %out, align 2
    519   ret void
    520 }
    521 
    522 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
    523 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
    524 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
    525 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
    526 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
          ; Stores 123 (0x7b) to element 1 and then 456 (0x1c8) to element 0 in
          ; addrspace(3). The checks preceding this function expect a single
          ; ds_write2_b32 with 0x1c8 as the first data operand and 0x7b as the second.
    527   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
    528 
    529   store i32 123, i32 addrspace(3)* %out.gep.1
    530   store i32 456, i32 addrspace(3)* %out
    531   ret void
    532 }
    533 
    534 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
    535 ; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
    536 ; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
    537 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
    538 
    539 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
    540 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
    541 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
    542 
    543 ; GCN: s_endpgm
    544 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
          ; Stores four i32 constants to elements 1, 2, 3 and finally 0 in
          ; addrspace(3). The checks preceding this function expect two
          ; ds_write2_b32 instructions: one for elements 2 and 3 (0x1c8, 0x14d) and
          ; one for elements 0 and 1 (0x4d2, 0x7b).
    545   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
    546   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
    547   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
    548 
    549   store i32 123, i32 addrspace(3)* %out.gep.1
    550   store i32 456, i32 addrspace(3)* %out.gep.2
    551   store i32 333, i32 addrspace(3)* %out.gep.3
    552   store i32 1234, i32 addrspace(3)* %out
    553   ret void
    554 }
    555 
    556 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
    557 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
    558 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
    559 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
    560 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
    561 ; GCN: buffer_store_dword v[[HI]]
    562 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
          ; Stores five i32 constants to elements 0..4 in ascending order, each
          ; with align 4. The checks preceding this function expect a dwordx4 store
          ; covering 9 through -12 followed by a single dword store of 11.
          ; Unlike most functions in this file, no attribute group is attached.
    563   store i32 9, i32 addrspace(1)* %out, align 4
    564   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
    565   store i32 12, i32 addrspace(1)* %idx1, align 4
    566   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
    567   store i32 16, i32 addrspace(1)* %idx2, align 4
    568   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
    569   store i32 -12, i32 addrspace(1)* %idx3, align 4
    570   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
    571   store i32 11, i32 addrspace(1)* %idx4, align 4
    572   ret void
    573 }
    574 
    575 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
    576 ; GCN: buffer_store_dwordx4
    577 ; GCN: buffer_store_dwordx2
    578 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
          ; Stores six i32 constants to elements 0..5 in ascending order, each with
          ; align 4. The checks preceding this function expect a dwordx4 store
          ; followed by a dwordx2 store.
    579   store i32 13, i32 addrspace(1)* %out, align 4
    580   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
    581   store i32 15, i32 addrspace(1)* %idx1, align 4
    582   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
    583   store i32 62, i32 addrspace(1)* %idx2, align 4
    584   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
    585   store i32 63, i32 addrspace(1)* %idx3, align 4
    586   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
    587   store i32 11, i32 addrspace(1)* %idx4, align 4
    588   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
    589   store i32 123, i32 addrspace(1)* %idx5, align 4
    590   ret void
    591 }
    592 
    593 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
    594 ; GCN: buffer_store_dwordx4
    595 ; GCN: buffer_store_dwordx2
    596 ; GCN: buffer_store_dword v
    597 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
          ; Stores seven i32 constants to elements 0..6 in ascending order, each
          ; with align 4. The checks preceding this function expect a dwordx4
          ; store, then a dwordx2 store, then a single dword store.
    598   store i32 34, i32 addrspace(1)* %out, align 4
    599   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
    600   store i32 999, i32 addrspace(1)* %idx1, align 4
    601   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
    602   store i32 65, i32 addrspace(1)* %idx2, align 4
    603   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
    604   store i32 33, i32 addrspace(1)* %idx3, align 4
    605   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
    606   store i32 98, i32 addrspace(1)* %idx4, align 4
    607   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
    608   store i32 91, i32 addrspace(1)* %idx5, align 4
    609   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
    610   store i32 212, i32 addrspace(1)* %idx6, align 4
    611   ret void
    612 }
    613 
    614 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
    615 ; GCN: buffer_store_dwordx4
    616 ; GCN: buffer_store_dwordx4
    617 ; GCN: s_endpgm
    618 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
          ; Stores eight i32 constants to elements 0..7 in ascending order, each
          ; with align 4. The checks preceding this function expect two dwordx4
          ; stores.
    619   store i32 34, i32 addrspace(1)* %out, align 4
    620   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
    621   store i32 999, i32 addrspace(1)* %idx1, align 4
    622   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
    623   store i32 65, i32 addrspace(1)* %idx2, align 4
    624   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
    625   store i32 33, i32 addrspace(1)* %idx3, align 4
    626   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
    627   store i32 98, i32 addrspace(1)* %idx4, align 4
    628   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
    629   store i32 91, i32 addrspace(1)* %idx5, align 4
    630   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
    631   store i32 212, i32 addrspace(1)* %idx6, align 4
    632   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
    633   store i32 999, i32 addrspace(1)* %idx7, align 4
    634   ret void
    635 }
    636 
    637 ; This requires handling of scalar_to_vector for v2i64 to avoid
    638 ; scratch usage.
    639 ; FIXME: Should do single load and store
    640 
    641 ; GCN-LABEL: {{^}}copy_v3i32_align4:
    642 ; GCN-NOT: SCRATCH_RSRC_DWORD
    643 ; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    644 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    645 ; GCN-NOT: offen
    646 ; GCN: s_waitcnt vmcnt
    647 ; GCN-NOT: offen
    648 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    649 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    650 
    651 ; GCN: ScratchSize: 0{{$}}
    652 define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
          ; Loads a <3 x i32> from %in with align 4 and stores it unchanged to %out
          ; (no explicit alignment on the store). Both pointers are noalias. The
          ; checks preceding this function expect a dwordx2 + dword split on each
          ; side and a ScratchSize of 0.
    653   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
    654   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
    655   ret void
    656 }
    657 
    658 ; GCN-LABEL: {{^}}copy_v3i64_align4:
    659 ; GCN-NOT: SCRATCH_RSRC_DWORD
    660 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    661 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
    662 ; GCN-NOT: offen
    663 ; GCN: s_waitcnt vmcnt
    664 ; GCN-NOT: offen
    665 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    666 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
    667 ; GCN: ScratchSize: 0{{$}}
    668 define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
          ; Loads a <3 x i64> from %in with align 4 and stores it unchanged to %out
          ; (no explicit alignment on the store). Both pointers are noalias. The
          ; checks preceding this function expect a dwordx4 + dwordx2 split on each
          ; side and a ScratchSize of 0.
    669   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
    670   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
    671   ret void
    672 }
    673 
    674 ; GCN-LABEL: {{^}}copy_v3f32_align4:
    675 ; GCN-NOT: SCRATCH_RSRC_DWORD
    676 ; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    677 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    678 ; GCN-NOT: offen
    679 ; GCN: s_waitcnt vmcnt
    680 ; GCN-NOT: offen
    681 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    682 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
    683 ; GCN: ScratchSize: 0{{$}}
    684 define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
          ; Loads a <3 x float> from %in with align 4, adds <1.0, 2.0, 4.0> to it,
          ; and stores the sum to %out. Despite the "copy" name, the stored value
          ; is the fadd result rather than the loaded vector. The checks preceding
          ; this function expect a dwordx2 + dword split on each side and a
          ; ScratchSize of 0.
    685   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
    686   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
    687   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
    688   ret void
    689 }
    690 
    691 ; GCN-LABEL: {{^}}copy_v3f64_align4:
    692 ; GCN-NOT: SCRATCH_RSRC_DWORD
    693 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    694 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
    695 ; GCN-NOT: offen
    696 ; GCN: s_waitcnt vmcnt
    697 ; GCN-NOT: offen
    698 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
    699 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
    700 ; GCN: ScratchSize: 0{{$}}
    701 define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
          ; Loads a <3 x double> from %in with align 4, adds <1.0, 2.0, 4.0> to it,
          ; and stores the sum to %out. Despite the "copy" name, the stored value
          ; is the fadd result rather than the loaded vector. The checks preceding
          ; this function expect a dwordx4 + dwordx2 split on each side and a
          ; ScratchSize of 0.
    702   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
    703   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
    704   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
    705   ret void
    706 }
    707 
    708 declare void @llvm.AMDGPU.barrier.local() #1
        ; Barrier intrinsic called between the loads and stores of the
        ; *_adjacent_loads_inverse_i32 and 4_adjacent_loads_shuffle_i32 tests.
    709 
        ; Attribute group #0 is attached to most test functions in this file;
        ; group #1 is used by the barrier declaration and its call sites.
    710 attributes #0 = { nounwind }
    711 attributes #1 = { convergent nounwind }
    712