; Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
      2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
      3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
      4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
      5 
      6 ; Tests for indirect addressing on SI, which is implemented using dynamic
      7 ; indexing of vectors.
      8 
      9 ; GCN-LABEL: {{^}}extract_w_offset:
     10 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
     11 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
     12 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
     13 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0
     14 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
     15 
     16 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
     17 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
     18 
     19 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
     20 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
     21 ; IDXMODE-NEXT: s_set_gpr_idx_off
      22 define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
      23 entry:
           ; Uniform (SGPR) dynamic index = %in + 1 into a build-vector of constants;
           ; the CHECKs above expect m0+movrels (pre-GFX9) or s_set_gpr_idx (IDXMODE).
      24   %idx = add i32 %in, 1
      25   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
      26   store float %elt, float addrspace(1)* %out
      27   ret void
      28 }
     29 
     30 ; XXX: Could do v_or_b32 directly
     31 ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
     32 ; MOVREL: s_mov_b32 m0
     33 ; GCN-DAG: s_or_b32
     34 ; GCN-DAG: s_or_b32
     35 ; GCN-DAG: s_or_b32
     36 ; GCN-DAG: s_or_b32
     37 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     38 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     39 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     40 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     41 
     42 ; MOVREL: v_movrels_b32_e32
     43 
     44 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
     45 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
     46 ; IDXMODE-NEXT: s_set_gpr_idx_off
      47 define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
      48 entry:
      49   %idx = add i32 %in, 1
           ; Vector is produced by scalar ORs (s_or_b32 per the CHECKs) and must be
           ; copied into VGPRs before the indexed read.
      50   %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
      51   %elt = extractelement <4 x i32> %vec, i32 %idx
      52   store i32 %elt, i32 addrspace(1)* %out
      53   ret void
      54 }
     55 
     56 ; GCN-LABEL: {{^}}extract_wo_offset:
     57 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
     58 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
     59 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
     60 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
     61 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
     62 
     63 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
     64 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
     65 
     66 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
     67 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
     68 ; IDXMODE-NEXT: s_set_gpr_idx_off
      69 define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
      70 entry:
           ; Index used directly (no add), so the base register of the indexed read is
           ; the element-0 register (1.0) per the BASEREG CHECK above.
      71   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
      72   store float %elt, float addrspace(1)* %out
      73   ret void
      74 }
     75 
     76 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
     77 ; The offset depends on the register that holds the first element of the vector.
     78 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
     79 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
     80 
     81 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
     82 ; IDXMODE: v_mov_b32_e32 v2, 2
     83 ; IDXMODE: v_mov_b32_e32 v3, 3
     84 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
     85 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
     86 ; IDXMODE-NEXT: s_set_gpr_idx_off
      87 define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
      88 entry:
           ; Uniform index with a negative constant bias; the -512 is expected to be
           ; folded into the index register setup (s_add_i32 m0 / s_addk_i32).
      89   %index = add i32 %offset, -512
      90   %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
      91   store i32 %value, i32 addrspace(1)* %out
      92   ret void
      93 }
     94 
     95 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
     96 ; The offset depends on the register that holds the first element of the vector.
     97 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
     98 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
     99 
    100 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
    101 ; IDXMODE: v_mov_b32_e32 v0,
    102 ; IDXMODE: v_mov_b32_e32 v1,
    103 ; IDXMODE: v_mov_b32_e32 v2,
    104 ; IDXMODE: v_mov_b32_e32 v3,
    105 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
    106 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    107 ; IDXMODE-NEXT: s_set_gpr_idx_off
     108 define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
     109 entry:
           ; Same negative uniform-index pattern as above, but the indexed vector is
           ; computed (OR of two kernel-argument vectors) rather than constant.
     110   %index = add i32 %offset, -512
     111   %or = or <4 x i32> %vec0, %vec1
     112   %value = extractelement <4 x i32> %or, i32 %index
     113   store i32 %value, i32 addrspace(1)* %out
     114   ret void
     115 }
    116 
    117 ; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
    118 ; The offset depends on the register that holds the first element of the vector.
    119 
    120 ; FIXME: The waitcnt for the argument load can go after the loop
    121 ; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
    122 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
    123 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
    124 ; GCN: s_and_saveexec_b64 vcc, vcc
    125 
    126 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
    127 ; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
    128 
    129 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
    130 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
    131 ; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
    132 ; IDXMODE: s_set_gpr_idx_off
    133 
    134 ; GCN: s_cbranch_execnz
    135 
    136 ; GCN: buffer_store_dword [[RESULT]]
     137 define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
     138 entry:
           ; Divergent index (per-lane workitem id), so the CHECKs above expect a
           ; v_readfirstlane/s_and_saveexec waterfall loop around the indexed read.
     139   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
     140   %index = add i32 %id, -512
     141   %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
     142   store i32 %value, i32 addrspace(1)* %out
     143   ret void
     144 }
    145 
    146 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
    147 ; undefined behavior, but shouldn't crash compiler
     148 define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
     149 entry:
     150   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
           ; Undef index: no output is checked, only that compilation succeeds.
     151   %value = extractelement <4 x i32> %ld, i32 undef
     152   store i32 %value, i32 addrspace(1)* %out
     153   ret void
     154 }
    155 
    156 ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
    157 ; undefined behavior, but shouldn't crash compiler
     158 define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
     159 entry:
     160   %ld = load <4 x i32>, <4  x i32> addrspace(1)* %in
           ; Insert at an undef index: compile-only test (no CHECKs beyond the label).
     161   %value = insertelement <4 x i32> %ld, i32 5, i32 undef
     162   store <4 x i32> %value, <4 x i32> addrspace(1)* %out
     163   ret void
     164 }
    165 
    166 ; GCN-LABEL: {{^}}insert_w_offset:
    167 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
    168 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
    169 ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
    170 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
    171 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
    172 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
    173 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
    174 
    175 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
    176 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
     177 define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
     178 entry:
           ; Uniform index = %in + 1; inserts 5.0 (0x40a00000 per the CHECKs) and
           ; stores the full vector with dwordx4.
     179   %0 = add i32 %in, 1
     180   %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
     181   store <4 x float> %1, <4 x float> addrspace(1)* %out
     182   ret void
     183 }
    184 
    185 ; GCN-LABEL: {{^}}insert_wo_offset:
    186 ; GCN: s_load_dword [[IN:s[0-9]+]]
    187 
    188 ; MOVREL: s_mov_b32 m0, [[IN]]
    189 ; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
    190 
    191 ; IDXMODE: s_set_gpr_idx_on [[IN]], dst
    192 ; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
    193 ; IDXMODE-NEXT: s_set_gpr_idx_off
    194 
    195 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
     196 define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
     197 entry:
           ; Index used directly; IDXMODE path uses s_set_gpr_idx_on ..., dst for the
           ; indexed write, per the CHECKs above.
     198   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
     199   store <4 x float> %0, <4 x float> addrspace(1)* %out
     200   ret void
     201 }
    202 
    203 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
    204 ; The offset depends on the register that holds the first element of the vector.
    205 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
    206 ; MOVREL: v_movreld_b32_e32 v0, 5
    207 
    208 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
    209 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
    210 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
    211 ; IDXMODE-NEXT: s_set_gpr_idx_off
     212 define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
     213 entry:
           ; Uniform index biased by -512; indexed write of immediate 5 into a
           ; constant build-vector.
     214   %index = add i32 %offset, -512
     215   %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
     216   store <4 x i32> %value, <4 x i32> addrspace(1)* %out
     217   ret void
     218 }
    219 
    220 ; The vector indexed into is originally loaded into an SGPR rather
    221 ; than built with a reg_sequence
    222 
    223 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
    224 ; The offset depends on the register that holds the first element of the vector.
    225 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
    226 ; MOVREL: v_movreld_b32_e32 v0, 5
    227 
    228 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
    229 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
    230 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
    231 ; IDXMODE-NEXT: s_set_gpr_idx_off
     232 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
     233 entry:
           ; Same as insert_neg_offset_sgpr, but the vector comes in as a kernel
           ; argument (loaded into SGPRs) instead of a reg_sequence build-vector.
     234   %index = add i32 %offset, -512
     235   %value = insertelement <4 x i32> %vec, i32 5, i32 %index
     236   store <4 x i32> %value, <4 x i32> addrspace(1)* %out
     237   ret void
     238 }
    239 
    240 ; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
    241 ; The offset depends on the register that holds the first element of the vector.
    242 
    243 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
    244 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
    245 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
    246 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
    247 
    248 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
    249 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
    250 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
    251 ; GCN: s_and_saveexec_b64 vcc, vcc
    252 
    253 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
    254 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
    255 
    256 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
    257 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
    258 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
    259 ; IDXMODE: s_set_gpr_idx_off
    260 
    261 ; GCN: s_cbranch_execnz [[LOOPBB]]
    262 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]
    263 
    264 ; GCN: buffer_store_dword
     265 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
     266 entry:
           ; Divergent index (workitem id): the CHECKs expect an exec-mask waterfall
           ; loop around the indexed write of immediate 5.
     267   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
     268   %index = add i32 %id, -512
     269   %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
     270   store <4 x i32> %value, <4 x i32> addrspace(1)* %out
     271   ret void
     272 }
    273 
    274 ; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
    275 
    276 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
    277 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
    278 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
    279 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
    280 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
    281 
    282 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
    283 
    284 ; The offset depends on the register that holds the first element of the vector.
    285 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
    286 
    287 ; MOVREL: s_add_i32 m0, [[READLANE]], -16
    288 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
    289 
    290 ; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
    291 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
    292 ; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
    293 ; IDXMODE: s_set_gpr_idx_off
    294 
    295 ; GCN: s_cbranch_execnz
     296 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
     297 entry:
           ; Divergent index with an inline-immediate bias (-16); inserted value 500
           ; (0x1f4 in the CHECKs) does not fit an inline constant and needs a VGPR.
     298   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
     299   %index = add i32 %id, -16
     300   %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
     301   store <4 x i32> %value, <4 x i32> addrspace(1)* %out
     302   ret void
     303 }
    304 
    305 ; When the block is split to insert the loop, make sure any other
    306 ; places that need to be expanded in the same block are also handled.
    307 
    308 ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
    309 
    310 ; FIXME: Why is vector copied in between?
    311 
    312 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
    313 ; GCN-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
    314 ; GCN-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
    315 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
    316 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
    317 
    318 ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
    319 
    320 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
    321 ; GCN-NEXT: s_waitcnt vmcnt(0)
    322 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
    323 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
    324 ; GCN: s_and_saveexec_b64 vcc, vcc
    325 
    326 ; MOVREL: s_mov_b32 m0, [[READLANE]]
    327 ; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
    328 
    329 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
    330 ; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
    331 ; IDXMODE: s_set_gpr_idx_off
    332 
    333 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
    334 ; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
    335 
    336 ; FIXME: Redundant copy
    337 ; GCN: s_mov_b64 exec, [[MASK]]
    338 
    339 ; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
    340 
    341 ; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
    342 
    343 ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
    344 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
    345 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
    346 ; GCN: s_and_saveexec_b64 vcc, vcc
    347 
    348 ; MOVREL: s_mov_b32 m0, [[READLANE]]
    349 ; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
    350 
    351 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
    352 ; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
    353 ; IDXMODE: s_set_gpr_idx_off
    354 
    355 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
    356 ; GCN: s_cbranch_execnz [[LOOP1]]
    357 
    358 ; GCN: buffer_store_dword [[MOVREL0]]
    359 ; GCN: buffer_store_dword [[MOVREL1]]
     360 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
     361 entry:
           ; Two divergent extracts in the same block: each needs its own waterfall
           ; loop (LOOP0/LOOP1 in the CHECKs), exercising the block split done when
           ; the loop is inserted.
     362   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
     363   %id.ext = zext i32 %id to i64
     364   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
     365   %idx0 = load volatile i32, i32 addrspace(1)* %gep
     366   %idx1 = add i32 %idx0, 1
     367   %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
           ; Inline asm pins a value in s4 that must stay live across both loops.
     368   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
     369   %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
     370   store volatile i32 %val0, i32 addrspace(1)* %out0
     371   store volatile i32 %val1, i32 addrspace(1)* %out0
     372   %cmp = icmp eq i32 %id, 0
     373   br i1 %cmp, label %bb1, label %bb2
     374 
     375 bb1:
     376   store volatile i32 %live.out.reg, i32 addrspace(1)* undef
     377   br label %bb2
     378 
     379 bb2:
     380   ret void
     381 }
    382 
    383 ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
    384 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
    385 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
    386 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
    387 
    388 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
    389 ; GCN: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
    390 ; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
    391 ; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
    392 
    393 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
    394 ; GCN-NEXT: s_waitcnt vmcnt(0)
    395 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
    396 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
    397 ; GCN: s_and_saveexec_b64 vcc, vcc
    398 
    399 ; MOVREL: s_mov_b32 m0, [[READLANE]]
    400 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
    401 
    402 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
    403 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
    404 ; IDXMODE: s_set_gpr_idx_off
    405 
    406 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
    407 ; GCN: s_cbranch_execnz [[LOOP0]]
    408 
    409 ; FIXME: Redundant copy
    410 ; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
    411 
    412 ; GCN: s_mov_b64 [[MASK]], exec
    413 
    414 ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
    415 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
    416 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
    417 ; GCN: s_and_saveexec_b64 vcc, vcc
    418 
    419 ; MOVREL: s_mov_b32 m0, [[READLANE]]
    420 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
    421 
    422 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
    423 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
    424 ; IDXMODE: s_set_gpr_idx_off
    425 
    426 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
    427 ; GCN: s_cbranch_execnz [[LOOP1]]
    428 
    429 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
    430 
    431 ; GCN: buffer_store_dword [[INS0]]
     432 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
     433 entry:
           ; Two divergent inserts in one block; each gets its own waterfall loop and
           ; the inline-asm value (62, in a VGPR) must survive across them.
     434   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
     435   %id.ext = zext i32 %id to i64
     436   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
     437   %idx0 = load volatile i32, i32 addrspace(1)* %gep
     438   %idx1 = add i32 %idx0, 1
     439   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
     440   %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
     441   %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
     442   store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
     443   %cmp = icmp eq i32 %id, 0
     444   br i1 %cmp, label %bb1, label %bb2
     445 
     446 bb1:
     447   store volatile i32 %live.out.val, i32 addrspace(1)* undef
     448   br label %bb2
     449 
     450 bb2:
     451   ret void
     452 }
    453 
    454 
    455 ; GCN-LABEL: {{^}}insert_adjacent_blocks:
     456 define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
     457 bb:
           ; Undef-index inserts in two adjacent blocks feeding a phi: compile-only
           ; test (label CHECK only); inline asm keeps the blocks from being folded.
     458   %tmp = icmp eq i32 %arg, 0
     459   br i1 %tmp, label %bb1, label %bb4
     460 
     461 bb1:                                              ; preds = %bb
     462   %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
     463   %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
     464   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
     465   br label %bb7
     466 
     467 bb4:                                              ; preds = %bb
     468   %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
     469   %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
     470   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
     471   br label %bb7
     472 
     473 bb7:                                              ; preds = %bb4, %bb1
     474   %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
     475   store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
     476   ret void
     477 }
    478 
    479 ; FIXME: Should be able to fold zero input to movreld to inline imm?
    480 
    481 ; GCN-LABEL: {{^}}multi_same_block:
    482 
    483 ; GCN: s_load_dword [[ARG:s[0-9]+]]
    484 
    485 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
    486 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
    487 ; MOVREL: s_waitcnt
    488 ; MOVREL: s_add_i32 m0, [[ARG]], -16
    489 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
    490 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
    491 ; MOVREL: s_mov_b32 m0, -1
    492 
    493 
    494 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
    495 ; IDXMODE: s_waitcnt
    496 ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
    497 ; IDXMODE: s_set_gpr_idx_on [[ARG]], dst
    498 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
    499 ; IDXMODE: s_set_gpr_idx_off
    500 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
    501 ; IDXMODE: s_set_gpr_idx_on [[ARG]], dst
    502 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
    503 ; IDXMODE: s_set_gpr_idx_off
    504 
    505 ; GCN: ds_write_b32
    506 ; GCN: ds_write_b32
    507 ; GCN: s_endpgm
     508 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
     509 bb:
           ; Two uniform-index inserts into different <6 x float> vectors using the
           ; same index (%arg - 16) in one block; CHECKs verify both indexed writes
           ; (4.0 and -4.0) are emitted with the shared index setup.
     510   %tmp1 = add i32 %arg, -16
     511   %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
     512   %tmp3 = add i32 %arg, -16
     513   %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float -4.0, i32 %tmp3
     514   %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
     515   %tmp6 = extractelement <6 x i32> %tmp5, i32 1
     516   %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
     517   %tmp8 = extractelement <6 x i32> %tmp7, i32 5
           ; Stores to LDS (addrspace(3)) -> the ds_write_b32 pair in the CHECKs.
     518   store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
     519   store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
     520   ret void
     521 }
    522 
     523 ; offset puts outside of superregister boundaries, so clamp to 1st element.
    524 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
    525 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
    526 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
    527 ; MOVREL: s_mov_b32 m0, [[IDX]]
    528 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
    529 
    530 ; IDXMODE: s_set_gpr_idx_on [[IDX]], src0
    531 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
    532 ; IDXMODE: s_set_gpr_idx_off
    533 
    534 ; GCN: buffer_store_dword [[EXTRACT]]
     535 define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
     536 entry:
           ; idx + 3 is the largest offset still inside the v4 register tuple, so the
           ; constant 3 folds into the base register (HI_ELT) rather than into m0.
     537   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
     538   %offset = add i32 %idx, 3
     539   %value = extractelement <4 x i32> %ld, i32 %offset
     540   store i32 %value, i32 addrspace(1)* %out
     541   ret void
     542 }
    543 
    544 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
    545 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
    546 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
    547 ; MOVREL: s_add_i32 m0, [[IDX]], 4
    548 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
    549 
    550 ; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4
    551 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
    552 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
    553 ; IDXMODE: s_set_gpr_idx_off
    554 
    555 ; GCN: buffer_store_dword [[EXTRACT]]
     556 define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
     557 entry:
           ; idx + 4 cannot be folded into a base-register choice (past the last
           ; element), so the CHECKs expect the +4 added into the index instead.
     558   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
     559   %offset = add i32 %idx, 4
     560   %value = extractelement <4 x i32> %ld, i32 %offset
     561   store i32 %value, i32 addrspace(1)* %out
     562   ret void
     563 }
    564 
    565 ; Test that the or is folded into the base address register instead of
    566 ; added to m0
    567 
    568 ; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
    569 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
    570 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
    571 ; GCN-NOT: [[IDX_SHL]]
    572 
    573 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
    574 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    575 
    576 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
    577 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    578 ; IDXMODE: s_set_gpr_idx_off
     579 define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
     580 entry:
     581   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
           ; (idx << 2) | 1: the OR-ed low bit should fold into the base-register
           ; selection, leaving only the shifted value in m0 (see GCN-NOT above).
     582   %idx.shl = shl i32 %idx.in, 2
     583   %idx = or i32 %idx.shl, 1
     584   %value = extractelement <4 x i32> %ld, i32 %idx
     585   store i32 %value, i32 addrspace(1)* %out
     586   ret void
     587 }
    588 
    589 ; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
    590 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
    591 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
    592 ; GCN-NOT: [[IDX_SHL]]
    593 
    594 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
    595 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    596 
    597 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
    598 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    599 ; IDXMODE: s_set_gpr_idx_off
     600 define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
           ; Insert counterpart of extractelement_v4i32_or_index: the OR-ed low index
           ; bit folds into the destination register choice, not into m0.
     601   %idx.shl = shl i32 %idx.in, 2
     602   %idx = or i32 %idx.shl, 1
     603   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
     604   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
     605   ret void
     606 }
    607 
    608 ; GCN-LABEL: {{^}}broken_phi_bb:
    609 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
    610 
    611 ; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
    612 
    613 ; GCN: {{^BB[0-9]+_[0-9]+}}:
    614 ; GCN: s_mov_b64 exec,
    615 
    616 ; GCN: [[BB2]]:
    617 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
    618 ; GCN: buffer_load_dword
    619 
    620 ; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
    621 ; MOVREL: v_movreld_b32_e32
    622 
    623 ; IDXMODE: s_set_gpr_idx_on
    624 ; IDXMODE: v_mov_b32_e32
    625 ; IDXMODE: s_set_gpr_idx_off
    626 
    627 ; GCN: s_cbranch_execnz [[REGLOOP]]
     628 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
     629 bb:
           ; Loop whose body performs divergent-index inserts (index loaded as a
           ; VGPR); regression test for phi handling when the waterfall loop splits
           ; the block.
     630   br label %bb2
     631 
     632 bb2:                                              ; preds = %bb4, %bb
     633   %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
     634   %tmp3 = icmp slt i32 %tmp, %arg
     635   br i1 %tmp3, label %bb4, label %bb8
     636 
     637 bb4:                                              ; preds = %bb2
     638   %vgpr = load volatile i32, i32 addrspace(1)* undef
     639   %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
     640   %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
     641   %tmp7 = extractelement <8 x i32> %tmp6, i32 0
     642   br label %bb2
     643 
     644 bb8:                                              ; preds = %bb2
     645   ret void
     646 }
    647 
    648 declare i32 @llvm.amdgcn.workitem.id.x() #1
    649 declare void @llvm.amdgcn.s.barrier() #2
    650 
    651 attributes #0 = { nounwind }
    652 attributes #1 = { nounwind readnone }
    653 attributes #2 = { nounwind convergent }
    654