Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
      3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
      4 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
      5 
      6 ; Check selection of ds_read_b128 / ds_write_b128 when +enable-ds128 is set.
      7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
      8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
      9 
     10 ; FUNC-LABEL: {{^}}local_load_i16:
     11 ; GFX9-NOT: m0
     12 ; SICIVI: s_mov_b32 m0
     13 
     14 ; GCN: ds_read_u16 v{{[0-9]+}}
     15 
     16 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
     17 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
     18 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
     19 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
     20 ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
     21 define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
     22 entry:
     23   %val = load i16, i16 addrspace(3)* %in ; scalar i16 read from addrspace(3)
     24   store i16 %val, i16 addrspace(3)* %out
     25   ret void
     26 }
     27 
     28 ; FUNC-LABEL: {{^}}local_load_v2i16:
     29 ; GFX9-NOT: m0
     30 ; SICIVI: s_mov_b32 m0
     31 
     32 ; GCN: ds_read_b32
     33 
     34 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
     35 ; EG: LDS_READ_RET {{.*}} [[FROM]]
     36 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
     37 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
     38 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
     39 define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
     40 entry:
     41   %val = load <2 x i16>, <2 x i16> addrspace(3)* %in ; two i16 lanes read from addrspace(3)
     42   store <2 x i16> %val, <2 x i16> addrspace(3)* %out
     43   ret void
     44 }
     45 
     46 ; FUNC-LABEL: {{^}}local_load_v3i16:
     47 ; GFX9-NOT: m0
     48 ; SICIVI: s_mov_b32 m0
     49 
     50 ; GCN: ds_read_b64
     51 ; GCN-DAG: ds_write_b32
     52 ; GCN-DAG: ds_write_b16
     53 
     54 ; EG-DAG: LDS_USHORT_READ_RET
     55 ; EG-DAG: LDS_READ_RET
     56 define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
     57 entry:
     58   %val = load <3 x i16>, <3 x i16> addrspace(3)* %in ; odd-sized three-lane i16 vector
     59   store <3 x i16> %val, <3 x i16> addrspace(3)* %out
     60   ret void
     61 }
     62 
     63 ; FUNC-LABEL: {{^}}local_load_v4i16:
     64 ; GFX9-NOT: m0
     65 ; SICIVI: s_mov_b32 m0
     66 
     67 ; GCN: ds_read_b64
     68 
     69 ; EG: LDS_READ_RET
     70 ; EG: LDS_READ_RET
     71 define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
     72 entry:
     73   %val = load <4 x i16>, <4 x i16> addrspace(3)* %in ; four i16 lanes read from addrspace(3)
     74   store <4 x i16> %val, <4 x i16> addrspace(3)* %out
     75   ret void
     76 }
     77 
     78 ; FUNC-LABEL: {{^}}local_load_v8i16:
     79 ; GFX9-NOT: m0
     80 ; SICIVI: s_mov_b32 m0
     81 
     82 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
     83 
     84 ; EG: LDS_READ_RET
     85 ; EG: LDS_READ_RET
     86 ; EG: LDS_READ_RET
     87 ; EG: LDS_READ_RET
     88 define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
     89 entry:
     90   %val = load <8 x i16>, <8 x i16> addrspace(3)* %in ; eight i16 lanes read from addrspace(3)
     91   store <8 x i16> %val, <8 x i16> addrspace(3)* %out
     92   ret void
     93 }
     94 
     95 ; FUNC-LABEL: {{^}}local_load_v16i16:
     96 ; GFX9-NOT: m0
     97 ; SICIVI: s_mov_b32 m0
     98 
     99 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
    100 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
    101 
    102 
    103 ; EG: LDS_READ_RET
    104 ; EG: LDS_READ_RET
    105 ; EG: LDS_READ_RET
    106 ; EG: LDS_READ_RET
    107 
    108 ; EG: LDS_READ_RET
    109 ; EG: LDS_READ_RET
    110 ; EG: LDS_READ_RET
    111 ; EG: LDS_READ_RET
    112 define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
    113 entry:
    114   %val = load <16 x i16>, <16 x i16> addrspace(3)* %in ; sixteen i16 lanes read from addrspace(3)
    115   store <16 x i16> %val, <16 x i16> addrspace(3)* %out
    116   ret void
    117 }
    118 
    119 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
    120 ; GFX9-NOT: m0
    121 ; SICIVI: s_mov_b32 m0
    122 
    123 ; GCN: ds_read_u16
    124 ; GCN: ds_write_b32
    125 
    126 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    127 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    128 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
    129 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    130 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    131 define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
    132   %narrow = load i16, i16 addrspace(3)* %in
    133   %wide = zext i16 %narrow to i32 ; zero-extend the loaded i16 to i32
    134   store i32 %wide, i32 addrspace(3)* %out
    135   ret void
    136 }
    137 
    138 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
    139 ; GCN-NOT: s_wqm_b64
    140 
    141 ; GFX9-NOT: m0
    142 ; SICIVI: s_mov_b32 m0
    143 
    144 ; GCN: ds_read_i16
    145 
    146 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    147 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    148 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
    149 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    150 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
    151 ; EG: 16
    152 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    153 define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
    154   %narrow = load i16, i16 addrspace(3)* %in
    155   %wide = sext i16 %narrow to i32 ; sign-extend the loaded i16 to i32
    156   store i32 %wide, i32 addrspace(3)* %out
    157   ret void
    158 }
    159 
    160 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
    161 ; GFX9-NOT: m0
    162 ; SICIVI: s_mov_b32 m0
    163 
    164 ; GCN: ds_read_u16
    165 
    166 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    167 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    168 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
    169 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    170 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    171 define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
    172   %src = load <1 x i16>, <1 x i16> addrspace(3)* %in
    173   %wide = zext <1 x i16> %src to <1 x i32> ; single-lane vector zero-extended to i32
    174   store <1 x i32> %wide, <1 x i32> addrspace(3)* %out
    175   ret void
    176 }
    177 
    178 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
    179 ; GFX9-NOT: m0
    180 ; SICIVI: s_mov_b32 m0
    181 
    182 ; GCN: ds_read_i16
    183 
    184 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    185 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    186 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
    187 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    188 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
    189 ; EG: 16
    190 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    191 define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
    192   %src = load <1 x i16>, <1 x i16> addrspace(3)* %in
    193   %wide = sext <1 x i16> %src to <1 x i32> ; single-lane vector sign-extended to i32
    194   store <1 x i32> %wide, <1 x i32> addrspace(3)* %out
    195   ret void
    196 }
    197 
    198 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
    199 ; GCN-NOT: s_wqm_b64
    200 ; GFX9-NOT: m0
    201 ; SICIVI: s_mov_b32 m0
    202 
    203 ; GCN: ds_read_b32
    204 
    205 ; EG: LDS_READ_RET
    206 define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
    207   %src = load <2 x i16>, <2 x i16> addrspace(3)* %in
    208   %wide = zext <2 x i16> %src to <2 x i32> ; zero-extend both lanes to i32
    209   store <2 x i32> %wide, <2 x i32> addrspace(3)* %out
    210   ret void
    211 }
    212 
    213 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
    214 ; GCN-NOT: s_wqm_b64
    215 ; GFX9-NOT: m0
    216 ; SICIVI: s_mov_b32 m0
    217 
    218 ; GCN: ds_read_b32
    219 
    220 ; EG: LDS_READ_RET
    221 ; EG: BFE_INT
    222 ; EG: BFE_INT
    223 define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
    224   %src = load <2 x i16>, <2 x i16> addrspace(3)* %in
    225   %wide = sext <2 x i16> %src to <2 x i32> ; sign-extend both lanes to i32
    226   store <2 x i32> %wide, <2 x i32> addrspace(3)* %out
    227   ret void
    228 }
    229 
    230 ; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
    231 ; GFX9-NOT: m0
    232 ; SICIVI: s_mov_b32 m0
    233 
    234 ; GCN: ds_read_b64
    235 ; GCN-DAG: ds_write_b32
    236 ; GCN-DAG: ds_write_b64
    237 
    238 ; EG: LDS_READ_RET
    239 define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
    240 entry:
    241   %src = load <3 x i16>, <3 x i16> addrspace(3)* %in
    242   %wide = zext <3 x i16> %src to <3 x i32> ; zero-extend the three lanes to i32
    243   store <3 x i32> %wide, <3 x i32> addrspace(3)* %out
    244   ret void
    245 }
    246 
    247 ; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
    248 ; GFX9-NOT: m0
    249 ; SICIVI: s_mov_b32 m0
    250 
    251 ; GCN: ds_read_b64
    252 ; GCN-DAG: ds_write_b32
    253 ; GCN-DAG: ds_write_b64
    254 
    255 ; EG: LDS_READ_RET
    256 ; EG-DAG: BFE_INT
    257 ; EG-DAG: BFE_INT
    258 ; EG-DAG: BFE_INT
    259 define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
    260 entry:
    261   %src = load <3 x i16>, <3 x i16> addrspace(3)* %in
    262   %wide = sext <3 x i16> %src to <3 x i32> ; sign-extend the three lanes to i32
    263   store <3 x i32> %wide, <3 x i32> addrspace(3)* %out
    264   ret void
    265 }
    266 
    267 ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
    268 ; GCN-NOT: s_wqm_b64
    269 ; GFX9-NOT: m0
    270 ; SICIVI: s_mov_b32 m0
    271 
    272 ; GCN: ds_read_b64
    273 
    274 ; EG: LDS_READ_RET
    275 ; EG: LDS_READ_RET
    276 define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
    277   %src = load <4 x i16>, <4 x i16> addrspace(3)* %in
    278   %wide = zext <4 x i16> %src to <4 x i32> ; zero-extend the four lanes to i32
    279   store <4 x i32> %wide, <4 x i32> addrspace(3)* %out
    280   ret void
    281 }
    282 
    283 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
    284 ; GCN-NOT: s_wqm_b64
    285 ; GFX9-NOT: m0
    286 ; SICIVI: s_mov_b32 m0
    287 
    288 ; GCN: ds_read_b64
    289 
    290 ; EG: LDS_READ_RET
    291 ; EG: LDS_READ_RET
    292 ; EG-DAG: BFE_INT
    293 ; EG-DAG: BFE_INT
    294 ; EG-DAG: BFE_INT
    295 ; EG-DAG: BFE_INT
    296 define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
    297   %src = load <4 x i16>, <4 x i16> addrspace(3)* %in
    298   %wide = sext <4 x i16> %src to <4 x i32> ; sign-extend the four lanes to i32
    299   store <4 x i32> %wide, <4 x i32> addrspace(3)* %out
    300   ret void
    301 }
    302 
    303 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
    304 ; GFX9-NOT: m0
    305 ; SICIVI: s_mov_b32 m0
    306 
    307 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    308 
    309 ; EG: LDS_READ_RET
    310 ; EG: LDS_READ_RET
    311 ; EG: LDS_READ_RET
    312 ; EG: LDS_READ_RET
    313 define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
    314   %src = load <8 x i16>, <8 x i16> addrspace(3)* %in
    315   %wide = zext <8 x i16> %src to <8 x i32> ; zero-extend the eight lanes to i32
    316   store <8 x i32> %wide, <8 x i32> addrspace(3)* %out
    317   ret void
    318 }
    319 
    320 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
    321 ; GFX9-NOT: m0
    322 ; SICIVI: s_mov_b32 m0
    323 
    324 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    325 
    326 ; EG: LDS_READ_RET
    327 ; EG: LDS_READ_RET
    328 ; EG: LDS_READ_RET
    329 ; EG: LDS_READ_RET
    330 ; EG-DAG: BFE_INT
    331 ; EG-DAG: BFE_INT
    332 ; EG-DAG: BFE_INT
    333 ; EG-DAG: BFE_INT
    334 ; EG-DAG: BFE_INT
    335 ; EG-DAG: BFE_INT
    336 ; EG-DAG: BFE_INT
    337 ; EG-DAG: BFE_INT
    338 define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
    339   %src = load <8 x i16>, <8 x i16> addrspace(3)* %in
    340   %wide = sext <8 x i16> %src to <8 x i32> ; sign-extend the eight lanes to i32
    341   store <8 x i32> %wide, <8 x i32> addrspace(3)* %out
    342   ret void
    343 }
    344 
    345 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
    346 ; GFX9-NOT: m0
    347 ; SICIVI: s_mov_b32 m0
    348 
    349 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    350 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
    351 
    352 ; GCN: ds_write2_b64
    353 ; GCN: ds_write2_b64
    354 ; GCN: ds_write2_b64
    355 ; GCN: ds_write2_b64
    356 
    357 ; EG: LDS_READ_RET
    358 ; EG: LDS_READ_RET
    359 ; EG: LDS_READ_RET
    360 ; EG: LDS_READ_RET
    361 ; EG: LDS_READ_RET
    362 ; EG: LDS_READ_RET
    363 ; EG: LDS_READ_RET
    364 ; EG: LDS_READ_RET
    365 define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
    366   %src = load <16 x i16>, <16 x i16> addrspace(3)* %in
    367   %wide = zext <16 x i16> %src to <16 x i32> ; zero-extend the sixteen lanes to i32
    368   store <16 x i32> %wide, <16 x i32> addrspace(3)* %out
    369   ret void
    370 }
    371 
    372 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
    373 ; GFX9-NOT: m0
    374 ; SICIVI: s_mov_b32 m0
    375 
    376 
    377 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    378 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
    379 
    380 ; EG: LDS_READ_RET
    381 ; EG: LDS_READ_RET
    382 ; EG: LDS_READ_RET
    383 ; EG: LDS_READ_RET
    384 ; EG: LDS_READ_RET
    385 ; EG: LDS_READ_RET
    386 ; EG: LDS_READ_RET
    387 ; EG: LDS_READ_RET
    388 ; EG-DAG: BFE_INT
    389 ; EG-DAG: BFE_INT
    390 ; EG-DAG: BFE_INT
    391 ; EG-DAG: BFE_INT
    392 ; EG-DAG: BFE_INT
    393 ; EG-DAG: BFE_INT
    394 ; EG-DAG: BFE_INT
    395 ; EG-DAG: BFE_INT
    396 ; EG-DAG: BFE_INT
    397 ; EG-DAG: BFE_INT
    398 ; EG-DAG: BFE_INT
    399 ; EG-DAG: BFE_INT
    400 ; EG-DAG: BFE_INT
    401 ; EG-DAG: BFE_INT
    402 ; EG-DAG: BFE_INT
    403 ; EG-DAG: BFE_INT
    404 define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
    405   %src = load <16 x i16>, <16 x i16> addrspace(3)* %in
    406   %wide = sext <16 x i16> %src to <16 x i32> ; sign-extend the sixteen lanes to i32
    407   store <16 x i32> %wide, <16 x i32> addrspace(3)* %out
    408   ret void
    409 }
    410 
    411 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
    412 ; GFX9-NOT: m0
    413 ; SICIVI: s_mov_b32 m0
    414 
    415 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    416 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
    417 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
    418 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
    419 
    420 ; EG: LDS_READ_RET
    421 ; EG: LDS_READ_RET
    422 ; EG: LDS_READ_RET
    423 ; EG: LDS_READ_RET
    424 ; EG: LDS_READ_RET
    425 ; EG: LDS_READ_RET
    426 ; EG: LDS_READ_RET
    427 ; EG: LDS_READ_RET
    428 ; EG: LDS_READ_RET
    429 ; EG: LDS_READ_RET
    430 ; EG: LDS_READ_RET
    431 ; EG: LDS_READ_RET
    432 ; EG: LDS_READ_RET
    433 ; EG: LDS_READ_RET
    434 ; EG: LDS_READ_RET
    435 ; EG: LDS_READ_RET
    436 define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
    437   %src = load <32 x i16>, <32 x i16> addrspace(3)* %in
    438   %wide = zext <32 x i16> %src to <32 x i32> ; zero-extend the thirty-two lanes to i32
    439   store <32 x i32> %wide, <32 x i32> addrspace(3)* %out
    440   ret void
    441 }
    442 
    443 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
    444 ; GFX9-NOT: m0
    445 ; SICIVI: s_mov_b32 m0
    446 
    447 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
    448 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
    449 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    450 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
    451 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
    452 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
    453 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
    454 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
    455 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
    456 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
    457 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
    458 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
    459 
    460 ; EG: LDS_READ_RET
    461 ; EG: LDS_READ_RET
    462 ; EG: LDS_READ_RET
    463 ; EG: LDS_READ_RET
    464 ; EG: LDS_READ_RET
    465 ; EG: LDS_READ_RET
    466 ; EG: LDS_READ_RET
    467 ; EG: LDS_READ_RET
    468 ; EG: LDS_READ_RET
    469 ; EG: LDS_READ_RET
    470 ; EG: LDS_READ_RET
    471 ; EG: LDS_READ_RET
    472 ; EG: LDS_READ_RET
    473 ; EG: LDS_READ_RET
    474 ; EG: LDS_READ_RET
    475 ; EG: LDS_READ_RET
    476 define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
    477   %src = load <32 x i16>, <32 x i16> addrspace(3)* %in
    478   %wide = sext <32 x i16> %src to <32 x i32> ; sign-extend the thirty-two lanes to i32
    479   store <32 x i32> %wide, <32 x i32> addrspace(3)* %out
    480   ret void
    481 }
    482 
    483 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
    484 ; GFX9-NOT: m0
    485 ; SICIVI: s_mov_b32 m0
    486 
    487 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
    488 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
    489 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
    490 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
    491 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
    492 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
    493 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
    494 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
    495 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
    496 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
    497 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
    498 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
    499 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
    500 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
    501 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
    502 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
    503 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
    504 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
    505 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
    506 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
    507 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
    508 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
    509 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
    510 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
    511 
    512 ; EG: LDS_READ_RET
    513 ; EG: LDS_READ_RET
    514 ; EG: LDS_READ_RET
    515 ; EG: LDS_READ_RET
    516 ; EG: LDS_READ_RET
    517 ; EG: LDS_READ_RET
    518 ; EG: LDS_READ_RET
    519 ; EG: LDS_READ_RET
    520 ; EG: LDS_READ_RET
    521 ; EG: LDS_READ_RET
    522 ; EG: LDS_READ_RET
    523 ; EG: LDS_READ_RET
    524 ; EG: LDS_READ_RET
    525 ; EG: LDS_READ_RET
    526 ; EG: LDS_READ_RET
    527 ; EG: LDS_READ_RET
    528 ; EG: LDS_READ_RET
    529 ; EG: LDS_READ_RET
    530 ; EG: LDS_READ_RET
    531 ; EG: LDS_READ_RET
    532 ; EG: LDS_READ_RET
    533 ; EG: LDS_READ_RET
    534 ; EG: LDS_READ_RET
    535 ; EG: LDS_READ_RET
    536 ; EG: LDS_READ_RET
    537 ; EG: LDS_READ_RET
    538 ; EG: LDS_READ_RET
    539 ; EG: LDS_READ_RET
    540 ; EG: LDS_READ_RET
    541 ; EG: LDS_READ_RET
    542 ; EG: LDS_READ_RET
    543 ; EG: LDS_READ_RET
    544 define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
    545   %src = load <64 x i16>, <64 x i16> addrspace(3)* %in
    546   %wide = zext <64 x i16> %src to <64 x i32> ; zero-extend the sixty-four lanes to i32
    547   store <64 x i32> %wide, <64 x i32> addrspace(3)* %out
    548   ret void
    549 }
    550 
    551 ; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
    552 ; GFX9-NOT: m0
    553 ; SICIVI: s_mov_b32 m0
    554 
    555 ; EG: LDS_READ_RET
    556 ; EG: LDS_READ_RET
    557 ; EG: LDS_READ_RET
    558 ; EG: LDS_READ_RET
    559 ; EG: LDS_READ_RET
    560 ; EG: LDS_READ_RET
    561 ; EG: LDS_READ_RET
    562 ; EG: LDS_READ_RET
    563 ; EG: LDS_READ_RET
    564 ; EG: LDS_READ_RET
    565 ; EG: LDS_READ_RET
    566 ; EG: LDS_READ_RET
    567 ; EG: LDS_READ_RET
    568 ; EG: LDS_READ_RET
    569 ; EG: LDS_READ_RET
    570 ; EG: LDS_READ_RET
    571 ; EG: LDS_READ_RET
    572 ; EG: LDS_READ_RET
    573 ; EG: LDS_READ_RET
    574 ; EG: LDS_READ_RET
    575 ; EG: LDS_READ_RET
    576 ; EG: LDS_READ_RET
    577 ; EG: LDS_READ_RET
    578 ; EG: LDS_READ_RET
    579 ; EG: LDS_READ_RET
    580 ; EG: LDS_READ_RET
    581 ; EG: LDS_READ_RET
    582 ; EG: LDS_READ_RET
    583 ; EG: LDS_READ_RET
    584 ; EG: LDS_READ_RET
    585 ; EG: LDS_READ_RET
    586 ; EG: LDS_READ_RET
    587 define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
    588   %src = load <64 x i16>, <64 x i16> addrspace(3)* %in
    589   %wide = sext <64 x i16> %src to <64 x i32> ; sign-extend the sixty-four lanes to i32
    590   store <64 x i32> %wide, <64 x i32> addrspace(3)* %out
    591   ret void
    592 }
    593 
    594 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
    595 ; GFX9-NOT: m0
    596 ; SICIVI: s_mov_b32 m0
    597 
    598 ; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
    599 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
    600 
    601 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
    602 
    603 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    604 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    605 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
    606 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    607 ; EG-DAG: LDS_WRITE
    608 define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
    609   %narrow = load i16, i16 addrspace(3)* %in
    610   %wide = zext i16 %narrow to i64 ; zero-extend the loaded i16 to i64
    611   store i64 %wide, i64 addrspace(3)* %out
    612   ret void
    613 }
    614 
    615 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
    616 ; GFX9-NOT: m0
    617 ; SICIVI: s_mov_b32 m0
    618 
    619 ; FIXME: Need to optimize this sequence to avoid an extra shift.
    620 ;  t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
    621 ;          t28: i64 = any_extend t25
    622 ;        t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
    623 ; SI: ds_read_i16 v[[LO:[0-9]+]],
    624 ; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
    625 ; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
    626 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
    627 
    628 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
    629 
    630 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    631 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    632 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
    633 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    634 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
    635 ; EG-DAG: LDS_WRITE
    636 ; EG-DAG: 16
    637 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    638 define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
    639   %narrow = load i16, i16 addrspace(3)* %in
    640   %wide = sext i16 %narrow to i64 ; sign-extend the loaded i16 to i64
    641   store i64 %wide, i64 addrspace(3)* %out
    642   ret void
    643 }
    644 
    645 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
    646 ; GFX9-NOT: m0
    647 ; SICIVI: s_mov_b32 m0
    648 
    649 
    650 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    651 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    652 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
    653 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    654 ; EG-DAG: LDS_WRITE
    655 define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
    656   %src = load <1 x i16>, <1 x i16> addrspace(3)* %in
    657   %wide = zext <1 x i16> %src to <1 x i64> ; single-lane vector zero-extended to i64
    658   store <1 x i64> %wide, <1 x i64> addrspace(3)* %out
    659   ret void
    660 }
    661 
    662 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
    663 ; GFX9-NOT: m0
    664 ; SICIVI: s_mov_b32 m0
    665 
    666 
    667 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
    668 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
    669 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
    670 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
    671 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
    672 ; EG-DAG: LDS_WRITE
    673 ; EG-DAG: 16
    674 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
    675 define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
    676   %src = load <1 x i16>, <1 x i16> addrspace(3)* %in
    677   %wide = sext <1 x i16> %src to <1 x i64> ; single-lane vector sign-extended to i64
    678   store <1 x i64> %wide, <1 x i64> addrspace(3)* %out
    679   ret void
    680 }
    681 
    682 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
    683 ; GFX9-NOT: m0
    684 ; SICIVI: s_mov_b32 m0
    685 
    686 
    687 ; EG: LDS_READ_RET
    688 define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
    689   %src = load <2 x i16>, <2 x i16> addrspace(3)* %in
    690   %wide = zext <2 x i16> %src to <2 x i64> ; zero-extend both lanes to i64
    691   store <2 x i64> %wide, <2 x i64> addrspace(3)* %out
    692   ret void
    693 }
    694 
    695 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
    696 ; GFX9-NOT: m0
    697 ; SICIVI: s_mov_b32 m0
    698 
    699 
    700 ; EG: LDS_READ_RET
    701 ; EG-DAG: BFE_INT
    702 ; EG-DAG: ASHR
    703 define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
    704   %src = load <2 x i16>, <2 x i16> addrspace(3)* %in
    705   %wide = sext <2 x i16> %src to <2 x i64> ; sign-extend both lanes to i64
    706   store <2 x i64> %wide, <2 x i64> addrspace(3)* %out
    707   ret void
    708 }
    709 
    710 ; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
    711 ; GFX9-NOT: m0
    712 ; SICIVI: s_mov_b32 m0
    713 
    714 
    715 ; EG: LDS_READ_RET
    716 ; EG: LDS_READ_RET
    717 define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
    718   %src = load <4 x i16>, <4 x i16> addrspace(3)* %in
    719   %wide = zext <4 x i16> %src to <4 x i64> ; zero-extend the four lanes to i64
    720   store <4 x i64> %wide, <4 x i64> addrspace(3)* %out
    721   ret void
    722 }
    723 
    724 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
    725 ; GFX9-NOT: m0
    726 ; SICIVI: s_mov_b32 m0
    727 
    728 
    729 ; EG: LDS_READ_RET
    730 ; EG: LDS_READ_RET
    731 ; EG-DAG: BFE_INT
    732 ; EG-DAG: BFE_INT
    733 ; EG-DAG: ASHR
    734 ; EG-DAG: ASHR
    735 define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
    736   %src = load <4 x i16>, <4 x i16> addrspace(3)* %in
    737   %wide = sext <4 x i16> %src to <4 x i64> ; sign-extend the four lanes to i64
    738   store <4 x i64> %wide, <4 x i64> addrspace(3)* %out
    739   ret void
    740 }
    741 
    742 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
    743 ; GFX9-NOT: m0
    744 ; SICIVI: s_mov_b32 m0
    745 
    746 
    747 ; EG: LDS_READ_RET
    748 ; EG: LDS_READ_RET
    749 ; EG: LDS_READ_RET
    750 ; EG: LDS_READ_RET
    751 define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
    752   %src = load <8 x i16>, <8 x i16> addrspace(3)* %in
    753   %wide = zext <8 x i16> %src to <8 x i64> ; zero-extend the eight lanes to i64
    754   store <8 x i64> %wide, <8 x i64> addrspace(3)* %out
    755   ret void
    756 }
    757 
    758 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
    759 ; GFX9-NOT: m0
    760 ; SICIVI: s_mov_b32 m0
    761 
    762 
    763 ; EG: LDS_READ_RET
    764 ; EG: LDS_READ_RET
    765 ; EG: LDS_READ_RET
    766 ; EG: LDS_READ_RET
    767 ; EG-DAG: BFE_INT
    768 ; EG-DAG: BFE_INT
    769 ; EG-DAG: ASHR
    770 ; EG-DAG: ASHR
    771 ; EG-DAG: BFE_INT
    772 ; EG-DAG: BFE_INT
    773 ; EG-DAG: ASHR
    774 ; EG-DAG: ASHR
    775 define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
    776   %src = load <8 x i16>, <8 x i16> addrspace(3)* %in
    777   %wide = sext <8 x i16> %src to <8 x i64> ; sign-extend the eight lanes to i64
    778   store <8 x i64> %wide, <8 x i64> addrspace(3)* %out
    779   ret void
    780 }
    781 
    782 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
        ; Zero-extending <16 x i16> -> <16 x i64> load/store through addrspace(3) pointers.
        ; Runs checked with the GFX9 prefix expect no m0 after this label; runs checked
        ; with the SICIVI prefix expect an s_mov_b32 m0.
        ; The r600/redwood run expects eight LDS_READ_RET matches, in order.
    783 ; GFX9-NOT: m0
    784 ; SICIVI: s_mov_b32 m0
    785
    786
    787 ; EG: LDS_READ_RET
    788 ; EG: LDS_READ_RET
    789 ; EG: LDS_READ_RET
    790 ; EG: LDS_READ_RET
    791 ; EG: LDS_READ_RET
    792 ; EG: LDS_READ_RET
    793 ; EG: LDS_READ_RET
    794 ; EG: LDS_READ_RET
    795 define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
          ; Load sixteen i16 lanes, widen each to i64 with zext, store the result.
    796   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
    797   %ext = zext <16 x i16> %load to <16 x i64>
    798   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
    799   ret void
    800 }
    801 
    802 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
        ; Sign-extending <16 x i16> -> <16 x i64> load/store through addrspace(3) pointers.
        ; Runs checked with the GFX9 prefix expect no m0 after this label; runs checked
        ; with the SICIVI prefix expect an s_mov_b32 m0.
        ; The r600/redwood run expects eight LDS_READ_RET matches in order, followed by
        ; eight BFE_INT and eight ASHR patterns that may match in any order.
        ; NOTE(review): the RUN lines pass -allow-deprecated-dag-overlap, so identical
        ; order-independent patterns are not guaranteed to match distinct instructions.
    803 ; GFX9-NOT: m0
    804 ; SICIVI: s_mov_b32 m0
    805
    806
    807 ; EG: LDS_READ_RET
    808 ; EG: LDS_READ_RET
    809 ; EG: LDS_READ_RET
    810 ; EG: LDS_READ_RET
    811 ; EG: LDS_READ_RET
    812 ; EG: LDS_READ_RET
    813 ; EG: LDS_READ_RET
    814 ; EG: LDS_READ_RET
    815 ; EG-DAG: BFE_INT
    816 ; EG-DAG: BFE_INT
    817 ; EG-DAG: ASHR
    818 ; EG-DAG: ASHR
    819 ; EG-DAG: BFE_INT
    820 ; EG-DAG: BFE_INT
    821 ; EG-DAG: ASHR
    822 ; EG-DAG: ASHR
    823 ; EG-DAG: BFE_INT
    824 ; EG-DAG: BFE_INT
    825 ; EG-DAG: ASHR
    826 ; EG-DAG: ASHR
    827 ; EG-DAG: BFE_INT
    828 ; EG-DAG: BFE_INT
    829 ; EG-DAG: ASHR
    830 ; EG-DAG: ASHR
    831 define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
          ; Load sixteen i16 lanes, widen each to i64 with sext, store the result.
    832   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
    833   %ext = sext <16 x i16> %load to <16 x i64>
    834   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
    835   ret void
    836 }
    837 
    838 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
        ; Zero-extending <32 x i16> -> <32 x i64> load/store through addrspace(3) pointers.
        ; Runs checked with the GFX9 prefix expect no m0 after this label; runs checked
        ; with the SICIVI prefix expect an s_mov_b32 m0.
        ; The r600/redwood run expects sixteen LDS_READ_RET matches, in order.
    839 ; GFX9-NOT: m0
    840 ; SICIVI: s_mov_b32 m0
    841
    842
    843 ; EG: LDS_READ_RET
    844 ; EG: LDS_READ_RET
    845 ; EG: LDS_READ_RET
    846 ; EG: LDS_READ_RET
    847 ; EG: LDS_READ_RET
    848 ; EG: LDS_READ_RET
    849 ; EG: LDS_READ_RET
    850 ; EG: LDS_READ_RET
    851 ; EG: LDS_READ_RET
    852 ; EG: LDS_READ_RET
    853 ; EG: LDS_READ_RET
    854 ; EG: LDS_READ_RET
    855 ; EG: LDS_READ_RET
    856 ; EG: LDS_READ_RET
    857 ; EG: LDS_READ_RET
    858 ; EG: LDS_READ_RET
    859 define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
          ; Load thirty-two i16 lanes, widen each to i64 with zext, store the result.
    860   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
    861   %ext = zext <32 x i16> %load to <32 x i64>
    862   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
    863   ret void
    864 }
    865 
    866 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
        ; Sign-extending <32 x i16> -> <32 x i64> load/store through addrspace(3) pointers.
        ; Runs checked with the GFX9 prefix expect no m0 after this label; runs checked
        ; with the SICIVI prefix expect an s_mov_b32 m0.
        ; The r600/redwood run expects sixteen LDS_READ_RET matches in order, followed by
        ; sixteen BFE_INT and sixteen ASHR patterns that may match in any order.
        ; NOTE(review): the RUN lines pass -allow-deprecated-dag-overlap, so identical
        ; order-independent patterns are not guaranteed to match distinct instructions.
    867 ; GFX9-NOT: m0
    868 ; SICIVI: s_mov_b32 m0
    869
    870
    871 ; EG: LDS_READ_RET
    872 ; EG: LDS_READ_RET
    873 ; EG: LDS_READ_RET
    874 ; EG: LDS_READ_RET
    875 ; EG: LDS_READ_RET
    876 ; EG: LDS_READ_RET
    877 ; EG: LDS_READ_RET
    878 ; EG: LDS_READ_RET
    879 ; EG: LDS_READ_RET
    880 ; EG: LDS_READ_RET
    881 ; EG: LDS_READ_RET
    882 ; EG: LDS_READ_RET
    883 ; EG: LDS_READ_RET
    884 ; EG: LDS_READ_RET
    885 ; EG: LDS_READ_RET
    886 ; EG: LDS_READ_RET
    887 ; EG-DAG: BFE_INT
    888 ; EG-DAG: BFE_INT
    889 ; EG-DAG: ASHR
    890 ; EG-DAG: ASHR
    891 ; EG-DAG: BFE_INT
    892 ; EG-DAG: BFE_INT
    893 ; EG-DAG: ASHR
    894 ; EG-DAG: ASHR
    895 ; EG-DAG: BFE_INT
    896 ; EG-DAG: BFE_INT
    897 ; EG-DAG: ASHR
    898 ; EG-DAG: ASHR
    899 ; EG-DAG: BFE_INT
    900 ; EG-DAG: BFE_INT
    901 ; EG-DAG: ASHR
    902 ; EG-DAG: ASHR
    903 ; EG-DAG: BFE_INT
    904 ; EG-DAG: BFE_INT
    905 ; EG-DAG: ASHR
    906 ; EG-DAG: ASHR
    907 ; EG-DAG: BFE_INT
    908 ; EG-DAG: BFE_INT
    909 ; EG-DAG: ASHR
    910 ; EG-DAG: ASHR
    911 ; EG-DAG: BFE_INT
    912 ; EG-DAG: BFE_INT
    913 ; EG-DAG: ASHR
    914 ; EG-DAG: ASHR
    915 ; EG-DAG: BFE_INT
    916 ; EG-DAG: BFE_INT
    917 ; EG-DAG: ASHR
    918 ; EG-DAG: ASHR
    919 define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
          ; Load thirty-two i16 lanes, widen each to i64 with sext, store the result.
    920   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
    921   %ext = sext <32 x i16> %load to <32 x i64>
    922   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
    923   ret void
    924 }
    925 
    926 ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
    927 ; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
    928 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
    929 ;   %ext = zext <64 x i16> %load to <64 x i64>
    930 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
    931 ;   ret void
    932 ; }
    933 
    934 ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
    935 ; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
    936 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
    937 ;   %ext = sext <64 x i16> %load to <64 x i64>
    938 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
    939 ;   ret void
    940 ; }
    941 
    942 ; Tests that ds_read_b128/ds_write_b128 are generated for a 16-byte-aligned load and store.
        ; The runs with -mattr=+enable-ds128 (tonga and gfx900, CIVI prefix) expect the
        ; b128 forms; the default amdgcn run (SI prefix) expects neither of them.
        ; The r600/redwood run expects four LDS_READ_RET matches, in order.
    943 ; FUNC-LABEL: {{^}}local_v8i16_to_128:
    944
    945 ; SI-NOT: ds_read_b128
    946 ; SI-NOT: ds_write_b128
    947
    948 ; CIVI: ds_read_b128
    949 ; CIVI: ds_write_b128
    950
    951 ; EG: LDS_READ_RET
    952 ; EG: LDS_READ_RET
    953 ; EG: LDS_READ_RET
    954 ; EG: LDS_READ_RET
    955 define amdgpu_kernel void @local_v8i16_to_128(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
          ; Plain 128-bit copy: both the load and the store carry an explicit
          ; "align 16", and no extension is applied to the <8 x i16> value.
    956   %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
    957   store <8 x i16> %ld, <8 x i16> addrspace(3)* %out, align 16
    958   ret void
    959 }
    960 
    961 attributes #0 = { nounwind } ; attribute group referenced as #0 by the extending-load kernels in this file
    962