; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s

; FUNC-LABEL: {{^}}constant_load_i16:
; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
entry:
  %ld = load i16, i16 addrspace(4)* %in
  store i16 %ld, i16 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v2i16:
; GCN: s_load_dword s

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
entry:
  %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v3i16:
; GCN: s_load_dwordx2 s

; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v4i16:
; GCN: s_load_dwordx2

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
entry:
  %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v8i16:
; GCN: s_load_dwordx4

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
entry:
  %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i16:
; GCN: s_load_dwordx8

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i16_align2:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_ushort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_sshort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-HSA: flat_load_sshort

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; v2i16 is naturally 4 byte aligned
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EG: 16
; EG: 16
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v2i16 is naturally 4 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; v3i16 is naturally 8 byte aligned
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; EG: CF_END
; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = zext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; v3i16 is naturally 8 byte aligned
; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1
; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = sext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}
; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use LD, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use LD, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: These should use LSHR instead of BFE_UINT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
; GCN: s_load_dwordx16
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
; GCN: s_load_dwordx16
; GCN: s_load_dwordx16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i32:

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
; FIXME: Need to optimize this sequence to avoid extra bfe:
;  t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
;  t31: i64 = any_extend t28
;  t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; These trigger undefined register machine verifier errors

; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = zext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = sext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

attributes #0 = { nounwind }