; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; An i16 loaded from local memory (addrspace 3) becomes element 1 of an
; otherwise undef <2 x i16>. On gfx900 the checks expect the value to be read
; directly into the high half with ds_read_u16_d16_hi; the other two targets
; only need a plain ds_read_u16 to show up.
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %val = load i16, i16 addrspace(3)* %in
  %vec = insertelement <2 x i16> undef, i16 %val, i32 1
  ret <2 x i16> %vec
}

; Same load, but element 0 is the incoming %reg argument. The gfx900 checks
; expect the vector to be assembled in v1 and then moved into v0.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %val = load i16, i16 addrspace(3)* %in
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val, i32 1
  ret <2 x i16> %vec.hi
}

; Show that we get reasonable regalloc without physreg constraints.
; The built vector is stored to an undef global pointer instead of being
; returned, so the result is not tied to the return register. The gfx900
; checks expect the load to land in v1 and be stored with no extra copy.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %val = load i16, i16 addrspace(3)* %in
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val, i32 1
  store <2 x i16> %vec.hi, <2 x i16> addrspace(1)* undef
  ret void
}

; The low element is a constant zero: the gfx900 checks expect v1 to be
; zeroed first and the d16_hi load to fill in the upper half afterwards.
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %val = load i16, i16 addrspace(3)* %in
  %vec = insertelement <2 x i16> zeroinitializer, i16 %val, i32 1
  ret <2 x i16> %vec
}

; Scalar form of the same bit pattern: zext to i32, then shift left by 16.
; Every target, gfx900 included, is expected to emit a plain ds_read_u16
; followed by a shift rather than a d16_hi load.
; FIXME: Remove m0 initialization
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
entry:
  %val = load i16, i16 addrspace(3)* %in
  %wide = zext i16 %val to i32
  %shl = shl i32 %wide, 16
  ret i32 %shl
}

; half variant of the reglo_vreg test. Only the check lines sit here; the
; function definition follows them.
; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; half variant of the local (addrspace 3) reglo_vreg test; the check lines for
; this function immediately precede its definition.
define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local i8 load, zero-extended to i16 before being inserted as the high
; element; gfx900 is expected to use ds_read_u8_d16_hi.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extending counterpart; gfx900 is expected to use ds_read_i8_d16_hi.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; ---- Global (addrspace 1) loads ----
; The gep of -2047 16-bit elements is -4094 bytes; the gfx900 checks expect
; that folded into the load's immediate offset.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half variant of the global test above.
; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Global i8 load at byte offset -4095, zero-extended to i16.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Global i8 load at byte offset -4095, sign-extended to i16.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; ---- Flat (no explicit address space) loads ----
; No address offset is involved. On the targets without a d16_hi load the
; checks expect the two halves to be combined explicitly: shift + or on fiji,
; a single v_lshl_or_b32 on gfx906.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half variant of the flat test above.
; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
entry:
  %load = load half, half* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Flat i8 load, zero-extended to i16.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Flat i8 load, sign-extended to i16.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; ---- Private (addrspace 5) loads through a byval pointer ----
; 2045 16-bit elements is 4090 bytes past %in, while the checks expect an
; immediate offset of 4094. Presumably the byval object itself sits at offset
; 4 (compare the offset:4 load expected in load_private_v2i16_split) - TODO
; confirm.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half variant of the private byval test above.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
  %load = load half, half addrspace(5)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; "nooff" tests: a volatile load from the constant private address 4094
; (inttoptr), ignoring %in. The checks expect s4 instead of s5 as the scalar
; offset register for these.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half "nooff" variant. %in is a plain (non-byval) pointer argument here and
; the checks expect the low half in v1 rather than v0.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Private byval i8 load, 4091 bytes past %in, zero-extended to i16; expected
; immediate offset is 4095.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extending counterpart of the private byval i8 test above.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; "nooff" i8 load from constant address 4094, zero-extended to i16.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; "nooff" i8 load from constant address 4094, sign-extended to i16.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; As the zexti8 "nooff" test, but the extended i16 is bitcast to half and
; inserted into a <2 x half>.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; ---- Constant (addrspace 4) loads ----
; Same -4094 byte offset as the global tests. gfx900 is expected to use the
; global d16_hi load; fiji is expected to use a flat ushort load and gfx906 a
; global ushort load.
; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half variant of the constant test above.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.
; Each of the three tests below allocates a 10 x i32 object that receives a
; volatile store, then a 4096-element array that is read at a constant index.
; The gfx900 checks expect the d16_hi load right after the store, addressed
; purely by an immediate offset. For the i16 case the index is 2025, i.e.
; 4050 bytes into the array; the expected 4094 presumably adds the 40-byte
; first object plus a 4-byte bias - TODO confirm.

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
  %pad = alloca [10 x i32], align 4, addrspace(5)
  %array = alloca [4096 x i16], align 2, addrspace(5)
  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %pad.i32
  %addr = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %array, i32 0, i32 2025
  %val = load i16, i16 addrspace(5)* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val, i32 1
  store <2 x i16> %vec.hi, <2 x i16> addrspace(1)* undef
  ret void
}

; i8 array, index 4051, sign-extended to i16; expected immediate offset 4095.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
  %pad = alloca [10 x i32], align 4, addrspace(5)
  %array = alloca [4096 x i8], align 2, addrspace(5)
  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %pad.i32
  %addr = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %array, i32 0, i32 4051
  %val = load i8, i8 addrspace(5)* %addr
  %wide = sext i8 %val to i16
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %wide, i32 1
  store <2 x i16> %vec.hi, <2 x i16> addrspace(1)* undef
  ret void
}

; Zero-extending counterpart of the test above.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
  %pad = alloca [10 x i32], align 4, addrspace(5)
  %array = alloca [4096 x i8], align 2, addrspace(5)
  %pad.i32 = bitcast [10 x i32] addrspace(5)* %pad to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %pad.i32
  %addr = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %array, i32 0, i32 4051
  %val = load i8, i8 addrspace(5)* %addr
  %wide = zext i8 %val to i16
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %wide, i32 1
  store <2 x i16> %vec.hi, <2 x i16> addrspace(1)* undef
  ret void
}

; The "split" tests build a <2 x i16> from two separate volatile i16 loads at
; %in and %in + 1 element. On gfx900 the checks expect a normal 16-bit load
; for the low half followed by a d16_hi load into the same register.
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
entry:
  %addr = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %val.lo = load volatile i16, i16 addrspace(3)* %in
  %val.hi = load volatile i16, i16 addrspace(3)* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %val.lo, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val.hi, i32 1
  ret <2 x i16> %vec.hi
}

; Global (addrspace 1) version of the split test.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
entry:
  %addr = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
  %val.lo = load volatile i16, i16 addrspace(1)* %in
  %val.hi = load volatile i16, i16 addrspace(1)* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %val.lo, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val.hi, i32 1
  ret <2 x i16> %vec.hi
}

; Flat version of the split test.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_flat_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
entry:
  %addr = getelementptr inbounds i16, i16* %in, i64 1
  %val.lo = load volatile i16, i16* %in
  %val.hi = load volatile i16, i16* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %val.lo, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val.hi, i32 1
  ret <2 x i16> %vec.hi
}

; Constant (addrspace 4) version of the split test; the expected instructions
; are the same global loads as in the addrspace 1 case.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_constant_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
entry:
  %addr = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
  %val.lo = load volatile i16, i16 addrspace(4)* %in
  %val.hi = load volatile i16, i16 addrspace(4)* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %val.lo, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val.hi, i32 1
  ret <2 x i16> %vec.hi
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; Private (addrspace 5) version of the split test, reading through a byval
; pointer argument: two volatile i16 loads at %in and %in + 1 element. The
; gfx900 checks expect the low half at immediate offset 4 and the d16_hi load
; at offset 6, both into v0, with no trailing register copy.
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
entry:
  %addr = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  %val.lo = load volatile i16, i16 addrspace(5)* %in
  %val.hi = load volatile i16, i16 addrspace(5)* %addr
  %vec.lo = insertelement <2 x i16> undef, i16 %val.lo, i32 0
  %vec.hi = insertelement <2 x i16> %vec.lo, i16 %val.hi, i32 1
  ret <2 x i16> %vec.hi
}

; Attribute group shared by every function in this file.
attributes #0 = { nounwind }