; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Checks code generation for stores of the upper 16 bits of a 32-bit value.
; Every function takes an i32, isolates its high half (either as element 1 of
; a two-element 16-bit vector obtained by bitcast, or via lshr 16 + trunc) and
; stores it. The gfx900 run expects a single *_d16_hi store; the gfx906 and
; fiji runs, which share the no-d16-hi check prefix, expect a v_lshrrev_b32 by
; 16 followed by an ordinary store.

; addrspace(1) store of element 1 of the i32 argument viewed as <2 x i16>.
; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  ; The vector is passed as a plain i32 and bitcast here.
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(1)* %out
  ret void
}

; Same as store_global_hi_v2i16, but the element type is half; the expected
; instructions are identical.
; GCN-LABEL: {{^}}store_global_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(1)* %out
  ret void
}

; High half is produced with scalar lshr 16 + trunc instead of a vector
; extract; the expected instructions match the vector form above.
; GCN-LABEL: {{^}}store_global_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(1)* %out
  ret void
}

; Byte store: element 1 of the <2 x i16> view is truncated to i8 before the
; store, so the byte written is bits 16..23 of the argument.
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(1)* %out
  ret void
}

; Byte store where the value comes from lshr 16 + trunc to i8.
; GCN-LABEL: {{^}}store_global_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8 addrspace(1)* %out
  ret void
}

; Store at a constant positive offset: 2047 i16 elements = 4094 bytes. The
; gfx900 check expects that folded into the store as offset:4094; the fiji
; checks expect an add/addc pair and a flat store with no offset operand.
; NOTE(review): the gfx906 store pattern below has neither an offset operand
; nor an end-of-line anchor, so it matches with or without a folded offset --
; confirm that this looseness is intended.
; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v2{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047
  store i16 %hi, i16 addrspace(1)* %gep
  ret void
}

; Store at a constant negative offset: -2048 i16 elements = -4096 bytes, which
; the gfx900 check expects folded as offset:-4096. The gfx906 store pattern is
; unanchored here as well (see the note on the max_offset test above).
; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048
  store i16 %hi, i16 addrspace(1)* %gep
  ret void
}

; Byte store at byte offset 4095; the gfx900 check expects offset:4095 on the
; d16_hi byte store. The gfx906 store pattern is unanchored here as well.
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095
  store i8 %trunc, i8 addrspace(1)* %gep
  ret void
}
; Byte store through an addrspace(1) pointer at byte offset -4095; the gfx900
; check expects offset:-4095 on the d16_hi byte store, the fiji checks expect
; an add/addc pair and a flat store with no offset operand.
; NOTE(review): the function is named min_offset but uses -4095 -- confirm
; whether -4096 was intended. The gfx906 store pattern below also has no
; offset operand and no end-of-line anchor, so it matches either way.
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095
  store i8 %trunc, i8 addrspace(1)* %gep
  ret void
}

; The store_flat_* tests use pointers with no addrspace qualifier. The gfx900
; run expects flat_store_*_d16_hi; both other runs expect a shift followed by
; a plain flat store.

; Flat store of element 1 of the i32 argument viewed as <2 x i16>.
; GCN-LABEL: {{^}}store_flat_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16* %out
  ret void
}

; Flat store of element 1 of the argument viewed as <2 x half>.
; GCN-LABEL: {{^}}store_flat_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half* %out
  ret void
}

; Flat store where the high half comes from lshr 16 + trunc to i16.
; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16* %out
  ret void
}

; Flat byte store of the low byte of element 1 (bits 16..23 of the argument).
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8* %out
  ret void
}

; Flat byte store where the value comes from lshr 16 + trunc to i8.
; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8* %out
  ret void
}

; Flat store at 2047 i16 elements = 4094 bytes. The gfx900 and gfx906 checks
; expect the offset folded into the store (offset:4094); the fiji checks
; expect an add/addc pair and a store with no offset operand.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094

; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v2{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16* %out, i64 2047
  store i16 %hi, i16* %gep
  ret void
}

; Flat store at a negative offset (-1023 i16 elements = -2046 bytes). All
; three runs are expected to compute the address with an add / add-with-carry
; pair; the gfx900 and fiji store patterns are anchored so that no offset
; operand may follow.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
; GCN: s_waitcnt
; GCN: v_add{{(_co)?}}_{{i|u}}32_e32

; GFX803: v_addc_u32_e32
; GFX900: v_addc_co_u32_e32

; GFX906-NEXT: v_lshrrev_b32_e32
; GFX906-NEXT: v_addc_co_u32_e32
; GFX906: flat_store_short v[0:1], v2

; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX803: flat_store_short v[0:1], v2{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16* %out, i64 -1023
  store i16 %hi, i16* %gep
  ret void
}

; Flat byte store at byte offset 4095; folded as offset:4095 in the gfx900 and
; gfx906 checks, computed with add/addc in the fiji checks.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}

; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803: flat_store_byte v[0:1], v2{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8* %out, i64 4095
  store i8 %trunc, i8* %gep
  ret void
}

; Flat byte store at byte offset -4095. All three runs are expected to compute
; the address with add / add-with-carry and store with no offset operand.
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
; GCN: s_waitcnt
; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32

; GFX803-DAG: v_addc_u32_e32
; GFX900-DAG: v_addc_co_u32_e32
; GFX906-DAG: v_add_co_u32_e32

; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}

; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: v_addc_co_u32_e32
; GFX906-NEXT: flat_store_byte v[0:1], v2{{$}}

; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v2{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  %gep = getelementptr inbounds i8, i8* %out, i64 -4095
  store i8 %trunc, i8* %gep
  ret void
}

; The store_private_* tests use addrspace(5) pointers. The expected stores are
; buffer_store_* instructions; the gfx900 run expects the _d16_hi form.

; addrspace(5) store of element 1 of the argument viewed as <2 x i16>; the
; pointer is expected in v0 (offen addressing) and the data in v1.
; GCN-LABEL: {{^}}store_private_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(5)* %out
  ret void
}

; addrspace(5) store of element 1 of the argument viewed as <2 x half>.
; GCN-LABEL: {{^}}store_private_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(5)* %out
  ret void
}

; addrspace(5) store where the high half comes from lshr 16 + trunc to i16.
; GCN-LABEL: {{^}}store_private_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(5)* %out
  ret void
}

; addrspace(5) byte store of the low byte of element 1.
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(5)* %out
  ret void
}

; addrspace(5) byte store where the value comes from lshr 16 + trunc to i8.
; GCN-LABEL: {{^}}store_private_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i8
  store i8 %hi, i8 addrspace(5)* %out
  ret void
}

; Store into a byval addrspace(5) argument at element 2045 (4090 bytes). The
; checks expect an `off` (no address VGPR) buffer store with offset:4094 and
; the data in v0.
; NOTE(review): the 4-byte difference between the gep offset and the expected
; immediate presumably comes from where the byval object sits in the frame --
; confirm against the calling convention.
; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045
  store i16 %hi, i16 addrspace(5)* %gep
  ret void
}



; Volatile store to the null addrspace(5) pointer: the checks expect an `off`
; buffer store with no offset operand.
; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s4{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s4{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_nooff(i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store volatile i16 %hi, i16 addrspace(5)* null
  ret void
}


; Byte variant of the null-pointer volatile store above.
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s4{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s4{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store volatile i8 %trunc, i8 addrspace(5)* null
  ret void
}

; The store_local_* tests use addrspace(3) pointers. The expected stores are
; ds_write_* instructions; the gfx900 run expects the _d16_hi form.

; addrspace(3) store of element 1 of the argument viewed as <2 x i16>.
; GCN-LABEL: {{^}}store_local_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  store i16 %hi, i16 addrspace(3)* %out
  ret void
}

; addrspace(3) store of element 1 of the argument viewed as <2 x half>.
; GCN-LABEL: {{^}}store_local_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x half>
  %hi = extractelement <2 x half> %value, i32 1
  store half %hi, half addrspace(3)* %out
  ret void
}

; addrspace(3) store where the high half comes from lshr 16 + trunc to i16.
; GCN-LABEL: {{^}}store_local_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
entry:
  %hi32 = lshr i32 %value, 16
  %hi = trunc i32 %hi32 to i16
  store i16 %hi, i16 addrspace(3)* %out
  ret void
}

; addrspace(3) byte store of the low byte of element 1.
; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b8 v0, v1

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(3)* %out
  ret void
}

; addrspace(3) store at 32767 i16 elements = 65534 bytes; every run is
; expected to fold this into the ds_write as offset:65534.
; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
entry:
  ; FIXME: ABI for pre-gfx9
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767
  store i16 %hi, i16 addrspace(3)* %gep
  ret void
}

; Store into a function-local addrspace(5) alloca at a constant index. Only
; the gfx900 run has checks beyond the initial s_waitcnt: a dword store (the
; volatile store of 123) immediately followed by the d16_hi store at
; offset:4094.
; NOTE(review): element 2025 is 4050 bytes into %obj1; the remaining 44 bytes
; of the expected immediate presumably come from the frame layout (%obj0 is
; 40 bytes) -- confirm.
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; The volatile store keeps %obj0 in use.
  store volatile i32 123, i32 addrspace(5)* %bc
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
  store i16 %hi, i16 addrspace(5)* %gep
  ret void
}

; Byte variant of the alloca test above: element 4051 of a 4096-byte array,
; with the gfx900 check expecting offset:4095 on the d16_hi byte store.
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
  %trunc = trunc i16 %hi to i8
  store i8 %trunc, i8 addrspace(5)* %gep
  ret void
}

; Attribute group shared by every function in this file.
attributes #0 = { nounwind }