; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Code generation tests for the half type on amdgcn: kernel arguments,
; extending loads, truncating stores, fadd/fsub and bitcasts to/from i16.
; The first invocation above (no -mcpu) is checked with the SI prefix, the
; tonga invocation with the VI prefix; patterns shared by both use GCN.
; Functions that carry only a label check have no further output
; expectations in this file.

; half args should be promoted to float

; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v8f16_arg:
define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

; fpext of half kernel arguments to float.

; GCN-LABEL: {{^}}extload_v2f16_arg:
define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

; fpext of half kernel arguments to double.

; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

; Plain load/store of half values through addrspace(1) pointers.

; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; NOTE(review): unlike the v2f16/v4f16 cases above, the store pattern below
; captures TMP again instead of reusing it, so it does not require the stored
; register range to be the one that was loaded. Confirm whether that is
; intentional before tightening it.
; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: s_endpgm
define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; fpext of half values loaded through addrspace(1) pointers, to float.

; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

; fpext of half values loaded through addrspace(1) pointers, to double.

; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

; fptrunc from float followed by a store of the half result.

; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; Arithmetic on half. The fadd cases only have instruction checks under the
; SI prefix; the tonga output is checked for s_endpgm only.

; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}fsub_f16:
; GCN: v_subrev_f32_e32
; GCN: s_endpgm
define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
  %a = load half, half addrspace(1)* %in
  %b = load half, half addrspace(1)* %b_ptr
  %sub = fsub half %a, %b
  store half %sub, half addrspace(1)* %out
  ret void
}

; Bitcasts between half and i16: the loaded register is expected to be
; stored back directly.

; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }