; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

; Broadcast a loaded scalar into both halves of the fma addend; the
; broadcast should fold into op_sel_hi with no pack/shift/or code.
; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg to broadcasted vector
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before and after broadcast; the two negations should cancel out.
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate low component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate high component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast with bitcast
; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.bc = bitcast half %neg.scalar0 to i16

  %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer

  %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; Pack a scalar into the low half and a negated scalar into the high half;
; this currently goes through a manual pack sequence.
; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]

; FIXME: Remove and
; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %neg.scalar1 = fsub half -0.0, %scalar1
  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Can we avoid waitcnt between the two halves?
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
; GCN: s_waitcnt
; GCN: ds_read_u16_d16_hi [[PACKED]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Broadcast the negated high element of the vector.
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Negate only the high element of the vector.
; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
  %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1

  %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Broadcast the high element of the second operand of an integer add.
; GCN-LABEL: {{^}}add_vector_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4

  %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
  %result = add <2 x i16> %vec0, %vec1.elt1.broadcast

  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; Broadcast the high element of the third fma operand.
; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Negating the whole vector and then the extracted high element cancels out,
; leaving the original vector.
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
  %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
  %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Swap the two halves of the vector operand.
; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Swap the two halves of a negated vector.
; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2

  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Blend the negated high element into lo and the original low element into hi.
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Negate only the low element via a blend.
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Negate only the high element via a blend.
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Broadcast the high element, negating only the low lane.
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; fneg applied through an f32 bitcast; no source modifier is folded into the
; packed add.
; GCN-LABEL: {{^}}bitcast_fneg_f32:
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %f32 = load volatile float, float addrspace(3)* undef, align 4
  %neg.f32 = fsub float -0.0, %f32
  %bc = bitcast float %neg.f32 to <2 x half>
  %result = fadd <2 x half> %vec0, %bc

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Swap the halves of a negated f32 bitcast; the swap folds into op_sel.
; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4

  %f32 = load volatile float, float addrspace(3)* undef, align 4
  %neg.f32 = fsub float -0.0, %f32
  %bc = bitcast float %neg.f32 to <2 x half>
  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x half> %vec0, %shuf
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; The 16-bit halves extracted from an i64 are reinserted swapped, so they must
; be repacked.
; GCN-LABEL: {{^}}extract_from_i64:
; GCN: v_lshl_or_b32
; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %i64 = load volatile i64, i64 addrspace(1)* undef

  %elt0 = trunc i64 %i64 to i16
  %hi = lshr i64 %i64, 16
  %elt1 = trunc i64 %hi to i16

  %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
  %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
  %result = add <2 x i16> %vec0, %ins1
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}


; Bitcast is the final obstacle to identifying the same source register
; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: _or

; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %scalar0 = load volatile i16, i16 addrspace(1)* undef
  %shl = shl i16 %scalar0, 1
  %shl.bc = bitcast i16 %shl to half

  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}


; Bitcast is the final obstacle to identifying the same source register
; GCN-LABEL: {{^}}mix_elt_types_op_sel:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: _or

; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %scalar0 = load volatile i16, i16 addrspace(1)* undef
  %scalar1 = load volatile half, half addrspace(1)* undef
  %shl = shl i16 %scalar0, 1
  %shl.bc = bitcast i16 %shl to half

  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0

  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }