; Checks gfx900 instruction selection for llvm.fmuladd.v2f16 (and for an
; equivalent fadd chain) with f16 denormals disabled and enabled, under both
; -fp-contract=on and -fp-contract=fast.

; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s

; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s

     11 declare i32 @llvm.amdgcn.workitem.id.x() #1
     12 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
     13 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
     14 
     15 ; GCN-LABEL: {{^}}fmuladd_v2f16:
     16 ; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     17 ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     18 
     19 ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     20 define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
     21                          <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
     22   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
     23   %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
     24   %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
     25   %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
     26   store <2 x half> %r3, <2 x half> addrspace(1)* %out
     27   ret void
     28 }
     29 
     30 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
     31 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
     32 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
     33 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
     34 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
     35 
     36 ; GFX9-FLUSH: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     37 
     38 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
     39 ; GFX9-DENORM: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     40 define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
     41   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     42   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     43   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
     44   %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     45 
     46   %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
     47   %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
     48 
     49   %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2)
     50   store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
     51   ret void
     52 }
     53 
     54 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
     55 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
     56 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
     57 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
     58 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
     59 
     60 ; GFX9-FLUSH: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     61 
     62 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
     63 ; GFX9-DENORM: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     64 define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
     65   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     66   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     67   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
     68   %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     69 
     70   %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
     71   %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
     72 
     73   %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2)
     74   store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
     75   ret void
     76 }
     77 
     78 ; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
     79 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
     80 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
     81 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
     82 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
     83 
     84 ; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
     85 ; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
     86 
     87 ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
     88 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     89 define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
     90                             <2 x half> addrspace(1)* %in1,
     91                             <2 x half> addrspace(1)* %in2) #0 {
     92   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     93   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     94   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
     95   %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
     96 
     97   %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
     98   %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
     99 
    100   %add.0 = fadd <2 x half> %r0, %r0
    101   %add.1 = fadd <2 x half> %add.0, %r1
    102   store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
    103   ret void
    104 }
    105 
    106 attributes #0 = { nounwind }
    107 attributes #1 = { nounwind readnone }
    108