; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of the form x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.
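; For example, with x = +inf and y = 0.0 the two forms disagree:
;   x * (y + 1.0)  =  +inf * 1.0        =  +inf
;   x * y + x      =  (+inf * 0.0) + x  =  NaN
; so the rewrite is only valid when infinities are excluded.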

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
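; GCN VOP3 instructions such as v_fma_f64 accept a neg source modifier, so the
; (fneg z) is expected to fold into the addend operand (the -[[C]] below) at
; no instruction cost.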
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
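; This reassociation changes which intermediate result gets rounded, so the
; fully fused form is only produced under unsafe math (SI-UNSAFE); the safe
; lowering (SI-SAFE) keeps the mul + fma + add sequence.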

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
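
; Expected FMA forms (a sketch; folding the +/-1.0 into the addend assumes
; no infinities, per the note at the top of this file):
;   (x + 1.0) * y   --> fma(x, y, y)
;   (x - 1.0) * y   --> fma(x, y, -y)
;   (1.0 - x) * y   --> fma(-x, y, y)
;   (-1.0 - x) * y  --> fma(-x, y, -y)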

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;
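
; The interpolation x*t + y*(1.0 - t) distributes into two FMAs (again
; assuming no infinities for the (1.0 - t)*y half):
;   r = fma(x, t, fma(-t, y, y))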

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2,
                             float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2,
                             double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

; Make sure negative constant cancels out fneg
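; fma(-2.0, -a, b) == fma(2.0, a, b), so both negations should fold away and
; the inline constant should remain +2.0.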
; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

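; Here only the multiplicand is negated, so the fneg is folded into the
; constant instead: fma(2.0, -a, b) == fma(-2.0, a, b).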
; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }