Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
      2 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
      3 
      4 declare i32 @llvm.r600.read.tidig.x() #0
      5 declare double @llvm.fabs.f64(double) #0
      6 declare double @llvm.fma.f64(double, double, double) #0
      7 declare float @llvm.fma.f32(float, float, float) #0
      8 
      9 ; (fadd (fmul x, y), z) -> (fma x, y, z)
        ; Single-use case: %mul feeds only the fadd, and the checks expect one
        ; v_fma_f64 whose sources are the three loaded values A, B, C in that order.
     10 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
     11 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     12 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     13 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
     14 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
     15 ; SI: buffer_store_dwordx2 [[RESULT]]
     16 define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
     17   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Three consecutive doubles starting at %in[%tid]; the load checks tell
          ; them apart by byte offset (none, 8, 16).
     18   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
     19   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
     20   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
     21   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
     22 
     23   %a = load double, double addrspace(1)* %gep.0
     24   %b = load double, double addrspace(1)* %gep.1
     25   %c = load double, double addrspace(1)* %gep.2
     26 
          ; Separate multiply and add that the checks expect to be fused.
     27   %mul = fmul double %a, %b
     28   %fma = fadd double %mul, %c
     29   store double %fma, double addrspace(1)* %gep.out
     30   ret void
     31 }
     32 
     33 ; (fadd (fmul x, y), z) -> (fma x, y, z)
        ; Two-use case: %mul feeds both %fma0 and %fma1. The checks expect each fadd
        ; to become its own v_fma_f64 on A and B, differing only in the addend (C or D),
        ; and each result to be stored to its own output slot.
     34 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
     35 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     36 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     37 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
     38 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
     39 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
     40 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
     41 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     42 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     43 ; SI: s_endpgm
     44 define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
     45   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Four consecutive input doubles and two consecutive output doubles,
          ; all indexed from %tid.
     46   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
     47   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
     48   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
     49   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
     50   %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
     51   %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
     52 
     53   %a = load double, double addrspace(1)* %gep.0
     54   %b = load double, double addrspace(1)* %gep.1
     55   %c = load double, double addrspace(1)* %gep.2
     56   %d = load double, double addrspace(1)* %gep.3
     57 
          ; One product shared by two independent adds.
     58   %mul = fmul double %a, %b
     59   %fma0 = fadd double %mul, %c
     60   %fma1 = fadd double %mul, %d
     61   store double %fma0, double addrspace(1)* %gep.out.0
     62   store double %fma1, double addrspace(1)* %gep.out.1
     63   ret void
     64 }
     65 
     66 ; (fadd x, (fmul y, z)) -> (fma y, z, x)
        ; Commuted form: the product is the second operand of the fadd. The expected
        ; v_fma_f64 still takes A and B as the multiplicands and C as the addend.
     67 ; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
     68 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     69 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     70 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
     71 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
     72 ; SI: buffer_store_dwordx2 [[RESULT]]
     73 define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
     74   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Three consecutive doubles starting at %in[%tid].
     75   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
     76   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
     77   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
     78   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
     79 
     80   %a = load double, double addrspace(1)* %gep.0
     81   %b = load double, double addrspace(1)* %gep.1
     82   %c = load double, double addrspace(1)* %gep.2
     83 
          ; Note the operand order of the fadd: addend first, product second.
     84   %mul = fmul double %a, %b
     85   %fma = fadd double %c, %mul
     86   store double %fma, double addrspace(1)* %gep.out
     87   ret void
     88 }
     89 
     90 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
        ; The subtraction is expected to fold into the fma as a negated third
        ; operand (-[[C]]) rather than as a separate negate or subtract.
     91 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
     92 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     93 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     94 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
     95 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
     96 ; SI: buffer_store_dwordx2 [[RESULT]]
     97 define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
     98   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Three consecutive doubles starting at %in[%tid].
     99   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    100   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    101   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    102   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
    103 
    104   %a = load double, double addrspace(1)* %gep.0
    105   %b = load double, double addrspace(1)* %gep.1
    106   %c = load double, double addrspace(1)* %gep.2
    107 
          ; a*b - c, with the product as the minuend.
    108   %mul = fmul double %a, %b
    109   %fma = fsub double %mul, %c
    110   store double %fma, double addrspace(1)* %gep.out
    111   ret void
    112 }
    113 
    114 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
        ; Two-use variant: %mul is the minuend of both %fma0 and %fma1. Each fsub is
        ; expected to become its own v_fma_f64 on A and B with a negated addend
        ; (-C and -D respectively).
    115 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
    116 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    117 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    118 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    119 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    120 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
    121 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
    122 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    123 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    124 ; SI: s_endpgm
    125 define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    126   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Four consecutive input doubles and two consecutive output doubles,
          ; all indexed from %tid.
    127   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    128   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    129   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    130   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    131   %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
    132   %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
    133 
    134   %a = load double, double addrspace(1)* %gep.0
    135   %b = load double, double addrspace(1)* %gep.1
    136   %c = load double, double addrspace(1)* %gep.2
    137   %d = load double, double addrspace(1)* %gep.3
    138 
          ; One product, two subtractions that both use it as the minuend.
    139   %mul = fmul double %a, %b
    140   %fma0 = fsub double %mul, %c
    141   %fma1 = fsub double %mul, %d
    142   store double %fma0, double addrspace(1)* %gep.out.0
    143   store double %fma1, double addrspace(1)* %gep.out.1
    144   ret void
    145 }
    146 
    147 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
        ; The product is the subtrahend here, so the expected v_fma_f64 negates its
        ; first multiplicand (-[[A]]) and keeps C as a plain addend.
    148 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
    149 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    150 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    151 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    152 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
    153 ; SI: buffer_store_dwordx2 [[RESULT]]
    154 define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    155   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Three consecutive doubles starting at %in[%tid].
    156   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    157   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    158   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    159   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
    160 
    161   %a = load double, double addrspace(1)* %gep.0
    162   %b = load double, double addrspace(1)* %gep.1
    163   %c = load double, double addrspace(1)* %gep.2
    164 
          ; c - a*b, with the product as the subtrahend.
    165   %mul = fmul double %a, %b
    166   %fma = fsub double %c, %mul
    167   store double %fma, double addrspace(1)* %gep.out
    168   ret void
    169 }
    170 
    171 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
        ; Two-use variant: %mul is the subtrahend of both %fma0 and %fma1. Each fsub
        ; is expected to become its own v_fma_f64 with -A as the first multiplicand
        ; and C or D as the addend.
    172 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
    173 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    174 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    175 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    176 ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    177 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
    178 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
    179 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    180 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    181 ; SI: s_endpgm
    182 define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    183   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Four consecutive input doubles and two consecutive output doubles,
          ; all indexed from %tid.
    184   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    185   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    186   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    187   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    188   %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
    189   %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
    190 
    191   %a = load double, double addrspace(1)* %gep.0
    192   %b = load double, double addrspace(1)* %gep.1
    193   %c = load double, double addrspace(1)* %gep.2
    194   %d = load double, double addrspace(1)* %gep.3
    195 
          ; One product, subtracted from two different values.
    196   %mul = fmul double %a, %b
    197   %fma0 = fsub double %c, %mul
    198   %fma1 = fsub double %d, %mul
    199   store double %fma0, double addrspace(1)* %gep.out.0
    200   store double %fma1, double addrspace(1)* %gep.out.1
    201   ret void
    202 }
    203 
    204 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; The negation is written as "fsub -0.0, %mul". The expected v_fma_f64 carries
        ; both negations on its operands (-[[A]] and -[[C]]).
    205 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
    206 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    207 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    208 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    209 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
    210 ; SI: buffer_store_dwordx2 [[RESULT]]
    211 define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    212   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Three consecutive doubles starting at %in[%tid].
    213   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    214   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    215   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    216   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
    217 
    218   %a = load double, double addrspace(1)* %gep.0
    219   %b = load double, double addrspace(1)* %gep.1
    220   %c = load double, double addrspace(1)* %gep.2
    221 
          ; -(a*b) - c; both %mul and %mul.neg have exactly one user.
    222   %mul = fmul double %a, %b
    223   %mul.neg = fsub double -0.0, %mul
    224   %fma = fsub double %mul.neg, %c
    225 
    226   store double %fma, double addrspace(1)* %gep.out
    227   ret void
    228 }
    229 
    230 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; Two-use variant: the negated product %mul.neg is the minuend of both %fma0
        ; and %fma1. Each fsub is expected to become its own v_fma_f64 with -A as the
        ; first multiplicand and a negated addend (-C and -D respectively).
        ; All four loads are captured here so that [[D]] is bound inside this function
        ; instead of silently reusing a capture left over from an earlier test.
    231 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
    232 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    233 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    234 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
        ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    235 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
    236 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
    237 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    238 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    239 ; SI: s_endpgm
    240 define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    241   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Four consecutive input doubles and two consecutive output doubles,
          ; all indexed from %tid.
    242   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    243   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    244   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    245   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    246   %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
    247   %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
    248 
    249   %a = load double, double addrspace(1)* %gep.0
    250   %b = load double, double addrspace(1)* %gep.1
    251   %c = load double, double addrspace(1)* %gep.2
    252   %d = load double, double addrspace(1)* %gep.3
    253 
          ; %mul has one user (the negation); %mul.neg has two users.
    254   %mul = fmul double %a, %b
    255   %mul.neg = fsub double -0.0, %mul
    256   %fma0 = fsub double %mul.neg, %c
    257   %fma1 = fsub double %mul.neg, %d
    258 
    259   store double %fma0, double addrspace(1)* %gep.out.0
    260   store double %fma1, double addrspace(1)* %gep.out.1
    261   ret void
    262 }
    263 
    264 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; Two-use variant where the product itself is shared: %mul feeds the negation
        ; used by %fma0 and is also the direct minuend of %fma1. The checks expect
        ; v_fma_f64 -A, B, -C for the first result and v_fma_f64 A, B, -D for the second.
        ; All four loads are captured here so that [[D]] is bound inside this function
        ; instead of silently reusing a capture left over from an earlier test.
    265 ; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
    266 ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    267 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    268 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
        ; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    269 ; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
    270 ; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
    271 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    272 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    273 ; SI: s_endpgm
    274 define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    275   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Four consecutive input doubles and two consecutive output doubles,
          ; all indexed from %tid.
    276   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    277   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    278   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    279   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    280   %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
    281   %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
    282 
    283   %a = load double, double addrspace(1)* %gep.0
    284   %b = load double, double addrspace(1)* %gep.1
    285   %c = load double, double addrspace(1)* %gep.2
    286   %d = load double, double addrspace(1)* %gep.3
    287 
          ; %mul has two users: the negation below and %fma1 directly.
    288   %mul = fmul double %a, %b
    289   %mul.neg = fsub double -0.0, %mul
    290   %fma0 = fsub double %mul.neg, %c
    291   %fma1 = fsub double %mul, %d
    292 
    293   store double %fma0, double addrspace(1)* %gep.out.0
    294   store double %fma1, double addrspace(1)* %gep.out.1
    295   ret void
    296 }
    297 
    298 ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
        ; The input already contains an explicit llvm.fma call whose addend is a plain
        ; fmul. The checks expect two chained v_fma_f64: an inner one computing
        ; u*v - z, and an outer one adding x*y to that result.
    299 
    300 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
    301 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    302 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    303 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    304 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    305 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
    306 ; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
    307 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
    308 ; SI: buffer_store_dwordx2 [[RESULT]]
    309 define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    310   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Five consecutive doubles starting at %in[%tid], loaded as x, y, z, u, v.
    311   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    312   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    313   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    314   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    315   %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
    316   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
    317 
    318   %x = load double, double addrspace(1)* %gep.0
    319   %y = load double, double addrspace(1)* %gep.1
    320   %z = load double, double addrspace(1)* %gep.2
    321   %u = load double, double addrspace(1)* %gep.3
    322   %v = load double, double addrspace(1)* %gep.4
    323 
          ; fma(x, y, u*v) - z
    324   %tmp0 = fmul double %u, %v
    325   %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
    326   %tmp2 = fsub double %tmp1, %z
    327 
    328   store double %tmp2, double addrspace(1)* %gep.out
    329   ret void
    330 }
    331 
    332 ; fold (fsub x, (fma y, z, (fmul u, v)))
    333 ;   -> (fma (fneg y), z, (fma (fneg u), v, x))
        ; Mirror of the previous fold with the explicit llvm.fma as the subtrahend.
        ; The checks expect an inner v_fma_f64 computing x - u*v and an outer one
        ; subtracting y*z from it, both via a negated first multiplicand.
    334 
    335 ; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
    336 ; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    337 ; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    338 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    339 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
    340 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
    341 ; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
    342 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
    343 ; SI: buffer_store_dwordx2 [[RESULT]]
    344 define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
    345   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
          ; Five consecutive doubles starting at %in[%tid], loaded as x, y, z, u, v.
    346   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
    347   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
    348   %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
    349   %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
    350   %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
    351   %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
    352 
    353   %x = load double, double addrspace(1)* %gep.0
    354   %y = load double, double addrspace(1)* %gep.1
    355   %z = load double, double addrspace(1)* %gep.2
    356   %u = load double, double addrspace(1)* %gep.3
    357   %v = load double, double addrspace(1)* %gep.4
    358 
          ; x - fma(y, z, u*v)
    359   %tmp0 = fmul double %u, %v
    360   %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
    361   %tmp2 = fsub double %x, %tmp1
    362 
    363   store double %tmp2, double addrspace(1)* %gep.out
    364   ret void
    365 }
    366 
    367 ;
    368 ; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
    369 ;
    370 
        ; (x + 1.0) * y: the check expects a single v_mac_f32_e32 instead of an add
        ; followed by a multiply.
        ; NOTE(review): the pattern below captures VY twice instead of reusing it
        ; (compare test_f32_mul_sub_x_negone_y), so the destination and the first
        ; source are not required to be the same register -- confirm this is intended.
    371 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
    372 ; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
    373 define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
    374                                         float addrspace(1)* %in1,
    375                                         float addrspace(1)* %in2) {
    376   %x = load float, float addrspace(1)* %in1
    377   %y = load float, float addrspace(1)* %in2
          ; The sum is the first operand of the multiply.
    378   %a = fadd float %x, 1.0
    379   %m = fmul float %a, %y
    380   store float %m, float addrspace(1)* %out
    381   ret void
    382 }
    383 
        ; y * (x + 1.0): commuted form of the multiply; the check expects the same
        ; single v_mac_f32_e32.
        ; NOTE(review): the pattern below captures VY twice instead of reusing it
        ; (compare test_f32_mul_y_sub_x_negone), so the destination and the first
        ; source are not required to be the same register -- confirm this is intended.
    384 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
    385 ; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
    386 define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
    387                                         float addrspace(1)* %in1,
    388                                         float addrspace(1)* %in2) {
    389   %x = load float, float addrspace(1)* %in1
    390   %y = load float, float addrspace(1)* %in2
          ; The sum is the second operand of the multiply.
    391   %a = fadd float %x, 1.0
    392   %m = fmul float %y, %a
    393   store float %m, float addrspace(1)* %out
    394   ret void
    395 }
    396 
        ; (x + -1.0) * y: the check expects one v_mad_f32 with sources x, y, -y,
        ; whose destination register is the same as its first source.
    397 ; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
    398 ; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
    399 define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
    400                                            float addrspace(1)* %in1,
    401                                            float addrspace(1)* %in2) {
    402   %x = load float, float addrspace(1)* %in1
    403   %y = load float, float addrspace(1)* %in2
          ; The sum is the first operand of the multiply.
    404   %a = fadd float %x, -1.0
    405   %m = fmul float %a, %y
    406   store float %m, float addrspace(1)* %out
    407   ret void
    408 }
    409 
        ; y * (x + -1.0): commuted form of the multiply; the check expects the same
        ; v_mad_f32 with sources x, y, -y.
    410 ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
    411 ; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
    412 define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
    413                                            float addrspace(1)* %in1,
    414                                            float addrspace(1)* %in2) {
    415   %x = load float, float addrspace(1)* %in1
    416   %y = load float, float addrspace(1)* %in2
          ; The sum is the second operand of the multiply.
    417   %a = fadd float %x, -1.0
    418   %m = fmul float %y, %a
    419   store float %m, float addrspace(1)* %out
    420   ret void
    421 }
    422 
        ; (1.0 - x) * y: the check expects one v_mad_f32 with sources -x, y, y,
        ; whose destination register is the one that held x.
    423 ; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
    424 ; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
    425 define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
    426                                         float addrspace(1)* %in1,
    427                                         float addrspace(1)* %in2) {
    428   %x = load float, float addrspace(1)* %in1
    429   %y = load float, float addrspace(1)* %in2
          ; The difference is the first operand of the multiply.
    430   %s = fsub float 1.0, %x
    431   %m = fmul float %s, %y
    432   store float %m, float addrspace(1)* %out
    433   ret void
    434 }
    435 
        ; y * (1.0 - x): commuted form of the multiply; the check expects the same
        ; v_mad_f32 with sources -x, y, y.
    436 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
    437 ; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
    438 define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
    439                                         float addrspace(1)* %in1,
    440                                         float addrspace(1)* %in2) {
    441   %x = load float, float addrspace(1)* %in1
    442   %y = load float, float addrspace(1)* %in2
          ; The difference is the second operand of the multiply.
    443   %s = fsub float 1.0, %x
    444   %m = fmul float %y, %s
    445   store float %m, float addrspace(1)* %out
    446   ret void
    447 }
    448 
        ; (-1.0 - x) * y: the check expects one v_mad_f32 with sources -x, y, -y,
        ; whose destination register is the one that held x.
    449 ; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
    450 ; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
    451 define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
    452                                            float addrspace(1)* %in1,
    453                                            float addrspace(1)* %in2) {
    454   %x = load float, float addrspace(1)* %in1
    455   %y = load float, float addrspace(1)* %in2
          ; The difference is the first operand of the multiply.
    456   %s = fsub float -1.0, %x
    457   %m = fmul float %s, %y
    458   store float %m, float addrspace(1)* %out
    459   ret void
    460 }
    461 
        ; y * (-1.0 - x): commuted form of the multiply; the check expects the same
        ; v_mad_f32 with sources -x, y, -y.
    462 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
    463 ; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
    464 define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
    465                                          float addrspace(1)* %in1,
    466                                          float addrspace(1)* %in2) {
    467   %x = load float, float addrspace(1)* %in1
    468   %y = load float, float addrspace(1)* %in2
          ; The difference is the second operand of the multiply.
    469   %s = fsub float -1.0, %x
    470   %m = fmul float %y, %s
    471   store float %m, float addrspace(1)* %out
    472   ret void
    473 }
    474 
        ; (x - 1.0) * y: the check expects one v_mad_f32 with sources x, y, -y,
        ; whose destination register is the same as its first source.
    475 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
    476 ; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
    477 define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
    478                                         float addrspace(1)* %in1,
    479                                         float addrspace(1)* %in2) {
    480   %x = load float, float addrspace(1)* %in1
    481   %y = load float, float addrspace(1)* %in2
          ; The difference is the first operand of the multiply.
    482   %s = fsub float %x, 1.0
    483   %m = fmul float %s, %y
    484   store float %m, float addrspace(1)* %out
    485   ret void
    486 }
    487 
        ; y * (x - 1.0): commuted form of the multiply; the check expects the same
        ; v_mad_f32 with sources x, y, -y.
    488 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
    489 ; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
    490 define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
    491                                       float addrspace(1)* %in1,
    492                                       float addrspace(1)* %in2) {
    493   %x = load float, float addrspace(1)* %in1
    494   %y = load float, float addrspace(1)* %in2
          ; The difference is the second operand of the multiply.
    495   %s = fsub float %x, 1.0
    496   %m = fmul float %y, %s
    497   store float %m, float addrspace(1)* %out
    498   ret void
    499 }
    500 
        ; (x - -1.0) * y: the check expects a single v_mac_f32_e32 whose destination
        ; and first source are the same register.
    501 ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
    502 ; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
    503 define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
    504                                          float addrspace(1)* %in1,
    505                                          float addrspace(1)* %in2) {
    506   %x = load float, float addrspace(1)* %in1
    507   %y = load float, float addrspace(1)* %in2
          ; The difference is the first operand of the multiply.
    508   %s = fsub float %x, -1.0
    509   %m = fmul float %s, %y
    510   store float %m, float addrspace(1)* %out
    511   ret void
    512 }
    513 
        ; y * (x - -1.0): commuted form of the multiply; the check expects the same
        ; single v_mac_f32_e32 with destination and first source tied together.
    514 ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
    515 ; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
    516 define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
    517                                          float addrspace(1)* %in1,
    518                                          float addrspace(1)* %in2) {
    519   %x = load float, float addrspace(1)* %in1
    520   %y = load float, float addrspace(1)* %in2
          ; The difference is the second operand of the multiply.
    521   %s = fsub float %x, -1.0
    522   %m = fmul float %y, %s
    523   store float %m, float addrspace(1)* %out
    524   ret void
    525 }
    526 
    527 ;
    528 ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
    529 ;
    530 
        ; Linear interpolation x*t + y*(1.0 - t) on floats. The checks expect two
        ; instructions: a v_mad_f32 with sources -t, y, y writing VR, then a
        ; v_mac_f32_e32 on t and x that accumulates into the same VR.
    531 ; FUNC-LABEL: {{^}}test_f32_interp:
    532 ; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
    533 ; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]]
    534 define void @test_f32_interp(float addrspace(1)* %out,
    535                              float addrspace(1)* %in1,
    536                              float addrspace(1)* %in2,
    537                              float addrspace(1)* %in3) {
    538   %x = load float, float addrspace(1)* %in1
    539   %y = load float, float addrspace(1)* %in2
    540   %t = load float, float addrspace(1)* %in3
          ; %t1 is the complementary weight applied to %y.
    541   %t1 = fsub float 1.0, %t
    542   %tx = fmul float %x, %t
    543   %ty = fmul float %y, %t1
    544   %r = fadd float %tx, %ty
    545   store float %r, float addrspace(1)* %out
    546   ret void
    547 }
    548 
        ; Linear interpolation x*t + y*(1.0 - t) on doubles. The checks expect two
        ; v_fma_f64: the first with sources -t, y, y, the second with sources x, t
        ; and an addend equal to its own destination register.
        ; NOTE(review): the second pattern re-captures VR, so its trailing [[VR]]
        ; refers to that line's destination and not to the first fma's result --
        ; confirm whether the chain between the two instructions should be enforced.
    549 ; FUNC-LABEL: {{^}}test_f64_interp:
    550 ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
    551 ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
    552 define void @test_f64_interp(double addrspace(1)* %out,
    553                              double addrspace(1)* %in1,
    554                              double addrspace(1)* %in2,
    555                              double addrspace(1)* %in3) {
    556   %x = load double, double addrspace(1)* %in1
    557   %y = load double, double addrspace(1)* %in2
    558   %t = load double, double addrspace(1)* %in3
          ; %t1 is the complementary weight applied to %y.
    559   %t1 = fsub double 1.0, %t
    560   %tx = fmul double %x, %t
    561   %ty = fmul double %y, %t1
    562   %r = fadd double %tx, %ty
    563   store double %r, double addrspace(1)* %out
    564   ret void
    565 }
    566 
    567 attributes #0 = { nounwind readnone }
    568 attributes #1 = { nounwind }
    569