; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s

; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s

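; The check prefixes above correspond to the interesting lowering modes: both
; GCN-FLUSH runs flush fp32 denormals, so single-precision mad/mac style
; instructions are usable regardless of fmaf speed; GCN-FASTFMA keeps denormals
; and has fast fmaf, so v_fma_f32 is expected; GCN-SLOWFMA keeps denormals but
; has no fast fmaf, so separate multiplies and adds are expected instead.
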
; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.

; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
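;
; In IR terms, the chain exercised by the first test below is
;   %mul.u.v = fmul fast float %u, %v
;   %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
;   %add = fadd fast float %fma, %z
; which is expected to lower to two fused operations, roughly
; fma(%u, %v, %z) followed by fma(%x, %y, <inner result>), whenever a fused
; multiply-add is free to use (the FLUSH and FASTFMA configurations).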

; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

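; Same fold with an fsub as the final operation: %z is expected to be negated
; in the inner fused operation (note the -[[Z]] operand in the checks below).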
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[TMP]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

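; Because %mul.u.v has a second (volatile store) use, the inner multiply is
; expected to survive as a separate v_mul; the fmuladd can still become a
; mac/fma, but the final add stays a separate v_add.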
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

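; Same as the previous test, but with the operands of the final fadd commuted
; (%z + %fma); the expected instruction mix is unchanged.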
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

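; Here the fmuladd result itself has a second (volatile store) use. Only the
; slow-FMA lowering (separate multiplies and adds) is checked for this case.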
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

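; Commuted variant of the previous test (%z + %fma); again only the slow-FMA
; lowering is checked.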
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

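; fsub variant with an extra use of %mul.u.v: the multiply is expected to be
; emitted (and stored) separately, and the subtraction of %z to remain a
; separate v_sub rather than being folded into the fused operation.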
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]

; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %mul.u.v, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

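; fsub variant with an extra use of the fmuladd result: both the fused result
; and the subtraction result are stored, so the fsub is expected to remain a
; separate v_sub in every configuration.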
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fsub fast float %fma, %z
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }