; LLVM CodeGen lit test (llvm/test/CodeGen/AMDGPU): mad/fma formation from separate fmul/fadd.
      1 ; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
      2 
      3 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD  -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
      4 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
      5 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
      6 
      7 ; Make sure we don't form mad with denormals
      8 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
      9 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
     10 
     11 declare i32 @llvm.amdgcn.workitem.id.x() #0
     12 declare float @llvm.fabs.f32(float) #0
     13 declare float @llvm.fma.f32(float, float, float) #0
     14 declare float @llvm.fmuladd.f32(float, float, float) #0
     15 
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]],  [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
; Kernel computes a*b + c from three consecutive dwords at in[tid] and stores
; the result to out[tid]; the checks above verify whether the separate
; fmul/fadd pair is contracted to mac/fma (or kept separate) per subtarget.
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  ; volatile keeps each load distinct so the three buffer loads above can be
  ; matched individually and are not CSE'd or reordered away
  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}
     50 
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
; One multiply feeds two adds: stores a*b+c to out[tid] and a*b+d to
; out[tid+1], exercising the combine when the fmul has multiple uses.
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul deliberately has two users
  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}
     95 
; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
; Kernel computes c + a*b (the multiply is the second fadd operand, i.e. the
; commuted form of the pattern above the previous kernel) and stores to out[tid].
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}
    126 
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
; Kernel computes a*b - c and stores to out[tid]; the combined forms fold the
; subtraction as a negated addend (source-modifier on the third operand).
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}
    156 
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
; One multiply feeds two subtracts: stores a*b-c to out[tid] and a*b-d to
; out[tid+1], exercising the combine when the fmul has multiple uses.
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul deliberately has two users
  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}
    198 
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
; Kernel computes c - a*b (multiply is the subtrahend) and stores to out[tid];
; the combined forms negate the first multiplicand via a source modifier.
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}
    228 
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]],  [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
; One multiply is the subtrahend of two subtracts: stores c-a*b to out[tid]
; and d-a*b to out[tid+1], exercising the combine with a multi-use fmul.
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  ; %mul deliberately has two users
  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}
    270 
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
; Kernel negates the product first: computes -(a*b) - c and stores to
; out[tid]. The fneg is expressed as (fsub -0.0, %mul), the IR idiom used
; before a dedicated fneg instruction existed.
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  ; fneg spelled as subtraction from -0.0
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}
    303 
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]],  [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
; The negated product feeds two subtracts: stores -(a*b)-c to out[tid] and
; -(a*b)-d to out[tid+1], exercising the combine when the fneg has two uses.
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  ; fneg spelled as subtraction from -0.0; %mul.neg has two users
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}
    347 
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
; The product is used both negated and un-negated: stores -(a*b)-c to
; out[tid] and a*b-d to out[tid+1], so the fmul itself has two users with
; opposite signs.
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  ; fneg spelled as subtraction from -0.0
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}
    391 
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; Kernel computes fma(x, y, u*v) - z using the explicit llvm.fma intrinsic,
; loading x,y,z,u,v from five consecutive dwords at in[tid]; the checks
; verify the aggressive nested-fma fold noted above is NOT applied here
; (the expected output keeps the mul/fma/sub sequence).
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
    432 
; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Kernel computes x - fma(y, z, u*v) using the explicit llvm.fma intrinsic;
; the checks verify the aggressive double-negated-fma fold noted above is
; NOT applied (the expected output keeps the mul/fma/sub sequence).
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
    475 
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]],  [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Same shape as the llvm.fma variant above but via llvm.fmuladd, which the
; backend may freely lower to mul+add or a fused op: computes
; fmuladd(x, y, u*v) - z. Only the unsafe-math run performs the aggressive
; fold into the subtraction.
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
    525 
; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
; Kernel computes x - fmuladd(y, z, u*v); only the unsafe-math run applies
; the double negated-mad fold noted above, all other configurations keep the
; mul / mac-or-fma / sub sequence.
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}
    576 
    577 attributes #0 = { nounwind readnone }
    578 attributes #1 = { nounwind }
    579