Home | History | Annotate | Download | only in AMDGPU
      1 ; Make sure we still form mad, rather than fma, even when unsafe math or fp-contract is allowed.
      2 
      3 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
      4 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
      5 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
      6 
      7 ; Make sure we don't form mad with denormals
      8 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
      9 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
     10 
     11 declare i32 @llvm.r600.read.tidig.x() #0
     12 declare float @llvm.fabs.f32(float) #0
     13 declare float @llvm.fma.f32(float, float, float) #0
     14 declare float @llvm.fmuladd.f32(float, float, float) #0
     15 
     16 ; (fadd (fmul x, y), z) -> (fma x, y, z)
        ; Single-use multiply feeding an fadd. The checks expect v_mac_f32 in the
        ; SI-STD runs, v_fma_f32 in the SI-DENORM run, and a separate v_mul/v_add
        ; pair (with no v_fma and no v_mad before it) in the SI-DENORM-SLOWFMAF run.
        ; NOTE(review): no store check is active for the SI-DENORM-SLOWFMAF run,
        ; since that run does not enable the SI-DENORM prefix - confirm intended.
     17 ; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
     18 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     19 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
     20 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     21 
     22 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
     23 
     24 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
     25 
     26 ; SI-DENORM-SLOWFMAF-NOT: v_fma
     27 ; SI-DENORM-SLOWFMAF-NOT: v_mad
     28 
     29 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
     30 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
     31 
        ; The SI-STD runs store the register captured as C, which is also the
        ; first operand of the expected v_mac_f32.
     32 ; SI-DENORM: buffer_store_dword [[RESULT]]
     33 ; SI-STD: buffer_store_dword [[C]]
     34 define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid], in[tid + 1], in[tid + 2] and out[tid].
     35   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
     36   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
     37   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
     38   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
     39   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
     40 
     41   %a = load float, float addrspace(1)* %gep.0
     42   %b = load float, float addrspace(1)* %gep.1
     43   %c = load float, float addrspace(1)* %gep.2
     44 
          ; out[tid] = (a * b) + c, written as separate fmul and fadd.
     45   %mul = fmul float %a, %b
     46   %fma = fadd float %mul, %c
     47   store float %fma, float addrspace(1)* %gep.out
     48   ret void
     49 }
     50 
     51 ; (fadd (fmul x, y), z) -> (fma x, y, z)
        ; Same pattern as above, but the multiply has two fadd users. The checks
        ; expect one v_mac_f32 / v_fma_f32 per user in the SI-STD / SI-DENORM runs,
        ; and one shared v_mul followed by two v_add in the SI-DENORM-SLOWFMAF run.
        ; NOTE(review): the store checks below are only active for the SI-DENORM
        ; and SI-STD prefixes, so the SI-DENORM-SLOWFMAF run does not verify the
        ; stores - confirm intended.
     52 ; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
     53 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     54 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
     55 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
     56 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
     57 
     58 ; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
     59 ; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
     60 
     61 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
     62 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
     63 
     64 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
     65 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
     66 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
     67 
     68 ; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     69 ; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
     70 ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     71 ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
     72 ; SI: s_endpgm
     73 define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 3] and out[tid], out[tid + 1].
     74   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
     75   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
     76   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
     77   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
     78   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
     79   %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
     80   %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
     81 
     82   %a = load float, float addrspace(1)* %gep.0
     83   %b = load float, float addrspace(1)* %gep.1
     84   %c = load float, float addrspace(1)* %gep.2
     85   %d = load float, float addrspace(1)* %gep.3
     86 
          ; One multiply shared by two adds: (a * b) + c and (a * b) + d.
     87   %mul = fmul float %a, %b
     88   %fma0 = fadd float %mul, %c
     89   %fma1 = fadd float %mul, %d
     90 
     91   store float %fma0, float addrspace(1)* %gep.out.0
     92   store float %fma1, float addrspace(1)* %gep.out.1
     93   ret void
     94 }
     95 
     96 ; (fadd x, (fmul y, z)) -> (fma y, z, x)
        ; Commuted form of the first test: the multiply is the second fadd operand.
        ; The SI-STD and SI-DENORM expectations match the non-commuted test; in the
        ; SI-DENORM-SLOWFMAF run the v_add operands are checked as TMP, C.
        ; NOTE(review): no store check is active for the SI-DENORM-SLOWFMAF run -
        ; confirm intended.
     97 ; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
     98 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
     99 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    100 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    101 
    102 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
    103 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
    104 
    105 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    106 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
    107 
    108 ; SI-DENORM: buffer_store_dword [[RESULT]]
    109 ; SI-STD: buffer_store_dword [[C]]
    110 define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid], in[tid + 1], in[tid + 2] and out[tid].
    111   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    112   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    113   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    114   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    115   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    116 
    117   %a = load float, float addrspace(1)* %gep.0
    118   %b = load float, float addrspace(1)* %gep.1
    119   %c = load float, float addrspace(1)* %gep.2
    120 
          ; out[tid] = c + (a * b); the multiply is the right-hand fadd operand.
    121   %mul = fmul float %a, %b
    122   %fma = fadd float %c, %mul
    123   store float %fma, float addrspace(1)* %gep.out
    124   ret void
    125 }
    126 
    127 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
        ; Single-use multiply as the left operand of an fsub. The checks expect the
        ; subtraction to appear as a negated third operand of v_mad_f32 (SI-STD) or
        ; v_fma_f32 (SI-DENORM), and as v_mul + v_subrev in the SI-DENORM-SLOWFMAF
        ; run. Every run captures RESULT, so the common store check applies to all.
    128 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
    129 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    130 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    131 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    132 
    133 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
    134 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
    135 
    136 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    137 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
    138 
    139 ; SI: buffer_store_dword [[RESULT]]
    140 define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid], in[tid + 1], in[tid + 2] and out[tid].
    141   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    142   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    143   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    144   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    145   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    146 
    147   %a = load float, float addrspace(1)* %gep.0
    148   %b = load float, float addrspace(1)* %gep.1
    149   %c = load float, float addrspace(1)* %gep.2
    150 
          ; out[tid] = (a * b) - c.
    151   %mul = fmul float %a, %b
    152   %fma = fsub float %mul, %c
    153   store float %fma, float addrspace(1)* %gep.out
    154   ret void
    155 }
    156 
    157 ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
        ; Two-user variant: the same multiply is the left operand of two fsubs.
        ; The checks expect one v_mad_f32 / v_fma_f32 per user with a negated third
        ; operand, or one shared v_mul plus two v_subrev in the
        ; SI-DENORM-SLOWFMAF run. Stores to out[tid] and out[tid + 1] are checked
        ; for every run via RESULT0 / RESULT1.
    158 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
    159 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    160 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    161 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    162 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    163 
    164 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
    165 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
    166 
    167 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
    168 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
    169 
    170 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    171 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
    172 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
    173 
    174 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    175 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    176 ; SI: s_endpgm
    177 define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 3] and out[tid], out[tid + 1].
    178   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    179   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    180   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    181   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    182   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    183   %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    184   %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
    185 
    186   %a = load float, float addrspace(1)* %gep.0
    187   %b = load float, float addrspace(1)* %gep.1
    188   %c = load float, float addrspace(1)* %gep.2
    189   %d = load float, float addrspace(1)* %gep.3
    190 
          ; One multiply shared by two subtractions: (a * b) - c and (a * b) - d.
    191   %mul = fmul float %a, %b
    192   %fma0 = fsub float %mul, %c
    193   %fma1 = fsub float %mul, %d
    194   store float %fma0, float addrspace(1)* %gep.out.0
    195   store float %fma1, float addrspace(1)* %gep.out.1
    196   ret void
    197 }
    198 
    199 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
        ; Single-use multiply as the right operand of an fsub. The checks expect a
        ; negated first operand on v_mad_f32 (SI-STD) or v_fma_f32 (SI-DENORM), and
        ; v_mul + v_subrev with operands TMP, C in the SI-DENORM-SLOWFMAF run.
        ; Every run captures RESULT, so the common store check applies to all.
    200 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
    201 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    202 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    203 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    204 
    205 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
    206 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
    207 
    208 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    209 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
    210 
    211 ; SI: buffer_store_dword [[RESULT]]
    212 define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid], in[tid + 1], in[tid + 2] and out[tid].
    213   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    214   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    215   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    216   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    217   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    218 
    219   %a = load float, float addrspace(1)* %gep.0
    220   %b = load float, float addrspace(1)* %gep.1
    221   %c = load float, float addrspace(1)* %gep.2
    222 
          ; out[tid] = c - (a * b).
    223   %mul = fmul float %a, %b
    224   %fma = fsub float %c, %mul
    225   store float %fma, float addrspace(1)* %gep.out
    226   ret void
    227 }
    228 
    229 ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
        ; Two-user variant: the same multiply is the right operand of two fsubs.
        ; The checks expect one v_mad_f32 / v_fma_f32 per user with a negated first
        ; operand, or one shared v_mul plus two v_subrev in the
        ; SI-DENORM-SLOWFMAF run. Stores to out[tid] and out[tid + 1] are checked
        ; for every run via RESULT0 / RESULT1.
    230 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
    231 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    232 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    233 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
        ; The fourth load (in[tid + 3]) must be captured here as D, because the
        ; arithmetic checks below refer to it and nothing else in this function's
        ; checks binds that variable.
        ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    234 
    235 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
    236 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
    237 
    238 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
    239 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
    240 
    241 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    242 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
    243 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
    244 
    245 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    246 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    247 ; SI: s_endpgm
    248 define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 3] and out[tid], out[tid + 1].
    249   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    250   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    251   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    252   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    253   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    254   %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    255   %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
    256 
    257   %a = load float, float addrspace(1)* %gep.0
    258   %b = load float, float addrspace(1)* %gep.1
    259   %c = load float, float addrspace(1)* %gep.2
    260   %d = load float, float addrspace(1)* %gep.3
    261 
          ; One multiply shared by two subtractions: c - (a * b) and d - (a * b).
    262   %mul = fmul float %a, %b
    263   %fma0 = fsub float %c, %mul
    264   %fma1 = fsub float %d, %mul
    265   store float %fma0, float addrspace(1)* %gep.out.0
    266   store float %fma1, float addrspace(1)* %gep.out.1
    267   ret void
    268 }
    269 
    270 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; Single-use negated multiply as the left operand of an fsub. The negation
        ; is written in IR as (fsub -0.0, mul). The checks expect both the first
        ; and third operands of v_mad_f32 / v_fma_f32 to be negated, and a v_mul
        ; followed by v_sub_f32_e64 with a negated TMP in the SI-DENORM-SLOWFMAF
        ; run. Every run captures RESULT, so the common store check applies to all.
    271 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
    272 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    273 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    274 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    275 
    276 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
    277 
    278 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
    279 
    280 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    281 ; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
    282 
    283 ; SI: buffer_store_dword [[RESULT]]
    284 define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid], in[tid + 1], in[tid + 2] and out[tid].
    285   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    286   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    287   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    288   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    289   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    290 
    291   %a = load float, float addrspace(1)* %gep.0
    292   %b = load float, float addrspace(1)* %gep.1
    293   %c = load float, float addrspace(1)* %gep.2
    294 
          ; out[tid] = (-0.0 - (a * b)) - c.
    295   %mul = fmul float %a, %b
    296   %mul.neg = fsub float -0.0, %mul
    297   %fma = fsub float %mul.neg, %c
    298 
    299   store float %fma, float addrspace(1)* %gep.out
    300   ret void
    301 }
    302 
    303 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; Two-user variant where the negated multiply (fsub -0.0, mul) is the left
        ; operand of two fsubs. The checks expect one v_mad_f32 / v_fma_f32 per
        ; user with negated first and third operands, or one shared v_mul plus two
        ; v_sub_f32_e64 with a negated TMP in the SI-DENORM-SLOWFMAF run. Stores to
        ; out[tid] and out[tid + 1] are checked for every run via RESULT0 / RESULT1.
    304 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
    305 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    306 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    307 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
        ; The fourth load (in[tid + 3]) must be captured here as D, because the
        ; arithmetic checks below refer to it and nothing else in this function's
        ; checks binds that variable.
        ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    308 
    309 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
    310 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
    311 
    312 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
    313 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
    314 
    315 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    316 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
    317 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
    318 
    319 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    320 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    321 ; SI: s_endpgm
    322 define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 3] and out[tid], out[tid + 1].
    323   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    324   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    325   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    326   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    327   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    328   %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    329   %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
    330 
    331   %a = load float, float addrspace(1)* %gep.0
    332   %b = load float, float addrspace(1)* %gep.1
    333   %c = load float, float addrspace(1)* %gep.2
    334   %d = load float, float addrspace(1)* %gep.3
    335 
          ; The negated product has two users: (-(a * b)) - c and (-(a * b)) - d.
    336   %mul = fmul float %a, %b
    337   %mul.neg = fsub float -0.0, %mul
    338   %fma0 = fsub float %mul.neg, %c
    339   %fma1 = fsub float %mul.neg, %d
    340 
    341   store float %fma0, float addrspace(1)* %gep.out.0
    342   store float %fma1, float addrspace(1)* %gep.out.1
    343   ret void
    344 }
    345 
    346 ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
        ; Two-user variant where the multiply itself has two users: once through
        ; the negation (fsub -0.0, mul) and once directly as an fsub operand. The
        ; checks expect RESULT0 from the negated form (first and third operands
        ; negated) and RESULT1 from the plain form (only the third operand
        ; negated); the SI-DENORM-SLOWFMAF run expects one shared v_mul feeding a
        ; v_sub_f32_e64 and a v_subrev_f32_e32.
    347 ; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
    348 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    349 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    350 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
        ; The fourth load (in[tid + 3]) must be captured here as D, because the
        ; arithmetic checks below refer to it and nothing else in this function's
        ; checks binds that variable.
        ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    351 
    352 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
    353 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
    354 
    355 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
    356 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
    357 
    358 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
    359 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
    360 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
    361 
    362 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    363 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    364 ; SI: s_endpgm
    365 define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 3] and out[tid], out[tid + 1].
    366   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    367   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    368   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    369   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    370   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    371   %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    372   %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
    373 
    374   %a = load float, float addrspace(1)* %gep.0
    375   %b = load float, float addrspace(1)* %gep.1
    376   %c = load float, float addrspace(1)* %gep.2
    377   %d = load float, float addrspace(1)* %gep.3
    378 
          ; The product is used twice: (-(a * b)) - c and (a * b) - d.
    379   %mul = fmul float %a, %b
    380   %mul.neg = fsub float -0.0, %mul
    381   %fma0 = fsub float %mul.neg, %c
    382   %fma1 = fsub float %mul, %d
    383 
    384   store float %fma0, float addrspace(1)* %gep.out.0
    385   store float %fma1, float addrspace(1)* %gep.out.1
    386   ret void
    387 }
    388 
    389 ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
        ; Here the inner operation is an explicit llvm.fma call. Captures map to the
        ; IR values as A = x, B = y, C = z, D = u, E = v (by load offset). The
        ; SI-STD and SI-DENORM-SLOWFMAF checks expect the unfolded v_mul / v_fma /
        ; v_subrev sequence; only the SI-DENORM run expects the nested-fma fold.
        ; Every run must capture its final value as RESULT, because the common
        ; store check at the end refers to that name.
    390 
    391 ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
    392 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    393 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    394 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    395 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    396 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    397 
    398 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    399 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
    400 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
    401 
    402 ; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
    403 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
    404 
    405 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    406 ; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
    407 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
    408 
    409 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    410 define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 4] and out[tid].
    411   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    412   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    413   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    414   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    415   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    416   %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
    417   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    418 
    419   %x = load float, float addrspace(1)* %gep.0
    420   %y = load float, float addrspace(1)* %gep.1
    421   %z = load float, float addrspace(1)* %gep.2
    422   %u = load float, float addrspace(1)* %gep.3
    423   %v = load float, float addrspace(1)* %gep.4
    424 
          ; out[tid] = fma(x, y, u * v) - z.
    425   %tmp0 = fmul float %u, %v
    426   %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
    427   %tmp2 = fsub float %tmp1, %z
    428 
    429   store float %tmp2, float addrspace(1)* %gep.out
    430   ret void
    431 }
    432 
    433 ; fold (fsub x, (fma y, z, (fmul u, v)))
    434 ;   -> (fma (fneg y), z, (fma (fneg u), v, x))
        ; Here the inner operation is an explicit llvm.fma call and it is the right
        ; operand of the fsub. Captures map to the IR values as A = x, B = y,
        ; C = z, D = u, E = v (by load offset). The SI-STD and SI-DENORM-SLOWFMAF
        ; checks expect the unfolded v_mul / v_fma / v_subrev sequence; only the
        ; SI-DENORM run expects the nested-fma fold with negated first operands.
    435 
    436 ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
    437 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    438 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    439 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    440 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    441 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    442 
    443 ; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    444 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
    445 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
    446 
    447 ; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
    448 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
    449 
    450 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    451 ; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
    452 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
    453 
    454 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    455 ; SI: s_endpgm
    456 define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 4] and out[tid].
    457   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    458   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    459   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    460   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    461   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    462   %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
    463   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    464 
    465   %x = load float, float addrspace(1)* %gep.0
    466   %y = load float, float addrspace(1)* %gep.1
    467   %z = load float, float addrspace(1)* %gep.2
    468   %u = load float, float addrspace(1)* %gep.3
    469   %v = load float, float addrspace(1)* %gep.4
    470 
          ; out[tid] = x - fma(y, z, u * v).
    471   %tmp0 = fmul float %u, %v
    472   %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
    473   %tmp2 = fsub float %x, %tmp1
    474 
    475   store float %tmp2, float addrspace(1)* %gep.out
    476   ret void
    477 }
    478 
    479 ; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z)))
        ; Same shape as the llvm.fma version of this fold, but the inner operation
        ; is an llvm.fmuladd call. Captures map to the IR values as A = x, B = y,
        ; C = z, D = u, E = v (by load offset). The SI-STD checks expect v_mad_f32
        ; followed by v_mac_f32 accumulating into TMP; the SI-DENORM run expects two
        ; v_fma_f32; the SI-DENORM-SLOWFMAF run expects two v_mul, a v_add and a
        ; v_subrev.
        ; NOTE(review): the store checks are only active for the SI-DENORM and
        ; SI-STD prefixes, so the SI-DENORM-SLOWFMAF run does not verify the
        ; store - confirm intended.
    480 
    481 ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
    482 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    483 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    484 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    485 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    486 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    487 
    488 ; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
    489 ; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]
    490 
    491 ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
    492 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
    493 
    494 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    495 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
    496 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
    497 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
    498 
    499 ; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    500 ; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    501 ; SI: s_endpgm
    502 define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 4] and out[tid].
    503   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    504   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    505   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    506   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    507   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    508   %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
    509   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    510 
    511   %x = load float, float addrspace(1)* %gep.0
    512   %y = load float, float addrspace(1)* %gep.1
    513   %z = load float, float addrspace(1)* %gep.2
    514   %u = load float, float addrspace(1)* %gep.3
    515   %v = load float, float addrspace(1)* %gep.4
    516 
          ; out[tid] = fmuladd(x, y, u * v) - z.
    517   %tmp0 = fmul float %u, %v
    518   %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
    519   %tmp2 = fsub float %tmp1, %z
    520 
    521   store float %tmp2, float addrspace(1)* %gep.out
    522   ret void
    523 }
    524 
    525 ; fold (fsub x, (fmuladd y, z, (fmul u, v)))
    526 ;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
        ; The inner operation is an llvm.fmuladd call and it is the right operand of
        ; the fsub. Captures map to the IR values as A = x, B = y, C = z, D = u,
        ; E = v (by load offset). The SI-STD checks expect two v_mad_f32 and the
        ; SI-DENORM run two v_fma_f32, each with a negated first operand; the
        ; SI-DENORM-SLOWFMAF run expects two v_mul, a v_add and a v_subrev. Every
        ; run captures RESULT, so the common store check applies to all.
    527 
    528 ; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
    529 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    530 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
    531 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
    532 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
    533 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
    534 
    535 ; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
    536 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
    537 
    538 ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
    539 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
    540 
    541 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
    542 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
    543 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
    544 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
    545 
    546 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
    547 ; SI: s_endpgm
    548 define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
          ; Address in[tid .. tid + 4] and out[tid].
    549   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
    550   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
    551   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    552   %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
    553   %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
    554   %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
    555   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    556 
    557   %x = load float, float addrspace(1)* %gep.0
    558   %y = load float, float addrspace(1)* %gep.1
    559   %z = load float, float addrspace(1)* %gep.2
    560   %u = load float, float addrspace(1)* %gep.3
    561   %v = load float, float addrspace(1)* %gep.4
    562 
          ; out[tid] = x - fmuladd(y, z, u * v).
    563   %tmp0 = fmul float %u, %v
    564   %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
    565   %tmp2 = fsub float %x, %tmp1
    566 
    567   store float %tmp2, float addrspace(1)* %gep.out
    568   ret void
    569 }
    570 
    571 attributes #0 = { nounwind readnone }
    572 attributes #1 = { nounwind }
    573