Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
      2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
      3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
      4 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
      5 
      6 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
      7 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
      8 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
      9 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
     10 
     11 
     12 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s
     13 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s
     14 
     15 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s
     16 
     17 ; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
     18 ; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s
     19 
     20 
     21 ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
     22 
     23 target triple = "amdgcn--"
     24 
     25 
     26 declare i32 @llvm.amdgcn.workitem.id.x() #1
     27 declare float @llvm.fmuladd.f32(float, float, float) #1
     28 declare half @llvm.fmuladd.f16(half, half, half) #1
     29 declare float @llvm.fabs.f32(float) #1
     30 
     31 ; GCN-LABEL: {{^}}fmuladd_f32:
     32 ; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     33 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     34 
     35 ; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     36 
     37 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
     38 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
        ; Baseline case: llvm.fmuladd.f32 applied to three values loaded from
        ; %in1, %in2 and %in3, with the result stored to %out.
        ; The checks above expect a mac/fmac when denormals are flushed, a
        ; v_fma_f32 when denormals are on and FMA is fast, and a separate
        ; multiply and add when denormals are on and FMA is slow.
     39 define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
     40                          float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
     41   %r0 = load float, float addrspace(1)* %in1
     42   %r1 = load float, float addrspace(1)* %in2
     43   %r2 = load float, float addrspace(1)* %in3
        ; Computes %r0 * %r1 + %r2 through the fmuladd intrinsic.
     44   %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
     45   store float %r3, float addrspace(1)* %out
     46   ret void
     47 }
     48 
     49 ; GCN-LABEL: {{^}}fmul_fadd_f32:
     50 ; GCN-FLUSH: v_mac_f32
     51 
     52 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
     53 
     54 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
     55 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
     56 
     57 ; GCN-DENORM-STRICT: v_mul_f32_e32
     58 ; GCN-DENORM-STRICT: v_add_f32_e32
        ; Same arithmetic as fmuladd_f32, but written as an explicit fmul
        ; followed by an fadd instead of the intrinsic.
        ; With denormals enabled, only the fast-FMA runs using
        ; -fp-contract=fast expect a single v_fma_f32; the other denormal
        ; configurations expect a separate multiply and add.
     59 define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
     60                            float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
     61   %r0 = load volatile float, float addrspace(1)* %in1
     62   %r1 = load volatile float, float addrspace(1)* %in2
     63   %r2 = load volatile float, float addrspace(1)* %in3
     64   %mul = fmul float %r0, %r1
     65   %add = fadd float %mul, %r2
     66   store float %add, float addrspace(1)* %out
     67   ret void
     68 }
     69 
     70 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
     71 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
     72 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
     73 
     74 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
     75 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
     76 ; SI-FLUSH: buffer_store_dword [[R2]]
     77 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
     78 
     79 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
     80 
     81 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
     82 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
     83 
     84 ; SI-DENORM: buffer_store_dword [[RESULT]]
     85 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd with the constant 2.0 as the first multiplicand.
        ; The slow-FMA denormal checks expect the multiply by 2.0 to appear as
        ; an add of the value to itself, followed by the add of the addend.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1], and the result is stored back to %out[tid].
     86 define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
     87   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     88   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
     89   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
     90   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
     91 
     92   %r1 = load volatile float, float addrspace(1)* %gep.0
     93   %r2 = load volatile float, float addrspace(1)* %gep.1
     94 
     95   %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
     96   store float %r3, float addrspace(1)* %gep.out
     97   ret void
     98 }
     99 
    100 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
    101 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    102 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    103 
    104 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
    105 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
    106 
    107 ; SI-FLUSH: buffer_store_dword [[R2]]
    108 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    109 
    110 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
    111 
    112 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    113 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    114 
    115 ; SI-DENORM: buffer_store_dword [[RESULT]]
    116 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd with the constant 2.0 as the second multiplicand; the checks
        ; expect the same instructions as when 2.0 is the first multiplicand.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1], and the result is stored back to %out[tid].
    117 define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    118   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    119   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    120   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    121   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    122 
    123   %r1 = load volatile float, float addrspace(1)* %gep.0
    124   %r2 = load volatile float, float addrspace(1)* %gep.1
    125 
    126   %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
    127   store float %r3, float addrspace(1)* %gep.out
    128   ret void
    129 }
    130 
    131 ; GCN-LABEL: {{^}}fadd_a_a_b_f32:
    132 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    133 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    134 
    135 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
    136 
    137 ; SI-FLUSH: buffer_store_dword [[R2]]
    138 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    139 
    140 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
    141 
    142 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    143 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    144 
    145 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    146 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    147 
    148 ; SI-DENORM: buffer_store_dword [[RESULT]]
    149 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; (%r0 + %r0) + %r1 written with plain fadds and no intrinsic.
        ; The flush checks expect the doubled value to be folded into a mac
        ; with a 2.0 operand; the fast-FMA contract checks expect v_fma_f32.
        ; %in1 and %in2 are unused: both operands are volatile loads from
        ; %out[tid] and %out[tid + 1].
    150 define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
    151                             float addrspace(1)* %in1,
    152                             float addrspace(1)* %in2) #0 {
    153   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    154   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    155   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    156   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    157 
    158   %r0 = load volatile float, float addrspace(1)* %gep.0
    159   %r1 = load volatile float, float addrspace(1)* %gep.1
    160 
    161   %add.0 = fadd float %r0, %r0
    162   %add.1 = fadd float %add.0, %r1
    163   store float %add.1, float addrspace(1)* %gep.out
    164   ret void
    165 }
    166 
    167 ; GCN-LABEL: {{^}}fadd_b_a_a_f32:
    168 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    169 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    170 
    171 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
    172 
    173 ; SI-FLUSH: buffer_store_dword [[R2]]
    174 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    175 
    176 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
    177 
    178 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    179 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    180 
    181 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    182 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    183 
    184 ; SI-DENORM: buffer_store_dword [[RESULT]]
    185 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; Commuted form of fadd_a_a_b_f32: %r1 + (%r0 + %r0).
        ; The unfused checks expect the final add to take its operands in the
        ; swapped order (addend first, doubled value second).
        ; %in1 and %in2 are unused: both operands are volatile loads from
        ; %out[tid] and %out[tid + 1].
    186 define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
    187                             float addrspace(1)* %in1,
    188                             float addrspace(1)* %in2) #0 {
    189   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    190   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    191   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    192   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    193 
    194   %r0 = load volatile float, float addrspace(1)* %gep.0
    195   %r1 = load volatile float, float addrspace(1)* %gep.1
    196 
    197   %add.0 = fadd float %r0, %r0
    198   %add.1 = fadd float %r1, %add.0
    199   store float %add.1, float addrspace(1)* %gep.out
    200   ret void
    201 }
    202 
    203 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
    204 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    205 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    206 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
    207 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
    208 
    209 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
    210 
    211 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    212 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    213 
    214 ; SI-DENORM: buffer_store_dword [[RESULT]]
    215 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd with the constant -2.0 as the first multiplicand.
        ; The fused checks expect -2.0 as an inline operand; the slow-FMA
        ; denormal checks expect the value added to itself and then subtracted
        ; from the addend.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    216 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    217   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    218   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    219   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    220   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    221 
    222   %r1 = load volatile float, float addrspace(1)* %gep.0
    223   %r2 = load volatile float, float addrspace(1)* %gep.1
    224 
    225   %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
    226   store float %r3, float addrspace(1)* %gep.out
    227   ret void
    228 }
    229 
    230 ; The -2.0 constant and the negated multiplicand cancel, so the checks below expect a plain 2.0 multiply.
    231 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
    232 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    233 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    234 
    235 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
    236 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
    237 
    238 ; SI-FLUSH: buffer_store_dword [[R2]]
    239 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    240 
    241 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
    242 
    243 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    244 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    245 
    246 ; SI-DENORM: buffer_store_dword [[RESULT]]
    247 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd(-2.0, -%r1, %r2), where the negation of %r1 is written as
        ; an fsub from -0.0.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    248 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    249   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    250   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    251   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    252   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    253 
    254   %r1 = load volatile float, float addrspace(1)* %gep.0
    255   %r2 = load volatile float, float addrspace(1)* %gep.1
    256 
    257   %r1.fneg = fsub float -0.000000e+00, %r1
    258 
    259   %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
    260   store float %r3, float addrspace(1)* %gep.out
    261   ret void
    262 }
    263 
    264 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
    265 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    266 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    267 
    268 ; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
    269 ; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
    270 
    271 ; SI-FLUSH: buffer_store_dword [[R2]]
    272 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    273 
    274 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
    275 
    276 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    277 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    278 
    279 ; SI-DENORM: buffer_store_dword [[RESULT]]
    280 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd(2.0, -%r1, %r2), where the negation of %r1 is written as an
        ; fsub from -0.0.
        ; The checks expect the negation to be folded into the constant,
        ; giving the same instructions as the fmuladd_neg_2.0_a_b_f32 test.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    281 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    282   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    283   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    284   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    285   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    286 
    287   %r1 = load volatile float, float addrspace(1)* %gep.0
    288   %r2 = load volatile float, float addrspace(1)* %gep.1
    289 
    290   %r1.fneg = fsub float -0.000000e+00, %r1
    291 
    292   %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
    293   store float %r3, float addrspace(1)* %gep.out
    294   ret void
    295 }
    296 
    297 ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
    298 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    299 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    300 ; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
    301 ; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
    302 
    303 ; SI-FLUSH: buffer_store_dword [[RESULT]]
    304 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    305 
    306 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
    307 
    308 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    309 ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    310 
    311 ; SI-DENORM: buffer_store_dword [[RESULT]]
    312 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fmuladd(2.0, %r1, -%r2): here the addend is negated, not a
        ; multiplicand.
        ; The fused checks expect the three-source mad/fma form with a
        ; negated third operand, written to a fresh result register, and not
        ; the accumulating mac/fmac form.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    313 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    314   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    315   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    316   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    317   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    318 
    319   %r1 = load volatile float, float addrspace(1)* %gep.0
    320   %r2 = load volatile float, float addrspace(1)* %gep.1
    321 
    322   %r2.fneg = fsub float -0.000000e+00, %r2
    323 
    324   %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
    325   store float %r3, float addrspace(1)* %gep.out
    326   ret void
    327 }
    328 
    329 ; GCN-LABEL: {{^}}mad_sub_f32:
    330 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    331 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    332 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    333 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
    334 
    335 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
    336 
    337 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    338 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
    339 
    340 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    341 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
    342 
    343 ; SI: buffer_store_dword [[RESULT]]
    344 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; %a * %b - %c written as an explicit fmul and fsub.
        ; The fused checks expect the subtraction to become a negated third
        ; operand.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    345 define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    346   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    347   %tid.ext = sext i32 %tid to i64
    348   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    349   %add1 = add i64 %tid.ext, 1
    350   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    351   %add2 = add i64 %tid.ext, 2
    352   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    353   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    354   %a = load volatile float, float addrspace(1)* %gep0, align 4
    355   %b = load volatile float, float addrspace(1)* %gep1, align 4
    356   %c = load volatile float, float addrspace(1)* %gep2, align 4
    357   %mul = fmul float %a, %b
    358   %sub = fsub float %mul, %c
    359   store float %sub, float addrspace(1)* %outgep, align 4
    360   ret void
    361 }
    362 
    363 ; GCN-LABEL: {{^}}mad_sub_inv_f32:
    364 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    365 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    366 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    367 
    368 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
    369 
    370 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
    371 
    372 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    373 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
    374 
    375 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    376 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
    377 
    378 ; SI: buffer_store_dword [[RESULT]]
    379 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; %c - %a * %b: the product is the subtrahend.
        ; The fused checks expect the negation to be placed on the first
        ; multiplicand.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    380 define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    381   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    382   %tid.ext = sext i32 %tid to i64
    383   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    384   %add1 = add i64 %tid.ext, 1
    385   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    386   %add2 = add i64 %tid.ext, 2
    387   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    388   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    389   %a = load volatile float, float addrspace(1)* %gep0, align 4
    390   %b = load volatile float, float addrspace(1)* %gep1, align 4
    391   %c = load volatile float, float addrspace(1)* %gep2, align 4
    392   %mul = fmul float %a, %b
    393   %sub = fsub float %c, %mul
    394   store float %sub, float addrspace(1)* %outgep, align 4
    395   ret void
    396 }
    397 
    398 ; GCN-LABEL: {{^}}mad_sub_fabs_f32:
    399 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    400 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    401 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    402 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
    403 
    404 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
    405 
    406 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    407 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]],  [[TMP]], |[[REGC]]|
    408 
    409 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    410 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]],  [[TMP]], |[[REGC]]|
    411 
    412 ; SI: buffer_store_dword [[RESULT]]
    413 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; %a * %b - fabs(%c).
        ; The checks expect the fabs to appear as an |x| source modifier on
        ; the third operand, combined with the negation in the fused forms.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    414 define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    415   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    416   %tid.ext = sext i32 %tid to i64
    417   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    418   %add1 = add i64 %tid.ext, 1
    419   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    420   %add2 = add i64 %tid.ext, 2
    421   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    422   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    423   %a = load volatile float, float addrspace(1)* %gep0, align 4
    424   %b = load volatile float, float addrspace(1)* %gep1, align 4
    425   %c = load volatile float, float addrspace(1)* %gep2, align 4
    426   %c.abs = call float @llvm.fabs.f32(float %c) #0
    427   %mul = fmul float %a, %b
    428   %sub = fsub float %mul, %c.abs
    429   store float %sub, float addrspace(1)* %outgep, align 4
    430   ret void
    431 }
    432 
    433 ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
    434 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    435 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    436 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    437 ; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
    438 ; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
        ; NOTE(review): none of the RUN lines at the top of this file enables
        ; a GCN-FLUSH-FMA prefix (the gfx906 run uses GCN-FLUSH-FMAC), so the
        ; v_fma_f32 flush check above appears to be inert and the gfx906 flush
        ; output for this function looks unchecked. Confirm the intended
        ; prefix and instruction before changing it.
    439 
    440 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
    441 
    442 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    443 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
    444 
    445 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    446 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
    447 
    448 ; SI: buffer_store_dword [[RESULT]]
    449 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; fabs(%c) - %a * %b: the product is the subtrahend and the minuend
        ; carries the fabs.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    450 define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    451   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    452   %tid.ext = sext i32 %tid to i64
    453   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    454   %add1 = add i64 %tid.ext, 1
    455   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    456   %add2 = add i64 %tid.ext, 2
    457   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    458   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    459   %a = load volatile float, float addrspace(1)* %gep0, align 4
    460   %b = load volatile float, float addrspace(1)* %gep1, align 4
    461   %c = load volatile float, float addrspace(1)* %gep2, align 4
    462   %c.abs = call float @llvm.fabs.f32(float %c) #0
    463   %mul = fmul float %a, %b
    464   %sub = fsub float %c.abs, %mul
    465   store float %sub, float addrspace(1)* %outgep, align 4
    466   ret void
    467 }
    468 
    469 ; GCN-LABEL: {{^}}neg_neg_mad_f32:
    470 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    471 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    472 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    473 
    474 ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
    475 ; SI-FLUSH: buffer_store_dword [[REGC]]
    476 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
    477 
    478 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
    479 
    480 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]],  [[REGA]], [[REGB]]
    481 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
    482 
    483 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
    484 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
    485 
    486 ; SI-DENORM: buffer_store_dword [[RESULT]]
    487 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; (-%a) * (-%b) + %c, with both negations written as fsub from -0.0.
        ; The checks expect the two negations to cancel: no operand in any
        ; expected instruction carries a negate modifier.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    488 define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    489   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    490   %tid.ext = sext i32 %tid to i64
    491   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    492   %add1 = add i64 %tid.ext, 1
    493   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    494   %add2 = add i64 %tid.ext, 2
    495   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    496   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    497   %a = load volatile float, float addrspace(1)* %gep0, align 4
    498   %b = load volatile float, float addrspace(1)* %gep1, align 4
    499   %c = load volatile float, float addrspace(1)* %gep2, align 4
    500   %nega = fsub float -0.000000e+00, %a
    501   %negb = fsub float -0.000000e+00, %b
    502   %mul = fmul float %nega, %negb
        ; Despite its name, %sub is the result of an fadd.
    503   %sub = fadd float %mul, %c
    504   store float %sub, float addrspace(1)* %outgep, align 4
    505   ret void
    506 }
    507 
    508 ; GCN-LABEL: {{^}}mad_fabs_sub_f32:
    509 ; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
    510 ; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
    511 ; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
    512 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
    513 
    514 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
    515 
    516 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
    517 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
    518 
    519 ; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
    520 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
    521 
    522 ; SI: buffer_store_dword [[RESULT]]
    523 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; %a * fabs(%b) - %c: the fabs is on a multiplicand, not the addend.
        ; The checks expect the |x| modifier on the second source of the
        ; fused instruction, or on the multiply in the unfused sequence.
        ; The three inputs are volatile loads from %ptr[tid], %ptr[tid + 1]
        ; and %ptr[tid + 2]; the result is stored to %out[tid].
    524 define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
    525   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
    526   %tid.ext = sext i32 %tid to i64
    527   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
    528   %add1 = add i64 %tid.ext, 1
    529   %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
    530   %add2 = add i64 %tid.ext, 2
    531   %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
    532   %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
    533   %a = load volatile float, float addrspace(1)* %gep0, align 4
    534   %b = load volatile float, float addrspace(1)* %gep1, align 4
    535   %c = load volatile float, float addrspace(1)* %gep2, align 4
    536   %b.abs = call float @llvm.fabs.f32(float %b) #0
    537   %mul = fmul float %a, %b.abs
    538   %sub = fsub float %mul, %c
    539   store float %sub, float addrspace(1)* %outgep, align 4
    540   ret void
    541 }
    542 
    543 ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
    544 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    545 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    546 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
    547 ; SI-FLUSH: buffer_store_dword [[R2]]
    548 ; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
    549 
    550 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
    551 
    552 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    553 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    554 
    555 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    556 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
    557 
    558 ; SI-DENORM: buffer_store_dword [[RESULT]]
    559 ; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; %r2 - (%r1 + %r1) written with a plain fadd and fsub.
        ; The fused checks expect the doubled-and-subtracted value to appear
        ; as a multiply by -2.0.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    560 define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    561   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
    562   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    563   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    564   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    565 
    566   %r1 = load volatile float, float addrspace(1)* %gep.0
    567   %r2 = load volatile float, float addrspace(1)* %gep.1
    568 
    569   %add = fadd float %r1, %r1
    570   %r3 = fsub float %r2, %add
    571 
    572   store float %r3, float addrspace(1)* %gep.out
    573   ret void
    574 }
    575 
    576 ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
    577 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
    578 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
    579 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
    580 
    581 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
    582 
    583 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    584 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    585 
    586 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
    587 ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
    588 
    589 ; SI: buffer_store_dword [[RESULT]]
    590 ; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
        ; (%r1 + %r1) - %r2 written with a plain fadd and fsub.
        ; The fused checks expect a multiply by 2.0 with a negated third
        ; operand.
        ; %in is unused: both operands are volatile loads from %out[tid] and
        ; %out[tid + 1].
    591 define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    592   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
    593   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
    594   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
    595   %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
    596 
    597   %r1 = load volatile float, float addrspace(1)* %gep.0
    598   %r2 = load volatile float, float addrspace(1)* %gep.1
    599 
    600   %add = fadd float %r1, %r1
    601   %r3 = fsub float %add, %r2
    602 
    603   store float %r3, float addrspace(1)* %gep.out
    604   ret void
    605 }
    606 
    607 attributes #0 = { nounwind }
    608 attributes #1 = { nounwind readnone }
    609