; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
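; The first RUN line checks the default IEEE-safe lowering (GCN-SAFE); the
; second enables -enable-no-signed-zeros-fp-math and checks the more
; aggressive folds (GCN-NSZ). Lines prefixed with plain GCN must hold in
; both modes.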

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
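; fneg(fadd(a, b)) may only fold to fsub(-a, b) when signed zeros can be
; ignored: for a = 1.0, b = -1.0, -(a + b) is -0.0 but (-a) - b is +0.0.
; The safe lowering therefore keeps the add and negates the result with a
; sign-bit xor, while the nsz lowering folds the fneg into the operands.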

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
; GCN: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fsub float -0.000000e+00, %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

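; -((-a) + b) simplifies to a - b, so with nsz the fneg cancels and no xor
; is needed.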
; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %add = fadd float %a, %fneg.b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

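; -((-a) + (-b)) simplifies back to a + b under nsz.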
; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %add = fadd float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
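; Negation distributes over multiplication exactly: -(a * b) == a * (-b),
; because the sign of a product (including a zero product) is the xor of
; the operands' sign bits. Unlike fadd, this fold needs no nsz, so these
; checks use the plain GCN prefix.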

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

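; The two negations in -((-a) * b) cancel, leaving a plain a * b.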
; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = fmul float %a, %fneg.b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = fmul float %fneg.a, %b
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
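; Negation reverses the ordering of floats, so fneg(minnum(a, b)) folds to
; maxnum(-a, -b). The identity also holds under the minnum/maxnum NaN rule
; (the non-NaN operand is returned either way), so it needs no nsz.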

; GCN-LABEL: {{^}}v_fneg_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fsub float -0.0, %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
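; Mirror images of the fminnum tests above: fneg(maxnum(a, b)) folds to
; minnum(-a, -b), lowered to v_min with negated source modifiers.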

; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fsub float -0.0, %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fsub float -0.000000e+00, %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MAX0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
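; As with fadd, -(fma(a, b, c)) == fma(a, -b, -c) only up to the sign of a
; zero result, so the safe lowering keeps the fma and xors the sign bit
; while the nsz lowering folds the fneg into two of the operands.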

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
    830 
    831 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
    832 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
    833 define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
    834   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    835   %tid.ext = sext i32 %tid to i64
    836   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
    837   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
    838   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
    839   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
    840   %a = load volatile float, float addrspace(1)* %a.gep
    841   %b = load volatile float, float addrspace(1)* %b.gep
    842   %c = load volatile float, float addrspace(1)* %c.gep
    843   %fneg.a = fsub float -0.000000e+00, %a
    844   %fneg.b = fsub float -0.000000e+00, %b
    845   %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
    846   %fneg = fsub float -0.000000e+00, %fma
    847   store volatile float %fneg, float addrspace(1)* %out
    848   ret void
    849 }
    850 
    851 ; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
    852 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
    853 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
    854 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
    855 
    856 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.c = fsub float -0.000000e+00, %c
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

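; -(fma(a, b, -c)) == fma(a, -b, c) under nsz: the fneg folds through the
; addend and one multiplicand.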
; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.c = fsub float -0.000000e+00, %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------
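; Same folds as the fma tests above, but through llvm.fmuladd, which SI
; lowers to v_mac/v_mad.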

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: buffer_store_dword [[NEG_MAD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fsub float -0.000000e+00, %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
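
; v_cvt_f64_f32 accepts source modifiers, so fneg(fpext x) should become a
; single conversion of -x, and fneg(fpext(fneg x)) should cancel entirely
; unless an intermediate value has other users.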

; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
; GCN: buffer_store_dword [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %fpext, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %mul, double addrspace(1)* %out.gep
  ret void
}

; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fsub float -0.000000e+00, %fpext
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpext, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fsub float -0.000000e+00, %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------
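
; Like the extends above, the f64->f32 and f32->f16 conversions take source
; modifiers, so the fneg should fold into the convert unless the intermediate
; value has other users.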

; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %fneg.a, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dwordx2 [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fsub float -0.000000e+00, %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %use1, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: buffer_store_dword [[NEG]]
; GCN: buffer_store_dword [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fsub float -0.000000e+00, %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpround, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------
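
; v_rcp_f32 takes source modifiers, so -(1/x) folds to rcp(-x), and a fneg
; feeding the rcp cancels against a fneg of the result.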

; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fsub float -0.000000e+00, %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp_legacy tests
; --------------------------------------------------------------------------------
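
; The legacy reciprocal is expected to fold the fneg into its source modifier
; the same way as llvm.amdgcn.rcp above.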

; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
  %fneg = fsub float -0.000000e+00, %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------
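
; v_mul_legacy_f32 is the DX9-style multiply (0 * anything = 0), but it still
; accepts source modifiers, so the fneg combines mirror the plain fmul cases.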

; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}


; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], 4.0
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %fneg.b = fsub float -0.000000e+00, %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fsub float -0.000000e+00, %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fsub float -0.000000e+00, %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------
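
; llvm.sin is lowered as a multiply by 1/(2*pi), then v_fract and v_sin. In
; the first test the fneg is folded into that scale constant: 0xbe22f983 is
; -(1/(2*pi)).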

; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fsub float -0.000000e+00, %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fsub float -0.0, %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------
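
; v_trunc_f32 accepts source modifiers, so fneg(trunc(x)) should select
; trunc(-x).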

; GCN-LABEL: {{^}}v_fneg_trunc_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fsub float -0.0, %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------
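
; llvm.round has no direct instruction; it expands to a trunc plus a
; sign-adjusted correction (hence the v_cndmask), and the fneg lands on the
; final add: negated with an xor in the safe case, folded into a sub under nsz.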

; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fsub float -0.0, %round
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------
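
; llvm.rint maps to v_rndne_f32 (round to nearest even), which takes source
; modifiers.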

; GCN-LABEL: {{^}}v_fneg_rint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fsub float -0.0, %rint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------
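
; llvm.nearbyint lowers the same way as llvm.rint here, so the identical
; v_rndne fold applies.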

; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fsub float -0.0, %nearbyint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------
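
; Canonicalization is implemented as a multiply by 1.0; folding the fneg
; simply flips the constant to -1.0.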

; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %canon = call float @llvm.canonicalize.f32(float %a)
  %fneg = fsub float -0.0, %canon
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------
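
; The v_interp operands don't take source modifiers, so the fneg has to fold
; into the mul that feeds the interpolation instead.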

; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}


; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------
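
; The negated value is live across a branch here, so the combine must not
; rewrite the copied register; the xor is only materialized in the block that
; actually uses the fneg.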

; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc1

; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: buffer_store_dword [[MUL1]]

; GCN: buffer_store_dword [[MUL0]]
define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, float addrspace(1)* %out.gep
  br label %endif

endif:
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------

; The fneg can't be folded into the inline asm use, so it should be folded
; into its source instead.
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; The fneg can't be folded into the inline asm use, and the mul has another
; user, so the fneg must be materialized as an xor rather than folded into
; its source.
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fsub float -0.0, %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------
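
; Source modifiers are only encodable in the 64-bit VOP3 form; a plain VOP2
; encoding is 32 bits. Folding an fneg into a VOP2 user therefore costs
; encoding size, while VOP3 users absorb the modifier for free.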

; Both users of the fneg are VOP3 instructions, so folding the source
; modifier into them carries no encoding-size penalty.
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
; GCN-NEXT: buffer_store_dword [[FMA0]]
; GCN-NEXT: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, but both are VOP2, so each folded modifier
; forces the larger VOP3 encoding.

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: buffer_store_dword [[MUL0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier there is free; the VOP2 user
; would need the larger encoding.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: buffer_store_dword [[FMA0]]
; GCN-NEXT: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fsub float -0.0, %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}
   1992 
   1993 ; The use of the fneg requires a code size increase, but folding into
   1994 ; the source does not
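;
; Sketch (illustration only) of the nsz rewrite being checked: rather
; than attaching neg modifiers to the fneg's two multiply users (two
; VOP2 -> VOP3 growths), the negation is pushed into the fma's
; sources, which are VOP3 operands anyway:
;   -(fma(a, b, 2.0)) * c  ==>  fma(a, -b, -2.0) * c
;   -(fma(a, b, 2.0)) * d  ==>  fma(a, -b, -2.0) * d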

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: buffer_store_dword [[MUL1]]
; GCN-NEXT: buffer_store_dword [[MUL2]]
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fsub float -0.0, %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: buffer_store_dwordx2 [[MUL0]]
; GCN: buffer_store_dwordx2 [[MUL1]]
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fsub double -0.0, %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has one fneg use, but folding the fneg there requires a
; code size increase; it can instead be folded for free into the fma.
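;
; Sketch (illustration only): both placements are legal because
; trunc(-x) == -trunc(x), but only one of them is free:
;   v_trunc_f32_e64 v1, -v0       ; fold into trunc: VOP2 -> VOP3, +4 bytes
;   v_trunc_f32_e32 v1, v0
;   v_fma_f32 v4, -v1, v2, v3     ; fold into fma: VOP3 anyway, free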

; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: buffer_store_dword [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fsub float -0.0, %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: buffer_store_dword [[FMA0]]
; GCN: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fsub float -0.0, %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1

declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }