Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
      2 
      3 ; CHECK-LABEL: fmaddsubpd_loop_128:
      4 ; CHECK:   vfmaddsub231pd %xmm1, %xmm0, %xmm2
      5 ; CHECK:   vmovaps %xmm2, %xmm0
      6 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmaddsub231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmaddsub.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
      7 define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
      8 entry:
      9   br label %for.cond
     10 
     11 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
     12   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
     13   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
     14   %cmp = icmp slt i32 %i.0, %iter
     15   br i1 %cmp, label %for.body, label %for.end
     16 
     17 for.body:
        ; Empty block: it only branches on to for.inc.
     18   br label %for.inc
     19 
     20 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
     21   %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
     22   %inc = add nsw i32 %i.0, 1
     23   br label %for.cond
     24 
     25 for.end:
        ; Loop exit: return the current accumulator.
     26   ret <2 x double> %c.addr.0
     27 }
     28 
     29 ; CHECK-LABEL: fmsubaddpd_loop_128:
     30 ; CHECK:   vfmsubadd231pd %xmm1, %xmm0, %xmm2
     31 ; CHECK:   vmovaps %xmm2, %xmm0
     32 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsubadd231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsubadd.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
     33 define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
     34 entry:
     35   br label %for.cond
     36 
     37 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
     38   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
     39   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
     40   %cmp = icmp slt i32 %i.0, %iter
     41   br i1 %cmp, label %for.body, label %for.end
     42 
     43 for.body:
        ; Empty block: it only branches on to for.inc.
     44   br label %for.inc
     45 
     46 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
     47   %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
     48   %inc = add nsw i32 %i.0, 1
     49   br label %for.cond
     50 
     51 for.end:
        ; Loop exit: return the current accumulator.
     52   ret <2 x double> %c.addr.0
     53 }
     54 
     55 ; CHECK-LABEL: fmaddpd_loop_128:
     56 ; CHECK:   vfmadd231pd %xmm1, %xmm0, %xmm2
     57 ; CHECK:   vmovaps %xmm2, %xmm0
     58 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmadd231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmadd.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
     59 define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
     60 entry:
     61   br label %for.cond
     62 
     63 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
     64   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
     65   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
     66   %cmp = icmp slt i32 %i.0, %iter
     67   br i1 %cmp, label %for.body, label %for.end
     68 
     69 for.body:
        ; Empty block: it only branches on to for.inc.
     70   br label %for.inc
     71 
     72 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
     73   %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
     74   %inc = add nsw i32 %i.0, 1
     75   br label %for.cond
     76 
     77 for.end:
        ; Loop exit: return the current accumulator.
     78   ret <2 x double> %c.addr.0
     79 }
     80 
     81 ; CHECK-LABEL: fmsubpd_loop_128:
     82 ; CHECK:   vfmsub231pd %xmm1, %xmm0, %xmm2
     83 ; CHECK:   vmovaps %xmm2, %xmm0
     84 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsub231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsub.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
     85 define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
     86 entry:
     87   br label %for.cond
     88 
     89 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
     90   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
     91   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
     92   %cmp = icmp slt i32 %i.0, %iter
     93   br i1 %cmp, label %for.body, label %for.end
     94 
     95 for.body:
        ; Empty block: it only branches on to for.inc.
     96   br label %for.inc
     97 
     98 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
     99   %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
    100   %inc = add nsw i32 %i.0, 1
    101   br label %for.cond
    102 
    103 for.end:
        ; Loop exit: return the current accumulator.
    104   ret <2 x double> %c.addr.0
    105 }
    106 
    107 ; CHECK-LABEL: fnmaddpd_loop_128:
    108 ; CHECK:   vfnmadd231pd %xmm1, %xmm0, %xmm2
    109 ; CHECK:   vmovaps %xmm2, %xmm0
    110 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmadd231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmadd.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    111 define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
    112 entry:
    113   br label %for.cond
    114 
    115 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    116   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
    117   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    118   %cmp = icmp slt i32 %i.0, %iter
    119   br i1 %cmp, label %for.body, label %for.end
    120 
    121 for.body:
        ; Empty block: it only branches on to for.inc.
    122   br label %for.inc
    123 
    124 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    125   %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
    126   %inc = add nsw i32 %i.0, 1
    127   br label %for.cond
    128 
    129 for.end:
        ; Loop exit: return the current accumulator.
    130   ret <2 x double> %c.addr.0
    131 }
    132 
    133 ; CHECK-LABEL: fnmsubpd_loop_128:
    134 ; CHECK:   vfnmsub231pd %xmm1, %xmm0, %xmm2
    135 ; CHECK:   vmovaps %xmm2, %xmm0
    136 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmsub231pd %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmsub.pd once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    137 define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
    138 entry:
    139   br label %for.cond
    140 
    141 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    142   %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
    143   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    144   %cmp = icmp slt i32 %i.0, %iter
    145   br i1 %cmp, label %for.body, label %for.end
    146 
    147 for.body:
        ; Empty block: it only branches on to for.inc.
    148   br label %for.inc
    149 
    150 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    151   %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
    152   %inc = add nsw i32 %i.0, 1
    153   br label %for.cond
    154 
    155 for.end:
        ; Loop exit: return the current accumulator.
    156   ret <2 x double> %c.addr.0
    157 }
    158 
    159 declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fmaddsubpd_loop_128
    160 declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fmsubaddpd_loop_128
    161 declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fmaddpd_loop_128
    162 declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fmsubpd_loop_128
    163 declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fnmaddpd_loop_128
    164 declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) ; called by @fnmsubpd_loop_128
    165 
    166 
    167 ; CHECK-LABEL: fmaddsubps_loop_128:
    168 ; CHECK:   vfmaddsub231ps %xmm1, %xmm0, %xmm2
    169 ; CHECK:   vmovaps %xmm2, %xmm0
    170 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmaddsub231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmaddsub.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    171 define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    172 entry:
    173   br label %for.cond
    174 
    175 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    176   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    177   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    178   %cmp = icmp slt i32 %i.0, %iter
    179   br i1 %cmp, label %for.body, label %for.end
    180 
    181 for.body:
        ; Empty block: it only branches on to for.inc.
    182   br label %for.inc
    183 
    184 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    185   %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    186   %inc = add nsw i32 %i.0, 1
    187   br label %for.cond
    188 
    189 for.end:
        ; Loop exit: return the current accumulator.
    190   ret <4 x float> %c.addr.0
    191 }
    192 
    193 ; CHECK-LABEL: fmsubaddps_loop_128:
    194 ; CHECK:   vfmsubadd231ps %xmm1, %xmm0, %xmm2
    195 ; CHECK:   vmovaps %xmm2, %xmm0
    196 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsubadd231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsubadd.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    197 define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    198 entry:
    199   br label %for.cond
    200 
    201 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    202   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    203   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    204   %cmp = icmp slt i32 %i.0, %iter
    205   br i1 %cmp, label %for.body, label %for.end
    206 
    207 for.body:
        ; Empty block: it only branches on to for.inc.
    208   br label %for.inc
    209 
    210 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    211   %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    212   %inc = add nsw i32 %i.0, 1
    213   br label %for.cond
    214 
    215 for.end:
        ; Loop exit: return the current accumulator.
    216   ret <4 x float> %c.addr.0
    217 }
    218 
    219 ; CHECK-LABEL: fmaddps_loop_128:
    220 ; CHECK:   vfmadd231ps %xmm1, %xmm0, %xmm2
    221 ; CHECK:   vmovaps %xmm2, %xmm0
    222 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmadd231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmadd.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    223 define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    224 entry:
    225   br label %for.cond
    226 
    227 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    228   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    229   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    230   %cmp = icmp slt i32 %i.0, %iter
    231   br i1 %cmp, label %for.body, label %for.end
    232 
    233 for.body:
        ; Empty block: it only branches on to for.inc.
    234   br label %for.inc
    235 
    236 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    237   %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    238   %inc = add nsw i32 %i.0, 1
    239   br label %for.cond
    240 
    241 for.end:
        ; Loop exit: return the current accumulator.
    242   ret <4 x float> %c.addr.0
    243 }
    244 
    245 ; CHECK-LABEL: fmsubps_loop_128:
    246 ; CHECK:   vfmsub231ps %xmm1, %xmm0, %xmm2
    247 ; CHECK:   vmovaps %xmm2, %xmm0
    248 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsub231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsub.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    249 define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    250 entry:
    251   br label %for.cond
    252 
    253 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    254   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    255   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    256   %cmp = icmp slt i32 %i.0, %iter
    257   br i1 %cmp, label %for.body, label %for.end
    258 
    259 for.body:
        ; Empty block: it only branches on to for.inc.
    260   br label %for.inc
    261 
    262 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    263   %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    264   %inc = add nsw i32 %i.0, 1
    265   br label %for.cond
    266 
    267 for.end:
        ; Loop exit: return the current accumulator.
    268   ret <4 x float> %c.addr.0
    269 }
    270 
    271 ; CHECK-LABEL: fnmaddps_loop_128:
    272 ; CHECK:   vfnmadd231ps %xmm1, %xmm0, %xmm2
    273 ; CHECK:   vmovaps %xmm2, %xmm0
    274 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmadd231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmadd.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    275 define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    276 entry:
    277   br label %for.cond
    278 
    279 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    280   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    281   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    282   %cmp = icmp slt i32 %i.0, %iter
    283   br i1 %cmp, label %for.body, label %for.end
    284 
    285 for.body:
        ; Empty block: it only branches on to for.inc.
    286   br label %for.inc
    287 
    288 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    289   %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    290   %inc = add nsw i32 %i.0, 1
    291   br label %for.cond
    292 
    293 for.end:
        ; Loop exit: return the current accumulator.
    294   ret <4 x float> %c.addr.0
    295 }
    296 
    297 ; CHECK-LABEL: fnmsubps_loop_128:
    298 ; CHECK:   vfnmsub231ps %xmm1, %xmm0, %xmm2
    299 ; CHECK:   vmovaps %xmm2, %xmm0
    300 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmsub231ps %xmm1, %xmm0, %xmm2" and, later, "vmovaps %xmm2, %xmm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmsub.ps once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    301 define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
    302 entry:
    303   br label %for.cond
    304 
    305 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    306   %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
    307   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    308   %cmp = icmp slt i32 %i.0, %iter
    309   br i1 %cmp, label %for.body, label %for.end
    310 
    311 for.body:
        ; Empty block: it only branches on to for.inc.
    312   br label %for.inc
    313 
    314 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    315   %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
    316   %inc = add nsw i32 %i.0, 1
    317   br label %for.cond
    318 
    319 for.end:
        ; Loop exit: return the current accumulator.
    320   ret <4 x float> %c.addr.0
    321 }
    322 
    323 declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fmaddsubps_loop_128
    324 declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fmsubaddps_loop_128
    325 declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fmaddps_loop_128
    326 declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fmsubps_loop_128
    327 declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fnmaddps_loop_128
    328 declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) ; called by @fnmsubps_loop_128
    329 
    330 ; CHECK-LABEL: fmaddsubpd_loop_256:
    331 ; CHECK:   vfmaddsub231pd %ymm1, %ymm0, %ymm2
    332 ; CHECK:   vmovaps %ymm2, %ymm0
    333 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmaddsub231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmaddsub.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    334 define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    335 entry:
    336   br label %for.cond
    337 
    338 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    339   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    340   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    341   %cmp = icmp slt i32 %i.0, %iter
    342   br i1 %cmp, label %for.body, label %for.end
    343 
    344 for.body:
        ; Empty block: it only branches on to for.inc.
    345   br label %for.inc
    346 
    347 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    348   %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    349   %inc = add nsw i32 %i.0, 1
    350   br label %for.cond
    351 
    352 for.end:
        ; Loop exit: return the current accumulator.
    353   ret <4 x double> %c.addr.0
    354 }
    355 
    356 ; CHECK-LABEL: fmsubaddpd_loop_256:
    357 ; CHECK:   vfmsubadd231pd %ymm1, %ymm0, %ymm2
    358 ; CHECK:   vmovaps %ymm2, %ymm0
    359 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsubadd231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsubadd.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    360 define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    361 entry:
    362   br label %for.cond
    363 
    364 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    365   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    366   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    367   %cmp = icmp slt i32 %i.0, %iter
    368   br i1 %cmp, label %for.body, label %for.end
    369 
    370 for.body:
        ; Empty block: it only branches on to for.inc.
    371   br label %for.inc
    372 
    373 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    374   %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    375   %inc = add nsw i32 %i.0, 1
    376   br label %for.cond
    377 
    378 for.end:
        ; Loop exit: return the current accumulator.
    379   ret <4 x double> %c.addr.0
    380 }
    381 
    382 ; CHECK-LABEL: fmaddpd_loop_256:
    383 ; CHECK:   vfmadd231pd %ymm1, %ymm0, %ymm2
    384 ; CHECK:   vmovaps %ymm2, %ymm0
    385 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmadd231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmadd.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    386 define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    387 entry:
    388   br label %for.cond
    389 
    390 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    391   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    392   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    393   %cmp = icmp slt i32 %i.0, %iter
    394   br i1 %cmp, label %for.body, label %for.end
    395 
    396 for.body:
        ; Empty block: it only branches on to for.inc.
    397   br label %for.inc
    398 
    399 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    400   %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    401   %inc = add nsw i32 %i.0, 1
    402   br label %for.cond
    403 
    404 for.end:
        ; Loop exit: return the current accumulator.
    405   ret <4 x double> %c.addr.0
    406 }
    407 
    408 ; CHECK-LABEL: fmsubpd_loop_256:
    409 ; CHECK:   vfmsub231pd %ymm1, %ymm0, %ymm2
    410 ; CHECK:   vmovaps %ymm2, %ymm0
    411 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsub231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsub.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    412 define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    413 entry:
    414   br label %for.cond
    415 
    416 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    417   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    418   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    419   %cmp = icmp slt i32 %i.0, %iter
    420   br i1 %cmp, label %for.body, label %for.end
    421 
    422 for.body:
        ; Empty block: it only branches on to for.inc.
    423   br label %for.inc
    424 
    425 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    426   %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    427   %inc = add nsw i32 %i.0, 1
    428   br label %for.cond
    429 
    430 for.end:
        ; Loop exit: return the current accumulator.
    431   ret <4 x double> %c.addr.0
    432 }
    433 
    434 ; CHECK-LABEL: fnmaddpd_loop_256:
    435 ; CHECK:   vfnmadd231pd %ymm1, %ymm0, %ymm2
    436 ; CHECK:   vmovaps %ymm2, %ymm0
    437 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmadd231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmadd.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    438 define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    439 entry:
    440   br label %for.cond
    441 
    442 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    443   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    444   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    445   %cmp = icmp slt i32 %i.0, %iter
    446   br i1 %cmp, label %for.body, label %for.end
    447 
    448 for.body:
        ; Empty block: it only branches on to for.inc.
    449   br label %for.inc
    450 
    451 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    452   %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    453   %inc = add nsw i32 %i.0, 1
    454   br label %for.cond
    455 
    456 for.end:
        ; Loop exit: return the current accumulator.
    457   ret <4 x double> %c.addr.0
    458 }
    459 
    460 ; CHECK-LABEL: fnmsubpd_loop_256:
    461 ; CHECK:   vfnmsub231pd %ymm1, %ymm0, %ymm2
    462 ; CHECK:   vmovaps %ymm2, %ymm0
    463 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmsub231pd %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmsub.pd.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    464 define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
    465 entry:
    466   br label %for.cond
    467 
    468 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    469   %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
    470   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    471   %cmp = icmp slt i32 %i.0, %iter
    472   br i1 %cmp, label %for.body, label %for.end
    473 
    474 for.body:
        ; Empty block: it only branches on to for.inc.
    475   br label %for.inc
    476 
    477 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    478   %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
    479   %inc = add nsw i32 %i.0, 1
    480   br label %for.cond
    481 
    482 for.end:
        ; Loop exit: return the current accumulator.
    483   ret <4 x double> %c.addr.0
    484 }
    485 
    486 declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fmaddsubpd_loop_256
    487 declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fmsubaddpd_loop_256
    488 declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fmaddpd_loop_256
    489 declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fmsubpd_loop_256
    490 declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fnmaddpd_loop_256
    491 declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; called by @fnmsubpd_loop_256
    492 
    493 
    494 ; CHECK-LABEL: fmaddsubps_loop_256:
    495 ; CHECK:   vfmaddsub231ps %ymm1, %ymm0, %ymm2
    496 ; CHECK:   vmovaps %ymm2, %ymm0
    497 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmaddsub231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmaddsub.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    498 define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    499 entry:
    500   br label %for.cond
    501 
    502 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    503   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    504   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    505   %cmp = icmp slt i32 %i.0, %iter
    506   br i1 %cmp, label %for.body, label %for.end
    507 
    508 for.body:
        ; Empty block: it only branches on to for.inc.
    509   br label %for.inc
    510 
    511 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    512   %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    513   %inc = add nsw i32 %i.0, 1
    514   br label %for.cond
    515 
    516 for.end:
        ; Loop exit: return the current accumulator.
    517   ret <8 x float> %c.addr.0
    518 }
    519 
    520 ; CHECK-LABEL: fmsubaddps_loop_256:
    521 ; CHECK:   vfmsubadd231ps %ymm1, %ymm0, %ymm2
    522 ; CHECK:   vmovaps %ymm2, %ymm0
    523 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsubadd231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsubadd.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    524 define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    525 entry:
    526   br label %for.cond
    527 
    528 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    529   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    530   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    531   %cmp = icmp slt i32 %i.0, %iter
    532   br i1 %cmp, label %for.body, label %for.end
    533 
    534 for.body:
        ; Empty block: it only branches on to for.inc.
    535   br label %for.inc
    536 
    537 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    538   %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    539   %inc = add nsw i32 %i.0, 1
    540   br label %for.cond
    541 
    542 for.end:
        ; Loop exit: return the current accumulator.
    543   ret <8 x float> %c.addr.0
    544 }
    545 
    546 ; CHECK-LABEL: fmaddps_loop_256:
    547 ; CHECK:   vfmadd231ps %ymm1, %ymm0, %ymm2
    548 ; CHECK:   vmovaps %ymm2, %ymm0
    549 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmadd231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmadd.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    550 define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    551 entry:
    552   br label %for.cond
    553 
    554 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    555   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    556   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    557   %cmp = icmp slt i32 %i.0, %iter
    558   br i1 %cmp, label %for.body, label %for.end
    559 
    560 for.body:
        ; Empty block: it only branches on to for.inc.
    561   br label %for.inc
    562 
    563 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    564   %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    565   %inc = add nsw i32 %i.0, 1
    566   br label %for.cond
    567 
    568 for.end:
        ; Loop exit: return the current accumulator.
    569   ret <8 x float> %c.addr.0
    570 }
    571 
    572 ; CHECK-LABEL: fmsubps_loop_256:
    573 ; CHECK:   vfmsub231ps %ymm1, %ymm0, %ymm2
    574 ; CHECK:   vmovaps %ymm2, %ymm0
    575 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfmsub231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfmsub.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    576 define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    577 entry:
    578   br label %for.cond
    579 
    580 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    581   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    582   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    583   %cmp = icmp slt i32 %i.0, %iter
    584   br i1 %cmp, label %for.body, label %for.end
    585 
    586 for.body:
        ; Empty block: it only branches on to for.inc.
    587   br label %for.inc
    588 
    589 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    590   %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    591   %inc = add nsw i32 %i.0, 1
    592   br label %for.cond
    593 
    594 for.end:
        ; Loop exit: return the current accumulator.
    595   ret <8 x float> %c.addr.0
    596 }
    597 
    598 ; CHECK-LABEL: fnmaddps_loop_256:
    599 ; CHECK:   vfnmadd231ps %ymm1, %ymm0, %ymm2
    600 ; CHECK:   vmovaps %ymm2, %ymm0
    601 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmadd231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmadd.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    602 define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    603 entry:
    604   br label %for.cond
    605 
    606 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    607   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    608   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    609   %cmp = icmp slt i32 %i.0, %iter
    610   br i1 %cmp, label %for.body, label %for.end
    611 
    612 for.body:
        ; Empty block: it only branches on to for.inc.
    613   br label %for.inc
    614 
    615 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    616   %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    617   %inc = add nsw i32 %i.0, 1
    618   br label %for.cond
    619 
    620 for.end:
        ; Loop exit: return the current accumulator.
    621   ret <8 x float> %c.addr.0
    622 }
    623 
    624 ; CHECK-LABEL: fnmsubps_loop_256:
    625 ; CHECK:   vfnmsub231ps %ymm1, %ymm0, %ymm2
    626 ; CHECK:   vmovaps %ymm2, %ymm0
    627 ; CHECK-NEXT: retq
        ; The directives above expect this function's output to contain
        ; "vfnmsub231ps %ymm1, %ymm0, %ymm2" and, later, "vmovaps %ymm2, %ymm0"
        ; on the line directly before "retq".
        ;
        ; IR under test: a counted loop that passes an accumulator through
        ; @llvm.x86.fma.vfnmsub.ps.256 once per iteration.
        ;   %iter  - loop bound; the loop continues while %i.0 < %iter (signed).
        ;   %a, %b - first and second intrinsic operands, the same every iteration.
        ;   %c     - initial accumulator value.
        ; Returns the accumulator at loop exit (%c itself when %iter <= 0).
    628 define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
    629 entry:
    630   br label %for.cond
    631 
    632 for.cond:
        ; Loop header: %c.addr.0 carries the accumulator, %i.0 the iteration count.
    633   %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
    634   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
    635   %cmp = icmp slt i32 %i.0, %iter
    636   br i1 %cmp, label %for.body, label %for.end
    637 
    638 for.body:
        ; Empty block: it only branches on to for.inc.
    639   br label %for.inc
    640 
    641 for.inc:
        ; The accumulator is the third operand; the result %0 feeds back into the
        ; %c.addr.0 phi as the next accumulator.
    642   %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
    643   %inc = add nsw i32 %i.0, 1
    644   br label %for.cond
    645 
    646 for.end:
        ; Loop exit: return the current accumulator.
    647   ret <8 x float> %c.addr.0
    648 }
    649 
    650 declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fmaddsubps_loop_256
    651 declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fmsubaddps_loop_256
    652 declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fmaddps_loop_256
    653 declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fmsubps_loop_256
    654 declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fnmaddps_loop_256
    655 declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) ; called by @fnmsubps_loop_256
    656