Home | History | Annotate | Download | only in amd64
      1 #include <stdio.h>
      2 #include <string.h>
      3 
      4 #define N 64
      5 struct float_test {
      6    float x[N], y[N], z[N], expected[N], res[N];
      7 } ft __attribute__((aligned (32)));
      8 
      9 struct double_test {
     10    double x[N], y[N], z[N], expected[N], res[N];
     11 } dt __attribute__((aligned (32)));
     12 
     13 float plus_zero, plus_infty, minus_infty, nan_value;
     14 
     15 static int testf( float x, float y )
     16 {
     17    unsigned int a, b;
     18    memcpy( &a, &x, sizeof (a) );
     19    memcpy( &b, &y, sizeof (b) );
     20    if ((a & 0x7fc00000U) == 0x7fc00000U)
     21       return (b & 0x7fc00000U) != 0x7fc00000U;
     22    return memcmp( &a, &b, sizeof (a) ) != 0;
     23 }
     24 
     25 static int test_fmaf( void )
     26 {
     27    int res = 0, i, j;
     28    float w;
     29    for (i = 0; i < N; i++) {
     30       int thisres = 0;
     31       __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     32       thisres |= testf( w, ft.expected[i] );
     33       __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
     34       thisres |= testf( w, ft.expected[i] );
     35       __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     36       thisres |= testf( w, ft.expected[i] );
     37       __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
     38       thisres |= testf( w, ft.expected[i] );
     39       __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
     40       thisres |= testf( w, ft.expected[i] );
     41       __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
     42       thisres |= testf( w, ft.expected[i] );
     43       if (thisres)
     44          printf( "Failure 1 %d %a %a\n", i, w, ft.expected[i] );
     45       res |= thisres;
     46       thisres = 0;
     47       __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     48       thisres |= testf( -w, ft.expected[i] );
     49       __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
     50       thisres |= testf( -w, ft.expected[i] );
     51       __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     52       thisres |= testf( -w, ft.expected[i] );
     53       __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
     54       thisres |= testf( -w, ft.expected[i] );
     55       __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
     56       thisres |= testf( -w, ft.expected[i] );
     57       __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
     58       thisres |= testf( -w, ft.expected[i] );
     59       if (thisres)
     60          printf( "Failure 2 %d %a %a\n", i, w, ft.expected[i] );
     61       res |= thisres;
     62    }
     63    for (i = 0; i < N; i++)
     64       ft.z[i] = -ft.z[i];
     65    for (i = 0; i < N; i++) {
     66       int thisres = 0;
     67       __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     68       thisres |= testf( w, ft.expected[i] );
     69       __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
     70       thisres |= testf( w, ft.expected[i] );
     71       __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     72       thisres |= testf( w, ft.expected[i] );
     73       __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
     74       thisres |= testf( w, ft.expected[i] );
     75       __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
     76       thisres |= testf( w, ft.expected[i] );
     77       __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
     78       thisres |= testf( w, ft.expected[i] );
     79       if (thisres)
     80          printf( "Failure 3 %d %a %a\n", i, w, ft.expected[i] );
     81       res |= thisres;
     82       thisres = 0;
     83       __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     84       thisres |= testf( -w, ft.expected[i] );
     85       __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
     86       thisres |= testf( -w, ft.expected[i] );
     87       __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
     88       thisres |= testf( -w, ft.expected[i] );
     89       __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
     90       thisres |= testf( -w, ft.expected[i] );
     91       __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
     92       thisres |= testf( -w, ft.expected[i] );
     93       __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
     94       thisres |= testf( -w, ft.expected[i] );
     95       if (thisres)
     96          printf( "Failure 4 %d %a %a\n", i, w, ft.expected[i] );
     97       res |= thisres;
     98    }
     99    for (i = 0; i < N; i++)
    100       ft.z[i] = -ft.z[i];
    101    for (i = 0; i < N; i += 4) {
    102       int thisres = 0;
    103       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    104                           "vfmadd132ps %%xmm7, %%xmm8, %%xmm9;"
    105                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    106                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    107       for (j = 0; j < 4; j++)
    108          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    109       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    110                           "vfmadd132ps (%2), %%xmm8, %%xmm9;"
    111                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    112                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    113       for (j = 0; j < 4; j++)
    114          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    115       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    116                           "vfmadd213ps %%xmm7, %%xmm8, %%xmm9;"
    117                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    118                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    119       for (j = 0; j < 4; j++)
    120          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    121       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    122                           "vfmadd213ps (%3), %%xmm8, %%xmm9;"
    123                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    124                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    125       for (j = 0; j < 4; j++)
    126          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    127       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    128                           "vfmadd231ps %%xmm7, %%xmm8, %%xmm9;"
    129                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    130                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    131       for (j = 0; j < 4; j++)
    132          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    133       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    134                           "vfmadd231ps (%2), %%xmm8, %%xmm9;"
    135                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    136                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    137       for (j = 0; j < 4; j++)
    138          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    139       if (thisres) {
    140          printf( "Failure 5 %d", i );
    141          for (j = 0; j < 4; j++)
    142             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    143          printf( "\n" );
    144       }
    145       res |= thisres;
    146       thisres = 0;
    147       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    148                           "vfnmsub132ps %%xmm7, %%xmm8, %%xmm9;"
    149                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    150                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    151       for (j = 0; j < 4; j++)
    152          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    153       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    154                           "vfnmsub132ps (%2), %%xmm8, %%xmm9;"
    155                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    156                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    157       for (j = 0; j < 4; j++)
    158          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    159       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    160                           "vfnmsub213ps %%xmm7, %%xmm8, %%xmm9;"
    161                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    162                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    163       for (j = 0; j < 4; j++)
    164          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    165       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    166                           "vfnmsub213ps (%3), %%xmm8, %%xmm9;"
    167                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    168                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    169       for (j = 0; j < 4; j++)
    170          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    171       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    172                           "vfnmsub231ps %%xmm7, %%xmm8, %%xmm9;"
    173                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    174                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    175       for (j = 0; j < 4; j++)
    176          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    177       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    178                           "vfnmsub231ps (%2), %%xmm8, %%xmm9;"
    179                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    180                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    181       for (j = 0; j < 4; j++)
    182          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    183       if (thisres) {
    184          printf( "Failure 6 %d", i );
    185          for (j = 0; j < 4; j++)
    186             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    187          printf( "\n" );
    188       }
    189       res |= thisres;
    190    }
    191    for (i = 0; i < N; i++)
    192       ft.z[i] = -ft.z[i];
    193    for (i = 0; i < N; i += 4) {
    194       int thisres = 0;
    195       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    196                           "vfmsub132ps %%xmm7, %%xmm8, %%xmm9;"
    197                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    198                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    199       for (j = 0; j < 4; j++)
    200          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    201       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    202                           "vfmsub132ps (%2), %%xmm8, %%xmm9;"
    203                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    204                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    205       for (j = 0; j < 4; j++)
    206          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    207       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    208                           "vfmsub213ps %%xmm7, %%xmm8, %%xmm9;"
    209                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    210                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    211       for (j = 0; j < 4; j++)
    212          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    213       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    214                           "vfmsub213ps (%3), %%xmm8, %%xmm9;"
    215                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    216                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    217       for (j = 0; j < 4; j++)
    218          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    219       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    220                           "vfmsub231ps %%xmm7, %%xmm8, %%xmm9;"
    221                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    222                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    223       for (j = 0; j < 4; j++)
    224          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    225       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    226                           "vfmsub231ps (%2), %%xmm8, %%xmm9;"
    227                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    228                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    229       for (j = 0; j < 4; j++)
    230          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    231       if (thisres) {
    232          printf( "Failure 7 %d", i );
    233          for (j = 0; j < 4; j++)
    234             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    235          printf( "\n" );
    236       }
    237       res |= thisres;
    238       thisres = 0;
    239       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    240                           "vfnmadd132ps %%xmm7, %%xmm8, %%xmm9;"
    241                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    242                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    243       for (j = 0; j < 4; j++)
    244          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    245       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    246                           "vfnmadd132ps (%2), %%xmm8, %%xmm9;"
    247                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    248                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    249       for (j = 0; j < 4; j++)
    250          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    251       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    252                           "vfnmadd213ps %%xmm7, %%xmm8, %%xmm9;"
    253                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    254                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    255       for (j = 0; j < 4; j++)
    256          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    257       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    258                           "vfnmadd213ps (%3), %%xmm8, %%xmm9;"
    259                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    260                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    261       for (j = 0; j < 4; j++)
    262          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    263       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    264                           "vfnmadd231ps %%xmm7, %%xmm8, %%xmm9;"
    265                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    266                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    267       for (j = 0; j < 4; j++)
    268          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    269       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    270                           "vfnmadd231ps (%2), %%xmm8, %%xmm9;"
    271                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    272                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    273       for (j = 0; j < 4; j++)
    274          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    275       if (thisres) {
    276          printf( "Failure 8 %d", i );
    277          for (j = 0; j < 4; j++)
    278             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    279          printf( "\n" );
    280       }
    281       res |= thisres;
    282    }
    283    for (i = 1; i < N; i += 2)
    284       ft.z[i] = -ft.z[i];
    285    for (i = 0; i < N; i += 4) {
    286       int thisres = 0;
    287       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    288                           "vfmaddsub132ps %%xmm7, %%xmm8, %%xmm9;"
    289                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    290                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    291       for (j = 0; j < 4; j++)
    292          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    293       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    294                           "vfmaddsub132ps (%2), %%xmm8, %%xmm9;"
    295                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    296                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    297       for (j = 0; j < 4; j++)
    298          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    299       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    300                           "vfmaddsub213ps %%xmm7, %%xmm8, %%xmm9;"
    301                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    302                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    303       for (j = 0; j < 4; j++)
    304          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    305       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    306                           "vfmaddsub213ps (%3), %%xmm8, %%xmm9;"
    307                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    308                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    309       for (j = 0; j < 4; j++)
    310          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    311       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    312                           "vfmaddsub231ps %%xmm7, %%xmm8, %%xmm9;"
    313                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    314                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    315       for (j = 0; j < 4; j++)
    316          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    317       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    318                           "vfmaddsub231ps (%2), %%xmm8, %%xmm9;"
    319                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    320                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    321       for (j = 0; j < 4; j++)
    322          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    323       if (thisres) {
    324          printf( "Failure 9 %d", i );
    325          for (j = 0; j < 4; j++)
    326             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    327          printf( "\n" );
    328       }
    329       res |= thisres;
    330    }
    331    for (i = 0; i < N; i++)
    332       ft.z[i] = -ft.z[i];
    333    for (i = 0; i < N; i += 4) {
    334       int thisres = 0;
    335       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
    336                           "vfmsubadd132ps %%xmm7, %%xmm8, %%xmm9;"
    337                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    338                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    339       for (j = 0; j < 4; j++)
    340          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    341       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
    342                           "vfmsubadd132ps (%2), %%xmm8, %%xmm9;"
    343                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    344                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    345       for (j = 0; j < 4; j++)
    346          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    347       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
    348                           "vfmsubadd213ps %%xmm7, %%xmm8, %%xmm9;"
    349                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    350                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    351       for (j = 0; j < 4; j++)
    352          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    353       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
    354                           "vfmsubadd213ps (%3), %%xmm8, %%xmm9;"
    355                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    356                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    357       for (j = 0; j < 4; j++)
    358          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    359       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
    360                           "vfmsubadd231ps %%xmm7, %%xmm8, %%xmm9;"
    361                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    362                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    363       for (j = 0; j < 4; j++)
    364          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    365       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
    366                           "vfmsubadd231ps (%2), %%xmm8, %%xmm9;"
    367                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    368                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    369       for (j = 0; j < 4; j++)
    370          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    371       if (thisres) {
    372          printf( "Failure 10 %d", i );
    373          for (j = 0; j < 4; j++)
    374             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    375          printf( "\n" );
    376       }
    377       res |= thisres;
    378    }
    379    for (i = 1; i < N; i += 2)
    380       ft.z[i] = -ft.z[i];
    381    for (i = 0; i < N; i += 8) {
    382       int thisres = 0;
    383       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    384                           "vfmadd132ps %%ymm7, %%ymm8, %%ymm9;"
    385                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    386                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    387       for (j = 0; j < 8; j++)
    388          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    389       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    390                           "vfmadd132ps (%2), %%ymm8, %%ymm9;"
    391                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    392                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    393       for (j = 0; j < 8; j++)
    394          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    395       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    396                           "vfmadd213ps %%ymm7, %%ymm8, %%ymm9;"
    397                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    398                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    399       for (j = 0; j < 8; j++)
    400          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    401       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    402                           "vfmadd213ps (%3), %%ymm8, %%ymm9;"
    403                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    404                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    405       for (j = 0; j < 8; j++)
    406          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    407       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    408                           "vfmadd231ps %%ymm7, %%ymm8, %%ymm9;"
    409                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    410                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    411       for (j = 0; j < 8; j++)
    412          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    413       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    414                           "vfmadd231ps (%2), %%ymm8, %%ymm9;"
    415                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    416                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    417       for (j = 0; j < 8; j++)
    418          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    419       if (thisres) {
    420          printf( "Failure 11 %d", i );
    421          for (j = 0; j < 8; j++)
    422             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    423          printf( "\n" );
    424       }
    425       res |= thisres;
    426       thisres = 0;
    427       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    428                           "vfnmsub132ps %%ymm7, %%ymm8, %%ymm9;"
    429                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    430                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    431       for (j = 0; j < 8; j++)
    432          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    433       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    434                           "vfnmsub132ps (%2), %%ymm8, %%ymm9;"
    435                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    436                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    437       for (j = 0; j < 8; j++)
    438          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    439       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    440                           "vfnmsub213ps %%ymm7, %%ymm8, %%ymm9;"
    441                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    442                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    443       for (j = 0; j < 8; j++)
    444          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    445       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    446                           "vfnmsub213ps (%3), %%ymm8, %%ymm9;"
    447                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    448                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    449       for (j = 0; j < 8; j++)
    450          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    451       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    452                           "vfnmsub231ps %%ymm7, %%ymm8, %%ymm9;"
    453                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    454                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    455       for (j = 0; j < 8; j++)
    456          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    457       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    458                           "vfnmsub231ps (%2), %%ymm8, %%ymm9;"
    459                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    460                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    461       for (j = 0; j < 8; j++)
    462          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    463       if (thisres) {
    464          printf( "Failure 12 %d", i );
    465          for (j = 0; j < 8; j++)
    466             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    467          printf( "\n" );
    468       }
    469       res |= thisres;
    470    }
    471    for (i = 0; i < N; i++)
    472       ft.z[i] = -ft.z[i];
    473    for (i = 0; i < N; i += 8) {
    474       int thisres = 0;
    475       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    476                           "vfmsub132ps %%ymm7, %%ymm8, %%ymm9;"
    477                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    478                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    479       for (j = 0; j < 8; j++)
    480          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    481       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    482                           "vfmsub132ps (%2), %%ymm8, %%ymm9;"
    483                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    484                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    485       for (j = 0; j < 8; j++)
    486          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    487       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    488                           "vfmsub213ps %%ymm7, %%ymm8, %%ymm9;"
    489                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    490                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    491       for (j = 0; j < 8; j++)
    492          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    493       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    494                           "vfmsub213ps (%3), %%ymm8, %%ymm9;"
    495                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    496                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    497       for (j = 0; j < 8; j++)
    498          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    499       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    500                           "vfmsub231ps %%ymm7, %%ymm8, %%ymm9;"
    501                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    502                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    503       for (j = 0; j < 8; j++)
    504          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    505       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    506                           "vfmsub231ps (%2), %%ymm8, %%ymm9;"
    507                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    508                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    509       for (j = 0; j < 8; j++)
    510          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    511       if (thisres) {
    512          printf( "Failure 13 %d", i );
    513          for (j = 0; j < 8; j++)
    514             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    515          printf( "\n" );
    516       }
    517       res |= thisres;
    518       thisres = 0;
    519       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    520                           "vfnmadd132ps %%ymm7, %%ymm8, %%ymm9;"
    521                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    522                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    523       for (j = 0; j < 8; j++)
    524          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    525       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    526                           "vfnmadd132ps (%2), %%ymm8, %%ymm9;"
    527                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    528                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    529       for (j = 0; j < 8; j++)
    530          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    531       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    532                           "vfnmadd213ps %%ymm7, %%ymm8, %%ymm9;"
    533                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    534                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    535       for (j = 0; j < 8; j++)
    536          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    537       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    538                           "vfnmadd213ps (%3), %%ymm8, %%ymm9;"
    539                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    540                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    541       for (j = 0; j < 8; j++)
    542          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    543       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    544                           "vfnmadd231ps %%ymm7, %%ymm8, %%ymm9;"
    545                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    546                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    547       for (j = 0; j < 8; j++)
    548          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    549       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    550                           "vfnmadd231ps (%2), %%ymm8, %%ymm9;"
    551                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    552                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    553       for (j = 0; j < 8; j++)
    554          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
    555       if (thisres) {
    556          printf( "Failure 14 %d", i );
    557          for (j = 0; j < 8; j++)
    558             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    559          printf( "\n" );
    560       }
    561       res |= thisres;
    562    }
    563    for (i = 1; i < N; i += 2)
    564       ft.z[i] = -ft.z[i];
    565    for (i = 0; i < N; i += 8) {
    566       int thisres = 0;
    567       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    568                           "vfmaddsub132ps %%ymm7, %%ymm8, %%ymm9;"
    569                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    570                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    571       for (j = 0; j < 8; j++)
    572          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    573       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    574                           "vfmaddsub132ps (%2), %%ymm8, %%ymm9;"
    575                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    576                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    577       for (j = 0; j < 8; j++)
    578          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    579       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    580                           "vfmaddsub213ps %%ymm7, %%ymm8, %%ymm9;"
    581                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    582                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    583       for (j = 0; j < 8; j++)
    584          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    585       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    586                           "vfmaddsub213ps (%3), %%ymm8, %%ymm9;"
    587                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    588                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    589       for (j = 0; j < 8; j++)
    590          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    591       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    592                           "vfmaddsub231ps %%ymm7, %%ymm8, %%ymm9;"
    593                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    594                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    595       for (j = 0; j < 8; j++)
    596          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    597       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    598                           "vfmaddsub231ps (%2), %%ymm8, %%ymm9;"
    599                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    600                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    601       for (j = 0; j < 8; j++)
    602          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    603       if (thisres) {
    604          printf( "Failure 15 %d", i );
    605          for (j = 0; j < 8; j++)
    606             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    607          printf( "\n" );
    608       }
    609       res |= thisres;
    610    }
    611    for (i = 0; i < N; i++)
    612       ft.z[i] = -ft.z[i];
    613    for (i = 0; i < N; i += 8) {
    614       int thisres = 0;
    615       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
    616                           "vfmsubadd132ps %%ymm7, %%ymm8, %%ymm9;"
    617                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    618                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    619       for (j = 0; j < 8; j++)
    620          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    621       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
    622                           "vfmsubadd132ps (%2), %%ymm8, %%ymm9;"
    623                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    624                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    625       for (j = 0; j < 8; j++)
    626          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    627       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
    628                           "vfmsubadd213ps %%ymm7, %%ymm8, %%ymm9;"
    629                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    630                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    631       for (j = 0; j < 8; j++)
    632          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    633       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
    634                           "vfmsubadd213ps (%3), %%ymm8, %%ymm9;"
    635                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    636                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    637       for (j = 0; j < 8; j++)
    638          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    639       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
    640                           "vfmsubadd231ps %%ymm7, %%ymm8, %%ymm9;"
    641                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    642                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    643       for (j = 0; j < 8; j++)
    644          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    645       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
    646                           "vfmsubadd231ps (%2), %%ymm8, %%ymm9;"
    647                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
    648                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
    649       for (j = 0; j < 8; j++)
    650          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
    651       if (thisres) {
    652          printf( "Failure 16 %d", i );
    653          for (j = 0; j < 8; j++)
    654             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
    655          printf( "\n" );
    656       }
    657       res |= thisres;
    658    }
    659    for (i = 1; i < N; i += 2)
    660       ft.z[i] = -ft.z[i];
    661    return res;
    662 }
    663 
    664 static int test( double x, double y )
    665 {
    666    unsigned long long a, b;
    667    memcpy( &a, &x, sizeof (a) );
    668    memcpy( &b, &y, sizeof (b) );
    669    if ((a & 0x7ff8000000000000ULL) == 0x7ff8000000000000ULL)
    670       return (b & 0x7ff8000000000000ULL) != 0x7ff8000000000000ULL;
    671    return memcmp( &a, &b, sizeof (a) ) != 0;
    672 }
    673 
    674 static int test_fma( void )
    675 {
    676    int res = 0, i, j;
    677    double w;
    678    for (i = 0; i < N; i++) {
    679       int thisres = 0;
    680       __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    681       thisres |= test( w, dt.expected[i] );
    682       __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
    683       thisres |= test( w, dt.expected[i] );
    684       __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    685       thisres |= test( w, dt.expected[i] );
    686       __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
    687       thisres |= test( w, dt.expected[i] );
    688       __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
    689       thisres |= test( w, dt.expected[i] );
    690       __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
    691       thisres |= test( w, dt.expected[i] );
    692       if (thisres)
    693          printf( "Failure 1 %d %a %a\n", i, w, dt.expected[i] );
    694       res |= thisres;
    695       thisres = 0;
    696       __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    697       thisres |= test( -w, dt.expected[i] );
    698       __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
    699       thisres |= test( -w, dt.expected[i] );
    700       __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    701       thisres |= test( -w, dt.expected[i] );
    702       __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
    703       thisres |= test( -w, dt.expected[i] );
    704       __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
    705       thisres |= test( -w, dt.expected[i] );
    706       __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
    707       thisres |= test( -w, dt.expected[i] );
    708       if (thisres)
    709          printf( "Failure 2 %d %a %a\n", i, w, dt.expected[i] );
    710       res |= thisres;
    711    }
    712    for (i = 0; i < N; i++)
    713       dt.z[i] = -dt.z[i];
    714    for (i = 0; i < N; i++) {
    715       int thisres = 0;
    716       __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    717       thisres |= test( w, dt.expected[i] );
    718       __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
    719       thisres |= test( w, dt.expected[i] );
    720       __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    721       thisres |= test( w, dt.expected[i] );
    722       __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
    723       thisres |= test( w, dt.expected[i] );
    724       __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
    725       thisres |= test( w, dt.expected[i] );
    726       __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
    727       thisres |= test( w, dt.expected[i] );
    728       if (thisres)
    729          printf( "Failure 3 %d %a %a\n", i, w, dt.expected[i] );
    730       res |= thisres;
    731       thisres = 0;
    732       __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    733       thisres |= test( -w, dt.expected[i] );
    734       __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
    735       thisres |= test( -w, dt.expected[i] );
    736       __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
    737       thisres |= test( -w, dt.expected[i] );
    738       __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
    739       thisres |= test( -w, dt.expected[i] );
    740       __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
    741       thisres |= test( -w, dt.expected[i] );
    742       __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
    743       thisres |= test( -w, dt.expected[i] );
    744       if (thisres)
    745          printf( "Failure 4 %d %a %a\n", i, w, dt.expected[i] );
    746       res |= thisres;
    747    }
    748    for (i = 0; i < N; i++)
    749       dt.z[i] = -dt.z[i];
    750    for (i = 0; i < N; i += 2) {
    751       int thisres = 0;
    752       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    753                           "vfmadd132pd %%xmm7, %%xmm8, %%xmm9;"
    754                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    755                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    756       for (j = 0; j < 2; j++)
    757          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    758       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    759                           "vfmadd132pd (%2), %%xmm8, %%xmm9;"
    760                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    761                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    762       for (j = 0; j < 2; j++)
    763          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    764       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    765                           "vfmadd213pd %%xmm7, %%xmm8, %%xmm9;"
    766                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    767                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    768       for (j = 0; j < 2; j++)
    769          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    770       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
    771                           "vfmadd213pd (%3), %%xmm8, %%xmm9;"
    772                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    773                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    774       for (j = 0; j < 2; j++)
    775          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    776       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
    777                           "vfmadd231pd %%xmm7, %%xmm8, %%xmm9;"
    778                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    779                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    780       for (j = 0; j < 2; j++)
    781          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    782       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
    783                           "vfmadd231pd (%2), %%xmm8, %%xmm9;"
    784                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    785                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    786       for (j = 0; j < 2; j++)
    787          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    788       if (thisres) {
    789          printf( "Failure 5 %d", i );
    790          for (j = 0; j < 2; j++)
    791             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
    792          printf( "\n" );
    793       }
    794       res |= thisres;
    795       thisres = 0;
    796       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    797                           "vfnmsub132pd %%xmm7, %%xmm8, %%xmm9;"
    798                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    799                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    800       for (j = 0; j < 2; j++)
    801          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    802       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    803                           "vfnmsub132pd (%2), %%xmm8, %%xmm9;"
    804                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    805                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    806       for (j = 0; j < 2; j++)
    807          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    808       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    809                           "vfnmsub213pd %%xmm7, %%xmm8, %%xmm9;"
    810                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    811                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    812       for (j = 0; j < 2; j++)
    813          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    814       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
    815                           "vfnmsub213pd (%3), %%xmm8, %%xmm9;"
    816                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    817                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    818       for (j = 0; j < 2; j++)
    819          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    820       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
    821                           "vfnmsub231pd %%xmm7, %%xmm8, %%xmm9;"
    822                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    823                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    824       for (j = 0; j < 2; j++)
    825          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    826       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
    827                           "vfnmsub231pd (%2), %%xmm8, %%xmm9;"
    828                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    829                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    830       for (j = 0; j < 2; j++)
    831          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    832       if (thisres) {
    833          printf( "Failure 6 %d", i );
    834          for (j = 0; j < 2; j++)
    835             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
    836          printf( "\n" );
    837       }
    838       res |= thisres;
    839    }
    840    for (i = 0; i < N; i++)
    841       dt.z[i] = -dt.z[i];
    842    for (i = 0; i < N; i += 2) {
    843       int thisres = 0;
    844       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    845                           "vfmsub132pd %%xmm7, %%xmm8, %%xmm9;"
    846                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    847                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    848       for (j = 0; j < 2; j++)
    849          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    850       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    851                           "vfmsub132pd (%2), %%xmm8, %%xmm9;"
    852                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    853                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    854       for (j = 0; j < 2; j++)
    855          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    856       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    857                           "vfmsub213pd %%xmm7, %%xmm8, %%xmm9;"
    858                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    859                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    860       for (j = 0; j < 2; j++)
    861          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    862       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
    863                           "vfmsub213pd (%3), %%xmm8, %%xmm9;"
    864                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    865                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    866       for (j = 0; j < 2; j++)
    867          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    868       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
    869                           "vfmsub231pd %%xmm7, %%xmm8, %%xmm9;"
    870                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    871                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    872       for (j = 0; j < 2; j++)
    873          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    874       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
    875                           "vfmsub231pd (%2), %%xmm8, %%xmm9;"
    876                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    877                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    878       for (j = 0; j < 2; j++)
    879          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    880       if (thisres) {
    881          printf( "Failure 7 %d", i );
    882          for (j = 0; j < 2; j++)
    883             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
    884          printf( "\n" );
    885       }
    886       res |= thisres;
    887       thisres = 0;
    888       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    889                           "vfnmadd132pd %%xmm7, %%xmm8, %%xmm9;"
    890                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    891                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    892       for (j = 0; j < 2; j++)
    893          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    894       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    895                           "vfnmadd132pd (%2), %%xmm8, %%xmm9;"
    896                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    897                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    898       for (j = 0; j < 2; j++)
    899          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    900       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    901                           "vfnmadd213pd %%xmm7, %%xmm8, %%xmm9;"
    902                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    903                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    904       for (j = 0; j < 2; j++)
    905          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    906       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
    907                           "vfnmadd213pd (%3), %%xmm8, %%xmm9;"
    908                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    909                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    910       for (j = 0; j < 2; j++)
    911          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    912       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
    913                           "vfnmadd231pd %%xmm7, %%xmm8, %%xmm9;"
    914                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    915                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    916       for (j = 0; j < 2; j++)
    917          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    918       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
    919                           "vfnmadd231pd (%2), %%xmm8, %%xmm9;"
    920                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    921                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    922       for (j = 0; j < 2; j++)
    923          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
    924       if (thisres) {
    925          printf( "Failure 8 %d", i );
    926          for (j = 0; j < 2; j++)
    927             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
    928          printf( "\n" );
    929       }
    930       res |= thisres;
    931    }
    932    for (i = 1; i < N; i += 2)
    933       dt.z[i] = -dt.z[i];
    934    for (i = 0; i < N; i += 2) {
    935       int thisres = 0;
    936       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    937                           "vfmaddsub132pd %%xmm7, %%xmm8, %%xmm9;"
    938                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    939                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    940       for (j = 0; j < 2; j++)
    941          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    942       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    943                           "vfmaddsub132pd (%2), %%xmm8, %%xmm9;"
    944                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    945                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    946       for (j = 0; j < 2; j++)
    947          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    948       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    949                           "vfmaddsub213pd %%xmm7, %%xmm8, %%xmm9;"
    950                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    951                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    952       for (j = 0; j < 2; j++)
    953          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    954       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
    955                           "vfmaddsub213pd (%3), %%xmm8, %%xmm9;"
    956                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    957                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    958       for (j = 0; j < 2; j++)
    959          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    960       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
    961                           "vfmaddsub231pd %%xmm7, %%xmm8, %%xmm9;"
    962                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    963                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    964       for (j = 0; j < 2; j++)
    965          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    966       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
    967                           "vfmaddsub231pd (%2), %%xmm8, %%xmm9;"
    968                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    969                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    970       for (j = 0; j < 2; j++)
    971          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    972       if (thisres) {
    973          printf( "Failure 9 %d", i );
    974          for (j = 0; j < 2; j++)
    975             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
    976          printf( "\n" );
    977       }
    978       res |= thisres;
    979    }
    980    for (i = 0; i < N; i++)
    981       dt.z[i] = -dt.z[i];
    982    for (i = 0; i < N; i += 2) {
    983       int thisres = 0;
    984       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
    985                           "vfmsubadd132pd %%xmm7, %%xmm8, %%xmm9;"
    986                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    987                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    988       for (j = 0; j < 2; j++)
    989          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    990       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
    991                           "vfmsubadd132pd (%2), %%xmm8, %%xmm9;"
    992                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    993                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
    994       for (j = 0; j < 2; j++)
    995          thisres |= test( dt.res[i+j], dt.expected[i+j] );
    996       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
    997                           "vfmsubadd213pd %%xmm7, %%xmm8, %%xmm9;"
    998                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
    999                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1000       for (j = 0; j < 2; j++)
   1001          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1002       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
   1003                           "vfmsubadd213pd (%3), %%xmm8, %%xmm9;"
   1004                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1005                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1006       for (j = 0; j < 2; j++)
   1007          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1008       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
   1009                           "vfmsubadd231pd %%xmm7, %%xmm8, %%xmm9;"
   1010                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1011                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1012       for (j = 0; j < 2; j++)
   1013          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1014       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
   1015                           "vfmsubadd231pd (%2), %%xmm8, %%xmm9;"
   1016                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1017                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1018       for (j = 0; j < 2; j++)
   1019          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1020       if (thisres) {
   1021          printf( "Failure 10 %d", i );
   1022          for (j = 0; j < 2; j++)
   1023             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1024          printf( "\n" );
   1025       }
   1026       res |= thisres;
   1027    }
   1028    for (i = 1; i < N; i += 2)
   1029       dt.z[i] = -dt.z[i];
   1030    for (i = 0; i < N; i += 4) {
   1031       int thisres = 0;
   1032       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1033                           "vfmadd132pd %%ymm7, %%ymm8, %%ymm9;"
   1034                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1035                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1036       for (j = 0; j < 4; j++)
   1037          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1038       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1039                           "vfmadd132pd (%2), %%ymm8, %%ymm9;"
   1040                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1041                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1042       for (j = 0; j < 4; j++)
   1043          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1044       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1045                           "vfmadd213pd %%ymm7, %%ymm8, %%ymm9;"
   1046                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1047                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1048       for (j = 0; j < 4; j++)
   1049          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1050       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1051                           "vfmadd213pd (%3), %%ymm8, %%ymm9;"
   1052                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1053                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1054       for (j = 0; j < 4; j++)
   1055          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1056       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1057                           "vfmadd231pd %%ymm7, %%ymm8, %%ymm9;"
   1058                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1059                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1060       for (j = 0; j < 4; j++)
   1061          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1062       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1063                           "vfmadd231pd (%2), %%ymm8, %%ymm9;"
   1064                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1065                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1066       for (j = 0; j < 4; j++)
   1067          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1068       if (thisres) {
   1069          printf( "Failure 11 %d", i );
   1070          for (j = 0; j < 4; j++)
   1071             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1072          printf( "\n" );
   1073       }
   1074       res |= thisres;
   1075       thisres = 0;
   1076       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1077                           "vfnmsub132pd %%ymm7, %%ymm8, %%ymm9;"
   1078                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1079                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1080       for (j = 0; j < 4; j++)
   1081          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1082       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1083                           "vfnmsub132pd (%2), %%ymm8, %%ymm9;"
   1084                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1085                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1086       for (j = 0; j < 4; j++)
   1087          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1088       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1089                           "vfnmsub213pd %%ymm7, %%ymm8, %%ymm9;"
   1090                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1091                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1092       for (j = 0; j < 4; j++)
   1093          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1094       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1095                           "vfnmsub213pd (%3), %%ymm8, %%ymm9;"
   1096                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1097                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1098       for (j = 0; j < 4; j++)
   1099          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1100       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1101                           "vfnmsub231pd %%ymm7, %%ymm8, %%ymm9;"
   1102                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1103                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1104       for (j = 0; j < 4; j++)
   1105          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1106       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1107                           "vfnmsub231pd (%2), %%ymm8, %%ymm9;"
   1108                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1109                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1110       for (j = 0; j < 4; j++)
   1111          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1112       if (thisres) {
   1113          printf( "Failure 12 %d", i );
   1114          for (j = 0; j < 4; j++)
   1115             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1116          printf( "\n" );
   1117       }
   1118       res |= thisres;
   1119    }
   1120    for (i = 0; i < N; i++)
   1121       dt.z[i] = -dt.z[i];
   1122    for (i = 0; i < N; i += 4) {
   1123       int thisres = 0;
   1124       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1125                           "vfmsub132pd %%ymm7, %%ymm8, %%ymm9;"
   1126                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1127                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1128       for (j = 0; j < 4; j++)
   1129          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1130       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1131                           "vfmsub132pd (%2), %%ymm8, %%ymm9;"
   1132                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1133                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1134       for (j = 0; j < 4; j++)
   1135          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1136       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1137                           "vfmsub213pd %%ymm7, %%ymm8, %%ymm9;"
   1138                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1139                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1140       for (j = 0; j < 4; j++)
   1141          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1142       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1143                           "vfmsub213pd (%3), %%ymm8, %%ymm9;"
   1144                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1145                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1146       for (j = 0; j < 4; j++)
   1147          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1148       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1149                           "vfmsub231pd %%ymm7, %%ymm8, %%ymm9;"
   1150                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1151                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1152       for (j = 0; j < 4; j++)
   1153          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1154       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1155                           "vfmsub231pd (%2), %%ymm8, %%ymm9;"
   1156                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1157                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1158       for (j = 0; j < 4; j++)
   1159          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1160       if (thisres) {
   1161          printf( "Failure 13 %d", i );
   1162          for (j = 0; j < 4; j++)
   1163             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1164          printf( "\n" );
   1165       }
   1166       res |= thisres;
   1167       thisres = 0;
   1168       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1169                           "vfnmadd132pd %%ymm7, %%ymm8, %%ymm9;"
   1170                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1171                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1172       for (j = 0; j < 4; j++)
   1173          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1174       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1175                           "vfnmadd132pd (%2), %%ymm8, %%ymm9;"
   1176                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1177                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1178       for (j = 0; j < 4; j++)
   1179          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1180       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1181                           "vfnmadd213pd %%ymm7, %%ymm8, %%ymm9;"
   1182                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1183                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1184       for (j = 0; j < 4; j++)
   1185          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1186       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1187                           "vfnmadd213pd (%3), %%ymm8, %%ymm9;"
   1188                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1189                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1190       for (j = 0; j < 4; j++)
   1191          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1192       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1193                           "vfnmadd231pd %%ymm7, %%ymm8, %%ymm9;"
   1194                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1195                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1196       for (j = 0; j < 4; j++)
   1197          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1198       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1199                           "vfnmadd231pd (%2), %%ymm8, %%ymm9;"
   1200                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1201                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1202       for (j = 0; j < 4; j++)
   1203          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
   1204       if (thisres) {
   1205          printf( "Failure 14 %d", i );
   1206          for (j = 0; j < 4; j++)
   1207             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1208          printf( "\n" );
   1209       }
   1210       res |= thisres;
   1211    }
   1212    for (i = 1; i < N; i += 2)
   1213       dt.z[i] = -dt.z[i];
   1214    for (i = 0; i < N; i += 4) {
   1215       int thisres = 0;
   1216       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1217                           "vfmaddsub132pd %%ymm7, %%ymm8, %%ymm9;"
   1218                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1219                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1220       for (j = 0; j < 4; j++)
   1221          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1222       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1223                           "vfmaddsub132pd (%2), %%ymm8, %%ymm9;"
   1224                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1225                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1226       for (j = 0; j < 4; j++)
   1227          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1228       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1229                           "vfmaddsub213pd %%ymm7, %%ymm8, %%ymm9;"
   1230                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1231                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1232       for (j = 0; j < 4; j++)
   1233          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1234       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1235                           "vfmaddsub213pd (%3), %%ymm8, %%ymm9;"
   1236                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1237                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1238       for (j = 0; j < 4; j++)
   1239          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1240       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1241                           "vfmaddsub231pd %%ymm7, %%ymm8, %%ymm9;"
   1242                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1243                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1244       for (j = 0; j < 4; j++)
   1245          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1246       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1247                           "vfmaddsub231pd (%2), %%ymm8, %%ymm9;"
   1248                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1249                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1250       for (j = 0; j < 4; j++)
   1251          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1252       if (thisres) {
   1253          printf( "Failure 15 %d", i );
   1254          for (j = 0; j < 4; j++)
   1255             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1256          printf( "\n" );
   1257       }
   1258       res |= thisres;
   1259    }
   1260    for (i = 0; i < N; i++)
   1261       dt.z[i] = -dt.z[i];
   1262    for (i = 0; i < N; i += 4) {
   1263       int thisres = 0;
   1264       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
   1265                           "vfmsubadd132pd %%ymm7, %%ymm8, %%ymm9;"
   1266                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1267                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1268       for (j = 0; j < 4; j++)
   1269          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1270       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
   1271                           "vfmsubadd132pd (%2), %%ymm8, %%ymm9;"
   1272                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1273                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1274       for (j = 0; j < 4; j++)
   1275          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1276       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
   1277                           "vfmsubadd213pd %%ymm7, %%ymm8, %%ymm9;"
   1278                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1279                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1280       for (j = 0; j < 4; j++)
   1281          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1282       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
   1283                           "vfmsubadd213pd (%3), %%ymm8, %%ymm9;"
   1284                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1285                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1286       for (j = 0; j < 4; j++)
   1287          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1288       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
   1289                           "vfmsubadd231pd %%ymm7, %%ymm8, %%ymm9;"
   1290                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1291                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1292       for (j = 0; j < 4; j++)
   1293          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1294       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
   1295                           "vfmsubadd231pd (%2), %%ymm8, %%ymm9;"
   1296                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
   1297                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
   1298       for (j = 0; j < 4; j++)
   1299          thisres |= test( dt.res[i+j], dt.expected[i+j] );
   1300       if (thisres) {
   1301          printf( "Failure 16 %d", i );
   1302          for (j = 0; j < 4; j++)
   1303             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
   1304          printf( "\n" );
   1305       }
   1306       res |= thisres;
   1307    }
   1308    for (i = 1; i < N; i += 2)
   1309       dt.z[i] = -dt.z[i];
   1310    return res;
   1311 }
   1312 
   1313 int main( )
   1314 {
   1315    int res = 0;
   1316    int i = 0;
   1317    plus_zero = 0.0;
   1318    __asm __volatile__ ("" : : "r" (&plus_zero) : "memory");
   1319    nan_value = plus_zero / plus_zero;
   1320    plus_infty = 3.40282346638528859812e+38F * 16.0F;
   1321    minus_infty = -plus_infty;
   1322 #define TEST_F( a, b, c, d ) \
   1323    do {				\
   1324       ft.x[i] = a;		\
   1325       ft.y[i] = b;		\
   1326       ft.z[i] = c;		\
   1327       ft.expected[i] = d;	\
   1328       i++;			\
   1329    } while (0)
   1330    TEST_F( 1.0, 2.0, 3.0, 5.0 );
   1331    TEST_F( nan_value, 2.0, 3.0, nan_value );
   1332    TEST_F( 1.0, nan_value, 3.0, nan_value );
   1333    TEST_F( 1.0, 2.0, nan_value, nan_value );
   1334    TEST_F( plus_infty, 0.0, nan_value, nan_value );
   1335    TEST_F( minus_infty, 0.0, nan_value, nan_value );
   1336    TEST_F( 0.0, plus_infty, nan_value, nan_value );
   1337    TEST_F( 0.0, minus_infty, nan_value, nan_value );
   1338    TEST_F( plus_infty, 0.0, 1.0, nan_value );
   1339    TEST_F( minus_infty, 0.0, 1.0, nan_value );
   1340    TEST_F( 0.0, plus_infty, 1.0, nan_value );
   1341    TEST_F( 0.0, minus_infty, 1.0, nan_value );
   1342    TEST_F( plus_infty, plus_infty, minus_infty, nan_value );
   1343    TEST_F( minus_infty, plus_infty, plus_infty, nan_value );
   1344    TEST_F( plus_infty, minus_infty, plus_infty, nan_value );
   1345    TEST_F( minus_infty, minus_infty, minus_infty, nan_value );
   1346    TEST_F( plus_infty, 3.5L, minus_infty, nan_value );
   1347    TEST_F( minus_infty, -7.5L, minus_infty, nan_value );
   1348    TEST_F( -13.5L, plus_infty, plus_infty, nan_value );
   1349    TEST_F( minus_infty, 7.5L, plus_infty, nan_value );
   1350    TEST_F( 1.25L, 0.75L, 0.0625L, 1.0L );
   1351    TEST_F( -3.40282346638528859812e+38F, -3.40282346638528859812e+38F, minus_infty, minus_infty );
   1352    TEST_F( 3.40282346638528859812e+38F / 2, 3.40282346638528859812e+38F / 2, minus_infty, minus_infty );
   1353    TEST_F( -3.40282346638528859812e+38F, 3.40282346638528859812e+38F, plus_infty, plus_infty );
   1354    TEST_F( 3.40282346638528859812e+38F / 2, -3.40282346638528859812e+38F / 4, plus_infty, plus_infty );
   1355    TEST_F( plus_infty, 4, plus_infty, plus_infty );
   1356    TEST_F( 2, minus_infty, minus_infty, minus_infty );
   1357    TEST_F( minus_infty, minus_infty, plus_infty, plus_infty );
   1358    TEST_F( plus_infty, minus_infty, minus_infty, minus_infty );
   1359    TEST_F( 0x1.7ff8p+13, 0x1.000002p+0, 0x1.ffffp-24, 0x1.7ff802p+13 );
   1360    TEST_F( 0x1.fffp+0, 0x1.00001p+0, -0x1.fffp+0, 0x1.fffp-20 );
   1361    TEST_F( 0x1.9abcdep+127, 0x0.9abcdep-126, -0x1.f08948p+0, 0x1.bb421p-25 );
   1362    TEST_F( 0x1.9abcdep+100, 0x0.9abcdep-126, -0x1.f08948p-27, 0x1.bb421p-52 );
   1363    TEST_F( 0x1.fffffep+127, 0x1.001p+0, -0x1.fffffep+127, 0x1.fffffep+115 );
   1364    TEST_F( -0x1.fffffep+127, 0x1.fffffep+0, 0x1.fffffep+127, -0x1.fffffap+127 );
   1365    TEST_F( 0x1.fffffep+127, 2.0, -0x1.fffffep+127, 0x1.fffffep+127 );
   1366 
   1367    res |= test_fmaf( );
   1368    i = 0;
   1369 #define TEST( a, b, c, d ) \
   1370    do {				\
   1371       dt.x[i] = a;		\
   1372       dt.y[i] = b;		\
   1373       dt.z[i] = c;		\
   1374       dt.expected[i] = d;	\
   1375       i++;			\
   1376    } while (0)
   1377    TEST( 1.0, 2.0, 3.0, 5.0 );
   1378    TEST( nan_value, 2.0, 3.0, nan_value );
   1379    TEST( 1.0, nan_value, 3.0, nan_value );
   1380    TEST( 1.0, 2.0, nan_value, nan_value );
   1381    TEST( plus_infty, 0.0, nan_value, nan_value );
   1382    TEST( minus_infty, 0.0, nan_value, nan_value );
   1383    TEST( 0.0, plus_infty, nan_value, nan_value );
   1384    TEST( 0.0, minus_infty, nan_value, nan_value );
   1385    TEST( plus_infty, 0.0, 1.0, nan_value );
   1386    TEST( minus_infty, 0.0, 1.0, nan_value );
   1387    TEST( 0.0, plus_infty, 1.0, nan_value );
   1388    TEST( 0.0, minus_infty, 1.0, nan_value );
   1389    TEST( plus_infty, plus_infty, minus_infty, nan_value );
   1390    TEST( minus_infty, plus_infty, plus_infty, nan_value );
   1391    TEST( plus_infty, minus_infty, plus_infty, nan_value );
   1392    TEST( minus_infty, minus_infty, minus_infty, nan_value );
   1393    TEST( plus_infty, 3.5L, minus_infty, nan_value );
   1394    TEST( minus_infty, -7.5L, minus_infty, nan_value );
   1395    TEST( -13.5L, plus_infty, plus_infty, nan_value );
   1396    TEST( minus_infty, 7.5L, plus_infty, nan_value );
   1397    TEST( 1.25L, 0.75L, 0.0625L, 1.0L );
   1398    TEST( -1.79769313486231570815e+308L, -1.79769313486231570815e+308L, minus_infty, minus_infty );
   1399    TEST( 1.79769313486231570815e+308L / 2, 1.79769313486231570815e+308L / 2, minus_infty, minus_infty );
   1400    TEST( -1.79769313486231570815e+308L, 1.79769313486231570815e+308L, plus_infty, plus_infty );
   1401    TEST( 1.79769313486231570815e+308L / 2, -1.79769313486231570815e+308L / 4, plus_infty, plus_infty );
   1402    TEST( plus_infty, 4, plus_infty, plus_infty );
   1403    TEST( 2, minus_infty, minus_infty, minus_infty );
   1404    TEST( minus_infty, minus_infty, plus_infty, plus_infty );
   1405    TEST( plus_infty, minus_infty, minus_infty, minus_infty );
   1406    TEST( 0x1.7fp+13, 0x1.0000000000001p+0, 0x1.ffep-48, 0x1.7f00000000001p+13 );
   1407    TEST( 0x1.fffp+0, 0x1.0000000000001p+0, -0x1.fffp+0, 0x1.fffp-52 );
   1408    TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, 0x1p-300, 1.0 );
   1409    TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, -0x1p-300, 0x1.fffffffffffffp-1 );
   1410    TEST( 0x1.deadbeef2feedp+1023, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp+1, 0x1.0989687bc9da4p-53 );
   1411    TEST( 0x1.deadbeef2feedp+900, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp-122, 0x1.0989687bc9da4p-176 );
   1412    TEST( 0x1.fffffffffffffp+1023, 0x1.001p+0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1011 );
   1413    TEST( -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+0, 0x1.fffffffffffffp+1023, -0x1.ffffffffffffdp+1023 );
   1414    TEST( 0x1.fffffffffffffp+1023, 2.0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1023 );
   1415    TEST( 0x1.6a09e667f3bccp-538, 0x1.6a09e667f3bccp-538, 0.0, 0.0 );
   1416    TEST( 0x1.deadbeef2feedp-495, 0x1.deadbeef2feedp-495, -0x1.bf86a5786a574p-989, 0x0.0000042625a1fp-1022 );
   1417    TEST( 0x1.deadbeef2feedp-503, 0x1.deadbeef2feedp-503, -0x1.bf86a5786a574p-1005, 0x0.0000000004262p-1022 );
   1418    TEST( 0x1p-537, 0x1p-538, 0x1p-1074, 0x0.0000000000002p-1022 );
   1419    TEST( 0x1.7fffff8p-968, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000001p-1022 );
   1420    TEST( 0x1.4000004p-967, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000003p-1022 );
   1421    TEST( 0x1.4p-967, -0x1p-106, -0x0.000001p-1022, -0x0.0000010000002p-1022 );
   1422    TEST( -0x1.19cab66d73e17p-959, 0x1.c7108a8c5ff51p-107, -0x0.80b0ad65d9b64p-1022, -0x0.80b0ad65d9d59p-1022 );
   1423    TEST( -0x1.d2eaed6e8e9d3p-979, -0x1.4e066c62ac9ddp-63, -0x0.9245e6b003454p-1022, -0x0.9245c09c5fb5dp-1022 );
   1424    TEST( 0x1.153d650bb9f06p-907, 0x1.2d01230d48407p-125, -0x0.b278d5acfc3cp-1022, -0x0.b22757123bbe9p-1022 );
   1425    TEST( -0x1.fffffffffffffp-711, 0x1.fffffffffffffp-275, 0x1.fffffe00007ffp-983, 0x1.7ffffe00007ffp-983 );
   1426 
   1427    res |= test_fma( );
   1428    if (res == 0)
   1429       printf( "Testing successful\n");
   1430    return 0;
   1431 }
   1432