Home | History | Annotate | Download | only in ppc32
      1 /*  Copyright (C) 2011 IBM
      2 
      3  Author: Maynard Johnson <maynardj (at) us.ibm.com>
      4 
      5  This program is free software; you can redistribute it and/or
      6  modify it under the terms of the GNU General Public License as
      7  published by the Free Software Foundation; either version 2 of the
      8  License, or (at your option) any later version.
      9 
     10  This program is distributed in the hope that it will be useful, but
     11  WITHOUT ANY WARRANTY; without even the implied warranty of
     12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  General Public License for more details.
     14 
     15  You should have received a copy of the GNU General Public License
     16  along with this program; if not, write to the Free Software
     17  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     18  02111-1307, USA.
     19 
     20  The GNU General Public License is contained in the file COPYING.
     21  */
     22 
     23 #ifdef HAS_VSX
     24 
     25 #include <stdio.h>
     26 #include <stdint.h>
     27 #include <stdlib.h>
     28 #include <string.h>
     29 #include <malloc.h>
     30 #include <altivec.h>
     31 #include <math.h>
     32 
/* HWord_t: unsigned integer type that is 32 bits wide unless __powerpc64__
 * is defined, in which case it is 64 bits wide.
 */
#ifndef __powerpc64__
typedef uint32_t HWord_t;
#else
typedef uint64_t HWord_t;
#endif /* __powerpc64__ */

/* isLE: 1 when built with VGP_ppc64le_linux defined, 0 otherwise. */
#ifdef VGP_ppc64le_linux
#define isLE 1
#else
#define isLE 0
#endif
     44 
/* Minimal boolean type (an unsigned char) and its two values. */
typedef unsigned char Bool;
#define True 1
#define False 0
/* Global register variables bound to r14-r17 (HWord_t) and fr14-fr17
 * (double).  NOTE(review): none of them is referenced in this part of the
 * file -- confirm whether later code still needs them.
 */
register HWord_t r14 __asm__ ("r14");
register HWord_t r15 __asm__ ("r15");
register HWord_t r16 __asm__ ("r16");
register HWord_t r17 __asm__ ("r17");
register double f14 __asm__ ("fr14");
register double f15 __asm__ ("fr15");
register double f16 __asm__ ("fr16");
register double f17 __asm__ ("fr17");

/* NOTE(review): not referenced in this part of the file; presumably filled
 * from GET_CR_XER() by later code -- confirm.
 */
static volatile unsigned int div_flags, div_xer;
     58 
/* Clobber list naming all eight condition-register fields. */
#define ALLCR "cr0","cr1","cr2","cr3","cr4","cr5","cr6","cr7"

/* Move _arg into the condition register with mtcr; every CR field is
 * declared clobbered.  Note: the expansion already ends in ';'.
 */
#define SET_CR(_arg) \
      __asm__ __volatile__ ("mtcr  %0" : : "b"(_arg) : ALLCR );

/* Move _arg into XER with mtxer; "xer" is declared clobbered.
 * Note: the expansion already ends in ';'.
 */
#define SET_XER(_arg) \
      __asm__ __volatile__ ("mtxer %0" : : "b"(_arg) : "xer" );

/* Read the condition register into _lval with mfcr (no trailing ';'). */
#define GET_CR(_lval) \
      __asm__ __volatile__ ("mfcr %0"  : "=b"(_lval) )

/* Read XER into _lval with mfxer (no trailing ';'). */
#define GET_XER(_lval) \
      __asm__ __volatile__ ("mfxer %0" : "=b"(_lval) )

/* Read CR into _lval_cr, then XER into _lval_xer, as a single statement. */
#define GET_CR_XER(_lval_cr,_lval_xer) \
   do { GET_CR(_lval_cr); GET_XER(_lval_xer); } while (0)

/* Write 0 to the condition register. */
#define SET_CR_ZERO \
      SET_CR(0)

/* Write 0 to XER. */
#define SET_XER_ZERO \
      SET_XER(0)

/* Write 0 to CR, then to XER, as a single statement. */
#define SET_CR_XER_ZERO \
   do { SET_CR_ZERO; SET_XER_ZERO; } while (0)

/* Load the bit pattern of the double 0.0 into the FPSCR using mtfsf with
 * field mask 0xFF.
 */
#define SET_FPSCR_ZERO \
   do { double _d = 0.0; \
        __asm__ __volatile__ ("mtfsf 0xFF, %0" : : "f"(_d) ); \
   } while (0)
     89 
     90 
/* Pointer to a test routine taking no arguments and returning nothing. */
typedef void (*test_func_t)(void);
/* Forward typedef for struct test_table, which is defined further down. */
typedef struct test_table test_table_t;
     93 
     94 
     95 /* These functions below that construct a table of floating point
     96  * values were lifted from none/tests/ppc32/jm-insns.c.
     97  */
     98 
     99 #if defined (DEBUG_ARGS_BUILD)
    100 #define AB_DPRINTF(fmt, args...) do { fprintf(stderr, fmt , ##args); } while (0)
    101 #else
    102 #define AB_DPRINTF(fmt, args...) do { } while (0)
    103 #endif
    104 
    105 static inline void register_farg (void *farg,
    106                                   int s, uint16_t _exp, uint64_t mant)
    107 {
    108    uint64_t tmp;
    109 
    110    tmp = ((uint64_t)s << 63) | ((uint64_t)_exp << 52) | mant;
    111    *(uint64_t *)farg = tmp;
    112    AB_DPRINTF("%d %03x %013llx => %016llx %0e\n",
    113               s, _exp, mant, *(uint64_t *)farg, *(double *)farg);
    114 }
    115 
    116 static inline void register_sp_farg (void *farg,
    117                                      int s, uint16_t _exp, uint32_t mant)
    118 {
    119    uint32_t tmp;
    120    tmp = ((uint32_t)s << 31) | ((uint32_t)_exp << 23) | mant;
    121    *(uint32_t *)farg = tmp;
    122 }
    123 
    124 
/* A pair of operand indices for a two-argument floating point test.
 * NOTE(review): presumably both are indices into spec_fargs/spec_sp_fargs;
 * the consumers are outside this part of the file -- confirm.
 */
typedef struct fp_test_args {
   int fra_idx;   /* index for the first (A) operand */
   int frb_idx;   /* index for the second (B) operand */
} fp_test_args_t;
    129 
    130 
/* Operand-index pairs {fra_idx, frb_idx} for two-argument tests.
 * All indices lie in 0..19.  NOTE(review): they look like positions in the
 * 20-entry table built by build_special_fargs_table(); the code that walks
 * this array is outside this part of the file -- confirm.
 */
fp_test_args_t two_arg_fp_tests[] = {
                                     {8, 8},
                                     {8, 14},
                                     {15, 16},
                                     {8, 5},
                                     {8, 4},
                                     {8, 7},
                                     {8, 9},
                                     {8, 11},
                                     {14, 8},
                                     {14, 14},
                                     {14, 6},
                                     {14, 5},
                                     {14, 4},
                                     {14, 7},
                                     {14, 9},
                                     {14, 11},
                                     {6, 8},
                                     {6, 14},
                                     {6, 6},
                                     {6, 5},
                                     {6, 4},
                                     {6, 7},
                                     {6, 9},
                                     {6, 11},
                                     {5, 8},
                                     {5, 14},
                                     {5, 6},
                                     {5, 5},
                                     {5, 4},
                                     {5, 7},
                                     {5, 9},
                                     {5, 11},
                                     {4, 8},
                                     {4, 14},
                                     {4, 6},
                                     {4, 5},
                                     {4, 1},
                                     {4, 7},
                                     {4, 9},
                                     {4, 11},
                                     {7, 8},
                                     {7, 14},
                                     {7, 6},
                                     {7, 5},
                                     {7, 4},
                                     {7, 7},
                                     {7, 9},
                                     {7, 11},
                                     {10, 8},
                                     {10, 14},
                                     {12, 6},
                                     {12, 5},
                                     {10, 4},
                                     {10, 7},
                                     {10, 9},
                                     {10, 11},
                                     {12, 8 },
                                     {12, 14},
                                     {12, 6},
                                     {15, 16},
                                     {15, 16},
                                     {9, 11},
                                     {11, 11},
                                     {11, 12},
                                     {16, 18},
                                     {17, 16},
                                     {19, 19},
                                     {19, 18}
};
    201 
    202 
    203 static int nb_special_fargs;
    204 static double * spec_fargs;
    205 static float * spec_sp_fargs;
    206 
    207 static void build_special_fargs_table(void)
    208 {
    209 /*
    210   Entry  Sign Exp   fraction                  Special value
    211    0      0   3fd   0x8000000000000ULL         Positive finite number
    212    1      0   404   0xf000000000000ULL         ...
    213    2      0   001   0x8000000b77501ULL         ...
    214    3      0   7fe   0x800000000051bULL         ...
    215    4      0   012   0x3214569900000ULL         ...
    216    5      0   000   0x0000000000000ULL         +0.0 (+zero)
    217    6      1   000   0x0000000000000ULL         -0.0 (-zero)
    218    7      0   7ff   0x0000000000000ULL         +infinity
    219    8      1   7ff   0x0000000000000ULL         -infinity
    220    9      0   7ff   0x7FFFFFFFFFFFFULL         +SNaN
    221    10     1   7ff   0x7FFFFFFFFFFFFULL         -SNaN
    222    11     0   7ff   0x8000000000000ULL         +QNaN
    223    12     1   7ff   0x8000000000000ULL         -QNaN
    224    13     1   000   0x8340000078000ULL         Denormalized val (zero exp and non-zero fraction)
    225    14     1   40d   0x0650f5a07b353ULL         Negative finite number
    226    15     0   412   0x32585a9900000ULL         A few more positive finite numbers
    227    16     0   413   0x82511a2000000ULL         ...
    228    17  . . . . . . . . . . . . . . . . . . . . . . .
    229    18  . . . . . . . . . . . . . . . . . . . . . . .
    230    19  . . . . . . . . . . . . . . . . . . . . . . .
    231 */
    232 
    233    uint64_t mant;
    234    uint32_t mant_sp;
    235    uint16_t _exp;
    236    int s;
    237    int j, i = 0;
    238 
    239    if (spec_fargs)
    240       return;
    241 
    242    spec_fargs = malloc( 20 * sizeof(double) );
    243    spec_sp_fargs = malloc( 20 * sizeof(float) );
    244 
    245    // #0
    246    s = 0;
    247    _exp = 0x3fd;
    248    mant = 0x8000000000000ULL;
    249    register_farg(&spec_fargs[i++], s, _exp, mant);
    250 
    251    // #1
    252    s = 0;
    253    _exp = 0x404;
    254    mant = 0xf000000000000ULL;
    255    register_farg(&spec_fargs[i++], s, _exp, mant);
    256 
    257    // #2
    258    s = 0;
    259    _exp = 0x001;
    260    mant = 0x8000000b77501ULL;
    261    register_farg(&spec_fargs[i++], s, _exp, mant);
    262 
    263    // #3
    264    s = 0;
    265    _exp = 0x7fe;
    266    mant = 0x800000000051bULL;
    267    register_farg(&spec_fargs[i++], s, _exp, mant);
    268 
    269    // #4
    270    s = 0;
    271    _exp = 0x012;
    272    mant = 0x3214569900000ULL;
    273    register_farg(&spec_fargs[i++], s, _exp, mant);
    274 
    275 
    276    /* Special values */
    277    /* +0.0      : 0 0x000 0x0000000000000 */
    278    // #5
    279    s = 0;
    280    _exp = 0x000;
    281    mant = 0x0000000000000ULL;
    282    register_farg(&spec_fargs[i++], s, _exp, mant);
    283 
    284    /* -0.0      : 1 0x000 0x0000000000000 */
    285    // #6
    286    s = 1;
    287    _exp = 0x000;
    288    mant = 0x0000000000000ULL;
    289    register_farg(&spec_fargs[i++], s, _exp, mant);
    290 
    291    /* +infinity : 0 0x7FF 0x0000000000000  */
    292    // #7
    293    s = 0;
    294    _exp = 0x7FF;
    295    mant = 0x0000000000000ULL;
    296    register_farg(&spec_fargs[i++], s, _exp, mant);
    297 
    298    /* -infinity : 1 0x7FF 0x0000000000000 */
    299    // #8
    300    s = 1;
    301    _exp = 0x7FF;
    302    mant = 0x0000000000000ULL;
    303    register_farg(&spec_fargs[i++], s, _exp, mant);
    304 
    305    /*
    306     * This comment applies to values #9 and #10 below:
    307     * When src is a SNaN, it's converted to a QNaN first before rounding to single-precision,
    308     * so we can't just copy the double-precision value to the corresponding slot in the
    309     * single-precision array (i.e., in the loop at the end of this function).  Instead, we
    310     * have to manually set the bits using register_sp_farg().
    311     */
    312 
    313    /* +SNaN     : 0 0x7FF 0x7FFFFFFFFFFFF */
    314    // #9
    315    s = 0;
    316    _exp = 0x7FF;
    317    mant = 0x7FFFFFFFFFFFFULL;
    318    register_farg(&spec_fargs[i++], s, _exp, mant);
    319    _exp = 0xff;
    320    mant_sp = 0x3FFFFF;
    321    register_sp_farg(&spec_sp_fargs[i-1], s, _exp, mant_sp);
    322 
    323    /* -SNaN     : 1 0x7FF 0x7FFFFFFFFFFFF */
    324    // #10
    325    s = 1;
    326    _exp = 0x7FF;
    327    mant = 0x7FFFFFFFFFFFFULL;
    328    register_farg(&spec_fargs[i++], s, _exp, mant);
    329    _exp = 0xff;
    330    mant_sp = 0x3FFFFF;
    331    register_sp_farg(&spec_sp_fargs[i-1], s, _exp, mant_sp);
    332 
    333    /* +QNaN     : 0 0x7FF 0x8000000000000 */
    334    // #11
    335    s = 0;
    336    _exp = 0x7FF;
    337    mant = 0x8000000000000ULL;
    338    register_farg(&spec_fargs[i++], s, _exp, mant);
    339 
    340    /* -QNaN     : 1 0x7FF 0x8000000000000 */
    341    // #12
    342    s = 1;
    343    _exp = 0x7FF;
    344    mant = 0x8000000000000ULL;
    345    register_farg(&spec_fargs[i++], s, _exp, mant);
    346 
    347    /* denormalized value */
    348    // #13
    349    s = 1;
    350    _exp = 0x000;
    351    mant = 0x8340000078000ULL;
    352    register_farg(&spec_fargs[i++], s, _exp, mant);
    353 
    354    /* Negative finite number */
    355    // #14
    356    s = 1;
    357    _exp = 0x40d;
    358    mant = 0x0650f5a07b353ULL;
    359    register_farg(&spec_fargs[i++], s, _exp, mant);
    360 
    361    /* A few positive finite numbers ... */
    362    // #15
    363    s = 0;
    364    _exp = 0x412;
    365    mant = 0x32585a9900000ULL;
    366    register_farg(&spec_fargs[i++], s, _exp, mant);
    367 
    368    // #16
    369    s = 0;
    370    _exp = 0x413;
    371    mant = 0x82511a2000000ULL;
    372    register_farg(&spec_fargs[i++], s, _exp, mant);
    373 
    374    // #17
    375    s = 0;
    376    _exp = 0x403;
    377    mant = 0x12ef5a9300000ULL;
    378    register_farg(&spec_fargs[i++], s, _exp, mant);
    379 
    380    // #18
    381    s = 0;
    382    _exp = 0x405;
    383    mant = 0x14bf5d2300000ULL;
    384    register_farg(&spec_fargs[i++], s, _exp, mant);
    385 
    386    // #19
    387    s = 0;
    388    _exp = 0x409;
    389    mant = 0x76bf982440000ULL;
    390    register_farg(&spec_fargs[i++], s, _exp, mant);
    391 
    392    nb_special_fargs = i;
    393    for (j = 0; j < i; j++) {
    394       if (!(j == 9 || j == 10))
    395          spec_sp_fargs[j] = spec_fargs[j];
    396    }
    397 }
    398 
    399 
/* One entry of a test table: a routine to run and a label for it.
 * NOTE(review): the tables of this type are outside this part of the file.
 */
struct test_table
{
   test_func_t test_category;   /* routine to invoke for this entry */
   char * name;                 /* label associated with the entry */
};
    405 
/* Type of input for floating point operations.
 * check_estimate() takes its double-precision path for DOUBLE_TEST and its
 * single-precision path otherwise.
 */
typedef enum {
   SINGLE_TEST,
   DOUBLE_TEST
} precision_type_t;
    411 
/* Category tag stored in the 'type' field of each vx_fp_test_t entry.
 * NOTE(review): how each category is handled is decided by code outside
 * this part of the file.
 */
typedef enum {
   VX_SCALAR_CONV_TO_WORD,
   VX_CONV_TO_SINGLE,
   VX_CONV_TO_DOUBLE,
   VX_ESTIMATE,
   VX_DEFAULT
} vx_fp_test_type;
    419 
/* Shared operand vectors for the test_* asm wrappers: vec_inA and vec_inB
 * are the inputs, vec_out receives the result.
 */
static vector unsigned int vec_out, vec_inA, vec_inB;
    421 
    422 /* This function is for checking the reciprocal and reciprocal square root
    423  * estimate instructions.
    424  */
    425 Bool check_estimate(precision_type_t type, Bool is_rsqrte, int idx, int output_vec_idx)
    426 {
    427    /* Technically, the number of bits of precision for xvredp and xvrsqrtedp is
    428     * 14 bits (14 = log2 16384).  However, the VEX emulation of these instructions
    429     * does an actual reciprocal calculation versus estimation, so the answer we get back from
    430     * valgrind can easily differ from the estimate in the lower bits (within the 14 bits of
    431     * precision) and the estimate may still be within expected tolerances.  On top of that,
    432     * we can't count on these estimates always being the same across implementations.
    433     * For example, with the fre[s] instruction (which should be correct to within one part
    434     * in 256 -- i.e., 8 bits of precision) . . . When approximating the value 1.0111_1111_1111,
    435     * one implementation could return 1.0111_1111_0000 and another implementation could return
    436     * 1.1000_0000_0000.  Both estimates meet the 1/256 accuracy requirement, but share only a
    437     * single bit in common.
    438     *
    439     * The upshot is we can't validate the VEX output for these instructions by comparing against
    440     * stored bit patterns.  We must check that the result is within expected tolerances.
    441     */
    442 
    443 
    444    /* A mask to be used for validation as a last resort.
    445     * Only use 12 bits of precision for reasons discussed above.
    446     */
    447 #define VSX_RECIP_ESTIMATE_MASK_DP 0xFFFFFF0000000000ULL
    448 #define VSX_RECIP_ESTIMATE_MASK_SP 0xFFFFFF00
    449 
    450    Bool result = False;
    451    Bool dp_test = type == DOUBLE_TEST;
    452    double src_dp, res_dp;
    453    float src_sp, res_sp;
    454    src_dp = res_dp = 0;
    455    src_sp = res_sp = 0;
    456 #define SRC (dp_test ? src_dp : src_sp)
    457 #define RES (dp_test ? res_dp : res_sp)
    458    Bool src_is_negative = False;
    459    Bool res_is_negative = False;
    460    unsigned long long * dst_dp = NULL;
    461    unsigned int * dst_sp = NULL;
    462    if (dp_test) {
    463       unsigned long long * src_dp_ull;
    464       dst_dp = (unsigned long long *) &vec_out;
    465       src_dp = spec_fargs[idx];
    466       src_dp_ull = (unsigned long long *) &src_dp;
    467       src_is_negative = (*src_dp_ull & 0x8000000000000000ULL) ? True : False;
    468       res_is_negative = (dst_dp[output_vec_idx] & 0x8000000000000000ULL) ? True : False;
    469       memcpy(&res_dp, &dst_dp[output_vec_idx], 8);
    470    } else {
    471       unsigned int * src_sp_uint;
    472       dst_sp = (unsigned int *) &vec_out;
    473       src_sp = spec_sp_fargs[idx];
    474       src_sp_uint = (unsigned int *) &src_sp;
    475       src_is_negative = (*src_sp_uint & 0x80000000) ? True : False;
    476       res_is_negative = (dst_sp[output_vec_idx] & 0x80000000) ? True : False;
    477       memcpy(&res_sp, &dst_sp[output_vec_idx], 4);
    478    }
    479 
    480    // Below are common rules for xvre{d|s}p and xvrsqrte{d|s}p
    481    if (isnan(SRC))
    482       return isnan(RES);
    483    if (fpclassify(SRC) == FP_ZERO)
    484       return isinf(RES);
    485    if (!src_is_negative && isinf(SRC))
    486       return !res_is_negative && (fpclassify(RES) == FP_ZERO);
    487    if (is_rsqrte) {
    488       if (src_is_negative)
    489          return isnan(RES);
    490    } else {
    491       if (src_is_negative && isinf(SRC))
    492          return res_is_negative && (fpclassify(RES) == FP_ZERO);
    493    }
    494    if (dp_test) {
    495       double calc_diff;
    496       double real_diff;
    497       double recip_divisor;
    498       double div_result;
    499       double calc_diff_tmp;
    500 
    501       if (is_rsqrte)
    502          recip_divisor = sqrt(src_dp);
    503       else
    504          recip_divisor = src_dp;
    505 
    506       div_result = 1.0/recip_divisor;
    507       calc_diff_tmp = recip_divisor * 16384.0;
    508       if (isnormal(calc_diff_tmp)) {
    509          calc_diff = fabs(1.0/calc_diff_tmp);
    510          real_diff = fabs(res_dp - div_result);
    511          result = ( ( res_dp == div_result )
    512                   || ( real_diff <= calc_diff ) );
    513       } else {
    514          /* Unable to compute theoretical difference, so we fall back to masking out
    515           * un-precise bits.
    516           */
    517          unsigned long long * div_result_dp = (unsigned long long *) &div_result;
    518          result = (dst_dp[output_vec_idx] & VSX_RECIP_ESTIMATE_MASK_DP) == (*div_result_dp & VSX_RECIP_ESTIMATE_MASK_DP);
    519       }
    520       /* For debug use . . .
    521          if (!result) {
    522              unsigned long long * dv = &div_result;
    523              unsigned long long * rd = &real_diff;
    524              unsigned long long * cd = &calc_diff;
    525              printf("\n\t {actual div_result: %016llx; real_diff:  %016llx; calc_diff:  %016llx}\n",
    526        *dv, *rd, *cd);
    527           }
    528        */
    529    } else {  // single precision test (only have xvrsqrtesp, since xvresp was implemented in stage 2)
    530       float calc_diff;
    531       float real_diff;
    532       float div_result;
    533       float calc_diff_tmp;
    534       float recip_divisor = sqrt(src_sp);
    535 
    536       div_result = 1.0/recip_divisor;
    537       calc_diff_tmp = recip_divisor * 16384.0;
    538       if (isnormal(calc_diff_tmp)) {
    539          calc_diff = fabsf(1.0/calc_diff_tmp);
    540          real_diff = fabsf(res_sp - div_result);
    541          result = ( ( res_sp == div_result )
    542                   || ( real_diff <= calc_diff ) );
    543       } else {
    544          /* Unable to compute theoretical difference, so we fall back to masking out
    545           * un-precise bits.
    546           */
    547          unsigned int * div_result_sp = (unsigned int *) &div_result;
    548          result = (dst_sp[output_vec_idx] & VSX_RECIP_ESTIMATE_MASK_SP) == (*div_result_sp & VSX_RECIP_ESTIMATE_MASK_SP);
    549       }
    550       /* For debug use . . .
    551          if (!result) {
    552              unsigned long long * dv = &div_result;
    553              unsigned long long * rd = &real_diff;
    554              unsigned long long * cd = &calc_diff;
    555              printf("\n\t {actual div_result: %016llx; real_diff:  %016llx; calc_diff:  %016llx}\n",
    556        *dv, *rd, *cd);
    557           }
    558        */
    559    }
    560    return result;
    561 }
    562 
/* Descriptor for one VSX floating point instruction test, as used by the
 * vsx_one_fp_arg_tests[] table.
 */
typedef struct vx_fp_test
{
   test_func_t test_func;        /* asm wrapper that executes the instruction */
   const char * name;            /* instruction mnemonic, e.g. "xvredp" */
   fp_test_args_t * targs;       /* operand-index pairs; NULL in the one-argument table */
   int num_tests;                /* test count for this entry (18 or 20 in the one-argument table) */
   precision_type_t precision;   /* SINGLE_TEST or DOUBLE_TEST input */
   vx_fp_test_type type;         /* category tag, e.g. VX_ESTIMATE */
   const char * op;              /* short operation label, e.g. "1/x" */
} vx_fp_test_t;
    573 
    574 
/* NOTE(review): not referenced in this part of the file; presumably selects
 * the record ("dot") form of an instruction in later tests -- confirm.
 */
static Bool do_dot;
    576 
/* Execute one xvredp instruction: source operand vec_inB, result in vec_out. */
static void test_xvredp(void)
{
   __asm__ __volatile__ ("xvredp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    581 
/* Execute one xsredp instruction: source operand vec_inB, result in vec_out. */
static void test_xsredp(void)
{
   __asm__ __volatile__ ("xsredp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    586 
/* Execute one xvrsqrtedp instruction: source operand vec_inB, result in vec_out. */
static void test_xvrsqrtedp(void)
{
   __asm__ __volatile__ ("xvrsqrtedp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    591 
/* Execute one xsrsqrtedp instruction: source operand vec_inB, result in vec_out. */
static void test_xsrsqrtedp(void)
{
   __asm__ __volatile__ ("xsrsqrtedp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    596 
/* Execute one xvrsqrtesp instruction: source operand vec_inB, result in vec_out. */
static void test_xvrsqrtesp(void)
{
   __asm__ __volatile__ ("xvrsqrtesp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    601 
    602 static void test_xstsqrtdp(void)
    603 {
    604    __asm__ __volatile__ ("xstsqrtdp   cr1, %x0" : : "wa" (vec_inB));
    605 }
    606 
    607 static void test_xvtsqrtdp(void)
    608 {
    609    __asm__ __volatile__ ("xvtsqrtdp   cr1, %x0" : : "wa" (vec_inB));
    610 }
    611 
    612 static void test_xvtsqrtsp(void)
    613 {
    614    __asm__ __volatile__ ("xvtsqrtsp   cr1, %x0" : : "wa" (vec_inB));
    615 }
    616 
/* Execute one xvsqrtdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvsqrtdp(void)
{
   __asm__ __volatile__ ("xvsqrtdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    621 
/* Execute one xvsqrtsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvsqrtsp(void)
{
   __asm__ __volatile__ ("xvsqrtsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    626 
    627 static void test_xvtdivdp(void)
    628 {
    629    __asm__ __volatile__ ("xvtdivdp   cr1, %x0, %x1" : : "wa" (vec_inA), "wa" (vec_inB));
    630 }
    631 
    632 static void test_xvtdivsp(void)
    633 {
    634    __asm__ __volatile__ ("xvtdivsp   cr1, %x0, %x1" : : "wa" (vec_inA), "wa" (vec_inB));
    635 }
    636 
/* Execute one xscvdpsp instruction: source operand vec_inB, result in vec_out. */
static void test_xscvdpsp(void)
{
   __asm__ __volatile__ ("xscvdpsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    641 
/* Execute one xscvdpuxws instruction: source operand vec_inB, result in vec_out. */
static void test_xscvdpuxws(void)
{
   __asm__ __volatile__ ("xscvdpuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    646 
/* Execute one xscvspdp instruction: source operand vec_inB, result in vec_out. */
static void test_xscvspdp(void)
{
   __asm__ __volatile__ ("xscvspdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    651 
/* Execute one xvcvdpsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvdpsp(void)
{
   __asm__ __volatile__ ("xvcvdpsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    656 
/* Execute one xvcvdpuxds instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvdpuxds(void)
{
   __asm__ __volatile__ ("xvcvdpuxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    661 
/* Execute one xvcvdpuxws instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvdpuxws(void)
{
   __asm__ __volatile__ ("xvcvdpuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    666 
/* Execute one xvcvspdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvspdp(void)
{
   __asm__ __volatile__ ("xvcvspdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    671 
/* Execute one xvcvspsxds instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvspsxds(void)
{
   __asm__ __volatile__ ("xvcvspsxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    676 
/* Execute one xvcvspuxds instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvspuxds(void)
{
   __asm__ __volatile__ ("xvcvspuxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    681 
/* Execute one xvcvdpsxds instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvdpsxds(void)
{
   __asm__ __volatile__ ("xvcvdpsxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    686 
/* Execute one xvcvspuxws instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvspuxws(void)
{
   __asm__ __volatile__ ("xvcvspuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    691 
/* Execute one xvcvsxddp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvsxddp(void)
{
   __asm__ __volatile__ ("xvcvsxddp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    696 
/* Execute one xvcvuxddp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvuxddp(void)
{
   __asm__ __volatile__ ("xvcvuxddp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    701 
/* Execute one xvcvsxdsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvsxdsp(void)
{
   __asm__ __volatile__ ("xvcvsxdsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    706 
/* Execute one xvcvuxdsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvuxdsp(void)
{
   __asm__ __volatile__ ("xvcvuxdsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    711 
/* Execute one xvcvsxwdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvsxwdp(void)
{
   __asm__ __volatile__ ("xvcvsxwdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    716 
/* Execute one xvcvuxwdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvuxwdp(void)
{
   __asm__ __volatile__ ("xvcvuxwdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    721 
/* Execute one xvcvsxwsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvsxwsp(void)
{
   __asm__ __volatile__ ("xvcvsxwsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    726 
/* Execute one xvcvuxwsp instruction: source operand vec_inB, result in vec_out. */
static void test_xvcvuxwsp(void)
{
   __asm__ __volatile__ ("xvcvuxwsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    731 
/* Execute one xsrdpic instruction: source operand vec_inB, result in vec_out. */
static void test_xsrdpic(void)
{
   __asm__ __volatile__ ("xsrdpic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    736 
/* Execute one xsrdpiz instruction: source operand vec_inB, result in vec_out. */
static void test_xsrdpiz(void)
{
   __asm__ __volatile__ ("xsrdpiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    741 
/* Execute one xsrdpi instruction: source operand vec_inB, result in vec_out. */
static void test_xsrdpi(void)
{
   __asm__ __volatile__ ("xsrdpi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    746 
/* Execute one xvabsdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvabsdp(void)
{
   __asm__ __volatile__ ("xvabsdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    751 
/* Execute one xvnabsdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvnabsdp(void)
{
   __asm__ __volatile__ ("xvnabsdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    756 
/* Execute one xvnegdp instruction: source operand vec_inB, result in vec_out. */
static void test_xvnegdp(void)
{
   __asm__ __volatile__ ("xvnegdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    761 
/* Execute one xvabssp instruction: source operand vec_inB, result in vec_out. */
static void test_xvabssp(void)
{
   __asm__ __volatile__ ("xvabssp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    766 
/* Execute one xvnabssp instruction: source operand vec_inB, result in vec_out. */
static void test_xvnabssp(void)
{
   __asm__ __volatile__ ("xvnabssp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    771 
/* Execute one xvrdpi instruction: source operand vec_inB, result in vec_out. */
static void test_xvrdpi(void)
{
   __asm__ __volatile__ ("xvrdpi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    776 
/* Execute one xvrdpic instruction: source operand vec_inB, result in vec_out. */
static void test_xvrdpic(void)
{
   __asm__ __volatile__ ("xvrdpic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    781 
/* Execute one xvrdpim instruction: source operand vec_inB, result in vec_out. */
static void test_xvrdpim(void)
{
   __asm__ __volatile__ ("xvrdpim   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    786 
/* Execute one xvrdpip instruction: source operand vec_inB, result in vec_out. */
static void test_xvrdpip(void)
{
   __asm__ __volatile__ ("xvrdpip   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    791 
/* Execute one xvrdpiz instruction: source operand vec_inB, result in vec_out. */
static void test_xvrdpiz(void)
{
   __asm__ __volatile__ ("xvrdpiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    796 
/* Execute one xvrspi instruction: source operand vec_inB, result in vec_out. */
static void test_xvrspi(void)
{
   __asm__ __volatile__ ("xvrspi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    801 
/* Execute one xvrspic instruction: source operand vec_inB, result in vec_out. */
static void test_xvrspic(void)
{
   __asm__ __volatile__ ("xvrspic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    806 
/* Execute one xvrspim instruction: source operand vec_inB, result in vec_out. */
static void test_xvrspim(void)
{
   __asm__ __volatile__ ("xvrspim   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    811 
/* Execute one xvrspip instruction: source operand vec_inB, result in vec_out. */
static void test_xvrspip(void)
{
   __asm__ __volatile__ ("xvrspip   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    816 
/* Execute one xvrspiz instruction: source operand vec_inB, result in vec_out. */
static void test_xvrspiz(void)
{
   __asm__ __volatile__ ("xvrspiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
    821 
    822 static vx_fp_test_t
    823 vsx_one_fp_arg_tests[] = {
    824                                 { &test_xvredp, "xvredp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x"},
    825                                 { &test_xsredp, "xsredp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x"},
    826                                 { &test_xvrsqrtedp, "xvrsqrtedp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
    827                                 { &test_xsrsqrtedp, "xsrsqrtedp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
    828                                 { &test_xvrsqrtesp, "xvrsqrtesp", NULL, 18, SINGLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
    829                                 { &test_xvsqrtdp, "xvsqrtdp", NULL, 18, DOUBLE_TEST, VX_DEFAULT, "sqrt"},
    830                                 { &test_xvsqrtsp, "xvsqrtsp", NULL, 18, SINGLE_TEST, VX_DEFAULT, "sqrt"},
    831                                 { &test_xscvdpsp, "xscvdpsp", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
    832                                 { &test_xscvdpuxws, "xscvdpuxws", NULL, 20, DOUBLE_TEST, VX_SCALAR_CONV_TO_WORD, "conv"},
    833                                 { &test_xscvspdp, "xscvspdp", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    834                                 { &test_xvcvdpsp, "xvcvdpsp", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
    835                                 { &test_xvcvdpuxds, "xvcvdpuxds", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    836                                 { &test_xvcvdpuxws, "xvcvdpuxws", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
    837                                 { &test_xvcvspdp, "xvcvspdp", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    838                                 { &test_xvcvspsxds, "xvcvspsxds", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    839                                 { &test_xvcvdpsxds, "xvcvdpsxds", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    840                                 { &test_xvcvspuxds, "xvcvspuxds", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
    841                                 { &test_xvcvspuxws, "xvcvspuxws", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "conv"},
    842                                 { &test_xsrdpic, "xsrdpic", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    843                                 { &test_xsrdpiz, "xsrdpiz", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    844                                 { &test_xsrdpi, "xsrdpi", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    845                                 { &test_xvabsdp, "xvabsdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "abs"},
    846                                 { &test_xvnabsdp, "xvnabsdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "nabs"},
    847                                 { &test_xvnegdp, "xvnegdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "neg"},
    848                                 { &test_xvabssp, "xvabssp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "abs"},
    849                                 { &test_xvnabssp, "xvnabssp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "nabs"},
    850                                 { &test_xvrdpi,  "xvrdpi",  NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    851                                 { &test_xvrdpic, "xvrdpic", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    852                                 { &test_xvrdpim, "xvrdpim", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    853                                 { &test_xvrdpip, "xvrdpip", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    854                                 { &test_xvrdpiz, "xvrdpiz", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
    855                                 { &test_xvrspi,  "xvrspi",  NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
    856                                 { &test_xvrspic, "xvrspic", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
    857                                 { &test_xvrspim, "xvrspim", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
    858                                 { &test_xvrspip, "xvrspip", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
    859                                 { &test_xvrspiz, "xvrspiz", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
    860                                 { NULL, NULL, NULL, 0, 0, 0, NULL}
    861 };
    862 
    863 static vx_fp_test_t
    864 vx_tdivORtsqrt_tests[] = {
    865                           { &test_xstsqrtdp, "xstsqrtdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "test-sqrt"},
    866                           { &test_xvtsqrtdp, "xvtsqrtdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "test-sqrt"},
    867                           { &test_xvtsqrtsp, "xvtsqrtsp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "test-sqrt"},
    868                           { &test_xvtdivdp, "xvtdivdp", two_arg_fp_tests, 68, DOUBLE_TEST, VX_DEFAULT, "test-div"},
    869                           { &test_xvtdivsp, "xvtdivsp", two_arg_fp_tests, 68, SINGLE_TEST, VX_DEFAULT, "test-div"},
    870                           { NULL, NULL, NULL, 0 , 0, 0, NULL}
    871 };
    872 
    873 static unsigned long long doubleWord[] = { 0,
    874                                   0xffffffff00000000LL,
    875                                   0x00000000ffffffffLL,
    876                                   0xffffffffffffffffLL,
    877                                   0x89abcde123456789LL,
    878                                   0x0102030405060708LL,
    879                                   0x00000000a0b1c2d3LL,
    880                                   0x1111222233334444LL
    881 };
    882 
    883 static unsigned int singleWord[] = {0,
    884                                   0xffff0000,
    885                                   0x0000ffff,
    886                                   0xffffffff,
    887                                   0x89a73522,
    888                                   0x01020304,
    889                                   0x0000abcd,
    890                                   0x11223344
    891 };
    892 
    893 typedef struct vx_intToFp_test
    894 {
    895    test_func_t test_func;
    896    const char * name;
    897    void * targs;
    898    int num_tests;
    899    precision_type_t precision;
    900    vx_fp_test_type type;
    901 } vx_intToFp_test_t;
    902 
    903 static vx_intToFp_test_t
    904 intToFp_tests[] = {
    905                    { test_xvcvsxddp, "xvcvsxddp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_DOUBLE },
    906                    { test_xvcvuxddp, "xvcvuxddp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_DOUBLE },
    907                    { test_xvcvsxdsp, "xvcvsxdsp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_SINGLE },
    908                    { test_xvcvuxdsp, "xvcvuxdsp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_SINGLE },
    909                    { test_xvcvsxwdp, "xvcvsxwdp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_DOUBLE },
    910                    { test_xvcvuxwdp, "xvcvuxwdp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_DOUBLE },
    911                    { test_xvcvsxwsp, "xvcvsxwsp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_SINGLE },
    912                    { test_xvcvuxwsp, "xvcvuxwsp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_SINGLE },
    913                    { NULL, NULL, NULL, 0, 0 }
    914 };
    915 
    916 static Bool do_OE;
    917 typedef enum {
    918    DIV_BASE = 1,
    919    DIV_OE = 2,
    920    DIV_DOT = 4,
    921 } div_type_t;
    922 /* Possible divde type combinations are:
    923  *   - base
    924  *   - base+dot
    925  *   - base+OE
    926  *   - base+OE+dot
    927  */
    928 #ifdef __powerpc64__
    929 static void test_divdeu(void)
    930 {
    931    int divdeu_type = DIV_BASE;
    932    if (do_OE)
    933       divdeu_type |= DIV_OE;
    934    if (do_dot)
    935       divdeu_type |= DIV_DOT;
    936 
    937    switch (divdeu_type) {
    938       case 1:
    939         SET_CR_XER_ZERO;
    940          __asm__ __volatile__ ("divdeu %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    941          GET_CR_XER(div_flags, div_xer);
    942          break;
    943       case 3:
    944         SET_CR_XER_ZERO;
    945          __asm__ __volatile__ ("divdeuo %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    946          GET_CR_XER(div_flags, div_xer);
    947          break;
    948       case 5:
    949         SET_CR_XER_ZERO;
    950          __asm__ __volatile__ ("divdeu. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    951          GET_CR_XER(div_flags, div_xer);
    952          break;
    953       case 7:
    954         SET_CR_XER_ZERO;
    955          __asm__ __volatile__ ("divdeuo. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    956          GET_CR_XER(div_flags, div_xer);
    957          break;
    958       default:
    959          fprintf(stderr, "Invalid divdeu type. Exiting\n");
    960          exit(1);
    961    }
    962 }
    963 #endif
    964 
    965 static void test_divwe(void)
    966 {
    967    int divwe_type = DIV_BASE;
    968    if (do_OE)
    969       divwe_type |= DIV_OE;
    970    if (do_dot)
    971       divwe_type |= DIV_DOT;
    972 
    973    switch (divwe_type) {
    974       case 1:
    975         SET_CR_XER_ZERO;
    976          __asm__ __volatile__ ("divwe %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    977          GET_CR_XER(div_flags, div_xer);
    978          break;
    979       case 3:
    980         SET_CR_XER_ZERO;
    981          __asm__ __volatile__ ("divweo %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    982          GET_CR_XER(div_flags, div_xer);
    983          break;
    984       case 5:
    985         SET_CR_XER_ZERO;
    986          __asm__ __volatile__ ("divwe. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    987          GET_CR_XER(div_flags, div_xer);
    988          break;
    989       case 7:
    990         SET_CR_XER_ZERO;
    991          __asm__ __volatile__ ("divweo. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
    992          GET_CR_XER(div_flags, div_xer);
    993          break;
    994       default:
    995          fprintf(stderr, "Invalid divweu type. Exiting\n");
    996          exit(1);
    997    }
    998 }
    999 
   1000 
   1001 typedef struct simple_test {
   1002    test_func_t test_func;
   1003    char * name;
   1004    precision_type_t precision;
   1005 } simple_test_t;
   1006 
   1007 
   1008 static void setup_sp_fp_args(fp_test_args_t * targs, Bool swap_inputs)
   1009 {
   1010    int a_idx, b_idx, i;
   1011    void * inA, * inB;
   1012    void * vec_src = swap_inputs ? &vec_out : &vec_inB;
   1013 
   1014    for (i = 0; i < 4; i++) {
   1015       a_idx = targs->fra_idx;
   1016       b_idx = targs->frb_idx;
   1017       inA = (void *)&spec_sp_fargs[a_idx];
   1018       inB = (void *)&spec_sp_fargs[b_idx];
   1019       // copy single precision FP  into vector element i
   1020       memcpy(((void *)&vec_inA) + (i * 4), inA, 4);
   1021       memcpy(vec_src + (i * 4), inB, 4);
   1022       targs++;
   1023    }
   1024 }
   1025 
   1026 static void setup_dp_fp_args(fp_test_args_t * targs, Bool swap_inputs)
   1027 {
   1028    int a_idx, b_idx, i;
   1029    void * inA, * inB;
   1030    void * vec_src = swap_inputs ? (void *)&vec_out : (void *)&vec_inB;
   1031 
   1032    for (i = 0; i < 2; i++) {
   1033       a_idx = targs->fra_idx;
   1034       b_idx = targs->frb_idx;
   1035       inA = (void *)&spec_fargs[a_idx];
   1036       inB = (void *)&spec_fargs[b_idx];
   1037       // copy double precision FP  into vector element i
   1038       memcpy(((void *)&vec_inA) + (i * 8), inA, 8);
   1039       memcpy(vec_src + (i * 8), inB, 8);
   1040       targs++;
   1041    }
   1042 }
   1043 
   1044 #define VX_NOT_CMP_OP 0xffffffff
   1045 static void print_vector_fp_result(unsigned int cc, vx_fp_test_t * test_group, int i, Bool print_vec_out)
   1046 {
   1047    int a_idx, b_idx, k;
   1048    char * name = malloc(20);
   1049    int dp = test_group->precision == DOUBLE_TEST ? 1 : 0;
   1050    int loops = dp ? 2 : 4;
   1051    fp_test_args_t * targs = &test_group->targs[i];
   1052    unsigned long long * frA_dp, * frB_dp, * dst_dp;
   1053    unsigned int * frA_sp, *frB_sp, * dst_sp;
   1054    strcpy(name, test_group->name);
   1055    printf("#%d: %s%s ", dp? i/2 : i/4, name, (do_dot ? "." : ""));
   1056    for (k = 0; k < loops; k++) {
   1057       a_idx = targs->fra_idx;
   1058       b_idx = targs->frb_idx;
   1059       if (k)
   1060          printf(" AND ");
   1061       if (dp) {
   1062          frA_dp = (unsigned long long *)&spec_fargs[a_idx];
   1063          frB_dp = (unsigned long long *)&spec_fargs[b_idx];
   1064          printf("%016llx %s %016llx", *frA_dp, test_group->op, *frB_dp);
   1065       } else {
   1066          frA_sp = (unsigned int *)&spec_sp_fargs[a_idx];
   1067          frB_sp = (unsigned int *)&spec_sp_fargs[b_idx];
   1068          printf("%08x %s %08x", *frA_sp, test_group->op, *frB_sp);
   1069       }
   1070       targs++;
   1071    }
   1072    if (cc != VX_NOT_CMP_OP)
   1073       printf(" ? cc=%x", cc);
   1074 
   1075    if (print_vec_out) {
   1076       if (dp) {
   1077          dst_dp = (unsigned long long *) &vec_out;
   1078          printf(" => %016llx %016llx\n", dst_dp[0], dst_dp[1]);
   1079       } else {
   1080          dst_sp = (unsigned int *) &vec_out;
   1081          printf(" => %08x %08x %08x %08x\n", dst_sp[0], dst_sp[1], dst_sp[2], dst_sp[3]);
   1082       }
   1083    } else {
   1084       printf("\n");
   1085    }
   1086    free(name);
   1087 }
   1088 
   1089 
   1090 
/* Run every single-operand VSX test listed in vsx_one_fp_arg_tests.
 *
 * For each table entry the inputs are taken from spec_fargs (double
 * precision) or spec_sp_fargs (single precision), copied into vec_inB, the
 * test routine is called, and the inputs plus the contents of vec_out are
 * printed.  Estimate instructions print PASS/FAIL from check_estimate()
 * instead of the raw result.
 *
 * An instruction is treated as scalar when its name contains "xs"; scalar ops
 * consume one input per invocation, vector ops two (DP) or four (SP).
 */
static void test_vsx_one_fp_arg(void)
{
   test_func_t func;
   int k;
   k = 0;
   build_special_fargs_table();

   // Walk the table until the sentinel entry (NULL test_func).
   while ((func = vsx_one_fp_arg_tests[k].test_func)) {
      int idx, i;
      vx_fp_test_t test_group = vsx_one_fp_arg_tests[k];
      Bool estimate = (test_group.type == VX_ESTIMATE);
      Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
      Bool is_sqrt = (strstr(test_group.name, "sqrt")) ? True : False;
      Bool is_scalar = (strstr(test_group.name, "xs")) ? True : False;
      Bool sparse_sp = False;
      // stride: inputs consumed per invocation; loops: elements loaded/printed.
      int stride = dp ? 2 : 4;
      int loops = is_scalar ? 1 : stride;
      stride = is_scalar ? 1: stride;

      /* For conversions of single to double, the 128-bit input register is sparsely populated:
       *    |___ SP___|_Unused_|___SP___|__Unused__|   // for vector op
       *                     or
       *    |___ SP___|_Unused_|_Unused_|__Unused__|   // for scalar op
       *
       * For the vector op case, we need to adjust stride from '4' to '2', since
       * we'll only be loading two values per loop into the input register.
       */
      if (!dp && !is_scalar && test_group.type == VX_CONV_TO_DOUBLE) {
         sparse_sp = True;
         stride = 2;
      }

      for (i = 0; i < test_group.num_tests; i+=stride) {
         unsigned int * pv;
         // vecB_void_ptr is re-initialised on every iteration, so the
         // little-endian adjustments below do not accumulate.
         void * inB, * vecB_void_ptr = (void *)&vec_inB;

         pv = (unsigned int *)&vec_out;
         // clear vec_out
         for (idx = 0; idx < 4; idx++, pv++)
            *pv = 0;

         if (dp) {
            int j;
            unsigned long long * frB_dp, *dst_dp;
            for (j = 0; j < loops; j++) {
               inB = (void *)&spec_fargs[i + j];
               // copy double precision FP into vector element j
               // (scalar ops on little-endian: use the doubleword at byte offset 8;
               //  loops is 1 for scalar ops, so this bump happens once)
               if (isLE && is_scalar)
                  vecB_void_ptr += 8;
               memcpy(vecB_void_ptr + (j * 8), inB, 8);
            }
            // execute test insn
            (*func)();
            dst_dp = (unsigned long long *) &vec_out;
            // scalar result on little-endian is read from the second doubleword
            if (isLE && is_scalar)
               dst_dp++;
            printf("#%d: %s ", i/stride, test_group.name);
            for (j = 0; j < loops; j++) {
               if (j)
                  printf("; ");
               frB_dp = (unsigned long long *)&spec_fargs[i + j];
               printf("%s(%016llx)", test_group.op, *frB_dp);
               if (estimate) {
                  Bool res = check_estimate(DOUBLE_TEST, is_sqrt, i + j, (isLE && is_scalar) ? 1: j);
                  // NOTE(review): the trailing ')' is emitted verbatim; presumably the
                  // reference output depends on it -- confirm before changing.
                  printf(" ==> %s)", res ? "PASS" : "FAIL");
                  /* For debugging . . .
                   printf(" ==> %s (res=%016llx)", res ? "PASS" : "FAIL", dst_dp[j]);
                   */
               } else {
                  vx_fp_test_type type = test_group.type;
                  // Only the part of each doubleword selected by the mask is printed.
                  switch (type) {
                     case VX_SCALAR_CONV_TO_WORD:
                        printf(" = %016llx", dst_dp[j] & 0x00000000ffffffffULL);
                        break;
                     case VX_CONV_TO_SINGLE:
                        printf(" = %016llx", dst_dp[j] & 0xffffffff00000000ULL);
                        break;
                     default:  // For VX_CONV_TO_DOUBLE and non-convert instructions . . .
                        printf(" = %016llx", dst_dp[j]);
                  }
               }
            }
            printf("\n");
         } else {
            int j;
            unsigned int * frB_sp, * dst_sp = NULL;
            unsigned long long * dst_dp = NULL;
            // Sparse SP->DP vector conversions load only two values per invocation.
            if (sparse_sp)
               loops = 2;
            for (j = 0; j < loops; j++) {
               inB = (void *)&spec_sp_fargs[i + j];
               // copy single precision FP into vector element j
               if (sparse_sp) {
                  // one value per doubleword: byte offset 8*j, plus 4 on little-endian
                  if (isLE)
                     memcpy(vecB_void_ptr + ((2 * j * 4) + 4), inB, 4);
                  else
                     memcpy(vecB_void_ptr + ((2 * j * 4) ), inB, 4);
               } else {
                  // scalar ops on little-endian: use the word at byte offset 12
                  if (isLE && is_scalar)
                     vecB_void_ptr += 12;
                  memcpy(vecB_void_ptr + (j * 4), inB, 4);
               }
            }
            // execute test insn
            (*func)();
            // Choose how vec_out is viewed: doublewords for conversions to
            // double, words otherwise; scalar results on little-endian are
            // read from the last element.
            if (test_group.type == VX_CONV_TO_DOUBLE) {
               dst_dp = (unsigned long long *) &vec_out;
               if (isLE && is_scalar)
                  dst_dp++;
            } else {
               dst_sp = (unsigned int *) &vec_out;
               if (isLE && is_scalar)
                  dst_sp += 3;
            }
            // print result
            printf("#%d: %s ", i/stride, test_group.name);
            for (j = 0; j < loops; j++) {
               if (j)
                  printf("; ");
               frB_sp = (unsigned int *)&spec_sp_fargs[i + j];
               printf("%s(%08x)", test_group.op, *frB_sp);
               if (estimate) {
                  Bool res = check_estimate(SINGLE_TEST, is_sqrt, i + j, (isLE && is_scalar) ? 3 : j);
                  // NOTE(review): trailing ')' is emitted verbatim, as in the DP path.
                  printf(" ==> %s)", res ? "PASS" : "FAIL");
               } else {
                  if (test_group.type == VX_CONV_TO_DOUBLE)
                        printf(" = %016llx", dst_dp[j]);
                  else
                  /* Special case: Current VEX implementation for fsqrts (single precision)
                   * uses the same implementation as that used for double precision fsqrt.
                   * However, I've found that for xvsqrtsp, the result from that implementation
                   * may be off by the two LSBs.  Generally, even this small inaccuracy can cause the
                   * output to appear very different if you end up with a carry.  But for the given
                   * inputs in this testcase, we can simply mask out these bits.
                   */
                     printf(" = %08x", is_sqrt ? (dst_sp[j] & 0xfffffffc) : dst_sp[j]);
               }
            }
            printf("\n");
         }
      }
      k++;
      printf( "\n" );
   }
}
   1236 
   1237 static void test_int_to_fp_convert(void)
   1238 {
   1239    test_func_t func;
   1240    int k;
   1241    k = 0;
   1242 
   1243    while ((func = intToFp_tests[k].test_func)) {
   1244       int idx, i;
   1245       vx_intToFp_test_t test_group = intToFp_tests[k];
   1246       Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
   1247       Bool sparse_sp = False;
   1248       int stride = dp ? 2 : 4;
   1249       int loops = stride;
   1250 
   1251       /* For conversions of single to double, the 128-bit input register is sparsely populated:
   1252        *    |___ int___|_Unused_|___int___|__Unused__|   // for vector op
   1253        *                     or
   1254        * We need to adjust stride from '4' to '2', since we'll only be loading
   1255        * two values per loop into the input register.
   1256        */
   1257       if (!dp && test_group.type == VX_CONV_TO_DOUBLE) {
   1258          sparse_sp = True;
   1259          stride = 2;
   1260       }
   1261 
   1262       for (i = 0; i < test_group.num_tests; i+=stride) {
   1263          unsigned int * pv;
   1264          void * inB;
   1265 
   1266          pv = (unsigned int *)&vec_out;
   1267          // clear vec_out
   1268          for (idx = 0; idx < 4; idx++, pv++)
   1269             *pv = 0;
   1270 
   1271          if (dp) {
   1272             int j;
   1273             unsigned long long  *dst_dw, * targs = test_group.targs;
   1274             for (j = 0; j < loops; j++) {
   1275                inB = (void *)&targs[i + j];
   1276                // copy doubleword into vector element i
   1277                memcpy(((void *)&vec_inB) + (j * 8), inB, 8);
   1278             }
   1279             // execute test insn
   1280             (*func)();
   1281             dst_dw = (unsigned long long *) &vec_out;
   1282             printf("#%d: %s ", i/stride, test_group.name);
   1283             for (j = 0; j < loops; j++) {
   1284                if (j)
   1285                   printf("; ");
   1286                printf("conv(%016llx)", targs[i + j]);
   1287 
   1288                if (test_group.type == VX_CONV_TO_SINGLE)
   1289                   printf(" = %016llx", dst_dw[j] & 0xffffffff00000000ULL);
   1290                else
   1291                   printf(" = %016llx", dst_dw[j]);
   1292             }
   1293             printf("\n");
   1294          } else {
   1295             int j;
   1296             unsigned int * dst_sp = NULL;
   1297             unsigned int * targs = test_group.targs;
   1298             unsigned long long * dst_dp = NULL;
   1299             void * vecB_void_ptr = (void *)&vec_inB;
   1300             if (sparse_sp)
   1301                loops = 2;
   1302             for (j = 0; j < loops; j++) {
   1303                inB = (void *)&targs[i + j];
   1304                // copy single word into vector element i
   1305                if (sparse_sp) {
   1306                   if (isLE)
   1307                      memcpy(vecB_void_ptr + ((2 * j * 4) + 4), inB, 4);
   1308                   else
   1309                      memcpy(vecB_void_ptr + ((2 * j * 4) ), inB, 4);
   1310                } else {
   1311                   memcpy(vecB_void_ptr + (j * 4), inB, 4);
   1312                }
   1313             }
   1314             // execute test insn
   1315             (*func)();
   1316             if (test_group.type == VX_CONV_TO_DOUBLE)
   1317                dst_dp = (unsigned long long *) &vec_out;
   1318             else
   1319                dst_sp = (unsigned int *) &vec_out;
   1320             // print result
   1321             printf("#%d: %s ", i/stride, test_group.name);
   1322             for (j = 0; j < loops; j++) {
   1323                if (j)
   1324                   printf("; ");
   1325                printf("conv(%08x)", targs[i + j]);
   1326                if (test_group.type == VX_CONV_TO_DOUBLE)
   1327                   printf(" = %016llx", dst_dp[j]);
   1328                else
   1329                   printf(" = %08x", dst_sp[j]);
   1330             }
   1331             printf("\n");
   1332          }
   1333       }
   1334       k++;
   1335       printf( "\n" );
   1336    }
   1337 }
   1338 
   1339 
   1340 
   1341 // The div doubleword test data
   1342 signed long long div_dw_tdata[13][2] = {
   1343                                        { 4, -4 },
   1344                                        { 4, -3 },
   1345                                        { 4, 4 },
   1346                                        { 4, -5 },
   1347                                        { 3, 8 },
   1348                                        { 0x8000000000000000ULL, 0xa },
   1349                                        { 0x50c, -1 },
   1350                                        { 0x50c, -4096 },
   1351                                        { 0x1234fedc, 0x8000a873 },
   1352                                        { 0xabcd87651234fedcULL, 0xa123b893 },
   1353                                        { 0x123456789abdcULL, 0 },
   1354                                        { 0, 2 },
   1355                                        { 0x77, 0xa3499 }
   1356 };
   1357 #define dw_tdata_len (sizeof(div_dw_tdata)/sizeof(signed long long)/2)
   1358 
   1359 // The div word test data
   1360 unsigned int div_w_tdata[6][2] = {
   1361                               { 0, 2 },
   1362                               { 2, 0 },
   1363                               { 0x7abc1234, 0xf0000000 },
   1364                               { 0xfabc1234, 5 },
   1365                               { 77, 66 },
   1366                               { 5, 0xfabc1234 },
   1367 };
   1368 #define w_tdata_len (sizeof(div_w_tdata)/sizeof(unsigned int)/2)
   1369 
   1370 typedef struct div_ext_test
   1371 {
   1372    test_func_t test_func;
   1373    const char *name;
   1374    int num_tests;
   1375    div_type_t div_type;
   1376    precision_type_t precision;
   1377 } div_ext_test_t;
   1378 
   1379 static div_ext_test_t div_tests[] = {
   1380 #ifdef __powerpc64__
   1381                                    { &test_divdeu, "divdeu", dw_tdata_len, DIV_BASE, DOUBLE_TEST },
   1382                                    { &test_divdeu, "divdeuo", dw_tdata_len, DIV_OE, DOUBLE_TEST },
   1383 #endif
   1384                                    { &test_divwe, "divwe", w_tdata_len, DIV_BASE, SINGLE_TEST },
   1385                                    { &test_divwe, "divweo", w_tdata_len, DIV_OE, SINGLE_TEST },
   1386                                    { NULL, NULL, 0, 0, 0 }
   1387 };
   1388 
   1389 static void test_div_extensions(void)
   1390 {
   1391    test_func_t func;
   1392    int k;
   1393    k = 0;
   1394 
   1395    while ((func = div_tests[k].test_func)) {
   1396       int i, repeat = 1;
   1397       div_ext_test_t test_group = div_tests[k];
   1398       do_dot = False;
   1399 
   1400 again:
   1401       for (i = 0; i < test_group.num_tests; i++) {
   1402          unsigned int condreg;
   1403 
   1404          if (test_group.div_type == DIV_OE)
   1405             do_OE = True;
   1406          else
   1407             do_OE = False;
   1408 
   1409          if (test_group.precision == DOUBLE_TEST) {
   1410             r14 = div_dw_tdata[i][0];
   1411             r15 = div_dw_tdata[i][1];
   1412          } else {
   1413             r14 = div_w_tdata[i][0];
   1414             r15 = div_w_tdata[i][1];
   1415          }
   1416          // execute test insn
   1417          (*func)();
   1418          condreg = (div_flags & 0xf0000000) >> 28;
   1419          printf("#%d: %s%s: ", i, test_group.name, do_dot ? "." : "");
   1420          if (test_group.precision == DOUBLE_TEST) {
   1421             printf("0x%016llx0000000000000000 / 0x%016llx = 0x%016llx;",
   1422                    div_dw_tdata[i][0], div_dw_tdata[i][1], (signed long long) r17);
   1423          } else {
   1424             printf("0x%08x00000000 / 0x%08x = 0x%08x;",
   1425                    div_w_tdata[i][0], div_w_tdata[i][1], (unsigned int) r17);
   1426          }
   1427          printf(" CR=%x; XER=%x\n", condreg, div_xer);
   1428       }
   1429       printf("\n");
   1430       if (repeat) {
   1431          repeat = 0;
   1432          do_dot = True;
   1433          goto again;
   1434       }
   1435       k++;
   1436       printf( "\n" );
   1437    }
   1438 }
   1439 
   1440 
/* Run every test in vx_tdivORtsqrt_tests and print the condition-register
 * field each instruction produces.
 *
 * Entries with a non-NULL targs table are two-operand tests: their inputs are
 * loaded through setup_dp_fp_args()/setup_sp_fp_args() and printed through
 * print_vector_fp_result().  The remaining entries are one-operand tests whose
 * inputs are copied directly from spec_fargs / spec_sp_fargs into vec_inB.
 *
 * An instruction is treated as scalar when its name contains "xs"; scalar ops
 * consume one input per invocation, vector ops two (DP) or four (SP).
 */
static void test_vx_tdivORtsqrt(void)
{
   test_func_t func;
   int k, crx;
   unsigned int flags;
   k = 0;
   do_dot = False;
   build_special_fargs_table();

   // Walk the table until the sentinel entry (NULL test_func).
   while ((func = vx_tdivORtsqrt_tests[k].test_func)) {
      int idx, i;
      vx_fp_test_t test_group = vx_tdivORtsqrt_tests[k];
      Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
      Bool is_scalar = (strstr(test_group.name, "xs")) ? True : False;
      Bool two_args = test_group.targs ?  True : False;
      // stride: inputs consumed per invocation; loops: elements loaded/printed.
      int stride = dp ? 2 : 4;
      int loops = is_scalar ? 1 : stride;
      stride = is_scalar ? 1: stride;

      for (i = 0; i < test_group.num_tests; i+=stride) {
         unsigned int * pv;
         // vecB_void_ptr is re-initialised on every iteration, so the
         // little-endian adjustment below does not accumulate.
         void * inB, * vecB_void_ptr = (void *)&vec_inB;

         pv = (unsigned int *)&vec_out;
         // clear vec_out
         for (idx = 0; idx < 4; idx++, pv++)
            *pv = 0;

         if (dp) {
            int j;
            unsigned long long * frB_dp;
            if (two_args) {
               setup_dp_fp_args(&test_group.targs[i], False);
            } else {
               for (j = 0; j < loops; j++) {
                  inB = (void *)&spec_fargs[i + j];
                  // copy double precision FP into vector element j
                  // (scalar ops on little-endian: use the doubleword at byte offset 8)
                  if (isLE && is_scalar)
                     vecB_void_ptr += 8;
                  memcpy(vecB_void_ptr + (j * 8), inB, 8);
               }
            }
            // execute test insn
            // Must do set/get of CRs immediately before/after calling the asm func
            // to avoid CRs being modified by other instructions.
            SET_FPSCR_ZERO;
            SET_CR_XER_ZERO;
            (*func)();
            GET_CR(flags);
            // assumes using CR1
            // (bits selected by 0x0f000000 are shifted down to a 4-bit value)
            crx = (flags & 0x0f000000) >> 24;
            if (two_args) {
               print_vector_fp_result(crx, &test_group, i, False/*do not print vec_out*/);
            } else {
               printf("#%d: %s ", i/stride, test_group.name);
               for (j = 0; j < loops; j++) {
                  if (j)
                     printf("; ");
                  frB_dp = (unsigned long long *)&spec_fargs[i + j];
                  printf("%s(%016llx)", test_group.op, *frB_dp);
               }
               printf( " ? %x (CRx)\n", crx);
            }
         } else {
            int j;
            unsigned int * frB_sp;
            if (two_args) {
               setup_sp_fp_args(&test_group.targs[i], False);
            } else {
               // NOTE(review): unlike the DP path there is no little-endian
               // scalar adjustment here; the table visible in this file has no
               // scalar single-precision entry -- confirm if one is added.
               for (j = 0; j < loops; j++) {
                  inB = (void *)&spec_sp_fargs[i + j];
                  // copy single precision FP into vector element j
                  memcpy(((void *)&vec_inB) + (j * 4), inB, 4);
               }
            }
            // execute test insn
            // (same set / call / get ordering as in the DP path above)
            SET_FPSCR_ZERO;
            SET_CR_XER_ZERO;
            (*func)();
            GET_CR(flags);
            crx = (flags & 0x0f000000) >> 24;
            // print result
            if (two_args) {
               print_vector_fp_result(crx, &test_group, i, False/*do not print vec_out*/);
            } else {
               printf("#%d: %s ", i/stride, test_group.name);
               for (j = 0; j < loops; j++) {
                  if (j)
                     printf("; ");
                  frB_sp = (unsigned int *)&spec_sp_fargs[i + j];
                  printf("%s(%08x)", test_group.op, *frB_sp);
               }
               printf( " ? %x (CRx)\n", crx);
            }
         }
      }
      k++;
      printf( "\n" );
   }
}
   1541 
   1542 
   1543 static void test_ftsqrt(void)
   1544 {
   1545    int i, crx;
   1546    unsigned int flags;
   1547    unsigned long long * frbp;
   1548    build_special_fargs_table();
   1549 
   1550 
   1551    for (i = 0; i < nb_special_fargs; i++) {
   1552       f14 = spec_fargs[i];
   1553       frbp = (unsigned long long *)&spec_fargs[i];
   1554       SET_FPSCR_ZERO;
   1555       SET_CR_XER_ZERO;
   1556       __asm__ __volatile__ ("ftsqrt           cr1, %0" : : "d" (f14));
   1557       GET_CR(flags);
   1558       crx = (flags & 0x0f000000) >> 24;
   1559       printf( "ftsqrt: %016llx ? %x (CRx)\n", *frbp, crx);
   1560    }
   1561    printf( "\n" );
   1562 }
   1563 
   1564 static void
   1565 test_popcntw(void)
   1566 {
   1567 #ifdef __powerpc64__
   1568    uint64_t res;
   1569    unsigned long long src = 0x9182736405504536ULL;
   1570    r14 = src;
   1571    __asm__ __volatile__ ("popcntw          %0, %1" : "=r" (res): "r" (r14));
   1572    printf("popcntw: 0x%llx => 0x%016llx\n", (unsigned long long)src, (unsigned long long)res);
   1573 #else
   1574    uint32_t res;
   1575    unsigned int src = 0x9182730E;
   1576    r14 = src;
   1577    __asm__ __volatile__ ("popcntw          %0, %1" : "=r" (res): "r" (r14));
   1578    printf("popcntw: 0x%x => 0x%08x\n", src, (int)res);
   1579 #endif
   1580    printf( "\n" );
   1581 }
   1582 
   1583 
   1584 static test_table_t
   1585          all_tests[] =
   1586 {
   1587 
   1588                     { &test_vsx_one_fp_arg,
   1589                       "Test VSX vector and scalar single argument instructions"} ,
   1590                     { &test_int_to_fp_convert,
   1591                       "Test VSX vector integer to float conversion instructions" },
   1592                     { &test_div_extensions,
   1593                        "Test div extensions" },
   1594                     { &test_ftsqrt,
   1595                        "Test ftsqrt instruction" },
   1596                     { &test_vx_tdivORtsqrt,
   1597                        "Test vector and scalar tdiv and tsqrt instructions" },
   1598                     { &test_popcntw,
   1599                        "Test popcntw instruction" },
   1600                     { NULL, NULL }
   1601 };
   1602 #endif // HAS_VSX
   1603 
   1604 int main(int argc, char *argv[])
   1605 {
   1606 #ifdef HAS_VSX
   1607 
   1608    test_table_t aTest;
   1609    test_func_t func;
   1610    int i = 0;
   1611 
   1612    while ((func = all_tests[i].test_category)) {
   1613       aTest = all_tests[i];
   1614       printf( "%s\n", aTest.name );
   1615       (*func)();
   1616       i++;
   1617    }
   1618    if (spec_fargs)
   1619      free(spec_fargs);
   1620    if (spec_sp_fargs)
   1621      free(spec_sp_fargs);
   1622 
   1623 #endif // HAS _VSX
   1624 
   1625    return 0;
   1626 }
   1627