Home | History | Annotate | Download | only in libmpdec
      1 /*
      2  * Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  *
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
     16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     25  * SUCH DAMAGE.
     26  */
     27 
     28 
     29 #ifndef UMODARITH_H
     30 #define UMODARITH_H
     31 
     32 
     33 #include "constants.h"
     34 #include "mpdecimal.h"
     35 #include "typearith.h"
     36 
     37 
     38 /* Bignum: Low level routines for unsigned modular arithmetic. These are
     39    used in the fast convolution functions for very large coefficients. */
     40 
     41 
     42 /**************************************************************************/
     43 /*                        ANSI modular arithmetic                         */
     44 /**************************************************************************/
     45 
     46 
     47 /*
     48  * Restrictions: a < m and b < m
     49  * ACL2 proof: umodarith.lisp: addmod-correct
     50  */
     51 static inline mpd_uint_t
     52 addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     53 {
     54     mpd_uint_t s;
     55 
     56     s = a + b;
     57     s = (s < a) ? s - m : s;
     58     s = (s >= m) ? s - m : s;
     59 
     60     return s;
     61 }
     62 
     63 /*
     64  * Restrictions: a < m and b < m
     65  * ACL2 proof: umodarith.lisp: submod-2-correct
     66  */
     67 static inline mpd_uint_t
     68 submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     69 {
     70     mpd_uint_t d;
     71 
     72     d = a - b;
     73     d = (a < b) ? d + m : d;
     74 
     75     return d;
     76 }
     77 
     78 /*
     79  * Restrictions: a < 2m and b < 2m
     80  * ACL2 proof: umodarith.lisp: section ext-submod
     81  */
     82 static inline mpd_uint_t
     83 ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
     84 {
     85     mpd_uint_t d;
     86 
     87     a = (a >= m) ? a - m : a;
     88     b = (b >= m) ? b - m : b;
     89 
     90     d = a - b;
     91     d = (a < b) ? d + m : d;
     92 
     93     return d;
     94 }
     95 
     96 /*
     97  * Reduce double word modulo m.
     98  * Restrictions: m != 0
     99  * ACL2 proof: umodarith.lisp: section dw-reduce
    100  */
    101 static inline mpd_uint_t
    102 dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
    103 {
    104     mpd_uint_t r1, r2, w;
    105 
    106     _mpd_div_word(&w, &r1, hi, m);
    107     _mpd_div_words(&w, &r2, r1, lo, m);
    108 
    109     return r2;
    110 }
    111 
    112 /*
    113  * Subtract double word from a.
    114  * Restrictions: a < m
    115  * ACL2 proof: umodarith.lisp: section dw-submod
    116  */
    117 static inline mpd_uint_t
    118 dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
    119 {
    120     mpd_uint_t d, r;
    121 
    122     r = dw_reduce(hi, lo, m);
    123     d = a - r;
    124     d = (a < r) ? d + m : d;
    125 
    126     return d;
    127 }
    128 
    129 #ifdef CONFIG_64
    130 
    131 /**************************************************************************/
    132 /*                        64-bit modular arithmetic                       */
    133 /**************************************************************************/
    134 
    135 /*
    136  * A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
    137  * proof is in umodarith.lisp: section "Fast modular reduction".
    138  *
    139  * Algorithm: calculate (a * b) % p:
    140  *
    141  *   a) hi, lo <- a * b       # Calculate a * b.
    142  *
    143  *   b) hi, lo <-  R(hi, lo)  # Reduce modulo p.
    144  *
    145  *   c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
    146  *
    147  *   d) If the result is less than p, return lo. Otherwise return lo - p.
    148  */
    149 
    150 static inline mpd_uint_t
    151 x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
    152 {
    153     mpd_uint_t hi, lo, x, y;
    154 
    155 
    156     _mpd_mul_words(&hi, &lo, a, b);
    157 
    158     if (m & (1ULL<<32)) { /* P1 */
    159 
    160         /* first reduction */
    161         x = y = hi;
    162         hi >>= 32;
    163 
    164         x = lo - x;
    165         if (x > lo) hi--;
    166 
    167         y <<= 32;
    168         lo = y + x;
    169         if (lo < y) hi++;
    170 
    171         /* second reduction */
    172         x = y = hi;
    173         hi >>= 32;
    174 
    175         x = lo - x;
    176         if (x > lo) hi--;
    177 
    178         y <<= 32;
    179         lo = y + x;
    180         if (lo < y) hi++;
    181 
    182         return (hi || lo >= m ? lo - m : lo);
    183     }
    184     else if (m & (1ULL<<34)) { /* P2 */
    185 
    186         /* first reduction */
    187         x = y = hi;
    188         hi >>= 30;
    189 
    190         x = lo - x;
    191         if (x > lo) hi--;
    192 
    193         y <<= 34;
    194         lo = y + x;
    195         if (lo < y) hi++;
    196 
    197         /* second reduction */
    198         x = y = hi;
    199         hi >>= 30;
    200 
    201         x = lo - x;
    202         if (x > lo) hi--;
    203 
    204         y <<= 34;
    205         lo = y + x;
    206         if (lo < y) hi++;
    207 
    208         /* third reduction */
    209         x = y = hi;
    210         hi >>= 30;
    211 
    212         x = lo - x;
    213         if (x > lo) hi--;
    214 
    215         y <<= 34;
    216         lo = y + x;
    217         if (lo < y) hi++;
    218 
    219         return (hi || lo >= m ? lo - m : lo);
    220     }
    221     else { /* P3 */
    222 
    223         /* first reduction */
    224         x = y = hi;
    225         hi >>= 24;
    226 
    227         x = lo - x;
    228         if (x > lo) hi--;
    229 
    230         y <<= 40;
    231         lo = y + x;
    232         if (lo < y) hi++;
    233 
    234         /* second reduction */
    235         x = y = hi;
    236         hi >>= 24;
    237 
    238         x = lo - x;
    239         if (x > lo) hi--;
    240 
    241         y <<= 40;
    242         lo = y + x;
    243         if (lo < y) hi++;
    244 
    245         /* third reduction */
    246         x = y = hi;
    247         hi >>= 24;
    248 
    249         x = lo - x;
    250         if (x > lo) hi--;
    251 
    252         y <<= 40;
    253         lo = y + x;
    254         if (lo < y) hi++;
    255 
    256         return (hi || lo >= m ? lo - m : lo);
    257     }
    258 }
    259 
    260 static inline void
    261 x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
    262 {
    263     *a = x64_mulmod(*a, w, m);
    264     *b = x64_mulmod(*b, w, m);
    265 }
    266 
    267 static inline void
    268 x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
    269             mpd_uint_t m)
    270 {
    271     *a0 = x64_mulmod(*a0, b0, m);
    272     *a1 = x64_mulmod(*a1, b1, m);
    273 }
    274 
    275 static inline mpd_uint_t
    276 x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
    277 {
    278     mpd_uint_t r = 1;
    279 
    280     while (exp > 0) {
    281         if (exp & 1)
    282             r = x64_mulmod(r, base, umod);
    283         base = x64_mulmod(base, base, umod);
    284         exp >>= 1;
    285     }
    286 
    287     return r;
    288 }
    289 
    290 /* END CONFIG_64 */
    291 #else /* CONFIG_32 */
    292 
    293 
    294 /**************************************************************************/
    295 /*                        32-bit modular arithmetic                       */
    296 /**************************************************************************/
    297 
    298 #if defined(ANSI)
    299 #if !defined(LEGACY_COMPILER)
    300 /* HAVE_UINT64_T */
    301 static inline mpd_uint_t
    302 std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
    303 {
    304     return ((mpd_uuint_t) a * b) % m;
    305 }
    306 
    307 static inline void
    308 std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
    309 {
    310     *a = ((mpd_uuint_t) *a * w) % m;
    311     *b = ((mpd_uuint_t) *b * w) % m;
    312 }
    313 
    314 static inline void
    315 std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
    316             mpd_uint_t m)
    317 {
    318     *a0 = ((mpd_uuint_t) *a0 * b0) % m;
    319     *a1 = ((mpd_uuint_t) *a1 * b1) % m;
    320 }
    321 /* END HAVE_UINT64_T */
    322 #else
    323 /* LEGACY_COMPILER */
    324 static inline mpd_uint_t
    325 std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
    326 {
    327     mpd_uint_t hi, lo, q, r;
    328     _mpd_mul_words(&hi, &lo, a, b);
    329     _mpd_div_words(&q, &r, hi, lo, m);
    330     return r;
    331 }
    332 
    333 static inline void
    334 std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
    335 {
    336     *a = std_mulmod(*a, w, m);
    337     *b = std_mulmod(*b, w, m);
    338 }
    339 
    340 static inline void
    341 std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
    342             mpd_uint_t m)
    343 {
    344     *a0 = std_mulmod(*a0, b0, m);
    345     *a1 = std_mulmod(*a1, b1, m);
    346 }
    347 /* END LEGACY_COMPILER */
    348 #endif
    349 
    350 static inline mpd_uint_t
    351 std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
    352 {
    353     mpd_uint_t r = 1;
    354 
    355     while (exp > 0) {
    356         if (exp & 1)
    357             r = std_mulmod(r, base, umod);
    358         base = std_mulmod(base, base, umod);
    359         exp >>= 1;
    360     }
    361 
    362     return r;
    363 }
    364 #endif /* ANSI CONFIG_32 */
    365 
    366 
    367 /**************************************************************************/
    368 /*                    Pentium Pro modular arithmetic                      */
    369 /**************************************************************************/
    370 
    371 /*
    372  * A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
    373  * control word must be set to 64-bit precision and truncation mode
    374  * prior to using these functions.
    375  *
    376  * Algorithm: calculate (a * b) % p:
    377  *
    378  *   p    := prime < 2**31
    379  *   pinv := (long double)1.0 / p (precalculated)
    380  *
    381  *   a) n = a * b              # Calculate exact product.
    382  *   b) qest = n * pinv        # Calculate estimate for q = n / p.
    383  *   c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
    384  *   d) r = n - q * p          # Calculate remainder.
    385  *
    386  * Remarks:
    387  *
    388  *   - p = dmod and pinv = dinvmod.
    389  *   - dinvmod points to an array of three uint32_t, which is interpreted
    390  *     as an 80 bit long double by fldt.
    391  *   - Intel compilers prior to version 11 do not seem to handle the
    392  *     __GNUC__ inline assembly correctly.
    393  *   - random tests are provided in tests/extended/ppro_mulmod.c
    394  */
    395 
    396 #if defined(PPRO)
    397 #if defined(ASM)
    398 
    399 /* Return (a * b) % dmod */
    400 static inline mpd_uint_t
    401 ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
    402 {
    403     mpd_uint_t retval;
    404 
    405     __asm__ (
    406             "fildl  %2\n\t"
    407             "fildl  %1\n\t"
    408             "fmulp  %%st, %%st(1)\n\t"
    409             "fldt   (%4)\n\t"
    410             "fmul   %%st(1), %%st\n\t"
    411             "flds   %5\n\t"
    412             "fadd   %%st, %%st(1)\n\t"
    413             "fsubrp %%st, %%st(1)\n\t"
    414             "fldl   (%3)\n\t"
    415             "fmulp  %%st, %%st(1)\n\t"
    416             "fsubrp %%st, %%st(1)\n\t"
    417             "fistpl %0\n\t"
    418             : "=m" (retval)
    419             : "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
    420             : "st", "memory"
    421     );
    422 
    423     return retval;
    424 }
    425 
    426 /*
    427  * Two modular multiplications in parallel:
    428  *      *a0 = (*a0 * w) % dmod
    429  *      *a1 = (*a1 * w) % dmod
    430  */
    431 static inline void
    432 ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
    433               double *dmod, uint32_t *dinvmod)
    434 {
    435     __asm__ (
    436             "fildl  %2\n\t"
    437             "fildl  (%1)\n\t"
    438             "fmul   %%st(1), %%st\n\t"
    439             "fxch   %%st(1)\n\t"
    440             "fildl  (%0)\n\t"
    441             "fmulp  %%st, %%st(1) \n\t"
    442             "fldt   (%4)\n\t"
    443             "flds   %5\n\t"
    444             "fld    %%st(2)\n\t"
    445             "fmul   %%st(2)\n\t"
    446             "fadd   %%st(1)\n\t"
    447             "fsub   %%st(1)\n\t"
    448             "fmull  (%3)\n\t"
    449             "fsubrp %%st, %%st(3)\n\t"
    450             "fxch   %%st(2)\n\t"
    451             "fistpl (%0)\n\t"
    452             "fmul   %%st(2)\n\t"
    453             "fadd   %%st(1)\n\t"
    454             "fsubp  %%st, %%st(1)\n\t"
    455             "fmull  (%3)\n\t"
    456             "fsubrp %%st, %%st(1)\n\t"
    457             "fistpl (%1)\n\t"
    458             : : "r" (a0), "r" (a1), "m" (w),
    459                 "r" (dmod), "r" (dinvmod),
    460                 "m" (MPD_TWO63)
    461             : "st", "memory"
    462     );
    463 }
    464 
    465 /*
    466  * Two modular multiplications in parallel:
    467  *      *a0 = (*a0 * b0) % dmod
    468  *      *a1 = (*a1 * b1) % dmod
    469  */
    470 static inline void
    471 ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
    472              double *dmod, uint32_t *dinvmod)
    473 {
    474     __asm__ (
    475             "fildl  %3\n\t"
    476             "fildl  (%2)\n\t"
    477             "fmulp  %%st, %%st(1)\n\t"
    478             "fildl  %1\n\t"
    479             "fildl  (%0)\n\t"
    480             "fmulp  %%st, %%st(1)\n\t"
    481             "fldt   (%5)\n\t"
    482             "fld    %%st(2)\n\t"
    483             "fmul   %%st(1), %%st\n\t"
    484             "fxch   %%st(1)\n\t"
    485             "fmul   %%st(2), %%st\n\t"
    486             "flds   %6\n\t"
    487             "fldl   (%4)\n\t"
    488             "fxch   %%st(3)\n\t"
    489             "fadd   %%st(1), %%st\n\t"
    490             "fxch   %%st(2)\n\t"
    491             "fadd   %%st(1), %%st\n\t"
    492             "fxch   %%st(2)\n\t"
    493             "fsub   %%st(1), %%st\n\t"
    494             "fxch   %%st(2)\n\t"
    495             "fsubp  %%st, %%st(1)\n\t"
    496             "fxch   %%st(1)\n\t"
    497             "fmul   %%st(2), %%st\n\t"
    498             "fxch   %%st(1)\n\t"
    499             "fmulp  %%st, %%st(2)\n\t"
    500             "fsubrp %%st, %%st(3)\n\t"
    501             "fsubrp %%st, %%st(1)\n\t"
    502             "fxch   %%st(1)\n\t"
    503             "fistpl (%2)\n\t"
    504             "fistpl (%0)\n\t"
    505             : : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
    506                 "r" (dmod), "r" (dinvmod),
    507                 "m" (MPD_TWO63)
    508             : "st", "memory"
    509     );
    510 }
    511 /* END PPRO GCC ASM */
    512 #elif defined(MASM)
    513 
    514 /* Return (a * b) % dmod */
    515 static inline mpd_uint_t __cdecl
    516 ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
    517 {
    518     mpd_uint_t retval;
    519 
    520     __asm {
    521         mov     eax, dinvmod
    522         mov     edx, dmod
    523         fild    b
    524         fild    a
    525         fmulp   st(1), st
    526         fld     TBYTE PTR [eax]
    527         fmul    st, st(1)
    528         fld     MPD_TWO63
    529         fadd    st(1), st
    530         fsubp   st(1), st
    531         fld     QWORD PTR [edx]
    532         fmulp   st(1), st
    533         fsubp   st(1), st
    534         fistp   retval
    535     }
    536 
    537     return retval;
    538 }
    539 
    540 /*
    541  * Two modular multiplications in parallel:
    542  *      *a0 = (*a0 * w) % dmod
    543  *      *a1 = (*a1 * w) % dmod
    544  */
    545 static inline mpd_uint_t __cdecl
    546 ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
    547               double *dmod, uint32_t *dinvmod)
    548 {
    549     __asm {
    550         mov     ecx, dmod
    551         mov     edx, a1
    552         mov     ebx, dinvmod
    553         mov     eax, a0
    554         fild    w
    555         fild    DWORD PTR [edx]
    556         fmul    st, st(1)
    557         fxch    st(1)
    558         fild    DWORD PTR [eax]
    559         fmulp   st(1), st
    560         fld     TBYTE PTR [ebx]
    561         fld     MPD_TWO63
    562         fld     st(2)
    563         fmul    st, st(2)
    564         fadd    st, st(1)
    565         fsub    st, st(1)
    566         fmul    QWORD PTR [ecx]
    567         fsubp   st(3), st
    568         fxch    st(2)
    569         fistp   DWORD PTR [eax]
    570         fmul    st, st(2)
    571         fadd    st, st(1)
    572         fsubrp  st(1), st
    573         fmul    QWORD PTR [ecx]
    574         fsubp   st(1), st
    575         fistp   DWORD PTR [edx]
    576     }
    577 }
    578 
    579 /*
    580  * Two modular multiplications in parallel:
    581  *      *a0 = (*a0 * b0) % dmod
    582  *      *a1 = (*a1 * b1) % dmod
    583  */
    584 static inline void __cdecl
    585 ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
    586              double *dmod, uint32_t *dinvmod)
    587 {
    588     __asm {
    589         mov     ecx, dmod
    590         mov     edx, a1
    591         mov     ebx, dinvmod
    592         mov     eax, a0
    593         fild    b1
    594         fild    DWORD PTR [edx]
    595         fmulp   st(1), st
    596         fild    b0
    597         fild    DWORD PTR [eax]
    598         fmulp   st(1), st
    599         fld     TBYTE PTR [ebx]
    600         fld     st(2)
    601         fmul    st, st(1)
    602         fxch    st(1)
    603         fmul    st, st(2)
    604         fld     DWORD PTR MPD_TWO63
    605         fld     QWORD PTR [ecx]
    606         fxch    st(3)
    607         fadd    st, st(1)
    608         fxch    st(2)
    609         fadd    st, st(1)
    610         fxch    st(2)
    611         fsub    st, st(1)
    612         fxch    st(2)
    613         fsubrp  st(1), st
    614         fxch    st(1)
    615         fmul    st, st(2)
    616         fxch    st(1)
    617         fmulp   st(2), st
    618         fsubp   st(3), st
    619         fsubp   st(1), st
    620         fxch    st(1)
    621         fistp   DWORD PTR [edx]
    622         fistp   DWORD PTR [eax]
    623     }
    624 }
    625 #endif /* PPRO MASM (_MSC_VER) */
    626 
    627 
    628 /* Return (base ** exp) % dmod */
    629 static inline mpd_uint_t
    630 ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
    631 {
    632     mpd_uint_t r = 1;
    633 
    634     while (exp > 0) {
    635         if (exp & 1)
    636             r = ppro_mulmod(r, base, dmod, dinvmod);
    637         base = ppro_mulmod(base, base, dmod, dinvmod);
    638         exp >>= 1;
    639     }
    640 
    641     return r;
    642 }
    643 #endif /* PPRO */
    644 #endif /* CONFIG_32 */
    645 
    646 
    647 #endif /* UMODARITH_H */
    648 
    649 
    650 
    651