1 /* Copyright (C) 1995-1998 Eric Young (eay (at) cryptsoft.com) 2 * All rights reserved. 3 * 4 * This package is an SSL implementation written 5 * by Eric Young (eay (at) cryptsoft.com). 6 * The implementation was written so as to conform with Netscapes SSL. 7 * 8 * This library is free for commercial and non-commercial use as long as 9 * the following conditions are aheared to. The following conditions 10 * apply to all code found in this distribution, be it the RC4, RSA, 11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation 12 * included with this distribution is covered by the same copyright terms 13 * except that the holder is Tim Hudson (tjh (at) cryptsoft.com). 14 * 15 * Copyright remains Eric Young's, and as such any Copyright notices in 16 * the code are not to be removed. 17 * If this package is used in a product, Eric Young should be given attribution 18 * as the author of the parts of the library used. 19 * This can be in the form of a textual message at program startup or 20 * in documentation (online or textual) provided with the package. 21 * 22 * Redistribution and use in source and binary forms, with or without 23 * modification, are permitted provided that the following conditions 24 * are met: 25 * 1. Redistributions of source code must retain the copyright 26 * notice, this list of conditions and the following disclaimer. 27 * 2. Redistributions in binary form must reproduce the above copyright 28 * notice, this list of conditions and the following disclaimer in the 29 * documentation and/or other materials provided with the distribution. 30 * 3. All advertising materials mentioning features or use of this software 31 * must display the following acknowledgement: 32 * "This product includes cryptographic software written by 33 * Eric Young (eay (at) cryptsoft.com)" 34 * The word 'cryptographic' can be left out if the rouines from the library 35 * being used are not cryptographic related :-). 36 * 4. If you include any Windows specific code (or a derivative thereof) from 37 * the apps directory (application code) you must include an acknowledgement: 38 * "This product includes software written by Tim Hudson (tjh (at) cryptsoft.com)" 39 * 40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 * 52 * The licence and distribution terms for any publically available version or 53 * derivative of this code cannot be changed. i.e. this code cannot simply be 54 * copied and put under another distribution licence 55 * [including the GNU Public Licence.] 56 */ 57 /* ==================================================================== 58 * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. 59 * 60 * Redistribution and use in source and binary forms, with or without 61 * modification, are permitted provided that the following conditions 62 * are met: 63 * 64 * 1. Redistributions of source code must retain the above copyright 65 * notice, this list of conditions and the following disclaimer. 66 * 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in 69 * the documentation and/or other materials provided with the 70 * distribution. 71 * 72 * 3. All advertising materials mentioning features or use of this 73 * software must display the following acknowledgment: 74 * "This product includes software developed by the OpenSSL Project 75 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" 76 * 77 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to 78 * endorse or promote products derived from this software without 79 * prior written permission. For written permission, please contact 80 * openssl-core (at) openssl.org. 81 * 82 * 5. Products derived from this software may not be called "OpenSSL" 83 * nor may "OpenSSL" appear in their names without prior written 84 * permission of the OpenSSL Project. 85 * 86 * 6. Redistributions of any form whatsoever must retain the following 87 * acknowledgment: 88 * "This product includes software developed by the OpenSSL Project 89 * for use in the OpenSSL Toolkit (http://www.openssl.org/)" 90 * 91 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY 92 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 93 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 94 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR 95 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 96 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 97 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 98 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 99 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 100 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 101 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 102 * OF THE POSSIBILITY OF SUCH DAMAGE. 103 * ==================================================================== 104 * 105 * This product includes cryptographic software written by Eric Young 106 * (eay (at) cryptsoft.com). This product includes software written by Tim 107 * Hudson (tjh (at) cryptsoft.com). */ 108 109 #include <openssl/bn.h> 110 111 #include <assert.h> 112 #include <string.h> 113 114 #include <openssl/cpu.h> 115 #include <openssl/err.h> 116 #include <openssl/mem.h> 117 118 #include "internal.h" 119 120 121 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) 122 #define OPENSSL_BN_ASM_MONT5 123 #define RSAZ_ENABLED 124 125 #include "rsaz_exp.h" 126 127 void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, const void *table, 128 const BN_ULONG *np, const BN_ULONG *n0, int num, 129 int power); 130 void bn_scatter5(const BN_ULONG *inp, size_t num, void *table, size_t power); 131 void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power); 132 void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const void *table, 133 const BN_ULONG *np, const BN_ULONG *n0, int num, int power); 134 int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap, 135 const BN_ULONG *not_used, const BN_ULONG *np, 136 const BN_ULONG *n0, int num); 137 #endif 138 139 int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { 140 int i, bits, ret = 0; 141 BIGNUM *v, *rr; 142 143 BN_CTX_start(ctx); 144 if (r == a || r == p) { 145 rr = BN_CTX_get(ctx); 146 } else { 147 rr = r; 148 } 149 150 v = BN_CTX_get(ctx); 151 if (rr == NULL || v == NULL) { 152 goto err; 153 } 154 155 if (BN_copy(v, a) == NULL) { 156 goto err; 157 } 158 bits = BN_num_bits(p); 159 160 if (BN_is_odd(p)) { 161 if (BN_copy(rr, a) == NULL) { 162 goto err; 163 } 164 } else { 165 if (!BN_one(rr)) { 166 goto err; 167 } 168 } 169 170 for (i = 1; i < bits; i++) { 171 if (!BN_sqr(v, v, ctx)) { 172 goto err; 173 } 174 if (BN_is_bit_set(p, i)) { 175 if (!BN_mul(rr, rr, v, ctx)) { 176 goto err; 177 } 178 } 179 } 180 181 if (r != rr && !BN_copy(r, rr)) { 182 goto err; 183 } 184 ret = 1; 185 186 err: 187 BN_CTX_end(ctx); 188 return ret; 189 } 190 191 /* maximum precomputation table size for *variable* sliding windows */ 192 #define TABLE_SIZE 32 193 194 typedef struct bn_recp_ctx_st { 195 BIGNUM N; /* the divisor */ 196 BIGNUM Nr; /* the reciprocal */ 197 int num_bits; 198 int shift; 199 int flags; 200 } BN_RECP_CTX; 201 202 static void BN_RECP_CTX_init(BN_RECP_CTX *recp) { 203 BN_init(&recp->N); 204 BN_init(&recp->Nr); 205 recp->num_bits = 0; 206 recp->shift = 0; 207 recp->flags = 0; 208 } 209 210 static void BN_RECP_CTX_free(BN_RECP_CTX *recp) { 211 if (recp == NULL) { 212 return; 213 } 214 215 BN_free(&recp->N); 216 BN_free(&recp->Nr); 217 } 218 219 static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) { 220 if (!BN_copy(&(recp->N), d)) { 221 return 0; 222 } 223 BN_zero(&recp->Nr); 224 recp->num_bits = BN_num_bits(d); 225 recp->shift = 0; 226 227 return 1; 228 } 229 230 /* len is the expected size of the result We actually calculate with an extra 231 * word of precision, so we can do faster division if the remainder is not 232 * required. 233 * r := 2^len / m */ 234 static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) { 235 int ret = -1; 236 BIGNUM *t; 237 238 BN_CTX_start(ctx); 239 t = BN_CTX_get(ctx); 240 if (t == NULL) { 241 goto err; 242 } 243 244 if (!BN_set_bit(t, len)) { 245 goto err; 246 } 247 248 if (!BN_div(r, NULL, t, m, ctx)) { 249 goto err; 250 } 251 252 ret = len; 253 254 err: 255 BN_CTX_end(ctx); 256 return ret; 257 } 258 259 static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, 260 BN_RECP_CTX *recp, BN_CTX *ctx) { 261 int i, j, ret = 0; 262 BIGNUM *a, *b, *d, *r; 263 264 BN_CTX_start(ctx); 265 a = BN_CTX_get(ctx); 266 b = BN_CTX_get(ctx); 267 if (dv != NULL) { 268 d = dv; 269 } else { 270 d = BN_CTX_get(ctx); 271 } 272 273 if (rem != NULL) { 274 r = rem; 275 } else { 276 r = BN_CTX_get(ctx); 277 } 278 279 if (a == NULL || b == NULL || d == NULL || r == NULL) { 280 goto err; 281 } 282 283 if (BN_ucmp(m, &recp->N) < 0) { 284 BN_zero(d); 285 if (!BN_copy(r, m)) { 286 goto err; 287 } 288 BN_CTX_end(ctx); 289 return 1; 290 } 291 292 /* We want the remainder 293 * Given input of ABCDEF / ab 294 * we need multiply ABCDEF by 3 digests of the reciprocal of ab */ 295 296 /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */ 297 i = BN_num_bits(m); 298 j = recp->num_bits << 1; 299 if (j > i) { 300 i = j; 301 } 302 303 /* Nr := round(2^i / N) */ 304 if (i != recp->shift) { 305 recp->shift = 306 BN_reciprocal(&(recp->Nr), &(recp->N), i, 307 ctx); /* BN_reciprocal returns i, or -1 for an error */ 308 } 309 310 if (recp->shift == -1) { 311 goto err; 312 } 313 314 /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - 315 * BN_num_bits(N)))| 316 * = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - 317 * BN_num_bits(N)))| 318 * <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)| 319 * = |m/N| */ 320 if (!BN_rshift(a, m, recp->num_bits)) { 321 goto err; 322 } 323 if (!BN_mul(b, a, &(recp->Nr), ctx)) { 324 goto err; 325 } 326 if (!BN_rshift(d, b, i - recp->num_bits)) { 327 goto err; 328 } 329 d->neg = 0; 330 331 if (!BN_mul(b, &(recp->N), d, ctx)) { 332 goto err; 333 } 334 if (!BN_usub(r, m, b)) { 335 goto err; 336 } 337 r->neg = 0; 338 339 j = 0; 340 while (BN_ucmp(r, &(recp->N)) >= 0) { 341 if (j++ > 2) { 342 OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL); 343 goto err; 344 } 345 if (!BN_usub(r, r, &(recp->N))) { 346 goto err; 347 } 348 if (!BN_add_word(d, 1)) { 349 goto err; 350 } 351 } 352 353 r->neg = BN_is_zero(r) ? 0 : m->neg; 354 d->neg = m->neg ^ recp->N.neg; 355 ret = 1; 356 357 err: 358 BN_CTX_end(ctx); 359 return ret; 360 } 361 362 static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, 363 BN_RECP_CTX *recp, BN_CTX *ctx) { 364 int ret = 0; 365 BIGNUM *a; 366 const BIGNUM *ca; 367 368 BN_CTX_start(ctx); 369 a = BN_CTX_get(ctx); 370 if (a == NULL) { 371 goto err; 372 } 373 374 if (y != NULL) { 375 if (x == y) { 376 if (!BN_sqr(a, x, ctx)) { 377 goto err; 378 } 379 } else { 380 if (!BN_mul(a, x, y, ctx)) { 381 goto err; 382 } 383 } 384 ca = a; 385 } else { 386 ca = x; /* Just do the mod */ 387 } 388 389 ret = BN_div_recp(NULL, r, ca, recp, ctx); 390 391 err: 392 BN_CTX_end(ctx); 393 return ret; 394 } 395 396 /* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp 397 * functions 398 * 399 * For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of 400 * multiplications is a constant plus on average 401 * 402 * 2^(w-1) + (b-w)/(w+1); 403 * 404 * here 2^(w-1) is for precomputing the table (we actually need entries only 405 * for windows that have the lowest bit set), and (b-w)/(w+1) is an 406 * approximation for the expected number of w-bit windows, not counting the 407 * first one. 408 * 409 * Thus we should use 410 * 411 * w >= 6 if b > 671 412 * w = 5 if 671 > b > 239 413 * w = 4 if 239 > b > 79 414 * w = 3 if 79 > b > 23 415 * w <= 2 if 23 > b 416 * 417 * (with draws in between). Very small exponents are often selected 418 * with low Hamming weight, so we use w = 1 for b <= 23. */ 419 #define BN_window_bits_for_exponent_size(b) \ 420 ((b) > 671 ? 6 : \ 421 (b) > 239 ? 5 : \ 422 (b) > 79 ? 4 : \ 423 (b) > 23 ? 3 : 1) 424 425 static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, 426 const BIGNUM *m, BN_CTX *ctx) { 427 int i, j, bits, ret = 0, wstart, window; 428 int start = 1; 429 BIGNUM *aa; 430 /* Table of variables obtained from 'ctx' */ 431 BIGNUM *val[TABLE_SIZE]; 432 BN_RECP_CTX recp; 433 434 bits = BN_num_bits(p); 435 436 if (bits == 0) { 437 /* x**0 mod 1 is still zero. */ 438 if (BN_is_one(m)) { 439 BN_zero(r); 440 return 1; 441 } 442 return BN_one(r); 443 } 444 445 BN_CTX_start(ctx); 446 aa = BN_CTX_get(ctx); 447 val[0] = BN_CTX_get(ctx); 448 if (!aa || !val[0]) { 449 goto err; 450 } 451 452 BN_RECP_CTX_init(&recp); 453 if (m->neg) { 454 /* ignore sign of 'm' */ 455 if (!BN_copy(aa, m)) { 456 goto err; 457 } 458 aa->neg = 0; 459 if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) { 460 goto err; 461 } 462 } else { 463 if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) { 464 goto err; 465 } 466 } 467 468 if (!BN_nnmod(val[0], a, m, ctx)) { 469 goto err; /* 1 */ 470 } 471 if (BN_is_zero(val[0])) { 472 BN_zero(r); 473 ret = 1; 474 goto err; 475 } 476 477 window = BN_window_bits_for_exponent_size(bits); 478 if (window > 1) { 479 if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) { 480 goto err; /* 2 */ 481 } 482 j = 1 << (window - 1); 483 for (i = 1; i < j; i++) { 484 if (((val[i] = BN_CTX_get(ctx)) == NULL) || 485 !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) { 486 goto err; 487 } 488 } 489 } 490 491 start = 1; /* This is used to avoid multiplication etc 492 * when there is only the value '1' in the 493 * buffer. */ 494 wstart = bits - 1; /* The top bit of the window */ 495 496 if (!BN_one(r)) { 497 goto err; 498 } 499 500 for (;;) { 501 int wvalue; /* The 'value' of the window */ 502 int wend; /* The bottom bit of the window */ 503 504 if (BN_is_bit_set(p, wstart) == 0) { 505 if (!start) { 506 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { 507 goto err; 508 } 509 } 510 if (wstart == 0) { 511 break; 512 } 513 wstart--; 514 continue; 515 } 516 517 /* We now have wstart on a 'set' bit, we now need to work out 518 * how bit a window to do. To do this we need to scan 519 * forward until the last set bit before the end of the 520 * window */ 521 wvalue = 1; 522 wend = 0; 523 for (i = 1; i < window; i++) { 524 if (wstart - i < 0) { 525 break; 526 } 527 if (BN_is_bit_set(p, wstart - i)) { 528 wvalue <<= (i - wend); 529 wvalue |= 1; 530 wend = i; 531 } 532 } 533 534 /* wend is the size of the current window */ 535 j = wend + 1; 536 /* add the 'bytes above' */ 537 if (!start) { 538 for (i = 0; i < j; i++) { 539 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { 540 goto err; 541 } 542 } 543 } 544 545 /* wvalue will be an odd number < 2^window */ 546 if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) { 547 goto err; 548 } 549 550 /* move the 'window' down further */ 551 wstart -= wend + 1; 552 start = 0; 553 if (wstart < 0) { 554 break; 555 } 556 } 557 ret = 1; 558 559 err: 560 BN_CTX_end(ctx); 561 BN_RECP_CTX_free(&recp); 562 return ret; 563 } 564 565 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, 566 BN_CTX *ctx) { 567 if (BN_is_odd(m)) { 568 return BN_mod_exp_mont(r, a, p, m, ctx, NULL); 569 } 570 571 return mod_exp_recp(r, a, p, m, ctx); 572 } 573 574 int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, 575 const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { 576 int i, j, bits, ret = 0, wstart, window; 577 int start = 1; 578 BIGNUM *d, *r; 579 const BIGNUM *aa; 580 /* Table of variables obtained from 'ctx' */ 581 BIGNUM *val[TABLE_SIZE]; 582 BN_MONT_CTX *new_mont = NULL; 583 584 if (!BN_is_odd(m)) { 585 OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); 586 return 0; 587 } 588 bits = BN_num_bits(p); 589 if (bits == 0) { 590 /* x**0 mod 1 is still zero. */ 591 if (BN_is_one(m)) { 592 BN_zero(rr); 593 return 1; 594 } 595 return BN_one(rr); 596 } 597 598 BN_CTX_start(ctx); 599 d = BN_CTX_get(ctx); 600 r = BN_CTX_get(ctx); 601 val[0] = BN_CTX_get(ctx); 602 if (!d || !r || !val[0]) { 603 goto err; 604 } 605 606 /* Allocate a montgomery context if it was not supplied by the caller. */ 607 if (mont == NULL) { 608 new_mont = BN_MONT_CTX_new(); 609 if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) { 610 goto err; 611 } 612 mont = new_mont; 613 } 614 615 if (a->neg || BN_ucmp(a, m) >= 0) { 616 if (!BN_nnmod(val[0], a, m, ctx)) { 617 goto err; 618 } 619 aa = val[0]; 620 } else { 621 aa = a; 622 } 623 624 if (BN_is_zero(aa)) { 625 BN_zero(rr); 626 ret = 1; 627 goto err; 628 } 629 if (!BN_to_montgomery(val[0], aa, mont, ctx)) { 630 goto err; /* 1 */ 631 } 632 633 window = BN_window_bits_for_exponent_size(bits); 634 if (window > 1) { 635 if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { 636 goto err; /* 2 */ 637 } 638 j = 1 << (window - 1); 639 for (i = 1; i < j; i++) { 640 if (((val[i] = BN_CTX_get(ctx)) == NULL) || 641 !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { 642 goto err; 643 } 644 } 645 } 646 647 start = 1; /* This is used to avoid multiplication etc 648 * when there is only the value '1' in the 649 * buffer. */ 650 wstart = bits - 1; /* The top bit of the window */ 651 652 j = m->top; /* borrow j */ 653 if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { 654 if (bn_wexpand(r, j) == NULL) { 655 goto err; 656 } 657 /* 2^(top*BN_BITS2) - m */ 658 r->d[0] = (0 - m->d[0]) & BN_MASK2; 659 for (i = 1; i < j; i++) { 660 r->d[i] = (~m->d[i]) & BN_MASK2; 661 } 662 r->top = j; 663 /* Upper words will be zero if the corresponding words of 'm' 664 * were 0xfff[...], so decrement r->top accordingly. */ 665 bn_correct_top(r); 666 } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) { 667 goto err; 668 } 669 670 for (;;) { 671 int wvalue; /* The 'value' of the window */ 672 int wend; /* The bottom bit of the window */ 673 674 if (BN_is_bit_set(p, wstart) == 0) { 675 if (!start && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { 676 goto err; 677 } 678 if (wstart == 0) { 679 break; 680 } 681 wstart--; 682 continue; 683 } 684 685 /* We now have wstart on a 'set' bit, we now need to work out how bit a 686 * window to do. To do this we need to scan forward until the last set bit 687 * before the end of the window */ 688 wvalue = 1; 689 wend = 0; 690 for (i = 1; i < window; i++) { 691 if (wstart - i < 0) { 692 break; 693 } 694 if (BN_is_bit_set(p, wstart - i)) { 695 wvalue <<= (i - wend); 696 wvalue |= 1; 697 wend = i; 698 } 699 } 700 701 /* wend is the size of the current window */ 702 j = wend + 1; 703 /* add the 'bytes above' */ 704 if (!start) { 705 for (i = 0; i < j; i++) { 706 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { 707 goto err; 708 } 709 } 710 } 711 712 /* wvalue will be an odd number < 2^window */ 713 if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { 714 goto err; 715 } 716 717 /* move the 'window' down further */ 718 wstart -= wend + 1; 719 start = 0; 720 if (wstart < 0) { 721 break; 722 } 723 } 724 725 if (!BN_from_montgomery(rr, r, mont, ctx)) { 726 goto err; 727 } 728 ret = 1; 729 730 err: 731 BN_MONT_CTX_free(new_mont); 732 BN_CTX_end(ctx); 733 return ret; 734 } 735 736 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific 737 * layout so that accessing any of these table values shows the same access 738 * pattern as far as cache lines are concerned. The following functions are 739 * used to transfer a BIGNUM from/to that table. */ 740 static int copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf, int idx, 741 int window) { 742 int i, j; 743 const int width = 1 << window; 744 BN_ULONG *table = (BN_ULONG *) buf; 745 746 if (top > b->top) { 747 top = b->top; /* this works because 'buf' is explicitly zeroed */ 748 } 749 750 for (i = 0, j = idx; i < top; i++, j += width) { 751 table[j] = b->d[i]; 752 } 753 754 return 1; 755 } 756 757 static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx, 758 int window) { 759 int i, j; 760 const int width = 1 << window; 761 volatile BN_ULONG *table = (volatile BN_ULONG *)buf; 762 763 if (bn_wexpand(b, top) == NULL) { 764 return 0; 765 } 766 767 if (window <= 3) { 768 for (i = 0; i < top; i++, table += width) { 769 BN_ULONG acc = 0; 770 771 for (j = 0; j < width; j++) { 772 acc |= table[j] & ((BN_ULONG)0 - (constant_time_eq_int(j, idx) & 1)); 773 } 774 775 b->d[i] = acc; 776 } 777 } else { 778 int xstride = 1 << (window - 2); 779 BN_ULONG y0, y1, y2, y3; 780 781 i = idx >> (window - 2); /* equivalent of idx / xstride */ 782 idx &= xstride - 1; /* equivalent of idx % xstride */ 783 784 y0 = (BN_ULONG)0 - (constant_time_eq_int(i, 0) & 1); 785 y1 = (BN_ULONG)0 - (constant_time_eq_int(i, 1) & 1); 786 y2 = (BN_ULONG)0 - (constant_time_eq_int(i, 2) & 1); 787 y3 = (BN_ULONG)0 - (constant_time_eq_int(i, 3) & 1); 788 789 for (i = 0; i < top; i++, table += width) { 790 BN_ULONG acc = 0; 791 792 for (j = 0; j < xstride; j++) { 793 acc |= ((table[j + 0 * xstride] & y0) | (table[j + 1 * xstride] & y1) | 794 (table[j + 2 * xstride] & y2) | (table[j + 3 * xstride] & y3)) & 795 ((BN_ULONG)0 - (constant_time_eq_int(j, idx) & 1)); 796 } 797 798 b->d[i] = acc; 799 } 800 } 801 802 b->top = top; 803 bn_correct_top(b); 804 return 1; 805 } 806 807 /* BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache 808 * line width of the target processor is at least the following value. */ 809 #define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64) 810 #define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \ 811 (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1) 812 813 /* Window sizes optimized for fixed window size modular exponentiation 814 * algorithm (BN_mod_exp_mont_consttime). 815 * 816 * To achieve the security goals of BN_mode_exp_mont_consttime, the maximum 817 * size of the window must not exceed 818 * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). 819 * 820 * Window size thresholds are defined for cache line sizes of 32 and 64, cache 821 * line sizes where log_2(32)=5 and log_2(64)=6 respectively. A window size of 822 * 7 should only be used on processors that have a 128 byte or greater cache 823 * line size. */ 824 #if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64 825 826 #define BN_window_bits_for_ctime_exponent_size(b) \ 827 ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) 828 #define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6) 829 830 #elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32 831 832 #define BN_window_bits_for_ctime_exponent_size(b) \ 833 ((b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) 834 #define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5) 835 836 #endif 837 838 /* Given a pointer value, compute the next address that is a cache line 839 * multiple. */ 840 #define MOD_EXP_CTIME_ALIGN(x_) \ 841 ((unsigned char *)(x_) + \ 842 (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - \ 843 (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) 844 845 /* This variant of BN_mod_exp_mont() uses fixed windows and the special 846 * precomputation memory layout to limit data-dependency to a minimum 847 * to protect secret exponents (cf. the hyper-threading timing attacks 848 * pointed out by Colin Percival, 849 * http://www.daemonology.net/hyperthreading-considered-harmful/) 850 */ 851 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, 852 const BIGNUM *m, BN_CTX *ctx, 853 const BN_MONT_CTX *mont) { 854 int i, bits, ret = 0, window, wvalue; 855 int top; 856 BN_MONT_CTX *new_mont = NULL; 857 858 int numPowers; 859 unsigned char *powerbufFree = NULL; 860 int powerbufLen = 0; 861 unsigned char *powerbuf = NULL; 862 BIGNUM tmp, am; 863 BIGNUM *new_a = NULL; 864 865 if (!BN_is_odd(m)) { 866 OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); 867 return 0; 868 } 869 870 top = m->top; 871 872 bits = BN_num_bits(p); 873 if (bits == 0) { 874 /* x**0 mod 1 is still zero. */ 875 if (BN_is_one(m)) { 876 BN_zero(rr); 877 return 1; 878 } 879 return BN_one(rr); 880 } 881 882 /* Allocate a montgomery context if it was not supplied by the caller. */ 883 if (mont == NULL) { 884 new_mont = BN_MONT_CTX_new(); 885 if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) { 886 goto err; 887 } 888 mont = new_mont; 889 } 890 891 if (a->neg || BN_ucmp(a, m) >= 0) { 892 new_a = BN_new(); 893 if (new_a == NULL || 894 !BN_nnmod(new_a, a, m, ctx)) { 895 goto err; 896 } 897 a = new_a; 898 } 899 900 #ifdef RSAZ_ENABLED 901 /* If the size of the operands allow it, perform the optimized 902 * RSAZ exponentiation. For further information see 903 * crypto/bn/rsaz_exp.c and accompanying assembly modules. */ 904 if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) && 905 rsaz_avx2_eligible()) { 906 if (NULL == bn_wexpand(rr, 16)) { 907 goto err; 908 } 909 RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]); 910 rr->top = 16; 911 rr->neg = 0; 912 bn_correct_top(rr); 913 ret = 1; 914 goto err; 915 } 916 #endif 917 918 /* Get the window size to use with size of p. */ 919 window = BN_window_bits_for_ctime_exponent_size(bits); 920 #if defined(OPENSSL_BN_ASM_MONT5) 921 if (window >= 5) { 922 window = 5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */ 923 /* reserve space for mont->N.d[] copy */ 924 powerbufLen += top * sizeof(mont->N.d[0]); 925 } 926 #endif 927 928 /* Allocate a buffer large enough to hold all of the pre-computed 929 * powers of am, am itself and tmp. 930 */ 931 numPowers = 1 << window; 932 powerbufLen += 933 sizeof(m->d[0]) * 934 (top * numPowers + ((2 * top) > numPowers ? (2 * top) : numPowers)); 935 #ifdef alloca 936 if (powerbufLen < 3072) { 937 powerbufFree = alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); 938 } else 939 #endif 940 { 941 if ((powerbufFree = OPENSSL_malloc( 942 powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) { 943 goto err; 944 } 945 } 946 947 powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); 948 OPENSSL_memset(powerbuf, 0, powerbufLen); 949 950 #ifdef alloca 951 if (powerbufLen < 3072) { 952 powerbufFree = NULL; 953 } 954 #endif 955 956 /* lay down tmp and am right after powers table */ 957 tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers); 958 am.d = tmp.d + top; 959 tmp.top = am.top = 0; 960 tmp.dmax = am.dmax = top; 961 tmp.neg = am.neg = 0; 962 tmp.flags = am.flags = BN_FLG_STATIC_DATA; 963 964 /* prepare a^0 in Montgomery domain */ 965 /* by Shay Gueron's suggestion */ 966 if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { 967 /* 2^(top*BN_BITS2) - m */ 968 tmp.d[0] = (0 - m->d[0]) & BN_MASK2; 969 for (i = 1; i < top; i++) { 970 tmp.d[i] = (~m->d[i]) & BN_MASK2; 971 } 972 tmp.top = top; 973 } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx)) { 974 goto err; 975 } 976 977 /* prepare a^1 in Montgomery domain */ 978 assert(!a->neg); 979 assert(BN_ucmp(a, m) < 0); 980 if (!BN_to_montgomery(&am, a, mont, ctx)) { 981 goto err; 982 } 983 984 #if defined(OPENSSL_BN_ASM_MONT5) 985 /* This optimization uses ideas from http://eprint.iacr.org/2011/239, 986 * specifically optimization of cache-timing attack countermeasures 987 * and pre-computation optimization. */ 988 989 /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as 990 * 512-bit RSA is hardly relevant, we omit it to spare size... */ 991 if (window == 5 && top > 1) { 992 const BN_ULONG *n0 = mont->n0; 993 BN_ULONG *np; 994 995 /* BN_to_montgomery can contaminate words above .top 996 * [in BN_DEBUG[_DEBUG] build]... */ 997 for (i = am.top; i < top; i++) { 998 am.d[i] = 0; 999 } 1000 for (i = tmp.top; i < top; i++) { 1001 tmp.d[i] = 0; 1002 } 1003 1004 /* copy mont->N.d[] to improve cache locality */ 1005 for (np = am.d + top, i = 0; i < top; i++) { 1006 np[i] = mont->N.d[i]; 1007 } 1008 1009 bn_scatter5(tmp.d, top, powerbuf, 0); 1010 bn_scatter5(am.d, am.top, powerbuf, 1); 1011 bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); 1012 bn_scatter5(tmp.d, top, powerbuf, 2); 1013 1014 /* same as above, but uses squaring for 1/2 of operations */ 1015 for (i = 4; i < 32; i *= 2) { 1016 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1017 bn_scatter5(tmp.d, top, powerbuf, i); 1018 } 1019 for (i = 3; i < 8; i += 2) { 1020 int j; 1021 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); 1022 bn_scatter5(tmp.d, top, powerbuf, i); 1023 for (j = 2 * i; j < 32; j *= 2) { 1024 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1025 bn_scatter5(tmp.d, top, powerbuf, j); 1026 } 1027 } 1028 for (; i < 16; i += 2) { 1029 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); 1030 bn_scatter5(tmp.d, top, powerbuf, i); 1031 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1032 bn_scatter5(tmp.d, top, powerbuf, 2 * i); 1033 } 1034 for (; i < 32; i += 2) { 1035 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); 1036 bn_scatter5(tmp.d, top, powerbuf, i); 1037 } 1038 1039 bits--; 1040 for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { 1041 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); 1042 } 1043 bn_gather5(tmp.d, top, powerbuf, wvalue); 1044 1045 /* At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit 1046 * that has not been read yet.) */ 1047 assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); 1048 1049 /* Scan the exponent one window at a time starting from the most 1050 * significant bits. 1051 */ 1052 if (top & 7) { 1053 while (bits >= 0) { 1054 for (wvalue = 0, i = 0; i < 5; i++, bits--) { 1055 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); 1056 } 1057 1058 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1059 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1060 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1061 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1062 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); 1063 bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); 1064 } 1065 } else { 1066 const uint8_t *p_bytes = (const uint8_t *)p->d; 1067 int max_bits = p->top * BN_BITS2; 1068 assert(bits < max_bits); 1069 /* |p = 0| has been handled as a special case, so |max_bits| is at least 1070 * one word. */ 1071 assert(max_bits >= 64); 1072 1073 /* If the first bit to be read lands in the last byte, unroll the first 1074 * iteration to avoid reading past the bounds of |p->d|. (After the first 1075 * iteration, we are guaranteed to be past the last byte.) Note |bits| 1076 * here is the top bit, inclusive. */ 1077 if (bits - 4 >= max_bits - 8) { 1078 /* Read five bits from |bits-4| through |bits|, inclusive. */ 1079 wvalue = p_bytes[p->top * BN_BYTES - 1]; 1080 wvalue >>= (bits - 4) & 7; 1081 wvalue &= 0x1f; 1082 bits -= 5; 1083 bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); 1084 } 1085 while (bits >= 0) { 1086 /* Read five bits from |bits-4| through |bits|, inclusive. */ 1087 int first_bit = bits - 4; 1088 wvalue = *(const uint16_t *) (p_bytes + (first_bit >> 3)); 1089 wvalue >>= first_bit & 7; 1090 wvalue &= 0x1f; 1091 bits -= 5; 1092 bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); 1093 } 1094 } 1095 1096 ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top); 1097 tmp.top = top; 1098 bn_correct_top(&tmp); 1099 if (ret) { 1100 if (!BN_copy(rr, &tmp)) { 1101 ret = 0; 1102 } 1103 goto err; /* non-zero ret means it's not error */ 1104 } 1105 } else 1106 #endif 1107 { 1108 if (!copy_to_prebuf(&tmp, top, powerbuf, 0, window) || 1109 !copy_to_prebuf(&am, top, powerbuf, 1, window)) { 1110 goto err; 1111 } 1112 1113 /* If the window size is greater than 1, then calculate 1114 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) 1115 * (even powers could instead be computed as (a^(i/2))^2 1116 * to use the slight performance advantage of sqr over mul). 1117 */ 1118 if (window > 1) { 1119 if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx) || 1120 !copy_to_prebuf(&tmp, top, powerbuf, 2, window)) { 1121 goto err; 1122 } 1123 for (i = 3; i < numPowers; i++) { 1124 /* Calculate a^i = a^(i-1) * a */ 1125 if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx) || 1126 !copy_to_prebuf(&tmp, top, powerbuf, i, window)) { 1127 goto err; 1128 } 1129 } 1130 } 1131 1132 bits--; 1133 for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { 1134 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); 1135 } 1136 if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { 1137 goto err; 1138 } 1139 1140 /* Scan the exponent one window at a time starting from the most 1141 * significant bits. 1142 */ 1143 while (bits >= 0) { 1144 wvalue = 0; /* The 'value' of the window */ 1145 1146 /* Scan the window, squaring the result as we go */ 1147 for (i = 0; i < window; i++, bits--) { 1148 if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { 1149 goto err; 1150 } 1151 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); 1152 } 1153 1154 /* Fetch the appropriate pre-computed value from the pre-buf */ 1155 if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { 1156 goto err; 1157 } 1158 1159 /* Multiply the result into the intermediate result */ 1160 if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { 1161 goto err; 1162 } 1163 } 1164 } 1165 1166 /* Convert the final result from montgomery to standard format */ 1167 if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { 1168 goto err; 1169 } 1170 ret = 1; 1171 1172 err: 1173 BN_MONT_CTX_free(new_mont); 1174 BN_clear_free(new_a); 1175 if (powerbuf != NULL) { 1176 OPENSSL_cleanse(powerbuf, powerbufLen); 1177 OPENSSL_free(powerbufFree); 1178 } 1179 return (ret); 1180 } 1181 1182 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, 1183 const BIGNUM *m, BN_CTX *ctx, 1184 const BN_MONT_CTX *mont) { 1185 BIGNUM a_bignum; 1186 BN_init(&a_bignum); 1187 1188 int ret = 0; 1189 1190 if (!BN_set_word(&a_bignum, a)) { 1191 OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); 1192 goto err; 1193 } 1194 1195 ret = BN_mod_exp_mont(rr, &a_bignum, p, m, ctx, mont); 1196 1197 err: 1198 BN_free(&a_bignum); 1199 1200 return ret; 1201 } 1202 1203 #define TABLE_SIZE 32 1204 1205 int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, 1206 const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, 1207 BN_CTX *ctx, const BN_MONT_CTX *mont) { 1208 BIGNUM tmp; 1209 BN_init(&tmp); 1210 1211 int ret = 0; 1212 BN_MONT_CTX *new_mont = NULL; 1213 1214 /* Allocate a montgomery context if it was not supplied by the caller. */ 1215 if (mont == NULL) { 1216 new_mont = BN_MONT_CTX_new(); 1217 if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) { 1218 goto err; 1219 } 1220 mont = new_mont; 1221 } 1222 1223 /* BN_mod_mul_montgomery removes one Montgomery factor, so passing one 1224 * Montgomery-encoded and one non-Montgomery-encoded value gives a 1225 * non-Montgomery-encoded result. */ 1226 if (!BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || 1227 !BN_mod_exp_mont(&tmp, a2, p2, m, ctx, mont) || 1228 !BN_to_montgomery(rr, rr, mont, ctx) || 1229 !BN_mod_mul_montgomery(rr, rr, &tmp, mont, ctx)) { 1230 goto err; 1231 } 1232 1233 ret = 1; 1234 1235 err: 1236 BN_MONT_CTX_free(new_mont); 1237 BN_free(&tmp); 1238 1239 return ret; 1240 } 1241