Home | History | Annotate | Download | only in bn
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 .extern	OPENSSL_ia32cap_P
      5 .hidden OPENSSL_ia32cap_P
      6 
      7 .globl	rsaz_512_sqr
      8 .hidden rsaz_512_sqr
      9 .type	rsaz_512_sqr,@function
     10 .align	32
        /*
         * rsaz_512_sqr: repeated squaring of an 8-quadword (512-bit) value,
         * each pass followed by __rsaz_512_reduce and __rsaz_512_subtract.
         *
         * Registers on entry, as consumed by the code below:
         *   %rdi  destination, 8 quadwords (stored by __rsaz_512_subtract)
         *   %rsi  source, 8 quadwords
         *   %rdx  pointer kept in %rbp; the helpers read 8 quadwords through
         *         it (presumably the modulus - confirm against callers)
         *   %rcx  64-bit value saved at 128(%rsp) and used as a multiplier by
         *         __rsaz_512_reduce (presumably the Montgomery n0 constant)
         *   %r8d  pass count; every pass after the first squares the
         *         previous result (%rsi is pointed at %rdi)
         *
         * Frame (128+24 bytes): 0..127(%rsp) 16-quadword square,
         * 128(%rsp) saved %rcx, 136(%rsp) remaining pass count.
         * The loop is bottom-tested: the body runs before the count is
         * decremented and checked, so a count of 0 wraps instead of
         * skipping the work.
         *
         * The cross products a[i]*a[j] (i < j) are accumulated row by row,
         * doubled, and the diagonal squares a[i]^2 are added in.
         *
         * NOTE(review): after each diagonal square the last "adcq $0" into
         * the highest of the three words (first occurrence: %r9 right after
         * the first "mulq %rax") may itself carry out; that carry is not
         * consumed, because the next flag-writing instruction is a shrq. The
         * bit folded in later through %rcx/%rbx was copied before the
         * doubling. This looks like the carry-propagation weakness fixed
         * upstream for rsaz_512_sqr (CVE-2019-1551) - confirm, and prefer
         * adopting the upstream rewrite over a local patch.
         */
     11 rsaz_512_sqr:
     12 	pushq	%rbx
     13 	pushq	%rbp
     14 	pushq	%r12
     15 	pushq	%r13
     16 	pushq	%r14
     17 	pushq	%r15
     18 
     19 	subq	$128+24,%rsp
     20 .Lsqr_body:
     21 	movq	%rdx,%rbp
     22 	movq	(%rsi),%rdx
     23 	movq	8(%rsi),%rax
     24 	movq	%rcx,128(%rsp)
     25 	jmp	.Loop_sqr
     26 
     27 .align	32
     28 .Loop_sqr:
        /* On entry to each pass: %rdx = a[0], %rax = a[1], %r8d = passes left. */
     29 	movl	%r8d,128+8(%rsp)
     30 
        /* Row 0: %rbx = a[0]; a[0]*a[1..7] -> %r8..%r15. */
     31 	movq	%rdx,%rbx
     32 	mulq	%rdx
     33 	movq	%rax,%r8
     34 	movq	16(%rsi),%rax
     35 	movq	%rdx,%r9
     36 
     37 	mulq	%rbx
     38 	addq	%rax,%r9
     39 	movq	24(%rsi),%rax
     40 	movq	%rdx,%r10
     41 	adcq	$0,%r10
     42 
     43 	mulq	%rbx
     44 	addq	%rax,%r10
     45 	movq	32(%rsi),%rax
     46 	movq	%rdx,%r11
     47 	adcq	$0,%r11
     48 
     49 	mulq	%rbx
     50 	addq	%rax,%r11
     51 	movq	40(%rsi),%rax
     52 	movq	%rdx,%r12
     53 	adcq	$0,%r12
     54 
     55 	mulq	%rbx
     56 	addq	%rax,%r12
     57 	movq	48(%rsi),%rax
     58 	movq	%rdx,%r13
     59 	adcq	$0,%r13
     60 
     61 	mulq	%rbx
     62 	addq	%rax,%r13
     63 	movq	56(%rsi),%rax
     64 	movq	%rdx,%r14
     65 	adcq	$0,%r14
     66 
     67 	mulq	%rbx
     68 	addq	%rax,%r14
     69 	movq	%rbx,%rax
     70 	movq	%rdx,%r15
     71 	adcq	$0,%r15
     72 
        /* Double %r8,%r9 (pre-doubling %r9 kept in %rcx for its bit 63),
           add a[0]^2, emit result words 0 and 1. */
     73 	addq	%r8,%r8
     74 	movq	%r9,%rcx
     75 	adcq	%r9,%r9
     76 
     77 	mulq	%rax
     78 	movq	%rax,(%rsp)
     79 	addq	%rdx,%r8
     80 	adcq	$0,%r9
     81 
     82 	movq	%r8,8(%rsp)
     83 	shrq	$63,%rcx
     84 
     85 
        /* Row 1: %r8 = a[1]; a[1]*a[2..7] added into %r10..%r15, top word -> %r8. */
     86 	movq	8(%rsi),%r8
     87 	movq	16(%rsi),%rax
     88 	mulq	%r8
     89 	addq	%rax,%r10
     90 	movq	24(%rsi),%rax
     91 	movq	%rdx,%rbx
     92 	adcq	$0,%rbx
     93 
     94 	mulq	%r8
     95 	addq	%rax,%r11
     96 	movq	32(%rsi),%rax
     97 	adcq	$0,%rdx
     98 	addq	%rbx,%r11
     99 	movq	%rdx,%rbx
    100 	adcq	$0,%rbx
    101 
    102 	mulq	%r8
    103 	addq	%rax,%r12
    104 	movq	40(%rsi),%rax
    105 	adcq	$0,%rdx
    106 	addq	%rbx,%r12
    107 	movq	%rdx,%rbx
    108 	adcq	$0,%rbx
    109 
    110 	mulq	%r8
    111 	addq	%rax,%r13
    112 	movq	48(%rsi),%rax
    113 	adcq	$0,%rdx
    114 	addq	%rbx,%r13
    115 	movq	%rdx,%rbx
    116 	adcq	$0,%rbx
    117 
    118 	mulq	%r8
    119 	addq	%rax,%r14
    120 	movq	56(%rsi),%rax
    121 	adcq	$0,%rdx
    122 	addq	%rbx,%r14
    123 	movq	%rdx,%rbx
    124 	adcq	$0,%rbx
    125 
    126 	mulq	%r8
    127 	addq	%rax,%r15
    128 	movq	%r8,%rax
    129 	adcq	$0,%rdx
    130 	addq	%rbx,%r15
    131 	movq	%rdx,%r8
    132 	movq	%r10,%rdx
    133 	adcq	$0,%r8
    134 
        /* Double %r10,%r11: the addq on the %rdx copy only produces CF = bit 63
           of %r10; leaq does the shift and pulls in the bit saved in %rcx
           without touching flags. Then add a[1]^2 and emit words 2 and 3. */
    135 	addq	%rdx,%rdx
    136 	leaq	(%rcx,%r10,2),%r10
    137 	movq	%r11,%rbx
    138 	adcq	%r11,%r11
    139 
    140 	mulq	%rax
    141 	addq	%rax,%r9
    142 	adcq	%rdx,%r10
    143 	adcq	$0,%r11
    144 
    145 	movq	%r9,16(%rsp)
    146 	movq	%r10,24(%rsp)
    147 	shrq	$63,%rbx
    148 
    149 
        /* Row 2: %r9 = a[2]; a[2]*a[3..7] added into %r12..%r15,%r8, top word -> %r9.
           The doubling of %r12 and %r13 is interleaved with the multiplies. */
    150 	movq	16(%rsi),%r9
    151 	movq	24(%rsi),%rax
    152 	mulq	%r9
    153 	addq	%rax,%r12
    154 	movq	32(%rsi),%rax
    155 	movq	%rdx,%rcx
    156 	adcq	$0,%rcx
    157 
    158 	mulq	%r9
    159 	addq	%rax,%r13
    160 	movq	40(%rsi),%rax
    161 	adcq	$0,%rdx
    162 	addq	%rcx,%r13
    163 	movq	%rdx,%rcx
    164 	adcq	$0,%rcx
    165 
    166 	mulq	%r9
    167 	addq	%rax,%r14
    168 	movq	48(%rsi),%rax
    169 	adcq	$0,%rdx
    170 	addq	%rcx,%r14
    171 	movq	%rdx,%rcx
    172 	adcq	$0,%rcx
    173 
    174 	mulq	%r9
    175 	movq	%r12,%r10
    176 	leaq	(%rbx,%r12,2),%r12
    177 	addq	%rax,%r15
    178 	movq	56(%rsi),%rax
    179 	adcq	$0,%rdx
    180 	addq	%rcx,%r15
    181 	movq	%rdx,%rcx
    182 	adcq	$0,%rcx
    183 
    184 	mulq	%r9
    185 	shrq	$63,%r10
    186 	addq	%rax,%r8
    187 	movq	%r9,%rax
    188 	adcq	$0,%rdx
    189 	addq	%rcx,%r8
    190 	movq	%rdx,%r9
    191 	adcq	$0,%r9
    192 
    193 	movq	%r13,%rcx
    194 	leaq	(%r10,%r13,2),%r13
    195 
        /* Add a[2]^2, emit words 4 and 5. */
    196 	mulq	%rax
    197 	addq	%rax,%r11
    198 	adcq	%rdx,%r12
    199 	adcq	$0,%r13
    200 
    201 	movq	%r11,32(%rsp)
    202 	movq	%r12,40(%rsp)
    203 	shrq	$63,%rcx
    204 
    205 
        /* Row 3: %r10 = a[3]; a[3]*a[4..7] added into %r14,%r15,%r8,%r9, top word -> %r10. */
    206 	movq	24(%rsi),%r10
    207 	movq	32(%rsi),%rax
    208 	mulq	%r10
    209 	addq	%rax,%r14
    210 	movq	40(%rsi),%rax
    211 	movq	%rdx,%rbx
    212 	adcq	$0,%rbx
    213 
    214 	mulq	%r10
    215 	addq	%rax,%r15
    216 	movq	48(%rsi),%rax
    217 	adcq	$0,%rdx
    218 	addq	%rbx,%r15
    219 	movq	%rdx,%rbx
    220 	adcq	$0,%rbx
    221 
    222 	mulq	%r10
    223 	movq	%r14,%r12
    224 	leaq	(%rcx,%r14,2),%r14
    225 	addq	%rax,%r8
    226 	movq	56(%rsi),%rax
    227 	adcq	$0,%rdx
    228 	addq	%rbx,%r8
    229 	movq	%rdx,%rbx
    230 	adcq	$0,%rbx
    231 
    232 	mulq	%r10
    233 	shrq	$63,%r12
    234 	addq	%rax,%r9
    235 	movq	%r10,%rax
    236 	adcq	$0,%rdx
    237 	addq	%rbx,%r9
    238 	movq	%rdx,%r10
    239 	adcq	$0,%r10
    240 
    241 	movq	%r15,%rbx
    242 	leaq	(%r12,%r15,2),%r15
    243 
        /* Add a[3]^2, emit words 6 and 7. */
    244 	mulq	%rax
    245 	addq	%rax,%r13
    246 	adcq	%rdx,%r14
    247 	adcq	$0,%r15
    248 
    249 	movq	%r13,48(%rsp)
    250 	movq	%r14,56(%rsp)
    251 	shrq	$63,%rbx
    252 
    253 
        /* Row 4: %r11 = a[4]; a[4]*a[5..7] added into %r8,%r9,%r10, top word -> %r11. */
    254 	movq	32(%rsi),%r11
    255 	movq	40(%rsi),%rax
    256 	mulq	%r11
    257 	addq	%rax,%r8
    258 	movq	48(%rsi),%rax
    259 	movq	%rdx,%rcx
    260 	adcq	$0,%rcx
    261 
    262 	mulq	%r11
    263 	addq	%rax,%r9
    264 	movq	56(%rsi),%rax
    265 	adcq	$0,%rdx
    266 	movq	%r8,%r12
    267 	leaq	(%rbx,%r8,2),%r8
    268 	addq	%rcx,%r9
    269 	movq	%rdx,%rcx
    270 	adcq	$0,%rcx
    271 
    272 	mulq	%r11
    273 	shrq	$63,%r12
    274 	addq	%rax,%r10
    275 	movq	%r11,%rax
    276 	adcq	$0,%rdx
    277 	addq	%rcx,%r10
    278 	movq	%rdx,%r11
    279 	adcq	$0,%r11
    280 
    281 	movq	%r9,%rcx
    282 	leaq	(%r12,%r9,2),%r9
    283 
        /* Add a[4]^2, emit words 8 and 9. */
    284 	mulq	%rax
    285 	addq	%rax,%r15
    286 	adcq	%rdx,%r8
    287 	adcq	$0,%r9
    288 
    289 	movq	%r15,64(%rsp)
    290 	movq	%r8,72(%rsp)
    291 	shrq	$63,%rcx
    292 
    293 
        /* Row 5: %r12 = a[5]; a[5]*a[6..7] added into %r10,%r11, top word -> %r12. */
    294 	movq	40(%rsi),%r12
    295 	movq	48(%rsi),%rax
    296 	mulq	%r12
    297 	addq	%rax,%r10
    298 	movq	56(%rsi),%rax
    299 	movq	%rdx,%rbx
    300 	adcq	$0,%rbx
    301 
    302 	mulq	%r12
    303 	addq	%rax,%r11
    304 	movq	%r12,%rax
    305 	movq	%r10,%r15
    306 	leaq	(%rcx,%r10,2),%r10
    307 	adcq	$0,%rdx
    308 	shrq	$63,%r15
    309 	addq	%rbx,%r11
    310 	movq	%rdx,%r12
    311 	adcq	$0,%r12
    312 
    313 	movq	%r11,%rbx
    314 	leaq	(%r15,%r11,2),%r11
    315 
        /* Add a[5]^2, emit words 10 and 11. */
    316 	mulq	%rax
    317 	addq	%rax,%r9
    318 	adcq	%rdx,%r10
    319 	adcq	$0,%r11
    320 
    321 	movq	%r9,80(%rsp)
    322 	movq	%r10,88(%rsp)
    323 
    324 
        /* Row 6: %r13 = a[6]; a[6]*a[7] added into %r12, top word -> %r13. */
    325 	movq	48(%rsi),%r13
    326 	movq	56(%rsi),%rax
    327 	mulq	%r13
    328 	addq	%rax,%r12
    329 	movq	%r13,%rax
    330 	movq	%rdx,%r13
    331 	adcq	$0,%r13
    332 
        /* Double %r12,%r13 with carry-in = bit 63 of the pre-doubling %r11
           (held in %rbx); the carry out lands in %r14. */
    333 	xorq	%r14,%r14
    334 	shlq	$1,%rbx
    335 	adcq	%r12,%r12
    336 	adcq	%r13,%r13
    337 	adcq	%r14,%r14
    338 
        /* Add a[6]^2, emit words 12 and 13. */
    339 	mulq	%rax
    340 	addq	%rax,%r11
    341 	adcq	%rdx,%r12
    342 	adcq	$0,%r13
    343 
    344 	movq	%r11,96(%rsp)
    345 	movq	%r12,104(%rsp)
    346 
    347 
        /* Add a[7]^2, emit words 14 and 15. */
    348 	movq	56(%rsi),%rax
    349 	mulq	%rax
    350 	addq	%rax,%r13
    351 	adcq	$0,%rdx
    352 
    353 	addq	%rdx,%r14
    354 
    355 	movq	%r13,112(%rsp)
    356 	movq	%r14,120(%rsp)
    357 
        /* Low 8 words of the square go to __rsaz_512_reduce in %r8..%r15. */
    358 	movq	(%rsp),%r8
    359 	movq	8(%rsp),%r9
    360 	movq	16(%rsp),%r10
    361 	movq	24(%rsp),%r11
    362 	movq	32(%rsp),%r12
    363 	movq	40(%rsp),%r13
    364 	movq	48(%rsp),%r14
    365 	movq	56(%rsp),%r15
    366 
    367 	call	__rsaz_512_reduce
    368 
        /* Add the high 8 words; %rcx = 0 - CF, i.e. all-ones when the sum
           carried out of 512 bits, else 0 (mask for __rsaz_512_subtract). */
    369 	addq	64(%rsp),%r8
    370 	adcq	72(%rsp),%r9
    371 	adcq	80(%rsp),%r10
    372 	adcq	88(%rsp),%r11
    373 	adcq	96(%rsp),%r12
    374 	adcq	104(%rsp),%r13
    375 	adcq	112(%rsp),%r14
    376 	adcq	120(%rsp),%r15
    377 	sbbq	%rcx,%rcx
    378 
    379 	call	__rsaz_512_subtract
    380 
        /* Set up the next pass: result words 0 and 1 are still in %r8/%r9,
           and the source pointer becomes the destination just written. */
    381 	movq	%r8,%rdx
    382 	movq	%r9,%rax
    383 	movl	128+8(%rsp),%r8d
    384 	movq	%rdi,%rsi
    385 
    386 	decl	%r8d
    387 	jnz	.Loop_sqr
    388 
        /* Restore callee-saved registers from above the frame and release it. */
    389 	leaq	128+24+48(%rsp),%rax
    390 	movq	-48(%rax),%r15
    391 	movq	-40(%rax),%r14
    392 	movq	-32(%rax),%r13
    393 	movq	-24(%rax),%r12
    394 	movq	-16(%rax),%rbp
    395 	movq	-8(%rax),%rbx
    396 	leaq	(%rax),%rsp
    397 .Lsqr_epilogue:
        /* 0xf3,0xc3 encodes "rep ret". */
    398 	.byte	0xf3,0xc3
    399 .size	rsaz_512_sqr,.-rsaz_512_sqr
    400 .globl	rsaz_512_mul
    401 .hidden rsaz_512_mul
    402 .type	rsaz_512_mul,@function
    403 .align	32
        /*
         * rsaz_512_mul: 8x8-quadword multiply followed by __rsaz_512_reduce
         * and __rsaz_512_subtract.
         *
         * Registers on entry, as consumed by the code below:
         *   %rdi  destination, 8 quadwords (stored by __rsaz_512_subtract)
         *   %rsi  first operand, 8 quadwords (read by __rsaz_512_mul)
         *   %rdx  second operand, 8 quadwords (walked through %rbp)
         *   %rcx  pointer the helpers read 8 quadwords through via %rbp
         *         (presumably the modulus - confirm against callers)
         *   %r8   64-bit value saved at 128(%rsp) for __rsaz_512_reduce
         *         (presumably the Montgomery n0 constant)
         *
         * %rdi and %rcx are parked in %xmm0/%xmm1 because __rsaz_512_mul
         * overwrites %rdi and advances %rbp. Clobbers %xmm0 and %xmm1.
         */
    404 rsaz_512_mul:
    405 	pushq	%rbx
    406 	pushq	%rbp
    407 	pushq	%r12
    408 	pushq	%r13
    409 	pushq	%r14
    410 	pushq	%r15
    411 
    412 	subq	$128+24,%rsp
    413 .Lmul_body:
        /* Encoded movq %rdi,%xmm0 and movq %rcx,%xmm1. */
    414 .byte	102,72,15,110,199
    415 .byte	102,72,15,110,201
    416 	movq	%r8,128(%rsp)
    417 	movq	(%rdx),%rbx
    418 	movq	%rdx,%rbp
    419 	call	__rsaz_512_mul
    420 
        /* Encoded movq %xmm0,%rdi and movq %xmm1,%rbp: destination back in
           %rdi, the original %rcx pointer now in %rbp for the helpers. */
    421 .byte	102,72,15,126,199
    422 .byte	102,72,15,126,205
    423 
        /* Low 8 words of the product go to __rsaz_512_reduce in %r8..%r15. */
    424 	movq	(%rsp),%r8
    425 	movq	8(%rsp),%r9
    426 	movq	16(%rsp),%r10
    427 	movq	24(%rsp),%r11
    428 	movq	32(%rsp),%r12
    429 	movq	40(%rsp),%r13
    430 	movq	48(%rsp),%r14
    431 	movq	56(%rsp),%r15
    432 
    433 	call	__rsaz_512_reduce
        /* Add the high 8 words; %rcx becomes all-ones if that carried out, else 0. */
    434 	addq	64(%rsp),%r8
    435 	adcq	72(%rsp),%r9
    436 	adcq	80(%rsp),%r10
    437 	adcq	88(%rsp),%r11
    438 	adcq	96(%rsp),%r12
    439 	adcq	104(%rsp),%r13
    440 	adcq	112(%rsp),%r14
    441 	adcq	120(%rsp),%r15
    442 	sbbq	%rcx,%rcx
    443 
    444 	call	__rsaz_512_subtract
    445 
        /* Restore callee-saved registers from above the frame and release it. */
    446 	leaq	128+24+48(%rsp),%rax
    447 	movq	-48(%rax),%r15
    448 	movq	-40(%rax),%r14
    449 	movq	-32(%rax),%r13
    450 	movq	-24(%rax),%r12
    451 	movq	-16(%rax),%rbp
    452 	movq	-8(%rax),%rbx
    453 	leaq	(%rax),%rsp
    454 .Lmul_epilogue:
        /* 0xf3,0xc3 encodes "rep ret". */
    455 	.byte	0xf3,0xc3
    456 .size	rsaz_512_mul,.-rsaz_512_mul
    457 .globl	rsaz_512_mul_gather4
    458 .hidden rsaz_512_mul_gather4
    459 .type	rsaz_512_mul_gather4,@function
    460 .align	32
        /*
         * rsaz_512_mul_gather4: like rsaz_512_mul, but the second operand is
         * fetched limb by limb from a table while the multiply runs.
         *
         * Registers on entry, as consumed by the code below:
         *   %rdi  destination, 8 quadwords (stored by __rsaz_512_subtract)
         *   %rsi  first operand, 8 quadwords
         *   %rdx  table base
         *   %rcx  pointer the helpers read 8 quadwords through via %rbp
         *         (presumably the modulus - confirm against callers)
         *   %r8   64-bit value saved at 128(%rsp) for __rsaz_512_reduce
         *   %r9d  32-bit table index (zero-extended here before use)
         *
         * Table layout as addressed here: limb k of entry i has its low
         * 32 bits at base + 128*k + 4*i and its high 32 bits 64 bytes above.
         * The same layout is written by rsaz_512_scatter4.
         *
         * Clobbers %xmm0, %xmm1, %xmm4 and %xmm5.
         *
         * NOTE(review): every pass of .Loop_mul_gather prefetches the limb for
         * the following pass, so the last pass loads from base + 1024 + 4*i
         * (and +64), one row past the 8 rows actually used; the value is never
         * consumed. Confirm the table allocation tolerates this over-read.
         *
         * NOTE(review): the load addresses depend on the index. If the index
         * is secret this access pattern may be observable through the cache -
         * verify against the callers' threat model.
         */
    461 rsaz_512_mul_gather4:
    462 	pushq	%rbx
    463 	pushq	%rbp
    464 	pushq	%r12
    465 	pushq	%r13
    466 	pushq	%r14
    467 	pushq	%r15
    468 
        /* Writing %r9d clears bits 63..32 of %r9 so it is safe as an index. */
    469 	movl	%r9d,%r9d
    470 	subq	$128+24,%rsp
    471 .Lmul_gather4_body:
        /* Fetch limb 0 (high half into %eax, low half into %ebx); the two
           .byte lines encode movq %rdi,%xmm0 and movq %rcx,%xmm1. */
    472 	movl	64(%rdx,%r9,4),%eax
    473 .byte	102,72,15,110,199
    474 	movl	(%rdx,%r9,4),%ebx
    475 .byte	102,72,15,110,201
    476 	movq	%r8,128(%rsp)
    477 
    478 	shlq	$32,%rax
    479 	orq	%rax,%rbx
    480 	movq	(%rsi),%rax
    481 	movq	8(%rsi),%rcx
        /* %rbp -> row of limb 1 for this index. */
    482 	leaq	128(%rdx,%r9,4),%rbp
        /* First row: a[0..7] * limb 0. Word 0 goes to (%rsp), the running
           words stay in %r8..%r15. Limb 1 is assembled in %xmm4 meanwhile. */
    483 	mulq	%rbx
    484 	movq	%rax,(%rsp)
    485 	movq	%rcx,%rax
    486 	movq	%rdx,%r8
    487 
    488 	mulq	%rbx
    489 	movd	(%rbp),%xmm4
    490 	addq	%rax,%r8
    491 	movq	16(%rsi),%rax
    492 	movq	%rdx,%r9
    493 	adcq	$0,%r9
    494 
    495 	mulq	%rbx
    496 	movd	64(%rbp),%xmm5
    497 	addq	%rax,%r9
    498 	movq	24(%rsi),%rax
    499 	movq	%rdx,%r10
    500 	adcq	$0,%r10
    501 
    502 	mulq	%rbx
    503 	pslldq	$4,%xmm5
    504 	addq	%rax,%r10
    505 	movq	32(%rsi),%rax
    506 	movq	%rdx,%r11
    507 	adcq	$0,%r11
    508 
    509 	mulq	%rbx
    510 	por	%xmm5,%xmm4
    511 	addq	%rax,%r11
    512 	movq	40(%rsi),%rax
    513 	movq	%rdx,%r12
    514 	adcq	$0,%r12
    515 
    516 	mulq	%rbx
    517 	addq	%rax,%r12
    518 	movq	48(%rsi),%rax
    519 	movq	%rdx,%r13
    520 	adcq	$0,%r13
    521 
    522 	mulq	%rbx
    523 	leaq	128(%rbp),%rbp
    524 	addq	%rax,%r13
    525 	movq	56(%rsi),%rax
    526 	movq	%rdx,%r14
    527 	adcq	$0,%r14
    528 
    529 	mulq	%rbx
        /* Encoded movq %xmm4,%rbx: the next limb replaces the multiplier
           only after the last mulq of this row has consumed the old one. */
    530 .byte	102,72,15,126,227
    531 	addq	%rax,%r14
    532 	movq	(%rsi),%rax
    533 	movq	%rdx,%r15
    534 	adcq	$0,%r15
    535 
        /* Seven more rows; %rdi walks the product buffer from 8(%rsp). */
    536 	leaq	8(%rsp),%rdi
    537 	movl	$7,%ecx
    538 	jmp	.Loop_mul_gather
    539 
    540 .align	32
    541 .Loop_mul_gather:
        /* Per pass: a[0..7] * %rbx is added to the running words, the lowest
           finished word is stored at (%rdi), and the limb for the following
           pass is assembled in %xmm4. */
    542 	mulq	%rbx
    543 	addq	%rax,%r8
    544 	movq	8(%rsi),%rax
    545 	movq	%r8,(%rdi)
    546 	movq	%rdx,%r8
    547 	adcq	$0,%r8
    548 
    549 	mulq	%rbx
    550 	movd	(%rbp),%xmm4
    551 	addq	%rax,%r9
    552 	movq	16(%rsi),%rax
    553 	adcq	$0,%rdx
    554 	addq	%r9,%r8
    555 	movq	%rdx,%r9
    556 	adcq	$0,%r9
    557 
    558 	mulq	%rbx
    559 	movd	64(%rbp),%xmm5
    560 	addq	%rax,%r10
    561 	movq	24(%rsi),%rax
    562 	adcq	$0,%rdx
    563 	addq	%r10,%r9
    564 	movq	%rdx,%r10
    565 	adcq	$0,%r10
    566 
    567 	mulq	%rbx
    568 	pslldq	$4,%xmm5
    569 	addq	%rax,%r11
    570 	movq	32(%rsi),%rax
    571 	adcq	$0,%rdx
    572 	addq	%r11,%r10
    573 	movq	%rdx,%r11
    574 	adcq	$0,%r11
    575 
    576 	mulq	%rbx
    577 	por	%xmm5,%xmm4
    578 	addq	%rax,%r12
    579 	movq	40(%rsi),%rax
    580 	adcq	$0,%rdx
    581 	addq	%r12,%r11
    582 	movq	%rdx,%r12
    583 	adcq	$0,%r12
    584 
    585 	mulq	%rbx
    586 	addq	%rax,%r13
    587 	movq	48(%rsi),%rax
    588 	adcq	$0,%rdx
    589 	addq	%r13,%r12
    590 	movq	%rdx,%r13
    591 	adcq	$0,%r13
    592 
    593 	mulq	%rbx
    594 	addq	%rax,%r14
    595 	movq	56(%rsi),%rax
    596 	adcq	$0,%rdx
    597 	addq	%r14,%r13
    598 	movq	%rdx,%r14
    599 	adcq	$0,%r14
    600 
    601 	mulq	%rbx
        /* Encoded movq %xmm4,%rbx (multiplier for the following pass). */
    602 .byte	102,72,15,126,227
    603 	addq	%rax,%r15
    604 	movq	(%rsi),%rax
    605 	adcq	$0,%rdx
    606 	addq	%r15,%r14
    607 	movq	%rdx,%r15
    608 	adcq	$0,%r15
    609 
    610 	leaq	128(%rbp),%rbp
    611 	leaq	8(%rdi),%rdi
    612 
    613 	decl	%ecx
    614 	jnz	.Loop_mul_gather
    615 
        /* Flush the top 8 words of the product. */
    616 	movq	%r8,(%rdi)
    617 	movq	%r9,8(%rdi)
    618 	movq	%r10,16(%rdi)
    619 	movq	%r11,24(%rdi)
    620 	movq	%r12,32(%rdi)
    621 	movq	%r13,40(%rdi)
    622 	movq	%r14,48(%rdi)
    623 	movq	%r15,56(%rdi)
    624 
        /* Encoded movq %xmm0,%rdi and movq %xmm1,%rbp. */
    625 .byte	102,72,15,126,199
    626 .byte	102,72,15,126,205
    627 
        /* Low 8 words of the product go to __rsaz_512_reduce in %r8..%r15. */
    628 	movq	(%rsp),%r8
    629 	movq	8(%rsp),%r9
    630 	movq	16(%rsp),%r10
    631 	movq	24(%rsp),%r11
    632 	movq	32(%rsp),%r12
    633 	movq	40(%rsp),%r13
    634 	movq	48(%rsp),%r14
    635 	movq	56(%rsp),%r15
    636 
    637 	call	__rsaz_512_reduce
        /* Add the high 8 words; %rcx becomes all-ones if that carried out, else 0. */
    638 	addq	64(%rsp),%r8
    639 	adcq	72(%rsp),%r9
    640 	adcq	80(%rsp),%r10
    641 	adcq	88(%rsp),%r11
    642 	adcq	96(%rsp),%r12
    643 	adcq	104(%rsp),%r13
    644 	adcq	112(%rsp),%r14
    645 	adcq	120(%rsp),%r15
    646 	sbbq	%rcx,%rcx
    647 
    648 	call	__rsaz_512_subtract
    649 
        /* Restore callee-saved registers from above the frame and release it. */
    650 	leaq	128+24+48(%rsp),%rax
    651 	movq	-48(%rax),%r15
    652 	movq	-40(%rax),%r14
    653 	movq	-32(%rax),%r13
    654 	movq	-24(%rax),%r12
    655 	movq	-16(%rax),%rbp
    656 	movq	-8(%rax),%rbx
    657 	leaq	(%rax),%rsp
    658 .Lmul_gather4_epilogue:
        /* 0xf3,0xc3 encodes "rep ret". */
    659 	.byte	0xf3,0xc3
    660 .size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
    661 .globl	rsaz_512_mul_scatter4
    662 .hidden rsaz_512_mul_scatter4
    663 .type	rsaz_512_mul_scatter4,@function
    664 .align	32
        /*
         * rsaz_512_mul_scatter4: multiply, reduce and subtract as in
         * rsaz_512_mul, then also scatter the result into a table slot.
         *
         * Registers on entry, as consumed by the code below:
         *   %rdi  8 quadwords used both as the second multiplicand (walked
         *         through %rbp) and as the destination of the result
         *   %rsi  first operand, 8 quadwords
         *   %rdx  pointer the helpers read 8 quadwords through via %rbp
         *         (presumably the modulus - confirm against callers)
         *   %rcx  64-bit value saved at 128(%rsp) for __rsaz_512_reduce
         *   %r8   table base
         *   %r9d  32-bit table index (zero-extended here before use)
         *
         * Scatter layout: limb k has its low 32 bits stored at
         * base + 4*index + 128*k and its high 32 bits 64 bytes above.
         * Clobbers %xmm0, %xmm1 and %xmm2.
         */
    665 rsaz_512_mul_scatter4:
    666 	pushq	%rbx
    667 	pushq	%rbp
    668 	pushq	%r12
    669 	pushq	%r13
    670 	pushq	%r14
    671 	pushq	%r15
    672 
        /* Writing %r9d clears bits 63..32 of %r9 so it is safe as an index. */
    673 	movl	%r9d,%r9d
    674 	subq	$128+24,%rsp
    675 .Lmul_scatter4_body:
    676 	leaq	(%r8,%r9,4),%r8
        /* Encoded movq %rdi,%xmm0; movq %rdx,%xmm1; movq %r8,%xmm2. */
    677 .byte	102,72,15,110,199
    678 .byte	102,72,15,110,202
    679 .byte	102,73,15,110,208
    680 	movq	%rcx,128(%rsp)
    681 
    682 	movq	%rdi,%rbp
    683 	movq	(%rdi),%rbx
    684 	call	__rsaz_512_mul
    685 
        /* Encoded movq %xmm0,%rdi and movq %xmm1,%rbp. */
    686 .byte	102,72,15,126,199
    687 .byte	102,72,15,126,205
    688 
        /* Low 8 words of the product go to __rsaz_512_reduce in %r8..%r15. */
    689 	movq	(%rsp),%r8
    690 	movq	8(%rsp),%r9
    691 	movq	16(%rsp),%r10
    692 	movq	24(%rsp),%r11
    693 	movq	32(%rsp),%r12
    694 	movq	40(%rsp),%r13
    695 	movq	48(%rsp),%r14
    696 	movq	56(%rsp),%r15
    697 
    698 	call	__rsaz_512_reduce
    699 	addq	64(%rsp),%r8
    700 	adcq	72(%rsp),%r9
    701 	adcq	80(%rsp),%r10
    702 	adcq	88(%rsp),%r11
    703 	adcq	96(%rsp),%r12
    704 	adcq	104(%rsp),%r13
    705 	adcq	112(%rsp),%r14
    706 	adcq	120(%rsp),%r15
        /* Encoded movq %xmm2,%rsi (table slot address); this move leaves the
           flags alone, so the sbbq below still sees the carry of the adcq chain. */
    707 .byte	102,72,15,126,214
    708 	sbbq	%rcx,%rcx
    709 
    710 	call	__rsaz_512_subtract
    711 
        /* __rsaz_512_subtract leaves the result in %r8..%r15 as well as at
           (%rdi): store the low halves first, then shift and store the highs. */
    712 	movl	%r8d,0(%rsi)
    713 	shrq	$32,%r8
    714 	movl	%r9d,128(%rsi)
    715 	shrq	$32,%r9
    716 	movl	%r10d,256(%rsi)
    717 	shrq	$32,%r10
    718 	movl	%r11d,384(%rsi)
    719 	shrq	$32,%r11
    720 	movl	%r12d,512(%rsi)
    721 	shrq	$32,%r12
    722 	movl	%r13d,640(%rsi)
    723 	shrq	$32,%r13
    724 	movl	%r14d,768(%rsi)
    725 	shrq	$32,%r14
    726 	movl	%r15d,896(%rsi)
    727 	shrq	$32,%r15
    728 	movl	%r8d,64(%rsi)
    729 	movl	%r9d,192(%rsi)
    730 	movl	%r10d,320(%rsi)
    731 	movl	%r11d,448(%rsi)
    732 	movl	%r12d,576(%rsi)
    733 	movl	%r13d,704(%rsi)
    734 	movl	%r14d,832(%rsi)
    735 	movl	%r15d,960(%rsi)
    736 
        /* Restore callee-saved registers from above the frame and release it. */
    737 	leaq	128+24+48(%rsp),%rax
    738 	movq	-48(%rax),%r15
    739 	movq	-40(%rax),%r14
    740 	movq	-32(%rax),%r13
    741 	movq	-24(%rax),%r12
    742 	movq	-16(%rax),%rbp
    743 	movq	-8(%rax),%rbx
    744 	leaq	(%rax),%rsp
    745 .Lmul_scatter4_epilogue:
        /* 0xf3,0xc3 encodes "rep ret". */
    746 	.byte	0xf3,0xc3
    747 .size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
    748 .globl	rsaz_512_mul_by_one
    749 .hidden rsaz_512_mul_by_one
    750 .type	rsaz_512_mul_by_one,@function
    751 .align	32
        /*
         * rsaz_512_mul_by_one: run __rsaz_512_reduce on an 8-quadword input
         * and store the 8-quadword result; no high half is added and
         * __rsaz_512_subtract is not called.
         *
         * Registers on entry, as consumed by the code below:
         *   %rdi  destination, 8 quadwords
         *   %rsi  source, 8 quadwords
         *   %rdx  pointer kept in %rbp; __rsaz_512_reduce reads 8 quadwords
         *         through it (presumably the modulus - confirm against callers)
         *   %rcx  64-bit value saved at 128(%rsp) for __rsaz_512_reduce
         *
         * Clobbers %xmm0. The movdqa stores need %rsp 16-byte aligned; that
         * holds when the caller follows the SysV entry alignment
         * (8 + 6*8 + 152 = 208 bytes below a 16-byte boundary).
         */
    752 rsaz_512_mul_by_one:
    753 	pushq	%rbx
    754 	pushq	%rbp
    755 	pushq	%r12
    756 	pushq	%r13
    757 	pushq	%r14
    758 	pushq	%r15
    759 
    760 	subq	$128+24,%rsp
    761 .Lmul_by_one_body:
    762 	movq	%rdx,%rbp
    763 	movq	%rcx,128(%rsp)
    764 
    765 	movq	(%rsi),%r8
    766 	pxor	%xmm0,%xmm0
    767 	movq	8(%rsi),%r9
    768 	movq	16(%rsi),%r10
    769 	movq	24(%rsi),%r11
    770 	movq	32(%rsi),%r12
    771 	movq	40(%rsi),%r13
    772 	movq	48(%rsi),%r14
    773 	movq	56(%rsi),%r15
    774 
        /* Zero 0..111(%rsp).
           NOTE(review): __rsaz_512_reduce as written in this file reads only
           the 128(%rsp) slot of this frame, so this clearing does not appear
           to feed the result - confirm before touching it. */
    775 	movdqa	%xmm0,(%rsp)
    776 	movdqa	%xmm0,16(%rsp)
    777 	movdqa	%xmm0,32(%rsp)
    778 	movdqa	%xmm0,48(%rsp)
    779 	movdqa	%xmm0,64(%rsp)
    780 	movdqa	%xmm0,80(%rsp)
    781 	movdqa	%xmm0,96(%rsp)
    782 	call	__rsaz_512_reduce
    783 	movq	%r8,(%rdi)
    784 	movq	%r9,8(%rdi)
    785 	movq	%r10,16(%rdi)
    786 	movq	%r11,24(%rdi)
    787 	movq	%r12,32(%rdi)
    788 	movq	%r13,40(%rdi)
    789 	movq	%r14,48(%rdi)
    790 	movq	%r15,56(%rdi)
    791 
        /* Restore callee-saved registers from above the frame and release it. */
    792 	leaq	128+24+48(%rsp),%rax
    793 	movq	-48(%rax),%r15
    794 	movq	-40(%rax),%r14
    795 	movq	-32(%rax),%r13
    796 	movq	-24(%rax),%r12
    797 	movq	-16(%rax),%rbp
    798 	movq	-8(%rax),%rbx
    799 	leaq	(%rax),%rsp
    800 .Lmul_by_one_epilogue:
        /* 0xf3,0xc3 encodes "rep ret". */
    801 	.byte	0xf3,0xc3
    802 .size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
    803 .type	__rsaz_512_reduce,@function
    804 .align	32
        /*
         * __rsaz_512_reduce: file-local helper with a register-based contract.
         *
         * In:   %r8..%r15  8-quadword value, least significant word in %r8
         *       %rbp       pointer to 8 quadwords (presumably the modulus m)
         *       128+8(%rsp) the caller's 128(%rsp) slot, seen 8 bytes higher
         *                  here because of the pushed return address
         *                  (presumably the Montgomery constant n0)
         * Out:  %r8..%r15  value after 8 reduction rounds
         * Clobbers %rax, %rbx, %rcx, %rdx, %rsi and the flags.
         *
         * Each of the 8 rounds takes %rbx = (lowest word * n0) mod 2^64, adds
         * %rbx * m[0..7] to the value and drops the lowest word, moving every
         * word one register down.
         */
    805 __rsaz_512_reduce:
    806 	movq	%r8,%rbx
    807 	imulq	128+8(%rsp),%rbx
    808 	movq	0(%rbp),%rax
    809 	movl	$8,%ecx
    810 	jmp	.Lreduction_loop
    811 
    812 .align	32
    813 .Lreduction_loop:
        /* The low half of m[0]*%rbx is never added: negq sets CF exactly when
           %r8 is non-zero and that stands in for the carry of the dropped
           word (presumably valid because n0 makes that word sum to zero). */
    814 	mulq	%rbx
    815 	movq	8(%rbp),%rax
    816 	negq	%r8
    817 	movq	%rdx,%r8
    818 	adcq	$0,%r8
    819 
    820 	mulq	%rbx
    821 	addq	%rax,%r9
    822 	movq	16(%rbp),%rax
    823 	adcq	$0,%rdx
    824 	addq	%r9,%r8
    825 	movq	%rdx,%r9
    826 	adcq	$0,%r9
    827 
    828 	mulq	%rbx
    829 	addq	%rax,%r10
    830 	movq	24(%rbp),%rax
    831 	adcq	$0,%rdx
    832 	addq	%r10,%r9
    833 	movq	%rdx,%r10
    834 	adcq	$0,%r10
    835 
    836 	mulq	%rbx
    837 	addq	%rax,%r11
    838 	movq	32(%rbp),%rax
    839 	adcq	$0,%rdx
    840 	addq	%r11,%r10
        /* Reload the multiplier constant; movq does not disturb the carry
           consumed by the adcq that follows. */
    841 	movq	128+8(%rsp),%rsi
    842 
    843 
    844 	adcq	$0,%rdx
    845 	movq	%rdx,%r11
    846 
    847 	mulq	%rbx
    848 	addq	%rax,%r12
    849 	movq	40(%rbp),%rax
    850 	adcq	$0,%rdx
        /* %rsi = new lowest word * constant: the multiplier of the next round. */
    851 	imulq	%r8,%rsi
    852 	addq	%r12,%r11
    853 	movq	%rdx,%r12
    854 	adcq	$0,%r12
    855 
    856 	mulq	%rbx
    857 	addq	%rax,%r13
    858 	movq	48(%rbp),%rax
    859 	adcq	$0,%rdx
    860 	addq	%r13,%r12
    861 	movq	%rdx,%r13
    862 	adcq	$0,%r13
    863 
    864 	mulq	%rbx
    865 	addq	%rax,%r14
    866 	movq	56(%rbp),%rax
    867 	adcq	$0,%rdx
    868 	addq	%r14,%r13
    869 	movq	%rdx,%r14
    870 	adcq	$0,%r14
    871 
    872 	mulq	%rbx
    873 	movq	%rsi,%rbx
    874 	addq	%rax,%r15
    875 	movq	0(%rbp),%rax
    876 	adcq	$0,%rdx
    877 	addq	%r15,%r14
    878 	movq	%rdx,%r15
    879 	adcq	$0,%r15
    880 
    881 	decl	%ecx
    882 	jne	.Lreduction_loop
    883 
        /* 0xf3,0xc3 encodes "rep ret". */
    884 	.byte	0xf3,0xc3
    885 .size	__rsaz_512_reduce,.-__rsaz_512_reduce
    886 .type	__rsaz_512_subtract,@function
    887 .align	32
        /*
         * __rsaz_512_subtract: file-local helper, branch-free masked subtract.
         *
         * In:   %r8..%r15  8-quadword value, least significant word in %r8
         *       %rdi       destination, 8 quadwords
         *       %rbp       pointer to 8 quadwords m[0..7]
         *       %rcx       mask; the callers in this file pass 0 or all-ones
         *                  (produced with sbbq %rcx,%rcx)
         * Out:  (%rdi) and %r8..%r15 = value + (mask AND negated m),
         *       truncated to 512 bits.
         *
         * negq on word 0 and notq on words 1..7 form the two's-complement
         * negation of m only if m[0] is non-zero (presumably guaranteed
         * because the modulus is odd - confirm against callers).
         * The value is first parked at (%rdi) so that %r8..%r15 can be
         * reused for the masked words.
         */
    888 __rsaz_512_subtract:
    889 	movq	%r8,(%rdi)
    890 	movq	%r9,8(%rdi)
    891 	movq	%r10,16(%rdi)
    892 	movq	%r11,24(%rdi)
    893 	movq	%r12,32(%rdi)
    894 	movq	%r13,40(%rdi)
    895 	movq	%r14,48(%rdi)
    896 	movq	%r15,56(%rdi)
    897 
    898 	movq	0(%rbp),%r8
    899 	movq	8(%rbp),%r9
    900 	negq	%r8
    901 	notq	%r9
    902 	andq	%rcx,%r8
    903 	movq	16(%rbp),%r10
    904 	andq	%rcx,%r9
    905 	notq	%r10
    906 	movq	24(%rbp),%r11
    907 	andq	%rcx,%r10
    908 	notq	%r11
    909 	movq	32(%rbp),%r12
    910 	andq	%rcx,%r11
    911 	notq	%r12
    912 	movq	40(%rbp),%r13
    913 	andq	%rcx,%r12
    914 	notq	%r13
    915 	movq	48(%rbp),%r14
    916 	andq	%rcx,%r13
    917 	notq	%r14
    918 	movq	56(%rbp),%r15
    919 	andq	%rcx,%r14
    920 	notq	%r15
    921 	andq	%rcx,%r15
    922 
        /* Add the parked value back in; the final carry out is discarded. */
    923 	addq	(%rdi),%r8
    924 	adcq	8(%rdi),%r9
    925 	adcq	16(%rdi),%r10
    926 	adcq	24(%rdi),%r11
    927 	adcq	32(%rdi),%r12
    928 	adcq	40(%rdi),%r13
    929 	adcq	48(%rdi),%r14
    930 	adcq	56(%rdi),%r15
    931 
    932 	movq	%r8,(%rdi)
    933 	movq	%r9,8(%rdi)
    934 	movq	%r10,16(%rdi)
    935 	movq	%r11,24(%rdi)
    936 	movq	%r12,32(%rdi)
    937 	movq	%r13,40(%rdi)
    938 	movq	%r14,48(%rdi)
    939 	movq	%r15,56(%rdi)
    940 
        /* 0xf3,0xc3 encodes "rep ret". */
    941 	.byte	0xf3,0xc3
    942 .size	__rsaz_512_subtract,.-__rsaz_512_subtract
    943 .type	__rsaz_512_mul,@function
    944 .align	32
        /*
         * __rsaz_512_mul: file-local helper, 8x8-quadword schoolbook multiply.
         *
         * In:   %rsi  first operand a[0..7]
         *       %rbp  second operand b[0..7]
         *       %rbx  b[0], preloaded by the caller
         * Out:  16-quadword product starting at 8(%rsp) as seen from here,
         *       which is the caller's 0(%rsp) (the return address occupies
         *       the first 8 bytes).
         * Clobbers %rax, %rbx, %rcx, %rdx, %rdi, %rbp (advanced by 64),
         * %r8..%r15 and the flags.
         */
    945 __rsaz_512_mul:
    946 	leaq	8(%rsp),%rdi
    947 
        /* First row: a[0..7] * b[0]. Word 0 is stored, the running words
           stay in %r8..%r15. */
    948 	movq	(%rsi),%rax
    949 	mulq	%rbx
    950 	movq	%rax,(%rdi)
    951 	movq	8(%rsi),%rax
    952 	movq	%rdx,%r8
    953 
    954 	mulq	%rbx
    955 	addq	%rax,%r8
    956 	movq	16(%rsi),%rax
    957 	movq	%rdx,%r9
    958 	adcq	$0,%r9
    959 
    960 	mulq	%rbx
    961 	addq	%rax,%r9
    962 	movq	24(%rsi),%rax
    963 	movq	%rdx,%r10
    964 	adcq	$0,%r10
    965 
    966 	mulq	%rbx
    967 	addq	%rax,%r10
    968 	movq	32(%rsi),%rax
    969 	movq	%rdx,%r11
    970 	adcq	$0,%r11
    971 
    972 	mulq	%rbx
    973 	addq	%rax,%r11
    974 	movq	40(%rsi),%rax
    975 	movq	%rdx,%r12
    976 	adcq	$0,%r12
    977 
    978 	mulq	%rbx
    979 	addq	%rax,%r12
    980 	movq	48(%rsi),%rax
    981 	movq	%rdx,%r13
    982 	adcq	$0,%r13
    983 
    984 	mulq	%rbx
    985 	addq	%rax,%r13
    986 	movq	56(%rsi),%rax
    987 	movq	%rdx,%r14
    988 	adcq	$0,%r14
    989 
    990 	mulq	%rbx
    991 	addq	%rax,%r14
    992 	movq	(%rsi),%rax
    993 	movq	%rdx,%r15
    994 	adcq	$0,%r15
    995 
    996 	leaq	8(%rbp),%rbp
    997 	leaq	8(%rdi),%rdi
    998 
        /* Seven more rows, one per remaining word of b. */
    999 	movl	$7,%ecx
   1000 	jmp	.Loop_mul
   1001 
   1002 .align	32
   1003 .Loop_mul:
        /* Per pass: a[0..7] * b[k] is added to the running words and the
           lowest finished word is stored at (%rdi). */
   1004 	movq	(%rbp),%rbx
   1005 	mulq	%rbx
   1006 	addq	%rax,%r8
   1007 	movq	8(%rsi),%rax
   1008 	movq	%r8,(%rdi)
   1009 	movq	%rdx,%r8
   1010 	adcq	$0,%r8
   1011 
   1012 	mulq	%rbx
   1013 	addq	%rax,%r9
   1014 	movq	16(%rsi),%rax
   1015 	adcq	$0,%rdx
   1016 	addq	%r9,%r8
   1017 	movq	%rdx,%r9
   1018 	adcq	$0,%r9
   1019 
   1020 	mulq	%rbx
   1021 	addq	%rax,%r10
   1022 	movq	24(%rsi),%rax
   1023 	adcq	$0,%rdx
   1024 	addq	%r10,%r9
   1025 	movq	%rdx,%r10
   1026 	adcq	$0,%r10
   1027 
   1028 	mulq	%rbx
   1029 	addq	%rax,%r11
   1030 	movq	32(%rsi),%rax
   1031 	adcq	$0,%rdx
   1032 	addq	%r11,%r10
   1033 	movq	%rdx,%r11
   1034 	adcq	$0,%r11
   1035 
   1036 	mulq	%rbx
   1037 	addq	%rax,%r12
   1038 	movq	40(%rsi),%rax
   1039 	adcq	$0,%rdx
   1040 	addq	%r12,%r11
   1041 	movq	%rdx,%r12
   1042 	adcq	$0,%r12
   1043 
   1044 	mulq	%rbx
   1045 	addq	%rax,%r13
   1046 	movq	48(%rsi),%rax
   1047 	adcq	$0,%rdx
   1048 	addq	%r13,%r12
   1049 	movq	%rdx,%r13
   1050 	adcq	$0,%r13
   1051 
   1052 	mulq	%rbx
   1053 	addq	%rax,%r14
   1054 	movq	56(%rsi),%rax
   1055 	adcq	$0,%rdx
   1056 	addq	%r14,%r13
   1057 	movq	%rdx,%r14
        /* leaq advances the b pointer without touching the carry that the
           adcq below still needs. */
   1058 	leaq	8(%rbp),%rbp
   1059 	adcq	$0,%r14
   1060 
   1061 	mulq	%rbx
   1062 	addq	%rax,%r15
   1063 	movq	(%rsi),%rax
   1064 	adcq	$0,%rdx
   1065 	addq	%r15,%r14
   1066 	movq	%rdx,%r15
   1067 	adcq	$0,%r15
   1068 
   1069 	leaq	8(%rdi),%rdi
   1070 
   1071 	decl	%ecx
   1072 	jnz	.Loop_mul
   1073 
        /* Flush the top 8 words of the product. */
   1074 	movq	%r8,(%rdi)
   1075 	movq	%r9,8(%rdi)
   1076 	movq	%r10,16(%rdi)
   1077 	movq	%r11,24(%rdi)
   1078 	movq	%r12,32(%rdi)
   1079 	movq	%r13,40(%rdi)
   1080 	movq	%r14,48(%rdi)
   1081 	movq	%r15,56(%rdi)
   1082 
        /* 0xf3,0xc3 encodes "rep ret". */
   1083 	.byte	0xf3,0xc3
   1084 .size	__rsaz_512_mul,.-__rsaz_512_mul
   .globl	rsaz_512_scatter4
.hidden rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
/*
 * rsaz_512_scatter4: store an 8-quadword value into one slot of a table.
 *
 * In:   %rdi  table base
 *       %rsi  source, 8 quadwords
 *       %edx  32-bit slot index
 * Layout written: limb k has its low 32 bits at base + 4*index + 128*k
 * and its high 32 bits 64 bytes above that.
 * Clobbers %rax, %rdx (upper half cleared), %rsi, %rdi, %r9 and the flags.
 */
rsaz_512_scatter4:
	/* Zero-extend the 32-bit index before it is used in a 64-bit address,
	   as rsaz_512_mul_gather4 and rsaz_512_mul_scatter4 do for theirs:
	   the upper half of %rdx is not guaranteed to be clear for an int
	   argument. */
	movl	%edx,%edx
	leaq	(%rdi,%rdx,4),%rdi
	movl	$8,%r9d			/* 8 limbs to store */
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movl	%eax,(%rdi)		/* low 32 bits */
	shrq	$32,%rax
	movl	%eax,64(%rdi)		/* high 32 bits, 64 bytes above */
	leaq	128(%rdi),%rdi		/* next limb row */
	decl	%r9d
	jnz	.Loop_scatter
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.size	rsaz_512_scatter4,.-rsaz_512_scatter4
   1105 
   .globl	rsaz_512_gather4
.hidden rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
/*
 * rsaz_512_gather4: read an 8-quadword value back from one slot of a table
 * laid out as rsaz_512_scatter4 writes it.
 *
 * In:   %rdi  destination, 8 quadwords
 *       %rsi  table base
 *       %edx  32-bit slot index
 * Layout read: limb k has its low 32 bits at base + 4*index + 128*k and
 * its high 32 bits 64 bytes above that.
 * Clobbers %rax, %rdx (upper half cleared), %rsi, %rdi, %r8, %r9 and the flags.
 *
 * NOTE(review): the load addresses depend on the index. If the index is
 * secret this access pattern may be observable through the cache - verify
 * against the callers' threat model.
 */
rsaz_512_gather4:
	/* Zero-extend the 32-bit index before it is used in a 64-bit address,
	   as rsaz_512_mul_gather4 and rsaz_512_mul_scatter4 do for theirs:
	   the upper half of %rdx is not guaranteed to be clear for an int
	   argument. */
	movl	%edx,%edx
	leaq	(%rsi,%rdx,4),%rsi
	movl	$8,%r9d			/* 8 limbs to fetch */
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movl	(%rsi),%eax		/* low 32 bits, upper half of %rax cleared */
	movl	64(%rsi),%r8d		/* high 32 bits */
	leaq	128(%rsi),%rsi		/* next limb row */
	shlq	$32,%r8
	orq	%r8,%rax
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.size	rsaz_512_gather4,.-rsaz_512_gather4
   1127 #endif
   1128