Home | History | Annotate | Download | only in bn
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 
      5 
      6 .globl	_rsaz_512_sqr
      7 .private_extern _rsaz_512_sqr
        // _rsaz_512_sqr: square an 8-word (512-bit) value, reduce it via
        // __rsaz_512_reduce / __rsaz_512_subtract, and repeat %r8d times.
        // Register arguments as consumed below:
        //   %rdi  result buffer, 8 x 64-bit words (written by __rsaz_512_subtract)
        //   %rsi  input, 8 words; replaced by %rdi after the first pass
        //   %rdx  pointer to 8 words handed to the helpers in %rbp
        //         (presumably the modulus - TODO confirm against the C prototype)
        //   %rcx  64-bit constant stored at 128(%rsp) and multiplied in by
        //         __rsaz_512_reduce (presumably the Montgomery constant - TODO confirm)
        //   %r8d  iteration count, kept at 128+8(%rsp)
        // Stack frame: 0..127(%rsp) holds the 16-word square.
        // Notation in the comments below: a[i] is the 64-bit word at 8*i(%rsi),
        // "word k" is the k-th 64-bit word of the 1024-bit square.
      8 
      9 .p2align	5
     10 _rsaz_512_sqr:
     11 	pushq	%rbx
     12 	pushq	%rbp
     13 	pushq	%r12
     14 	pushq	%r13
     15 	pushq	%r14
     16 	pushq	%r15
     17 
     18 	subq	$128+24,%rsp
     19 L$sqr_body:
     20 	movq	%rdx,%rbp
     21 	movq	(%rsi),%rdx
     22 	movq	8(%rsi),%rax
     23 	movq	%rcx,128(%rsp)
     24 	jmp	L$oop_sqr
     25 
     26 .p2align	5
     27 L$oop_sqr:
        // Loop entry invariant: %rdx = a[0], %rax = a[1].
     28 	movl	%r8d,128+8(%rsp)
     29 
        // Cross products a[0]*a[1..7] -> words 1..8 in %r8..%r15.
     30 	movq	%rdx,%rbx
     31 	mulq	%rdx
     32 	movq	%rax,%r8
     33 	movq	16(%rsi),%rax
     34 	movq	%rdx,%r9
     35 
     36 	mulq	%rbx
     37 	addq	%rax,%r9
     38 	movq	24(%rsi),%rax
     39 	movq	%rdx,%r10
     40 	adcq	$0,%r10
     41 
     42 	mulq	%rbx
     43 	addq	%rax,%r10
     44 	movq	32(%rsi),%rax
     45 	movq	%rdx,%r11
     46 	adcq	$0,%r11
     47 
     48 	mulq	%rbx
     49 	addq	%rax,%r11
     50 	movq	40(%rsi),%rax
     51 	movq	%rdx,%r12
     52 	adcq	$0,%r12
     53 
     54 	mulq	%rbx
     55 	addq	%rax,%r12
     56 	movq	48(%rsi),%rax
     57 	movq	%rdx,%r13
     58 	adcq	$0,%r13
     59 
     60 	mulq	%rbx
     61 	addq	%rax,%r13
     62 	movq	56(%rsi),%rax
     63 	movq	%rdx,%r14
     64 	adcq	$0,%r14
     65 
     66 	mulq	%rbx
     67 	addq	%rax,%r14
     68 	movq	%rbx,%rax
     69 	movq	%rdx,%r15
     70 	adcq	$0,%r15
     71 
        // Double words 1-2; the pre-doubling copy of word 2 goes to %rcx so its
        // top bit can be carried into word 3 later (shrq $63 below).
     72 	addq	%r8,%r8
     73 	movq	%r9,%rcx
     74 	adcq	%r9,%r9
     75 
        // Add a[0]^2 at words 0-1 and store them.
        // NOTE(review): the carry out of the following "adcq $0,%r9" is not
        // propagated anywhere; the same shape recurs after each later squaring
        // term. This looks like the pattern addressed upstream by OpenSSL
        // CVE-2019-1551 - confirm against the fixed rsaz-x86_64.pl.
     76 	mulq	%rax
     77 	movq	%rax,(%rsp)
     78 	addq	%rdx,%r8
     79 	adcq	$0,%r9
     80 
     81 	movq	%r8,8(%rsp)
     82 	shrq	$63,%rcx
     83 
     84 
        // Cross products a[1]*a[2..7] added into words 3..8; new top word 9 -> %r8.
     85 	movq	8(%rsi),%r8
     86 	movq	16(%rsi),%rax
     87 	mulq	%r8
     88 	addq	%rax,%r10
     89 	movq	24(%rsi),%rax
     90 	movq	%rdx,%rbx
     91 	adcq	$0,%rbx
     92 
     93 	mulq	%r8
     94 	addq	%rax,%r11
     95 	movq	32(%rsi),%rax
     96 	adcq	$0,%rdx
     97 	addq	%rbx,%r11
     98 	movq	%rdx,%rbx
     99 	adcq	$0,%rbx
    100 
    101 	mulq	%r8
    102 	addq	%rax,%r12
    103 	movq	40(%rsi),%rax
    104 	adcq	$0,%rdx
    105 	addq	%rbx,%r12
    106 	movq	%rdx,%rbx
    107 	adcq	$0,%rbx
    108 
    109 	mulq	%r8
    110 	addq	%rax,%r13
    111 	movq	48(%rsi),%rax
    112 	adcq	$0,%rdx
    113 	addq	%rbx,%r13
    114 	movq	%rdx,%rbx
    115 	adcq	$0,%rbx
    116 
    117 	mulq	%r8
    118 	addq	%rax,%r14
    119 	movq	56(%rsi),%rax
    120 	adcq	$0,%rdx
    121 	addq	%rbx,%r14
    122 	movq	%rdx,%rbx
    123 	adcq	$0,%rbx
    124 
    125 	mulq	%r8
    126 	addq	%rax,%r15
    127 	movq	%r8,%rax
    128 	adcq	$0,%rdx
    129 	addq	%rbx,%r15
    130 	movq	%rdx,%r8
    131 	movq	%r10,%rdx
    132 	adcq	$0,%r8
    133 
        // Double words 3-4. The addq on the copy in %rdx only produces CF (top
        // bit of word 3); leaq forms 2*word3 + saved bit without touching flags,
        // so the adcq still sees that CF.
    134 	addq	%rdx,%rdx
    135 	leaq	(%rcx,%r10,2),%r10
    136 	movq	%r11,%rbx
    137 	adcq	%r11,%r11
    138 
        // Add a[1]^2 at words 2-4; store words 2-3.
    139 	mulq	%rax
    140 	addq	%rax,%r9
    141 	adcq	%rdx,%r10
    142 	adcq	$0,%r11
    143 
    144 	movq	%r9,16(%rsp)
    145 	movq	%r10,24(%rsp)
    146 	shrq	$63,%rbx
    147 
    148 
        // Cross products a[2]*a[3..7] added into words 5..9; top word 10 -> %r9.
        // Words 5-6 are doubled along the way with leaq (flags untouched).
    149 	movq	16(%rsi),%r9
    150 	movq	24(%rsi),%rax
    151 	mulq	%r9
    152 	addq	%rax,%r12
    153 	movq	32(%rsi),%rax
    154 	movq	%rdx,%rcx
    155 	adcq	$0,%rcx
    156 
    157 	mulq	%r9
    158 	addq	%rax,%r13
    159 	movq	40(%rsi),%rax
    160 	adcq	$0,%rdx
    161 	addq	%rcx,%r13
    162 	movq	%rdx,%rcx
    163 	adcq	$0,%rcx
    164 
    165 	mulq	%r9
    166 	addq	%rax,%r14
    167 	movq	48(%rsi),%rax
    168 	adcq	$0,%rdx
    169 	addq	%rcx,%r14
    170 	movq	%rdx,%rcx
    171 	adcq	$0,%rcx
    172 
    173 	mulq	%r9
    174 	movq	%r12,%r10
    175 	leaq	(%rbx,%r12,2),%r12
    176 	addq	%rax,%r15
    177 	movq	56(%rsi),%rax
    178 	adcq	$0,%rdx
    179 	addq	%rcx,%r15
    180 	movq	%rdx,%rcx
    181 	adcq	$0,%rcx
    182 
    183 	mulq	%r9
    184 	shrq	$63,%r10
    185 	addq	%rax,%r8
    186 	movq	%r9,%rax
    187 	adcq	$0,%rdx
    188 	addq	%rcx,%r8
    189 	movq	%rdx,%r9
    190 	adcq	$0,%r9
    191 
    192 	movq	%r13,%rcx
    193 	leaq	(%r10,%r13,2),%r13
    194 
        // Add a[2]^2 at words 4-6; store words 4-5.
    195 	mulq	%rax
    196 	addq	%rax,%r11
    197 	adcq	%rdx,%r12
    198 	adcq	$0,%r13
    199 
    200 	movq	%r11,32(%rsp)
    201 	movq	%r12,40(%rsp)
    202 	shrq	$63,%rcx
    203 
    204 
        // Cross products a[3]*a[4..7] added into words 7..10; top word 11 -> %r10.
        // Words 7-8 are doubled along the way.
    205 	movq	24(%rsi),%r10
    206 	movq	32(%rsi),%rax
    207 	mulq	%r10
    208 	addq	%rax,%r14
    209 	movq	40(%rsi),%rax
    210 	movq	%rdx,%rbx
    211 	adcq	$0,%rbx
    212 
    213 	mulq	%r10
    214 	addq	%rax,%r15
    215 	movq	48(%rsi),%rax
    216 	adcq	$0,%rdx
    217 	addq	%rbx,%r15
    218 	movq	%rdx,%rbx
    219 	adcq	$0,%rbx
    220 
    221 	mulq	%r10
    222 	movq	%r14,%r12
    223 	leaq	(%rcx,%r14,2),%r14
    224 	addq	%rax,%r8
    225 	movq	56(%rsi),%rax
    226 	adcq	$0,%rdx
    227 	addq	%rbx,%r8
    228 	movq	%rdx,%rbx
    229 	adcq	$0,%rbx
    230 
    231 	mulq	%r10
    232 	shrq	$63,%r12
    233 	addq	%rax,%r9
    234 	movq	%r10,%rax
    235 	adcq	$0,%rdx
    236 	addq	%rbx,%r9
    237 	movq	%rdx,%r10
    238 	adcq	$0,%r10
    239 
    240 	movq	%r15,%rbx
    241 	leaq	(%r12,%r15,2),%r15
    242 
        // Add a[3]^2 at words 6-8; store words 6-7.
    243 	mulq	%rax
    244 	addq	%rax,%r13
    245 	adcq	%rdx,%r14
    246 	adcq	$0,%r15
    247 
    248 	movq	%r13,48(%rsp)
    249 	movq	%r14,56(%rsp)
    250 	shrq	$63,%rbx
    251 
    252 
        // Cross products a[4]*a[5..7] added into words 9..11; top word 12 -> %r11.
        // Words 9-10 are doubled along the way.
    253 	movq	32(%rsi),%r11
    254 	movq	40(%rsi),%rax
    255 	mulq	%r11
    256 	addq	%rax,%r8
    257 	movq	48(%rsi),%rax
    258 	movq	%rdx,%rcx
    259 	adcq	$0,%rcx
    260 
    261 	mulq	%r11
    262 	addq	%rax,%r9
    263 	movq	56(%rsi),%rax
    264 	adcq	$0,%rdx
    265 	movq	%r8,%r12
    266 	leaq	(%rbx,%r8,2),%r8
    267 	addq	%rcx,%r9
    268 	movq	%rdx,%rcx
    269 	adcq	$0,%rcx
    270 
    271 	mulq	%r11
    272 	shrq	$63,%r12
    273 	addq	%rax,%r10
    274 	movq	%r11,%rax
    275 	adcq	$0,%rdx
    276 	addq	%rcx,%r10
    277 	movq	%rdx,%r11
    278 	adcq	$0,%r11
    279 
    280 	movq	%r9,%rcx
    281 	leaq	(%r12,%r9,2),%r9
    282 
        // Add a[4]^2 at words 8-10; store words 8-9.
    283 	mulq	%rax
    284 	addq	%rax,%r15
    285 	adcq	%rdx,%r8
    286 	adcq	$0,%r9
    287 
    288 	movq	%r15,64(%rsp)
    289 	movq	%r8,72(%rsp)
    290 	shrq	$63,%rcx
    291 
    292 
        // Cross products a[5]*a[6..7] added into words 11..12; top word 13 -> %r12.
        // Words 11-12 are doubled along the way.
    293 	movq	40(%rsi),%r12
    294 	movq	48(%rsi),%rax
    295 	mulq	%r12
    296 	addq	%rax,%r10
    297 	movq	56(%rsi),%rax
    298 	movq	%rdx,%rbx
    299 	adcq	$0,%rbx
    300 
    301 	mulq	%r12
    302 	addq	%rax,%r11
    303 	movq	%r12,%rax
    304 	movq	%r10,%r15
    305 	leaq	(%rcx,%r10,2),%r10
    306 	adcq	$0,%rdx
    307 	shrq	$63,%r15
    308 	addq	%rbx,%r11
    309 	movq	%rdx,%r12
    310 	adcq	$0,%r12
    311 
    312 	movq	%r11,%rbx
    313 	leaq	(%r15,%r11,2),%r11
    314 
        // Add a[5]^2 at words 10-12; store words 10-11.
    315 	mulq	%rax
    316 	addq	%rax,%r9
    317 	adcq	%rdx,%r10
    318 	adcq	$0,%r11
    319 
    320 	movq	%r9,80(%rsp)
    321 	movq	%r10,88(%rsp)
    322 
    323 
        // Cross product a[6]*a[7] -> words 13-14 (%r12, %r13).
    324 	movq	48(%rsi),%r13
    325 	movq	56(%rsi),%rax
    326 	mulq	%r13
    327 	addq	%rax,%r12
    328 	movq	%r13,%rax
    329 	movq	%rdx,%r13
    330 	adcq	$0,%r13
    331 
        // shlq on the saved pre-doubling word 12 only produces CF (its top bit);
        // the adcq chain doubles words 13-14 and collects the final carry in %r14.
    332 	xorq	%r14,%r14
    333 	shlq	$1,%rbx
    334 	adcq	%r12,%r12
    335 	adcq	%r13,%r13
    336 	adcq	%r14,%r14
    337 
        // Add a[6]^2 at words 12-14; store words 12-13.
    338 	mulq	%rax
    339 	addq	%rax,%r11
    340 	adcq	%rdx,%r12
    341 	adcq	$0,%r13
    342 
    343 	movq	%r11,96(%rsp)
    344 	movq	%r12,104(%rsp)
    345 
    346 
        // Add a[7]^2 at words 14-15; store them.
    347 	movq	56(%rsi),%rax
    348 	mulq	%rax
    349 	addq	%rax,%r13
    350 	adcq	$0,%rdx
    351 
    352 	addq	%rdx,%r14
    353 
    354 	movq	%r13,112(%rsp)
    355 	movq	%r14,120(%rsp)
    356 
        // Reload the low 8 words of the square for the reduction helper.
    357 	movq	(%rsp),%r8
    358 	movq	8(%rsp),%r9
    359 	movq	16(%rsp),%r10
    360 	movq	24(%rsp),%r11
    361 	movq	32(%rsp),%r12
    362 	movq	40(%rsp),%r13
    363 	movq	48(%rsp),%r14
    364 	movq	56(%rsp),%r15
    365 
    366 	call	__rsaz_512_reduce
    367 
        // Add the high 8 words of the square; %rcx = 0 or all-ones from the
        // carry out, used as a mask by __rsaz_512_subtract.
    368 	addq	64(%rsp),%r8
    369 	adcq	72(%rsp),%r9
    370 	adcq	80(%rsp),%r10
    371 	adcq	88(%rsp),%r11
    372 	adcq	96(%rsp),%r12
    373 	adcq	104(%rsp),%r13
    374 	adcq	112(%rsp),%r14
    375 	adcq	120(%rsp),%r15
    376 	sbbq	%rcx,%rcx
    377 
    378 	call	__rsaz_512_subtract
    379 
        // Set up the next pass: %rdx/%rax = first two result words, the result
        // buffer becomes the input, and the saved counter is reloaded.
    380 	movq	%r8,%rdx
    381 	movq	%r9,%rax
    382 	movl	128+8(%rsp),%r8d
    383 	movq	%rdi,%rsi
    384 
    385 	decl	%r8d
    386 	jnz	L$oop_sqr
    387 
        // Epilogue: %rax points just above the six saved registers; restore
        // them and release the frame.
    388 	leaq	128+24+48(%rsp),%rax
    389 	movq	-48(%rax),%r15
    390 	movq	-40(%rax),%r14
    391 	movq	-32(%rax),%r13
    392 	movq	-24(%rax),%r12
    393 	movq	-16(%rax),%rbp
    394 	movq	-8(%rax),%rbx
    395 	leaq	(%rax),%rsp
    396 L$sqr_epilogue:
    397 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    398 
    399 .globl	_rsaz_512_mul
    400 .private_extern _rsaz_512_mul
        // _rsaz_512_mul: multiply two 8-word values with __rsaz_512_mul, then
        // reduce with __rsaz_512_reduce / __rsaz_512_subtract.
        // Register arguments as consumed below:
        //   %rdi  result buffer (8 words)
        //   %rsi  first operand (8 words)
        //   %rdx  second operand (8 words)
        //   %rcx  pointer passed to the reduce/subtract helpers in %rbp
        //         (presumably the modulus - TODO confirm)
        //   %r8   64-bit constant stored at 128(%rsp) for __rsaz_512_reduce
        // %xmm0 and %xmm1 are used as scratch to keep %rdi and %rcx across the
        // multiply helper.
    401 
    402 .p2align	5
    403 _rsaz_512_mul:
    404 	pushq	%rbx
    405 	pushq	%rbp
    406 	pushq	%r12
    407 	pushq	%r13
    408 	pushq	%r14
    409 	pushq	%r15
    410 
    411 	subq	$128+24,%rsp
    412 L$mul_body:
    413 .byte	102,72,15,110,199	// movq %rdi,%xmm0
    414 .byte	102,72,15,110,201	// movq %rcx,%xmm1
    415 	movq	%r8,128(%rsp)
        // Helper inputs: %rbx = first word of the second operand, %rbp = its address.
    416 	movq	(%rdx),%rbx
    417 	movq	%rdx,%rbp
    418 	call	__rsaz_512_mul
    419 
    420 .byte	102,72,15,126,199	// movq %xmm0,%rdi
    421 .byte	102,72,15,126,205	// movq %xmm1,%rbp
    422 
        // Low 8 words of the 16-word product go to the reduction helper.
    423 	movq	(%rsp),%r8
    424 	movq	8(%rsp),%r9
    425 	movq	16(%rsp),%r10
    426 	movq	24(%rsp),%r11
    427 	movq	32(%rsp),%r12
    428 	movq	40(%rsp),%r13
    429 	movq	48(%rsp),%r14
    430 	movq	56(%rsp),%r15
    431 
    432 	call	__rsaz_512_reduce
        // Add the high 8 words; %rcx = 0 or all-ones from the final carry.
    433 	addq	64(%rsp),%r8
    434 	adcq	72(%rsp),%r9
    435 	adcq	80(%rsp),%r10
    436 	adcq	88(%rsp),%r11
    437 	adcq	96(%rsp),%r12
    438 	adcq	104(%rsp),%r13
    439 	adcq	112(%rsp),%r14
    440 	adcq	120(%rsp),%r15
    441 	sbbq	%rcx,%rcx
    442 
    443 	call	__rsaz_512_subtract
    444 
        // Epilogue: restore the six saved registers and release the frame.
    445 	leaq	128+24+48(%rsp),%rax
    446 	movq	-48(%rax),%r15
    447 	movq	-40(%rax),%r14
    448 	movq	-32(%rax),%r13
    449 	movq	-24(%rax),%r12
    450 	movq	-16(%rax),%rbp
    451 	movq	-8(%rax),%rbx
    452 	leaq	(%rax),%rsp
    453 L$mul_epilogue:
    454 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    455 
    456 .globl	_rsaz_512_mul_gather4
    457 .private_extern _rsaz_512_mul_gather4
        // _rsaz_512_mul_gather4: multiply the 8-word value at %rsi by an 8-word
        // value gathered from an interleaved table, then reduce.
        // Register arguments as consumed below:
        //   %rdi  result buffer (8 words)
        //   %rsi  first operand (8 words)
        //   %rdx  table base
        //   %rcx  pointer passed to the reduce/subtract helpers in %rbp
        //         (presumably the modulus - TODO confirm)
        //   %r8   64-bit constant stored at 128(%rsp) for __rsaz_512_reduce
        //   %r9d  table index
        // Table layout as read here: word k of entry i has its low 32 bits at
        // table + 4*i + 128*k and its high 32 bits 64 bytes after that.
        // NOTE(review): the load addresses depend on the index in %r9; if the
        // index is secret this is an index-dependent access pattern - compare
        // with the upstream constant-time gather (CVE-2016-0702) before reuse.
    458 
    459 .p2align	5
    460 _rsaz_512_mul_gather4:
    461 	pushq	%rbx
    462 	pushq	%rbp
    463 	pushq	%r12
    464 	pushq	%r13
    465 	pushq	%r14
    466 	pushq	%r15
    467 
    468 	movl	%r9d,%r9d	// zero-extend the index to 64 bits
    469 	subq	$128+24,%rsp
    470 L$mul_gather4_body:
        // Assemble table word 0 in %rbx from its two 32-bit halves.
    471 	movl	64(%rdx,%r9,4),%eax
    472 .byte	102,72,15,110,199	// movq %rdi,%xmm0
    473 	movl	(%rdx,%r9,4),%ebx
    474 .byte	102,72,15,110,201	// movq %rcx,%xmm1
    475 	movq	%r8,128(%rsp)
    476 
    477 	shlq	$32,%rax
    478 	orq	%rax,%rbx
    479 	movq	(%rsi),%rax
    480 	movq	8(%rsi),%rcx
    481 	leaq	128(%rdx,%r9,4),%rbp	// %rbp -> low half of table word 1
        // First row: a[0..7] * word 0. Product word 0 is stored at (%rsp);
        // words 1..8 stay in %r8..%r15. The next table word is assembled in
        // %xmm4 in between the multiplies.
    482 	mulq	%rbx
    483 	movq	%rax,(%rsp)
    484 	movq	%rcx,%rax
    485 	movq	%rdx,%r8
    486 
    487 	mulq	%rbx
    488 	movd	(%rbp),%xmm4
    489 	addq	%rax,%r8
    490 	movq	16(%rsi),%rax
    491 	movq	%rdx,%r9
    492 	adcq	$0,%r9
    493 
    494 	mulq	%rbx
    495 	movd	64(%rbp),%xmm5
    496 	addq	%rax,%r9
    497 	movq	24(%rsi),%rax
    498 	movq	%rdx,%r10
    499 	adcq	$0,%r10
    500 
    501 	mulq	%rbx
    502 	pslldq	$4,%xmm5	// move the high half up by 4 bytes
    503 	addq	%rax,%r10
    504 	movq	32(%rsi),%rax
    505 	movq	%rdx,%r11
    506 	adcq	$0,%r11
    507 
    508 	mulq	%rbx
    509 	por	%xmm5,%xmm4	// %xmm4 low 64 bits = next table word
    510 	addq	%rax,%r11
    511 	movq	40(%rsi),%rax
    512 	movq	%rdx,%r12
    513 	adcq	$0,%r12
    514 
    515 	mulq	%rbx
    516 	addq	%rax,%r12
    517 	movq	48(%rsi),%rax
    518 	movq	%rdx,%r13
    519 	adcq	$0,%r13
    520 
    521 	mulq	%rbx
    522 	leaq	128(%rbp),%rbp
    523 	addq	%rax,%r13
    524 	movq	56(%rsi),%rax
    525 	movq	%rdx,%r14
    526 	adcq	$0,%r14
    527 
    528 	mulq	%rbx
    529 .byte	102,72,15,126,227	// movq %xmm4,%rbx (multiplier for the next row)
    530 	addq	%rax,%r14
    531 	movq	(%rsi),%rax
    532 	movq	%rdx,%r15
    533 	adcq	$0,%r15
    534 
    535 	leaq	8(%rsp),%rdi	// %rdi = address of the next product word to store
    536 	movl	$7,%ecx
    537 	jmp	L$oop_mul_gather
    538 
    539 .p2align	5
    540 L$oop_mul_gather:
        // One row per pass: add a[0..7] * %rbx into %r8..%r15, store the
        // finished low word at (%rdi), and shift the window up by one word.
        // NOTE(review): every pass, including the seventh, loads from (%rbp)
        // and 64(%rbp) for the following row; on the last pass that is one
        // 128-byte stride beyond the eighth table word and the value is not
        // used - confirm the table allocation tolerates the read.
    541 	mulq	%rbx
    542 	addq	%rax,%r8
    543 	movq	8(%rsi),%rax
    544 	movq	%r8,(%rdi)
    545 	movq	%rdx,%r8
    546 	adcq	$0,%r8
    547 
    548 	mulq	%rbx
    549 	movd	(%rbp),%xmm4
    550 	addq	%rax,%r9
    551 	movq	16(%rsi),%rax
    552 	adcq	$0,%rdx
    553 	addq	%r9,%r8
    554 	movq	%rdx,%r9
    555 	adcq	$0,%r9
    556 
    557 	mulq	%rbx
    558 	movd	64(%rbp),%xmm5
    559 	addq	%rax,%r10
    560 	movq	24(%rsi),%rax
    561 	adcq	$0,%rdx
    562 	addq	%r10,%r9
    563 	movq	%rdx,%r10
    564 	adcq	$0,%r10
    565 
    566 	mulq	%rbx
    567 	pslldq	$4,%xmm5
    568 	addq	%rax,%r11
    569 	movq	32(%rsi),%rax
    570 	adcq	$0,%rdx
    571 	addq	%r11,%r10
    572 	movq	%rdx,%r11
    573 	adcq	$0,%r11
    574 
    575 	mulq	%rbx
    576 	por	%xmm5,%xmm4
    577 	addq	%rax,%r12
    578 	movq	40(%rsi),%rax
    579 	adcq	$0,%rdx
    580 	addq	%r12,%r11
    581 	movq	%rdx,%r12
    582 	adcq	$0,%r12
    583 
    584 	mulq	%rbx
    585 	addq	%rax,%r13
    586 	movq	48(%rsi),%rax
    587 	adcq	$0,%rdx
    588 	addq	%r13,%r12
    589 	movq	%rdx,%r13
    590 	adcq	$0,%r13
    591 
    592 	mulq	%rbx
    593 	addq	%rax,%r14
    594 	movq	56(%rsi),%rax
    595 	adcq	$0,%rdx
    596 	addq	%r14,%r13
    597 	movq	%rdx,%r14
    598 	adcq	$0,%r14
    599 
    600 	mulq	%rbx
    601 .byte	102,72,15,126,227	// movq %xmm4,%rbx
    602 	addq	%rax,%r15
    603 	movq	(%rsi),%rax
    604 	adcq	$0,%rdx
    605 	addq	%r15,%r14
    606 	movq	%rdx,%r15
    607 	adcq	$0,%r15
    608 
        // leaq/decl leave CF alone, but no carry is live across the loop edge.
    609 	leaq	128(%rbp),%rbp
    610 	leaq	8(%rdi),%rdi
    611 
    612 	decl	%ecx
    613 	jnz	L$oop_mul_gather
    614 
        // Store the remaining high 8 words of the product.
    615 	movq	%r8,(%rdi)
    616 	movq	%r9,8(%rdi)
    617 	movq	%r10,16(%rdi)
    618 	movq	%r11,24(%rdi)
    619 	movq	%r12,32(%rdi)
    620 	movq	%r13,40(%rdi)
    621 	movq	%r14,48(%rdi)
    622 	movq	%r15,56(%rdi)
    623 
    624 .byte	102,72,15,126,199	// movq %xmm0,%rdi
    625 .byte	102,72,15,126,205	// movq %xmm1,%rbp
    626 
        // Low 8 words of the product go to the reduction helper.
    627 	movq	(%rsp),%r8
    628 	movq	8(%rsp),%r9
    629 	movq	16(%rsp),%r10
    630 	movq	24(%rsp),%r11
    631 	movq	32(%rsp),%r12
    632 	movq	40(%rsp),%r13
    633 	movq	48(%rsp),%r14
    634 	movq	56(%rsp),%r15
    635 
    636 	call	__rsaz_512_reduce
        // Add the high 8 words; %rcx = 0 or all-ones from the final carry.
    637 	addq	64(%rsp),%r8
    638 	adcq	72(%rsp),%r9
    639 	adcq	80(%rsp),%r10
    640 	adcq	88(%rsp),%r11
    641 	adcq	96(%rsp),%r12
    642 	adcq	104(%rsp),%r13
    643 	adcq	112(%rsp),%r14
    644 	adcq	120(%rsp),%r15
    645 	sbbq	%rcx,%rcx
    646 
    647 	call	__rsaz_512_subtract
    648 
        // Epilogue: restore the six saved registers and release the frame.
    649 	leaq	128+24+48(%rsp),%rax
    650 	movq	-48(%rax),%r15
    651 	movq	-40(%rax),%r14
    652 	movq	-32(%rax),%r13
    653 	movq	-24(%rax),%r12
    654 	movq	-16(%rax),%rbp
    655 	movq	-8(%rax),%rbx
    656 	leaq	(%rax),%rsp
    657 L$mul_gather4_epilogue:
    658 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    659 
    660 .globl	_rsaz_512_mul_scatter4
    661 .private_extern _rsaz_512_mul_scatter4
        // _rsaz_512_mul_scatter4: multiply, reduce, write the 8-word result to
        // %rdi and also scatter it into an interleaved table.
        // Register arguments as consumed below:
        //   %rdi  in/out buffer: read as the second multiplicand (via %rbp/%rbx)
        //         and overwritten with the result by __rsaz_512_subtract
        //   %rsi  first operand (8 words)
        //   %rdx  pointer passed to the reduce/subtract helpers in %rbp
        //         (presumably the modulus - TODO confirm)
        //   %rcx  64-bit constant stored at 128(%rsp) for __rsaz_512_reduce
        //   %r8   table base
        //   %r9d  table index
        // Table layout as written here: word k has its low 32 bits at
        // table + 4*index + 128*k and its high 32 bits 64 bytes after that.
    662 
    663 .p2align	5
    664 _rsaz_512_mul_scatter4:
    665 	pushq	%rbx
    666 	pushq	%rbp
    667 	pushq	%r12
    668 	pushq	%r13
    669 	pushq	%r14
    670 	pushq	%r15
    671 
    672 	movl	%r9d,%r9d	// zero-extend the index to 64 bits
    673 	subq	$128+24,%rsp
    674 L$mul_scatter4_body:
    675 	leaq	(%r8,%r9,4),%r8	// %r8 = table + 4*index
    676 .byte	102,72,15,110,199	// movq %rdi,%xmm0
    677 .byte	102,72,15,110,202	// movq %rdx,%xmm1
    678 .byte	102,73,15,110,208	// movq %r8,%xmm2
    679 	movq	%rcx,128(%rsp)
    680 
        // Helper inputs: %rbp = address of the second multiplicand (the output
        // buffer), %rbx = its first word.
    681 	movq	%rdi,%rbp
    682 	movq	(%rdi),%rbx
    683 	call	__rsaz_512_mul
    684 
    685 .byte	102,72,15,126,199	// movq %xmm0,%rdi
    686 .byte	102,72,15,126,205	// movq %xmm1,%rbp
    687 
        // Low 8 words of the product go to the reduction helper.
    688 	movq	(%rsp),%r8
    689 	movq	8(%rsp),%r9
    690 	movq	16(%rsp),%r10
    691 	movq	24(%rsp),%r11
    692 	movq	32(%rsp),%r12
    693 	movq	40(%rsp),%r13
    694 	movq	48(%rsp),%r14
    695 	movq	56(%rsp),%r15
    696 
    697 	call	__rsaz_512_reduce
        // Add the high 8 words; the movq between adcq and sbbq does not touch
        // flags, so %rcx still becomes 0 or all-ones from the final carry.
    698 	addq	64(%rsp),%r8
    699 	adcq	72(%rsp),%r9
    700 	adcq	80(%rsp),%r10
    701 	adcq	88(%rsp),%r11
    702 	adcq	96(%rsp),%r12
    703 	adcq	104(%rsp),%r13
    704 	adcq	112(%rsp),%r14
    705 	adcq	120(%rsp),%r15
    706 .byte	102,72,15,126,214	// movq %xmm2,%rsi (table + 4*index)
    707 	sbbq	%rcx,%rcx
    708 
    709 	call	__rsaz_512_subtract
    710 
        // Scatter: low halves at stride 128, then high halves 64 bytes further.
    711 	movl	%r8d,0(%rsi)
    712 	shrq	$32,%r8
    713 	movl	%r9d,128(%rsi)
    714 	shrq	$32,%r9
    715 	movl	%r10d,256(%rsi)
    716 	shrq	$32,%r10
    717 	movl	%r11d,384(%rsi)
    718 	shrq	$32,%r11
    719 	movl	%r12d,512(%rsi)
    720 	shrq	$32,%r12
    721 	movl	%r13d,640(%rsi)
    722 	shrq	$32,%r13
    723 	movl	%r14d,768(%rsi)
    724 	shrq	$32,%r14
    725 	movl	%r15d,896(%rsi)
    726 	shrq	$32,%r15
    727 	movl	%r8d,64(%rsi)
    728 	movl	%r9d,192(%rsi)
    729 	movl	%r10d,320(%rsi)
    730 	movl	%r11d,448(%rsi)
    731 	movl	%r12d,576(%rsi)
    732 	movl	%r13d,704(%rsi)
    733 	movl	%r14d,832(%rsi)
    734 	movl	%r15d,960(%rsi)
    735 
        // Epilogue: restore the six saved registers and release the frame.
    736 	leaq	128+24+48(%rsp),%rax
    737 	movq	-48(%rax),%r15
    738 	movq	-40(%rax),%r14
    739 	movq	-32(%rax),%r13
    740 	movq	-24(%rax),%r12
    741 	movq	-16(%rax),%rbp
    742 	movq	-8(%rax),%rbx
    743 	leaq	(%rax),%rsp
    744 L$mul_scatter4_epilogue:
    745 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    746 
    747 .globl	_rsaz_512_mul_by_one
    748 .private_extern _rsaz_512_mul_by_one
        // _rsaz_512_mul_by_one: run __rsaz_512_reduce on an 8-word input and
        // store the 8-word result, with no multiply and no final subtract.
        // Register arguments as consumed below:
        //   %rdi  result buffer (8 words)
        //   %rsi  input (8 words)
        //   %rdx  pointer passed to __rsaz_512_reduce in %rbp
        //         (presumably the modulus - TODO confirm)
        //   %rcx  64-bit constant stored at 128(%rsp) for __rsaz_512_reduce
        // Clobbers %xmm0.
    749 
    750 .p2align	5
    751 _rsaz_512_mul_by_one:
    752 	pushq	%rbx
    753 	pushq	%rbp
    754 	pushq	%r12
    755 	pushq	%r13
    756 	pushq	%r14
    757 	pushq	%r15
    758 
    759 	subq	$128+24,%rsp
    760 L$mul_by_one_body:
    761 	movq	%rdx,%rbp
    762 	movq	%rcx,128(%rsp)
    763 
        // Load the input into %r8..%r15, the registers the helper works on.
    764 	movq	(%rsi),%r8
    765 	pxor	%xmm0,%xmm0
    766 	movq	8(%rsi),%r9
    767 	movq	16(%rsi),%r10
    768 	movq	24(%rsi),%r11
    769 	movq	32(%rsi),%r12
    770 	movq	40(%rsi),%r13
    771 	movq	48(%rsi),%r14
    772 	movq	56(%rsi),%r15
    773 
        // Zero bytes 0..111 of the scratch area. movdqa needs a 16-byte aligned
        // address, so this relies on %rsp being 16-byte aligned at this point.
    774 	movdqa	%xmm0,(%rsp)
    775 	movdqa	%xmm0,16(%rsp)
    776 	movdqa	%xmm0,32(%rsp)
    777 	movdqa	%xmm0,48(%rsp)
    778 	movdqa	%xmm0,64(%rsp)
    779 	movdqa	%xmm0,80(%rsp)
    780 	movdqa	%xmm0,96(%rsp)
    781 	call	__rsaz_512_reduce
    782 	movq	%r8,(%rdi)
    783 	movq	%r9,8(%rdi)
    784 	movq	%r10,16(%rdi)
    785 	movq	%r11,24(%rdi)
    786 	movq	%r12,32(%rdi)
    787 	movq	%r13,40(%rdi)
    788 	movq	%r14,48(%rdi)
    789 	movq	%r15,56(%rdi)
    790 
        // Epilogue: restore the six saved registers and release the frame.
    791 	leaq	128+24+48(%rsp),%rax
    792 	movq	-48(%rax),%r15
    793 	movq	-40(%rax),%r14
    794 	movq	-32(%rax),%r13
    795 	movq	-24(%rax),%r12
    796 	movq	-16(%rax),%rbp
    797 	movq	-8(%rax),%rbx
    798 	leaq	(%rax),%rsp
    799 L$mul_by_one_epilogue:
    800 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    801 
    802 
    803 .p2align	5
        // __rsaz_512_reduce: internal helper with a private register contract.
        // In:   %r8..%r15  8-word value (low word in %r8)
        //       %rbp       pointer to 8 words n[0..7]
        //       128+8(%rsp) 64-bit constant; because of the pushed return
        //                  address this is the caller's 128(%rsp) slot
        // Out:  %r8..%r15  value after eight rounds
        // Clobbers %rax, %rbx, %rcx, %rdx, %rsi and flags.
        // Each round adds n[0..7] * %rbx to the value and drops the low word,
        // with %rbx = low word * constant. This has the shape of word-by-word
        // Montgomery reduction - presumably the constant is -n^-1 mod 2^64
        // (TODO confirm with the caller).
    804 __rsaz_512_reduce:
    805 	movq	%r8,%rbx
    806 	imulq	128+8(%rsp),%rbx
    807 	movq	0(%rbp),%rax
    808 	movl	$8,%ecx
    809 	jmp	L$reduction_loop
    810 
    811 .p2align	5
    812 L$reduction_loop:
        // The low product word is never added explicitly: negq sets CF when
        // %r8 is non-zero and only that carry is folded into the next word.
        // This assumes %r8 + low(n[0]*%rbx) is 0 mod 2^64 - depends on the
        // constant, TODO confirm.
    813 	mulq	%rbx
    814 	movq	8(%rbp),%rax
    815 	negq	%r8
    816 	movq	%rdx,%r8
    817 	adcq	$0,%r8
    818 
    819 	mulq	%rbx
    820 	addq	%rax,%r9
    821 	movq	16(%rbp),%rax
    822 	adcq	$0,%rdx
    823 	addq	%r9,%r8
    824 	movq	%rdx,%r9
    825 	adcq	$0,%r9
    826 
    827 	mulq	%rbx
    828 	addq	%rax,%r10
    829 	movq	24(%rbp),%rax
    830 	adcq	$0,%rdx
    831 	addq	%r10,%r9
    832 	movq	%rdx,%r10
    833 	adcq	$0,%r10
    834 
    835 	mulq	%rbx
    836 	addq	%rax,%r11
    837 	movq	32(%rbp),%rax
    838 	adcq	$0,%rdx
    839 	addq	%r11,%r10
        // Reload the constant into %rsi; movq leaves the pending carry intact.
    840 	movq	128+8(%rsp),%rsi
    841 
    842 
    843 	adcq	$0,%rdx
    844 	movq	%rdx,%r11
    845 
    846 	mulq	%rbx
    847 	addq	%rax,%r12
    848 	movq	40(%rbp),%rax
    849 	adcq	$0,%rdx
        // %rsi = new low word * constant = multiplier for the next round.
        // imulq rewrites CF, but the carry from the addq above was already
        // consumed by the preceding adcq.
    850 	imulq	%r8,%rsi
    851 	addq	%r12,%r11
    852 	movq	%rdx,%r12
    853 	adcq	$0,%r12
    854 
    855 	mulq	%rbx
    856 	addq	%rax,%r13
    857 	movq	48(%rbp),%rax
    858 	adcq	$0,%rdx
    859 	addq	%r13,%r12
    860 	movq	%rdx,%r13
    861 	adcq	$0,%r13
    862 
    863 	mulq	%rbx
    864 	addq	%rax,%r14
    865 	movq	56(%rbp),%rax
    866 	adcq	$0,%rdx
    867 	addq	%r14,%r13
    868 	movq	%rdx,%r14
    869 	adcq	$0,%r14
    870 
    871 	mulq	%rbx
    872 	movq	%rsi,%rbx	// install the next round's multiplier
    873 	addq	%rax,%r15
    874 	movq	0(%rbp),%rax
    875 	adcq	$0,%rdx
    876 	addq	%r15,%r14
    877 	movq	%rdx,%r15
    878 	adcq	$0,%r15
    879 
    880 	decl	%ecx
    881 	jne	L$reduction_loop
    882 
    883 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    884 
    885 
    886 .p2align	5
        // __rsaz_512_subtract: internal helper with a private register contract.
        // In:   %r8..%r15  8-word value
        //       %rdi       8-word output buffer
        //       %rbp       pointer to 8 words n[0..7]
        //       %rcx       mask, 0 or all-ones (callers build it with sbbq)
        // Out:  (%rdi) and %r8..%r15 = value + ((negated n) AND mask), i.e. the
        //       value unchanged for mask 0, or value - n modulo 2^512 for an
        //       all-ones mask.
        // The same instruction sequence runs for either mask value; there is
        // no branch on %rcx.
    887 __rsaz_512_subtract:
        // Park the value in the output buffer to free %r8..%r15.
    888 	movq	%r8,(%rdi)
    889 	movq	%r9,8(%rdi)
    890 	movq	%r10,16(%rdi)
    891 	movq	%r11,24(%rdi)
    892 	movq	%r12,32(%rdi)
    893 	movq	%r13,40(%rdi)
    894 	movq	%r14,48(%rdi)
    895 	movq	%r15,56(%rdi)
    896 
        // Build the masked negation of n: negq on word 0, notq on words 1..7.
        // That equals the two's-complement negation only when n[0] is non-zero
        // (e.g. an odd n) - presumably guaranteed by the caller, TODO confirm.
    897 	movq	0(%rbp),%r8
    898 	movq	8(%rbp),%r9
    899 	negq	%r8
    900 	notq	%r9
    901 	andq	%rcx,%r8
    902 	movq	16(%rbp),%r10
    903 	andq	%rcx,%r9
    904 	notq	%r10
    905 	movq	24(%rbp),%r11
    906 	andq	%rcx,%r10
    907 	notq	%r11
    908 	movq	32(%rbp),%r12
    909 	andq	%rcx,%r11
    910 	notq	%r12
    911 	movq	40(%rbp),%r13
    912 	andq	%rcx,%r12
    913 	notq	%r13
    914 	movq	48(%rbp),%r14
    915 	andq	%rcx,%r13
    916 	notq	%r14
    917 	movq	56(%rbp),%r15
    918 	andq	%rcx,%r14
    919 	notq	%r15
    920 	andq	%rcx,%r15
    921 
        // Add the parked value back in; the carry out of the top word is dropped.
    922 	addq	(%rdi),%r8
    923 	adcq	8(%rdi),%r9
    924 	adcq	16(%rdi),%r10
    925 	adcq	24(%rdi),%r11
    926 	adcq	32(%rdi),%r12
    927 	adcq	40(%rdi),%r13
    928 	adcq	48(%rdi),%r14
    929 	adcq	56(%rdi),%r15
    930 
    931 	movq	%r8,(%rdi)
    932 	movq	%r9,8(%rdi)
    933 	movq	%r10,16(%rdi)
    934 	movq	%r11,24(%rdi)
    935 	movq	%r12,32(%rdi)
    936 	movq	%r13,40(%rdi)
    937 	movq	%r14,48(%rdi)
    938 	movq	%r15,56(%rdi)
    939 
    940 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
    941 
    942 
    943 .p2align	5
        // __rsaz_512_mul: internal helper with a private register contract.
        // Schoolbook 8x8-word multiply.
        // In:   %rsi  first operand a[0..7]
        //       %rbp  second operand b[0..7]
        //       %rbx  b[0] (preloaded by the caller)
        // Out:  16-word product in the caller's 0..127(%rsp); the helper
        //       addresses it from 8(%rsp) because of the pushed return address.
        //       The high 8 words are also left in %r8..%r15.
        // Clobbers %rax, %rbx, %rcx, %rdx, %rdi, %rbp and flags.
    944 __rsaz_512_mul:
    945 	leaq	8(%rsp),%rdi
    946 
        // First row: a[0..7] * b[0]. Word 0 is stored; words 1..8 stay in
        // %r8..%r15.
    947 	movq	(%rsi),%rax
    948 	mulq	%rbx
    949 	movq	%rax,(%rdi)
    950 	movq	8(%rsi),%rax
    951 	movq	%rdx,%r8
    952 
    953 	mulq	%rbx
    954 	addq	%rax,%r8
    955 	movq	16(%rsi),%rax
    956 	movq	%rdx,%r9
    957 	adcq	$0,%r9
    958 
    959 	mulq	%rbx
    960 	addq	%rax,%r9
    961 	movq	24(%rsi),%rax
    962 	movq	%rdx,%r10
    963 	adcq	$0,%r10
    964 
    965 	mulq	%rbx
    966 	addq	%rax,%r10
    967 	movq	32(%rsi),%rax
    968 	movq	%rdx,%r11
    969 	adcq	$0,%r11
    970 
    971 	mulq	%rbx
    972 	addq	%rax,%r11
    973 	movq	40(%rsi),%rax
    974 	movq	%rdx,%r12
    975 	adcq	$0,%r12
    976 
    977 	mulq	%rbx
    978 	addq	%rax,%r12
    979 	movq	48(%rsi),%rax
    980 	movq	%rdx,%r13
    981 	adcq	$0,%r13
    982 
    983 	mulq	%rbx
    984 	addq	%rax,%r13
    985 	movq	56(%rsi),%rax
    986 	movq	%rdx,%r14
    987 	adcq	$0,%r14
    988 
    989 	mulq	%rbx
    990 	addq	%rax,%r14
    991 	movq	(%rsi),%rax
    992 	movq	%rdx,%r15
    993 	adcq	$0,%r15
    994 
        // Advance to b[1] and to the next output word.
    995 	leaq	8(%rbp),%rbp
    996 	leaq	8(%rdi),%rdi
    997 
    998 	movl	$7,%ecx
    999 	jmp	L$oop_mul
   1000 
   1001 .p2align	5
   1002 L$oop_mul:
        // One row per pass for b[1..7]: add a[0..7] * %rbx into %r8..%r15,
        // store the finished low word at (%rdi), and shift the window up by one
        // word. %rax already holds a[0] on entry to each pass.
   1003 	movq	(%rbp),%rbx
   1004 	mulq	%rbx
   1005 	addq	%rax,%r8
   1006 	movq	8(%rsi),%rax
   1007 	movq	%r8,(%rdi)
   1008 	movq	%rdx,%r8
   1009 	adcq	$0,%r8
   1010 
   1011 	mulq	%rbx
   1012 	addq	%rax,%r9
   1013 	movq	16(%rsi),%rax
   1014 	adcq	$0,%rdx
   1015 	addq	%r9,%r8
   1016 	movq	%rdx,%r9
   1017 	adcq	$0,%r9
   1018 
   1019 	mulq	%rbx
   1020 	addq	%rax,%r10
   1021 	movq	24(%rsi),%rax
   1022 	adcq	$0,%rdx
   1023 	addq	%r10,%r9
   1024 	movq	%rdx,%r10
   1025 	adcq	$0,%r10
   1026 
   1027 	mulq	%rbx
   1028 	addq	%rax,%r11
   1029 	movq	32(%rsi),%rax
   1030 	adcq	$0,%rdx
   1031 	addq	%r11,%r10
   1032 	movq	%rdx,%r11
   1033 	adcq	$0,%r11
   1034 
   1035 	mulq	%rbx
   1036 	addq	%rax,%r12
   1037 	movq	40(%rsi),%rax
   1038 	adcq	$0,%rdx
   1039 	addq	%r12,%r11
   1040 	movq	%rdx,%r12
   1041 	adcq	$0,%r12
   1042 
   1043 	mulq	%rbx
   1044 	addq	%rax,%r13
   1045 	movq	48(%rsi),%rax
   1046 	adcq	$0,%rdx
   1047 	addq	%r13,%r12
   1048 	movq	%rdx,%r13
   1049 	adcq	$0,%r13
   1050 
   1051 	mulq	%rbx
   1052 	addq	%rax,%r14
   1053 	movq	56(%rsi),%rax
   1054 	adcq	$0,%rdx
   1055 	addq	%r14,%r13
   1056 	movq	%rdx,%r14
        // leaq advances to the next b word without disturbing the pending
        // carry consumed by the adcq below.
   1057 	leaq	8(%rbp),%rbp
   1058 	adcq	$0,%r14
   1059 
   1060 	mulq	%rbx
   1061 	addq	%rax,%r15
   1062 	movq	(%rsi),%rax
   1063 	adcq	$0,%rdx
   1064 	addq	%r15,%r14
   1065 	movq	%rdx,%r15
   1066 	adcq	$0,%r15
   1067 
   1068 	leaq	8(%rdi),%rdi
   1069 
   1070 	decl	%ecx
   1071 	jnz	L$oop_mul
   1072 
        // Store the remaining high 8 words of the product.
   1073 	movq	%r8,(%rdi)
   1074 	movq	%r9,8(%rdi)
   1075 	movq	%r10,16(%rdi)
   1076 	movq	%r11,24(%rdi)
   1077 	movq	%r12,32(%rdi)
   1078 	movq	%r13,40(%rdi)
   1079 	movq	%r14,48(%rdi)
   1080 	movq	%r15,56(%rdi)
   1081 
   1082 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
   1083 
   1084 .globl	_rsaz_512_scatter4
   1085 .private_extern _rsaz_512_scatter4
        // _rsaz_512_scatter4: copy an 8-word value into an interleaved table.
        // Register arguments as consumed below:
        //   %rdi  table base
        //   %rsi  source (8 x 64-bit words)
        //   %rdx  table index (used as a full 64-bit value)
        // Word k is split into 32-bit halves: the low half goes to
        // table + 4*index + 128*k and the high half 64 bytes after that.
        // Leaf routine; uses only %rax, %r9, %rsi, %rdi and flags.
   1086 
   1087 .p2align	4
   1088 _rsaz_512_scatter4:
   1089 	leaq	(%rdi,%rdx,4),%rdi	// %rdi = table + 4*index
   1090 	movl	$8,%r9d	// eight words to write
   1091 	jmp	L$oop_scatter
   1092 .p2align	4
   1093 L$oop_scatter:
   1094 	movq	(%rsi),%rax
   1095 	leaq	8(%rsi),%rsi
   1096 	movl	%eax,(%rdi)	// low 32 bits
   1097 	shrq	$32,%rax
   1098 	movl	%eax,64(%rdi)	// high 32 bits
   1099 	leaq	128(%rdi),%rdi	// next word's slot
   1100 	decl	%r9d
   1101 	jnz	L$oop_scatter
   1102 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
   1103 
   1104 
   1105 .globl	_rsaz_512_gather4
   1106 .private_extern _rsaz_512_gather4
        // _rsaz_512_gather4: read an 8-word value back out of an interleaved table.
        // Register arguments as consumed below:
        //   %rdi  destination (8 x 64-bit words)
        //   %rsi  table base
        //   %rdx  table index (used as a full 64-bit value)
        // Word k is rebuilt from the 32-bit half at table + 4*index + 128*k
        // (low) and the half 64 bytes after it (high).
        // Leaf routine; uses only %rax, %r8, %r9, %rsi, %rdi and flags.
        // NOTE(review): the load addresses depend on the index; if the index is
        // secret this is an index-dependent access pattern - compare with the
        // upstream constant-time gather (CVE-2016-0702) before reuse.
   1107 
   1108 .p2align	4
   1109 _rsaz_512_gather4:
   1110 	leaq	(%rsi,%rdx,4),%rsi	// %rsi = table + 4*index
   1111 	movl	$8,%r9d	// eight words to read
   1112 	jmp	L$oop_gather
   1113 .p2align	4
   1114 L$oop_gather:
   1115 	movl	(%rsi),%eax	// low 32 bits; the 32-bit load zero-extends into %rax
   1116 	movl	64(%rsi),%r8d	// high 32 bits
   1117 	leaq	128(%rsi),%rsi	// next word's slot
   1118 	shlq	$32,%r8
   1119 	orq	%r8,%rax
   1120 	movq	%rax,(%rdi)
   1121 	leaq	8(%rdi),%rdi
   1122 	decl	%r9d
   1123 	jnz	L$oop_gather
   1124 	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
   1125 
   1126 #endif
   1127