# bn/x86_64-mont5 — x86-64 Montgomery multiplication with constant-time gather
# (perlasm-generated output; AT&T syntax, Mach-O symbol conventions)
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 
      5 
      6 .globl	_bn_mul_mont_gather5
      7 .private_extern _bn_mul_mont_gather5
      8 
      9 .p2align	6
     10 _bn_mul_mont_gather5:
     11 	testl	$7,%r9d
     12 	jnz	L$mul_enter
     13 	jmp	L$mul4x_enter
     14 
     15 .p2align	4
     16 L$mul_enter:
     17 	movl	%r9d,%r9d
     18 	movq	%rsp,%rax
     19 	movl	8(%rsp),%r10d
     20 	pushq	%rbx
     21 	pushq	%rbp
     22 	pushq	%r12
     23 	pushq	%r13
     24 	pushq	%r14
     25 	pushq	%r15
     26 	leaq	2(%r9),%r11
     27 	negq	%r11
     28 	leaq	(%rsp,%r11,8),%rsp
     29 	andq	$-1024,%rsp
     30 
     31 	movq	%rax,8(%rsp,%r9,8)
     32 L$mul_body:
     33 	movq	%rdx,%r12
     34 	movq	%r10,%r11
     35 	shrq	$3,%r10
     36 	andq	$7,%r11
     37 	notq	%r10
     38 	leaq	L$magic_masks(%rip),%rax
     39 	andq	$3,%r10
     40 	leaq	96(%r12,%r11,8),%r12
     41 	movq	0(%rax,%r10,8),%xmm4
     42 	movq	8(%rax,%r10,8),%xmm5
     43 	movq	16(%rax,%r10,8),%xmm6
     44 	movq	24(%rax,%r10,8),%xmm7
     45 
     46 	movq	-96(%r12),%xmm0
     47 	movq	-32(%r12),%xmm1
     48 	pand	%xmm4,%xmm0
     49 	movq	32(%r12),%xmm2
     50 	pand	%xmm5,%xmm1
     51 	movq	96(%r12),%xmm3
     52 	pand	%xmm6,%xmm2
     53 	por	%xmm1,%xmm0
     54 	pand	%xmm7,%xmm3
     55 	por	%xmm2,%xmm0
     56 	leaq	256(%r12),%r12
     57 	por	%xmm3,%xmm0
     58 
     59 .byte	102,72,15,126,195
     60 
     61 	movq	(%r8),%r8
     62 	movq	(%rsi),%rax
     63 
     64 	xorq	%r14,%r14
     65 	xorq	%r15,%r15
     66 
     67 	movq	-96(%r12),%xmm0
     68 	movq	-32(%r12),%xmm1
     69 	pand	%xmm4,%xmm0
     70 	movq	32(%r12),%xmm2
     71 	pand	%xmm5,%xmm1
     72 
     73 	movq	%r8,%rbp
     74 	mulq	%rbx
     75 	movq	%rax,%r10
     76 	movq	(%rcx),%rax
     77 
     78 	movq	96(%r12),%xmm3
     79 	pand	%xmm6,%xmm2
     80 	por	%xmm1,%xmm0
     81 	pand	%xmm7,%xmm3
     82 
     83 	imulq	%r10,%rbp
     84 	movq	%rdx,%r11
     85 
     86 	por	%xmm2,%xmm0
     87 	leaq	256(%r12),%r12
     88 	por	%xmm3,%xmm0
     89 
     90 	mulq	%rbp
     91 	addq	%rax,%r10
     92 	movq	8(%rsi),%rax
     93 	adcq	$0,%rdx
     94 	movq	%rdx,%r13
     95 
     96 	leaq	1(%r15),%r15
     97 	jmp	L$1st_enter
     98 
     99 .p2align	4
    100 L$1st:
    101 	addq	%rax,%r13
    102 	movq	(%rsi,%r15,8),%rax
    103 	adcq	$0,%rdx
    104 	addq	%r11,%r13
    105 	movq	%r10,%r11
    106 	adcq	$0,%rdx
    107 	movq	%r13,-16(%rsp,%r15,8)
    108 	movq	%rdx,%r13
    109 
    110 L$1st_enter:
    111 	mulq	%rbx
    112 	addq	%rax,%r11
    113 	movq	(%rcx,%r15,8),%rax
    114 	adcq	$0,%rdx
    115 	leaq	1(%r15),%r15
    116 	movq	%rdx,%r10
    117 
    118 	mulq	%rbp
    119 	cmpq	%r9,%r15
    120 	jne	L$1st
    121 
    122 .byte	102,72,15,126,195
    123 
    124 	addq	%rax,%r13
    125 	movq	(%rsi),%rax
    126 	adcq	$0,%rdx
    127 	addq	%r11,%r13
    128 	adcq	$0,%rdx
    129 	movq	%r13,-16(%rsp,%r15,8)
    130 	movq	%rdx,%r13
    131 	movq	%r10,%r11
    132 
    133 	xorq	%rdx,%rdx
    134 	addq	%r11,%r13
    135 	adcq	$0,%rdx
    136 	movq	%r13,-8(%rsp,%r9,8)
    137 	movq	%rdx,(%rsp,%r9,8)
    138 
    139 	leaq	1(%r14),%r14
    140 	jmp	L$outer
    141 .p2align	4
    142 L$outer:
    143 	xorq	%r15,%r15
    144 	movq	%r8,%rbp
    145 	movq	(%rsp),%r10
    146 
    147 	movq	-96(%r12),%xmm0
    148 	movq	-32(%r12),%xmm1
    149 	pand	%xmm4,%xmm0
    150 	movq	32(%r12),%xmm2
    151 	pand	%xmm5,%xmm1
    152 
    153 	mulq	%rbx
    154 	addq	%rax,%r10
    155 	movq	(%rcx),%rax
    156 	adcq	$0,%rdx
    157 
    158 	movq	96(%r12),%xmm3
    159 	pand	%xmm6,%xmm2
    160 	por	%xmm1,%xmm0
    161 	pand	%xmm7,%xmm3
    162 
    163 	imulq	%r10,%rbp
    164 	movq	%rdx,%r11
    165 
    166 	por	%xmm2,%xmm0
    167 	leaq	256(%r12),%r12
    168 	por	%xmm3,%xmm0
    169 
    170 	mulq	%rbp
    171 	addq	%rax,%r10
    172 	movq	8(%rsi),%rax
    173 	adcq	$0,%rdx
    174 	movq	8(%rsp),%r10
    175 	movq	%rdx,%r13
    176 
    177 	leaq	1(%r15),%r15
    178 	jmp	L$inner_enter
    179 
    180 .p2align	4
    181 L$inner:
    182 	addq	%rax,%r13
    183 	movq	(%rsi,%r15,8),%rax
    184 	adcq	$0,%rdx
    185 	addq	%r10,%r13
    186 	movq	(%rsp,%r15,8),%r10
    187 	adcq	$0,%rdx
    188 	movq	%r13,-16(%rsp,%r15,8)
    189 	movq	%rdx,%r13
    190 
    191 L$inner_enter:
    192 	mulq	%rbx
    193 	addq	%rax,%r11
    194 	movq	(%rcx,%r15,8),%rax
    195 	adcq	$0,%rdx
    196 	addq	%r11,%r10
    197 	movq	%rdx,%r11
    198 	adcq	$0,%r11
    199 	leaq	1(%r15),%r15
    200 
    201 	mulq	%rbp
    202 	cmpq	%r9,%r15
    203 	jne	L$inner
    204 
    205 .byte	102,72,15,126,195
    206 
    207 	addq	%rax,%r13
    208 	movq	(%rsi),%rax
    209 	adcq	$0,%rdx
    210 	addq	%r10,%r13
    211 	movq	(%rsp,%r15,8),%r10
    212 	adcq	$0,%rdx
    213 	movq	%r13,-16(%rsp,%r15,8)
    214 	movq	%rdx,%r13
    215 
    216 	xorq	%rdx,%rdx
    217 	addq	%r11,%r13
    218 	adcq	$0,%rdx
    219 	addq	%r10,%r13
    220 	adcq	$0,%rdx
    221 	movq	%r13,-8(%rsp,%r9,8)
    222 	movq	%rdx,(%rsp,%r9,8)
    223 
    224 	leaq	1(%r14),%r14
    225 	cmpq	%r9,%r14
    226 	jb	L$outer
    227 
    228 	xorq	%r14,%r14
    229 	movq	(%rsp),%rax
    230 	leaq	(%rsp),%rsi
    231 	movq	%r9,%r15
    232 	jmp	L$sub
    233 .p2align	4
    234 L$sub:	sbbq	(%rcx,%r14,8),%rax
    235 	movq	%rax,(%rdi,%r14,8)
    236 	movq	8(%rsi,%r14,8),%rax
    237 	leaq	1(%r14),%r14
    238 	decq	%r15
    239 	jnz	L$sub
    240 
    241 	sbbq	$0,%rax
    242 	xorq	%r14,%r14
    243 	movq	%r9,%r15
    244 .p2align	4
    245 L$copy:
    246 	movq	(%rsp,%r14,8),%rsi
    247 	movq	(%rdi,%r14,8),%rcx
    248 	xorq	%rcx,%rsi
    249 	andq	%rax,%rsi
    250 	xorq	%rcx,%rsi
    251 	movq	%r14,(%rsp,%r14,8)
    252 	movq	%rsi,(%rdi,%r14,8)
    253 	leaq	1(%r14),%r14
    254 	subq	$1,%r15
    255 	jnz	L$copy
    256 
    257 	movq	8(%rsp,%r9,8),%rsi
    258 	movq	$1,%rax
    259 	movq	-48(%rsi),%r15
    260 	movq	-40(%rsi),%r14
    261 	movq	-32(%rsi),%r13
    262 	movq	-24(%rsi),%r12
    263 	movq	-16(%rsi),%rbp
    264 	movq	-8(%rsi),%rbx
    265 	leaq	(%rsi),%rsp
    266 L$mul_epilogue:
    267 	.byte	0xf3,0xc3
    268 
    269 
    270 .p2align	5
    271 bn_mul4x_mont_gather5:
    272 L$mul4x_enter:
    273 .byte	0x67
    274 	movq	%rsp,%rax
    275 	pushq	%rbx
    276 	pushq	%rbp
    277 	pushq	%r12
    278 	pushq	%r13
    279 	pushq	%r14
    280 	pushq	%r15
    281 .byte	0x67
    282 	movl	%r9d,%r10d
    283 	shll	$3,%r9d
    284 	shll	$3+2,%r10d
    285 	negq	%r9
    286 
    287 
    288 
    289 
    290 
    291 
    292 
    293 
    294 	leaq	-64(%rsp,%r9,2),%r11
    295 	subq	%rsi,%r11
    296 	andq	$4095,%r11
    297 	cmpq	%r11,%r10
    298 	jb	L$mul4xsp_alt
    299 	subq	%r11,%rsp
    300 	leaq	-64(%rsp,%r9,2),%rsp
    301 	jmp	L$mul4xsp_done
    302 
    303 .p2align	5
    304 L$mul4xsp_alt:
    305 	leaq	4096-64(,%r9,2),%r10
    306 	leaq	-64(%rsp,%r9,2),%rsp
    307 	subq	%r10,%r11
    308 	movq	$0,%r10
    309 	cmovcq	%r10,%r11
    310 	subq	%r11,%rsp
    311 L$mul4xsp_done:
    312 	andq	$-64,%rsp
    313 	negq	%r9
    314 
    315 	movq	%rax,40(%rsp)
    316 L$mul4x_body:
    317 
    318 	call	mul4x_internal
    319 
    320 	movq	40(%rsp),%rsi
    321 	movq	$1,%rax
    322 	movq	-48(%rsi),%r15
    323 	movq	-40(%rsi),%r14
    324 	movq	-32(%rsi),%r13
    325 	movq	-24(%rsi),%r12
    326 	movq	-16(%rsi),%rbp
    327 	movq	-8(%rsi),%rbx
    328 	leaq	(%rsi),%rsp
    329 L$mul4x_epilogue:
    330 	.byte	0xf3,0xc3
    331 
    332 
    333 
    334 .p2align	5
    335 mul4x_internal:
    336 	shlq	$5,%r9
    337 	movl	8(%rax),%r10d
    338 	leaq	256(%rdx,%r9,1),%r13
    339 	shrq	$5,%r9
    340 	movq	%r10,%r11
    341 	shrq	$3,%r10
    342 	andq	$7,%r11
    343 	notq	%r10
    344 	leaq	L$magic_masks(%rip),%rax
    345 	andq	$3,%r10
    346 	leaq	96(%rdx,%r11,8),%r12
    347 	movq	0(%rax,%r10,8),%xmm4
    348 	movq	8(%rax,%r10,8),%xmm5
    349 	addq	$7,%r11
    350 	movq	16(%rax,%r10,8),%xmm6
    351 	movq	24(%rax,%r10,8),%xmm7
    352 	andq	$7,%r11
    353 
    354 	movq	-96(%r12),%xmm0
    355 	leaq	256(%r12),%r14
    356 	movq	-32(%r12),%xmm1
    357 	pand	%xmm4,%xmm0
    358 	movq	32(%r12),%xmm2
    359 	pand	%xmm5,%xmm1
    360 	movq	96(%r12),%xmm3
    361 	pand	%xmm6,%xmm2
    362 .byte	0x67
    363 	por	%xmm1,%xmm0
    364 	movq	-96(%r14),%xmm1
    365 .byte	0x67
    366 	pand	%xmm7,%xmm3
    367 .byte	0x67
    368 	por	%xmm2,%xmm0
    369 	movq	-32(%r14),%xmm2
    370 .byte	0x67
    371 	pand	%xmm4,%xmm1
    372 .byte	0x67
    373 	por	%xmm3,%xmm0
    374 	movq	32(%r14),%xmm3
    375 
    376 .byte	102,72,15,126,195
    377 	movq	96(%r14),%xmm0
    378 	movq	%r13,16+8(%rsp)
    379 	movq	%rdi,56+8(%rsp)
    380 
    381 	movq	(%r8),%r8
    382 	movq	(%rsi),%rax
    383 	leaq	(%rsi,%r9,1),%rsi
    384 	negq	%r9
    385 
    386 	movq	%r8,%rbp
    387 	mulq	%rbx
    388 	movq	%rax,%r10
    389 	movq	(%rcx),%rax
    390 
    391 	pand	%xmm5,%xmm2
    392 	pand	%xmm6,%xmm3
    393 	por	%xmm2,%xmm1
    394 
    395 	imulq	%r10,%rbp
    396 
    397 
    398 
    399 
    400 
    401 
    402 
    403 	leaq	64+8(%rsp,%r11,8),%r14
    404 	movq	%rdx,%r11
    405 
    406 	pand	%xmm7,%xmm0
    407 	por	%xmm3,%xmm1
    408 	leaq	512(%r12),%r12
    409 	por	%xmm1,%xmm0
    410 
    411 	mulq	%rbp
    412 	addq	%rax,%r10
    413 	movq	8(%rsi,%r9,1),%rax
    414 	adcq	$0,%rdx
    415 	movq	%rdx,%rdi
    416 
    417 	mulq	%rbx
    418 	addq	%rax,%r11
    419 	movq	16(%rcx),%rax
    420 	adcq	$0,%rdx
    421 	movq	%rdx,%r10
    422 
    423 	mulq	%rbp
    424 	addq	%rax,%rdi
    425 	movq	16(%rsi,%r9,1),%rax
    426 	adcq	$0,%rdx
    427 	addq	%r11,%rdi
    428 	leaq	32(%r9),%r15
    429 	leaq	64(%rcx),%rcx
    430 	adcq	$0,%rdx
    431 	movq	%rdi,(%r14)
    432 	movq	%rdx,%r13
    433 	jmp	L$1st4x
    434 
    435 .p2align	5
    436 L$1st4x:
    437 	mulq	%rbx
    438 	addq	%rax,%r10
    439 	movq	-32(%rcx),%rax
    440 	leaq	32(%r14),%r14
    441 	adcq	$0,%rdx
    442 	movq	%rdx,%r11
    443 
    444 	mulq	%rbp
    445 	addq	%rax,%r13
    446 	movq	-8(%rsi,%r15,1),%rax
    447 	adcq	$0,%rdx
    448 	addq	%r10,%r13
    449 	adcq	$0,%rdx
    450 	movq	%r13,-24(%r14)
    451 	movq	%rdx,%rdi
    452 
    453 	mulq	%rbx
    454 	addq	%rax,%r11
    455 	movq	-16(%rcx),%rax
    456 	adcq	$0,%rdx
    457 	movq	%rdx,%r10
    458 
    459 	mulq	%rbp
    460 	addq	%rax,%rdi
    461 	movq	(%rsi,%r15,1),%rax
    462 	adcq	$0,%rdx
    463 	addq	%r11,%rdi
    464 	adcq	$0,%rdx
    465 	movq	%rdi,-16(%r14)
    466 	movq	%rdx,%r13
    467 
    468 	mulq	%rbx
    469 	addq	%rax,%r10
    470 	movq	0(%rcx),%rax
    471 	adcq	$0,%rdx
    472 	movq	%rdx,%r11
    473 
    474 	mulq	%rbp
    475 	addq	%rax,%r13
    476 	movq	8(%rsi,%r15,1),%rax
    477 	adcq	$0,%rdx
    478 	addq	%r10,%r13
    479 	adcq	$0,%rdx
    480 	movq	%r13,-8(%r14)
    481 	movq	%rdx,%rdi
    482 
    483 	mulq	%rbx
    484 	addq	%rax,%r11
    485 	movq	16(%rcx),%rax
    486 	adcq	$0,%rdx
    487 	movq	%rdx,%r10
    488 
    489 	mulq	%rbp
    490 	addq	%rax,%rdi
    491 	movq	16(%rsi,%r15,1),%rax
    492 	adcq	$0,%rdx
    493 	addq	%r11,%rdi
    494 	leaq	64(%rcx),%rcx
    495 	adcq	$0,%rdx
    496 	movq	%rdi,(%r14)
    497 	movq	%rdx,%r13
    498 
    499 	addq	$32,%r15
    500 	jnz	L$1st4x
    501 
    502 	mulq	%rbx
    503 	addq	%rax,%r10
    504 	movq	-32(%rcx),%rax
    505 	leaq	32(%r14),%r14
    506 	adcq	$0,%rdx
    507 	movq	%rdx,%r11
    508 
    509 	mulq	%rbp
    510 	addq	%rax,%r13
    511 	movq	-8(%rsi),%rax
    512 	adcq	$0,%rdx
    513 	addq	%r10,%r13
    514 	adcq	$0,%rdx
    515 	movq	%r13,-24(%r14)
    516 	movq	%rdx,%rdi
    517 
    518 	mulq	%rbx
    519 	addq	%rax,%r11
    520 	movq	-16(%rcx),%rax
    521 	adcq	$0,%rdx
    522 	movq	%rdx,%r10
    523 
    524 	mulq	%rbp
    525 	addq	%rax,%rdi
    526 	movq	(%rsi,%r9,1),%rax
    527 	adcq	$0,%rdx
    528 	addq	%r11,%rdi
    529 	adcq	$0,%rdx
    530 	movq	%rdi,-16(%r14)
    531 	movq	%rdx,%r13
    532 
    533 .byte	102,72,15,126,195
    534 	leaq	(%rcx,%r9,2),%rcx
    535 
    536 	xorq	%rdi,%rdi
    537 	addq	%r10,%r13
    538 	adcq	$0,%rdi
    539 	movq	%r13,-8(%r14)
    540 
    541 	jmp	L$outer4x
    542 
    543 .p2align	5
    544 L$outer4x:
    545 	movq	(%r14,%r9,1),%r10
    546 	movq	%r8,%rbp
    547 	mulq	%rbx
    548 	addq	%rax,%r10
    549 	movq	(%rcx),%rax
    550 	adcq	$0,%rdx
    551 
    552 	movq	-96(%r12),%xmm0
    553 	movq	-32(%r12),%xmm1
    554 	pand	%xmm4,%xmm0
    555 	movq	32(%r12),%xmm2
    556 	pand	%xmm5,%xmm1
    557 	movq	96(%r12),%xmm3
    558 
    559 	imulq	%r10,%rbp
    560 .byte	0x67
    561 	movq	%rdx,%r11
    562 	movq	%rdi,(%r14)
    563 
    564 	pand	%xmm6,%xmm2
    565 	por	%xmm1,%xmm0
    566 	pand	%xmm7,%xmm3
    567 	por	%xmm2,%xmm0
    568 	leaq	(%r14,%r9,1),%r14
    569 	leaq	256(%r12),%r12
    570 	por	%xmm3,%xmm0
    571 
    572 	mulq	%rbp
    573 	addq	%rax,%r10
    574 	movq	8(%rsi,%r9,1),%rax
    575 	adcq	$0,%rdx
    576 	movq	%rdx,%rdi
    577 
    578 	mulq	%rbx
    579 	addq	%rax,%r11
    580 	movq	16(%rcx),%rax
    581 	adcq	$0,%rdx
    582 	addq	8(%r14),%r11
    583 	adcq	$0,%rdx
    584 	movq	%rdx,%r10
    585 
    586 	mulq	%rbp
    587 	addq	%rax,%rdi
    588 	movq	16(%rsi,%r9,1),%rax
    589 	adcq	$0,%rdx
    590 	addq	%r11,%rdi
    591 	leaq	32(%r9),%r15
    592 	leaq	64(%rcx),%rcx
    593 	adcq	$0,%rdx
    594 	movq	%rdx,%r13
    595 	jmp	L$inner4x
    596 
    597 .p2align	5
    598 L$inner4x:
    599 	mulq	%rbx
    600 	addq	%rax,%r10
    601 	movq	-32(%rcx),%rax
    602 	adcq	$0,%rdx
    603 	addq	16(%r14),%r10
    604 	leaq	32(%r14),%r14
    605 	adcq	$0,%rdx
    606 	movq	%rdx,%r11
    607 
    608 	mulq	%rbp
    609 	addq	%rax,%r13
    610 	movq	-8(%rsi,%r15,1),%rax
    611 	adcq	$0,%rdx
    612 	addq	%r10,%r13
    613 	adcq	$0,%rdx
    614 	movq	%rdi,-32(%r14)
    615 	movq	%rdx,%rdi
    616 
    617 	mulq	%rbx
    618 	addq	%rax,%r11
    619 	movq	-16(%rcx),%rax
    620 	adcq	$0,%rdx
    621 	addq	-8(%r14),%r11
    622 	adcq	$0,%rdx
    623 	movq	%rdx,%r10
    624 
    625 	mulq	%rbp
    626 	addq	%rax,%rdi
    627 	movq	(%rsi,%r15,1),%rax
    628 	adcq	$0,%rdx
    629 	addq	%r11,%rdi
    630 	adcq	$0,%rdx
    631 	movq	%r13,-24(%r14)
    632 	movq	%rdx,%r13
    633 
    634 	mulq	%rbx
    635 	addq	%rax,%r10
    636 	movq	0(%rcx),%rax
    637 	adcq	$0,%rdx
    638 	addq	(%r14),%r10
    639 	adcq	$0,%rdx
    640 	movq	%rdx,%r11
    641 
    642 	mulq	%rbp
    643 	addq	%rax,%r13
    644 	movq	8(%rsi,%r15,1),%rax
    645 	adcq	$0,%rdx
    646 	addq	%r10,%r13
    647 	adcq	$0,%rdx
    648 	movq	%rdi,-16(%r14)
    649 	movq	%rdx,%rdi
    650 
    651 	mulq	%rbx
    652 	addq	%rax,%r11
    653 	movq	16(%rcx),%rax
    654 	adcq	$0,%rdx
    655 	addq	8(%r14),%r11
    656 	adcq	$0,%rdx
    657 	movq	%rdx,%r10
    658 
    659 	mulq	%rbp
    660 	addq	%rax,%rdi
    661 	movq	16(%rsi,%r15,1),%rax
    662 	adcq	$0,%rdx
    663 	addq	%r11,%rdi
    664 	leaq	64(%rcx),%rcx
    665 	adcq	$0,%rdx
    666 	movq	%r13,-8(%r14)
    667 	movq	%rdx,%r13
    668 
    669 	addq	$32,%r15
    670 	jnz	L$inner4x
    671 
    672 	mulq	%rbx
    673 	addq	%rax,%r10
    674 	movq	-32(%rcx),%rax
    675 	adcq	$0,%rdx
    676 	addq	16(%r14),%r10
    677 	leaq	32(%r14),%r14
    678 	adcq	$0,%rdx
    679 	movq	%rdx,%r11
    680 
    681 	mulq	%rbp
    682 	addq	%rax,%r13
    683 	movq	-8(%rsi),%rax
    684 	adcq	$0,%rdx
    685 	addq	%r10,%r13
    686 	adcq	$0,%rdx
    687 	movq	%rdi,-32(%r14)
    688 	movq	%rdx,%rdi
    689 
    690 	mulq	%rbx
    691 	addq	%rax,%r11
    692 	movq	%rbp,%rax
    693 	movq	-16(%rcx),%rbp
    694 	adcq	$0,%rdx
    695 	addq	-8(%r14),%r11
    696 	adcq	$0,%rdx
    697 	movq	%rdx,%r10
    698 
    699 	mulq	%rbp
    700 	addq	%rax,%rdi
    701 	movq	(%rsi,%r9,1),%rax
    702 	adcq	$0,%rdx
    703 	addq	%r11,%rdi
    704 	adcq	$0,%rdx
    705 	movq	%r13,-24(%r14)
    706 	movq	%rdx,%r13
    707 
    708 .byte	102,72,15,126,195
    709 	movq	%rdi,-16(%r14)
    710 	leaq	(%rcx,%r9,2),%rcx
    711 
    712 	xorq	%rdi,%rdi
    713 	addq	%r10,%r13
    714 	adcq	$0,%rdi
    715 	addq	(%r14),%r13
    716 	adcq	$0,%rdi
    717 	movq	%r13,-8(%r14)
    718 
    719 	cmpq	16+8(%rsp),%r12
    720 	jb	L$outer4x
    721 	subq	%r13,%rbp
    722 	adcq	%r15,%r15
    723 	orq	%r15,%rdi
    724 	xorq	$1,%rdi
    725 	leaq	(%r14,%r9,1),%rbx
    726 	leaq	(%rcx,%rdi,8),%rbp
    727 	movq	%r9,%rcx
    728 	sarq	$3+2,%rcx
    729 	movq	56+8(%rsp),%rdi
    730 	jmp	L$sqr4x_sub
    731 
    732 .globl	_bn_power5
    733 .private_extern _bn_power5
    734 
    735 .p2align	5
    736 _bn_power5:
    737 	movq	%rsp,%rax
    738 	pushq	%rbx
    739 	pushq	%rbp
    740 	pushq	%r12
    741 	pushq	%r13
    742 	pushq	%r14
    743 	pushq	%r15
    744 	movl	%r9d,%r10d
    745 	shll	$3,%r9d
    746 	shll	$3+2,%r10d
    747 	negq	%r9
    748 	movq	(%r8),%r8
    749 
    750 
    751 
    752 
    753 
    754 
    755 
    756 	leaq	-64(%rsp,%r9,2),%r11
    757 	subq	%rsi,%r11
    758 	andq	$4095,%r11
    759 	cmpq	%r11,%r10
    760 	jb	L$pwr_sp_alt
    761 	subq	%r11,%rsp
    762 	leaq	-64(%rsp,%r9,2),%rsp
    763 	jmp	L$pwr_sp_done
    764 
    765 .p2align	5
    766 L$pwr_sp_alt:
    767 	leaq	4096-64(,%r9,2),%r10
    768 	leaq	-64(%rsp,%r9,2),%rsp
    769 	subq	%r10,%r11
    770 	movq	$0,%r10
    771 	cmovcq	%r10,%r11
    772 	subq	%r11,%rsp
    773 L$pwr_sp_done:
    774 	andq	$-64,%rsp
    775 	movq	%r9,%r10
    776 	negq	%r9
    777 
    778 
    779 
    780 
    781 
    782 
    783 
    784 
    785 
    786 
    787 	movq	%r8,32(%rsp)
    788 	movq	%rax,40(%rsp)
    789 L$power5_body:
    790 .byte	102,72,15,110,207
    791 .byte	102,72,15,110,209
    792 .byte	102,73,15,110,218
    793 .byte	102,72,15,110,226
    794 
    795 	call	__bn_sqr8x_internal
    796 	call	__bn_sqr8x_internal
    797 	call	__bn_sqr8x_internal
    798 	call	__bn_sqr8x_internal
    799 	call	__bn_sqr8x_internal
    800 
    801 .byte	102,72,15,126,209
    802 .byte	102,72,15,126,226
    803 	movq	%rsi,%rdi
    804 	movq	40(%rsp),%rax
    805 	leaq	32(%rsp),%r8
    806 
    807 	call	mul4x_internal
    808 
    809 	movq	40(%rsp),%rsi
    810 	movq	$1,%rax
    811 	movq	-48(%rsi),%r15
    812 	movq	-40(%rsi),%r14
    813 	movq	-32(%rsi),%r13
    814 	movq	-24(%rsi),%r12
    815 	movq	-16(%rsi),%rbp
    816 	movq	-8(%rsi),%rbx
    817 	leaq	(%rsi),%rsp
    818 L$power5_epilogue:
    819 	.byte	0xf3,0xc3
    820 
    821 
    822 .globl	_bn_sqr8x_internal
    823 .private_extern _bn_sqr8x_internal
    824 .private_extern	_bn_sqr8x_internal
    825 
    826 .p2align	5
    827 _bn_sqr8x_internal:
    828 __bn_sqr8x_internal:
    829 
    830 
    831 
    832 
    833 
    834 
    835 
    836 
    837 
    838 
    839 
    840 
    841 
    842 
    843 
    844 
    845 
    846 
    847 
    848 
    849 
    850 
    851 
    852 
    853 
    854 
    855 
    856 
    857 
    858 
    859 
    860 
    861 
    862 
    863 
    864 
    865 
    866 
    867 
    868 
    869 
    870 
    871 
    872 
    873 
    874 
    875 
    876 
    877 
    878 
    879 
    880 
    881 
    882 
    883 
    884 
    885 
    886 
    887 
    888 
    889 
    890 
    891 
    892 
    893 
    894 
    895 
    896 
    897 
    898 
    899 
    900 
    901 
    902 	leaq	32(%r10),%rbp
    903 	leaq	(%rsi,%r9,1),%rsi
    904 
    905 	movq	%r9,%rcx
    906 
    907 
    908 	movq	-32(%rsi,%rbp,1),%r14
    909 	leaq	48+8(%rsp,%r9,2),%rdi
    910 	movq	-24(%rsi,%rbp,1),%rax
    911 	leaq	-32(%rdi,%rbp,1),%rdi
    912 	movq	-16(%rsi,%rbp,1),%rbx
    913 	movq	%rax,%r15
    914 
    915 	mulq	%r14
    916 	movq	%rax,%r10
    917 	movq	%rbx,%rax
    918 	movq	%rdx,%r11
    919 	movq	%r10,-24(%rdi,%rbp,1)
    920 
    921 	mulq	%r14
    922 	addq	%rax,%r11
    923 	movq	%rbx,%rax
    924 	adcq	$0,%rdx
    925 	movq	%r11,-16(%rdi,%rbp,1)
    926 	movq	%rdx,%r10
    927 
    928 
    929 	movq	-8(%rsi,%rbp,1),%rbx
    930 	mulq	%r15
    931 	movq	%rax,%r12
    932 	movq	%rbx,%rax
    933 	movq	%rdx,%r13
    934 
    935 	leaq	(%rbp),%rcx
    936 	mulq	%r14
    937 	addq	%rax,%r10
    938 	movq	%rbx,%rax
    939 	movq	%rdx,%r11
    940 	adcq	$0,%r11
    941 	addq	%r12,%r10
    942 	adcq	$0,%r11
    943 	movq	%r10,-8(%rdi,%rcx,1)
    944 	jmp	L$sqr4x_1st
    945 
    946 .p2align	5
    947 L$sqr4x_1st:
    948 	movq	(%rsi,%rcx,1),%rbx
    949 	mulq	%r15
    950 	addq	%rax,%r13
    951 	movq	%rbx,%rax
    952 	movq	%rdx,%r12
    953 	adcq	$0,%r12
    954 
    955 	mulq	%r14
    956 	addq	%rax,%r11
    957 	movq	%rbx,%rax
    958 	movq	8(%rsi,%rcx,1),%rbx
    959 	movq	%rdx,%r10
    960 	adcq	$0,%r10
    961 	addq	%r13,%r11
    962 	adcq	$0,%r10
    963 
    964 
    965 	mulq	%r15
    966 	addq	%rax,%r12
    967 	movq	%rbx,%rax
    968 	movq	%r11,(%rdi,%rcx,1)
    969 	movq	%rdx,%r13
    970 	adcq	$0,%r13
    971 
    972 	mulq	%r14
    973 	addq	%rax,%r10
    974 	movq	%rbx,%rax
    975 	movq	16(%rsi,%rcx,1),%rbx
    976 	movq	%rdx,%r11
    977 	adcq	$0,%r11
    978 	addq	%r12,%r10
    979 	adcq	$0,%r11
    980 
    981 	mulq	%r15
    982 	addq	%rax,%r13
    983 	movq	%rbx,%rax
    984 	movq	%r10,8(%rdi,%rcx,1)
    985 	movq	%rdx,%r12
    986 	adcq	$0,%r12
    987 
    988 	mulq	%r14
    989 	addq	%rax,%r11
    990 	movq	%rbx,%rax
    991 	movq	24(%rsi,%rcx,1),%rbx
    992 	movq	%rdx,%r10
    993 	adcq	$0,%r10
    994 	addq	%r13,%r11
    995 	adcq	$0,%r10
    996 
    997 
    998 	mulq	%r15
    999 	addq	%rax,%r12
   1000 	movq	%rbx,%rax
   1001 	movq	%r11,16(%rdi,%rcx,1)
   1002 	movq	%rdx,%r13
   1003 	adcq	$0,%r13
   1004 	leaq	32(%rcx),%rcx
   1005 
   1006 	mulq	%r14
   1007 	addq	%rax,%r10
   1008 	movq	%rbx,%rax
   1009 	movq	%rdx,%r11
   1010 	adcq	$0,%r11
   1011 	addq	%r12,%r10
   1012 	adcq	$0,%r11
   1013 	movq	%r10,-8(%rdi,%rcx,1)
   1014 
   1015 	cmpq	$0,%rcx
   1016 	jne	L$sqr4x_1st
   1017 
   1018 	mulq	%r15
   1019 	addq	%rax,%r13
   1020 	leaq	16(%rbp),%rbp
   1021 	adcq	$0,%rdx
   1022 	addq	%r11,%r13
   1023 	adcq	$0,%rdx
   1024 
   1025 	movq	%r13,(%rdi)
   1026 	movq	%rdx,%r12
   1027 	movq	%rdx,8(%rdi)
   1028 	jmp	L$sqr4x_outer
   1029 
   1030 .p2align	5
   1031 L$sqr4x_outer:
   1032 	movq	-32(%rsi,%rbp,1),%r14
   1033 	leaq	48+8(%rsp,%r9,2),%rdi
   1034 	movq	-24(%rsi,%rbp,1),%rax
   1035 	leaq	-32(%rdi,%rbp,1),%rdi
   1036 	movq	-16(%rsi,%rbp,1),%rbx
   1037 	movq	%rax,%r15
   1038 
   1039 	mulq	%r14
   1040 	movq	-24(%rdi,%rbp,1),%r10
   1041 	addq	%rax,%r10
   1042 	movq	%rbx,%rax
   1043 	adcq	$0,%rdx
   1044 	movq	%r10,-24(%rdi,%rbp,1)
   1045 	movq	%rdx,%r11
   1046 
   1047 	mulq	%r14
   1048 	addq	%rax,%r11
   1049 	movq	%rbx,%rax
   1050 	adcq	$0,%rdx
   1051 	addq	-16(%rdi,%rbp,1),%r11
   1052 	movq	%rdx,%r10
   1053 	adcq	$0,%r10
   1054 	movq	%r11,-16(%rdi,%rbp,1)
   1055 
   1056 	xorq	%r12,%r12
   1057 
   1058 	movq	-8(%rsi,%rbp,1),%rbx
   1059 	mulq	%r15
   1060 	addq	%rax,%r12
   1061 	movq	%rbx,%rax
   1062 	adcq	$0,%rdx
   1063 	addq	-8(%rdi,%rbp,1),%r12
   1064 	movq	%rdx,%r13
   1065 	adcq	$0,%r13
   1066 
   1067 	mulq	%r14
   1068 	addq	%rax,%r10
   1069 	movq	%rbx,%rax
   1070 	adcq	$0,%rdx
   1071 	addq	%r12,%r10
   1072 	movq	%rdx,%r11
   1073 	adcq	$0,%r11
   1074 	movq	%r10,-8(%rdi,%rbp,1)
   1075 
   1076 	leaq	(%rbp),%rcx
   1077 	jmp	L$sqr4x_inner
   1078 
   1079 .p2align	5
   1080 L$sqr4x_inner:
   1081 	movq	(%rsi,%rcx,1),%rbx
   1082 	mulq	%r15
   1083 	addq	%rax,%r13
   1084 	movq	%rbx,%rax
   1085 	movq	%rdx,%r12
   1086 	adcq	$0,%r12
   1087 	addq	(%rdi,%rcx,1),%r13
   1088 	adcq	$0,%r12
   1089 
   1090 .byte	0x67
   1091 	mulq	%r14
   1092 	addq	%rax,%r11
   1093 	movq	%rbx,%rax
   1094 	movq	8(%rsi,%rcx,1),%rbx
   1095 	movq	%rdx,%r10
   1096 	adcq	$0,%r10
   1097 	addq	%r13,%r11
   1098 	adcq	$0,%r10
   1099 
   1100 	mulq	%r15
   1101 	addq	%rax,%r12
   1102 	movq	%r11,(%rdi,%rcx,1)
   1103 	movq	%rbx,%rax
   1104 	movq	%rdx,%r13
   1105 	adcq	$0,%r13
   1106 	addq	8(%rdi,%rcx,1),%r12
   1107 	leaq	16(%rcx),%rcx
   1108 	adcq	$0,%r13
   1109 
   1110 	mulq	%r14
   1111 	addq	%rax,%r10
   1112 	movq	%rbx,%rax
   1113 	adcq	$0,%rdx
   1114 	addq	%r12,%r10
   1115 	movq	%rdx,%r11
   1116 	adcq	$0,%r11
   1117 	movq	%r10,-8(%rdi,%rcx,1)
   1118 
   1119 	cmpq	$0,%rcx
   1120 	jne	L$sqr4x_inner
   1121 
   1122 .byte	0x67
   1123 	mulq	%r15
   1124 	addq	%rax,%r13
   1125 	adcq	$0,%rdx
   1126 	addq	%r11,%r13
   1127 	adcq	$0,%rdx
   1128 
   1129 	movq	%r13,(%rdi)
   1130 	movq	%rdx,%r12
   1131 	movq	%rdx,8(%rdi)
   1132 
   1133 	addq	$16,%rbp
   1134 	jnz	L$sqr4x_outer
   1135 
   1136 
   1137 	movq	-32(%rsi),%r14
   1138 	leaq	48+8(%rsp,%r9,2),%rdi
   1139 	movq	-24(%rsi),%rax
   1140 	leaq	-32(%rdi,%rbp,1),%rdi
   1141 	movq	-16(%rsi),%rbx
   1142 	movq	%rax,%r15
   1143 
   1144 	mulq	%r14
   1145 	addq	%rax,%r10
   1146 	movq	%rbx,%rax
   1147 	movq	%rdx,%r11
   1148 	adcq	$0,%r11
   1149 
   1150 	mulq	%r14
   1151 	addq	%rax,%r11
   1152 	movq	%rbx,%rax
   1153 	movq	%r10,-24(%rdi)
   1154 	movq	%rdx,%r10
   1155 	adcq	$0,%r10
   1156 	addq	%r13,%r11
   1157 	movq	-8(%rsi),%rbx
   1158 	adcq	$0,%r10
   1159 
   1160 	mulq	%r15
   1161 	addq	%rax,%r12
   1162 	movq	%rbx,%rax
   1163 	movq	%r11,-16(%rdi)
   1164 	movq	%rdx,%r13
   1165 	adcq	$0,%r13
   1166 
   1167 	mulq	%r14
   1168 	addq	%rax,%r10
   1169 	movq	%rbx,%rax
   1170 	movq	%rdx,%r11
   1171 	adcq	$0,%r11
   1172 	addq	%r12,%r10
   1173 	adcq	$0,%r11
   1174 	movq	%r10,-8(%rdi)
   1175 
   1176 	mulq	%r15
   1177 	addq	%rax,%r13
   1178 	movq	-16(%rsi),%rax
   1179 	adcq	$0,%rdx
   1180 	addq	%r11,%r13
   1181 	adcq	$0,%rdx
   1182 
   1183 	movq	%r13,(%rdi)
   1184 	movq	%rdx,%r12
   1185 	movq	%rdx,8(%rdi)
   1186 
   1187 	mulq	%rbx
   1188 	addq	$16,%rbp
   1189 	xorq	%r14,%r14
   1190 	subq	%r9,%rbp
   1191 	xorq	%r15,%r15
   1192 
   1193 	addq	%r12,%rax
   1194 	adcq	$0,%rdx
   1195 	movq	%rax,8(%rdi)
   1196 	movq	%rdx,16(%rdi)
   1197 	movq	%r15,24(%rdi)
   1198 
   1199 	movq	-16(%rsi,%rbp,1),%rax
   1200 	leaq	48+8(%rsp),%rdi
   1201 	xorq	%r10,%r10
   1202 	movq	8(%rdi),%r11
   1203 
   1204 	leaq	(%r14,%r10,2),%r12
   1205 	shrq	$63,%r10
   1206 	leaq	(%rcx,%r11,2),%r13
   1207 	shrq	$63,%r11
   1208 	orq	%r10,%r13
   1209 	movq	16(%rdi),%r10
   1210 	movq	%r11,%r14
   1211 	mulq	%rax
   1212 	negq	%r15
   1213 	movq	24(%rdi),%r11
   1214 	adcq	%rax,%r12
   1215 	movq	-8(%rsi,%rbp,1),%rax
   1216 	movq	%r12,(%rdi)
   1217 	adcq	%rdx,%r13
   1218 
   1219 	leaq	(%r14,%r10,2),%rbx
   1220 	movq	%r13,8(%rdi)
   1221 	sbbq	%r15,%r15
   1222 	shrq	$63,%r10
   1223 	leaq	(%rcx,%r11,2),%r8
   1224 	shrq	$63,%r11
   1225 	orq	%r10,%r8
   1226 	movq	32(%rdi),%r10
   1227 	movq	%r11,%r14
   1228 	mulq	%rax
   1229 	negq	%r15
   1230 	movq	40(%rdi),%r11
   1231 	adcq	%rax,%rbx
   1232 	movq	0(%rsi,%rbp,1),%rax
   1233 	movq	%rbx,16(%rdi)
   1234 	adcq	%rdx,%r8
   1235 	leaq	16(%rbp),%rbp
   1236 	movq	%r8,24(%rdi)
   1237 	sbbq	%r15,%r15
   1238 	leaq	64(%rdi),%rdi
   1239 	jmp	L$sqr4x_shift_n_add
   1240 
   1241 .p2align	5
   1242 L$sqr4x_shift_n_add:
   1243 	leaq	(%r14,%r10,2),%r12
   1244 	shrq	$63,%r10
   1245 	leaq	(%rcx,%r11,2),%r13
   1246 	shrq	$63,%r11
   1247 	orq	%r10,%r13
   1248 	movq	-16(%rdi),%r10
   1249 	movq	%r11,%r14
   1250 	mulq	%rax
   1251 	negq	%r15
   1252 	movq	-8(%rdi),%r11
   1253 	adcq	%rax,%r12
   1254 	movq	-8(%rsi,%rbp,1),%rax
   1255 	movq	%r12,-32(%rdi)
   1256 	adcq	%rdx,%r13
   1257 
   1258 	leaq	(%r14,%r10,2),%rbx
   1259 	movq	%r13,-24(%rdi)
   1260 	sbbq	%r15,%r15
   1261 	shrq	$63,%r10
   1262 	leaq	(%rcx,%r11,2),%r8
   1263 	shrq	$63,%r11
   1264 	orq	%r10,%r8
   1265 	movq	0(%rdi),%r10
   1266 	movq	%r11,%r14
   1267 	mulq	%rax
   1268 	negq	%r15
   1269 	movq	8(%rdi),%r11
   1270 	adcq	%rax,%rbx
   1271 	movq	0(%rsi,%rbp,1),%rax
   1272 	movq	%rbx,-16(%rdi)
   1273 	adcq	%rdx,%r8
   1274 
   1275 	leaq	(%r14,%r10,2),%r12
   1276 	movq	%r8,-8(%rdi)
   1277 	sbbq	%r15,%r15
   1278 	shrq	$63,%r10
   1279 	leaq	(%rcx,%r11,2),%r13
   1280 	shrq	$63,%r11
   1281 	orq	%r10,%r13
   1282 	movq	16(%rdi),%r10
   1283 	movq	%r11,%r14
   1284 	mulq	%rax
   1285 	negq	%r15
   1286 	movq	24(%rdi),%r11
   1287 	adcq	%rax,%r12
   1288 	movq	8(%rsi,%rbp,1),%rax
   1289 	movq	%r12,0(%rdi)
   1290 	adcq	%rdx,%r13
   1291 
   1292 	leaq	(%r14,%r10,2),%rbx
   1293 	movq	%r13,8(%rdi)
   1294 	sbbq	%r15,%r15
   1295 	shrq	$63,%r10
   1296 	leaq	(%rcx,%r11,2),%r8
   1297 	shrq	$63,%r11
   1298 	orq	%r10,%r8
   1299 	movq	32(%rdi),%r10
   1300 	movq	%r11,%r14
   1301 	mulq	%rax
   1302 	negq	%r15
   1303 	movq	40(%rdi),%r11
   1304 	adcq	%rax,%rbx
   1305 	movq	16(%rsi,%rbp,1),%rax
   1306 	movq	%rbx,16(%rdi)
   1307 	adcq	%rdx,%r8
   1308 	movq	%r8,24(%rdi)
   1309 	sbbq	%r15,%r15
   1310 	leaq	64(%rdi),%rdi
   1311 	addq	$32,%rbp
   1312 	jnz	L$sqr4x_shift_n_add
   1313 
   1314 	leaq	(%r14,%r10,2),%r12
   1315 .byte	0x67
   1316 	shrq	$63,%r10
   1317 	leaq	(%rcx,%r11,2),%r13
   1318 	shrq	$63,%r11
   1319 	orq	%r10,%r13
   1320 	movq	-16(%rdi),%r10
   1321 	movq	%r11,%r14
   1322 	mulq	%rax
   1323 	negq	%r15
   1324 	movq	-8(%rdi),%r11
   1325 	adcq	%rax,%r12
   1326 	movq	-8(%rsi),%rax
   1327 	movq	%r12,-32(%rdi)
   1328 	adcq	%rdx,%r13
   1329 
   1330 	leaq	(%r14,%r10,2),%rbx
   1331 	movq	%r13,-24(%rdi)
   1332 	sbbq	%r15,%r15
   1333 	shrq	$63,%r10
   1334 	leaq	(%rcx,%r11,2),%r8
   1335 	shrq	$63,%r11
   1336 	orq	%r10,%r8
   1337 	mulq	%rax
   1338 	negq	%r15
   1339 	adcq	%rax,%rbx
   1340 	adcq	%rdx,%r8
   1341 	movq	%rbx,-16(%rdi)
   1342 	movq	%r8,-8(%rdi)
   1343 .byte	102,72,15,126,213
   1344 sqr8x_reduction:
   1345 	xorq	%rax,%rax
   1346 	leaq	(%rbp,%r9,2),%rcx
   1347 	leaq	48+8(%rsp,%r9,2),%rdx
   1348 	movq	%rcx,0+8(%rsp)
   1349 	leaq	48+8(%rsp,%r9,1),%rdi
   1350 	movq	%rdx,8+8(%rsp)
   1351 	negq	%r9
   1352 	jmp	L$8x_reduction_loop
   1353 
   1354 .p2align	5
   1355 L$8x_reduction_loop:
   1356 	leaq	(%rdi,%r9,1),%rdi
   1357 .byte	0x66
   1358 	movq	0(%rdi),%rbx
   1359 	movq	8(%rdi),%r9
   1360 	movq	16(%rdi),%r10
   1361 	movq	24(%rdi),%r11
   1362 	movq	32(%rdi),%r12
   1363 	movq	40(%rdi),%r13
   1364 	movq	48(%rdi),%r14
   1365 	movq	56(%rdi),%r15
   1366 	movq	%rax,(%rdx)
   1367 	leaq	64(%rdi),%rdi
   1368 
   1369 .byte	0x67
   1370 	movq	%rbx,%r8
   1371 	imulq	32+8(%rsp),%rbx
   1372 	movq	0(%rbp),%rax
   1373 	movl	$8,%ecx
   1374 	jmp	L$8x_reduce
   1375 
   1376 .p2align	5
   1377 L$8x_reduce:
   1378 	mulq	%rbx
   1379 	movq	16(%rbp),%rax
   1380 	negq	%r8
   1381 	movq	%rdx,%r8
   1382 	adcq	$0,%r8
   1383 
   1384 	mulq	%rbx
   1385 	addq	%rax,%r9
   1386 	movq	32(%rbp),%rax
   1387 	adcq	$0,%rdx
   1388 	addq	%r9,%r8
   1389 	movq	%rbx,48-8+8(%rsp,%rcx,8)
   1390 	movq	%rdx,%r9
   1391 	adcq	$0,%r9
   1392 
   1393 	mulq	%rbx
   1394 	addq	%rax,%r10
   1395 	movq	48(%rbp),%rax
   1396 	adcq	$0,%rdx
   1397 	addq	%r10,%r9
   1398 	movq	32+8(%rsp),%rsi
   1399 	movq	%rdx,%r10
   1400 	adcq	$0,%r10
   1401 
   1402 	mulq	%rbx
   1403 	addq	%rax,%r11
   1404 	movq	64(%rbp),%rax
   1405 	adcq	$0,%rdx
   1406 	imulq	%r8,%rsi
   1407 	addq	%r11,%r10
   1408 	movq	%rdx,%r11
   1409 	adcq	$0,%r11
   1410 
   1411 	mulq	%rbx
   1412 	addq	%rax,%r12
   1413 	movq	80(%rbp),%rax
   1414 	adcq	$0,%rdx
   1415 	addq	%r12,%r11
   1416 	movq	%rdx,%r12
   1417 	adcq	$0,%r12
   1418 
   1419 	mulq	%rbx
   1420 	addq	%rax,%r13
   1421 	movq	96(%rbp),%rax
   1422 	adcq	$0,%rdx
   1423 	addq	%r13,%r12
   1424 	movq	%rdx,%r13
   1425 	adcq	$0,%r13
   1426 
   1427 	mulq	%rbx
   1428 	addq	%rax,%r14
   1429 	movq	112(%rbp),%rax
   1430 	adcq	$0,%rdx
   1431 	addq	%r14,%r13
   1432 	movq	%rdx,%r14
   1433 	adcq	$0,%r14
   1434 
   1435 	mulq	%rbx
   1436 	movq	%rsi,%rbx
   1437 	addq	%rax,%r15
   1438 	movq	0(%rbp),%rax
   1439 	adcq	$0,%rdx
   1440 	addq	%r15,%r14
   1441 	movq	%rdx,%r15
   1442 	adcq	$0,%r15
   1443 
   1444 	decl	%ecx
   1445 	jnz	L$8x_reduce
   1446 
   1447 	leaq	128(%rbp),%rbp
   1448 	xorq	%rax,%rax
   1449 	movq	8+8(%rsp),%rdx
   1450 	cmpq	0+8(%rsp),%rbp
   1451 	jae	L$8x_no_tail
   1452 
   1453 .byte	0x66
   1454 	addq	0(%rdi),%r8
   1455 	adcq	8(%rdi),%r9
   1456 	adcq	16(%rdi),%r10
   1457 	adcq	24(%rdi),%r11
   1458 	adcq	32(%rdi),%r12
   1459 	adcq	40(%rdi),%r13
   1460 	adcq	48(%rdi),%r14
   1461 	adcq	56(%rdi),%r15
   1462 	sbbq	%rsi,%rsi
   1463 
   1464 	movq	48+56+8(%rsp),%rbx
   1465 	movl	$8,%ecx
   1466 	movq	0(%rbp),%rax
   1467 	jmp	L$8x_tail
   1468 
   1469 .p2align	5
   1470 L$8x_tail:
   1471 	mulq	%rbx
   1472 	addq	%rax,%r8
   1473 	movq	16(%rbp),%rax
   1474 	movq	%r8,(%rdi)
   1475 	movq	%rdx,%r8
   1476 	adcq	$0,%r8
   1477 
   1478 	mulq	%rbx
   1479 	addq	%rax,%r9
   1480 	movq	32(%rbp),%rax
   1481 	adcq	$0,%rdx
   1482 	addq	%r9,%r8
   1483 	leaq	8(%rdi),%rdi
   1484 	movq	%rdx,%r9
   1485 	adcq	$0,%r9
   1486 
   1487 	mulq	%rbx
   1488 	addq	%rax,%r10
   1489 	movq	48(%rbp),%rax
   1490 	adcq	$0,%rdx
   1491 	addq	%r10,%r9
   1492 	movq	%rdx,%r10
   1493 	adcq	$0,%r10
   1494 
   1495 	mulq	%rbx
   1496 	addq	%rax,%r11
   1497 	movq	64(%rbp),%rax
   1498 	adcq	$0,%rdx
   1499 	addq	%r11,%r10
   1500 	movq	%rdx,%r11
   1501 	adcq	$0,%r11
   1502 
   1503 	mulq	%rbx
   1504 	addq	%rax,%r12
   1505 	movq	80(%rbp),%rax
   1506 	adcq	$0,%rdx
   1507 	addq	%r12,%r11
   1508 	movq	%rdx,%r12
   1509 	adcq	$0,%r12
   1510 
   1511 	mulq	%rbx
   1512 	addq	%rax,%r13
   1513 	movq	96(%rbp),%rax
   1514 	adcq	$0,%rdx
   1515 	addq	%r13,%r12
   1516 	movq	%rdx,%r13
   1517 	adcq	$0,%r13
   1518 
   1519 	mulq	%rbx
   1520 	addq	%rax,%r14
   1521 	movq	112(%rbp),%rax
   1522 	adcq	$0,%rdx
   1523 	addq	%r14,%r13
   1524 	movq	%rdx,%r14
   1525 	adcq	$0,%r14
   1526 
   1527 	mulq	%rbx
   1528 	movq	48-16+8(%rsp,%rcx,8),%rbx
   1529 	addq	%rax,%r15
   1530 	adcq	$0,%rdx
   1531 	addq	%r15,%r14
   1532 	movq	0(%rbp),%rax
   1533 	movq	%rdx,%r15
   1534 	adcq	$0,%r15
   1535 
   1536 	decl	%ecx
   1537 	jnz	L$8x_tail
   1538 
   1539 	leaq	128(%rbp),%rbp
   1540 	movq	8+8(%rsp),%rdx
   1541 	cmpq	0+8(%rsp),%rbp
   1542 	jae	L$8x_tail_done
   1543 
   1544 	movq	48+56+8(%rsp),%rbx
   1545 	negq	%rsi
   1546 	movq	0(%rbp),%rax
   1547 	adcq	0(%rdi),%r8
   1548 	adcq	8(%rdi),%r9
   1549 	adcq	16(%rdi),%r10
   1550 	adcq	24(%rdi),%r11
   1551 	adcq	32(%rdi),%r12
   1552 	adcq	40(%rdi),%r13
   1553 	adcq	48(%rdi),%r14
   1554 	adcq	56(%rdi),%r15
   1555 	sbbq	%rsi,%rsi
   1556 
   1557 	movl	$8,%ecx
   1558 	jmp	L$8x_tail
   1559 
   1560 .p2align	5
   1561 L$8x_tail_done:
   1562 	addq	(%rdx),%r8
   1563 	xorq	%rax,%rax
   1564 
   1565 	negq	%rsi
   1566 L$8x_no_tail:
   1567 	adcq	0(%rdi),%r8
   1568 	adcq	8(%rdi),%r9
   1569 	adcq	16(%rdi),%r10
   1570 	adcq	24(%rdi),%r11
   1571 	adcq	32(%rdi),%r12
   1572 	adcq	40(%rdi),%r13
   1573 	adcq	48(%rdi),%r14
   1574 	adcq	56(%rdi),%r15
   1575 	adcq	$0,%rax
   1576 	movq	-16(%rbp),%rcx
   1577 	xorq	%rsi,%rsi
   1578 
   1579 .byte	102,72,15,126,213
   1580 
   1581 	movq	%r8,0(%rdi)
   1582 	movq	%r9,8(%rdi)
   1583 .byte	102,73,15,126,217
   1584 	movq	%r10,16(%rdi)
   1585 	movq	%r11,24(%rdi)
   1586 	movq	%r12,32(%rdi)
   1587 	movq	%r13,40(%rdi)
   1588 	movq	%r14,48(%rdi)
   1589 	movq	%r15,56(%rdi)
   1590 	leaq	64(%rdi),%rdi
   1591 
   1592 	cmpq	%rdx,%rdi
   1593 	jb	L$8x_reduction_loop
   1594 
   1595 	subq	%r15,%rcx
   1596 	leaq	(%rdi,%r9,1),%rbx
   1597 	adcq	%rsi,%rsi
   1598 	movq	%r9,%rcx
   1599 	orq	%rsi,%rax
   1600 .byte	102,72,15,126,207
   1601 	xorq	$1,%rax
   1602 .byte	102,72,15,126,206
   1603 	leaq	(%rbp,%rax,8),%rbp
   1604 	sarq	$3+2,%rcx
   1605 	jmp	L$sqr4x_sub
   1606 
   1607 .p2align	5
   1608 L$sqr4x_sub:
   1609 .byte	0x66
   1610 	movq	0(%rbx),%r12
   1611 	movq	8(%rbx),%r13
   1612 	sbbq	0(%rbp),%r12
   1613 	movq	16(%rbx),%r14
   1614 	sbbq	16(%rbp),%r13
   1615 	movq	24(%rbx),%r15
   1616 	leaq	32(%rbx),%rbx
   1617 	sbbq	32(%rbp),%r14
   1618 	movq	%r12,0(%rdi)
   1619 	sbbq	48(%rbp),%r15
   1620 	leaq	64(%rbp),%rbp
   1621 	movq	%r13,8(%rdi)
   1622 	movq	%r14,16(%rdi)
   1623 	movq	%r15,24(%rdi)
   1624 	leaq	32(%rdi),%rdi
   1625 
   1626 	incq	%rcx
   1627 	jnz	L$sqr4x_sub
   1628 	movq	%r9,%r10
   1629 	negq	%r9
   1630 	.byte	0xf3,0xc3
   1631 
.globl	_bn_from_montgomery
.private_extern _bn_from_montgomery

# bn_from_montgomery: convert a value out of the Montgomery domain.
# Dispatcher only: %r9d holds the limb count (same register the head's
# bn_mul_mont_gather5 tests as its size argument — NOTE(review): full C
# signature not visible in this chunk, confirm against the header).
# If the count is a multiple of 8, tail-jump to the 8x implementation;
# otherwise return 0 in %eax so the caller falls back to another path.
.p2align	5
_bn_from_montgomery:
	testl	$7,%r9d			# count % 8 == 0 ?
	jz	bn_from_mont8x		# yes: handle via the 8-limb-wide path
	xorl	%eax,%eax		# no: return 0 ("not handled here")
	.byte	0xf3,0xc3		# rep ret (two-byte return, branch-predictor-friendly)
   1641 
   1642 
   1643 
   1644 .p2align	5
   1645 bn_from_mont8x:
   1646 .byte	0x67
   1647 	movq	%rsp,%rax
   1648 	pushq	%rbx
   1649 	pushq	%rbp
   1650 	pushq	%r12
   1651 	pushq	%r13
   1652 	pushq	%r14
   1653 	pushq	%r15
   1654 .byte	0x67
   1655 	movl	%r9d,%r10d
   1656 	shll	$3,%r9d
   1657 	shll	$3+2,%r10d
   1658 	negq	%r9
   1659 	movq	(%r8),%r8
   1660 
   1661 
   1662 
   1663 
   1664 
   1665 
   1666 
   1667 	leaq	-64(%rsp,%r9,2),%r11
   1668 	subq	%rsi,%r11
   1669 	andq	$4095,%r11
   1670 	cmpq	%r11,%r10
   1671 	jb	L$from_sp_alt
   1672 	subq	%r11,%rsp
   1673 	leaq	-64(%rsp,%r9,2),%rsp
   1674 	jmp	L$from_sp_done
   1675 
   1676 .p2align	5
   1677 L$from_sp_alt:
   1678 	leaq	4096-64(,%r9,2),%r10
   1679 	leaq	-64(%rsp,%r9,2),%rsp
   1680 	subq	%r10,%r11
   1681 	movq	$0,%r10
   1682 	cmovcq	%r10,%r11
   1683 	subq	%r11,%rsp
   1684 L$from_sp_done:
   1685 	andq	$-64,%rsp
   1686 	movq	%r9,%r10
   1687 	negq	%r9
   1688 
   1689 
   1690 
   1691 
   1692 
   1693 
   1694 
   1695 
   1696 
   1697 
   1698 	movq	%r8,32(%rsp)
   1699 	movq	%rax,40(%rsp)
   1700 L$from_body:
   1701 	movq	%r9,%r11
   1702 	leaq	48(%rsp),%rax
   1703 	pxor	%xmm0,%xmm0
   1704 	jmp	L$mul_by_1
   1705 
   1706 .p2align	5
   1707 L$mul_by_1:
   1708 	movdqu	(%rsi),%xmm1
   1709 	movdqu	16(%rsi),%xmm2
   1710 	movdqu	32(%rsi),%xmm3
   1711 	movdqa	%xmm0,(%rax,%r9,1)
   1712 	movdqu	48(%rsi),%xmm4
   1713 	movdqa	%xmm0,16(%rax,%r9,1)
   1714 .byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
   1715 	movdqa	%xmm1,(%rax)
   1716 	movdqa	%xmm0,32(%rax,%r9,1)
   1717 	movdqa	%xmm2,16(%rax)
   1718 	movdqa	%xmm0,48(%rax,%r9,1)
   1719 	movdqa	%xmm3,32(%rax)
   1720 	movdqa	%xmm4,48(%rax)
   1721 	leaq	64(%rax),%rax
   1722 	subq	$64,%r11
   1723 	jnz	L$mul_by_1
   1724 
   1725 .byte	102,72,15,110,207
   1726 .byte	102,72,15,110,209
   1727 .byte	0x67
   1728 	movq	%rcx,%rbp
   1729 .byte	102,73,15,110,218
   1730 	call	sqr8x_reduction
   1731 
   1732 	pxor	%xmm0,%xmm0
   1733 	leaq	48(%rsp),%rax
   1734 	movq	40(%rsp),%rsi
   1735 	jmp	L$from_mont_zero
   1736 
   1737 .p2align	5
   1738 L$from_mont_zero:
   1739 	movdqa	%xmm0,0(%rax)
   1740 	movdqa	%xmm0,16(%rax)
   1741 	movdqa	%xmm0,32(%rax)
   1742 	movdqa	%xmm0,48(%rax)
   1743 	leaq	64(%rax),%rax
   1744 	subq	$32,%r9
   1745 	jnz	L$from_mont_zero
   1746 
   1747 	movq	$1,%rax
   1748 	movq	-48(%rsi),%r15
   1749 	movq	-40(%rsi),%r14
   1750 	movq	-32(%rsi),%r13
   1751 	movq	-24(%rsi),%r12
   1752 	movq	-16(%rsi),%rbp
   1753 	movq	-8(%rsi),%rbx
   1754 	leaq	(%rsi),%rsp
   1755 L$from_epilogue:
   1756 	.byte	0xf3,0xc3
   1757 
.globl	_bn_scatter5
.private_extern _bn_scatter5

# bn_scatter5: store %esi 64-bit words from (%rdi) into a table at
# (%rdx), starting at slot %rcx (table + %rcx*8) and striding 256 bytes
# between consecutive words. This interleaved layout is what bn_gather5
# below reads back (it also steps 256 bytes per word), so one number's
# limbs are spread across the table rather than stored contiguously.
# No-op when the word count is zero.
.p2align	4
_bn_scatter5:
	cmpl	$0,%esi			# zero-length guard
	jz	L$scatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# start at column %rcx of the table
L$scatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi		# next source word
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next row: 256-byte stride
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
   1775 
   1776 
.globl	_bn_gather5
.private_extern _bn_gather5

# bn_gather5: read %esi 64-bit words for entry %ecx out of the
# 256-byte-stride table at (%rdx) (the layout written by bn_scatter5),
# storing them contiguously at (%rdi).
# Index decomposition: the low 3 bits of %ecx offset the base pointer
# (leaq 128(%rdx,%r11,8)); bits 3..4 select a mask row, via
# %ecx = (~(idx>>3)) & 3 = 3 - (idx>>3), from L$magic_masks. Each
# iteration then loads all four candidate slots (-128/-64/0/64 bytes)
# and ANDs each against its mask so exactly one survives the ORs —
# the four loads happen regardless of the index. NOTE(review): the
# byte offsets within a cache line still depend on idx&7; confirm the
# intended side-channel guarantees against the upstream documentation.
.p2align	4
_bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx
	andq	$7,%r11			# %r11 = idx & 7 (base-pointer offset)
	notl	%ecx
	leaq	L$magic_masks(%rip),%rax
	andl	$3,%ecx			# %ecx = 3 - (idx >> 3) (mask-row select)
	leaq	128(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4	# four 64-bit masks; exactly one is all-ones
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	L$gather
.p2align	4
L$gather:
	movq	-128(%rdx),%xmm0	# always touch all four candidate slots
	movq	-64(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67			# addr-size prefixes used as alignment padding
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		# next word of the entry (scatter5 stride)
	por	%xmm3,%xmm0		# %xmm0 = the one unmasked slot

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather
	.byte	0xf3,0xc3		# rep ret
L$SEH_end_bn_gather5:
   1816 
# Mask table for bn_gather5 (and the gather code in the head of this
# file): 8 qwords of which only qword 3 is all-ones. A row selector
# %rcx in 0..3 reads 4 consecutive qwords starting at offset %rcx*8,
# yielding a sliding window {0,0,0,-1} ... {-1,0,0,0}, so exactly one
# of the four masked loads survives.
.p2align	6
L$magic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0,  0,0
# ASCII banner: "Montgomery Multiplication with scatter/gather for
# x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1822 #endif
   1823