# x86-64 Montgomery multiplication with 5-bit gather (perlasm-generated, from crypto/bn)
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 .extern	OPENSSL_ia32cap_P
      5 .hidden OPENSSL_ia32cap_P
      6 
      7 .globl	bn_mul_mont_gather5
      8 .hidden bn_mul_mont_gather5
      9 .type	bn_mul_mont_gather5,@function
     10 .align	64
#------------------------------------------------------------------------------
# bn_mul_mont_gather5 -- Montgomery multiplication fused with a table gather.
#
# SysV AMD64 ABI.  Argument names follow OpenSSL's bn_mul_mont family
# (presumed from register usage -- confirm against the C prototype):
#   %rdi    = rp    result
#   %rsi    = ap    multiplicand
#   %rdx    = bp    table of precomputed multipliers to gather from
#   %rcx    = np    modulus
#   %r8     = &n0   pointer to -np^-1 mod 2^64 (dereferenced below)
#   %r9d    = num   number of 64-bit limbs
#   8(%rsp) = power 7th (stack) argument: which table entry to gather
# Returns 1 in %rax.
#------------------------------------------------------------------------------
bn_mul_mont_gather5:
	testl	$7,%r9d			# num divisible by 8?
	jnz	.Lmul_enter		# no: generic 1x path below
	jmp	.Lmul4x_enter		# yes: faster 4x-unrolled path

.align	16
.Lmul_enter:
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# keep original %rsp for the epilogue
	movl	8(%rsp),%r10d		# %r10d = power (stack argument)
	pushq	%rbx			# save all callee-saved registers
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	leaq	2(%r9),%r11		# scratch frame tp[]: num+2 qwords
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp		# align frame to a 1KB boundary

	movq	%rax,8(%rsp,%r9,8)	# stash original %rsp above tp[num]
.Lmul_body:
	movq	%rdx,%r12		# %r12 = bp (gather table)
	# Constant-time gather setup: the table is always read at the same
	# addresses and the wanted entry is isolated with the four mask words
	# xmm4-7 taken from .Lmagic_masks (defined elsewhere in this file).
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11			# %r11 = power % 8 (word offset)
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10			# %r10 selects which mask quadruple
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	# Gather the first multiplier word: read four candidate words 64 bytes
	# apart, AND each with its mask, OR together so only the selected
	# entry's word survives.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# advance to the next table slice
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# movq %xmm0,%rbx (hand-encoded)

	movq	(%r8),%r8		# %r8 = n0 = -np^-1 mod 2^64
	movq	(%rsi),%rax		# %rax = ap[0]

	xorq	%r14,%r14		# %r14 = outer-loop counter i
	xorq	%r15,%r15		# %r15 = inner-loop counter j

	# Start gathering the next multiplier word, interleaved with the
	# first multiplication round to hide the latency.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# %rax = np[0]

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# m = tp[0] * n0 (Montgomery factor)
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# tp[0] + low ≡ 0 mod 2^64; word dropped
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.L1st_enter

# First pass: tp[] = (ap[] * b[0] + np[] * m) / 2^64, carry-chained per limb.
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * b[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax	# np[j]
	adcq	$0,%rdx
	leaq	1(%r15),%r15		# j++
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.L1st

.byte	102,72,15,126,195		# movq %xmm0,%rbx = next gathered b[i]

	addq	%rax,%r13
	movq	(%rsi),%rax		# reload ap[0] for the next round
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = top carry word

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
# Outer loop: for each remaining b[i], tp = (tp + ap*b[i] + np*m) / 2^64.
.align	16
.Louter:
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]

	# Gather the following multiplier word in parallel with this round.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx			# ap[0] * b[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# np[0] * m annihilates tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# + tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * b[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax	# np[j]
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15		# j++

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.Linner

.byte	102,72,15,126,195		# movq %xmm0,%rbx = next gathered b[i]

	addq	%rax,%r13
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# previous top carry tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in the previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# new top carry

	leaq	1(%r14),%r14		# i++
	cmpq	%r9,%r14
	jb	.Louter

	# Final subtraction: rp = tp - np, tracking the borrow.
	xorq	%r14,%r14		# also clears CF for the first sbb
	movq	(%rsp),%rax		# tp[0]
	leaq	(%rsp),%rsi		# %rsi = tp
	movq	%r9,%r15		# limb count
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax	# tp[k] - np[k] - borrow
	movq	%rax,(%rdi,%r14,8)	# rp[k] = difference
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# %rax = 0 / all-ones selection mask
	xorq	%r14,%r14
	movq	%r9,%r15
# Constant-time select via xor/and/xor: rp[k] = mask ? tp[k] : rp[k];
# tp[] is overwritten with the (non-secret) index as we go.
.align	16
.Lcopy:
	movq	(%rsp,%r14,8),%rsi	# tp[k] (unsubtracted)
	movq	(%rdi,%r14,8),%rcx	# rp[k] (subtracted)
	xorq	%rcx,%rsi
	andq	%rax,%rsi
	xorq	%rcx,%rsi		# = mask ? tp[k] : rp[k]
	movq	%r14,(%rsp,%r14,8)	# scrub tp[k]
	movq	%rsi,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# recover original %rsp
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
.type	bn_mul4x_mont_gather5,@function
.align	32
#------------------------------------------------------------------------------
# bn_mul4x_mont_gather5 -- 4x-unrolled variant, taken when num % 8 == 0.
# Same arguments as bn_mul_mont_gather5.  This wrapper only builds the stack
# frame (avoiding a 4KB page-offset collision with ap) and delegates the
# actual arithmetic to mul4x_internal.
#------------------------------------------------------------------------------
bn_mul4x_mont_gather5:
.Lmul4x_enter:
.byte	0x67				# address-size prefix, emitted as padding
	movq	%rsp,%rax		# keep original %rsp for the epilogue
	pushq	%rbx			# save callee-saved registers
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d			# %r9  = num * 8 (bytes)
	shll	$3+2,%r10d		# %r10 = num * 32
	negq	%r9			# %r9  = -num*8

	leaq	-64(%rsp,%r9,2),%r11	# candidate frame base: 2*num*8+64 bytes
	subq	%rsi,%r11
	andq	$4095,%r11		# page-offset distance frame <-> ap
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rsp		# shift frame so it can't alias ap mod 4KB
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-64(,%r9,2),%r10	# alternative: nudge by at most one page
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11		# clamp adjustment at zero
	subq	%r11,%rsp
.Lmul4xsp_done:
	andq	$-64,%rsp		# 64-byte (cache-line) alignment
	negq	%r9			# %r9 = num*8 again (positive)

	movq	%rax,40(%rsp)		# stash original %rsp for the epilogue
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi		# recover original %rsp
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    333 
.type	mul4x_internal,@function
.align	32
#------------------------------------------------------------------------------
# mul4x_internal -- 4x-unrolled Montgomery multiply body, shared by
# bn_mul4x_mont_gather5 and bn_power5 (not a public symbol).
#
# Expected state on entry (established by both callers):
#   %rdi = rp, %rsi = ap, %rdx = bp gather table, %rcx = np,
#   %r8  = &n0, %r9 = num*8 (bytes, positive),
#   %rax = caller's original %rsp, so 8(%rax) is the 7th C argument (power).
# Uses the caller-built frame: 16+8(%rsp) = end-of-table sentinel,
# 56+8(%rsp) = rp.  Ends by tail-jumping to .Lsqr4x_sub (defined elsewhere
# in this file) which performs the final conditional subtraction into rp.
#------------------------------------------------------------------------------
mul4x_internal:
	shlq	$5,%r9			# num*256
	movl	8(%rax),%r10d		# %r10d = power (7th stack argument)
	leaq	256(%rdx,%r9,1),%r13	# %r13 = end of gather table
	shrq	$5,%r9			# back to num*8
	# Constant-time gather setup, as in bn_mul_mont_gather5: masks
	# xmm4-7 come from .Lmagic_masks (defined elsewhere in this file).
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11			# %r11 = power % 8
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10			# mask-quadruple selector
	leaq	96(%rdx,%r11,8),%r12	# %r12 = gather pointer into bp
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	addq	$7,%r11			# %r11 = (power%8 + 7) % 8 after the and;
	movq	16(%rax,%r10,8),%xmm6	#  used below to stagger the tp pointer
	movq	24(%rax,%r10,8),%xmm7
	andq	$7,%r11

	# Gather two multiplier words up front (slices at %r12 and %r12+256),
	# interleaved with the 0x67-padded scheduling the generator emitted.
	movq	-96(%r12),%xmm0
	leaq	256(%r12),%r14
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
.byte	0x67
	por	%xmm1,%xmm0
	movq	-96(%r14),%xmm1
.byte	0x67
	pand	%xmm7,%xmm3
.byte	0x67
	por	%xmm2,%xmm0
	movq	-32(%r14),%xmm2
.byte	0x67
	pand	%xmm4,%xmm1
.byte	0x67
	por	%xmm3,%xmm0
	movq	32(%r14),%xmm3

.byte	102,72,15,126,195		# movq %xmm0,%rbx = first multiplier b[0]
	movq	96(%r14),%xmm0
	movq	%r13,16+8(%rsp)		# save table-end sentinel
	movq	%rdi,56+8(%rsp)		# save rp

	movq	(%r8),%r8		# %r8 = n0
	movq	(%rsi),%rax		# %rax = ap[0]
	leaq	(%rsi,%r9,1),%rsi	# %rsi = &ap[num]; indexed with negative %r9
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	pand	%xmm5,%xmm2		# keep folding the second gathered word
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	%r10,%rbp		# m = tp[0] * n0

	leaq	64+8(%rsp,%r11,8),%r14	# %r14 = tp write pointer (staggered)
	movq	%rdx,%r11

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	leaq	512(%r12),%r12		# skip the two consumed table slices
	por	%xmm1,%xmm0

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# tp[0] + low ≡ 0 mod 2^64; word dropped
	movq	8(%rsi,%r9,1),%rax	# ap[1]
	adcq	$0,%rdx
	movq	%rdx,%rdi		# %rdi doubles as a carry limb here

	mulq	%rbx			# ap[1] * b[0]
	addq	%rax,%r11
	movq	16(%rcx),%rax		# np[1] (table is spaced 16 bytes apart)
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax	# ap[2]
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		# %r15 = byte index, counts up to 0
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)		# tp[1]
	movq	%rdx,%r13
	jmp	.L1st4x

# First pass, 4 limbs per iteration: tp[] = (ap[]*b[0] + np[]*m) / 2^64.
.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)		# tp[j]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)		# tp[j+1]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)		# tp[j+2]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)		# tp[j+3]
	movq	%rdx,%r13

	addq	$32,%r15		# four limbs consumed
	jnz	.L1st4x

	# Tail of the first pass (last four limbs, ap index wraps to %r9).
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	# reload ap[0] for the next round
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx = next multiplier b[i]
	leaq	(%rcx,%r9,2),%rcx	# rewind np pointer

	xorq	%rdi,%rdi		# %rdi = top carry
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

# Outer loop: tp = (tp + ap*b[i] + np*m) / 2^64, gathering b[i+1] on the fly.
.align	32
.Louter4x:
	movq	(%r14,%r9,1),%r10	# tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * b[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	# Gather b[i+1] from the next 256-byte table slice.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3

	imulq	%r10,%rbp		# m = tp[0] * n0
.byte	0x67
	movq	%rdx,%r11
	movq	%rdi,(%r14)		# store previous round's top carry

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	(%r14,%r9,1),%r14	# rewind tp write pointer
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# np[0] * m annihilates tp[0]
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * b[i]
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10		# + tp[j]
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)		# tp[j-1] (delayed store)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11		# + tp[j+1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)		# tp[j]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10		# + tp[j+2]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)		# tp[j+1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11		# + tp[j+3]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)		# tp[j+2]
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	# Tail of the inner loop (last four limbs of this round).
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		# juggle m and np[num-2] through %rax/%rbp
	movq	-16(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[num-2] * m
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	# reload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx = next multiplier b[i+1]
	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,2),%rcx	# rewind np pointer

	xorq	%rdi,%rdi		# new top carry
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13		# + previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# consumed the whole bp table?
	jb	.Louter4x
	# Done: fold the final carry/borrow state into a 0/1 selector in %rdi
	# and bias the np pointer accordingly for the shared subtraction code
	# (NOTE(review): selector algebra taken on trust -- verify against
	# .Lsqr4x_sub's expectations, which live elsewhere in this file).
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	xorq	$1,%rdi
	leaq	(%r14,%r9,1),%rbx	# %rbx = tp
	leaq	(%rcx,%rdi,8),%rbp	# %rbp = biased np
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# %rcx = -num/4 (loop count for sub)
	movq	56+8(%rsp),%rdi		# %rdi = rp
	jmp	.Lsqr4x_sub		# shared tail, defined elsewhere
.size	mul4x_internal,.-mul4x_internal
.globl	bn_power5
.hidden bn_power5
.type	bn_power5,@function
.align	32
#------------------------------------------------------------------------------
# bn_power5 -- five Montgomery squarings followed by one gather-multiply,
# i.e. presumably out = (a^(2^5) * table[power]) mod n, as used by 5-bit
# windowed modular exponentiation (confirm against the C caller).
# Same SysV argument layout as bn_mul_mont_gather5 (%rdi..%r9 + stack power).
#------------------------------------------------------------------------------
bn_power5:
	movq	%rsp,%rax		# keep original %rsp
	pushq	%rbx			# save callee-saved registers
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movl	%r9d,%r10d
	shll	$3,%r9d			# %r9  = num*8 (bytes)
	shll	$3+2,%r10d		# %r10 = num*32
	negq	%r9
	movq	(%r8),%r8		# %r8 = n0

	# Frame construction identical to bn_mul4x_mont_gather5: avoid a 4KB
	# page-offset collision between the scratch frame and ap.
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lpwr_sp_done:
	andq	$-64,%rsp		# cache-line alignment
	movq	%r9,%r10		# %r10 = -num*8
	negq	%r9			# %r9  = num*8

	movq	%r8,32(%rsp)		# n0 copy, handed to mul4x_internal
	movq	%rax,40(%rsp)		# original %rsp for the epilogue
.Lpower5_body:
	# Stash rp/np/-num*8/bp in XMM registers: the squaring routine
	# clobbers the GPRs, but leaves these XMM registers alone.
.byte	102,72,15,110,207		# movq %rdi,%xmm1
.byte	102,72,15,110,209		# movq %rcx,%xmm2
.byte	102,73,15,110,218		# movq %r10,%xmm3
.byte	102,72,15,110,226		# movq %rdx,%xmm4

	call	__bn_sqr8x_internal	# five consecutive squarings
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

.byte	102,72,15,126,209		# movq %xmm2,%rcx (restore np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx (restore bp)
	movq	%rsi,%rdi		# result of squarings becomes rp input
	movq	40(%rsp),%rax		# %rax = original %rsp (mul4x_internal
	leaq	32(%rsp),%r8		#  reads power from 8(%rax)); %r8 = &n0

	call	mul4x_internal		# final gather-multiply

	movq	40(%rsp),%rsi		# recover original %rsp
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_power5,.-bn_power5
    822 
    823 .globl	bn_sqr8x_internal
    824 .hidden bn_sqr8x_internal
    825 .hidden	bn_sqr8x_internal
    826 .type	bn_sqr8x_internal,@function
    827 .align	32
    828 bn_sqr8x_internal:
    829 __bn_sqr8x_internal:
    830 
    831 
    832 
    833 
    834 
    835 
    836 
    837 
    838 
    839 
    840 
    841 
    842 
    843 
    844 
    845 
    846 
    847 
    848 
    849 
    850 
    851 
    852 
    853 
    854 
    855 
    856 
    857 
    858 
    859 
    860 
    861 
    862 
    863 
    864 
    865 
    866 
    867 
    868 
    869 
    870 
    871 
    872 
    873 
    874 
    875 
    876 
    877 
    878 
    879 
    880 
    881 
    882 
    883 
    884 
    885 
    886 
    887 
    888 
    889 
    890 
    891 
    892 
    893 
    894 
    895 
    896 
    897 
    898 
    899 
    900 
    901 
    902 
    903 	leaq	32(%r10),%rbp
    904 	leaq	(%rsi,%r9,1),%rsi
    905 
    906 	movq	%r9,%rcx
    907 
    908 
    909 	movq	-32(%rsi,%rbp,1),%r14
    910 	leaq	48+8(%rsp,%r9,2),%rdi
    911 	movq	-24(%rsi,%rbp,1),%rax
    912 	leaq	-32(%rdi,%rbp,1),%rdi
    913 	movq	-16(%rsi,%rbp,1),%rbx
    914 	movq	%rax,%r15
    915 
    916 	mulq	%r14
    917 	movq	%rax,%r10
    918 	movq	%rbx,%rax
    919 	movq	%rdx,%r11
    920 	movq	%r10,-24(%rdi,%rbp,1)
    921 
    922 	mulq	%r14
    923 	addq	%rax,%r11
    924 	movq	%rbx,%rax
    925 	adcq	$0,%rdx
    926 	movq	%r11,-16(%rdi,%rbp,1)
    927 	movq	%rdx,%r10
    928 
    929 
    930 	movq	-8(%rsi,%rbp,1),%rbx
    931 	mulq	%r15
    932 	movq	%rax,%r12
    933 	movq	%rbx,%rax
    934 	movq	%rdx,%r13
    935 
    936 	leaq	(%rbp),%rcx
    937 	mulq	%r14
    938 	addq	%rax,%r10
    939 	movq	%rbx,%rax
    940 	movq	%rdx,%r11
    941 	adcq	$0,%r11
    942 	addq	%r12,%r10
    943 	adcq	$0,%r11
    944 	movq	%r10,-8(%rdi,%rcx,1)
    945 	jmp	.Lsqr4x_1st
    946 
    947 .align	32
    948 .Lsqr4x_1st:
    949 	movq	(%rsi,%rcx,1),%rbx
    950 	mulq	%r15
    951 	addq	%rax,%r13
    952 	movq	%rbx,%rax
    953 	movq	%rdx,%r12
    954 	adcq	$0,%r12
    955 
    956 	mulq	%r14
    957 	addq	%rax,%r11
    958 	movq	%rbx,%rax
    959 	movq	8(%rsi,%rcx,1),%rbx
    960 	movq	%rdx,%r10
    961 	adcq	$0,%r10
    962 	addq	%r13,%r11
    963 	adcq	$0,%r10
    964 
    965 
    966 	mulq	%r15
    967 	addq	%rax,%r12
    968 	movq	%rbx,%rax
    969 	movq	%r11,(%rdi,%rcx,1)
    970 	movq	%rdx,%r13
    971 	adcq	$0,%r13
    972 
    973 	mulq	%r14
    974 	addq	%rax,%r10
    975 	movq	%rbx,%rax
    976 	movq	16(%rsi,%rcx,1),%rbx
    977 	movq	%rdx,%r11
    978 	adcq	$0,%r11
    979 	addq	%r12,%r10
    980 	adcq	$0,%r11
    981 
    982 	mulq	%r15
    983 	addq	%rax,%r13
    984 	movq	%rbx,%rax
    985 	movq	%r10,8(%rdi,%rcx,1)
    986 	movq	%rdx,%r12
    987 	adcq	$0,%r12
    988 
    989 	mulq	%r14
    990 	addq	%rax,%r11
    991 	movq	%rbx,%rax
    992 	movq	24(%rsi,%rcx,1),%rbx
    993 	movq	%rdx,%r10
    994 	adcq	$0,%r10
    995 	addq	%r13,%r11
    996 	adcq	$0,%r10
    997 
    998 
    999 	mulq	%r15
   1000 	addq	%rax,%r12
   1001 	movq	%rbx,%rax
   1002 	movq	%r11,16(%rdi,%rcx,1)
   1003 	movq	%rdx,%r13
   1004 	adcq	$0,%r13
   1005 	leaq	32(%rcx),%rcx
   1006 
   1007 	mulq	%r14
   1008 	addq	%rax,%r10
   1009 	movq	%rbx,%rax
   1010 	movq	%rdx,%r11
   1011 	adcq	$0,%r11
   1012 	addq	%r12,%r10
   1013 	adcq	$0,%r11
   1014 	movq	%r10,-8(%rdi,%rcx,1)
   1015 
   1016 	cmpq	$0,%rcx
   1017 	jne	.Lsqr4x_1st
   1018 
   1019 	mulq	%r15
   1020 	addq	%rax,%r13
   1021 	leaq	16(%rbp),%rbp
   1022 	adcq	$0,%rdx
   1023 	addq	%r11,%r13
   1024 	adcq	$0,%rdx
   1025 
   1026 	movq	%r13,(%rdi)
   1027 	movq	%rdx,%r12
   1028 	movq	%rdx,8(%rdi)
   1029 	jmp	.Lsqr4x_outer
   1030 
   1031 .align	32
   1032 .Lsqr4x_outer:
   1033 	movq	-32(%rsi,%rbp,1),%r14
   1034 	leaq	48+8(%rsp,%r9,2),%rdi
   1035 	movq	-24(%rsi,%rbp,1),%rax
   1036 	leaq	-32(%rdi,%rbp,1),%rdi
   1037 	movq	-16(%rsi,%rbp,1),%rbx
   1038 	movq	%rax,%r15
   1039 
   1040 	mulq	%r14
   1041 	movq	-24(%rdi,%rbp,1),%r10
   1042 	addq	%rax,%r10
   1043 	movq	%rbx,%rax
   1044 	adcq	$0,%rdx
   1045 	movq	%r10,-24(%rdi,%rbp,1)
   1046 	movq	%rdx,%r11
   1047 
   1048 	mulq	%r14
   1049 	addq	%rax,%r11
   1050 	movq	%rbx,%rax
   1051 	adcq	$0,%rdx
   1052 	addq	-16(%rdi,%rbp,1),%r11
   1053 	movq	%rdx,%r10
   1054 	adcq	$0,%r10
   1055 	movq	%r11,-16(%rdi,%rbp,1)
   1056 
   1057 	xorq	%r12,%r12
   1058 
   1059 	movq	-8(%rsi,%rbp,1),%rbx
   1060 	mulq	%r15
   1061 	addq	%rax,%r12
   1062 	movq	%rbx,%rax
   1063 	adcq	$0,%rdx
   1064 	addq	-8(%rdi,%rbp,1),%r12
   1065 	movq	%rdx,%r13
   1066 	adcq	$0,%r13
   1067 
   1068 	mulq	%r14
   1069 	addq	%rax,%r10
   1070 	movq	%rbx,%rax
   1071 	adcq	$0,%rdx
   1072 	addq	%r12,%r10
   1073 	movq	%rdx,%r11
   1074 	adcq	$0,%r11
   1075 	movq	%r10,-8(%rdi,%rbp,1)
   1076 
   1077 	leaq	(%rbp),%rcx
   1078 	jmp	.Lsqr4x_inner
   1079 
   1080 .align	32
   1081 .Lsqr4x_inner:
   1082 	movq	(%rsi,%rcx,1),%rbx
   1083 	mulq	%r15
   1084 	addq	%rax,%r13
   1085 	movq	%rbx,%rax
   1086 	movq	%rdx,%r12
   1087 	adcq	$0,%r12
   1088 	addq	(%rdi,%rcx,1),%r13
   1089 	adcq	$0,%r12
   1090 
   1091 .byte	0x67
   1092 	mulq	%r14
   1093 	addq	%rax,%r11
   1094 	movq	%rbx,%rax
   1095 	movq	8(%rsi,%rcx,1),%rbx
   1096 	movq	%rdx,%r10
   1097 	adcq	$0,%r10
   1098 	addq	%r13,%r11
   1099 	adcq	$0,%r10
   1100 
   1101 	mulq	%r15
   1102 	addq	%rax,%r12
   1103 	movq	%r11,(%rdi,%rcx,1)
   1104 	movq	%rbx,%rax
   1105 	movq	%rdx,%r13
   1106 	adcq	$0,%r13
   1107 	addq	8(%rdi,%rcx,1),%r12
   1108 	leaq	16(%rcx),%rcx
   1109 	adcq	$0,%r13
   1110 
   1111 	mulq	%r14
   1112 	addq	%rax,%r10
   1113 	movq	%rbx,%rax
   1114 	adcq	$0,%rdx
   1115 	addq	%r12,%r10
   1116 	movq	%rdx,%r11
   1117 	adcq	$0,%r11
   1118 	movq	%r10,-8(%rdi,%rcx,1)
   1119 
   1120 	cmpq	$0,%rcx
   1121 	jne	.Lsqr4x_inner
   1122 
   1123 .byte	0x67
   1124 	mulq	%r15
   1125 	addq	%rax,%r13
   1126 	adcq	$0,%rdx
   1127 	addq	%r11,%r13
   1128 	adcq	$0,%rdx
   1129 
   1130 	movq	%r13,(%rdi)
   1131 	movq	%rdx,%r12
   1132 	movq	%rdx,8(%rdi)
   1133 
   1134 	addq	$16,%rbp
   1135 	jnz	.Lsqr4x_outer
   1136 
   1137 
   1138 	movq	-32(%rsi),%r14
   1139 	leaq	48+8(%rsp,%r9,2),%rdi
   1140 	movq	-24(%rsi),%rax
   1141 	leaq	-32(%rdi,%rbp,1),%rdi
   1142 	movq	-16(%rsi),%rbx
   1143 	movq	%rax,%r15
   1144 
   1145 	mulq	%r14
   1146 	addq	%rax,%r10
   1147 	movq	%rbx,%rax
   1148 	movq	%rdx,%r11
   1149 	adcq	$0,%r11
   1150 
   1151 	mulq	%r14
   1152 	addq	%rax,%r11
   1153 	movq	%rbx,%rax
   1154 	movq	%r10,-24(%rdi)
   1155 	movq	%rdx,%r10
   1156 	adcq	$0,%r10
   1157 	addq	%r13,%r11
   1158 	movq	-8(%rsi),%rbx
   1159 	adcq	$0,%r10
   1160 
   1161 	mulq	%r15
   1162 	addq	%rax,%r12
   1163 	movq	%rbx,%rax
   1164 	movq	%r11,-16(%rdi)
   1165 	movq	%rdx,%r13
   1166 	adcq	$0,%r13
   1167 
   1168 	mulq	%r14
   1169 	addq	%rax,%r10
   1170 	movq	%rbx,%rax
   1171 	movq	%rdx,%r11
   1172 	adcq	$0,%r11
   1173 	addq	%r12,%r10
   1174 	adcq	$0,%r11
   1175 	movq	%r10,-8(%rdi)
   1176 
   1177 	mulq	%r15
   1178 	addq	%rax,%r13
   1179 	movq	-16(%rsi),%rax
   1180 	adcq	$0,%rdx
   1181 	addq	%r11,%r13
   1182 	adcq	$0,%rdx
   1183 
   1184 	movq	%r13,(%rdi)
   1185 	movq	%rdx,%r12
   1186 	movq	%rdx,8(%rdi)
   1187 
   1188 	mulq	%rbx
   1189 	addq	$16,%rbp
   1190 	xorq	%r14,%r14
   1191 	subq	%r9,%rbp
   1192 	xorq	%r15,%r15
   1193 
   1194 	addq	%r12,%rax
   1195 	adcq	$0,%rdx
   1196 	movq	%rax,8(%rdi)
   1197 	movq	%rdx,16(%rdi)
   1198 	movq	%r15,24(%rdi)
   1199 
   1200 	movq	-16(%rsi,%rbp,1),%rax
   1201 	leaq	48+8(%rsp),%rdi
   1202 	xorq	%r10,%r10
   1203 	movq	8(%rdi),%r11
   1204 
   1205 	leaq	(%r14,%r10,2),%r12
   1206 	shrq	$63,%r10
   1207 	leaq	(%rcx,%r11,2),%r13
   1208 	shrq	$63,%r11
   1209 	orq	%r10,%r13
   1210 	movq	16(%rdi),%r10
   1211 	movq	%r11,%r14
   1212 	mulq	%rax
   1213 	negq	%r15
   1214 	movq	24(%rdi),%r11
   1215 	adcq	%rax,%r12
   1216 	movq	-8(%rsi,%rbp,1),%rax
   1217 	movq	%r12,(%rdi)
   1218 	adcq	%rdx,%r13
   1219 
   1220 	leaq	(%r14,%r10,2),%rbx
   1221 	movq	%r13,8(%rdi)
   1222 	sbbq	%r15,%r15
   1223 	shrq	$63,%r10
   1224 	leaq	(%rcx,%r11,2),%r8
   1225 	shrq	$63,%r11
   1226 	orq	%r10,%r8
   1227 	movq	32(%rdi),%r10
   1228 	movq	%r11,%r14
   1229 	mulq	%rax
   1230 	negq	%r15
   1231 	movq	40(%rdi),%r11
   1232 	adcq	%rax,%rbx
   1233 	movq	0(%rsi,%rbp,1),%rax
   1234 	movq	%rbx,16(%rdi)
   1235 	adcq	%rdx,%r8
   1236 	leaq	16(%rbp),%rbp
   1237 	movq	%r8,24(%rdi)
   1238 	sbbq	%r15,%r15
   1239 	leaq	64(%rdi),%rdi
   1240 	jmp	.Lsqr4x_shift_n_add
   1241 
   1242 .align	32
   1243 .Lsqr4x_shift_n_add:
   1244 	leaq	(%r14,%r10,2),%r12
   1245 	shrq	$63,%r10
   1246 	leaq	(%rcx,%r11,2),%r13
   1247 	shrq	$63,%r11
   1248 	orq	%r10,%r13
   1249 	movq	-16(%rdi),%r10
   1250 	movq	%r11,%r14
   1251 	mulq	%rax
   1252 	negq	%r15
   1253 	movq	-8(%rdi),%r11
   1254 	adcq	%rax,%r12
   1255 	movq	-8(%rsi,%rbp,1),%rax
   1256 	movq	%r12,-32(%rdi)
   1257 	adcq	%rdx,%r13
   1258 
   1259 	leaq	(%r14,%r10,2),%rbx
   1260 	movq	%r13,-24(%rdi)
   1261 	sbbq	%r15,%r15
   1262 	shrq	$63,%r10
   1263 	leaq	(%rcx,%r11,2),%r8
   1264 	shrq	$63,%r11
   1265 	orq	%r10,%r8
   1266 	movq	0(%rdi),%r10
   1267 	movq	%r11,%r14
   1268 	mulq	%rax
   1269 	negq	%r15
   1270 	movq	8(%rdi),%r11
   1271 	adcq	%rax,%rbx
   1272 	movq	0(%rsi,%rbp,1),%rax
   1273 	movq	%rbx,-16(%rdi)
   1274 	adcq	%rdx,%r8
   1275 
   1276 	leaq	(%r14,%r10,2),%r12
   1277 	movq	%r8,-8(%rdi)
   1278 	sbbq	%r15,%r15
   1279 	shrq	$63,%r10
   1280 	leaq	(%rcx,%r11,2),%r13
   1281 	shrq	$63,%r11
   1282 	orq	%r10,%r13
   1283 	movq	16(%rdi),%r10
   1284 	movq	%r11,%r14
   1285 	mulq	%rax
   1286 	negq	%r15
   1287 	movq	24(%rdi),%r11
   1288 	adcq	%rax,%r12
   1289 	movq	8(%rsi,%rbp,1),%rax
   1290 	movq	%r12,0(%rdi)
   1291 	adcq	%rdx,%r13
   1292 
   1293 	leaq	(%r14,%r10,2),%rbx
   1294 	movq	%r13,8(%rdi)
   1295 	sbbq	%r15,%r15
   1296 	shrq	$63,%r10
   1297 	leaq	(%rcx,%r11,2),%r8
   1298 	shrq	$63,%r11
   1299 	orq	%r10,%r8
   1300 	movq	32(%rdi),%r10
   1301 	movq	%r11,%r14
   1302 	mulq	%rax
   1303 	negq	%r15
   1304 	movq	40(%rdi),%r11
   1305 	adcq	%rax,%rbx
   1306 	movq	16(%rsi,%rbp,1),%rax
   1307 	movq	%rbx,16(%rdi)
   1308 	adcq	%rdx,%r8
   1309 	movq	%r8,24(%rdi)
   1310 	sbbq	%r15,%r15
   1311 	leaq	64(%rdi),%rdi
   1312 	addq	$32,%rbp
   1313 	jnz	.Lsqr4x_shift_n_add
   1314 
   1315 	leaq	(%r14,%r10,2),%r12
   1316 .byte	0x67
   1317 	shrq	$63,%r10
   1318 	leaq	(%rcx,%r11,2),%r13
   1319 	shrq	$63,%r11
   1320 	orq	%r10,%r13
   1321 	movq	-16(%rdi),%r10
   1322 	movq	%r11,%r14
   1323 	mulq	%rax
   1324 	negq	%r15
   1325 	movq	-8(%rdi),%r11
   1326 	adcq	%rax,%r12
   1327 	movq	-8(%rsi),%rax
   1328 	movq	%r12,-32(%rdi)
   1329 	adcq	%rdx,%r13
   1330 
   1331 	leaq	(%r14,%r10,2),%rbx
   1332 	movq	%r13,-24(%rdi)
   1333 	sbbq	%r15,%r15
   1334 	shrq	$63,%r10
   1335 	leaq	(%rcx,%r11,2),%r8
   1336 	shrq	$63,%r11
   1337 	orq	%r10,%r8
   1338 	mulq	%rax
   1339 	negq	%r15
   1340 	adcq	%rax,%rbx
   1341 	adcq	%rdx,%r8
   1342 	movq	%rbx,-16(%rdi)
   1343 	movq	%r8,-8(%rdi)
   1344 .byte	102,72,15,126,213
   1345 sqr8x_reduction:
   1346 	xorq	%rax,%rax
   1347 	leaq	(%rbp,%r9,2),%rcx
   1348 	leaq	48+8(%rsp,%r9,2),%rdx
   1349 	movq	%rcx,0+8(%rsp)
   1350 	leaq	48+8(%rsp,%r9,1),%rdi
   1351 	movq	%rdx,8+8(%rsp)
   1352 	negq	%r9
   1353 	jmp	.L8x_reduction_loop
   1354 
   1355 .align	32
   1356 .L8x_reduction_loop:
   1357 	leaq	(%rdi,%r9,1),%rdi
   1358 .byte	0x66
   1359 	movq	0(%rdi),%rbx
   1360 	movq	8(%rdi),%r9
   1361 	movq	16(%rdi),%r10
   1362 	movq	24(%rdi),%r11
   1363 	movq	32(%rdi),%r12
   1364 	movq	40(%rdi),%r13
   1365 	movq	48(%rdi),%r14
   1366 	movq	56(%rdi),%r15
   1367 	movq	%rax,(%rdx)
   1368 	leaq	64(%rdi),%rdi
   1369 
   1370 .byte	0x67
   1371 	movq	%rbx,%r8
   1372 	imulq	32+8(%rsp),%rbx
   1373 	movq	0(%rbp),%rax
   1374 	movl	$8,%ecx
   1375 	jmp	.L8x_reduce
   1376 
   1377 .align	32
   1378 .L8x_reduce:
   1379 	mulq	%rbx
   1380 	movq	16(%rbp),%rax
   1381 	negq	%r8
   1382 	movq	%rdx,%r8
   1383 	adcq	$0,%r8
   1384 
   1385 	mulq	%rbx
   1386 	addq	%rax,%r9
   1387 	movq	32(%rbp),%rax
   1388 	adcq	$0,%rdx
   1389 	addq	%r9,%r8
   1390 	movq	%rbx,48-8+8(%rsp,%rcx,8)
   1391 	movq	%rdx,%r9
   1392 	adcq	$0,%r9
   1393 
   1394 	mulq	%rbx
   1395 	addq	%rax,%r10
   1396 	movq	48(%rbp),%rax
   1397 	adcq	$0,%rdx
   1398 	addq	%r10,%r9
   1399 	movq	32+8(%rsp),%rsi
   1400 	movq	%rdx,%r10
   1401 	adcq	$0,%r10
   1402 
   1403 	mulq	%rbx
   1404 	addq	%rax,%r11
   1405 	movq	64(%rbp),%rax
   1406 	adcq	$0,%rdx
   1407 	imulq	%r8,%rsi
   1408 	addq	%r11,%r10
   1409 	movq	%rdx,%r11
   1410 	adcq	$0,%r11
   1411 
   1412 	mulq	%rbx
   1413 	addq	%rax,%r12
   1414 	movq	80(%rbp),%rax
   1415 	adcq	$0,%rdx
   1416 	addq	%r12,%r11
   1417 	movq	%rdx,%r12
   1418 	adcq	$0,%r12
   1419 
   1420 	mulq	%rbx
   1421 	addq	%rax,%r13
   1422 	movq	96(%rbp),%rax
   1423 	adcq	$0,%rdx
   1424 	addq	%r13,%r12
   1425 	movq	%rdx,%r13
   1426 	adcq	$0,%r13
   1427 
   1428 	mulq	%rbx
   1429 	addq	%rax,%r14
   1430 	movq	112(%rbp),%rax
   1431 	adcq	$0,%rdx
   1432 	addq	%r14,%r13
   1433 	movq	%rdx,%r14
   1434 	adcq	$0,%r14
   1435 
   1436 	mulq	%rbx
   1437 	movq	%rsi,%rbx
   1438 	addq	%rax,%r15
   1439 	movq	0(%rbp),%rax
   1440 	adcq	$0,%rdx
   1441 	addq	%r15,%r14
   1442 	movq	%rdx,%r15
   1443 	adcq	$0,%r15
   1444 
   1445 	decl	%ecx
   1446 	jnz	.L8x_reduce
   1447 
   1448 	leaq	128(%rbp),%rbp
   1449 	xorq	%rax,%rax
   1450 	movq	8+8(%rsp),%rdx
   1451 	cmpq	0+8(%rsp),%rbp
   1452 	jae	.L8x_no_tail
   1453 
   1454 .byte	0x66
   1455 	addq	0(%rdi),%r8
   1456 	adcq	8(%rdi),%r9
   1457 	adcq	16(%rdi),%r10
   1458 	adcq	24(%rdi),%r11
   1459 	adcq	32(%rdi),%r12
   1460 	adcq	40(%rdi),%r13
   1461 	adcq	48(%rdi),%r14
   1462 	adcq	56(%rdi),%r15
   1463 	sbbq	%rsi,%rsi
   1464 
   1465 	movq	48+56+8(%rsp),%rbx
   1466 	movl	$8,%ecx
   1467 	movq	0(%rbp),%rax
   1468 	jmp	.L8x_tail
   1469 
   1470 .align	32
   1471 .L8x_tail:
   1472 	mulq	%rbx
   1473 	addq	%rax,%r8
   1474 	movq	16(%rbp),%rax
   1475 	movq	%r8,(%rdi)
   1476 	movq	%rdx,%r8
   1477 	adcq	$0,%r8
   1478 
   1479 	mulq	%rbx
   1480 	addq	%rax,%r9
   1481 	movq	32(%rbp),%rax
   1482 	adcq	$0,%rdx
   1483 	addq	%r9,%r8
   1484 	leaq	8(%rdi),%rdi
   1485 	movq	%rdx,%r9
   1486 	adcq	$0,%r9
   1487 
   1488 	mulq	%rbx
   1489 	addq	%rax,%r10
   1490 	movq	48(%rbp),%rax
   1491 	adcq	$0,%rdx
   1492 	addq	%r10,%r9
   1493 	movq	%rdx,%r10
   1494 	adcq	$0,%r10
   1495 
   1496 	mulq	%rbx
   1497 	addq	%rax,%r11
   1498 	movq	64(%rbp),%rax
   1499 	adcq	$0,%rdx
   1500 	addq	%r11,%r10
   1501 	movq	%rdx,%r11
   1502 	adcq	$0,%r11
   1503 
   1504 	mulq	%rbx
   1505 	addq	%rax,%r12
   1506 	movq	80(%rbp),%rax
   1507 	adcq	$0,%rdx
   1508 	addq	%r12,%r11
   1509 	movq	%rdx,%r12
   1510 	adcq	$0,%r12
   1511 
   1512 	mulq	%rbx
   1513 	addq	%rax,%r13
   1514 	movq	96(%rbp),%rax
   1515 	adcq	$0,%rdx
   1516 	addq	%r13,%r12
   1517 	movq	%rdx,%r13
   1518 	adcq	$0,%r13
   1519 
   1520 	mulq	%rbx
   1521 	addq	%rax,%r14
   1522 	movq	112(%rbp),%rax
   1523 	adcq	$0,%rdx
   1524 	addq	%r14,%r13
   1525 	movq	%rdx,%r14
   1526 	adcq	$0,%r14
   1527 
   1528 	mulq	%rbx
   1529 	movq	48-16+8(%rsp,%rcx,8),%rbx
   1530 	addq	%rax,%r15
   1531 	adcq	$0,%rdx
   1532 	addq	%r15,%r14
   1533 	movq	0(%rbp),%rax
   1534 	movq	%rdx,%r15
   1535 	adcq	$0,%r15
   1536 
   1537 	decl	%ecx
   1538 	jnz	.L8x_tail
   1539 
   1540 	leaq	128(%rbp),%rbp
   1541 	movq	8+8(%rsp),%rdx
   1542 	cmpq	0+8(%rsp),%rbp
   1543 	jae	.L8x_tail_done
   1544 
   1545 	movq	48+56+8(%rsp),%rbx
   1546 	negq	%rsi
   1547 	movq	0(%rbp),%rax
   1548 	adcq	0(%rdi),%r8
   1549 	adcq	8(%rdi),%r9
   1550 	adcq	16(%rdi),%r10
   1551 	adcq	24(%rdi),%r11
   1552 	adcq	32(%rdi),%r12
   1553 	adcq	40(%rdi),%r13
   1554 	adcq	48(%rdi),%r14
   1555 	adcq	56(%rdi),%r15
   1556 	sbbq	%rsi,%rsi
   1557 
   1558 	movl	$8,%ecx
   1559 	jmp	.L8x_tail
   1560 
   1561 .align	32
   1562 .L8x_tail_done:
   1563 	addq	(%rdx),%r8
   1564 	xorq	%rax,%rax
   1565 
   1566 	negq	%rsi
   1567 .L8x_no_tail:
   1568 	adcq	0(%rdi),%r8
   1569 	adcq	8(%rdi),%r9
   1570 	adcq	16(%rdi),%r10
   1571 	adcq	24(%rdi),%r11
   1572 	adcq	32(%rdi),%r12
   1573 	adcq	40(%rdi),%r13
   1574 	adcq	48(%rdi),%r14
   1575 	adcq	56(%rdi),%r15
   1576 	adcq	$0,%rax
   1577 	movq	-16(%rbp),%rcx
   1578 	xorq	%rsi,%rsi
   1579 
   1580 .byte	102,72,15,126,213
   1581 
   1582 	movq	%r8,0(%rdi)
   1583 	movq	%r9,8(%rdi)
   1584 .byte	102,73,15,126,217
   1585 	movq	%r10,16(%rdi)
   1586 	movq	%r11,24(%rdi)
   1587 	movq	%r12,32(%rdi)
   1588 	movq	%r13,40(%rdi)
   1589 	movq	%r14,48(%rdi)
   1590 	movq	%r15,56(%rdi)
   1591 	leaq	64(%rdi),%rdi
   1592 
   1593 	cmpq	%rdx,%rdi
   1594 	jb	.L8x_reduction_loop
   1595 
   1596 	subq	%r15,%rcx
   1597 	leaq	(%rdi,%r9,1),%rbx
   1598 	adcq	%rsi,%rsi
   1599 	movq	%r9,%rcx
   1600 	orq	%rsi,%rax
   1601 .byte	102,72,15,126,207
   1602 	xorq	$1,%rax
   1603 .byte	102,72,15,126,206
   1604 	leaq	(%rbp,%rax,8),%rbp
   1605 	sarq	$3+2,%rcx
   1606 	jmp	.Lsqr4x_sub
   1607 
   1608 .align	32
   1609 .Lsqr4x_sub:
   1610 .byte	0x66
   1611 	movq	0(%rbx),%r12
   1612 	movq	8(%rbx),%r13
   1613 	sbbq	0(%rbp),%r12
   1614 	movq	16(%rbx),%r14
   1615 	sbbq	16(%rbp),%r13
   1616 	movq	24(%rbx),%r15
   1617 	leaq	32(%rbx),%rbx
   1618 	sbbq	32(%rbp),%r14
   1619 	movq	%r12,0(%rdi)
   1620 	sbbq	48(%rbp),%r15
   1621 	leaq	64(%rbp),%rbp
   1622 	movq	%r13,8(%rdi)
   1623 	movq	%r14,16(%rdi)
   1624 	movq	%r15,24(%rdi)
   1625 	leaq	32(%rdi),%rdi
   1626 
   1627 	incq	%rcx
   1628 	jnz	.Lsqr4x_sub
   1629 	movq	%r9,%r10
   1630 	negq	%r9
   1631 	.byte	0xf3,0xc3
   1632 .size	bn_sqr8x_internal,.-bn_sqr8x_internal
.globl	bn_from_montgomery
.hidden bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
/*
 * Public entry point for converting a value out of Montgomery form.
 * Only vector lengths divisible by 8 are handled here (num is in %r9d,
 * the sixth System V argument): such calls tail-jump into bn_from_mont8x
 * below, which returns 1 on success.  Any other length returns 0 so the
 * caller can fall back to a generic code path.
 */
bn_from_montgomery:
	testl	$7,%r9d			/* is num a multiple of 8? */
	jz	bn_from_mont8x		/* yes: 8x implementation does the work */
	xorl	%eax,%eax		/* no: return 0 (unsupported length) */
	.byte	0xf3,0xc3		/* rep ret (branch-predictor-friendly return) */
.size	bn_from_montgomery,.-bn_from_montgomery
   1643 
.type	bn_from_mont8x,@function
.align	32
/*
 * Montgomery-form conversion for num % 8 == 0; reached only via
 * bn_from_montgomery above.  Copies a[] into a stack temporary tp,
 * zeroes the upper half, runs sqr8x_reduction (defined earlier in this
 * file) over it, wipes the temporary and returns 1 in %rax.
 *
 * NOTE(review): argument roles below are inferred from register usage
 * and the bn_mul_mont convention (rdi=rp, rsi=ap, rcx=np, r8=&n0,
 * r9=num) -- confirm against the perlasm source.
 */
bn_from_mont8x:
.byte	0x67			/* address-size prefix used as padding; attaches to next insn */
	movq	%rsp,%rax		/* rax = caller's rsp, kept for the epilogue */
	pushq	%rbx			/* save all callee-saved GPRs */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d			/* r9  = num*8, vector size in bytes */
	shll	$3+2,%r10d		/* r10 = num*32 */
	negq	%r9			/* r9  = -num*8 */
	movq	(%r8),%r8		/* r8  = n0 word (dereference the pointer) */

	/*
	 * Choose where to place the 2*num*8-byte tp frame.  The distance
	 * between the frame and ap (%rsi) modulo 4096 is compared against
	 * num*32 and the placement adjusted accordingly; presumably this
	 * keeps tp and ap from aliasing within a page (matches upstream
	 * x86_64-mont5.pl) -- TODO confirm exact criterion.
	 */
	leaq	-64(%rsp,%r9,2),%r11	/* r11 = prospective frame top */
	subq	%rsi,%r11
	andq	$4095,%r11		/* r11 = (frame - ap) mod 4096 */
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rsp		/* shift frame down by the page offset */
	leaq	-64(%rsp,%r9,2),%rsp	/* rsp -= 64 + 2*num*8 */
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-64(,%r9,2),%r10	/* r10 = 4096-64 - 2*num*8 */
	leaq	-64(%rsp,%r9,2),%rsp	/* allocate the frame first */
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11		/* clamp adjustment at 0 on borrow */
	subq	%r11,%rsp
.Lfrom_sp_done:
	andq	$-64,%rsp		/* 64-byte align the frame */
	movq	%r9,%r10		/* r10 = -num*8 (stashed for later) */
	negq	%r9			/* r9  = +num*8 */

	/*
	 * Frame layout (offsets from %rsp):
	 *   32(%rsp) = n0 word
	 *   40(%rsp) = caller's original %rsp (restored in the epilogue)
	 *   48(%rsp) = tp, 2*num words: a[] in the low half, zeros above
	 */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lfrom_body:
	movq	%r9,%r11		/* r11 = num*8, byte counter for copy loop */
	leaq	48(%rsp),%rax		/* rax = tp */
	pxor	%xmm0,%xmm0		/* xmm0 = 0, clears tp[num..2*num-1] */
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* copy 64 bytes of a[] into tp, zeroing tp+num*8 interleaved */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	/* zero the mirror slot num*8 bytes up */
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207	/* movq %rdi,%xmm1: stash rp across the call */
.byte	102,72,15,110,209	/* movq %rcx,%xmm2: stash np */
.byte	0x67
	movq	%rcx,%rbp		/* rbp = np, input to sqr8x_reduction */
.byte	102,73,15,110,218	/* movq %r10,%xmm3: stash -num*8 */
	call	sqr8x_reduction	/* Montgomery-reduce tp in place */

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax		/* rax = tp, to be wiped */
	movq	40(%rsp),%rsi		/* rsi = saved original rsp */
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* wipe the 2*num*8-byte temporary before returning */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9			/* 64 bytes per 32 counted = 2*num*8 total */
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1 (success) */
	movq	-48(%rsi),%r15		/* restore callee-saved registers ... */
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		/* ... and the caller's stack pointer */
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_mont8x,.-bn_from_mont8x
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
/*
 * Scatter num words into column `power` of a strided table:
 * word i is stored at table + 8*power + 256*i (256-byte rows).
 * bn_gather5 below reads the same layout back.
 *
 *   rdi = inp (words to store), esi = num (word count),
 *   rdx = table base, rcx = power (column index)
 */
bn_scatter5:
	cmpl	$0,%esi			/* num == 0: nothing to copy */
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	/* rdx = table + 8*power */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* advance one 256-byte table row */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_scatter5,.-bn_scatter5
   1777 
.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	16
/*
 * Gather num words of column `power` from a table laid out by
 * bn_scatter5 (word i of power p lives at table + 8*p + 256*i, where
 * 8*p = 8*(p%8) + 64*(p/8)).  Each iteration loads all four candidate
 * slots 64 bytes apart and selects the wanted one with SSE2 masks, so
 * the sequence of addresses touched does not depend on `power`
 * (timing side-channel hardening).
 *
 *   rdi = out, esi = num (word count), rdx = table, ecx = power
 */
bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx			/* ecx = power/8 */
	andq	$7,%r11			/* r11 = power%8 */
	notl	%ecx
	leaq	.Lmagic_masks(%rip),%rax
	andl	$3,%ecx			/* ecx = 3 - (power/8)%4, mask-table index */
	leaq	128(%rdx,%r11,8),%rdx	/* rdx = table + 8*(power%8) + 128 */
	/* exactly one of xmm4..xmm7 becomes all-ones: the ((power/8)%4)-th */
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-128(%rdx),%xmm0	/* candidate for power/8 == 0 */
	movq	-64(%rdx),%xmm1		/* candidate for power/8 == 1 */
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2		/* candidate for power/8 == 2 */
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3		/* candidate for power/8 == 3 */
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67			/* padding prefixes (code-size/alignment tuning) */
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		/* next table row */
	por	%xmm3,%xmm0		/* xmm0 = the one unmasked candidate */

	movq	%xmm0,(%rdi)		/* out[i] = selected word */
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather
	.byte	0xf3,0xc3		/* rep ret */
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
/*
 * Mask table for the masked gathers in this file.  Viewed as eight
 * 64-bit qwords q0..q7, only q3 is all-ones.  Loading four consecutive
 * qwords starting at index 3-k therefore yields one all-ones mask in
 * position k and zeros elsewhere, selecting candidate k.
 */
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0,  0,0
/* ASCII credit string: "Montgomery Multiplication with scatter/gather
 * for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1823 #endif
   1824