# BoringSSL fipsmodule — x86-64 Montgomery multiplication ("mont5"), perlasm output.
      1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
      2 .text
      3 
      4 .extern	OPENSSL_ia32cap_P
      5 .hidden OPENSSL_ia32cap_P
      6 
      7 .globl	bn_mul_mont_gather5
      8 .hidden bn_mul_mont_gather5
      9 .type	bn_mul_mont_gather5,@function
     10 .align	64
     11 bn_mul_mont_gather5:
     12 .cfi_startproc
     13 	movl	%r9d,%r9d
     14 	movq	%rsp,%rax
     15 .cfi_def_cfa_register	%rax
     16 	testl	$7,%r9d
     17 	jnz	.Lmul_enter
     18 	jmp	.Lmul4x_enter
     19 
     20 .align	16
     21 .Lmul_enter:
     22 	movd	8(%rsp),%xmm5
     23 	pushq	%rbx
     24 .cfi_offset	%rbx,-16
     25 	pushq	%rbp
     26 .cfi_offset	%rbp,-24
     27 	pushq	%r12
     28 .cfi_offset	%r12,-32
     29 	pushq	%r13
     30 .cfi_offset	%r13,-40
     31 	pushq	%r14
     32 .cfi_offset	%r14,-48
     33 	pushq	%r15
     34 .cfi_offset	%r15,-56
     35 
     36 	negq	%r9
     37 	movq	%rsp,%r11
     38 	leaq	-280(%rsp,%r9,8),%r10
     39 	negq	%r9
     40 	andq	$-1024,%r10
     41 
     42 
     43 
     44 
     45 
     46 
     47 
     48 
     49 
     50 	subq	%r10,%r11
     51 	andq	$-4096,%r11
     52 	leaq	(%r10,%r11,1),%rsp
     53 	movq	(%rsp),%r11
     54 	cmpq	%r10,%rsp
     55 	ja	.Lmul_page_walk
     56 	jmp	.Lmul_page_walk_done
     57 
     58 .Lmul_page_walk:
     59 	leaq	-4096(%rsp),%rsp
     60 	movq	(%rsp),%r11
     61 	cmpq	%r10,%rsp
     62 	ja	.Lmul_page_walk
     63 .Lmul_page_walk_done:
     64 
     65 	leaq	.Linc(%rip),%r10
     66 	movq	%rax,8(%rsp,%r9,8)
     67 .cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
     68 .Lmul_body:
     69 
     70 	leaq	128(%rdx),%r12
     71 	movdqa	0(%r10),%xmm0
     72 	movdqa	16(%r10),%xmm1
     73 	leaq	24-112(%rsp,%r9,8),%r10
     74 	andq	$-16,%r10
     75 
     76 	pshufd	$0,%xmm5,%xmm5
     77 	movdqa	%xmm1,%xmm4
     78 	movdqa	%xmm1,%xmm2
     79 	paddd	%xmm0,%xmm1
     80 	pcmpeqd	%xmm5,%xmm0
     81 .byte	0x67
     82 	movdqa	%xmm4,%xmm3
     83 	paddd	%xmm1,%xmm2
     84 	pcmpeqd	%xmm5,%xmm1
     85 	movdqa	%xmm0,112(%r10)
     86 	movdqa	%xmm4,%xmm0
     87 
     88 	paddd	%xmm2,%xmm3
     89 	pcmpeqd	%xmm5,%xmm2
     90 	movdqa	%xmm1,128(%r10)
     91 	movdqa	%xmm4,%xmm1
     92 
     93 	paddd	%xmm3,%xmm0
     94 	pcmpeqd	%xmm5,%xmm3
     95 	movdqa	%xmm2,144(%r10)
     96 	movdqa	%xmm4,%xmm2
     97 
     98 	paddd	%xmm0,%xmm1
     99 	pcmpeqd	%xmm5,%xmm0
    100 	movdqa	%xmm3,160(%r10)
    101 	movdqa	%xmm4,%xmm3
    102 	paddd	%xmm1,%xmm2
    103 	pcmpeqd	%xmm5,%xmm1
    104 	movdqa	%xmm0,176(%r10)
    105 	movdqa	%xmm4,%xmm0
    106 
    107 	paddd	%xmm2,%xmm3
    108 	pcmpeqd	%xmm5,%xmm2
    109 	movdqa	%xmm1,192(%r10)
    110 	movdqa	%xmm4,%xmm1
    111 
    112 	paddd	%xmm3,%xmm0
    113 	pcmpeqd	%xmm5,%xmm3
    114 	movdqa	%xmm2,208(%r10)
    115 	movdqa	%xmm4,%xmm2
    116 
    117 	paddd	%xmm0,%xmm1
    118 	pcmpeqd	%xmm5,%xmm0
    119 	movdqa	%xmm3,224(%r10)
    120 	movdqa	%xmm4,%xmm3
    121 	paddd	%xmm1,%xmm2
    122 	pcmpeqd	%xmm5,%xmm1
    123 	movdqa	%xmm0,240(%r10)
    124 	movdqa	%xmm4,%xmm0
    125 
    126 	paddd	%xmm2,%xmm3
    127 	pcmpeqd	%xmm5,%xmm2
    128 	movdqa	%xmm1,256(%r10)
    129 	movdqa	%xmm4,%xmm1
    130 
    131 	paddd	%xmm3,%xmm0
    132 	pcmpeqd	%xmm5,%xmm3
    133 	movdqa	%xmm2,272(%r10)
    134 	movdqa	%xmm4,%xmm2
    135 
    136 	paddd	%xmm0,%xmm1
    137 	pcmpeqd	%xmm5,%xmm0
    138 	movdqa	%xmm3,288(%r10)
    139 	movdqa	%xmm4,%xmm3
    140 	paddd	%xmm1,%xmm2
    141 	pcmpeqd	%xmm5,%xmm1
    142 	movdqa	%xmm0,304(%r10)
    143 
    144 	paddd	%xmm2,%xmm3
    145 .byte	0x67
    146 	pcmpeqd	%xmm5,%xmm2
    147 	movdqa	%xmm1,320(%r10)
    148 
    149 	pcmpeqd	%xmm5,%xmm3
    150 	movdqa	%xmm2,336(%r10)
    151 	pand	64(%r12),%xmm0
    152 
    153 	pand	80(%r12),%xmm1
    154 	pand	96(%r12),%xmm2
    155 	movdqa	%xmm3,352(%r10)
    156 	pand	112(%r12),%xmm3
    157 	por	%xmm2,%xmm0
    158 	por	%xmm3,%xmm1
    159 	movdqa	-128(%r12),%xmm4
    160 	movdqa	-112(%r12),%xmm5
    161 	movdqa	-96(%r12),%xmm2
    162 	pand	112(%r10),%xmm4
    163 	movdqa	-80(%r12),%xmm3
    164 	pand	128(%r10),%xmm5
    165 	por	%xmm4,%xmm0
    166 	pand	144(%r10),%xmm2
    167 	por	%xmm5,%xmm1
    168 	pand	160(%r10),%xmm3
    169 	por	%xmm2,%xmm0
    170 	por	%xmm3,%xmm1
    171 	movdqa	-64(%r12),%xmm4
    172 	movdqa	-48(%r12),%xmm5
    173 	movdqa	-32(%r12),%xmm2
    174 	pand	176(%r10),%xmm4
    175 	movdqa	-16(%r12),%xmm3
    176 	pand	192(%r10),%xmm5
    177 	por	%xmm4,%xmm0
    178 	pand	208(%r10),%xmm2
    179 	por	%xmm5,%xmm1
    180 	pand	224(%r10),%xmm3
    181 	por	%xmm2,%xmm0
    182 	por	%xmm3,%xmm1
    183 	movdqa	0(%r12),%xmm4
    184 	movdqa	16(%r12),%xmm5
    185 	movdqa	32(%r12),%xmm2
    186 	pand	240(%r10),%xmm4
    187 	movdqa	48(%r12),%xmm3
    188 	pand	256(%r10),%xmm5
    189 	por	%xmm4,%xmm0
    190 	pand	272(%r10),%xmm2
    191 	por	%xmm5,%xmm1
    192 	pand	288(%r10),%xmm3
    193 	por	%xmm2,%xmm0
    194 	por	%xmm3,%xmm1
    195 	por	%xmm1,%xmm0
    196 	pshufd	$0x4e,%xmm0,%xmm1
    197 	por	%xmm1,%xmm0
    198 	leaq	256(%r12),%r12
    199 .byte	102,72,15,126,195
    200 
    201 	movq	(%r8),%r8
    202 	movq	(%rsi),%rax
    203 
    204 	xorq	%r14,%r14
    205 	xorq	%r15,%r15
    206 
    207 	movq	%r8,%rbp
    208 	mulq	%rbx
    209 	movq	%rax,%r10
    210 	movq	(%rcx),%rax
    211 
    212 	imulq	%r10,%rbp
    213 	movq	%rdx,%r11
    214 
    215 	mulq	%rbp
    216 	addq	%rax,%r10
    217 	movq	8(%rsi),%rax
    218 	adcq	$0,%rdx
    219 	movq	%rdx,%r13
    220 
    221 	leaq	1(%r15),%r15
    222 	jmp	.L1st_enter
    223 
    224 .align	16
    225 .L1st:
    226 	addq	%rax,%r13
    227 	movq	(%rsi,%r15,8),%rax
    228 	adcq	$0,%rdx
    229 	addq	%r11,%r13
    230 	movq	%r10,%r11
    231 	adcq	$0,%rdx
    232 	movq	%r13,-16(%rsp,%r15,8)
    233 	movq	%rdx,%r13
    234 
    235 .L1st_enter:
    236 	mulq	%rbx
    237 	addq	%rax,%r11
    238 	movq	(%rcx,%r15,8),%rax
    239 	adcq	$0,%rdx
    240 	leaq	1(%r15),%r15
    241 	movq	%rdx,%r10
    242 
    243 	mulq	%rbp
    244 	cmpq	%r9,%r15
    245 	jne	.L1st
    246 
    247 
    248 	addq	%rax,%r13
    249 	adcq	$0,%rdx
    250 	addq	%r11,%r13
    251 	adcq	$0,%rdx
    252 	movq	%r13,-16(%rsp,%r9,8)
    253 	movq	%rdx,%r13
    254 	movq	%r10,%r11
    255 
    256 	xorq	%rdx,%rdx
    257 	addq	%r11,%r13
    258 	adcq	$0,%rdx
    259 	movq	%r13,-8(%rsp,%r9,8)
    260 	movq	%rdx,(%rsp,%r9,8)
    261 
    262 	leaq	1(%r14),%r14
    263 	jmp	.Louter
    264 .align	16
    265 .Louter:
    266 	leaq	24+128(%rsp,%r9,8),%rdx
    267 	andq	$-16,%rdx
    268 	pxor	%xmm4,%xmm4
    269 	pxor	%xmm5,%xmm5
    270 	movdqa	-128(%r12),%xmm0
    271 	movdqa	-112(%r12),%xmm1
    272 	movdqa	-96(%r12),%xmm2
    273 	movdqa	-80(%r12),%xmm3
    274 	pand	-128(%rdx),%xmm0
    275 	pand	-112(%rdx),%xmm1
    276 	por	%xmm0,%xmm4
    277 	pand	-96(%rdx),%xmm2
    278 	por	%xmm1,%xmm5
    279 	pand	-80(%rdx),%xmm3
    280 	por	%xmm2,%xmm4
    281 	por	%xmm3,%xmm5
    282 	movdqa	-64(%r12),%xmm0
    283 	movdqa	-48(%r12),%xmm1
    284 	movdqa	-32(%r12),%xmm2
    285 	movdqa	-16(%r12),%xmm3
    286 	pand	-64(%rdx),%xmm0
    287 	pand	-48(%rdx),%xmm1
    288 	por	%xmm0,%xmm4
    289 	pand	-32(%rdx),%xmm2
    290 	por	%xmm1,%xmm5
    291 	pand	-16(%rdx),%xmm3
    292 	por	%xmm2,%xmm4
    293 	por	%xmm3,%xmm5
    294 	movdqa	0(%r12),%xmm0
    295 	movdqa	16(%r12),%xmm1
    296 	movdqa	32(%r12),%xmm2
    297 	movdqa	48(%r12),%xmm3
    298 	pand	0(%rdx),%xmm0
    299 	pand	16(%rdx),%xmm1
    300 	por	%xmm0,%xmm4
    301 	pand	32(%rdx),%xmm2
    302 	por	%xmm1,%xmm5
    303 	pand	48(%rdx),%xmm3
    304 	por	%xmm2,%xmm4
    305 	por	%xmm3,%xmm5
    306 	movdqa	64(%r12),%xmm0
    307 	movdqa	80(%r12),%xmm1
    308 	movdqa	96(%r12),%xmm2
    309 	movdqa	112(%r12),%xmm3
    310 	pand	64(%rdx),%xmm0
    311 	pand	80(%rdx),%xmm1
    312 	por	%xmm0,%xmm4
    313 	pand	96(%rdx),%xmm2
    314 	por	%xmm1,%xmm5
    315 	pand	112(%rdx),%xmm3
    316 	por	%xmm2,%xmm4
    317 	por	%xmm3,%xmm5
    318 	por	%xmm5,%xmm4
    319 	pshufd	$0x4e,%xmm4,%xmm0
    320 	por	%xmm4,%xmm0
    321 	leaq	256(%r12),%r12
    322 
    323 	movq	(%rsi),%rax
    324 .byte	102,72,15,126,195
    325 
    326 	xorq	%r15,%r15
    327 	movq	%r8,%rbp
    328 	movq	(%rsp),%r10
    329 
    330 	mulq	%rbx
    331 	addq	%rax,%r10
    332 	movq	(%rcx),%rax
    333 	adcq	$0,%rdx
    334 
    335 	imulq	%r10,%rbp
    336 	movq	%rdx,%r11
    337 
    338 	mulq	%rbp
    339 	addq	%rax,%r10
    340 	movq	8(%rsi),%rax
    341 	adcq	$0,%rdx
    342 	movq	8(%rsp),%r10
    343 	movq	%rdx,%r13
    344 
    345 	leaq	1(%r15),%r15
    346 	jmp	.Linner_enter
    347 
    348 .align	16
    349 .Linner:
    350 	addq	%rax,%r13
    351 	movq	(%rsi,%r15,8),%rax
    352 	adcq	$0,%rdx
    353 	addq	%r10,%r13
    354 	movq	(%rsp,%r15,8),%r10
    355 	adcq	$0,%rdx
    356 	movq	%r13,-16(%rsp,%r15,8)
    357 	movq	%rdx,%r13
    358 
    359 .Linner_enter:
    360 	mulq	%rbx
    361 	addq	%rax,%r11
    362 	movq	(%rcx,%r15,8),%rax
    363 	adcq	$0,%rdx
    364 	addq	%r11,%r10
    365 	movq	%rdx,%r11
    366 	adcq	$0,%r11
    367 	leaq	1(%r15),%r15
    368 
    369 	mulq	%rbp
    370 	cmpq	%r9,%r15
    371 	jne	.Linner
    372 
    373 	addq	%rax,%r13
    374 	adcq	$0,%rdx
    375 	addq	%r10,%r13
    376 	movq	(%rsp,%r9,8),%r10
    377 	adcq	$0,%rdx
    378 	movq	%r13,-16(%rsp,%r9,8)
    379 	movq	%rdx,%r13
    380 
    381 	xorq	%rdx,%rdx
    382 	addq	%r11,%r13
    383 	adcq	$0,%rdx
    384 	addq	%r10,%r13
    385 	adcq	$0,%rdx
    386 	movq	%r13,-8(%rsp,%r9,8)
    387 	movq	%rdx,(%rsp,%r9,8)
    388 
    389 	leaq	1(%r14),%r14
    390 	cmpq	%r9,%r14
    391 	jb	.Louter
    392 
    393 	xorq	%r14,%r14
    394 	movq	(%rsp),%rax
    395 	leaq	(%rsp),%rsi
    396 	movq	%r9,%r15
    397 	jmp	.Lsub
    398 .align	16
    399 .Lsub:
    400 	sbbq	(%rcx,%r14,8),%rax
    401 	movq	%rax,(%rdi,%r14,8)
    402 	movq	8(%rsi,%r14,8),%rax
    403 	leaq	1(%r14),%r14
    404 	decq	%r15
    405 	jnz	.Lsub
    406 
    407 	sbbq	$0,%rax
    408 	xorq	%r14,%r14
    409 	andq	%rax,%rsi
    410 	notq	%rax
    411 	movq	%rdi,%rcx
    412 	andq	%rax,%rcx
    413 	movq	%r9,%r15
    414 	orq	%rcx,%rsi
    415 .align	16
    416 .Lcopy:
    417 	movq	(%rsi,%r14,8),%rax
    418 	movq	%r14,(%rsp,%r14,8)
    419 	movq	%rax,(%rdi,%r14,8)
    420 	leaq	1(%r14),%r14
    421 	subq	$1,%r15
    422 	jnz	.Lcopy
    423 
    424 	movq	8(%rsp,%r9,8),%rsi
    425 .cfi_def_cfa	%rsi,8
    426 	movq	$1,%rax
    427 
    428 	movq	-48(%rsi),%r15
    429 .cfi_restore	%r15
    430 	movq	-40(%rsi),%r14
    431 .cfi_restore	%r14
    432 	movq	-32(%rsi),%r13
    433 .cfi_restore	%r13
    434 	movq	-24(%rsi),%r12
    435 .cfi_restore	%r12
    436 	movq	-16(%rsi),%rbp
    437 .cfi_restore	%rbp
    438 	movq	-8(%rsi),%rbx
    439 .cfi_restore	%rbx
    440 	leaq	(%rsi),%rsp
    441 .cfi_def_cfa_register	%rsp
    442 .Lmul_epilogue:
    443 	.byte	0xf3,0xc3
    444 .cfi_endproc
    445 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
    446 .type	bn_mul4x_mont_gather5,@function
    447 .align	32
    448 bn_mul4x_mont_gather5:
    449 .cfi_startproc
    450 .byte	0x67
    451 	movq	%rsp,%rax
    452 .cfi_def_cfa_register	%rax
    453 .Lmul4x_enter:
    454 	pushq	%rbx
    455 .cfi_offset	%rbx,-16
    456 	pushq	%rbp
    457 .cfi_offset	%rbp,-24
    458 	pushq	%r12
    459 .cfi_offset	%r12,-32
    460 	pushq	%r13
    461 .cfi_offset	%r13,-40
    462 	pushq	%r14
    463 .cfi_offset	%r14,-48
    464 	pushq	%r15
    465 .cfi_offset	%r15,-56
    466 .Lmul4x_prologue:
    467 
    468 .byte	0x67
    469 	shll	$3,%r9d
    470 	leaq	(%r9,%r9,2),%r10
    471 	negq	%r9
    472 
    473 
    474 
    475 
    476 
    477 
    478 
    479 
    480 
    481 
    482 	leaq	-320(%rsp,%r9,2),%r11
    483 	movq	%rsp,%rbp
    484 	subq	%rdi,%r11
    485 	andq	$4095,%r11
    486 	cmpq	%r11,%r10
    487 	jb	.Lmul4xsp_alt
    488 	subq	%r11,%rbp
    489 	leaq	-320(%rbp,%r9,2),%rbp
    490 	jmp	.Lmul4xsp_done
    491 
    492 .align	32
    493 .Lmul4xsp_alt:
    494 	leaq	4096-320(,%r9,2),%r10
    495 	leaq	-320(%rbp,%r9,2),%rbp
    496 	subq	%r10,%r11
    497 	movq	$0,%r10
    498 	cmovcq	%r10,%r11
    499 	subq	%r11,%rbp
    500 .Lmul4xsp_done:
    501 	andq	$-64,%rbp
    502 	movq	%rsp,%r11
    503 	subq	%rbp,%r11
    504 	andq	$-4096,%r11
    505 	leaq	(%r11,%rbp,1),%rsp
    506 	movq	(%rsp),%r10
    507 	cmpq	%rbp,%rsp
    508 	ja	.Lmul4x_page_walk
    509 	jmp	.Lmul4x_page_walk_done
    510 
    511 .Lmul4x_page_walk:
    512 	leaq	-4096(%rsp),%rsp
    513 	movq	(%rsp),%r10
    514 	cmpq	%rbp,%rsp
    515 	ja	.Lmul4x_page_walk
    516 .Lmul4x_page_walk_done:
    517 
    518 	negq	%r9
    519 
    520 	movq	%rax,40(%rsp)
    521 .cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
    522 .Lmul4x_body:
    523 
    524 	call	mul4x_internal
    525 
    526 	movq	40(%rsp),%rsi
    527 .cfi_def_cfa	%rsi,8
    528 	movq	$1,%rax
    529 
    530 	movq	-48(%rsi),%r15
    531 .cfi_restore	%r15
    532 	movq	-40(%rsi),%r14
    533 .cfi_restore	%r14
    534 	movq	-32(%rsi),%r13
    535 .cfi_restore	%r13
    536 	movq	-24(%rsi),%r12
    537 .cfi_restore	%r12
    538 	movq	-16(%rsi),%rbp
    539 .cfi_restore	%rbp
    540 	movq	-8(%rsi),%rbx
    541 .cfi_restore	%rbx
    542 	leaq	(%rsi),%rsp
    543 .cfi_def_cfa_register	%rsp
    544 .Lmul4x_epilogue:
    545 	.byte	0xf3,0xc3
    546 .cfi_endproc
    547 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    548 
    549 .type	mul4x_internal,@function
    550 .align	32
    551 mul4x_internal:
    552 	shlq	$5,%r9
    553 	movd	8(%rax),%xmm5
    554 	leaq	.Linc(%rip),%rax
    555 	leaq	128(%rdx,%r9,1),%r13
    556 	shrq	$5,%r9
    557 	movdqa	0(%rax),%xmm0
    558 	movdqa	16(%rax),%xmm1
    559 	leaq	88-112(%rsp,%r9,1),%r10
    560 	leaq	128(%rdx),%r12
    561 
    562 	pshufd	$0,%xmm5,%xmm5
    563 	movdqa	%xmm1,%xmm4
    564 .byte	0x67,0x67
    565 	movdqa	%xmm1,%xmm2
    566 	paddd	%xmm0,%xmm1
    567 	pcmpeqd	%xmm5,%xmm0
    568 .byte	0x67
    569 	movdqa	%xmm4,%xmm3
    570 	paddd	%xmm1,%xmm2
    571 	pcmpeqd	%xmm5,%xmm1
    572 	movdqa	%xmm0,112(%r10)
    573 	movdqa	%xmm4,%xmm0
    574 
    575 	paddd	%xmm2,%xmm3
    576 	pcmpeqd	%xmm5,%xmm2
    577 	movdqa	%xmm1,128(%r10)
    578 	movdqa	%xmm4,%xmm1
    579 
    580 	paddd	%xmm3,%xmm0
    581 	pcmpeqd	%xmm5,%xmm3
    582 	movdqa	%xmm2,144(%r10)
    583 	movdqa	%xmm4,%xmm2
    584 
    585 	paddd	%xmm0,%xmm1
    586 	pcmpeqd	%xmm5,%xmm0
    587 	movdqa	%xmm3,160(%r10)
    588 	movdqa	%xmm4,%xmm3
    589 	paddd	%xmm1,%xmm2
    590 	pcmpeqd	%xmm5,%xmm1
    591 	movdqa	%xmm0,176(%r10)
    592 	movdqa	%xmm4,%xmm0
    593 
    594 	paddd	%xmm2,%xmm3
    595 	pcmpeqd	%xmm5,%xmm2
    596 	movdqa	%xmm1,192(%r10)
    597 	movdqa	%xmm4,%xmm1
    598 
    599 	paddd	%xmm3,%xmm0
    600 	pcmpeqd	%xmm5,%xmm3
    601 	movdqa	%xmm2,208(%r10)
    602 	movdqa	%xmm4,%xmm2
    603 
    604 	paddd	%xmm0,%xmm1
    605 	pcmpeqd	%xmm5,%xmm0
    606 	movdqa	%xmm3,224(%r10)
    607 	movdqa	%xmm4,%xmm3
    608 	paddd	%xmm1,%xmm2
    609 	pcmpeqd	%xmm5,%xmm1
    610 	movdqa	%xmm0,240(%r10)
    611 	movdqa	%xmm4,%xmm0
    612 
    613 	paddd	%xmm2,%xmm3
    614 	pcmpeqd	%xmm5,%xmm2
    615 	movdqa	%xmm1,256(%r10)
    616 	movdqa	%xmm4,%xmm1
    617 
    618 	paddd	%xmm3,%xmm0
    619 	pcmpeqd	%xmm5,%xmm3
    620 	movdqa	%xmm2,272(%r10)
    621 	movdqa	%xmm4,%xmm2
    622 
    623 	paddd	%xmm0,%xmm1
    624 	pcmpeqd	%xmm5,%xmm0
    625 	movdqa	%xmm3,288(%r10)
    626 	movdqa	%xmm4,%xmm3
    627 	paddd	%xmm1,%xmm2
    628 	pcmpeqd	%xmm5,%xmm1
    629 	movdqa	%xmm0,304(%r10)
    630 
    631 	paddd	%xmm2,%xmm3
    632 .byte	0x67
    633 	pcmpeqd	%xmm5,%xmm2
    634 	movdqa	%xmm1,320(%r10)
    635 
    636 	pcmpeqd	%xmm5,%xmm3
    637 	movdqa	%xmm2,336(%r10)
    638 	pand	64(%r12),%xmm0
    639 
    640 	pand	80(%r12),%xmm1
    641 	pand	96(%r12),%xmm2
    642 	movdqa	%xmm3,352(%r10)
    643 	pand	112(%r12),%xmm3
    644 	por	%xmm2,%xmm0
    645 	por	%xmm3,%xmm1
    646 	movdqa	-128(%r12),%xmm4
    647 	movdqa	-112(%r12),%xmm5
    648 	movdqa	-96(%r12),%xmm2
    649 	pand	112(%r10),%xmm4
    650 	movdqa	-80(%r12),%xmm3
    651 	pand	128(%r10),%xmm5
    652 	por	%xmm4,%xmm0
    653 	pand	144(%r10),%xmm2
    654 	por	%xmm5,%xmm1
    655 	pand	160(%r10),%xmm3
    656 	por	%xmm2,%xmm0
    657 	por	%xmm3,%xmm1
    658 	movdqa	-64(%r12),%xmm4
    659 	movdqa	-48(%r12),%xmm5
    660 	movdqa	-32(%r12),%xmm2
    661 	pand	176(%r10),%xmm4
    662 	movdqa	-16(%r12),%xmm3
    663 	pand	192(%r10),%xmm5
    664 	por	%xmm4,%xmm0
    665 	pand	208(%r10),%xmm2
    666 	por	%xmm5,%xmm1
    667 	pand	224(%r10),%xmm3
    668 	por	%xmm2,%xmm0
    669 	por	%xmm3,%xmm1
    670 	movdqa	0(%r12),%xmm4
    671 	movdqa	16(%r12),%xmm5
    672 	movdqa	32(%r12),%xmm2
    673 	pand	240(%r10),%xmm4
    674 	movdqa	48(%r12),%xmm3
    675 	pand	256(%r10),%xmm5
    676 	por	%xmm4,%xmm0
    677 	pand	272(%r10),%xmm2
    678 	por	%xmm5,%xmm1
    679 	pand	288(%r10),%xmm3
    680 	por	%xmm2,%xmm0
    681 	por	%xmm3,%xmm1
    682 	por	%xmm1,%xmm0
    683 	pshufd	$0x4e,%xmm0,%xmm1
    684 	por	%xmm1,%xmm0
    685 	leaq	256(%r12),%r12
    686 .byte	102,72,15,126,195
    687 
    688 	movq	%r13,16+8(%rsp)
    689 	movq	%rdi,56+8(%rsp)
    690 
    691 	movq	(%r8),%r8
    692 	movq	(%rsi),%rax
    693 	leaq	(%rsi,%r9,1),%rsi
    694 	negq	%r9
    695 
    696 	movq	%r8,%rbp
    697 	mulq	%rbx
    698 	movq	%rax,%r10
    699 	movq	(%rcx),%rax
    700 
    701 	imulq	%r10,%rbp
    702 	leaq	64+8(%rsp),%r14
    703 	movq	%rdx,%r11
    704 
    705 	mulq	%rbp
    706 	addq	%rax,%r10
    707 	movq	8(%rsi,%r9,1),%rax
    708 	adcq	$0,%rdx
    709 	movq	%rdx,%rdi
    710 
    711 	mulq	%rbx
    712 	addq	%rax,%r11
    713 	movq	8(%rcx),%rax
    714 	adcq	$0,%rdx
    715 	movq	%rdx,%r10
    716 
    717 	mulq	%rbp
    718 	addq	%rax,%rdi
    719 	movq	16(%rsi,%r9,1),%rax
    720 	adcq	$0,%rdx
    721 	addq	%r11,%rdi
    722 	leaq	32(%r9),%r15
    723 	leaq	32(%rcx),%rcx
    724 	adcq	$0,%rdx
    725 	movq	%rdi,(%r14)
    726 	movq	%rdx,%r13
    727 	jmp	.L1st4x
    728 
    729 .align	32
    730 .L1st4x:
    731 	mulq	%rbx
    732 	addq	%rax,%r10
    733 	movq	-16(%rcx),%rax
    734 	leaq	32(%r14),%r14
    735 	adcq	$0,%rdx
    736 	movq	%rdx,%r11
    737 
    738 	mulq	%rbp
    739 	addq	%rax,%r13
    740 	movq	-8(%rsi,%r15,1),%rax
    741 	adcq	$0,%rdx
    742 	addq	%r10,%r13
    743 	adcq	$0,%rdx
    744 	movq	%r13,-24(%r14)
    745 	movq	%rdx,%rdi
    746 
    747 	mulq	%rbx
    748 	addq	%rax,%r11
    749 	movq	-8(%rcx),%rax
    750 	adcq	$0,%rdx
    751 	movq	%rdx,%r10
    752 
    753 	mulq	%rbp
    754 	addq	%rax,%rdi
    755 	movq	(%rsi,%r15,1),%rax
    756 	adcq	$0,%rdx
    757 	addq	%r11,%rdi
    758 	adcq	$0,%rdx
    759 	movq	%rdi,-16(%r14)
    760 	movq	%rdx,%r13
    761 
    762 	mulq	%rbx
    763 	addq	%rax,%r10
    764 	movq	0(%rcx),%rax
    765 	adcq	$0,%rdx
    766 	movq	%rdx,%r11
    767 
    768 	mulq	%rbp
    769 	addq	%rax,%r13
    770 	movq	8(%rsi,%r15,1),%rax
    771 	adcq	$0,%rdx
    772 	addq	%r10,%r13
    773 	adcq	$0,%rdx
    774 	movq	%r13,-8(%r14)
    775 	movq	%rdx,%rdi
    776 
    777 	mulq	%rbx
    778 	addq	%rax,%r11
    779 	movq	8(%rcx),%rax
    780 	adcq	$0,%rdx
    781 	movq	%rdx,%r10
    782 
    783 	mulq	%rbp
    784 	addq	%rax,%rdi
    785 	movq	16(%rsi,%r15,1),%rax
    786 	adcq	$0,%rdx
    787 	addq	%r11,%rdi
    788 	leaq	32(%rcx),%rcx
    789 	adcq	$0,%rdx
    790 	movq	%rdi,(%r14)
    791 	movq	%rdx,%r13
    792 
    793 	addq	$32,%r15
    794 	jnz	.L1st4x
    795 
    796 	mulq	%rbx
    797 	addq	%rax,%r10
    798 	movq	-16(%rcx),%rax
    799 	leaq	32(%r14),%r14
    800 	adcq	$0,%rdx
    801 	movq	%rdx,%r11
    802 
    803 	mulq	%rbp
    804 	addq	%rax,%r13
    805 	movq	-8(%rsi),%rax
    806 	adcq	$0,%rdx
    807 	addq	%r10,%r13
    808 	adcq	$0,%rdx
    809 	movq	%r13,-24(%r14)
    810 	movq	%rdx,%rdi
    811 
    812 	mulq	%rbx
    813 	addq	%rax,%r11
    814 	movq	-8(%rcx),%rax
    815 	adcq	$0,%rdx
    816 	movq	%rdx,%r10
    817 
    818 	mulq	%rbp
    819 	addq	%rax,%rdi
    820 	movq	(%rsi,%r9,1),%rax
    821 	adcq	$0,%rdx
    822 	addq	%r11,%rdi
    823 	adcq	$0,%rdx
    824 	movq	%rdi,-16(%r14)
    825 	movq	%rdx,%r13
    826 
    827 	leaq	(%rcx,%r9,1),%rcx
    828 
    829 	xorq	%rdi,%rdi
    830 	addq	%r10,%r13
    831 	adcq	$0,%rdi
    832 	movq	%r13,-8(%r14)
    833 
    834 	jmp	.Louter4x
    835 
    836 .align	32
    837 .Louter4x:
    838 	leaq	16+128(%r14),%rdx
    839 	pxor	%xmm4,%xmm4
    840 	pxor	%xmm5,%xmm5
    841 	movdqa	-128(%r12),%xmm0
    842 	movdqa	-112(%r12),%xmm1
    843 	movdqa	-96(%r12),%xmm2
    844 	movdqa	-80(%r12),%xmm3
    845 	pand	-128(%rdx),%xmm0
    846 	pand	-112(%rdx),%xmm1
    847 	por	%xmm0,%xmm4
    848 	pand	-96(%rdx),%xmm2
    849 	por	%xmm1,%xmm5
    850 	pand	-80(%rdx),%xmm3
    851 	por	%xmm2,%xmm4
    852 	por	%xmm3,%xmm5
    853 	movdqa	-64(%r12),%xmm0
    854 	movdqa	-48(%r12),%xmm1
    855 	movdqa	-32(%r12),%xmm2
    856 	movdqa	-16(%r12),%xmm3
    857 	pand	-64(%rdx),%xmm0
    858 	pand	-48(%rdx),%xmm1
    859 	por	%xmm0,%xmm4
    860 	pand	-32(%rdx),%xmm2
    861 	por	%xmm1,%xmm5
    862 	pand	-16(%rdx),%xmm3
    863 	por	%xmm2,%xmm4
    864 	por	%xmm3,%xmm5
    865 	movdqa	0(%r12),%xmm0
    866 	movdqa	16(%r12),%xmm1
    867 	movdqa	32(%r12),%xmm2
    868 	movdqa	48(%r12),%xmm3
    869 	pand	0(%rdx),%xmm0
    870 	pand	16(%rdx),%xmm1
    871 	por	%xmm0,%xmm4
    872 	pand	32(%rdx),%xmm2
    873 	por	%xmm1,%xmm5
    874 	pand	48(%rdx),%xmm3
    875 	por	%xmm2,%xmm4
    876 	por	%xmm3,%xmm5
    877 	movdqa	64(%r12),%xmm0
    878 	movdqa	80(%r12),%xmm1
    879 	movdqa	96(%r12),%xmm2
    880 	movdqa	112(%r12),%xmm3
    881 	pand	64(%rdx),%xmm0
    882 	pand	80(%rdx),%xmm1
    883 	por	%xmm0,%xmm4
    884 	pand	96(%rdx),%xmm2
    885 	por	%xmm1,%xmm5
    886 	pand	112(%rdx),%xmm3
    887 	por	%xmm2,%xmm4
    888 	por	%xmm3,%xmm5
    889 	por	%xmm5,%xmm4
    890 	pshufd	$0x4e,%xmm4,%xmm0
    891 	por	%xmm4,%xmm0
    892 	leaq	256(%r12),%r12
    893 .byte	102,72,15,126,195
    894 
    895 	movq	(%r14,%r9,1),%r10
    896 	movq	%r8,%rbp
    897 	mulq	%rbx
    898 	addq	%rax,%r10
    899 	movq	(%rcx),%rax
    900 	adcq	$0,%rdx
    901 
    902 	imulq	%r10,%rbp
    903 	movq	%rdx,%r11
    904 	movq	%rdi,(%r14)
    905 
    906 	leaq	(%r14,%r9,1),%r14
    907 
    908 	mulq	%rbp
    909 	addq	%rax,%r10
    910 	movq	8(%rsi,%r9,1),%rax
    911 	adcq	$0,%rdx
    912 	movq	%rdx,%rdi
    913 
    914 	mulq	%rbx
    915 	addq	%rax,%r11
    916 	movq	8(%rcx),%rax
    917 	adcq	$0,%rdx
    918 	addq	8(%r14),%r11
    919 	adcq	$0,%rdx
    920 	movq	%rdx,%r10
    921 
    922 	mulq	%rbp
    923 	addq	%rax,%rdi
    924 	movq	16(%rsi,%r9,1),%rax
    925 	adcq	$0,%rdx
    926 	addq	%r11,%rdi
    927 	leaq	32(%r9),%r15
    928 	leaq	32(%rcx),%rcx
    929 	adcq	$0,%rdx
    930 	movq	%rdx,%r13
    931 	jmp	.Linner4x
    932 
    933 .align	32
    934 .Linner4x:
    935 	mulq	%rbx
    936 	addq	%rax,%r10
    937 	movq	-16(%rcx),%rax
    938 	adcq	$0,%rdx
    939 	addq	16(%r14),%r10
    940 	leaq	32(%r14),%r14
    941 	adcq	$0,%rdx
    942 	movq	%rdx,%r11
    943 
    944 	mulq	%rbp
    945 	addq	%rax,%r13
    946 	movq	-8(%rsi,%r15,1),%rax
    947 	adcq	$0,%rdx
    948 	addq	%r10,%r13
    949 	adcq	$0,%rdx
    950 	movq	%rdi,-32(%r14)
    951 	movq	%rdx,%rdi
    952 
    953 	mulq	%rbx
    954 	addq	%rax,%r11
    955 	movq	-8(%rcx),%rax
    956 	adcq	$0,%rdx
    957 	addq	-8(%r14),%r11
    958 	adcq	$0,%rdx
    959 	movq	%rdx,%r10
    960 
    961 	mulq	%rbp
    962 	addq	%rax,%rdi
    963 	movq	(%rsi,%r15,1),%rax
    964 	adcq	$0,%rdx
    965 	addq	%r11,%rdi
    966 	adcq	$0,%rdx
    967 	movq	%r13,-24(%r14)
    968 	movq	%rdx,%r13
    969 
    970 	mulq	%rbx
    971 	addq	%rax,%r10
    972 	movq	0(%rcx),%rax
    973 	adcq	$0,%rdx
    974 	addq	(%r14),%r10
    975 	adcq	$0,%rdx
    976 	movq	%rdx,%r11
    977 
    978 	mulq	%rbp
    979 	addq	%rax,%r13
    980 	movq	8(%rsi,%r15,1),%rax
    981 	adcq	$0,%rdx
    982 	addq	%r10,%r13
    983 	adcq	$0,%rdx
    984 	movq	%rdi,-16(%r14)
    985 	movq	%rdx,%rdi
    986 
    987 	mulq	%rbx
    988 	addq	%rax,%r11
    989 	movq	8(%rcx),%rax
    990 	adcq	$0,%rdx
    991 	addq	8(%r14),%r11
    992 	adcq	$0,%rdx
    993 	movq	%rdx,%r10
    994 
    995 	mulq	%rbp
    996 	addq	%rax,%rdi
    997 	movq	16(%rsi,%r15,1),%rax
    998 	adcq	$0,%rdx
    999 	addq	%r11,%rdi
   1000 	leaq	32(%rcx),%rcx
   1001 	adcq	$0,%rdx
   1002 	movq	%r13,-8(%r14)
   1003 	movq	%rdx,%r13
   1004 
   1005 	addq	$32,%r15
   1006 	jnz	.Linner4x
   1007 
   1008 	mulq	%rbx
   1009 	addq	%rax,%r10
   1010 	movq	-16(%rcx),%rax
   1011 	adcq	$0,%rdx
   1012 	addq	16(%r14),%r10
   1013 	leaq	32(%r14),%r14
   1014 	adcq	$0,%rdx
   1015 	movq	%rdx,%r11
   1016 
   1017 	mulq	%rbp
   1018 	addq	%rax,%r13
   1019 	movq	-8(%rsi),%rax
   1020 	adcq	$0,%rdx
   1021 	addq	%r10,%r13
   1022 	adcq	$0,%rdx
   1023 	movq	%rdi,-32(%r14)
   1024 	movq	%rdx,%rdi
   1025 
   1026 	mulq	%rbx
   1027 	addq	%rax,%r11
   1028 	movq	%rbp,%rax
   1029 	movq	-8(%rcx),%rbp
   1030 	adcq	$0,%rdx
   1031 	addq	-8(%r14),%r11
   1032 	adcq	$0,%rdx
   1033 	movq	%rdx,%r10
   1034 
   1035 	mulq	%rbp
   1036 	addq	%rax,%rdi
   1037 	movq	(%rsi,%r9,1),%rax
   1038 	adcq	$0,%rdx
   1039 	addq	%r11,%rdi
   1040 	adcq	$0,%rdx
   1041 	movq	%r13,-24(%r14)
   1042 	movq	%rdx,%r13
   1043 
   1044 	movq	%rdi,-16(%r14)
   1045 	leaq	(%rcx,%r9,1),%rcx
   1046 
   1047 	xorq	%rdi,%rdi
   1048 	addq	%r10,%r13
   1049 	adcq	$0,%rdi
   1050 	addq	(%r14),%r13
   1051 	adcq	$0,%rdi
   1052 	movq	%r13,-8(%r14)
   1053 
   1054 	cmpq	16+8(%rsp),%r12
   1055 	jb	.Louter4x
   1056 	xorq	%rax,%rax
   1057 	subq	%r13,%rbp
   1058 	adcq	%r15,%r15
   1059 	orq	%r15,%rdi
   1060 	subq	%rdi,%rax
   1061 	leaq	(%r14,%r9,1),%rbx
   1062 	movq	(%rcx),%r12
   1063 	leaq	(%rcx),%rbp
   1064 	movq	%r9,%rcx
   1065 	sarq	$3+2,%rcx
   1066 	movq	56+8(%rsp),%rdi
   1067 	decq	%r12
   1068 	xorq	%r10,%r10
   1069 	movq	8(%rbp),%r13
   1070 	movq	16(%rbp),%r14
   1071 	movq	24(%rbp),%r15
   1072 	jmp	.Lsqr4x_sub_entry
   1073 .size	mul4x_internal,.-mul4x_internal
   1074 .globl	bn_power5
   1075 .hidden bn_power5
   1076 .type	bn_power5,@function
   1077 .align	32
   1078 bn_power5:
   1079 .cfi_startproc
   1080 	movq	%rsp,%rax
   1081 .cfi_def_cfa_register	%rax
   1082 	pushq	%rbx
   1083 .cfi_offset	%rbx,-16
   1084 	pushq	%rbp
   1085 .cfi_offset	%rbp,-24
   1086 	pushq	%r12
   1087 .cfi_offset	%r12,-32
   1088 	pushq	%r13
   1089 .cfi_offset	%r13,-40
   1090 	pushq	%r14
   1091 .cfi_offset	%r14,-48
   1092 	pushq	%r15
   1093 .cfi_offset	%r15,-56
   1094 .Lpower5_prologue:
   1095 
   1096 	shll	$3,%r9d
   1097 	leal	(%r9,%r9,2),%r10d
   1098 	negq	%r9
   1099 	movq	(%r8),%r8
   1100 
   1101 
   1102 
   1103 
   1104 
   1105 
   1106 
   1107 
   1108 	leaq	-320(%rsp,%r9,2),%r11
   1109 	movq	%rsp,%rbp
   1110 	subq	%rdi,%r11
   1111 	andq	$4095,%r11
   1112 	cmpq	%r11,%r10
   1113 	jb	.Lpwr_sp_alt
   1114 	subq	%r11,%rbp
   1115 	leaq	-320(%rbp,%r9,2),%rbp
   1116 	jmp	.Lpwr_sp_done
   1117 
   1118 .align	32
   1119 .Lpwr_sp_alt:
   1120 	leaq	4096-320(,%r9,2),%r10
   1121 	leaq	-320(%rbp,%r9,2),%rbp
   1122 	subq	%r10,%r11
   1123 	movq	$0,%r10
   1124 	cmovcq	%r10,%r11
   1125 	subq	%r11,%rbp
   1126 .Lpwr_sp_done:
   1127 	andq	$-64,%rbp
   1128 	movq	%rsp,%r11
   1129 	subq	%rbp,%r11
   1130 	andq	$-4096,%r11
   1131 	leaq	(%r11,%rbp,1),%rsp
   1132 	movq	(%rsp),%r10
   1133 	cmpq	%rbp,%rsp
   1134 	ja	.Lpwr_page_walk
   1135 	jmp	.Lpwr_page_walk_done
   1136 
   1137 .Lpwr_page_walk:
   1138 	leaq	-4096(%rsp),%rsp
   1139 	movq	(%rsp),%r10
   1140 	cmpq	%rbp,%rsp
   1141 	ja	.Lpwr_page_walk
   1142 .Lpwr_page_walk_done:
   1143 
   1144 	movq	%r9,%r10
   1145 	negq	%r9
   1146 
   1147 
   1148 
   1149 
   1150 
   1151 
   1152 
   1153 
   1154 
   1155 
   1156 	movq	%r8,32(%rsp)
   1157 	movq	%rax,40(%rsp)
   1158 .cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
   1159 .Lpower5_body:
   1160 .byte	102,72,15,110,207
   1161 .byte	102,72,15,110,209
   1162 .byte	102,73,15,110,218
   1163 .byte	102,72,15,110,226
   1164 
   1165 	call	__bn_sqr8x_internal
   1166 	call	__bn_post4x_internal
   1167 	call	__bn_sqr8x_internal
   1168 	call	__bn_post4x_internal
   1169 	call	__bn_sqr8x_internal
   1170 	call	__bn_post4x_internal
   1171 	call	__bn_sqr8x_internal
   1172 	call	__bn_post4x_internal
   1173 	call	__bn_sqr8x_internal
   1174 	call	__bn_post4x_internal
   1175 
   1176 .byte	102,72,15,126,209
   1177 .byte	102,72,15,126,226
   1178 	movq	%rsi,%rdi
   1179 	movq	40(%rsp),%rax
   1180 	leaq	32(%rsp),%r8
   1181 
   1182 	call	mul4x_internal
   1183 
   1184 	movq	40(%rsp),%rsi
   1185 .cfi_def_cfa	%rsi,8
   1186 	movq	$1,%rax
   1187 	movq	-48(%rsi),%r15
   1188 .cfi_restore	%r15
   1189 	movq	-40(%rsi),%r14
   1190 .cfi_restore	%r14
   1191 	movq	-32(%rsi),%r13
   1192 .cfi_restore	%r13
   1193 	movq	-24(%rsi),%r12
   1194 .cfi_restore	%r12
   1195 	movq	-16(%rsi),%rbp
   1196 .cfi_restore	%rbp
   1197 	movq	-8(%rsi),%rbx
   1198 .cfi_restore	%rbx
   1199 	leaq	(%rsi),%rsp
   1200 .cfi_def_cfa_register	%rsp
   1201 .Lpower5_epilogue:
   1202 	.byte	0xf3,0xc3
   1203 .cfi_endproc
   1204 .size	bn_power5,.-bn_power5
   1205 
   1206 .globl	bn_sqr8x_internal
   1207 .hidden bn_sqr8x_internal
   1208 .hidden	bn_sqr8x_internal
   1209 .type	bn_sqr8x_internal,@function
   1210 .align	32
   1211 bn_sqr8x_internal:
   1212 __bn_sqr8x_internal:
   1213 
   1214 
   1215 
   1216 
   1217 
   1218 
   1219 
   1220 
   1221 
   1222 
   1223 
   1224 
   1225 
   1226 
   1227 
   1228 
   1229 
   1230 
   1231 
   1232 
   1233 
   1234 
   1235 
   1236 
   1237 
   1238 
   1239 
   1240 
   1241 
   1242 
   1243 
   1244 
   1245 
   1246 
   1247 
   1248 
   1249 
   1250 
   1251 
   1252 
   1253 
   1254 
   1255 
   1256 
   1257 
   1258 
   1259 
   1260 
   1261 
   1262 
   1263 
   1264 
   1265 
   1266 
   1267 
   1268 
   1269 
   1270 
   1271 
   1272 
   1273 
   1274 
   1275 
   1276 
   1277 
   1278 
   1279 
   1280 
   1281 
   1282 
   1283 
   1284 
   1285 
   1286 	leaq	32(%r10),%rbp
   1287 	leaq	(%rsi,%r9,1),%rsi
   1288 
   1289 	movq	%r9,%rcx
   1290 
   1291 
   1292 	movq	-32(%rsi,%rbp,1),%r14
   1293 	leaq	48+8(%rsp,%r9,2),%rdi
   1294 	movq	-24(%rsi,%rbp,1),%rax
   1295 	leaq	-32(%rdi,%rbp,1),%rdi
   1296 	movq	-16(%rsi,%rbp,1),%rbx
   1297 	movq	%rax,%r15
   1298 
   1299 	mulq	%r14
   1300 	movq	%rax,%r10
   1301 	movq	%rbx,%rax
   1302 	movq	%rdx,%r11
   1303 	movq	%r10,-24(%rdi,%rbp,1)
   1304 
   1305 	mulq	%r14
   1306 	addq	%rax,%r11
   1307 	movq	%rbx,%rax
   1308 	adcq	$0,%rdx
   1309 	movq	%r11,-16(%rdi,%rbp,1)
   1310 	movq	%rdx,%r10
   1311 
   1312 
   1313 	movq	-8(%rsi,%rbp,1),%rbx
   1314 	mulq	%r15
   1315 	movq	%rax,%r12
   1316 	movq	%rbx,%rax
   1317 	movq	%rdx,%r13
   1318 
   1319 	leaq	(%rbp),%rcx
   1320 	mulq	%r14
   1321 	addq	%rax,%r10
   1322 	movq	%rbx,%rax
   1323 	movq	%rdx,%r11
   1324 	adcq	$0,%r11
   1325 	addq	%r12,%r10
   1326 	adcq	$0,%r11
   1327 	movq	%r10,-8(%rdi,%rcx,1)
   1328 	jmp	.Lsqr4x_1st
   1329 
   1330 .align	32
   1331 .Lsqr4x_1st:
   1332 	movq	(%rsi,%rcx,1),%rbx
   1333 	mulq	%r15
   1334 	addq	%rax,%r13
   1335 	movq	%rbx,%rax
   1336 	movq	%rdx,%r12
   1337 	adcq	$0,%r12
   1338 
   1339 	mulq	%r14
   1340 	addq	%rax,%r11
   1341 	movq	%rbx,%rax
   1342 	movq	8(%rsi,%rcx,1),%rbx
   1343 	movq	%rdx,%r10
   1344 	adcq	$0,%r10
   1345 	addq	%r13,%r11
   1346 	adcq	$0,%r10
   1347 
   1348 
   1349 	mulq	%r15
   1350 	addq	%rax,%r12
   1351 	movq	%rbx,%rax
   1352 	movq	%r11,(%rdi,%rcx,1)
   1353 	movq	%rdx,%r13
   1354 	adcq	$0,%r13
   1355 
   1356 	mulq	%r14
   1357 	addq	%rax,%r10
   1358 	movq	%rbx,%rax
   1359 	movq	16(%rsi,%rcx,1),%rbx
   1360 	movq	%rdx,%r11
   1361 	adcq	$0,%r11
   1362 	addq	%r12,%r10
   1363 	adcq	$0,%r11
   1364 
   1365 	mulq	%r15
   1366 	addq	%rax,%r13
   1367 	movq	%rbx,%rax
   1368 	movq	%r10,8(%rdi,%rcx,1)
   1369 	movq	%rdx,%r12
   1370 	adcq	$0,%r12
   1371 
   1372 	mulq	%r14
   1373 	addq	%rax,%r11
   1374 	movq	%rbx,%rax
   1375 	movq	24(%rsi,%rcx,1),%rbx
   1376 	movq	%rdx,%r10
   1377 	adcq	$0,%r10
   1378 	addq	%r13,%r11
   1379 	adcq	$0,%r10
   1380 
   1381 
   1382 	mulq	%r15
   1383 	addq	%rax,%r12
   1384 	movq	%rbx,%rax
   1385 	movq	%r11,16(%rdi,%rcx,1)
   1386 	movq	%rdx,%r13
   1387 	adcq	$0,%r13
   1388 	leaq	32(%rcx),%rcx
   1389 
   1390 	mulq	%r14
   1391 	addq	%rax,%r10
   1392 	movq	%rbx,%rax
   1393 	movq	%rdx,%r11
   1394 	adcq	$0,%r11
   1395 	addq	%r12,%r10
   1396 	adcq	$0,%r11
   1397 	movq	%r10,-8(%rdi,%rcx,1)
   1398 
   1399 	cmpq	$0,%rcx
   1400 	jne	.Lsqr4x_1st
   1401 
   1402 	mulq	%r15
   1403 	addq	%rax,%r13
   1404 	leaq	16(%rbp),%rbp
   1405 	adcq	$0,%rdx
   1406 	addq	%r11,%r13
   1407 	adcq	$0,%rdx
   1408 
   1409 	movq	%r13,(%rdi)
   1410 	movq	%rdx,%r12
   1411 	movq	%rdx,8(%rdi)
   1412 	jmp	.Lsqr4x_outer
   1413 
   1414 .align	32
   1415 .Lsqr4x_outer:
   1416 	movq	-32(%rsi,%rbp,1),%r14
   1417 	leaq	48+8(%rsp,%r9,2),%rdi
   1418 	movq	-24(%rsi,%rbp,1),%rax
   1419 	leaq	-32(%rdi,%rbp,1),%rdi
   1420 	movq	-16(%rsi,%rbp,1),%rbx
   1421 	movq	%rax,%r15
   1422 
   1423 	mulq	%r14
   1424 	movq	-24(%rdi,%rbp,1),%r10
   1425 	addq	%rax,%r10
   1426 	movq	%rbx,%rax
   1427 	adcq	$0,%rdx
   1428 	movq	%r10,-24(%rdi,%rbp,1)
   1429 	movq	%rdx,%r11
   1430 
   1431 	mulq	%r14
   1432 	addq	%rax,%r11
   1433 	movq	%rbx,%rax
   1434 	adcq	$0,%rdx
   1435 	addq	-16(%rdi,%rbp,1),%r11
   1436 	movq	%rdx,%r10
   1437 	adcq	$0,%r10
   1438 	movq	%r11,-16(%rdi,%rbp,1)
   1439 
   1440 	xorq	%r12,%r12
   1441 
   1442 	movq	-8(%rsi,%rbp,1),%rbx
   1443 	mulq	%r15
   1444 	addq	%rax,%r12
   1445 	movq	%rbx,%rax
   1446 	adcq	$0,%rdx
   1447 	addq	-8(%rdi,%rbp,1),%r12
   1448 	movq	%rdx,%r13
   1449 	adcq	$0,%r13
   1450 
   1451 	mulq	%r14
   1452 	addq	%rax,%r10
   1453 	movq	%rbx,%rax
   1454 	adcq	$0,%rdx
   1455 	addq	%r12,%r10
   1456 	movq	%rdx,%r11
   1457 	adcq	$0,%r11
   1458 	movq	%r10,-8(%rdi,%rbp,1)
   1459 
   1460 	leaq	(%rbp),%rcx
   1461 	jmp	.Lsqr4x_inner
   1462 
   1463 .align	32
   1464 .Lsqr4x_inner:
   1465 	movq	(%rsi,%rcx,1),%rbx
   1466 	mulq	%r15
   1467 	addq	%rax,%r13
   1468 	movq	%rbx,%rax
   1469 	movq	%rdx,%r12
   1470 	adcq	$0,%r12
   1471 	addq	(%rdi,%rcx,1),%r13
   1472 	adcq	$0,%r12
   1473 
   1474 .byte	0x67
   1475 	mulq	%r14
   1476 	addq	%rax,%r11
   1477 	movq	%rbx,%rax
   1478 	movq	8(%rsi,%rcx,1),%rbx
   1479 	movq	%rdx,%r10
   1480 	adcq	$0,%r10
   1481 	addq	%r13,%r11
   1482 	adcq	$0,%r10
   1483 
   1484 	mulq	%r15
   1485 	addq	%rax,%r12
   1486 	movq	%r11,(%rdi,%rcx,1)
   1487 	movq	%rbx,%rax
   1488 	movq	%rdx,%r13
   1489 	adcq	$0,%r13
   1490 	addq	8(%rdi,%rcx,1),%r12
   1491 	leaq	16(%rcx),%rcx
   1492 	adcq	$0,%r13
   1493 
   1494 	mulq	%r14
   1495 	addq	%rax,%r10
   1496 	movq	%rbx,%rax
   1497 	adcq	$0,%rdx
   1498 	addq	%r12,%r10
   1499 	movq	%rdx,%r11
   1500 	adcq	$0,%r11
   1501 	movq	%r10,-8(%rdi,%rcx,1)
   1502 
   1503 	cmpq	$0,%rcx
   1504 	jne	.Lsqr4x_inner
   1505 
   1506 .byte	0x67
   1507 	mulq	%r15
   1508 	addq	%rax,%r13
   1509 	adcq	$0,%rdx
   1510 	addq	%r11,%r13
   1511 	adcq	$0,%rdx
   1512 
   1513 	movq	%r13,(%rdi)
   1514 	movq	%rdx,%r12
   1515 	movq	%rdx,8(%rdi)
   1516 
   1517 	addq	$16,%rbp
   1518 	jnz	.Lsqr4x_outer
   1519 
   1520 
   1521 	movq	-32(%rsi),%r14
   1522 	leaq	48+8(%rsp,%r9,2),%rdi
   1523 	movq	-24(%rsi),%rax
   1524 	leaq	-32(%rdi,%rbp,1),%rdi
   1525 	movq	-16(%rsi),%rbx
   1526 	movq	%rax,%r15
   1527 
   1528 	mulq	%r14
   1529 	addq	%rax,%r10
   1530 	movq	%rbx,%rax
   1531 	movq	%rdx,%r11
   1532 	adcq	$0,%r11
   1533 
   1534 	mulq	%r14
   1535 	addq	%rax,%r11
   1536 	movq	%rbx,%rax
   1537 	movq	%r10,-24(%rdi)
   1538 	movq	%rdx,%r10
   1539 	adcq	$0,%r10
   1540 	addq	%r13,%r11
   1541 	movq	-8(%rsi),%rbx
   1542 	adcq	$0,%r10
   1543 
   1544 	mulq	%r15
   1545 	addq	%rax,%r12
   1546 	movq	%rbx,%rax
   1547 	movq	%r11,-16(%rdi)
   1548 	movq	%rdx,%r13
   1549 	adcq	$0,%r13
   1550 
   1551 	mulq	%r14
   1552 	addq	%rax,%r10
   1553 	movq	%rbx,%rax
   1554 	movq	%rdx,%r11
   1555 	adcq	$0,%r11
   1556 	addq	%r12,%r10
   1557 	adcq	$0,%r11
   1558 	movq	%r10,-8(%rdi)
   1559 
   1560 	mulq	%r15
   1561 	addq	%rax,%r13
   1562 	movq	-16(%rsi),%rax
   1563 	adcq	$0,%rdx
   1564 	addq	%r11,%r13
   1565 	adcq	$0,%rdx
   1566 
   1567 	movq	%r13,(%rdi)
   1568 	movq	%rdx,%r12
   1569 	movq	%rdx,8(%rdi)
   1570 
   1571 	mulq	%rbx
   1572 	addq	$16,%rbp
   1573 	xorq	%r14,%r14
   1574 	subq	%r9,%rbp
   1575 	xorq	%r15,%r15
   1576 
   1577 	addq	%r12,%rax
   1578 	adcq	$0,%rdx
   1579 	movq	%rax,8(%rdi)
   1580 	movq	%rdx,16(%rdi)
   1581 	movq	%r15,24(%rdi)
   1582 
   1583 	movq	-16(%rsi,%rbp,1),%rax
   1584 	leaq	48+8(%rsp),%rdi
   1585 	xorq	%r10,%r10
   1586 	movq	8(%rdi),%r11
   1587 
   1588 	leaq	(%r14,%r10,2),%r12
   1589 	shrq	$63,%r10
   1590 	leaq	(%rcx,%r11,2),%r13
   1591 	shrq	$63,%r11
   1592 	orq	%r10,%r13
   1593 	movq	16(%rdi),%r10
   1594 	movq	%r11,%r14
   1595 	mulq	%rax
   1596 	negq	%r15
   1597 	movq	24(%rdi),%r11
   1598 	adcq	%rax,%r12
   1599 	movq	-8(%rsi,%rbp,1),%rax
   1600 	movq	%r12,(%rdi)
   1601 	adcq	%rdx,%r13
   1602 
   1603 	leaq	(%r14,%r10,2),%rbx
   1604 	movq	%r13,8(%rdi)
   1605 	sbbq	%r15,%r15
   1606 	shrq	$63,%r10
   1607 	leaq	(%rcx,%r11,2),%r8
   1608 	shrq	$63,%r11
   1609 	orq	%r10,%r8
   1610 	movq	32(%rdi),%r10
   1611 	movq	%r11,%r14
   1612 	mulq	%rax
   1613 	negq	%r15
   1614 	movq	40(%rdi),%r11
   1615 	adcq	%rax,%rbx
   1616 	movq	0(%rsi,%rbp,1),%rax
   1617 	movq	%rbx,16(%rdi)
   1618 	adcq	%rdx,%r8
   1619 	leaq	16(%rbp),%rbp
   1620 	movq	%r8,24(%rdi)
   1621 	sbbq	%r15,%r15
   1622 	leaq	64(%rdi),%rdi
   1623 	jmp	.Lsqr4x_shift_n_add
   1624 
   1625 .align	32
   1626 .Lsqr4x_shift_n_add:
   1627 	leaq	(%r14,%r10,2),%r12
   1628 	shrq	$63,%r10
   1629 	leaq	(%rcx,%r11,2),%r13
   1630 	shrq	$63,%r11
   1631 	orq	%r10,%r13
   1632 	movq	-16(%rdi),%r10
   1633 	movq	%r11,%r14
   1634 	mulq	%rax
   1635 	negq	%r15
   1636 	movq	-8(%rdi),%r11
   1637 	adcq	%rax,%r12
   1638 	movq	-8(%rsi,%rbp,1),%rax
   1639 	movq	%r12,-32(%rdi)
   1640 	adcq	%rdx,%r13
   1641 
   1642 	leaq	(%r14,%r10,2),%rbx
   1643 	movq	%r13,-24(%rdi)
   1644 	sbbq	%r15,%r15
   1645 	shrq	$63,%r10
   1646 	leaq	(%rcx,%r11,2),%r8
   1647 	shrq	$63,%r11
   1648 	orq	%r10,%r8
   1649 	movq	0(%rdi),%r10
   1650 	movq	%r11,%r14
   1651 	mulq	%rax
   1652 	negq	%r15
   1653 	movq	8(%rdi),%r11
   1654 	adcq	%rax,%rbx
   1655 	movq	0(%rsi,%rbp,1),%rax
   1656 	movq	%rbx,-16(%rdi)
   1657 	adcq	%rdx,%r8
   1658 
   1659 	leaq	(%r14,%r10,2),%r12
   1660 	movq	%r8,-8(%rdi)
   1661 	sbbq	%r15,%r15
   1662 	shrq	$63,%r10
   1663 	leaq	(%rcx,%r11,2),%r13
   1664 	shrq	$63,%r11
   1665 	orq	%r10,%r13
   1666 	movq	16(%rdi),%r10
   1667 	movq	%r11,%r14
   1668 	mulq	%rax
   1669 	negq	%r15
   1670 	movq	24(%rdi),%r11
   1671 	adcq	%rax,%r12
   1672 	movq	8(%rsi,%rbp,1),%rax
   1673 	movq	%r12,0(%rdi)
   1674 	adcq	%rdx,%r13
   1675 
   1676 	leaq	(%r14,%r10,2),%rbx
   1677 	movq	%r13,8(%rdi)
   1678 	sbbq	%r15,%r15
   1679 	shrq	$63,%r10
   1680 	leaq	(%rcx,%r11,2),%r8
   1681 	shrq	$63,%r11
   1682 	orq	%r10,%r8
   1683 	movq	32(%rdi),%r10
   1684 	movq	%r11,%r14
   1685 	mulq	%rax
   1686 	negq	%r15
   1687 	movq	40(%rdi),%r11
   1688 	adcq	%rax,%rbx
   1689 	movq	16(%rsi,%rbp,1),%rax
   1690 	movq	%rbx,16(%rdi)
   1691 	adcq	%rdx,%r8
   1692 	movq	%r8,24(%rdi)
   1693 	sbbq	%r15,%r15
   1694 	leaq	64(%rdi),%rdi
   1695 	addq	$32,%rbp
   1696 	jnz	.Lsqr4x_shift_n_add
   1697 
   1698 	leaq	(%r14,%r10,2),%r12
   1699 .byte	0x67
   1700 	shrq	$63,%r10
   1701 	leaq	(%rcx,%r11,2),%r13
   1702 	shrq	$63,%r11
   1703 	orq	%r10,%r13
   1704 	movq	-16(%rdi),%r10
   1705 	movq	%r11,%r14
   1706 	mulq	%rax
   1707 	negq	%r15
   1708 	movq	-8(%rdi),%r11
   1709 	adcq	%rax,%r12
   1710 	movq	-8(%rsi),%rax
   1711 	movq	%r12,-32(%rdi)
   1712 	adcq	%rdx,%r13
   1713 
   1714 	leaq	(%r14,%r10,2),%rbx
   1715 	movq	%r13,-24(%rdi)
   1716 	sbbq	%r15,%r15
   1717 	shrq	$63,%r10
   1718 	leaq	(%rcx,%r11,2),%r8
   1719 	shrq	$63,%r11
   1720 	orq	%r10,%r8
   1721 	mulq	%rax
   1722 	negq	%r15
   1723 	adcq	%rax,%rbx
   1724 	adcq	%rdx,%r8
   1725 	movq	%rbx,-16(%rdi)
   1726 	movq	%r8,-8(%rdi)
   1727 .byte	102,72,15,126,213
# __bn_sqr8x_reduction — Montgomery reduction of the double-width result
# left in the temporary area t[] by the preceding squaring code.
#
# Reached by fall-through from the squaring tail (the raw-byte
# `movq %xmm2,%rbp` on the previous line restores the modulus pointer)
# or via `call` from bn_from_mont8x.
#
# NOTE(review): register/frame roles below are inferred from the visible
# loads/stores — confirm against the perlasm source:
#   rbp       = n[] (modulus), advanced 8 words per outer iteration
#   r9        = num in bytes on entry (negated below; then reused as an
#               accumulator inside the loops)
#   32+8(%rsp)= n0 (Montgomery constant -n[0]^-1 mod 2^64)
#   0+8(%rsp) = end-of-modulus sentinel, 8+8(%rsp) = end-of-t[] sentinel
#   rax       = running top-word carry
__bn_sqr8x_reduction:
	xorq	%rax,%rax                          # top carry = 0
	leaq	(%r9,%rbp,1),%rcx                  # sentinel: &n[num]
	leaq	48+8(%rsp,%r9,2),%rdx              # sentinel: end of t[] (2*num)
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi              # rdi = middle of t[]
	movq	%rdx,8+8(%rsp)
	negq	%r9                                # r9 = -num (bytes)
	jmp	.L8x_reduction_loop

.align	32
# Outer loop: reduce one 8-word window of t[] per pass.
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi                  # rewind to start of window
.byte	0x66
	movq	0(%rdi),%rbx                       # load 8 words of t[]
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)                        # stash previous top carry
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx                    # m0 = t[0]*n0 mod 2^64
	movq	0(%rbp),%rax
	movl	$8,%ecx                            # 8 inner iterations
	jmp	.L8x_reduce

.align	32
# Inner loop: fold m*n[0..7] into r8..r15; each m is saved on the stack
# (48-8+8(%rsp,%rcx,8)) for reuse by the .L8x_tail pass.
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8                                # low product cancels t word;
	movq	%rdx,%r8                           # negq materializes the carry
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)           # save m for the tail pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi                    # rsi = n0
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi                           # next m = r8*n0 (overlapped)
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx                          # rotate in the next m
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp                      # advance modulus pointer
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx                     # rdx = end-of-t[] sentinel
	cmpq	0+8(%rsp),%rbp                     # past end of modulus?
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8                        # absorb next 8 words of t[]
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi                          # rsi = -carry (mask)

	movq	48+56+8(%rsp),%rbx                 # reload first saved m
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
# Tail: replay the 8 saved m values against the remaining modulus words,
# storing the now-final low words of t[] as we go.
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)                         # commit finished word
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx          # next saved m
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp                     # modulus exhausted?
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi                               # restore saved carry into CF
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8                        # absorb next t[] window
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi                          # re-save carry

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8                         # add back stashed top carry
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax                            # collect overflow into rax

	negq	%rsi                               # restore saved carry into CF
.L8x_no_tail:
	adcq	0(%rdi),%r8                        # fold in upper half of t[]
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax                            # rax = top-most carry
	movq	-8(%rbp),%rcx                      # NOTE(review): presumably n[num-1] — confirm
	xorq	%rsi,%rsi

.byte	102,72,15,126,213                          # movq %xmm2,%rbp (restore &n[0])

	movq	%r8,0(%rdi)                        # store reduced window
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217                          # movq %xmm3,%r9 (restore num — TODO confirm)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi                          # more windows to reduce?
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3                          # repz ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
# __bn_post4x_internal — constant-time final step after reduction:
# conditionally subtracts the modulus (when the top carry in rax is set)
# while copying the result to rp, without any secret-dependent branch.
# Technique: mask = -carry; result = t[i] + (~n[i] & mask) + borrow,
# which computes t[i] - n[i] when mask is all-ones and t[i] + 0 otherwise.
# In (inferred from the visible code — confirm against perlasm source):
#   rbp = n[], r9 = num in bytes, rax = carry from the reduction,
#   xmm1 = rp (restored twice below into rdi and rsi via raw-byte movq)
# Clobbers r10-r15, rbx, rcx; ends with r9 re-negated.
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx                  # rbx = end of t[] = source
	movq	%r9,%rcx
.byte	102,72,15,126,207                          # movq %xmm1,%rdi (rp)
	negq	%rax                               # CF/mask source: rax = -carry
.byte	102,72,15,126,206                          # movq %xmm1,%rsi (rp again)
	sarq	$3+2,%rcx                          # rcx = -num/32: 4-limb group count
	decq	%r12                               # so notq below yields -n[0]-? no:
	                                           # ~(n[0]-1) == -n[0] for the adc chain
	xorq	%r10,%r10                          # r10 = borrow accumulator (0)
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
# Loop body: load next 4 modulus limbs, complement+mask them, add with
# borrow to the corresponding t[] limbs, store 4 result limbs.
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12                          # rax = all-ones iff subtracting
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10                               # restore borrow into CF
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10                          # save borrow as 0/-1
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx                               # count groups up toward 0
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9                                # restore r9 sign for caller
	.byte	0xf3,0xc3                          # repz ret
.size	__bn_post4x_internal,.-__bn_post4x_internal
# bn_from_montgomery — public entry: convert out of Montgomery form.
# Only handles num divisible by 8 (tail-jumps to bn_from_mont8x);
# otherwise returns 0 (eax=0) to tell the C caller it must use the
# generic path.
.globl	bn_from_montgomery
.hidden bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d                            # num % 8 == 0 ?
	jz	bn_from_mont8x                     # yes: take the 8x path
	xorl	%eax,%eax                          # no: return 0 (unsupported)
	.byte	0xf3,0xc3                          # repz ret
.size	bn_from_montgomery,.-bn_from_montgomery
   2052 
# bn_from_mont8x — from-Montgomery conversion for num % 8 == 0.
# SysV args (inferred from the generic mont5 convention — confirm):
#   rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9=num (words; scaled to
#   bytes below).
# Strategy: copy ap into a scratch t[] whose upper half is zeroed
# (multiply-by-1), then run __bn_sqr8x_reduction + __bn_post4x_internal
# to divide by R, and finally wipe the scratch area before returning 1.
# Stack frame is 2*num+... bytes, placed to avoid 4K aliasing with rp,
# and grown one page at a time (page-walk) so guard pages are touched.
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax                          # rax = caller rsp (CFA base)
.cfi_def_cfa_register	%rax
	pushq	%rbx                               # save all callee-saved regs
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d                            # num: words -> bytes
	leaq	(%r9,%r9,2),%r10                   # r10 = 3*num
	negq	%r9
	movq	(%r8),%r8                          # r8 = n0 value

	# Choose a frame base ~2*num+320 below rsp, adjusted so that it
	# does not share 4K pages with rp (cache-timing hygiene).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp                          # 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10                        # probe current page
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp                   # touch each page downward
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9                                # r9 = +num (bytes)

	movq	%r8,32(%rsp)                       # frame: n0 at 32, saved rsp at 40
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax                      # rax = t[]
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
# Copy 64 bytes of ap per pass into the low half of t[] while zeroing
# the corresponding 64 bytes of the high half (t[i+num] = 0).
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)                 # zero high half
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00         # leaq 64(%rsi),%rsi
	movdqa	%xmm1,(%rax)                       # copy low half
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

	# Stash pointers in xmm regs for the callees, which restore them
	# via raw-byte movq (see __bn_sqr8x_reduction/__bn_post4x_internal).
.byte	102,72,15,110,207                          # movq %rdi,%xmm1 (rp)
.byte	102,72,15,110,209                          # movq %rcx,%xmm2 (np)
.byte	0x67
	movq	%rcx,%rbp                          # rbp = np for the reduction
.byte	102,73,15,110,218                          # movq %r10,%xmm3
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
# Wipe the scratch buffer (64 bytes/pass) so no intermediate secret
# material is left on the stack, then restore registers and return 1.
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi                      # rsi = saved caller rsp
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax                            # return 1 (success)
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp                        # unwind the frame
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3                          # repz ret
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
# bn_scatter5 — store num qwords from in[] into a strided table.
# void bn_scatter5(const BN_ULONG *in /*rdi*/, size_t num /*rsi*/,
#                  void *table /*rdx*/, size_t power /*rcx*/);
# Writes in[i] to table + power*8 + i*256, i.e. column `power` of a
# 32-column/256-byte-stride layout (the layout bn_gather5 reads back).
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi                            # num == 0: nothing to do
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx                 # rdx = &table[power]
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx                     # next row: stride 256 bytes
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3                          # repz ret
.size	bn_scatter5,.-bn_scatter5
   2223 
# bn_gather5 — constant-time gather of one column from the strided table
# written by bn_scatter5.
# void bn_gather5(BN_ULONG *out /*rdi*/, size_t num /*rsi*/,
#                 const void *table /*rdx*/, size_t power /*rcx*/);
# Instead of indexing by `power` (which would leak it via cache timing),
# this builds 16 xmm compare masks — lane i is all-ones iff i == power —
# then reads ALL 16 table slots per row, ANDs each with its mask and ORs
# the results together, so memory access pattern is independent of power.
.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24                        # leaq (%rsp),%r10  (save rsp)
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00         # subq $0x108,%rsp  (mask area)
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp                          # 16-align for movdqa stores

	movd	%ecx,%xmm5                         # xmm5 = power, broadcast below
	movdqa	0(%rax),%xmm0                      # {0,0,1,1} lane counters
	movdqa	16(%rax),%xmm1                     # {2,2,2,2} increment
	leaq	128(%rdx),%r11                     # bias table/mask pointers so
	leaq	128(%rsp),%rax                     # offsets fit in disp8

	pshufd	$0,%xmm5,%xmm5                     # splat power to all lanes
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0                        # mask for indices {0,1}
	movdqa	%xmm4,%xmm3

	# Unrolled mask generation: compare power against indices 0..15,
	# two indices per xmm, storing 16 masks at -128(%rax)..112(%rax).
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
# Per output word: read all 256 bytes of the row, mask-select the one
# matching `power`, fold with por, and emit a single qword.
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4                        # xmm4 = selected 128-bit slot
	leaq	256(%r11),%r11                     # next table row
	pshufd	$0x4e,%xmm4,%xmm0                  # fold high qword into low
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp                        # restore original rsp
	.byte	0xf3,0xc3                          # repz ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
# .Linc — SSE lane-counter seed for the mask generation in bn_gather5
# (and the gather code near the top of the file): first 16 bytes are the
# starting indices {0,0,1,1} (two 32-bit lanes per index), next 16 bytes
# the per-step increment {2,2,2,2}.
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII banner: "Montgomery Multiplication with scatter/gather for
# x86_64, CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated).
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   2393 #endif
   2394