# x86_64-mont5: Montgomery multiplication with constant-time gather
# (machine-generated perlasm output; do not edit by hand — regenerate from
# the x86_64-mont5.pl source in the bn/asm directory).  File lives in the
# fipsmodule tree.
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
      3 
      4 
      5 
# -----------------------------------------------------------------------------
# bn_mul_mont_gather5(rp, ap, table, np, n0, num, power)
#
# Montgomery multiplication where the multiplicand b is fetched from a
# pre-computed table with a constant-time (cache-timing-safe) gather: every
# table cache line is touched and the wanted entry is selected with SSE2
# compare/AND/OR masks instead of an index load.
#
# ABI: SysV AMD64 (Mach-O symbol decoration).
#   %rdi = rp (result), %rsi = ap, %rdx = table of b values, %rcx = np
#   (modulus), %r8 = &n0 (Montgomery n0'), %r9d = num (words),
#   8(%rsp) = power — 7th argument, the table index; read before any push.
# NOTE(review): argument roles match the x86_64-mont5.pl perlasm source —
# confirm against the C prototype in the project's bn header.
#
# Dispatch: num % 8 == 0 takes the 4x-unrolled path (L$mul4x_enter), else
# the generic 1-word-at-a-time path below.
# -----------------------------------------------------------------------------
.globl	_bn_mul_mont_gather5
.private_extern _bn_mul_mont_gather5

.p2align	6
_bn_mul_mont_gather5:

	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# %rax = original %rsp, restored in epilogue

	testl	$7,%r9d
	jnz	L$mul_enter		# num not a multiple of 8: generic path
	jmp	L$mul4x_enter		# else tail into the 4x-unrolled variant

.p2align	4
L$mul_enter:
	movd	8(%rsp),%xmm5		# xmm5 = power (7th arg), grabbed pre-push
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	# Carve out num*8 + 280 bytes of scratch below %rsp, 1KB-aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe the new page
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

# Touch the stack one page at a time so guard pages are grown in order
# (stack-probe; avoids skipping the OS guard page on large num).
L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

	leaq	L$inc(%rip),%r10	# {0,1,2,3} increment constants for masks
	movq	%rax,8(%rsp,%r9,8)	# stash original %rsp above the tp[] area

L$mul_body:

	# Build 16 per-entry compare masks: mask[i] = (i == power) ? ~0 : 0.
	# They are stored at 112..352(%r10) and reused by every outer iteration.
	leaq	128(%rdx),%r12		# %r12 = &table[128], bias for ±128 disps
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5		# broadcast power to all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67				# addr-size prefix: alignment padding only
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# First gather of b[0]: AND every table line with its mask, OR together.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	# fold high qword onto low
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12		# advance to next table column
.byte	102,72,15,126,195		# movq %xmm0,%rbx — %rbx = gathered b[0]

	movq	(%r8),%r8		# %r8 = n0 value
	movq	(%rsi),%rax		# %rax = ap[0]

	xorq	%r14,%r14		# i = 0 (outer counter)
	xorq	%r15,%r15		# j = 0 (inner counter)

	# First outer iteration: tp[] = ap[]*b[0] + m*np[], m = tp[0]*n0.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = lo(ap[0]*b0) * n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		# discard tp[0] (becomes zero by design)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx			# ap[j] * b0
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	L$1st

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# top carry word tp[num]

	leaq	1(%r14),%r14
	jmp	L$outer
.p2align	4
# Outer loop: for i = 1..num-1, gather b[i] and accumulate into tp[].
L$outer:
	leaq	24+128(%rsp,%r9,8),%rdx	# %rdx = mask area (same masks as above)
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195		# movq %xmm0,%rbx — %rbx = gathered b[i]

	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# %r10 = tp[0]

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo(tp[0]+ap[0]*bi) * n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx			# ap[j] * bi
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10		# + tp[j]
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10	# previous top carry
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# new top carry tp[num]

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	# Final reduction: rp[] = tp[] - np[]; then constant-time select.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi		# %rsi = tp
	movq	%r9,%r15
	jmp	L$sub
.p2align	4
L$sub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[j] = tp[j] - np[j] - borrow
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	sbbq	$0,%rax			# %rax = 0 or all-ones borrow mask
	xorq	%r14,%r14
	andq	%rax,%rsi		# keep tp if subtraction borrowed...
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# ...else keep the difference in rp
	movq	%r9,%r15
	orq	%rcx,%rsi		# %rsi = branch-free select of source

.p2align	4
L$copy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# scrub tp[j] (overwrite secret scratch)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	# Epilogue: recover saved %rsp and callee-saved registers.
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul_epilogue:
	.byte	0xf3,0xc3		# rep ret (AMD branch-predictor idiom)
    443 
    444 
    445 
# -----------------------------------------------------------------------------
# bn_mul4x_mont_gather5 — 4x-unrolled front end for bn_mul_mont_gather5.
# Reached by the `jmp L$mul4x_enter` in _bn_mul_mont_gather5 when num % 8 == 0
# (same register arguments).  Allocates an aligned scratch frame whose
# placement is perturbed by the output pointer (%rdi) to de-correlate cache
# line aliasing, walks new stack pages in order, then calls mul4x_internal
# to do the actual multiplication.
# -----------------------------------------------------------------------------
.p2align	5
bn_mul4x_mont_gather5:

.byte	0x67				# addr-size prefix: alignment padding only
	movq	%rsp,%rax		# original %rsp, restored in epilogue

L$mul4x_enter:
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$mul4x_prologue:

.byte	0x67
	shll	$3,%r9d			# num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num bytes
	negq	%r9			# %r9 = -num*8

	# Choose frame base %rbp so its 4KB offset differs from %rdi's
	# (avoids cache-bank/alias conflicts between tp[] and rp[]).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$mul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp	# frame = 2*num + 320 bytes
	jmp	L$mul4xsp_done

.p2align	5
L$mul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$mul4xsp_done:
	andq	$-64,%rbp		# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

# Stack probe: grow guard pages one page at a time.
L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

	negq	%r9			# %r9 = +num*8 again

	movq	%rax,40(%rsp)		# save original %rsp for epilogue

L$mul4x_body:

	call	mul4x_internal		# does the multiply; %rax still = old rsp

	movq	40(%rsp),%rsi

	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
    545 
    546 
    547 
    548 
# -----------------------------------------------------------------------------
# mul4x_internal — 4x-unrolled Montgomery multiply core (local helper).
# Called by bn_mul4x_mont_gather5 and _bn_power5 after the frame is set up.
#
# In:  %rdi = rp, %rsi = ap, %rdx = b table, %rcx = np, %r8 = &n0,
#      %r9 = num*8 (bytes, positive), %rax = caller's original %rsp so that
#      8(%rax) is the 7th C argument "power" (the gather index).
# Uses the same SSE2 constant-time mask/gather scheme as the 1x path; the
# accumulator tp[] lives at 64+8(%rsp).  Falls through via
# `jmp L$sqr4x_sub_entry` (defined later in this file) for the final
# subtract-and-select.
# -----------------------------------------------------------------------------
.p2align	5
mul4x_internal:
	shlq	$5,%r9			# temporarily scale for table-end calc
	movd	8(%rax),%xmm5		# xmm5 = power
	leaq	L$inc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13	# %r13 = end of b table
	shrq	$5,%r9			# restore %r9 = num*8
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10	# mask area
	leaq	128(%rdx),%r12		# %r12 = &table[128] (bias for ±128)

	# Build the 16 equality masks for "power" (identical to the 1x path).
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67			# alignment padding prefixes
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# Constant-time gather of b[0]: touch all lines, mask, OR-reduce.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		# movq %xmm0,%rbx — %rbx = b[0]

	movq	%r13,16+8(%rsp)		# save table end (outer-loop bound)
	movq	%rdi,56+8(%rsp)		# save rp

	movq	(%r8),%r8		# n0 value
	movq	(%rsi),%rax		# ap[0]
	leaq	(%rsi,%r9,1),%rsi	# %rsi = ap end; index with negative %r9
	negq	%r9

	# First outer pass: tp[] = ap[]*b[0] + m*np[].
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = lo * n0
	leaq	64+8(%rsp),%r14		# %r14 = tp write pointer
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi		# %rdi reused as a carry limb here

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		# j = -num*8 + 32
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	L$1st4x

# Inner loop, 4 limbs per iteration (two mulq pairs per limb).
.p2align	5
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$1st4x

	# Tail: last two limbs of the first pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	# back to ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi		# %rdi = top carry
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	L$outer4x

# Outer loop: gather next b[i], accumulate ap[]*b[i] + m*np[] into tp[].
.p2align	5
L$outer4x:
	leaq	16+128(%r14),%rdx	# mask area relative to current tp
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		# movq %xmm0,%rbx — %rbx = b[i]

	movq	(%r14,%r9,1),%r10	# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo * n0
	movq	%rdx,%r11
	movq	%rdi,(%r14)		# store previous top carry

	leaq	(%r14,%r9,1),%r14	# rewind tp write pointer

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	5
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10		# + previous tp limb
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$inner4x

	# Tail of the outer pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		# swap m into %rax for the final mulq
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# m * np[num-1]
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# consumed whole b table?
	jb	L$outer4x

	# Set up for the shared subtract/select tail (L$sqr4x_sub_entry,
	# defined further down in this file): %rax = borrow mask seed,
	# %rbx = tp, %rbp = np, %rcx = word count / 4, %rdi = rp.
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# %rcx = num/4 (loop count)
	movq	56+8(%rsp),%rdi		# restore rp
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqr4x_sub_entry
   1072 
# -----------------------------------------------------------------------------
# bn_power5(rp, ap, table, np, n0, num, power)
#
# Computes five successive Montgomery squarings of ap (i.e. ap^(2^5)) via
# __bn_sqr8x_internal/__bn_post4x_internal, then one Montgomery multiply
# by the table entry selected by "power" via mul4x_internal.  Used by the
# fixed-window modular exponentiation (window size 5).
# ABI: SysV AMD64; same register arguments as bn_mul_mont_gather5.
# NOTE(review): semantics inferred from the call sequence and the matching
# perlasm source — confirm against the C-side caller (BN_mod_exp_mont_consttime).
# -----------------------------------------------------------------------------
.globl	_bn_power5
.private_extern _bn_power5

.p2align	5
_bn_power5:

	movq	%rsp,%rax		# original %rsp, restored in epilogue

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$power5_prologue:

	shll	$3,%r9d			# num *= 8 (bytes)
	leal	(%r9,%r9,2),%r10d	# %r10 = 3*num
	negq	%r9
	movq	(%r8),%r8		# n0 value

	# Frame placement perturbed by rp (%rdi), as in bn_mul4x (see there).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$pwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp	# frame = 2*num + 320 bytes
	jmp	L$pwr_sp_done

.p2align	5
L$pwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$pwr_sp_done:
	andq	$-64,%rbp		# 64-byte align
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	L$pwr_page_walk
	jmp	L$pwr_page_walk_done

# Stack probe: grow guard pages one page at a time.
L$pwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$pwr_page_walk
L$pwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash original %rsp

L$power5_body:
	# Park arguments in xmm regs across the internal calls
	# (66 REX.W 0F 6E = movq r64 -> xmm):
.byte	102,72,15,110,207		# movq %rdi,%xmm1  (rp)
.byte	102,72,15,110,209		# movq %rcx,%xmm2  (np)
.byte	102,73,15,110,218		# movq %r10,%xmm3  (num*8)
.byte	102,72,15,110,226		# movq %rdx,%xmm4  (b table)

	call	__bn_sqr8x_internal	# 5x: square + Montgomery post-reduce
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

	# Recover parked arguments (66 REX.W 0F 7E = movq xmm -> r64):
.byte	102,72,15,126,209		# movq %xmm2,%rcx  (np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx  (b table)
	movq	%rsi,%rdi
	movq	40(%rsp),%rax		# original %rsp -> mul4x_internal reads
	leaq	32(%rsp),%r8		#   power from 8(%rax); %r8 = &n0

	call	mul4x_internal		# final multiply by gathered b

	movq	40(%rsp),%rsi

	movq	$1,%rax			# return 1 (success)
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$power5_epilogue:
	.byte	0xf3,0xc3		# rep ret
   1202 
   1203 
   1204 
   1205 .globl	_bn_sqr8x_internal
   1206 .private_extern _bn_sqr8x_internal
   1207 .private_extern	_bn_sqr8x_internal
   1208 
   1209 .p2align	5
   1210 _bn_sqr8x_internal:
   1211 __bn_sqr8x_internal:
   1212 
   1213 
   1214 
   1215 
   1216 
   1217 
   1218 
   1219 
   1220 
   1221 
   1222 
   1223 
   1224 
   1225 
   1226 
   1227 
   1228 
   1229 
   1230 
   1231 
   1232 
   1233 
   1234 
   1235 
   1236 
   1237 
   1238 
   1239 
   1240 
   1241 
   1242 
   1243 
   1244 
   1245 
   1246 
   1247 
   1248 
   1249 
   1250 
   1251 
   1252 
   1253 
   1254 
   1255 
   1256 
   1257 
   1258 
   1259 
   1260 
   1261 
   1262 
   1263 
   1264 
   1265 
   1266 
   1267 
   1268 
   1269 
   1270 
   1271 
   1272 
   1273 
   1274 
   1275 
   1276 
   1277 
   1278 
   1279 
   1280 
   1281 
   1282 
   1283 
   1284 
   1285 	leaq	32(%r10),%rbp
   1286 	leaq	(%rsi,%r9,1),%rsi
   1287 
   1288 	movq	%r9,%rcx
   1289 
   1290 
   1291 	movq	-32(%rsi,%rbp,1),%r14
   1292 	leaq	48+8(%rsp,%r9,2),%rdi
   1293 	movq	-24(%rsi,%rbp,1),%rax
   1294 	leaq	-32(%rdi,%rbp,1),%rdi
   1295 	movq	-16(%rsi,%rbp,1),%rbx
   1296 	movq	%rax,%r15
   1297 
   1298 	mulq	%r14
   1299 	movq	%rax,%r10
   1300 	movq	%rbx,%rax
   1301 	movq	%rdx,%r11
   1302 	movq	%r10,-24(%rdi,%rbp,1)
   1303 
   1304 	mulq	%r14
   1305 	addq	%rax,%r11
   1306 	movq	%rbx,%rax
   1307 	adcq	$0,%rdx
   1308 	movq	%r11,-16(%rdi,%rbp,1)
   1309 	movq	%rdx,%r10
   1310 
   1311 
   1312 	movq	-8(%rsi,%rbp,1),%rbx
   1313 	mulq	%r15
   1314 	movq	%rax,%r12
   1315 	movq	%rbx,%rax
   1316 	movq	%rdx,%r13
   1317 
   1318 	leaq	(%rbp),%rcx
   1319 	mulq	%r14
   1320 	addq	%rax,%r10
   1321 	movq	%rbx,%rax
   1322 	movq	%rdx,%r11
   1323 	adcq	$0,%r11
   1324 	addq	%r12,%r10
   1325 	adcq	$0,%r11
   1326 	movq	%r10,-8(%rdi,%rcx,1)
   1327 	jmp	L$sqr4x_1st
   1328 
   1329 .p2align	5
   1330 L$sqr4x_1st:
   1331 	movq	(%rsi,%rcx,1),%rbx
   1332 	mulq	%r15
   1333 	addq	%rax,%r13
   1334 	movq	%rbx,%rax
   1335 	movq	%rdx,%r12
   1336 	adcq	$0,%r12
   1337 
   1338 	mulq	%r14
   1339 	addq	%rax,%r11
   1340 	movq	%rbx,%rax
   1341 	movq	8(%rsi,%rcx,1),%rbx
   1342 	movq	%rdx,%r10
   1343 	adcq	$0,%r10
   1344 	addq	%r13,%r11
   1345 	adcq	$0,%r10
   1346 
   1347 
   1348 	mulq	%r15
   1349 	addq	%rax,%r12
   1350 	movq	%rbx,%rax
   1351 	movq	%r11,(%rdi,%rcx,1)
   1352 	movq	%rdx,%r13
   1353 	adcq	$0,%r13
   1354 
   1355 	mulq	%r14
   1356 	addq	%rax,%r10
   1357 	movq	%rbx,%rax
   1358 	movq	16(%rsi,%rcx,1),%rbx
   1359 	movq	%rdx,%r11
   1360 	adcq	$0,%r11
   1361 	addq	%r12,%r10
   1362 	adcq	$0,%r11
   1363 
   1364 	mulq	%r15
   1365 	addq	%rax,%r13
   1366 	movq	%rbx,%rax
   1367 	movq	%r10,8(%rdi,%rcx,1)
   1368 	movq	%rdx,%r12
   1369 	adcq	$0,%r12
   1370 
   1371 	mulq	%r14
   1372 	addq	%rax,%r11
   1373 	movq	%rbx,%rax
   1374 	movq	24(%rsi,%rcx,1),%rbx
   1375 	movq	%rdx,%r10
   1376 	adcq	$0,%r10
   1377 	addq	%r13,%r11
   1378 	adcq	$0,%r10
   1379 
   1380 
   1381 	mulq	%r15
   1382 	addq	%rax,%r12
   1383 	movq	%rbx,%rax
   1384 	movq	%r11,16(%rdi,%rcx,1)
   1385 	movq	%rdx,%r13
   1386 	adcq	$0,%r13
   1387 	leaq	32(%rcx),%rcx
   1388 
   1389 	mulq	%r14
   1390 	addq	%rax,%r10
   1391 	movq	%rbx,%rax
   1392 	movq	%rdx,%r11
   1393 	adcq	$0,%r11
   1394 	addq	%r12,%r10
   1395 	adcq	$0,%r11
   1396 	movq	%r10,-8(%rdi,%rcx,1)
   1397 
   1398 	cmpq	$0,%rcx
   1399 	jne	L$sqr4x_1st
   1400 
   1401 	mulq	%r15
   1402 	addq	%rax,%r13
   1403 	leaq	16(%rbp),%rbp
   1404 	adcq	$0,%rdx
   1405 	addq	%r11,%r13
   1406 	adcq	$0,%rdx
   1407 
   1408 	movq	%r13,(%rdi)
   1409 	movq	%rdx,%r12
   1410 	movq	%rdx,8(%rdi)
   1411 	jmp	L$sqr4x_outer
   1412 
   1413 .p2align	5
   1414 L$sqr4x_outer:
   1415 	movq	-32(%rsi,%rbp,1),%r14
   1416 	leaq	48+8(%rsp,%r9,2),%rdi
   1417 	movq	-24(%rsi,%rbp,1),%rax
   1418 	leaq	-32(%rdi,%rbp,1),%rdi
   1419 	movq	-16(%rsi,%rbp,1),%rbx
   1420 	movq	%rax,%r15
   1421 
   1422 	mulq	%r14
   1423 	movq	-24(%rdi,%rbp,1),%r10
   1424 	addq	%rax,%r10
   1425 	movq	%rbx,%rax
   1426 	adcq	$0,%rdx
   1427 	movq	%r10,-24(%rdi,%rbp,1)
   1428 	movq	%rdx,%r11
   1429 
   1430 	mulq	%r14
   1431 	addq	%rax,%r11
   1432 	movq	%rbx,%rax
   1433 	adcq	$0,%rdx
   1434 	addq	-16(%rdi,%rbp,1),%r11
   1435 	movq	%rdx,%r10
   1436 	adcq	$0,%r10
   1437 	movq	%r11,-16(%rdi,%rbp,1)
   1438 
   1439 	xorq	%r12,%r12
   1440 
   1441 	movq	-8(%rsi,%rbp,1),%rbx
   1442 	mulq	%r15
   1443 	addq	%rax,%r12
   1444 	movq	%rbx,%rax
   1445 	adcq	$0,%rdx
   1446 	addq	-8(%rdi,%rbp,1),%r12
   1447 	movq	%rdx,%r13
   1448 	adcq	$0,%r13
   1449 
   1450 	mulq	%r14
   1451 	addq	%rax,%r10
   1452 	movq	%rbx,%rax
   1453 	adcq	$0,%rdx
   1454 	addq	%r12,%r10
   1455 	movq	%rdx,%r11
   1456 	adcq	$0,%r11
   1457 	movq	%r10,-8(%rdi,%rbp,1)
   1458 
   1459 	leaq	(%rbp),%rcx
   1460 	jmp	L$sqr4x_inner
   1461 
   1462 .p2align	5
   1463 L$sqr4x_inner:
   1464 	movq	(%rsi,%rcx,1),%rbx
   1465 	mulq	%r15
   1466 	addq	%rax,%r13
   1467 	movq	%rbx,%rax
   1468 	movq	%rdx,%r12
   1469 	adcq	$0,%r12
   1470 	addq	(%rdi,%rcx,1),%r13
   1471 	adcq	$0,%r12
   1472 
   1473 .byte	0x67
   1474 	mulq	%r14
   1475 	addq	%rax,%r11
   1476 	movq	%rbx,%rax
   1477 	movq	8(%rsi,%rcx,1),%rbx
   1478 	movq	%rdx,%r10
   1479 	adcq	$0,%r10
   1480 	addq	%r13,%r11
   1481 	adcq	$0,%r10
   1482 
   1483 	mulq	%r15
   1484 	addq	%rax,%r12
   1485 	movq	%r11,(%rdi,%rcx,1)
   1486 	movq	%rbx,%rax
   1487 	movq	%rdx,%r13
   1488 	adcq	$0,%r13
   1489 	addq	8(%rdi,%rcx,1),%r12
   1490 	leaq	16(%rcx),%rcx
   1491 	adcq	$0,%r13
   1492 
   1493 	mulq	%r14
   1494 	addq	%rax,%r10
   1495 	movq	%rbx,%rax
   1496 	adcq	$0,%rdx
   1497 	addq	%r12,%r10
   1498 	movq	%rdx,%r11
   1499 	adcq	$0,%r11
   1500 	movq	%r10,-8(%rdi,%rcx,1)
   1501 
   1502 	cmpq	$0,%rcx
   1503 	jne	L$sqr4x_inner
   1504 
   1505 .byte	0x67
   1506 	mulq	%r15
   1507 	addq	%rax,%r13
   1508 	adcq	$0,%rdx
   1509 	addq	%r11,%r13
   1510 	adcq	$0,%rdx
   1511 
   1512 	movq	%r13,(%rdi)
   1513 	movq	%rdx,%r12
   1514 	movq	%rdx,8(%rdi)
   1515 
   1516 	addq	$16,%rbp
   1517 	jnz	L$sqr4x_outer
   1518 
   1519 
   1520 	movq	-32(%rsi),%r14
   1521 	leaq	48+8(%rsp,%r9,2),%rdi
   1522 	movq	-24(%rsi),%rax
   1523 	leaq	-32(%rdi,%rbp,1),%rdi
   1524 	movq	-16(%rsi),%rbx
   1525 	movq	%rax,%r15
   1526 
   1527 	mulq	%r14
   1528 	addq	%rax,%r10
   1529 	movq	%rbx,%rax
   1530 	movq	%rdx,%r11
   1531 	adcq	$0,%r11
   1532 
   1533 	mulq	%r14
   1534 	addq	%rax,%r11
   1535 	movq	%rbx,%rax
   1536 	movq	%r10,-24(%rdi)
   1537 	movq	%rdx,%r10
   1538 	adcq	$0,%r10
   1539 	addq	%r13,%r11
   1540 	movq	-8(%rsi),%rbx
   1541 	adcq	$0,%r10
   1542 
   1543 	mulq	%r15
   1544 	addq	%rax,%r12
   1545 	movq	%rbx,%rax
   1546 	movq	%r11,-16(%rdi)
   1547 	movq	%rdx,%r13
   1548 	adcq	$0,%r13
   1549 
   1550 	mulq	%r14
   1551 	addq	%rax,%r10
   1552 	movq	%rbx,%rax
   1553 	movq	%rdx,%r11
   1554 	adcq	$0,%r11
   1555 	addq	%r12,%r10
   1556 	adcq	$0,%r11
   1557 	movq	%r10,-8(%rdi)
   1558 
   1559 	mulq	%r15
   1560 	addq	%rax,%r13
   1561 	movq	-16(%rsi),%rax
   1562 	adcq	$0,%rdx
   1563 	addq	%r11,%r13
   1564 	adcq	$0,%rdx
   1565 
   1566 	movq	%r13,(%rdi)
   1567 	movq	%rdx,%r12
   1568 	movq	%rdx,8(%rdi)
   1569 
   1570 	mulq	%rbx
   1571 	addq	$16,%rbp
   1572 	xorq	%r14,%r14
   1573 	subq	%r9,%rbp
   1574 	xorq	%r15,%r15
   1575 
   1576 	addq	%r12,%rax
   1577 	adcq	$0,%rdx
   1578 	movq	%rax,8(%rdi)
   1579 	movq	%rdx,16(%rdi)
   1580 	movq	%r15,24(%rdi)
   1581 
   1582 	movq	-16(%rsi,%rbp,1),%rax
   1583 	leaq	48+8(%rsp),%rdi
   1584 	xorq	%r10,%r10
   1585 	movq	8(%rdi),%r11
   1586 
   1587 	leaq	(%r14,%r10,2),%r12
   1588 	shrq	$63,%r10
   1589 	leaq	(%rcx,%r11,2),%r13
   1590 	shrq	$63,%r11
   1591 	orq	%r10,%r13
   1592 	movq	16(%rdi),%r10
   1593 	movq	%r11,%r14
   1594 	mulq	%rax
   1595 	negq	%r15
   1596 	movq	24(%rdi),%r11
   1597 	adcq	%rax,%r12
   1598 	movq	-8(%rsi,%rbp,1),%rax
   1599 	movq	%r12,(%rdi)
   1600 	adcq	%rdx,%r13
   1601 
   1602 	leaq	(%r14,%r10,2),%rbx
   1603 	movq	%r13,8(%rdi)
   1604 	sbbq	%r15,%r15
   1605 	shrq	$63,%r10
   1606 	leaq	(%rcx,%r11,2),%r8
   1607 	shrq	$63,%r11
   1608 	orq	%r10,%r8
   1609 	movq	32(%rdi),%r10
   1610 	movq	%r11,%r14
   1611 	mulq	%rax
   1612 	negq	%r15
   1613 	movq	40(%rdi),%r11
   1614 	adcq	%rax,%rbx
   1615 	movq	0(%rsi,%rbp,1),%rax
   1616 	movq	%rbx,16(%rdi)
   1617 	adcq	%rdx,%r8
   1618 	leaq	16(%rbp),%rbp
   1619 	movq	%r8,24(%rdi)
   1620 	sbbq	%r15,%r15
   1621 	leaq	64(%rdi),%rdi
   1622 	jmp	L$sqr4x_shift_n_add
   1623 
   1624 .p2align	5
   1625 L$sqr4x_shift_n_add:
   1626 	leaq	(%r14,%r10,2),%r12
   1627 	shrq	$63,%r10
   1628 	leaq	(%rcx,%r11,2),%r13
   1629 	shrq	$63,%r11
   1630 	orq	%r10,%r13
   1631 	movq	-16(%rdi),%r10
   1632 	movq	%r11,%r14
   1633 	mulq	%rax
   1634 	negq	%r15
   1635 	movq	-8(%rdi),%r11
   1636 	adcq	%rax,%r12
   1637 	movq	-8(%rsi,%rbp,1),%rax
   1638 	movq	%r12,-32(%rdi)
   1639 	adcq	%rdx,%r13
   1640 
   1641 	leaq	(%r14,%r10,2),%rbx
   1642 	movq	%r13,-24(%rdi)
   1643 	sbbq	%r15,%r15
   1644 	shrq	$63,%r10
   1645 	leaq	(%rcx,%r11,2),%r8
   1646 	shrq	$63,%r11
   1647 	orq	%r10,%r8
   1648 	movq	0(%rdi),%r10
   1649 	movq	%r11,%r14
   1650 	mulq	%rax
   1651 	negq	%r15
   1652 	movq	8(%rdi),%r11
   1653 	adcq	%rax,%rbx
   1654 	movq	0(%rsi,%rbp,1),%rax
   1655 	movq	%rbx,-16(%rdi)
   1656 	adcq	%rdx,%r8
   1657 
   1658 	leaq	(%r14,%r10,2),%r12
   1659 	movq	%r8,-8(%rdi)
   1660 	sbbq	%r15,%r15
   1661 	shrq	$63,%r10
   1662 	leaq	(%rcx,%r11,2),%r13
   1663 	shrq	$63,%r11
   1664 	orq	%r10,%r13
   1665 	movq	16(%rdi),%r10
   1666 	movq	%r11,%r14
   1667 	mulq	%rax
   1668 	negq	%r15
   1669 	movq	24(%rdi),%r11
   1670 	adcq	%rax,%r12
   1671 	movq	8(%rsi,%rbp,1),%rax
   1672 	movq	%r12,0(%rdi)
   1673 	adcq	%rdx,%r13
   1674 
   1675 	leaq	(%r14,%r10,2),%rbx
   1676 	movq	%r13,8(%rdi)
   1677 	sbbq	%r15,%r15
   1678 	shrq	$63,%r10
   1679 	leaq	(%rcx,%r11,2),%r8
   1680 	shrq	$63,%r11
   1681 	orq	%r10,%r8
   1682 	movq	32(%rdi),%r10
   1683 	movq	%r11,%r14
   1684 	mulq	%rax
   1685 	negq	%r15
   1686 	movq	40(%rdi),%r11
   1687 	adcq	%rax,%rbx
   1688 	movq	16(%rsi,%rbp,1),%rax
   1689 	movq	%rbx,16(%rdi)
   1690 	adcq	%rdx,%r8
   1691 	movq	%r8,24(%rdi)
   1692 	sbbq	%r15,%r15
   1693 	leaq	64(%rdi),%rdi
   1694 	addq	$32,%rbp
   1695 	jnz	L$sqr4x_shift_n_add
   1696 
   1697 	leaq	(%r14,%r10,2),%r12
   1698 .byte	0x67
   1699 	shrq	$63,%r10
   1700 	leaq	(%rcx,%r11,2),%r13
   1701 	shrq	$63,%r11
   1702 	orq	%r10,%r13
   1703 	movq	-16(%rdi),%r10
   1704 	movq	%r11,%r14
   1705 	mulq	%rax
   1706 	negq	%r15
   1707 	movq	-8(%rdi),%r11
   1708 	adcq	%rax,%r12
   1709 	movq	-8(%rsi),%rax
   1710 	movq	%r12,-32(%rdi)
   1711 	adcq	%rdx,%r13
   1712 
   1713 	leaq	(%r14,%r10,2),%rbx
   1714 	movq	%r13,-24(%rdi)
   1715 	sbbq	%r15,%r15
   1716 	shrq	$63,%r10
   1717 	leaq	(%rcx,%r11,2),%r8
   1718 	shrq	$63,%r11
   1719 	orq	%r10,%r8
   1720 	mulq	%rax
   1721 	negq	%r15
   1722 	adcq	%rax,%rbx
   1723 	adcq	%rdx,%r8
   1724 	movq	%rbx,-16(%rdi)
   1725 	movq	%r8,-8(%rdi)
   1726 .byte	102,72,15,126,213
__bn_sqr8x_reduction:
/*
 * Montgomery reduction of the double-width value t[] held in the stack
 * frame, processed eight limbs per pass.  Internal helper: reached by
 * fall-through from the squaring code above and by `call` from
 * bn_from_mont8x below.
 *
 * Entry state (as set up by the callers visible in this file):
 *   %rbp        modulus pointer n[] (advances 64 bytes per inner pass)
 *   %r9         num*8 (num = limb count, multiple of 8); negated below
 *   32+8(%rsp)  n0 = -n[0]^-1 mod 2^64 -- stored by bn_from_mont8x via
 *               `movq %r8,32(%rsp)`; presumably likewise by the other
 *               (not visible) caller
 *   %xmm2       spare copy of the modulus pointer
 *   %xmm3       spare copy of -num*8
 *               (both loaded in bn_from_mont8x, reloaded each outer pass)
 * Exit: reduced value stored back over t[]; returns with `rep ret`.
 */
	xorq	%rax,%rax		/* running carry out of the top limb */
	leaq	(%r9,%rbp,1),%rcx	/* rcx = &n[num] (end of modulus) */
	leaq	48+8(%rsp,%r9,2),%rdx	/* rdx = end of 2*num-limb t[] */
	movq	%rcx,0+8(%rsp)		/* stash end-of-modulus sentinel */
	leaq	48+8(%rsp,%r9,1),%rdi	/* rdi = t[] + num limbs */
	movq	%rdx,8+8(%rsp)		/* stash end-of-t[] / carry slot ptr */
	negq	%r9			/* r9 = -num*8 for windowed stepping */
	jmp	L$8x_reduction_loop

.p2align	5
L$8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi	/* step back to start of this window */
.byte	0x66				/* prefix byte: decoder padding only */
	movq	0(%rdi),%rbx		/* load the 8 limbs being reduced */
	movq	8(%rdi),%r9		/* (r9..r15 reused as accumulators) */
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)		/* save previous top-limb carry */
	leaq	64(%rdi),%rdi

.byte	0x67				/* prefix byte: decoder padding only */
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		/* rbx = m = t[0]*n0 mod 2^64 */
	movq	0(%rbp),%rax
	movl	$8,%ecx			/* 8 limbs to kill in this window */
	jmp	L$8x_reduce

.p2align	5
L$8x_reduce:
	/* One reduction step: add m*n[0..7] onto the accumulators.  The
	 * low limb cancels by construction (m was chosen so that
	 * t[0] + m*n[0] == 0 mod 2^64); negq recovers its carry-out. */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8			/* CF = (r8 != 0) = carry of r8+lo */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8) /* save m for reuse in L$8x_tail */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		/* rsi = n0, for the next multiplier */
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next m = (new t[0]) * n0 */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* switch to the next multiplier */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_reduce

	leaq	64(%rbp),%rbp		/* next 8 modulus limbs */
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		/* more modulus chunks left? */
	jae	L$8x_no_tail

	/* Tail: fold the next 8 limbs of t[] into the accumulators, then
	 * replay the 8 saved multipliers against the remaining modulus. */
.byte	0x66				/* prefix byte: decoder padding only */
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* rsi = -carry, preserved across tail */

	movq	48+56+8(%rsp),%rbx	/* first saved multiplier */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	L$8x_tail

.p2align	5
L$8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)		/* completed limb goes back to t[] */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx /* next saved multiplier */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		/* reached end of modulus? */
	jae	L$8x_tail_done

	movq	48+56+8(%rsp),%rbx	/* restart multiplier sequence */
	negq	%rsi			/* restore the carry saved via sbbq */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8		/* fold next 8 limbs of t[] */
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* re-save carry as a mask */

	movl	$8,%ecx
	jmp	L$8x_tail

.p2align	5
L$8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8		/* fold in top carry saved at window start */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax			/* rax = carry out of the top */

	negq	%rsi			/* restore pending carry for no_tail */
L$8x_no_tail:
	adcq	0(%rdi),%r8		/* add final 8 limbs of t[] */
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* accumulate top-limb carry */
	movq	-8(%rbp),%rcx		/* top modulus limb; left for callers
					   (NOTE(review): use not visible here) */
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* movq %xmm2,%rbp: restore n[] ptr */

	movq	%r8,0(%rdi)		/* store the reduced window */
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* movq %xmm3,%r9: restore -num*8 */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi		/* more of t[] to reduce? */
	jb	L$8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret */
   1986 
   1987 
.p2align	5
__bn_post4x_internal:
/*
 * Final Montgomery step: constant-time conditional subtraction of the
 * modulus, copying the result out 4 limbs at a time.
 *
 * Entry state (as set up by the callers visible in this file):
 *   %rbp   modulus pointer n[]
 *   %rdi   end of the reduced t[] window (rbx below rewinds to its start)
 *   %r9    -num*8
 *   %rax   carry out of the reduction (0 or 1 -- TODO confirm), turned
 *          into an all-zeros/all-ones mask below
 *   %xmm1  destination pointer rp (stashed there by the callers)
 *
 * The subtraction is branch-free: each modulus limb is complemented and
 * ANDed with the mask, then added with carry -- i.e. t - (n & mask) in
 * two's complement, so the memory access pattern never depends on the
 * comparison result.
 */
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx	/* rbx = start of t[] (source) */
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi: restore rp */
	negq	%rax			/* rax -> 0 or all-ones mask */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi: for back-to-back call */
	sarq	$3+2,%rcx		/* rcx = -num/4: 4-limb group count */
	decq	%r12			/* so ~(n[0]-1) = -n[0]: seeds the +1
					   of the two's-complement subtract */
	xorq	%r10,%r10		/* r10 = borrow mask, initially none */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqr4x_sub_entry

.p2align	4
L$sqr4x_sub:
	movq	0(%rbp),%r12		/* next 4 modulus limbs */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
L$sqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12			/* complement, then mask: */
	notq	%r13			/*   mask==0  -> adds 0  (copy t) */
	notq	%r14			/*   mask==-1 -> adds ~n (t - n)  */
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* CF = borrow from previous group */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)		/* emit 4 result limbs to rp */
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow as mask for next group */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx			/* rcx counts up from -num/4 to 0 */
	jnz	L$sqr4x_sub

	movq	%r9,%r10		/* leave r10 = -num*8 */
	negq	%r9			/* leave r9  = +num*8 */
	.byte	0xf3,0xc3		/* rep ret */
   2040 
.globl	_bn_from_montgomery
.private_extern _bn_from_montgomery

.p2align	5
_bn_from_montgomery:
/*
 * Public entry: convert a value out of Montgomery form.
 * Only limb counts divisible by 8 are handled (dispatched to
 * bn_from_mont8x below); otherwise returns 0 so the caller can fall
 * back to a generic path.  %r9d = num (SysV 6th integer argument).
 */
	testl	$7,%r9d
	jz	bn_from_mont8x		/* tail-call the 8x implementation */
	xorl	%eax,%eax		/* unsupported size: return 0 */
	.byte	0xf3,0xc3		/* rep ret */
   2050 
   2051 
   2052 
.p2align	5
bn_from_mont8x:
/*
 * bn_from_montgomery fast path for num % 8 == 0.
 * SysV args: %rdi=rp, %rsi=ap, %rdx=(unused here), %rcx=np, %r8=&n0,
 * %r9=num.  Copies ap into a double-width stack buffer (upper half
 * zeroed, i.e. multiply by 1), then runs the shared 8x Montgomery
 * reduction and the constant-time final subtraction.  Returns 1 in %rax.
 */

.byte	0x67				/* prefix byte: decoder padding only */
	movq	%rsp,%rax		/* keep original rsp for the epilogue */

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$from_prologue:

	shll	$3,%r9d			/* r9 = num*8 (byte count) */
	leaq	(%r9,%r9,2),%r10	/* r10 = num*24 */
	negq	%r9
	movq	(%r8),%r8		/* r8 = n0 word */

	/*
	 * Frame selection below picks an rsp so the scratch buffer does
	 * not collide with rp modulo 4096 -- presumably to avoid
	 * cache/page aliasing penalties; same dance as the other entry
	 * points in this file.
	 */

	leaq	-320(%rsp,%r9,2),%r11	/* tentative frame: 2*num limbs + slack */
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11		/* distance to rp within a page */
	cmpq	%r11,%r10
	jb	L$from_sp_alt
	subq	%r11,%rbp		/* shift frame to break the aliasing */
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$from_sp_done

.p2align	5
L$from_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11		/* clamp adjustment at 0 */
	subq	%r11,%rbp
L$from_sp_done:
	andq	$-64,%rbp		/* 64-byte align the frame */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe the page */
	cmpq	%rbp,%rsp
	ja	L$from_page_walk
	jmp	L$from_page_walk_done

L$from_page_walk:
	/* Touch every page down to the new rsp so the stack can grow. */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$from_page_walk
L$from_page_walk_done:

	movq	%r9,%r10		/* r10 = -num*8 */
	negq	%r9			/* r9  = +num*8 */

	/*
	 * Frame layout from here (offsets from the new rsp):
	 *   32(%rsp)  n0
	 *   40(%rsp)  saved original rsp
	 *   48(%rsp)  start of the 2*num-limb scratch buffer t[]
	 */

	movq	%r8,32(%rsp)		/* save n0 for the reduction */
	movq	%rax,40(%rsp)		/* save original rsp */

L$from_body:
	movq	%r9,%r11		/* byte counter for the copy loop */
	leaq	48(%rsp),%rax		/* rax = t[] cursor */
	pxor	%xmm0,%xmm0		/* zero for the upper half */
	jmp	L$mul_by_1

.p2align	5
L$mul_by_1:
	/* Copy 64 bytes of ap into the low half of t[] and zero the
	 * matching 64 bytes of the high half (t[] holds ap * 1). */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	/* zero the upper half in step */
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 /* leaq 64(%rsi),%rsi (long form) */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	L$mul_by_1

	/* Stash pointers where the shared helpers expect them. */
.byte	102,72,15,110,207		/* movq %rdi,%xmm1: rp for post4x */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: np for reduction */
.byte	0x67				/* prefix byte: decoder padding only */
	movq	%rcx,%rbp		/* rbp = np */
.byte	102,73,15,110,218		/* movq %r10,%xmm3: -num*8 */
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	L$from_mont_zero

.p2align	5
L$from_mont_zero:
	/* Wipe the secret-bearing scratch buffer before returning. */
	movq	40(%rsp),%rsi		/* rsi = saved original rsp */

	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9			/* 64 bytes cleared per 32 counted:
					   covers the 2*num-limb buffer */
	jnz	L$from_mont_zero

	movq	$1,%rax			/* success */
	movq	-48(%rsi),%r15		/* restore callee-saved registers */

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp		/* undo the frame */

L$from_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
   2202 
   2203 
.globl	_bn_scatter5
.private_extern _bn_scatter5

.p2align	4
_bn_scatter5:
/*
 * Write a num-word value into column `power` of the interleaved
 * gather table read back by bn_gather5 below.
 * SysV args: %rdi = source words, %esi = num (word count),
 *            %rdx = table base, %rcx = power (column index).
 * Layout: word i of entry p lives at table + p*8 + i*256, i.e. 32
 * 8-byte columns per 256-byte row.  No bounds check on %rcx -- the
 * caller is trusted to pass power < 32.
 */
	cmpl	$0,%esi
	jz	L$scatter_epilogue	/* nothing to write */
	leaq	(%rdx,%rcx,8),%rdx	/* column start */
L$scatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* next row (stride = 32 columns) */
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
   2221 
   2222 
.globl	_bn_gather5
.private_extern _bn_gather5

.p2align	5
_bn_gather5:
L$SEH_begin_bn_gather5:
/*
 * Constant-time gather: read entry `power` out of the 32-entry
 * interleaved table written by bn_scatter5, without letting the cache
 * access pattern depend on the index.  Every iteration touches all
 * 256 bytes of a table row and masks out everything but the selected
 * column.
 * SysV args: %rdi = output words, %esi = num (word count),
 *            %rdx = table base, %ecx = power (0..31).
 */

.byte	0x4c,0x8d,0x14,0x24		/* leaq (%rsp),%r10: save caller rsp */
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00 /* subq $0x108,%rsp: mask buffer */
	leaq	L$inc(%rip),%rax
	andq	$-16,%rsp		/* movdqa below needs 16-byte align */

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		/* {0,0,1,1}: first two lane indices */
	movdqa	16(%rax),%xmm1		/* {2,2,2,2}: per-step increment */
	leaq	128(%rdx),%r11		/* biased table ptr (offsets -128..127) */
	leaq	128(%rsp),%rax		/* biased mask-buffer ptr */

	/*
	 * Build 16 xmm masks = 32 qword lanes; lane i is all-ones iff
	 * i == power.  Each pcmpeqd compares the broadcast index against
	 * a running counter pair, which is advanced by {2,2,2,2}.
	 */
	pshufd	$0,%xmm5,%xmm5		/* broadcast power to all dwords */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		/* mask for lanes 0,1 */
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		/* mask for lanes 2,3 */
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		/* lanes 4,5 */
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		/* lanes 6,7 */
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		/* lanes 8,9 */
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		/* lanes 10,11 */
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		/* lanes 12,13 */
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		/* lanes 14,15 */
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		/* lanes 16,17 */
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		/* lanes 18,19 */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		/* lanes 20,21 */
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		/* lanes 22,23 */
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		/* lanes 24,25 */
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		/* lanes 26,27 */
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		/* lanes 28,29 */
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		/* lanes 30,31 */
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	L$gather

.p2align	5
L$gather:
	/* One output word: AND every 16-byte chunk of the current table
	 * row with its mask and OR everything together; only the selected
	 * column survives.  All 256 bytes are read regardless of power. */
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		/* next table row */
	pshufd	$0x4e,%xmm4,%xmm0	/* swap qword halves... */
	por	%xmm4,%xmm0		/* ...so either lane lands in the low 64 */
	movq	%xmm0,(%rdi)		/* emit one gathered word */
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather

	leaq	(%r10),%rsp		/* restore caller's rsp */
	.byte	0xf3,0xc3		/* rep ret */
L$SEH_end_bn_gather5:
   2386 
.p2align	6
/* Counter seed/increment used by the mask-building code in
 * _bn_mul_mont_gather5 (see file head) and _bn_gather5 above. */
L$inc:
.long	0,0, 1,1	/* starting lane indices: dword pairs {0,0},{1,1} */
.long	2,2, 2,2	/* increment added per compare step */
/* ASCII banner: "Montgomery Multiplication with scatter/gather for
 * x86_64, CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated) */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   2392 #endif
   2393