Home | History | Annotate | Download | only in bn
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 
      5 
      6 .globl	bn_mul_mont
      7 .hidden bn_mul_mont
      8 .type	bn_mul_mont,@function
      9 .align	16
        # bn_mul_mont -- dispatcher plus the generic one-word-per-step path of the
        # Montgomery multiplication named by the ident string at the end of this file.
        #
        # Register roles as used by the visible code (these are the six System V
        # AMD64 integer argument registers; the C prototype is not visible here,
        # so the parameter meanings below are a TODO to confirm):
        #   rdi  destination qword array (written by .Lsub and .Lcopy)
        #   rsi  first source qword array
        #   rdx  second source qword array (moved to r12 because mulq overwrites rdx)
        #   rcx  qword array that is multiplied by the per-row factor in rbp and
        #        later subtracted from the result (presumably the modulus -- confirm)
        #   r8   pointer to one qword that is loaded once (presumably n0 -- confirm)
        #   r9d  number of qwords in each array
        # Returns rax = 1.  rbx, rbp and r12-r15 are saved and restored.
        #
        # In the comments below tp means the temporary qword array at (%rsp).
        # The frame holds tp[0..r9]; tp[r9+1] keeps the stack pointer to restore.
     10 bn_mul_mont:
     11 	testl	$3,%r9d
     12 	jnz	.Lmul_enter	# count not a multiple of 4 -> generic path
     13 	cmpl	$8,%r9d
     14 	jb	.Lmul_enter	# count below 8 (unsigned compare) -> generic path
     15 	cmpq	%rsi,%rdx
     16 	jne	.Lmul4x_enter	# the two source pointers differ -> 4x-unrolled multiply
     17 	testl	$7,%r9d
     18 	jz	.Lsqr8x_enter	# same source pointer and count a multiple of 8 -> squaring path
     19 	jmp	.Lmul4x_enter	# same source pointer, count only a multiple of 4
     20 
     21 .align	16
     22 .Lmul_enter:
     23 	pushq	%rbx
     24 	pushq	%rbp
     25 	pushq	%r12
     26 	pushq	%r13
     27 	pushq	%r14
     28 	pushq	%r15
     29 
     30 	movl	%r9d,%r9d	# zero-extend the 32-bit count to 64 bits
     31 	leaq	2(%r9),%r10
     32 	movq	%rsp,%r11	# r11 = stack pointer after the six pushes
     33 	negq	%r10
     34 	leaq	(%rsp,%r10,8),%rsp	# reserve r9+2 qwords
     35 	andq	$-1024,%rsp	# round the frame base down to a 1024-byte boundary
     36 
     37 	movq	%r11,8(%rsp,%r9,8)	# tp[r9+1] = stack pointer to restore in the epilogue
     38 .Lmul_body:
     39 	movq	%rdx,%r12	# keep the second source pointer; mulq overwrites rdx
     40 	movq	(%r8),%r8	# r8 = the qword that r8 pointed to
     41 	movq	(%r12),%rbx	# rbx = word 0 of the second source
     42 	movq	(%rsi),%rax	# rax = word 0 of the first source
     43 
     44 	xorq	%r14,%r14	# r14 = outer index (word of the second source)
     45 	xorq	%r15,%r15	# r15 = inner index (word of the first source and of rcx)
     46 
     47 	movq	%r8,%rbp
     48 	mulq	%rbx	# rdx:rax = rsi[0] * rbx
     49 	movq	%rax,%r10
     50 	movq	(%rcx),%rax
     51 
     52 	imulq	%r10,%rbp	# rbp = low 64 bits of r8 * r10: the factor applied to rcx[] in this row
     53 	movq	%rdx,%r11
     54 
     55 	mulq	%rbp	# rdx:rax = rcx[0] * rbp
     56 	addq	%rax,%r10	# the low word of this sum is discarded (r10 is overwritten before it is read again); only the carry is kept
     57 	movq	8(%rsi),%rax
     58 	adcq	$0,%rdx
     59 	movq	%rdx,%r13
     60 
     61 	leaq	1(%r15),%r15
     62 	jmp	.L1st_enter
     63 
        # First row: no previous tp contents are added.  r11/r10 carry the
        # rsi[]*rbx products, r13 carries the rcx[]*rbp products.
     64 .align	16
     65 .L1st:
     66 	addq	%rax,%r13
     67 	movq	(%rsi,%r15,8),%rax
     68 	adcq	$0,%rdx
     69 	addq	%r11,%r13
     70 	movq	%r10,%r11
     71 	adcq	$0,%rdx
     72 	movq	%r13,-16(%rsp,%r15,8)	# tp[r15-2] = finished column
     73 	movq	%rdx,%r13
     74 
     75 .L1st_enter:
     76 	mulq	%rbx	# rdx:rax = rsi[r15] * rbx
     77 	addq	%rax,%r11
     78 	movq	(%rcx,%r15,8),%rax
     79 	adcq	$0,%rdx
     80 	leaq	1(%r15),%r15
     81 	movq	%rdx,%r10
     82 
     83 	mulq	%rbp	# rdx:rax = rcx[r15-1] * rbp
     84 	cmpq	%r9,%r15
     85 	jne	.L1st
     86 
     87 	addq	%rax,%r13
     88 	movq	(%rsi),%rax	# preload rsi[0] for the next row
     89 	adcq	$0,%rdx
     90 	addq	%r11,%r13
     91 	adcq	$0,%rdx
     92 	movq	%r13,-16(%rsp,%r15,8)	# tp[r9-2]
     93 	movq	%rdx,%r13
     94 	movq	%r10,%r11
     95 
     96 	xorq	%rdx,%rdx
     97 	addq	%r11,%r13
     98 	adcq	$0,%rdx
     99 	movq	%r13,-8(%rsp,%r9,8)	# tp[r9-1]
    100 	movq	%rdx,(%rsp,%r9,8)	# tp[r9] = carry out of the row (0 or 1)
    101 
    102 	leaq	1(%r14),%r14
    103 	jmp	.Louter
        # Remaining rows: same as the first row, but the previous tp contents are
        # added into each column.
    104 .align	16
    105 .Louter:
    106 	movq	(%r12,%r14,8),%rbx	# rbx = next word of the second source
    107 	xorq	%r15,%r15
    108 	movq	%r8,%rbp
    109 	movq	(%rsp),%r10	# r10 = tp[0] from the previous row
    110 	mulq	%rbx	# rdx:rax = rsi[0] * rbx (rax was preloaded with rsi[0])
    111 	addq	%rax,%r10
    112 	movq	(%rcx),%rax
    113 	adcq	$0,%rdx
    114 
    115 	imulq	%r10,%rbp	# rbp = low 64 bits of r8 * r10: factor for rcx[] in this row
    116 	movq	%rdx,%r11
    117 
    118 	mulq	%rbp	# rdx:rax = rcx[0] * rbp
    119 	addq	%rax,%r10	# low word discarded (r10 is reloaded below); only the carry is kept
    120 	movq	8(%rsi),%rax
    121 	adcq	$0,%rdx
    122 	movq	8(%rsp),%r10	# r10 = tp[1]
    123 	movq	%rdx,%r13
    124 
    125 	leaq	1(%r15),%r15
    126 	jmp	.Linner_enter
    127 
    128 .align	16
    129 .Linner:
    130 	addq	%rax,%r13
    131 	movq	(%rsi,%r15,8),%rax
    132 	adcq	$0,%rdx
    133 	addq	%r10,%r13
    134 	movq	(%rsp,%r15,8),%r10	# r10 = tp[r15] from the previous row
    135 	adcq	$0,%rdx
    136 	movq	%r13,-16(%rsp,%r15,8)	# tp[r15-2] = finished column
    137 	movq	%rdx,%r13
    138 
    139 .Linner_enter:
    140 	mulq	%rbx	# rdx:rax = rsi[r15] * rbx
    141 	addq	%rax,%r11
    142 	movq	(%rcx,%r15,8),%rax
    143 	adcq	$0,%rdx
    144 	addq	%r11,%r10
    145 	movq	%rdx,%r11	# movq leaves CF intact for the adcq below
    146 	adcq	$0,%r11
    147 	leaq	1(%r15),%r15
    148 
    149 	mulq	%rbp	# rdx:rax = rcx[r15-1] * rbp
    150 	cmpq	%r9,%r15
    151 	jne	.Linner
    152 
    153 	addq	%rax,%r13
    154 	movq	(%rsi),%rax	# preload rsi[0] for the next row
    155 	adcq	$0,%rdx
    156 	addq	%r10,%r13
    157 	movq	(%rsp,%r15,8),%r10	# r10 = tp[r9], the carry word of the previous row
    158 	adcq	$0,%rdx
    159 	movq	%r13,-16(%rsp,%r15,8)	# tp[r9-2]
    160 	movq	%rdx,%r13
    161 
    162 	xorq	%rdx,%rdx
    163 	addq	%r11,%r13
    164 	adcq	$0,%rdx
    165 	addq	%r10,%r13
    166 	adcq	$0,%rdx
    167 	movq	%r13,-8(%rsp,%r9,8)	# tp[r9-1]
    168 	movq	%rdx,(%rsp,%r9,8)	# tp[r9] = carry out of this row
    169 
    170 	leaq	1(%r14),%r14
    171 	cmpq	%r9,%r14
    172 	jb	.Louter
    173 
        # Final step: rdi[] = tp[] - rcx[], then a branch-free choice between
        # tp[] and that difference, controlled by the mask left in rax.
    174 	xorq	%r14,%r14	# r14 = 0; xorq also clears CF so the first sbbq has no borrow-in
    175 	movq	(%rsp),%rax
    176 	leaq	(%rsp),%rsi	# rsi = tp
    177 	movq	%r9,%r15
    178 	jmp	.Lsub
    179 .align	16
    180 .Lsub:	sbbq	(%rcx,%r14,8),%rax
    181 	movq	%rax,(%rdi,%r14,8)	# rdi[i] = tp[i] - rcx[i] - borrow
    182 	movq	8(%rsi,%r14,8),%rax	# preload tp[i+1]; after the last pass this is tp[r9]
    183 	leaq	1(%r14),%r14	# leaq and decq leave CF intact, so the borrow chain survives the loop
    184 	decq	%r15
    185 	jnz	.Lsub
    186 
    187 	sbbq	$0,%rax	# rax = tp[r9] - final borrow; used as the select mask (presumably 0 or all ones -- confirm)
    188 	xorq	%r14,%r14
    189 	movq	%r9,%r15
    190 .align	16
    191 .Lcopy:
    192 	movq	(%rsp,%r14,8),%rsi
    193 	movq	(%rdi,%r14,8),%rcx	# rcx is reused as scratch; the rcx array pointer is no longer needed
    194 	xorq	%rcx,%rsi
    195 	andq	%rax,%rsi
    196 	xorq	%rcx,%rsi	# rsi = ((tp[i] ^ rdi[i]) & rax) ^ rdi[i]: tp[i] when rax is all ones, rdi[i] when rax is 0
    197 	movq	%r14,(%rsp,%r14,8)	# overwrite tp[i] with the index value, wiping the temporary
    198 	movq	%rsi,(%rdi,%r14,8)
    199 	leaq	1(%r14),%r14
    200 	subq	$1,%r15
    201 	jnz	.Lcopy
    202 
    203 	movq	8(%rsp,%r9,8),%rsi	# rsi = stack pointer saved in tp[r9+1]
    204 	movq	$1,%rax	# return value 1
    205 	movq	(%rsi),%r15	# restore in reverse push order
    206 	movq	8(%rsi),%r14
    207 	movq	16(%rsi),%r13
    208 	movq	24(%rsi),%r12
    209 	movq	32(%rsi),%rbp
    210 	movq	40(%rsi),%rbx
    211 	leaq	48(%rsi),%rsp	# step over the six saved registers
    212 .Lmul_epilogue:
    213 	.byte	0xf3,0xc3	# bytes F3 C3 = rep ret
    214 .size	bn_mul_mont,.-bn_mul_mont
    215 .type	bn_mul4x_mont,@function
    216 .align	16
        # bn_mul4x_mont -- same computation as the generic path of bn_mul_mont, with
        # the inner loops unrolled four words per iteration.
        #
        # In the visible code this is reached only by the jumps to .Lmul4x_enter
        # from the bn_mul_mont dispatcher, which guarantees that the count in r9d
        # is a multiple of 4 and at least 8.  Register inputs are the same as for
        # that dispatcher: rdi destination, rsi and rdx sources, rcx the array
        # subtracted at the end, r8 pointer to one qword, r9d word count.
        #
        # tp means the temporary qword array at (%rsp).  The frame holds tp[0..r9],
        # tp[r9+1] = stack pointer to restore, tp[r9+2] = saved rdi (rdi doubles as
        # an accumulator inside the loops).  Returns rax = 1.
    217 bn_mul4x_mont:
    218 .Lmul4x_enter:
    219 	pushq	%rbx
    220 	pushq	%rbp
    221 	pushq	%r12
    222 	pushq	%r13
    223 	pushq	%r14
    224 	pushq	%r15
    225 
    226 	movl	%r9d,%r9d	# zero-extend the 32-bit count to 64 bits
    227 	leaq	4(%r9),%r10
    228 	movq	%rsp,%r11	# r11 = stack pointer after the six pushes
    229 	negq	%r10
    230 	leaq	(%rsp,%r10,8),%rsp	# reserve r9+4 qwords
    231 	andq	$-1024,%rsp	# round the frame base down to a 1024-byte boundary
    232 
    233 	movq	%r11,8(%rsp,%r9,8)	# tp[r9+1] = stack pointer to restore in the epilogue
    234 .Lmul4x_body:
    235 	movq	%rdi,16(%rsp,%r9,8)	# tp[r9+2] = destination pointer; rdi is reused as an accumulator
    236 	movq	%rdx,%r12	# keep the second source pointer; mulq overwrites rdx
    237 	movq	(%r8),%r8	# r8 = the qword that r8 pointed to
    238 	movq	(%r12),%rbx	# rbx = word 0 of the second source
    239 	movq	(%rsi),%rax	# rax = word 0 of the first source
    240 
    241 	xorq	%r14,%r14	# r14 = outer index (word of the second source)
    242 	xorq	%r15,%r15	# r15 = inner index
    243 
    244 	movq	%r8,%rbp
    245 	mulq	%rbx	# rdx:rax = rsi[0] * rbx
    246 	movq	%rax,%r10
    247 	movq	(%rcx),%rax
    248 
    249 	imulq	%r10,%rbp	# rbp = low 64 bits of r8 * r10: the factor applied to rcx[] in this row
    250 	movq	%rdx,%r11
    251 
    252 	mulq	%rbp	# rdx:rax = rcx[0] * rbp
    253 	addq	%rax,%r10	# low word discarded (r10 is overwritten below); only the carry is kept
    254 	movq	8(%rsi),%rax
    255 	adcq	$0,%rdx
    256 	movq	%rdx,%rdi
    257 
    258 	mulq	%rbx	# rdx:rax = rsi[1] * rbx
    259 	addq	%rax,%r11
    260 	movq	8(%rcx),%rax
    261 	adcq	$0,%rdx
    262 	movq	%rdx,%r10
    263 
    264 	mulq	%rbp	# rdx:rax = rcx[1] * rbp
    265 	addq	%rax,%rdi
    266 	movq	16(%rsi),%rax
    267 	adcq	$0,%rdx
    268 	addq	%r11,%rdi
    269 	leaq	4(%r15),%r15	# r15 = 4; leaq leaves CF intact for the adcq below
    270 	adcq	$0,%rdx
    271 	movq	%rdi,(%rsp)	# tp[0]
    272 	movq	%rdx,%r13
    273 	jmp	.L1st4x
        # First row, four columns per iteration.  r10/r11 alternate as carries of
        # the rsi[]*rbx products, r13/rdi alternate as accumulators of the
        # rcx[]*rbp products.  On entry rax = rsi[r15-2].
    274 .align	16
    275 .L1st4x:
    276 	mulq	%rbx
    277 	addq	%rax,%r10
    278 	movq	-16(%rcx,%r15,8),%rax
    279 	adcq	$0,%rdx
    280 	movq	%rdx,%r11
    281 
    282 	mulq	%rbp
    283 	addq	%rax,%r13
    284 	movq	-8(%rsi,%r15,8),%rax
    285 	adcq	$0,%rdx
    286 	addq	%r10,%r13
    287 	adcq	$0,%rdx
    288 	movq	%r13,-24(%rsp,%r15,8)	# tp[r15-3]
    289 	movq	%rdx,%rdi
    290 
    291 	mulq	%rbx
    292 	addq	%rax,%r11
    293 	movq	-8(%rcx,%r15,8),%rax
    294 	adcq	$0,%rdx
    295 	movq	%rdx,%r10
    296 
    297 	mulq	%rbp
    298 	addq	%rax,%rdi
    299 	movq	(%rsi,%r15,8),%rax
    300 	adcq	$0,%rdx
    301 	addq	%r11,%rdi
    302 	adcq	$0,%rdx
    303 	movq	%rdi,-16(%rsp,%r15,8)	# tp[r15-2]
    304 	movq	%rdx,%r13
    305 
    306 	mulq	%rbx
    307 	addq	%rax,%r10
    308 	movq	(%rcx,%r15,8),%rax
    309 	adcq	$0,%rdx
    310 	movq	%rdx,%r11
    311 
    312 	mulq	%rbp
    313 	addq	%rax,%r13
    314 	movq	8(%rsi,%r15,8),%rax
    315 	adcq	$0,%rdx
    316 	addq	%r10,%r13
    317 	adcq	$0,%rdx
    318 	movq	%r13,-8(%rsp,%r15,8)	# tp[r15-1]
    319 	movq	%rdx,%rdi
    320 
    321 	mulq	%rbx
    322 	addq	%rax,%r11
    323 	movq	8(%rcx,%r15,8),%rax
    324 	adcq	$0,%rdx
    325 	leaq	4(%r15),%r15	# advance four words; offsets below use the new r15
    326 	movq	%rdx,%r10
    327 
    328 	mulq	%rbp
    329 	addq	%rax,%rdi
    330 	movq	-16(%rsi,%r15,8),%rax
    331 	adcq	$0,%rdx
    332 	addq	%r11,%rdi
    333 	adcq	$0,%rdx
    334 	movq	%rdi,-32(%rsp,%r15,8)	# tp[r15-4], i.e. tp[r15 before the increment]
    335 	movq	%rdx,%r13
    336 	cmpq	%r9,%r15
    337 	jb	.L1st4x
    338 
        # Tail of the first row: the last two columns plus the carry word.
    339 	mulq	%rbx
    340 	addq	%rax,%r10
    341 	movq	-16(%rcx,%r15,8),%rax
    342 	adcq	$0,%rdx
    343 	movq	%rdx,%r11
    344 
    345 	mulq	%rbp
    346 	addq	%rax,%r13
    347 	movq	-8(%rsi,%r15,8),%rax
    348 	adcq	$0,%rdx
    349 	addq	%r10,%r13
    350 	adcq	$0,%rdx
    351 	movq	%r13,-24(%rsp,%r15,8)	# tp[r15-3]
    352 	movq	%rdx,%rdi
    353 
    354 	mulq	%rbx
    355 	addq	%rax,%r11
    356 	movq	-8(%rcx,%r15,8),%rax
    357 	adcq	$0,%rdx
    358 	movq	%rdx,%r10
    359 
    360 	mulq	%rbp
    361 	addq	%rax,%rdi
    362 	movq	(%rsi),%rax	# preload rsi[0] for the next row
    363 	adcq	$0,%rdx
    364 	addq	%r11,%rdi
    365 	adcq	$0,%rdx
    366 	movq	%rdi,-16(%rsp,%r15,8)	# tp[r15-2]
    367 	movq	%rdx,%r13
    368 
    369 	xorq	%rdi,%rdi
    370 	addq	%r10,%r13
    371 	adcq	$0,%rdi
    372 	movq	%r13,-8(%rsp,%r15,8)	# tp[r15-1]
    373 	movq	%rdi,(%rsp,%r15,8)	# tp[r15] = carry out of the row
    374 
    375 	leaq	1(%r14),%r14
        # Remaining rows: as above, but the previous tp contents are added into
        # each column.
    376 .align	4
    377 .Louter4x:
    378 	movq	(%r12,%r14,8),%rbx	# rbx = next word of the second source
    379 	xorq	%r15,%r15
    380 	movq	(%rsp),%r10	# r10 = tp[0] from the previous row
    381 	movq	%r8,%rbp
    382 	mulq	%rbx	# rdx:rax = rsi[0] * rbx (rax was preloaded with rsi[0])
    383 	addq	%rax,%r10
    384 	movq	(%rcx),%rax
    385 	adcq	$0,%rdx
    386 
    387 	imulq	%r10,%rbp	# rbp = low 64 bits of r8 * r10: factor for rcx[] in this row
    388 	movq	%rdx,%r11
    389 
    390 	mulq	%rbp	# rdx:rax = rcx[0] * rbp
    391 	addq	%rax,%r10	# low word discarded (r10 is overwritten below); only the carry is kept
    392 	movq	8(%rsi),%rax
    393 	adcq	$0,%rdx
    394 	movq	%rdx,%rdi
    395 
    396 	mulq	%rbx
    397 	addq	%rax,%r11
    398 	movq	8(%rcx),%rax
    399 	adcq	$0,%rdx
    400 	addq	8(%rsp),%r11	# add tp[1] from the previous row
    401 	adcq	$0,%rdx
    402 	movq	%rdx,%r10
    403 
    404 	mulq	%rbp
    405 	addq	%rax,%rdi
    406 	movq	16(%rsi),%rax
    407 	adcq	$0,%rdx
    408 	addq	%r11,%rdi
    409 	leaq	4(%r15),%r15	# r15 = 4; leaq leaves CF intact for the adcq below
    410 	adcq	$0,%rdx
    411 	movq	%rdi,(%rsp)	# tp[0]
    412 	movq	%rdx,%r13
    413 	jmp	.Linner4x
    414 .align	16
    415 .Linner4x:
    416 	mulq	%rbx
    417 	addq	%rax,%r10
    418 	movq	-16(%rcx,%r15,8),%rax
    419 	adcq	$0,%rdx
    420 	addq	-16(%rsp,%r15,8),%r10	# add tp[r15-2] from the previous row
    421 	adcq	$0,%rdx
    422 	movq	%rdx,%r11
    423 
    424 	mulq	%rbp
    425 	addq	%rax,%r13
    426 	movq	-8(%rsi,%r15,8),%rax
    427 	adcq	$0,%rdx
    428 	addq	%r10,%r13
    429 	adcq	$0,%rdx
    430 	movq	%r13,-24(%rsp,%r15,8)	# tp[r15-3]
    431 	movq	%rdx,%rdi
    432 
    433 	mulq	%rbx
    434 	addq	%rax,%r11
    435 	movq	-8(%rcx,%r15,8),%rax
    436 	adcq	$0,%rdx
    437 	addq	-8(%rsp,%r15,8),%r11	# add tp[r15-1] from the previous row
    438 	adcq	$0,%rdx
    439 	movq	%rdx,%r10
    440 
    441 	mulq	%rbp
    442 	addq	%rax,%rdi
    443 	movq	(%rsi,%r15,8),%rax
    444 	adcq	$0,%rdx
    445 	addq	%r11,%rdi
    446 	adcq	$0,%rdx
    447 	movq	%rdi,-16(%rsp,%r15,8)	# tp[r15-2]
    448 	movq	%rdx,%r13
    449 
    450 	mulq	%rbx
    451 	addq	%rax,%r10
    452 	movq	(%rcx,%r15,8),%rax
    453 	adcq	$0,%rdx
    454 	addq	(%rsp,%r15,8),%r10	# add tp[r15] from the previous row
    455 	adcq	$0,%rdx
    456 	movq	%rdx,%r11
    457 
    458 	mulq	%rbp
    459 	addq	%rax,%r13
    460 	movq	8(%rsi,%r15,8),%rax
    461 	adcq	$0,%rdx
    462 	addq	%r10,%r13
    463 	adcq	$0,%rdx
    464 	movq	%r13,-8(%rsp,%r15,8)	# tp[r15-1]
    465 	movq	%rdx,%rdi
    466 
    467 	mulq	%rbx
    468 	addq	%rax,%r11
    469 	movq	8(%rcx,%r15,8),%rax
    470 	adcq	$0,%rdx
    471 	addq	8(%rsp,%r15,8),%r11	# add tp[r15+1] from the previous row
    472 	adcq	$0,%rdx
    473 	leaq	4(%r15),%r15	# advance four words; offsets below use the new r15
    474 	movq	%rdx,%r10
    475 
    476 	mulq	%rbp
    477 	addq	%rax,%rdi
    478 	movq	-16(%rsi,%r15,8),%rax
    479 	adcq	$0,%rdx
    480 	addq	%r11,%rdi
    481 	adcq	$0,%rdx
    482 	movq	%rdi,-32(%rsp,%r15,8)	# tp[r15-4], i.e. tp[r15 before the increment]
    483 	movq	%rdx,%r13
    484 	cmpq	%r9,%r15
    485 	jb	.Linner4x
    486 
        # Tail of the row: the last two columns plus the carry word.
    487 	mulq	%rbx
    488 	addq	%rax,%r10
    489 	movq	-16(%rcx,%r15,8),%rax
    490 	adcq	$0,%rdx
    491 	addq	-16(%rsp,%r15,8),%r10	# add tp[r15-2] from the previous row
    492 	adcq	$0,%rdx
    493 	movq	%rdx,%r11
    494 
    495 	mulq	%rbp
    496 	addq	%rax,%r13
    497 	movq	-8(%rsi,%r15,8),%rax
    498 	adcq	$0,%rdx
    499 	addq	%r10,%r13
    500 	adcq	$0,%rdx
    501 	movq	%r13,-24(%rsp,%r15,8)	# tp[r15-3]
    502 	movq	%rdx,%rdi
    503 
    504 	mulq	%rbx
    505 	addq	%rax,%r11
    506 	movq	-8(%rcx,%r15,8),%rax
    507 	adcq	$0,%rdx
    508 	addq	-8(%rsp,%r15,8),%r11	# add tp[r15-1] from the previous row
    509 	adcq	$0,%rdx
    510 	leaq	1(%r14),%r14	# next outer index
    511 	movq	%rdx,%r10
    512 
    513 	mulq	%rbp
    514 	addq	%rax,%rdi
    515 	movq	(%rsi),%rax	# preload rsi[0] for the next row
    516 	adcq	$0,%rdx
    517 	addq	%r11,%rdi
    518 	adcq	$0,%rdx
    519 	movq	%rdi,-16(%rsp,%r15,8)	# tp[r15-2]
    520 	movq	%rdx,%r13
    521 
    522 	xorq	%rdi,%rdi
    523 	addq	%r10,%r13
    524 	adcq	$0,%rdi
    525 	addq	(%rsp,%r9,8),%r13	# add tp[r9], the carry word of the previous row
    526 	adcq	$0,%rdi
    527 	movq	%r13,-8(%rsp,%r15,8)	# tp[r15-1]
    528 	movq	%rdi,(%rsp,%r15,8)	# tp[r15] = carry out of this row
    529 
    530 	cmpq	%r9,%r14
    531 	jb	.Louter4x
        # Final step: rdi[] = tp[] - rcx[] four words per iteration, then a
        # branch-free choice between tp[] and that difference using SSE2.
    532 	movq	16(%rsp,%r9,8),%rdi	# reload the destination pointer from tp[r9+2]
    533 	movq	0(%rsp),%rax	# rax = tp[0]
    534 	movq	8(%rsp),%rdx	# rdx = tp[1]
    535 	shrq	$2,%r9	# r9 = count / 4 (restored by the shlq before the epilogue)
    536 	leaq	(%rsp),%rsi	# rsi = tp
    537 	xorq	%r14,%r14
    538 
    539 	subq	0(%rcx),%rax	# starts the borrow chain
    540 	movq	16(%rsi),%rbx	# rbx = tp[2]
    541 	movq	24(%rsi),%rbp	# rbp = tp[3]
    542 	sbbq	8(%rcx),%rdx
    543 	leaq	-1(%r9),%r15	# loop count = count/4 - 1; the code after the loop handles the last group
    544 	jmp	.Lsub4x
        # The movq, leaq and decq instructions in this loop leave CF intact, so the
        # borrow chain runs through every sbbq.  The loop count is nonzero only
        # because the dispatcher guarantees at least 8 words.
    545 .align	16
    546 .Lsub4x:
    547 	movq	%rax,0(%rdi,%r14,8)
    548 	movq	%rdx,8(%rdi,%r14,8)
    549 	sbbq	16(%rcx,%r14,8),%rbx
    550 	movq	32(%rsi,%r14,8),%rax
    551 	movq	40(%rsi,%r14,8),%rdx
    552 	sbbq	24(%rcx,%r14,8),%rbp
    553 	movq	%rbx,16(%rdi,%r14,8)
    554 	movq	%rbp,24(%rdi,%r14,8)
    555 	sbbq	32(%rcx,%r14,8),%rax
    556 	movq	48(%rsi,%r14,8),%rbx
    557 	movq	56(%rsi,%r14,8),%rbp
    558 	sbbq	40(%rcx,%r14,8),%rdx
    559 	leaq	4(%r14),%r14
    560 	decq	%r15
    561 	jnz	.Lsub4x
    562 
    563 	movq	%rax,0(%rdi,%r14,8)
    564 	movq	32(%rsi,%r14,8),%rax	# rax = tp[r14+4]; r14 is count-4 here, so this is the carry word tp[count]
    565 	sbbq	16(%rcx,%r14,8),%rbx
    566 	movq	%rdx,8(%rdi,%r14,8)
    567 	sbbq	24(%rcx,%r14,8),%rbp
    568 	movq	%rbx,16(%rdi,%r14,8)
    569 
    570 	sbbq	$0,%rax	# rax = carry word - final borrow; the select mask (presumably 0 or all ones -- confirm)
    571 	movq	%rax,%xmm0
    572 	punpcklqdq	%xmm0,%xmm0	# copy the mask into both 64-bit halves of xmm0
    573 	movq	%rbp,24(%rdi,%r14,8)
    574 	xorq	%r14,%r14	# r14 becomes a byte offset in the loop below
    575 
    576 	movq	%r9,%r15	# count/4 iterations of 32 bytes each
    577 	pxor	%xmm5,%xmm5	# xmm5 = 0, used to wipe tp
    578 	jmp	.Lcopy4x
    579 .align	16
    580 .Lcopy4x:
    581 	movdqu	(%rsp,%r14,1),%xmm2
    582 	movdqu	16(%rsp,%r14,1),%xmm4
    583 	movdqu	(%rdi,%r14,1),%xmm1
    584 	movdqu	16(%rdi,%r14,1),%xmm3
    585 	pxor	%xmm1,%xmm2
    586 	pxor	%xmm3,%xmm4
    587 	pand	%xmm0,%xmm2
    588 	pand	%xmm0,%xmm4
    589 	pxor	%xmm1,%xmm2	# ((tp ^ rdi) & mask) ^ rdi: tp when the mask is all ones, rdi when it is 0
    590 	pxor	%xmm3,%xmm4
    591 	movdqu	%xmm2,(%rdi,%r14,1)
    592 	movdqu	%xmm4,16(%rdi,%r14,1)
    593 	movdqa	%xmm5,(%rsp,%r14,1)	# zero the temporary; aligned store is valid because rsp is 1024-byte aligned
    594 	movdqa	%xmm5,16(%rsp,%r14,1)
    595 
    596 	leaq	32(%r14),%r14
    597 	decq	%r15
    598 	jnz	.Lcopy4x
    599 
    600 	shlq	$2,%r9	# r9 = word count again
    601 	movq	8(%rsp,%r9,8),%rsi	# rsi = stack pointer saved in tp[r9+1]
    602 	movq	$1,%rax	# return value 1
    603 	movq	(%rsi),%r15	# restore in reverse push order
    604 	movq	8(%rsi),%r14
    605 	movq	16(%rsi),%r13
    606 	movq	24(%rsi),%r12
    607 	movq	32(%rsi),%rbp
    608 	movq	40(%rsi),%rbx
    609 	leaq	48(%rsi),%rsp	# step over the six saved registers
    610 .Lmul4x_epilogue:
    611 	.byte	0xf3,0xc3	# bytes F3 C3 = rep ret
    612 .size	bn_mul4x_mont,.-bn_mul4x_mont
    613 
    614 
    615 .type	bn_sqr8x_mont,@function
    616 .align	32
        # bn_sqr8x_mont -- squaring path.  In the visible code this is reached only
        # by the jump to .Lsqr8x_enter from the bn_mul_mont dispatcher, taken when
        # both source pointers are equal and the word count in r9d is a multiple
        # of 8.  Register inputs are the same as for that dispatcher.
        #
        # This routine only builds the stack frame, copies the rcx array into it
        # with every qword widened to a 16-byte slot, hands values over in
        # xmm1-xmm3 and calls bn_sqr8x_internal, which is defined outside this
        # view.  Afterwards it zeroes the frame and returns rax = 1.
        #
        # Frame slots written here: 32(%rsp) = the qword loaded through r8,
        # 40(%rsp) = the stack pointer as it was on entry.
    617 bn_sqr8x_mont:
    618 .Lsqr8x_enter:
    619 	movq	%rsp,%rax	# rax = stack pointer on entry (before the pushes)
    620 	pushq	%rbx
    621 	pushq	%rbp
    622 	pushq	%r12
    623 	pushq	%r13
    624 	pushq	%r14
    625 	pushq	%r15
    626 
    627 	movl	%r9d,%r10d	# r10 = zero-extended word count
    628 	shll	$3,%r9d	# r9 = count * 8 (size in bytes)
    629 	shlq	$3+2,%r10	# r10 = count * 32
    630 	negq	%r9	# r9 = -(count * 8)
    631 
    632 
    633 
    634 
    635 
    636 
        # Frame placement.  NOTE(review): the arithmetic below adjusts the frame
        # base according to its distance from rsi modulo 4096; the reason is not
        # stated in this file -- confirm the intent against the generator source.
    637 	leaq	-64(%rsp,%r9,4),%r11	# r11 = rsp - 64 - 32*count (tentative frame base)
    638 	movq	(%r8),%r8	# r8 = the qword that r8 pointed to
    639 	subq	%rsi,%r11
    640 	andq	$4095,%r11	# r11 = (tentative base - rsi) mod 4096
    641 	cmpq	%r11,%r10
    642 	jb	.Lsqr8x_sp_alt	# taken when 32*count < r11 (unsigned)
    643 	subq	%r11,%rsp	# lower the stack pointer by that distance first
    644 	leaq	-64(%rsp,%r9,4),%rsp	# then reserve 64 + 32*count bytes
    645 	jmp	.Lsqr8x_sp_done
    646 
    647 .align	32
    648 .Lsqr8x_sp_alt:
    649 	leaq	4096-64(,%r9,4),%r10	# r10 = 4096 - 64 - 32*count
    650 	leaq	-64(%rsp,%r9,4),%rsp	# reserve 64 + 32*count bytes
    651 	subq	%r10,%r11	# CF is set when r11 was below r10
    652 	movq	$0,%r10	# movq rather than xorq so the flags from subq survive
    653 	cmovcq	%r10,%r11	# on borrow use 0 instead of the wrapped difference
    654 	subq	%r11,%rsp
    655 .Lsqr8x_sp_done:
    656 	andq	$-64,%rsp	# align the frame base to 64 bytes
    657 	movq	%r9,%r10	# r10 = -(count * 8)
    658 	negq	%r9	# r9 = count * 8
    659 
    660 	leaq	64(%rsp,%r9,2),%r11	# r11 = rsp + 64 + 16*count: destination of the copy loop below
    661 	movq	%r8,32(%rsp)
    662 	movq	%rax,40(%rsp)	# saved entry stack pointer, reloaded before the epilogue
    663 .Lsqr8x_body:
    664 
    665 	movq	%r9,%rbp
    666 .byte	102,73,15,110,211	# encodes movq %r11,%xmm2
    667 	shrq	$3+2,%rbp	# rbp = count / 4 iterations of the copy loop
    668 	movl	OPENSSL_ia32cap_P+8(%rip),%eax	# eax is not read again in this block (rax is overwritten after the call); presumably consumed by bn_sqr8x_internal -- verify
    669 	jmp	.Lsqr8x_copy_n
    670 
        # Copy four qwords of the rcx array per iteration.  Each movq load fills
        # the low half of an xmm register and zeroes the high half, so every
        # qword lands in its own 16-byte slot at r11.
    671 .align	32
    672 .Lsqr8x_copy_n:
    673 	movq	0(%rcx),%xmm0
    674 	movq	8(%rcx),%xmm1
    675 	movq	16(%rcx),%xmm3
    676 	movq	24(%rcx),%xmm4
    677 	leaq	32(%rcx),%rcx
    678 	movdqa	%xmm0,0(%r11)
    679 	movdqa	%xmm1,16(%r11)
    680 	movdqa	%xmm3,32(%r11)
    681 	movdqa	%xmm4,48(%r11)
    682 	leaq	64(%r11),%r11
    683 	decq	%rbp
    684 	jnz	.Lsqr8x_copy_n
    685 
    686 	pxor	%xmm0,%xmm0
    687 .byte	102,72,15,110,207	# encodes movq %rdi,%xmm1 (destination pointer)
    688 .byte	102,73,15,110,218	# encodes movq %r10,%xmm3 (r10 = -(count * 8))
    689 	call	bn_sqr8x_internal	# defined outside this view; rsp is 64-byte aligned at this call
    690 
        # Wipe the frame.  The addresses and loop count below rely on r9 still
        # being count * 8 after the call -- TODO confirm against bn_sqr8x_internal.
    691 	pxor	%xmm0,%xmm0
    692 	leaq	48(%rsp),%rax
    693 	leaq	64(%rsp,%r9,2),%rdx
    694 	shrq	$3+2,%r9	# number of 64-byte chunks to clear at each pointer
    695 	movq	40(%rsp),%rsi	# rsi = entry stack pointer; read before the wipe overwrites this slot
    696 	jmp	.Lsqr8x_zero
    697 
    698 .align	32
    699 .Lsqr8x_zero:
    700 	movdqa	%xmm0,0(%rax)
    701 	movdqa	%xmm0,16(%rax)
    702 	movdqa	%xmm0,32(%rax)
    703 	movdqa	%xmm0,48(%rax)
    704 	leaq	64(%rax),%rax
    705 	movdqa	%xmm0,0(%rdx)
    706 	movdqa	%xmm0,16(%rdx)
    707 	movdqa	%xmm0,32(%rdx)
    708 	movdqa	%xmm0,48(%rdx)
    709 	leaq	64(%rdx),%rdx
    710 	decq	%r9
    711 	jnz	.Lsqr8x_zero
    712 
    713 	movq	$1,%rax	# return value 1
    714 	movq	-48(%rsi),%r15	# restore the pushed registers, addressed below the entry stack pointer
    715 	movq	-40(%rsi),%r14
    716 	movq	-32(%rsi),%r13
    717 	movq	-24(%rsi),%r12
    718 	movq	-16(%rsi),%rbp
    719 	movq	-8(%rsi),%rbx
    720 	leaq	(%rsi),%rsp	# rsp = entry stack pointer
    721 .Lsqr8x_epilogue:
    722 	.byte	0xf3,0xc3	# bytes F3 C3 = rep ret
    723 .size	bn_sqr8x_mont,.-bn_sqr8x_mont
    724 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# ASCII for: Montgomery Multiplication for x86_64, CRYPTOGAMS by appro@openssl.org (address enclosed in angle brackets), followed by a NUL terminator
    725 .align	16
    726 #endif
    727