Home | History | Annotate | Download | only in bn
      1 #if defined(__x86_64__)
      2 .text
      3 
      4 .extern	OPENSSL_ia32cap_P
      5 .hidden OPENSSL_ia32cap_P
      6 
      7 .globl	bn_mul_mont
      8 .hidden bn_mul_mont
      9 .type	bn_mul_mont,@function
     10 .align	16
        /*
         * bn_mul_mont -- Montgomery multiplication entry point
         * (x86-64, AT&T syntax, System V register arguments).
         *
         * Registers on entry, as used by the code below:
         *   rdi  result array "rp" (written in .Lsub and .Lcopy)
         *   rsi  first operand array "a"
         *   rdx  second operand array "b"
         *   rcx  modulus array "n"
         *   r8   pointer to one 64-bit word "n0"; only (%r8) is read
         *   r9d  number of 64-bit words "num" in each array
         * NOTE(review): presumably the C prototype is
         *   int bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm in the header.
         * Returns 1 in rax.
         *
         * Dispatch performed right at the entry:
         *   num not a multiple of 4, or num < 8  -> generic loop (.Lmul_enter)
         *   a != b (pointer compare)             -> .Lmul4x_enter
         *   a == b and num a multiple of 8       -> .Lsqr8x_enter
         *   a == b otherwise                     -> .Lmul4x_enter
         *
         * Stack frame of the generic path (tp = rsp after allocation):
         *   tp[0 .. num]   running result, num+1 words
         *   tp[num+1]      value of rsp right after the six pushes
         */
     11 bn_mul_mont:
     12 	testl	$3,%r9d
     13 	jnz	.Lmul_enter
     14 	cmpl	$8,%r9d
     15 	jb	.Lmul_enter
     16 	cmpq	%rsi,%rdx
     17 	jne	.Lmul4x_enter
     18 	testl	$7,%r9d
     19 	jz	.Lsqr8x_enter
     20 	jmp	.Lmul4x_enter
     21 
     22 .align	16
     23 .Lmul_enter:
        /* Save the callee-saved registers that the routine overwrites. */
     24 	pushq	%rbx
     25 	pushq	%rbp
     26 	pushq	%r12
     27 	pushq	%r13
     28 	pushq	%r14
     29 	pushq	%r15
     30 
        /*
         * Zero-extend num, then carve (num+2)*8 bytes off the stack and
         * round rsp down to a 1024-byte boundary.  r11 keeps the pre-
         * allocation rsp, which is parked in frame slot num+1.
         */
     31 	movl	%r9d,%r9d
     32 	leaq	2(%r9),%r10
     33 	movq	%rsp,%r11
     34 	negq	%r10
     35 	leaq	(%rsp,%r10,8),%rsp
     36 	andq	$-1024,%rsp
     37 
     38 	movq	%r11,8(%rsp,%r9,8)
     39 .Lmul_body:
        /*
         * rdx is clobbered by mulq, so b moves to r12.
         * r8 = n0 word, rbx = b[0], rax = a[0].
         */
     40 	movq	%rdx,%r12
     41 	movq	(%r8),%r8
     42 	movq	(%r12),%rbx
     43 	movq	(%rsi),%rax
     44 
        /* r14 = outer index i (over b), r15 = inner index j (over a and n). */
     45 	xorq	%r14,%r14
     46 	xorq	%r15,%r15
     47 
        /*
         * First pass (i = 0), column j = 0:
         *   r11:r10 = a[0]*b[0]
         *   rbp     = n0 * r10 (low 64 bits) -- the per-pass multiplier "m"
         *   r10    += low(n[0]*m); the sum itself is never read again,
         *             only its carry is folded into r13 = high(n[0]*m).
         */
     48 	movq	%r8,%rbp
     49 	mulq	%rbx
     50 	movq	%rax,%r10
     51 	movq	(%rcx),%rax
     52 
     53 	imulq	%r10,%rbp
     54 	movq	%rdx,%r11
     55 
     56 	mulq	%rbp
     57 	addq	%rax,%r10
     58 	movq	8(%rsi),%rax
     59 	adcq	$0,%rdx
     60 	movq	%rdx,%r13
     61 
     62 	leaq	1(%r15),%r15
     63 	jmp	.L1st_enter
     64 
     65 .align	16
        /*
         * First-pass loop over j = 1 .. num-1.  Two carry chains run side by
         * side: a[j]*b[0] (r11 = current word, r10 = next carry) and n[j]*m
         * (r13).  Each finished word is stored at tp[j-1], i.e. one word
         * lower than the column it was computed for.
         */
     66 .L1st:
     67 	addq	%rax,%r13
     68 	movq	(%rsi,%r15,8),%rax
     69 	adcq	$0,%rdx
     70 	addq	%r11,%r13
     71 	movq	%r10,%r11
     72 	adcq	$0,%rdx
     73 	movq	%r13,-16(%rsp,%r15,8)
     74 	movq	%rdx,%r13
     75 
     76 .L1st_enter:
     77 	mulq	%rbx
     78 	addq	%rax,%r11
     79 	movq	(%rcx,%r15,8),%rax
     80 	adcq	$0,%rdx
     81 	leaq	1(%r15),%r15
     82 	movq	%rdx,%r10
     83 
     84 	mulq	%rbp
     85 	cmpq	%r9,%r15
     86 	jne	.L1st
     87 
        /*
         * Tail of the first pass: finish column num-1 (stored at tp[num-2])
         * and reload rax = a[0] for the next pass.
         */
     88 	addq	%rax,%r13
     89 	movq	(%rsi),%rax
     90 	adcq	$0,%rdx
     91 	addq	%r11,%r13
     92 	adcq	$0,%rdx
     93 	movq	%r13,-16(%rsp,%r15,8)
     94 	movq	%rdx,%r13
     95 	movq	%r10,%r11
     96 
        /* tp[num-1] = sum of the two remaining carries, tp[num] = carry out. */
     97 	xorq	%rdx,%rdx
     98 	addq	%r11,%r13
     99 	adcq	$0,%rdx
    100 	movq	%r13,-8(%rsp,%r9,8)
    101 	movq	%rdx,(%rsp,%r9,8)
    102 
    103 	leaq	1(%r14),%r14
    104 	jmp	.Louter
    105 .align	16
        /*
         * Passes i = 1 .. num-1.  Same structure as the first pass, except
         * that the previous tp[] contents are added in as well:
         *   r10 = tp[0] + low(a[0]*b[i]),  m (rbp) = n0 * r10,
         *   then r10 is reloaded with tp[1] for the next column.
         */
    106 .Louter:
    107 	movq	(%r12,%r14,8),%rbx
    108 	xorq	%r15,%r15
    109 	movq	%r8,%rbp
    110 	movq	(%rsp),%r10
    111 	mulq	%rbx
    112 	addq	%rax,%r10
    113 	movq	(%rcx),%rax
    114 	adcq	$0,%rdx
    115 
    116 	imulq	%r10,%rbp
    117 	movq	%rdx,%r11
    118 
    119 	mulq	%rbp
    120 	addq	%rax,%r10
    121 	movq	8(%rsi),%rax
    122 	adcq	$0,%rdx
    123 	movq	8(%rsp),%r10
    124 	movq	%rdx,%r13
    125 
    126 	leaq	1(%r15),%r15
    127 	jmp	.Linner_enter
    128 
    129 .align	16
        /*
         * Inner loop over j = 1 .. num-1: r10 carries tp[j] + low(a[j]*b[i])
         * plus the previous high word, r11 the next high word, r13 the
         * n[j]*m chain.  Results again land at tp[j-1].
         */
    130 .Linner:
    131 	addq	%rax,%r13
    132 	movq	(%rsi,%r15,8),%rax
    133 	adcq	$0,%rdx
    134 	addq	%r10,%r13
    135 	movq	(%rsp,%r15,8),%r10
    136 	adcq	$0,%rdx
    137 	movq	%r13,-16(%rsp,%r15,8)
    138 	movq	%rdx,%r13
    139 
    140 .Linner_enter:
    141 	mulq	%rbx
    142 	addq	%rax,%r11
    143 	movq	(%rcx,%r15,8),%rax
    144 	adcq	$0,%rdx
    145 	addq	%r11,%r10
    146 	movq	%rdx,%r11
    147 	adcq	$0,%r11
    148 	leaq	1(%r15),%r15
    149 
    150 	mulq	%rbp
    151 	cmpq	%r9,%r15
    152 	jne	.Linner
    153 
        /*
         * Tail of pass i: store tp[num-2], reload rax = a[0], and pick up the
         * old top word tp[num] in r10 (r15 == num here).
         */
    154 	addq	%rax,%r13
    155 	movq	(%rsi),%rax
    156 	adcq	$0,%rdx
    157 	addq	%r10,%r13
    158 	movq	(%rsp,%r15,8),%r10
    159 	adcq	$0,%rdx
    160 	movq	%r13,-16(%rsp,%r15,8)
    161 	movq	%rdx,%r13
    162 
        /* tp[num-1] = r13 + r11 + old tp[num]; tp[num] = accumulated carry. */
    163 	xorq	%rdx,%rdx
    164 	addq	%r11,%r13
    165 	adcq	$0,%rdx
    166 	addq	%r10,%r13
    167 	adcq	$0,%rdx
    168 	movq	%r13,-8(%rsp,%r9,8)
    169 	movq	%rdx,(%rsp,%r9,8)
    170 
    171 	leaq	1(%r14),%r14
    172 	cmpq	%r9,%r14
    173 	jb	.Louter
    174 
        /*
         * Final reduction, step 1: rp = tp - n with a borrow chain.
         * The xorq clears CF before the first sbbq; leaq, movq and decq do
         * not modify CF, so the borrow survives from one iteration to the
         * next.  On the last iteration the load at 8(%rsi,...) fetches
         * tp[num].
         */
    175 	xorq	%r14,%r14
    176 	movq	(%rsp),%rax
    177 	leaq	(%rsp),%rsi
    178 	movq	%r9,%r15
    179 	jmp	.Lsub
    180 .align	16
    181 .Lsub:	sbbq	(%rcx,%r14,8),%rax
    182 	movq	%rax,(%rdi,%r14,8)
    183 	movq	8(%rsi,%r14,8),%rax
    184 	leaq	1(%r14),%r14
    185 	decq	%r15
    186 	jnz	.Lsub
    187 
        /*
         * rax = tp[num] - borrow, used below as a select mask
         * (intended to be 0 or all-ones -- NOTE(review): relies on the
         * accumulated value staying below twice the modulus; confirm).
         */
    188 	sbbq	$0,%rax
    189 	xorq	%r14,%r14
    190 	movq	%r9,%r15
    191 .align	16
        /*
         * Final reduction, step 2: branch-free select, word by word:
         *   rp[i] = rp[i] ^ ((tp[i] ^ rp[i]) & mask)
         * so mask == all-ones keeps tp[i], mask == 0 keeps the difference.
         * Each tp[i] is overwritten (with the index i) once it has been read,
         * so the intermediate value does not stay on the stack.
         */
    192 .Lcopy:
    193 	movq	(%rsp,%r14,8),%rsi
    194 	movq	(%rdi,%r14,8),%rcx
    195 	xorq	%rcx,%rsi
    196 	andq	%rax,%rsi
    197 	xorq	%rcx,%rsi
    198 	movq	%r14,(%rsp,%r14,8)
    199 	movq	%rsi,(%rdi,%r14,8)
    200 	leaq	1(%r14),%r14
    201 	subq	$1,%r15
    202 	jnz	.Lcopy
    203 
        /*
         * Epilogue: rsi = rsp as it was after the six pushes (frame slot
         * num+1), return value 1, restore registers in reverse push order
         * and drop the 48 bytes they occupied.
         */
    204 	movq	8(%rsp,%r9,8),%rsi
    205 	movq	$1,%rax
    206 	movq	(%rsi),%r15
    207 	movq	8(%rsi),%r14
    208 	movq	16(%rsi),%r13
    209 	movq	24(%rsi),%r12
    210 	movq	32(%rsi),%rbp
    211 	movq	40(%rsi),%rbx
    212 	leaq	48(%rsi),%rsp
    213 .Lmul_epilogue:
        /* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
    214 	.byte	0xf3,0xc3
    215 .size	bn_mul_mont,.-bn_mul_mont
    216 .type	bn_mul4x_mont,@function
    217 .align	16
        /*
         * bn_mul4x_mont -- Montgomery multiplication, inner loops unrolled to
         * four columns per iteration.  No .globl directive for it is visible
         * here; it is reached through .Lmul4x_enter from bn_mul_mont's
         * dispatcher.
         *
         * Registers on entry (same roles as bn_mul_mont):
         *   rdi = result rp, rsi = a, rdx = b, rcx = n,
         *   r8  = pointer to the n0 word, r9d = num (64-bit words).
         * Requires num to be a multiple of 4 and at least 8: two columns are
         * peeled before each unrolled loop, two after it, and the loop body
         * (four columns) always runs at least once.  The dispatcher in
         * bn_mul_mont only branches here when both conditions hold.
         * Returns 1 in rax.
         *
         * Stack frame (tp = rsp after allocation, 1024-byte aligned):
         *   tp[0 .. num]   running result
         *   tp[num+1]      rsp as it was after the six pushes
         *   tp[num+2]      saved rdi (rdi doubles as an accumulator below)
         */
    218 bn_mul4x_mont:
    219 .Lmul4x_enter:
    220 	pushq	%rbx
    221 	pushq	%rbp
    222 	pushq	%r12
    223 	pushq	%r13
    224 	pushq	%r14
    225 	pushq	%r15
    226 
        /* Zero-extend num; allocate (num+4)*8 bytes; align down to 1024. */
    227 	movl	%r9d,%r9d
    228 	leaq	4(%r9),%r10
    229 	movq	%rsp,%r11
    230 	negq	%r10
    231 	leaq	(%rsp,%r10,8),%rsp
    232 	andq	$-1024,%rsp
    233 
    234 	movq	%r11,8(%rsp,%r9,8)
    235 .Lmul4x_body:
        /*
         * Park rp in the frame, move b to r12 (mulq clobbers rdx), then
         * r8 = n0 word, rbx = b[0], rax = a[0].
         */
    236 	movq	%rdi,16(%rsp,%r9,8)
    237 	movq	%rdx,%r12
    238 	movq	(%r8),%r8
    239 	movq	(%r12),%rbx
    240 	movq	(%rsi),%rax
    241 
        /* r14 = outer index i, r15 = inner index j. */
    242 	xorq	%r14,%r14
    243 	xorq	%r15,%r15
    244 
        /*
         * First pass (i = 0), column 0: r11:r10 = a[0]*b[0],
         * rbp = m = n0 * r10 (low 64 bits); the low word of r10 + n[0]*m is
         * discarded and only its carry goes into rdi.
         */
    245 	movq	%r8,%rbp
    246 	mulq	%rbx
    247 	movq	%rax,%r10
    248 	movq	(%rcx),%rax
    249 
    250 	imulq	%r10,%rbp
    251 	movq	%rdx,%r11
    252 
    253 	mulq	%rbp
    254 	addq	%rax,%r10
    255 	movq	8(%rsi),%rax
    256 	adcq	$0,%rdx
    257 	movq	%rdx,%rdi
    258 
        /* Column 1: a[1]*b[0] and n[1]*m; result stored at tp[0]; r15 = 4. */
    259 	mulq	%rbx
    260 	addq	%rax,%r11
    261 	movq	8(%rcx),%rax
    262 	adcq	$0,%rdx
    263 	movq	%rdx,%r10
    264 
    265 	mulq	%rbp
    266 	addq	%rax,%rdi
    267 	movq	16(%rsi),%rax
    268 	adcq	$0,%rdx
    269 	addq	%r11,%rdi
    270 	leaq	4(%r15),%r15
    271 	adcq	$0,%rdx
    272 	movq	%rdi,(%rsp)
    273 	movq	%rdx,%r13
    274 	jmp	.L1st4x
    275 .align	16
        /*
         * Unrolled first-pass loop.  With r15 as on entry to an iteration it
         * handles columns r15-2 .. r15+1 and stores tp[r15-3 .. r15].
         * The a*b[0] chain alternates between r10 and r11, the n*m chain
         * between r13 and rdi.  r15 advances by 4 inside the fourth column.
         */
    276 .L1st4x:
    277 	mulq	%rbx
    278 	addq	%rax,%r10
    279 	movq	-16(%rcx,%r15,8),%rax
    280 	adcq	$0,%rdx
    281 	movq	%rdx,%r11
    282 
    283 	mulq	%rbp
    284 	addq	%rax,%r13
    285 	movq	-8(%rsi,%r15,8),%rax
    286 	adcq	$0,%rdx
    287 	addq	%r10,%r13
    288 	adcq	$0,%rdx
    289 	movq	%r13,-24(%rsp,%r15,8)
    290 	movq	%rdx,%rdi
    291 
    292 	mulq	%rbx
    293 	addq	%rax,%r11
    294 	movq	-8(%rcx,%r15,8),%rax
    295 	adcq	$0,%rdx
    296 	movq	%rdx,%r10
    297 
    298 	mulq	%rbp
    299 	addq	%rax,%rdi
    300 	movq	(%rsi,%r15,8),%rax
    301 	adcq	$0,%rdx
    302 	addq	%r11,%rdi
    303 	adcq	$0,%rdx
    304 	movq	%rdi,-16(%rsp,%r15,8)
    305 	movq	%rdx,%r13
    306 
    307 	mulq	%rbx
    308 	addq	%rax,%r10
    309 	movq	(%rcx,%r15,8),%rax
    310 	adcq	$0,%rdx
    311 	movq	%rdx,%r11
    312 
    313 	mulq	%rbp
    314 	addq	%rax,%r13
    315 	movq	8(%rsi,%r15,8),%rax
    316 	adcq	$0,%rdx
    317 	addq	%r10,%r13
    318 	adcq	$0,%rdx
    319 	movq	%r13,-8(%rsp,%r15,8)
    320 	movq	%rdx,%rdi
    321 
    322 	mulq	%rbx
    323 	addq	%rax,%r11
    324 	movq	8(%rcx,%r15,8),%rax
    325 	adcq	$0,%rdx
    326 	leaq	4(%r15),%r15
    327 	movq	%rdx,%r10
    328 
    329 	mulq	%rbp
    330 	addq	%rax,%rdi
    331 	movq	-16(%rsi,%r15,8),%rax
    332 	adcq	$0,%rdx
    333 	addq	%r11,%rdi
    334 	adcq	$0,%rdx
    335 	movq	%rdi,-32(%rsp,%r15,8)
    336 	movq	%rdx,%r13
    337 	cmpq	%r9,%r15
    338 	jb	.L1st4x
    339 
        /*
         * Tail of the first pass (r15 == num): columns num-2 and num-1,
         * stored at tp[num-3] and tp[num-2]; rax is reloaded with a[0].
         */
    340 	mulq	%rbx
    341 	addq	%rax,%r10
    342 	movq	-16(%rcx,%r15,8),%rax
    343 	adcq	$0,%rdx
    344 	movq	%rdx,%r11
    345 
    346 	mulq	%rbp
    347 	addq	%rax,%r13
    348 	movq	-8(%rsi,%r15,8),%rax
    349 	adcq	$0,%rdx
    350 	addq	%r10,%r13
    351 	adcq	$0,%rdx
    352 	movq	%r13,-24(%rsp,%r15,8)
    353 	movq	%rdx,%rdi
    354 
    355 	mulq	%rbx
    356 	addq	%rax,%r11
    357 	movq	-8(%rcx,%r15,8),%rax
    358 	adcq	$0,%rdx
    359 	movq	%rdx,%r10
    360 
    361 	mulq	%rbp
    362 	addq	%rax,%rdi
    363 	movq	(%rsi),%rax
    364 	adcq	$0,%rdx
    365 	addq	%r11,%rdi
    366 	adcq	$0,%rdx
    367 	movq	%rdi,-16(%rsp,%r15,8)
    368 	movq	%rdx,%r13
    369 
        /* tp[num-1] = r13 + r10, tp[num] = carry out. */
    370 	xorq	%rdi,%rdi
    371 	addq	%r10,%r13
    372 	adcq	$0,%rdi
    373 	movq	%r13,-8(%rsp,%r15,8)
    374 	movq	%rdi,(%rsp,%r15,8)
    375 
    376 	leaq	1(%r14),%r14
    377 .align	4
        /*
         * Passes i = 1 .. num-1: identical column schedule, but every a*b[i]
         * partial sum also absorbs the matching word of the previous tp[].
         * Column 0: r10 = tp[0] + low(a[0]*b[i]); m (rbp) = n0 * r10.
         */
    378 .Louter4x:
    379 	movq	(%r12,%r14,8),%rbx
    380 	xorq	%r15,%r15
    381 	movq	(%rsp),%r10
    382 	movq	%r8,%rbp
    383 	mulq	%rbx
    384 	addq	%rax,%r10
    385 	movq	(%rcx),%rax
    386 	adcq	$0,%rdx
    387 
    388 	imulq	%r10,%rbp
    389 	movq	%rdx,%r11
    390 
    391 	mulq	%rbp
    392 	addq	%rax,%r10
    393 	movq	8(%rsi),%rax
    394 	adcq	$0,%rdx
    395 	movq	%rdx,%rdi
    396 
        /* Column 1 (adds tp[1]); result stored at tp[0]; r15 = 4. */
    397 	mulq	%rbx
    398 	addq	%rax,%r11
    399 	movq	8(%rcx),%rax
    400 	adcq	$0,%rdx
    401 	addq	8(%rsp),%r11
    402 	adcq	$0,%rdx
    403 	movq	%rdx,%r10
    404 
    405 	mulq	%rbp
    406 	addq	%rax,%rdi
    407 	movq	16(%rsi),%rax
    408 	adcq	$0,%rdx
    409 	addq	%r11,%rdi
    410 	leaq	4(%r15),%r15
    411 	adcq	$0,%rdx
    412 	movq	%rdi,(%rsp)
    413 	movq	%rdx,%r13
    414 	jmp	.Linner4x
    415 .align	16
        /*
         * Unrolled inner loop: columns r15-2 .. r15+1 per iteration, each
         * adding tp[column] before the old word is overwritten one slot
         * lower (tp[r15-3 .. r15]).
         */
    416 .Linner4x:
    417 	mulq	%rbx
    418 	addq	%rax,%r10
    419 	movq	-16(%rcx,%r15,8),%rax
    420 	adcq	$0,%rdx
    421 	addq	-16(%rsp,%r15,8),%r10
    422 	adcq	$0,%rdx
    423 	movq	%rdx,%r11
    424 
    425 	mulq	%rbp
    426 	addq	%rax,%r13
    427 	movq	-8(%rsi,%r15,8),%rax
    428 	adcq	$0,%rdx
    429 	addq	%r10,%r13
    430 	adcq	$0,%rdx
    431 	movq	%r13,-24(%rsp,%r15,8)
    432 	movq	%rdx,%rdi
    433 
    434 	mulq	%rbx
    435 	addq	%rax,%r11
    436 	movq	-8(%rcx,%r15,8),%rax
    437 	adcq	$0,%rdx
    438 	addq	-8(%rsp,%r15,8),%r11
    439 	adcq	$0,%rdx
    440 	movq	%rdx,%r10
    441 
    442 	mulq	%rbp
    443 	addq	%rax,%rdi
    444 	movq	(%rsi,%r15,8),%rax
    445 	adcq	$0,%rdx
    446 	addq	%r11,%rdi
    447 	adcq	$0,%rdx
    448 	movq	%rdi,-16(%rsp,%r15,8)
    449 	movq	%rdx,%r13
    450 
    451 	mulq	%rbx
    452 	addq	%rax,%r10
    453 	movq	(%rcx,%r15,8),%rax
    454 	adcq	$0,%rdx
    455 	addq	(%rsp,%r15,8),%r10
    456 	adcq	$0,%rdx
    457 	movq	%rdx,%r11
    458 
    459 	mulq	%rbp
    460 	addq	%rax,%r13
    461 	movq	8(%rsi,%r15,8),%rax
    462 	adcq	$0,%rdx
    463 	addq	%r10,%r13
    464 	adcq	$0,%rdx
    465 	movq	%r13,-8(%rsp,%r15,8)
    466 	movq	%rdx,%rdi
    467 
    468 	mulq	%rbx
    469 	addq	%rax,%r11
    470 	movq	8(%rcx,%r15,8),%rax
    471 	adcq	$0,%rdx
    472 	addq	8(%rsp,%r15,8),%r11
    473 	adcq	$0,%rdx
    474 	leaq	4(%r15),%r15
    475 	movq	%rdx,%r10
    476 
    477 	mulq	%rbp
    478 	addq	%rax,%rdi
    479 	movq	-16(%rsi,%r15,8),%rax
    480 	adcq	$0,%rdx
    481 	addq	%r11,%rdi
    482 	adcq	$0,%rdx
    483 	movq	%rdi,-32(%rsp,%r15,8)
    484 	movq	%rdx,%r13
    485 	cmpq	%r9,%r15
    486 	jb	.Linner4x
    487 
        /*
         * Tail of pass i (r15 == num): columns num-2 and num-1.  The outer
         * index r14 is bumped in the middle of the sequence (leaq leaves the
         * flags alone), and rax is reloaded with a[0] for the next pass.
         */
    488 	mulq	%rbx
    489 	addq	%rax,%r10
    490 	movq	-16(%rcx,%r15,8),%rax
    491 	adcq	$0,%rdx
    492 	addq	-16(%rsp,%r15,8),%r10
    493 	adcq	$0,%rdx
    494 	movq	%rdx,%r11
    495 
    496 	mulq	%rbp
    497 	addq	%rax,%r13
    498 	movq	-8(%rsi,%r15,8),%rax
    499 	adcq	$0,%rdx
    500 	addq	%r10,%r13
    501 	adcq	$0,%rdx
    502 	movq	%r13,-24(%rsp,%r15,8)
    503 	movq	%rdx,%rdi
    504 
    505 	mulq	%rbx
    506 	addq	%rax,%r11
    507 	movq	-8(%rcx,%r15,8),%rax
    508 	adcq	$0,%rdx
    509 	addq	-8(%rsp,%r15,8),%r11
    510 	adcq	$0,%rdx
    511 	leaq	1(%r14),%r14
    512 	movq	%rdx,%r10
    513 
    514 	mulq	%rbp
    515 	addq	%rax,%rdi
    516 	movq	(%rsi),%rax
    517 	adcq	$0,%rdx
    518 	addq	%r11,%rdi
    519 	adcq	$0,%rdx
    520 	movq	%rdi,-16(%rsp,%r15,8)
    521 	movq	%rdx,%r13
    522 
        /* tp[num-1] = r13 + r10 + old tp[num]; tp[num] = accumulated carry. */
    523 	xorq	%rdi,%rdi
    524 	addq	%r10,%r13
    525 	adcq	$0,%rdi
    526 	addq	(%rsp,%r9,8),%r13
    527 	adcq	$0,%rdi
    528 	movq	%r13,-8(%rsp,%r15,8)
    529 	movq	%rdi,(%rsp,%r15,8)
    530 
    531 	cmpq	%r9,%r14
    532 	jb	.Louter4x
        /*
         * Final reduction, step 1: rp = tp - n, four words per iteration.
         * rdi = saved rp, r9 = num/4, rsi = tp, r14 = word index.
         * subq starts the borrow chain; every instruction between the
         * sbbq's (movq, leaq, decq) leaves CF untouched.
         * r15 = num/4 - 1 loop iterations, the last group is finished below.
         */
    533 	movq	16(%rsp,%r9,8),%rdi
    534 	movq	0(%rsp),%rax
    535 	movq	8(%rsp),%rdx
    536 	shrq	$2,%r9
    537 	leaq	(%rsp),%rsi
    538 	xorq	%r14,%r14
    539 
    540 	subq	0(%rcx),%rax
    541 	movq	16(%rsi),%rbx
    542 	movq	24(%rsi),%rbp
    543 	sbbq	8(%rcx),%rdx
    544 	leaq	-1(%r9),%r15
    545 	jmp	.Lsub4x
    546 .align	16
    547 .Lsub4x:
    548 	movq	%rax,0(%rdi,%r14,8)
    549 	movq	%rdx,8(%rdi,%r14,8)
    550 	sbbq	16(%rcx,%r14,8),%rbx
    551 	movq	32(%rsi,%r14,8),%rax
    552 	movq	40(%rsi,%r14,8),%rdx
    553 	sbbq	24(%rcx,%r14,8),%rbp
    554 	movq	%rbx,16(%rdi,%r14,8)
    555 	movq	%rbp,24(%rdi,%r14,8)
    556 	sbbq	32(%rcx,%r14,8),%rax
    557 	movq	48(%rsi,%r14,8),%rbx
    558 	movq	56(%rsi,%r14,8),%rbp
    559 	sbbq	40(%rcx,%r14,8),%rdx
    560 	leaq	4(%r14),%r14
    561 	decq	%r15
    562 	jnz	.Lsub4x
    563 
        /*
         * Last group (r14 == num-4): store the remaining words and fetch
         * tp[num] through 32(%rsi,%r14,8).
         */
    564 	movq	%rax,0(%rdi,%r14,8)
    565 	movq	32(%rsi,%r14,8),%rax
    566 	sbbq	16(%rcx,%r14,8),%rbx
    567 	movq	%rdx,8(%rdi,%r14,8)
    568 	sbbq	24(%rcx,%r14,8),%rbp
    569 	movq	%rbx,16(%rdi,%r14,8)
    570 
        /*
         * rax = tp[num] - borrow becomes the select mask (intended to be 0
         * or all-ones -- NOTE(review): confirm the value range), broadcast
         * to both halves of xmm0.  r14 is reused as a byte offset, r15 counts
         * num/4 iterations, xmm5 = 0 is the wipe value.
         */
    571 	sbbq	$0,%rax
    572 	movq	%rax,%xmm0
    573 	punpcklqdq	%xmm0,%xmm0
    574 	movq	%rbp,24(%rdi,%r14,8)
    575 	xorq	%r14,%r14
    576 
    577 	movq	%r9,%r15
    578 	pxor	%xmm5,%xmm5
    579 	jmp	.Lcopy4x
    580 .align	16
        /*
         * Final reduction, step 2: branch-free select, 32 bytes at a time:
         *   rp = rp ^ ((tp ^ rp) & mask)
         * Loads/stores on rp are unaligned (movdqu); the zeroing stores to tp
         * use movdqa, which is valid because rsp was aligned to 1024 bytes in
         * the prologue and the offset advances in steps of 32.
         */
    581 .Lcopy4x:
    582 	movdqu	(%rsp,%r14,1),%xmm2
    583 	movdqu	16(%rsp,%r14,1),%xmm4
    584 	movdqu	(%rdi,%r14,1),%xmm1
    585 	movdqu	16(%rdi,%r14,1),%xmm3
    586 	pxor	%xmm1,%xmm2
    587 	pxor	%xmm3,%xmm4
    588 	pand	%xmm0,%xmm2
    589 	pand	%xmm0,%xmm4
    590 	pxor	%xmm1,%xmm2
    591 	pxor	%xmm3,%xmm4
    592 	movdqu	%xmm2,(%rdi,%r14,1)
    593 	movdqu	%xmm4,16(%rdi,%r14,1)
    594 	movdqa	%xmm5,(%rsp,%r14,1)
    595 	movdqa	%xmm5,16(%rsp,%r14,1)
    596 
    597 	leaq	32(%r14),%r14
    598 	decq	%r15
    599 	jnz	.Lcopy4x
    600 
        /*
         * Epilogue: turn r9 back into num to locate frame slot num+1 (saved
         * rsp), return 1, restore registers in reverse push order.
         */
    601 	shlq	$2,%r9
    602 	movq	8(%rsp,%r9,8),%rsi
    603 	movq	$1,%rax
    604 	movq	(%rsi),%r15
    605 	movq	8(%rsi),%r14
    606 	movq	16(%rsi),%r13
    607 	movq	24(%rsi),%r12
    608 	movq	32(%rsi),%rbp
    609 	movq	40(%rsi),%rbx
    610 	leaq	48(%rsi),%rsp
    611 .Lmul4x_epilogue:
        /* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
    612 	.byte	0xf3,0xc3
    613 .size	bn_mul4x_mont,.-bn_mul4x_mont
    614 .extern	bn_sqr8x_internal
    615 .hidden bn_sqr8x_internal
    616 
    617 .type	bn_sqr8x_mont,@function
    618 .align	32
        /*
         * bn_sqr8x_mont -- wrapper used when both operands are the same array
         * and num is a multiple of 8 (see the dispatcher in bn_mul_mont).
         * It builds a stack frame, copies the modulus into it, calls the
         * external routine bn_sqr8x_internal, wipes the frame and returns 1.
         *
         * Registers on entry: rdi = result rp, rsi = a, rcx = n,
         * r8 = pointer to the n0 word, r9d = num (64-bit words).
         *
         * Frame after .Lsqr8x_sp_done (rsp 64-byte aligned):
         *   32(%rsp)               n0 word
         *   40(%rsp)               rsp as it was on entry (before the pushes)
         *   64+16*num(%rsp) ...    copy of n, one word per 16-byte slot
         * NOTE(review): how bn_sqr8x_internal uses the rest of the frame and
         * where the result is written to rp is not visible in this file.
         */
    619 bn_sqr8x_mont:
    620 .Lsqr8x_enter:
        /* rax = entry rsp; the epilogue restores relative to this value. */
    621 	movq	%rsp,%rax
    622 	pushq	%rbx
    623 	pushq	%rbp
    624 	pushq	%r12
    625 	pushq	%r13
    626 	pushq	%r14
    627 	pushq	%r15
    628 
        /* r10 = 32*num (bytes), r9 = -8*num. */
    629 	movl	%r9d,%r10d
    630 	shll	$3,%r9d
    631 	shlq	$3+2,%r10
    632 	negq	%r9
    633 
    634 
    635 
    636 
    637 
    638 
        /*
         * r11 = ((rsp - 64 - 32*num) - a) mod 4096, i.e. the distance of the
         * tentative frame base from the input array within a 4096-byte
         * window.  r8 is replaced by the n0 word itself.
         * If 32*num >= r11, rsp is lowered by r11 first, which makes
         * (frame base - a) a multiple of 4096 before the final alignment.
         * NOTE(review): presumably this controls how the frame and the input
         * overlap modulo the page size -- confirm against the generator.
         */
    639 	leaq	-64(%rsp,%r9,4),%r11
    640 	movq	(%r8),%r8
    641 	subq	%rsi,%r11
    642 	andq	$4095,%r11
    643 	cmpq	%r11,%r10
    644 	jb	.Lsqr8x_sp_alt
    645 	subq	%r11,%rsp
    646 	leaq	-64(%rsp,%r9,4),%rsp
    647 	jmp	.Lsqr8x_sp_done
    648 
    649 .align	32
        /*
         * Alternative placement: r10 = 4096 - 64 - 32*num; allocate the
         * frame, then lower rsp further by max(r11 - r10, 0).  movq $0 is
         * used to clear r10 because it leaves the carry from subq intact for
         * cmovc (an xor would overwrite the flags).
         */
    650 .Lsqr8x_sp_alt:
    651 	leaq	4096-64(,%r9,4),%r10
    652 	leaq	-64(%rsp,%r9,4),%rsp
    653 	subq	%r10,%r11
    654 	movq	$0,%r10
    655 	cmovcq	%r10,%r11
    656 	subq	%r11,%rsp
    657 .Lsqr8x_sp_done:
        /* Align to 64 bytes; r10 = -8*num, r9 = 8*num from here on. */
    658 	andq	$-64,%rsp
    659 	movq	%r9,%r10
    660 	negq	%r9
    661 
        /* r11 = destination of the modulus copy; stash n0 and entry rsp. */
    662 	leaq	64(%rsp,%r9,2),%r11
    663 	movq	%r8,32(%rsp)
    664 	movq	%rax,40(%rsp)
    665 .Lsqr8x_body:
    666 
        /*
         * rbp = 8*num >> 5 = num/4 copy iterations.
         * The .byte sequence 66 49 0F 6E D3 encodes "movq %r11,%xmm2"
         * (address of the modulus copy).
         * eax = dword at OPENSSL_ia32cap_P+8; it is not read again in the
         * visible code -- NOTE(review): presumably consumed elsewhere or
         * left over from a capability check; confirm.
         */
    667 	movq	%r9,%rbp
    668 .byte	102,73,15,110,211
    669 	shrq	$3+2,%rbp
    670 	movl	OPENSSL_ia32cap_P+8(%rip),%eax
    671 	jmp	.Lsqr8x_copy_n
    672 
    673 .align	32
        /*
         * Copy n into the frame, four words per iteration.  Each 64-bit word
         * is loaded with movq (which zeroes the upper half of the xmm
         * register) and stored as a full 16-byte slot.
         */
    674 .Lsqr8x_copy_n:
    675 	movq	0(%rcx),%xmm0
    676 	movq	8(%rcx),%xmm1
    677 	movq	16(%rcx),%xmm3
    678 	movq	24(%rcx),%xmm4
    679 	leaq	32(%rcx),%rcx
    680 	movdqa	%xmm0,0(%r11)
    681 	movdqa	%xmm1,16(%r11)
    682 	movdqa	%xmm3,32(%r11)
    683 	movdqa	%xmm4,48(%r11)
    684 	leaq	64(%r11),%r11
    685 	decq	%rbp
    686 	jnz	.Lsqr8x_copy_n
    687 
        /*
         * Values handed over in xmm registers before the call:
         *   xmm0 = 0
         *   66 48 0F 6E CF = "movq %rdi,%xmm1"  (result pointer rp)
         *   66 49 0F 6E DA = "movq %r10,%xmm3"  (-8*num)
         * xmm2 still holds the address of the modulus copy.
         * NOTE(review): these look like the register interface of
         * bn_sqr8x_internal -- confirm in its definition.
         */
    688 	pxor	%xmm0,%xmm0
    689 .byte	102,72,15,110,207
    690 .byte	102,73,15,110,218
    691 	call	bn_sqr8x_internal
    692 
        /*
         * Wipe the frame.  rax = rsp+48, rdx = start of the modulus copy,
         * r9 = num/4 iterations of 64 bytes each, rsi = entry rsp.
         * NOTE(review): assumes r9 still equals 8*num after the call;
         * that depends on bn_sqr8x_internal and cannot be checked here.
         */
    693 	pxor	%xmm0,%xmm0
    694 	leaq	48(%rsp),%rax
    695 	leaq	64(%rsp,%r9,2),%rdx
    696 	shrq	$3+2,%r9
    697 	movq	40(%rsp),%rsi
    698 	jmp	.Lsqr8x_zero
    699 
    700 .align	32
    701 .Lsqr8x_zero:
    702 	movdqa	%xmm0,0(%rax)
    703 	movdqa	%xmm0,16(%rax)
    704 	movdqa	%xmm0,32(%rax)
    705 	movdqa	%xmm0,48(%rax)
    706 	leaq	64(%rax),%rax
    707 	movdqa	%xmm0,0(%rdx)
    708 	movdqa	%xmm0,16(%rdx)
    709 	movdqa	%xmm0,32(%rdx)
    710 	movdqa	%xmm0,48(%rdx)
    711 	leaq	64(%rdx),%rdx
    712 	decq	%r9
    713 	jnz	.Lsqr8x_zero
    714 
        /*
         * Epilogue: return 1; the pushed registers sit directly below the
         * entry rsp held in rsi, so they are reloaded at negative offsets
         * and rsp is set back to the entry value.
         */
    715 	movq	$1,%rax
    716 	movq	-48(%rsi),%r15
    717 	movq	-40(%rsi),%r14
    718 	movq	-32(%rsi),%r13
    719 	movq	-24(%rsi),%r12
    720 	movq	-16(%rsi),%rbp
    721 	movq	-8(%rsi),%rbx
    722 	leaq	(%rsi),%rsp
    723 .Lsqr8x_epilogue:
        /* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
    724 	.byte	0xf3,0xc3
    725 .size	bn_sqr8x_mont,.-bn_sqr8x_mont
    726 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
        /*
         * The bytes above are the NUL-terminated ASCII string
         * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
         * embedded in the text section as an identification tag; no code in
         * this file references it.
         */
    727 .align	16
    728 #endif
    729