; [listing residue, not source] Home | History | Annotate | Download | only in bn
      1 %ifidn __OUTPUT_FORMAT__,obj
      2 section	code	use32 class=code align=64
      3 %elifidn __OUTPUT_FORMAT__,win32
      4 %ifdef __YASM_VERSION_ID__
      5 %if __YASM_VERSION_ID__ < 01010000h
      6 %error yasm version 1.1.0 or later needed.
      7 %endif
      8 ; Yasm automatically includes .00 and complains about redefining it.
      9 ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
     10 %else
     11 $@feat.00 equ 1
     12 %endif
     13 section	.text	code align=64
     14 %else
     15 section	.text	code
     16 %endif
     17 ;extern	_OPENSSL_ia32cap_P
     18 global	_bn_mul_mont
     19 align	16
     20 _bn_mul_mont:
     21 L$_bn_mul_mont_begin:
     22 	push	ebp
     23 	push	ebx
     24 	push	esi
     25 	push	edi
     26 	xor	eax,eax
     27 	mov	edi,DWORD [40+esp]
     28 	cmp	edi,4
     29 	jl	NEAR L$000just_leave
     30 	lea	esi,[20+esp]
     31 	lea	edx,[24+esp]
     32 	mov	ebp,esp
     33 	add	edi,2
     34 	neg	edi
     35 	lea	esp,[edi*4+esp-32]
     36 	neg	edi
     37 	mov	eax,esp
     38 	sub	eax,edx
     39 	and	eax,2047
     40 	sub	esp,eax
     41 	xor	edx,esp
     42 	and	edx,2048
     43 	xor	edx,2048
     44 	sub	esp,edx
     45 	and	esp,-64
     46 	mov	eax,DWORD [esi]
     47 	mov	ebx,DWORD [4+esi]
     48 	mov	ecx,DWORD [8+esi]
     49 	mov	edx,DWORD [12+esi]
     50 	mov	esi,DWORD [16+esi]
     51 	mov	esi,DWORD [esi]
     52 	mov	DWORD [4+esp],eax
     53 	mov	DWORD [8+esp],ebx
     54 	mov	DWORD [12+esp],ecx
     55 	mov	DWORD [16+esp],edx
     56 	mov	DWORD [20+esp],esi
     57 	lea	ebx,[edi-3]
     58 	mov	DWORD [24+esp],ebp
     59 	lea	eax,[_OPENSSL_ia32cap_P]
     60 	bt	DWORD [eax],26
     61 	jnc	NEAR L$001non_sse2
     62 	mov	eax,-1
     63 	movd	mm7,eax
     64 	mov	esi,DWORD [8+esp]
     65 	mov	edi,DWORD [12+esp]
     66 	mov	ebp,DWORD [16+esp]
     67 	xor	edx,edx
     68 	xor	ecx,ecx
     69 	movd	mm4,DWORD [edi]
     70 	movd	mm5,DWORD [esi]
     71 	movd	mm3,DWORD [ebp]
     72 	pmuludq	mm5,mm4
     73 	movq	mm2,mm5
     74 	movq	mm0,mm5
     75 	pand	mm0,mm7
     76 	pmuludq	mm5,[20+esp]
     77 	pmuludq	mm3,mm5
     78 	paddq	mm3,mm0
     79 	movd	mm1,DWORD [4+ebp]
     80 	movd	mm0,DWORD [4+esi]
     81 	psrlq	mm2,32
     82 	psrlq	mm3,32
     83 	inc	ecx
     84 align	16
     85 L$0021st:
     86 	pmuludq	mm0,mm4
     87 	pmuludq	mm1,mm5
     88 	paddq	mm2,mm0
     89 	paddq	mm3,mm1
     90 	movq	mm0,mm2
     91 	pand	mm0,mm7
     92 	movd	mm1,DWORD [4+ecx*4+ebp]
     93 	paddq	mm3,mm0
     94 	movd	mm0,DWORD [4+ecx*4+esi]
     95 	psrlq	mm2,32
     96 	movd	DWORD [28+ecx*4+esp],mm3
     97 	psrlq	mm3,32
     98 	lea	ecx,[1+ecx]
     99 	cmp	ecx,ebx
    100 	jl	NEAR L$0021st
    101 	pmuludq	mm0,mm4
    102 	pmuludq	mm1,mm5
    103 	paddq	mm2,mm0
    104 	paddq	mm3,mm1
    105 	movq	mm0,mm2
    106 	pand	mm0,mm7
    107 	paddq	mm3,mm0
    108 	movd	DWORD [28+ecx*4+esp],mm3
    109 	psrlq	mm2,32
    110 	psrlq	mm3,32
    111 	paddq	mm3,mm2
    112 	movq	[32+ebx*4+esp],mm3
    113 	inc	edx
    114 L$003outer:
    115 	xor	ecx,ecx
    116 	movd	mm4,DWORD [edx*4+edi]
    117 	movd	mm5,DWORD [esi]
    118 	movd	mm6,DWORD [32+esp]
    119 	movd	mm3,DWORD [ebp]
    120 	pmuludq	mm5,mm4
    121 	paddq	mm5,mm6
    122 	movq	mm0,mm5
    123 	movq	mm2,mm5
    124 	pand	mm0,mm7
    125 	pmuludq	mm5,[20+esp]
    126 	pmuludq	mm3,mm5
    127 	paddq	mm3,mm0
    128 	movd	mm6,DWORD [36+esp]
    129 	movd	mm1,DWORD [4+ebp]
    130 	movd	mm0,DWORD [4+esi]
    131 	psrlq	mm2,32
    132 	psrlq	mm3,32
    133 	paddq	mm2,mm6
    134 	inc	ecx
    135 	dec	ebx
    136 L$004inner:
    137 	pmuludq	mm0,mm4
    138 	pmuludq	mm1,mm5
    139 	paddq	mm2,mm0
    140 	paddq	mm3,mm1
    141 	movq	mm0,mm2
    142 	movd	mm6,DWORD [36+ecx*4+esp]
    143 	pand	mm0,mm7
    144 	movd	mm1,DWORD [4+ecx*4+ebp]
    145 	paddq	mm3,mm0
    146 	movd	mm0,DWORD [4+ecx*4+esi]
    147 	psrlq	mm2,32
    148 	movd	DWORD [28+ecx*4+esp],mm3
    149 	psrlq	mm3,32
    150 	paddq	mm2,mm6
    151 	dec	ebx
    152 	lea	ecx,[1+ecx]
    153 	jnz	NEAR L$004inner
    154 	mov	ebx,ecx
    155 	pmuludq	mm0,mm4
    156 	pmuludq	mm1,mm5
    157 	paddq	mm2,mm0
    158 	paddq	mm3,mm1
    159 	movq	mm0,mm2
    160 	pand	mm0,mm7
    161 	paddq	mm3,mm0
    162 	movd	DWORD [28+ecx*4+esp],mm3
    163 	psrlq	mm2,32
    164 	psrlq	mm3,32
    165 	movd	mm6,DWORD [36+ebx*4+esp]
    166 	paddq	mm3,mm2
    167 	paddq	mm3,mm6
    168 	movq	[32+ebx*4+esp],mm3
    169 	lea	edx,[1+edx]
    170 	cmp	edx,ebx
    171 	jle	NEAR L$003outer
    172 	emms
    173 	jmp	NEAR L$005common_tail
    174 align	16
    175 L$001non_sse2:
    176 	mov	esi,DWORD [8+esp]
    177 	lea	ebp,[1+ebx]
    178 	mov	edi,DWORD [12+esp]
    179 	xor	ecx,ecx
    180 	mov	edx,esi
    181 	and	ebp,1
    182 	sub	edx,edi
    183 	lea	eax,[4+ebx*4+edi]
    184 	or	ebp,edx
    185 	mov	edi,DWORD [edi]
    186 	jz	NEAR L$006bn_sqr_mont
    187 	mov	DWORD [28+esp],eax
    188 	mov	eax,DWORD [esi]
    189 	xor	edx,edx
    190 align	16
    191 L$007mull:
    192 	mov	ebp,edx
    193 	mul	edi
    194 	add	ebp,eax
    195 	lea	ecx,[1+ecx]
    196 	adc	edx,0
    197 	mov	eax,DWORD [ecx*4+esi]
    198 	cmp	ecx,ebx
    199 	mov	DWORD [28+ecx*4+esp],ebp
    200 	jl	NEAR L$007mull
    201 	mov	ebp,edx
    202 	mul	edi
    203 	mov	edi,DWORD [20+esp]
    204 	add	eax,ebp
    205 	mov	esi,DWORD [16+esp]
    206 	adc	edx,0
    207 	imul	edi,DWORD [32+esp]
    208 	mov	DWORD [32+ebx*4+esp],eax
    209 	xor	ecx,ecx
    210 	mov	DWORD [36+ebx*4+esp],edx
    211 	mov	DWORD [40+ebx*4+esp],ecx
    212 	mov	eax,DWORD [esi]
    213 	mul	edi
    214 	add	eax,DWORD [32+esp]
    215 	mov	eax,DWORD [4+esi]
    216 	adc	edx,0
    217 	inc	ecx
    218 	jmp	NEAR L$0082ndmadd
    219 align	16
    220 L$0091stmadd:
    221 	mov	ebp,edx
    222 	mul	edi
    223 	add	ebp,DWORD [32+ecx*4+esp]
    224 	lea	ecx,[1+ecx]
    225 	adc	edx,0
    226 	add	ebp,eax
    227 	mov	eax,DWORD [ecx*4+esi]
    228 	adc	edx,0
    229 	cmp	ecx,ebx
    230 	mov	DWORD [28+ecx*4+esp],ebp
    231 	jl	NEAR L$0091stmadd
    232 	mov	ebp,edx
    233 	mul	edi
    234 	add	eax,DWORD [32+ebx*4+esp]
    235 	mov	edi,DWORD [20+esp]
    236 	adc	edx,0
    237 	mov	esi,DWORD [16+esp]
    238 	add	ebp,eax
    239 	adc	edx,0
    240 	imul	edi,DWORD [32+esp]
    241 	xor	ecx,ecx
    242 	add	edx,DWORD [36+ebx*4+esp]
    243 	mov	DWORD [32+ebx*4+esp],ebp
    244 	adc	ecx,0
    245 	mov	eax,DWORD [esi]
    246 	mov	DWORD [36+ebx*4+esp],edx
    247 	mov	DWORD [40+ebx*4+esp],ecx
    248 	mul	edi
    249 	add	eax,DWORD [32+esp]
    250 	mov	eax,DWORD [4+esi]
    251 	adc	edx,0
    252 	mov	ecx,1
    253 align	16
    254 L$0082ndmadd:
    255 	mov	ebp,edx
    256 	mul	edi
    257 	add	ebp,DWORD [32+ecx*4+esp]
    258 	lea	ecx,[1+ecx]
    259 	adc	edx,0
    260 	add	ebp,eax
    261 	mov	eax,DWORD [ecx*4+esi]
    262 	adc	edx,0
    263 	cmp	ecx,ebx
    264 	mov	DWORD [24+ecx*4+esp],ebp
    265 	jl	NEAR L$0082ndmadd
    266 	mov	ebp,edx
    267 	mul	edi
    268 	add	ebp,DWORD [32+ebx*4+esp]
    269 	adc	edx,0
    270 	add	ebp,eax
    271 	adc	edx,0
    272 	mov	DWORD [28+ebx*4+esp],ebp
    273 	xor	eax,eax
    274 	mov	ecx,DWORD [12+esp]
    275 	add	edx,DWORD [36+ebx*4+esp]
    276 	adc	eax,DWORD [40+ebx*4+esp]
    277 	lea	ecx,[4+ecx]
    278 	mov	DWORD [32+ebx*4+esp],edx
    279 	cmp	ecx,DWORD [28+esp]
    280 	mov	DWORD [36+ebx*4+esp],eax
    281 	je	NEAR L$005common_tail
    282 	mov	edi,DWORD [ecx]
    283 	mov	esi,DWORD [8+esp]
    284 	mov	DWORD [12+esp],ecx
    285 	xor	ecx,ecx
    286 	xor	edx,edx
    287 	mov	eax,DWORD [esi]
    288 	jmp	NEAR L$0091stmadd
    289 align	16
    290 L$006bn_sqr_mont:
    291 	mov	DWORD [esp],ebx
    292 	mov	DWORD [12+esp],ecx
    293 	mov	eax,edi
    294 	mul	edi
    295 	mov	DWORD [32+esp],eax
    296 	mov	ebx,edx
    297 	shr	edx,1
    298 	and	ebx,1
    299 	inc	ecx
    300 align	16
    301 L$010sqr:
    302 	mov	eax,DWORD [ecx*4+esi]
    303 	mov	ebp,edx
    304 	mul	edi
    305 	add	eax,ebp
    306 	lea	ecx,[1+ecx]
    307 	adc	edx,0
    308 	lea	ebp,[eax*2+ebx]
    309 	shr	eax,31
    310 	cmp	ecx,DWORD [esp]
    311 	mov	ebx,eax
    312 	mov	DWORD [28+ecx*4+esp],ebp
    313 	jl	NEAR L$010sqr
    314 	mov	eax,DWORD [ecx*4+esi]
    315 	mov	ebp,edx
    316 	mul	edi
    317 	add	eax,ebp
    318 	mov	edi,DWORD [20+esp]
    319 	adc	edx,0
    320 	mov	esi,DWORD [16+esp]
    321 	lea	ebp,[eax*2+ebx]
    322 	imul	edi,DWORD [32+esp]
    323 	shr	eax,31
    324 	mov	DWORD [32+ecx*4+esp],ebp
    325 	lea	ebp,[edx*2+eax]
    326 	mov	eax,DWORD [esi]
    327 	shr	edx,31
    328 	mov	DWORD [36+ecx*4+esp],ebp
    329 	mov	DWORD [40+ecx*4+esp],edx
    330 	mul	edi
    331 	add	eax,DWORD [32+esp]
    332 	mov	ebx,ecx
    333 	adc	edx,0
    334 	mov	eax,DWORD [4+esi]
    335 	mov	ecx,1
    336 align	16
    337 L$0113rdmadd:
    338 	mov	ebp,edx
    339 	mul	edi
    340 	add	ebp,DWORD [32+ecx*4+esp]
    341 	adc	edx,0
    342 	add	ebp,eax
    343 	mov	eax,DWORD [4+ecx*4+esi]
    344 	adc	edx,0
    345 	mov	DWORD [28+ecx*4+esp],ebp
    346 	mov	ebp,edx
    347 	mul	edi
    348 	add	ebp,DWORD [36+ecx*4+esp]
    349 	lea	ecx,[2+ecx]
    350 	adc	edx,0
    351 	add	ebp,eax
    352 	mov	eax,DWORD [ecx*4+esi]
    353 	adc	edx,0
    354 	cmp	ecx,ebx
    355 	mov	DWORD [24+ecx*4+esp],ebp
    356 	jl	NEAR L$0113rdmadd
    357 	mov	ebp,edx
    358 	mul	edi
    359 	add	ebp,DWORD [32+ebx*4+esp]
    360 	adc	edx,0
    361 	add	ebp,eax
    362 	adc	edx,0
    363 	mov	DWORD [28+ebx*4+esp],ebp
    364 	mov	ecx,DWORD [12+esp]
    365 	xor	eax,eax
    366 	mov	esi,DWORD [8+esp]
    367 	add	edx,DWORD [36+ebx*4+esp]
    368 	adc	eax,DWORD [40+ebx*4+esp]
    369 	mov	DWORD [32+ebx*4+esp],edx
    370 	cmp	ecx,ebx
    371 	mov	DWORD [36+ebx*4+esp],eax
    372 	je	NEAR L$005common_tail
    373 	mov	edi,DWORD [4+ecx*4+esi]
    374 	lea	ecx,[1+ecx]
    375 	mov	eax,edi
    376 	mov	DWORD [12+esp],ecx
    377 	mul	edi
    378 	add	eax,DWORD [32+ecx*4+esp]
    379 	adc	edx,0
    380 	mov	DWORD [32+ecx*4+esp],eax
    381 	xor	ebp,ebp
    382 	cmp	ecx,ebx
    383 	lea	ecx,[1+ecx]
    384 	je	NEAR L$012sqrlast
    385 	mov	ebx,edx
    386 	shr	edx,1
    387 	and	ebx,1
    388 align	16
    389 L$013sqradd:
    390 	mov	eax,DWORD [ecx*4+esi]
    391 	mov	ebp,edx
    392 	mul	edi
    393 	add	eax,ebp
    394 	lea	ebp,[eax*1+eax]
    395 	adc	edx,0
    396 	shr	eax,31
    397 	add	ebp,DWORD [32+ecx*4+esp]
    398 	lea	ecx,[1+ecx]
    399 	adc	eax,0
    400 	add	ebp,ebx
    401 	adc	eax,0
    402 	cmp	ecx,DWORD [esp]
    403 	mov	DWORD [28+ecx*4+esp],ebp
    404 	mov	ebx,eax
    405 	jle	NEAR L$013sqradd
    406 	mov	ebp,edx
    407 	add	edx,edx
    408 	shr	ebp,31
    409 	add	edx,ebx
    410 	adc	ebp,0
    411 L$012sqrlast:
    412 	mov	edi,DWORD [20+esp]
    413 	mov	esi,DWORD [16+esp]
    414 	imul	edi,DWORD [32+esp]
    415 	add	edx,DWORD [32+ecx*4+esp]
    416 	mov	eax,DWORD [esi]
    417 	adc	ebp,0
    418 	mov	DWORD [32+ecx*4+esp],edx
    419 	mov	DWORD [36+ecx*4+esp],ebp
    420 	mul	edi
    421 	add	eax,DWORD [32+esp]
    422 	lea	ebx,[ecx-1]
    423 	adc	edx,0
    424 	mov	ecx,1
    425 	mov	eax,DWORD [4+esi]
    426 	jmp	NEAR L$0113rdmadd
    427 align	16
    428 L$005common_tail:
    429 	mov	ebp,DWORD [16+esp]
    430 	mov	edi,DWORD [4+esp]
    431 	lea	esi,[32+esp]
    432 	mov	eax,DWORD [esi]
    433 	mov	ecx,ebx
    434 	xor	edx,edx
    435 align	16
    436 L$014sub:
    437 	sbb	eax,DWORD [edx*4+ebp]
    438 	mov	DWORD [edx*4+edi],eax
    439 	dec	ecx
    440 	mov	eax,DWORD [4+edx*4+esi]
    441 	lea	edx,[1+edx]
    442 	jge	NEAR L$014sub
    443 	sbb	eax,0
    444 align	16
    445 L$015copy:
    446 	mov	edx,DWORD [ebx*4+esi]
    447 	mov	ebp,DWORD [ebx*4+edi]
    448 	xor	edx,ebp
    449 	and	edx,eax
    450 	xor	edx,ebp
    451 	mov	DWORD [ebx*4+esi],ecx
    452 	mov	DWORD [ebx*4+edi],edx
    453 	dec	ebx
    454 	jge	NEAR L$015copy
    455 	mov	esp,DWORD [24+esp]
    456 	mov	eax,1
    457 L$000just_leave:
    458 	pop	edi
    459 	pop	esi
    460 	pop	ebx
    461 	pop	ebp
    462 	ret
    463 db	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
    464 db	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
    465 db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
    466 db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
    467 db	111,114,103,62,0
    468 segment	.bss
    469 common	_OPENSSL_ia32cap_P 16
    470