Home | History | Annotate | Download | only in asm
      1 #! /usr/bin/env perl
      2 # Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     11 push(@INC,"${dir}","${dir}../../../perlasm");
     12 require "x86asm.pl";
     13 
     14 $output = pop;
     15 open STDOUT,">$output";
     16 
     17 &asm_init($ARGV[0]);
     18 
     19 $sse2=0;
     20 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
     21 
     22 &external_label("OPENSSL_ia32cap_P") if ($sse2);
     23 
     24 &bn_mul_add_words("bn_mul_add_words");
     25 &bn_mul_words("bn_mul_words");
     26 &bn_sqr_words("bn_sqr_words");
     27 &bn_div_words("bn_div_words");
     28 &bn_add_words("bn_add_words");
     29 &bn_sub_words("bn_sub_words");
     30 &bn_sub_part_words("bn_sub_part_words");
     31 
     32 &asm_finish();
     33 
     34 close STDOUT;
     35 
     36 sub bn_mul_add_words
     37 	{
     38 	local($name)=@_;
     39 
     40 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
     41 
     42 	$r="eax";
     43 	$a="edx";
     44 	$c="ecx";
     45 
     46 	if ($sse2) {
     47 		&picmeup("eax","OPENSSL_ia32cap_P");
     48 		&bt(&DWP(0,"eax"),26);
     49 		&jnc(&label("maw_non_sse2"));
     50 
     51 		&mov($r,&wparam(0));
     52 		&mov($a,&wparam(1));
     53 		&mov($c,&wparam(2));
     54 		&movd("mm0",&wparam(3));	# mm0 = w
     55 		&pxor("mm1","mm1");		# mm1 = carry_in
     56 		&jmp(&label("maw_sse2_entry"));
     57 
     58 	&set_label("maw_sse2_unrolled",16);
     59 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
     60 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
     61 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
     62 		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
     63 		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
     64 		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
     65 		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
     66 		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
     67 		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
     68 		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
     69 		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
     70 		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
     71 		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
     72 		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
     73 		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
     74 		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
     75 		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
     76 		&movd(&DWP(0,$r,"",0),"mm1");
     77 		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
     78 		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
     79 		&psrlq("mm1",32);		# mm1 = carry0
     80 		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
     81 		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
     82 		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
     83 		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
     84 		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
     85 		&movd(&DWP(4,$r,"",0),"mm1");
     86 		&psrlq("mm1",32);		# mm1 = carry1
     87 		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
     88 		&add($a,32);
     89 		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
     90 		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
     91 		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
     92 		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
     93 		&movd(&DWP(8,$r,"",0),"mm1");
     94 		&psrlq("mm1",32);		# mm1 = carry2
     95 		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
     96 		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
     97 		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
     98 		&movd(&DWP(12,$r,"",0),"mm1");
     99 		&psrlq("mm1",32);		# mm1 = carry3
    100 		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
    101 		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
    102 		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
    103 		&movd(&DWP(16,$r,"",0),"mm1");
    104 		&psrlq("mm1",32);		# mm1 = carry4
    105 		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
    106 		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
    107 		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
    108 		&movd(&DWP(20,$r,"",0),"mm1");
    109 		&psrlq("mm1",32);		# mm1 = carry5
    110 		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
    111 		&movd(&DWP(24,$r,"",0),"mm1");
    112 		&psrlq("mm1",32);		# mm1 = carry6
    113 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
    114 		&movd(&DWP(28,$r,"",0),"mm1");
    115 		&lea($r,&DWP(32,$r));
    116 		&psrlq("mm1",32);		# mm1 = carry_out
    117 
    118 		&sub($c,8);
    119 		&jz(&label("maw_sse2_exit"));
    120 	&set_label("maw_sse2_entry");
    121 		&test($c,0xfffffff8);
    122 		&jnz(&label("maw_sse2_unrolled"));
    123 
    124 	&set_label("maw_sse2_loop",4);
    125 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    126 		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
    127 		&pmuludq("mm2","mm0");		# a[i] *= w
    128 		&lea($a,&DWP(4,$a));
    129 		&paddq("mm1","mm3");		# carry += r[i]
    130 		&paddq("mm1","mm2");		# carry += a[i]*w
    131 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    132 		&sub($c,1);
    133 		&psrlq("mm1",32);		# carry = carry_high
    134 		&lea($r,&DWP(4,$r));
    135 		&jnz(&label("maw_sse2_loop"));
    136 	&set_label("maw_sse2_exit");
    137 		&movd("eax","mm1");		# c = carry_out
    138 		&emms();
    139 		&ret();
    140 
    141 	&set_label("maw_non_sse2",16);
    142 	}
    143 
    144 	# function_begin prologue
    145 	&push("ebp");
    146 	&push("ebx");
    147 	&push("esi");
    148 	&push("edi");
    149 
    150 	&comment("");
    151 	$Low="eax";
    152 	$High="edx";
    153 	$a="ebx";
    154 	$w="ebp";
    155 	$r="edi";
    156 	$c="esi";
    157 
    158 	&xor($c,$c);		# clear carry
    159 	&mov($r,&wparam(0));	#
    160 
    161 	&mov("ecx",&wparam(2));	#
    162 	&mov($a,&wparam(1));	#
    163 
    164 	&and("ecx",0xfffffff8);	# num / 8
    165 	&mov($w,&wparam(3));	#
    166 
    167 	&push("ecx");		# Up the stack for a tmp variable
    168 
    169 	&jz(&label("maw_finish"));
    170 
    171 	&set_label("maw_loop",16);
    172 
    173 	for ($i=0; $i<32; $i+=4)
    174 		{
    175 		&comment("Round $i");
    176 
    177 		 &mov("eax",&DWP($i,$a)); 	# *a
    178 		&mul($w);			# *a * w
    179 		&add("eax",$c);			# L(t)+= c
    180 		&adc("edx",0);			# H(t)+=carry
    181 		 &add("eax",&DWP($i,$r));	# L(t)+= *r
    182 		&adc("edx",0);			# H(t)+=carry
    183 		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
    184 		&mov($c,"edx");			# c=  H(t);
    185 		}
    186 
    187 	&comment("");
    188 	&sub("ecx",8);
    189 	&lea($a,&DWP(32,$a));
    190 	&lea($r,&DWP(32,$r));
    191 	&jnz(&label("maw_loop"));
    192 
    193 	&set_label("maw_finish",0);
    194 	&mov("ecx",&wparam(2));	# get num
    195 	&and("ecx",7);
    196 	&jnz(&label("maw_finish2"));	# helps branch prediction
    197 	&jmp(&label("maw_end"));
    198 
    199 	&set_label("maw_finish2",1);
    200 	for ($i=0; $i<7; $i++)
    201 		{
    202 		&comment("Tail Round $i");
    203 		 &mov("eax",&DWP($i*4,$a));	# *a
    204 		&mul($w);			# *a * w
    205 		&add("eax",$c);			# L(t)+=c
    206 		&adc("edx",0);			# H(t)+=carry
    207 		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
    208 		&adc("edx",0);			# H(t)+=carry
    209 		 &dec("ecx") if ($i != 7-1);
    210 		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
    211 		 &mov($c,"edx");		# c=  H(t);
    212 		&jz(&label("maw_end")) if ($i != 7-1);
    213 		}
    214 	&set_label("maw_end",0);
    215 	&mov("eax",$c);
    216 
    217 	&pop("ecx");	# clear variable from
    218 
    219 	&function_end($name);
    220 	}
    221 
    222 sub bn_mul_words
    223 	{
    224 	local($name)=@_;
    225 
    226 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    227 
    228 	$r="eax";
    229 	$a="edx";
    230 	$c="ecx";
    231 
    232 	if ($sse2) {
    233 		&picmeup("eax","OPENSSL_ia32cap_P");
    234 		&bt(&DWP(0,"eax"),26);
    235 		&jnc(&label("mw_non_sse2"));
    236 
    237 		&mov($r,&wparam(0));
    238 		&mov($a,&wparam(1));
    239 		&mov($c,&wparam(2));
    240 		&movd("mm0",&wparam(3));	# mm0 = w
    241 		&pxor("mm1","mm1");		# mm1 = carry = 0
    242 
    243 	&set_label("mw_sse2_loop",16);
    244 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    245 		&pmuludq("mm2","mm0");		# a[i] *= w
    246 		&lea($a,&DWP(4,$a));
    247 		&paddq("mm1","mm2");		# carry += a[i]*w
    248 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    249 		&sub($c,1);
    250 		&psrlq("mm1",32);		# carry = carry_high
    251 		&lea($r,&DWP(4,$r));
    252 		&jnz(&label("mw_sse2_loop"));
    253 
    254 		&movd("eax","mm1");		# return carry
    255 		&emms();
    256 		&ret();
    257 	&set_label("mw_non_sse2",16);
    258 	}
    259 
    260 	# function_begin prologue
    261 	&push("ebp");
    262 	&push("ebx");
    263 	&push("esi");
    264 	&push("edi");
    265 
    266 	&comment("");
    267 	$Low="eax";
    268 	$High="edx";
    269 	$a="ebx";
    270 	$w="ecx";
    271 	$r="edi";
    272 	$c="esi";
    273 	$num="ebp";
    274 
    275 	&xor($c,$c);		# clear carry
    276 	&mov($r,&wparam(0));	#
    277 	&mov($a,&wparam(1));	#
    278 	&mov($num,&wparam(2));	#
    279 	&mov($w,&wparam(3));	#
    280 
    281 	&and($num,0xfffffff8);	# num / 8
    282 	&jz(&label("mw_finish"));
    283 
    284 	&set_label("mw_loop",0);
    285 	for ($i=0; $i<32; $i+=4)
    286 		{
    287 		&comment("Round $i");
    288 
    289 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
    290 		&mul($w);			# *a * w
    291 		&add("eax",$c);			# L(t)+=c
    292 		 # XXX
    293 
    294 		&adc("edx",0);			# H(t)+=carry
    295 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
    296 
    297 		&mov($c,"edx");			# c=  H(t);
    298 		}
    299 
    300 	&comment("");
    301 	&add($a,32);
    302 	&add($r,32);
    303 	&sub($num,8);
    304 	&jz(&label("mw_finish"));
    305 	&jmp(&label("mw_loop"));
    306 
    307 	&set_label("mw_finish",0);
    308 	&mov($num,&wparam(2));	# get num
    309 	&and($num,7);
    310 	&jnz(&label("mw_finish2"));
    311 	&jmp(&label("mw_end"));
    312 
    313 	&set_label("mw_finish2",1);
    314 	for ($i=0; $i<7; $i++)
    315 		{
    316 		&comment("Tail Round $i");
    317 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
    318 		&mul($w);			# *a * w
    319 		&add("eax",$c);			# L(t)+=c
    320 		 # XXX
    321 		&adc("edx",0);			# H(t)+=carry
    322 		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
    323 		&mov($c,"edx");			# c=  H(t);
    324 		 &dec($num) if ($i != 7-1);
    325 		&jz(&label("mw_end")) if ($i != 7-1);
    326 		}
    327 	&set_label("mw_end",0);
    328 	&mov("eax",$c);
    329 
    330 	&function_end($name);
    331 	}
    332 
    333 sub bn_sqr_words
    334 	{
    335 	local($name)=@_;
    336 
    337 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    338 
    339 	$r="eax";
    340 	$a="edx";
    341 	$c="ecx";
    342 
    343 	if ($sse2) {
    344 		&picmeup("eax","OPENSSL_ia32cap_P");
    345 		&bt(&DWP(0,"eax"),26);
    346 		&jnc(&label("sqr_non_sse2"));
    347 
    348 		&mov($r,&wparam(0));
    349 		&mov($a,&wparam(1));
    350 		&mov($c,&wparam(2));
    351 
    352 	&set_label("sqr_sse2_loop",16);
    353 		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
    354 		&pmuludq("mm0","mm0");		# a[i] *= a[i]
    355 		&lea($a,&DWP(4,$a));		# a++
    356 		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
    357 		&sub($c,1);
    358 		&lea($r,&DWP(8,$r));		# r += 2
    359 		&jnz(&label("sqr_sse2_loop"));
    360 
    361 		&emms();
    362 		&ret();
    363 	&set_label("sqr_non_sse2",16);
    364 	}
    365 
    366 	# function_begin prologue
    367 	&push("ebp");
    368 	&push("ebx");
    369 	&push("esi");
    370 	&push("edi");
    371 
    372 	&comment("");
    373 	$r="esi";
    374 	$a="edi";
    375 	$num="ebx";
    376 
    377 	&mov($r,&wparam(0));	#
    378 	&mov($a,&wparam(1));	#
    379 	&mov($num,&wparam(2));	#
    380 
    381 	&and($num,0xfffffff8);	# num / 8
    382 	&jz(&label("sw_finish"));
    383 
    384 	&set_label("sw_loop",0);
    385 	for ($i=0; $i<32; $i+=4)
    386 		{
    387 		&comment("Round $i");
    388 		&mov("eax",&DWP($i,$a,"",0)); 	# *a
    389 		 # XXX
    390 		&mul("eax");			# *a * *a
    391 		&mov(&DWP($i*2,$r,"",0),"eax");	#
    392 		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
    393 		}
    394 
    395 	&comment("");
    396 	&add($a,32);
    397 	&add($r,64);
    398 	&sub($num,8);
    399 	&jnz(&label("sw_loop"));
    400 
    401 	&set_label("sw_finish",0);
    402 	&mov($num,&wparam(2));	# get num
    403 	&and($num,7);
    404 	&jz(&label("sw_end"));
    405 
    406 	for ($i=0; $i<7; $i++)
    407 		{
    408 		&comment("Tail Round $i");
    409 		&mov("eax",&DWP($i*4,$a,"",0));	# *a
    410 		 # XXX
    411 		&mul("eax");			# *a * *a
    412 		&mov(&DWP($i*8,$r,"",0),"eax");	#
    413 		 &dec($num) if ($i != 7-1);
    414 		&mov(&DWP($i*8+4,$r,"",0),"edx");
    415 		 &jz(&label("sw_end")) if ($i != 7-1);
    416 		}
    417 	&set_label("sw_end",0);
    418 
    419 	&function_end($name);
    420 	}
    421 
    422 sub bn_div_words
    423 	{
    424 	local($name)=@_;
    425 
    426 	&function_begin_B($name,"");
    427 	&mov("edx",&wparam(0));	#
    428 	&mov("eax",&wparam(1));	#
    429 	&mov("ecx",&wparam(2));	#
    430 	&div("ecx");
    431 	&ret();
    432 	&function_end_B($name);
    433 	}
    434 
    435 sub bn_add_words
    436 	{
    437 	local($name)=@_;
    438 
    439 	&function_begin($name,"");
    440 
    441 	&comment("");
    442 	$a="esi";
    443 	$b="edi";
    444 	$c="eax";
    445 	$r="ebx";
    446 	$tmp1="ecx";
    447 	$tmp2="edx";
    448 	$num="ebp";
    449 
    450 	&mov($r,&wparam(0));	# get r
    451 	 &mov($a,&wparam(1));	# get a
    452 	&mov($b,&wparam(2));	# get b
    453 	 &mov($num,&wparam(3));	# get num
    454 	&xor($c,$c);		# clear carry
    455 	 &and($num,0xfffffff8);	# num / 8
    456 
    457 	&jz(&label("aw_finish"));
    458 
    459 	&set_label("aw_loop",0);
    460 	for ($i=0; $i<8; $i++)
    461 		{
    462 		&comment("Round $i");
    463 
    464 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    465 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    466 		&add($tmp1,$c);
    467 		 &mov($c,0);
    468 		&adc($c,$c);
    469 		 &add($tmp1,$tmp2);
    470 		&adc($c,0);
    471 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    472 		}
    473 
    474 	&comment("");
    475 	&add($a,32);
    476 	 &add($b,32);
    477 	&add($r,32);
    478 	 &sub($num,8);
    479 	&jnz(&label("aw_loop"));
    480 
    481 	&set_label("aw_finish",0);
    482 	&mov($num,&wparam(3));	# get num
    483 	&and($num,7);
    484 	 &jz(&label("aw_end"));
    485 
    486 	for ($i=0; $i<7; $i++)
    487 		{
    488 		&comment("Tail Round $i");
    489 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    490 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    491 		&add($tmp1,$c);
    492 		 &mov($c,0);
    493 		&adc($c,$c);
    494 		 &add($tmp1,$tmp2);
    495 		&adc($c,0);
    496 		 &dec($num) if ($i != 6);
    497 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    498 		 &jz(&label("aw_end")) if ($i != 6);
    499 		}
    500 	&set_label("aw_end",0);
    501 
    502 #	&mov("eax",$c);		# $c is "eax"
    503 
    504 	&function_end($name);
    505 	}
    506 
    507 sub bn_sub_words
    508 	{
    509 	local($name)=@_;
    510 
    511 	&function_begin($name,"");
    512 
    513 	&comment("");
    514 	$a="esi";
    515 	$b="edi";
    516 	$c="eax";
    517 	$r="ebx";
    518 	$tmp1="ecx";
    519 	$tmp2="edx";
    520 	$num="ebp";
    521 
    522 	&mov($r,&wparam(0));	# get r
    523 	 &mov($a,&wparam(1));	# get a
    524 	&mov($b,&wparam(2));	# get b
    525 	 &mov($num,&wparam(3));	# get num
    526 	&xor($c,$c);		# clear carry
    527 	 &and($num,0xfffffff8);	# num / 8
    528 
    529 	&jz(&label("aw_finish"));
    530 
    531 	&set_label("aw_loop",0);
    532 	for ($i=0; $i<8; $i++)
    533 		{
    534 		&comment("Round $i");
    535 
    536 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    537 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    538 		&sub($tmp1,$c);
    539 		 &mov($c,0);
    540 		&adc($c,$c);
    541 		 &sub($tmp1,$tmp2);
    542 		&adc($c,0);
    543 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    544 		}
    545 
    546 	&comment("");
    547 	&add($a,32);
    548 	 &add($b,32);
    549 	&add($r,32);
    550 	 &sub($num,8);
    551 	&jnz(&label("aw_loop"));
    552 
    553 	&set_label("aw_finish",0);
    554 	&mov($num,&wparam(3));	# get num
    555 	&and($num,7);
    556 	 &jz(&label("aw_end"));
    557 
    558 	for ($i=0; $i<7; $i++)
    559 		{
    560 		&comment("Tail Round $i");
    561 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    562 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    563 		&sub($tmp1,$c);
    564 		 &mov($c,0);
    565 		&adc($c,$c);
    566 		 &sub($tmp1,$tmp2);
    567 		&adc($c,0);
    568 		 &dec($num) if ($i != 6);
    569 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    570 		 &jz(&label("aw_end")) if ($i != 6);
    571 		}
    572 	&set_label("aw_end",0);
    573 
    574 #	&mov("eax",$c);		# $c is "eax"
    575 
    576 	&function_end($name);
    577 	}
    578 
    579 sub bn_sub_part_words
    580 	{
    581 	local($name)=@_;
    582 
    583 	&function_begin($name,"");
    584 
    585 	&comment("");
    586 	$a="esi";
    587 	$b="edi";
    588 	$c="eax";
    589 	$r="ebx";
    590 	$tmp1="ecx";
    591 	$tmp2="edx";
    592 	$num="ebp";
    593 
    594 	&mov($r,&wparam(0));	# get r
    595 	 &mov($a,&wparam(1));	# get a
    596 	&mov($b,&wparam(2));	# get b
    597 	 &mov($num,&wparam(3));	# get num
    598 	&xor($c,$c);		# clear carry
    599 	 &and($num,0xfffffff8);	# num / 8
    600 
    601 	&jz(&label("aw_finish"));
    602 
    603 	&set_label("aw_loop",0);
    604 	for ($i=0; $i<8; $i++)
    605 		{
    606 		&comment("Round $i");
    607 
    608 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    609 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    610 		&sub($tmp1,$c);
    611 		 &mov($c,0);
    612 		&adc($c,$c);
    613 		 &sub($tmp1,$tmp2);
    614 		&adc($c,0);
    615 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    616 		}
    617 
    618 	&comment("");
    619 	&add($a,32);
    620 	 &add($b,32);
    621 	&add($r,32);
    622 	 &sub($num,8);
    623 	&jnz(&label("aw_loop"));
    624 
    625 	&set_label("aw_finish",0);
    626 	&mov($num,&wparam(3));	# get num
    627 	&and($num,7);
    628 	 &jz(&label("aw_end"));
    629 
    630 	for ($i=0; $i<7; $i++)
    631 		{
    632 		&comment("Tail Round $i");
    633 		&mov($tmp1,&DWP(0,$a,"",0));	# *a
    634 		 &mov($tmp2,&DWP(0,$b,"",0));# *b
    635 		&sub($tmp1,$c);
    636 		 &mov($c,0);
    637 		&adc($c,$c);
    638 		 &sub($tmp1,$tmp2);
    639 		&adc($c,0);
    640 		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
    641 		&add($a, 4);
    642 		&add($b, 4);
    643 		&add($r, 4);
    644 		 &dec($num) if ($i != 6);
    645 		 &jz(&label("aw_end")) if ($i != 6);
    646 		}
    647 	&set_label("aw_end",0);
    648 
    649 	&cmp(&wparam(4),0);
    650 	&je(&label("pw_end"));
    651 
    652 	&mov($num,&wparam(4));	# get dl
    653 	&cmp($num,0);
    654 	&je(&label("pw_end"));
    655 	&jge(&label("pw_pos"));
    656 
    657 	&comment("pw_neg");
    658 	&mov($tmp2,0);
    659 	&sub($tmp2,$num);
    660 	&mov($num,$tmp2);
    661 	&and($num,0xfffffff8);	# num / 8
    662 	&jz(&label("pw_neg_finish"));
    663 
    664 	&set_label("pw_neg_loop",0);
    665 	for ($i=0; $i<8; $i++)
    666 	{
    667 	    &comment("dl<0 Round $i");
    668 
    669 	    &mov($tmp1,0);
    670 	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    671 	    &sub($tmp1,$c);
    672 	    &mov($c,0);
    673 	    &adc($c,$c);
    674 	    &sub($tmp1,$tmp2);
    675 	    &adc($c,0);
    676 	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    677 	}
    678 
    679 	&comment("");
    680 	&add($b,32);
    681 	&add($r,32);
    682 	&sub($num,8);
    683 	&jnz(&label("pw_neg_loop"));
    684 
    685 	&set_label("pw_neg_finish",0);
    686 	&mov($tmp2,&wparam(4));	# get dl
    687 	&mov($num,0);
    688 	&sub($num,$tmp2);
    689 	&and($num,7);
    690 	&jz(&label("pw_end"));
    691 
    692 	for ($i=0; $i<7; $i++)
    693 	{
    694 	    &comment("dl<0 Tail Round $i");
    695 	    &mov($tmp1,0);
    696 	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    697 	    &sub($tmp1,$c);
    698 	    &mov($c,0);
    699 	    &adc($c,$c);
    700 	    &sub($tmp1,$tmp2);
    701 	    &adc($c,0);
    702 	    &dec($num) if ($i != 6);
    703 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    704 	    &jz(&label("pw_end")) if ($i != 6);
    705 	}
    706 
    707 	&jmp(&label("pw_end"));
    708 
    709 	&set_label("pw_pos",0);
    710 
    711 	&and($num,0xfffffff8);	# num / 8
    712 	&jz(&label("pw_pos_finish"));
    713 
    714 	&set_label("pw_pos_loop",0);
    715 
    716 	for ($i=0; $i<8; $i++)
    717 	{
    718 	    &comment("dl>0 Round $i");
    719 
    720 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    721 	    &sub($tmp1,$c);
    722 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    723 	    &jnc(&label("pw_nc".$i));
    724 	}
    725 
    726 	&comment("");
    727 	&add($a,32);
    728 	&add($r,32);
    729 	&sub($num,8);
    730 	&jnz(&label("pw_pos_loop"));
    731 
    732 	&set_label("pw_pos_finish",0);
    733 	&mov($num,&wparam(4));	# get dl
    734 	&and($num,7);
    735 	&jz(&label("pw_end"));
    736 
    737 	for ($i=0; $i<7; $i++)
    738 	{
    739 	    &comment("dl>0 Tail Round $i");
    740 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    741 	    &sub($tmp1,$c);
    742 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    743 	    &jnc(&label("pw_tail_nc".$i));
    744 	    &dec($num) if ($i != 6);
    745 	    &jz(&label("pw_end")) if ($i != 6);
    746 	}
    747 	&mov($c,1);
    748 	&jmp(&label("pw_end"));
    749 
    750 	&set_label("pw_nc_loop",0);
    751 	for ($i=0; $i<8; $i++)
    752 	{
    753 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    754 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    755 	    &set_label("pw_nc".$i,0);
    756 	}
    757 
    758 	&comment("");
    759 	&add($a,32);
    760 	&add($r,32);
    761 	&sub($num,8);
    762 	&jnz(&label("pw_nc_loop"));
    763 
    764 	&mov($num,&wparam(4));	# get dl
    765 	&and($num,7);
    766 	&jz(&label("pw_nc_end"));
    767 
    768 	for ($i=0; $i<7; $i++)
    769 	{
    770 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    771 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    772 	    &set_label("pw_tail_nc".$i,0);
    773 	    &dec($num) if ($i != 6);
    774 	    &jz(&label("pw_nc_end")) if ($i != 6);
    775 	}
    776 
    777 	&set_label("pw_nc_end",0);
    778 	&mov($c,0);
    779 
    780 	&set_label("pw_end",0);
    781 
    782 #	&mov("eax",$c);		# $c is "eax"
    783 
    784 	&function_end($name);
    785 	}
    786