Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
      4 push(@INC,"${dir}","${dir}../../../perlasm");
      5 require "x86asm.pl";
      6 
      7 $output = pop;
      8 open STDOUT,">$output";
      9 
     10 &asm_init($ARGV[0]);
     11 
     12 $sse2=0;
     13 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
     14 
     15 &external_label("OPENSSL_ia32cap_P") if ($sse2);
     16 
     17 &bn_mul_add_words("bn_mul_add_words");
     18 &bn_mul_words("bn_mul_words");
     19 &bn_sqr_words("bn_sqr_words");
     20 &bn_div_words("bn_div_words");
     21 &bn_add_words("bn_add_words");
     22 &bn_sub_words("bn_sub_words");
     23 &bn_sub_part_words("bn_sub_part_words");
     24 
     25 &asm_finish();
     26 
     27 close STDOUT;
     28 
     29 sub bn_mul_add_words
     30 	{
     31 	local($name)=@_;
     32 
     33 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
     34 
     35 	$r="eax";
     36 	$a="edx";
     37 	$c="ecx";
     38 
     39 	if ($sse2) {
     40 		&picmeup("eax","OPENSSL_ia32cap_P");
     41 		&bt(&DWP(0,"eax"),26);
     42 		&jnc(&label("maw_non_sse2"));
     43 
     44 		&mov($r,&wparam(0));
     45 		&mov($a,&wparam(1));
     46 		&mov($c,&wparam(2));
     47 		&movd("mm0",&wparam(3));	# mm0 = w
     48 		&pxor("mm1","mm1");		# mm1 = carry_in
     49 		&jmp(&label("maw_sse2_entry"));
     50 
     51 	&set_label("maw_sse2_unrolled",16);
     52 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
     53 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
     54 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
     55 		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
     56 		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
     57 		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
     58 		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
     59 		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
     60 		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
     61 		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
     62 		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
     63 		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
     64 		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
     65 		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
     66 		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
     67 		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
     68 		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
     69 		&movd(&DWP(0,$r,"",0),"mm1");
     70 		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
     71 		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
     72 		&psrlq("mm1",32);		# mm1 = carry0
     73 		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
     74 		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
     75 		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
     76 		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
     77 		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
     78 		&movd(&DWP(4,$r,"",0),"mm1");
     79 		&psrlq("mm1",32);		# mm1 = carry1
     80 		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
     81 		&add($a,32);
     82 		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
     83 		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
     84 		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
     85 		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
     86 		&movd(&DWP(8,$r,"",0),"mm1");
     87 		&psrlq("mm1",32);		# mm1 = carry2
     88 		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
     89 		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
     90 		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
     91 		&movd(&DWP(12,$r,"",0),"mm1");
     92 		&psrlq("mm1",32);		# mm1 = carry3
     93 		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
     94 		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
     95 		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
     96 		&movd(&DWP(16,$r,"",0),"mm1");
     97 		&psrlq("mm1",32);		# mm1 = carry4
     98 		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
     99 		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
    100 		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
    101 		&movd(&DWP(20,$r,"",0),"mm1");
    102 		&psrlq("mm1",32);		# mm1 = carry5
    103 		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
    104 		&movd(&DWP(24,$r,"",0),"mm1");
    105 		&psrlq("mm1",32);		# mm1 = carry6
    106 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
    107 		&movd(&DWP(28,$r,"",0),"mm1");
    108 		&lea($r,&DWP(32,$r));
    109 		&psrlq("mm1",32);		# mm1 = carry_out
    110 
    111 		&sub($c,8);
    112 		&jz(&label("maw_sse2_exit"));
    113 	&set_label("maw_sse2_entry");
    114 		&test($c,0xfffffff8);
    115 		&jnz(&label("maw_sse2_unrolled"));
    116 
    117 	&set_label("maw_sse2_loop",4);
    118 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    119 		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
    120 		&pmuludq("mm2","mm0");		# a[i] *= w
    121 		&lea($a,&DWP(4,$a));
    122 		&paddq("mm1","mm3");		# carry += r[i]
    123 		&paddq("mm1","mm2");		# carry += a[i]*w
    124 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    125 		&sub($c,1);
    126 		&psrlq("mm1",32);		# carry = carry_high
    127 		&lea($r,&DWP(4,$r));
    128 		&jnz(&label("maw_sse2_loop"));
    129 	&set_label("maw_sse2_exit");
    130 		&movd("eax","mm1");		# c = carry_out
    131 		&emms();
    132 		&ret();
    133 
    134 	&set_label("maw_non_sse2",16);
    135 	}
    136 
    137 	# function_begin prologue
    138 	&push("ebp");
    139 	&push("ebx");
    140 	&push("esi");
    141 	&push("edi");
    142 
    143 	&comment("");
    144 	$Low="eax";
    145 	$High="edx";
    146 	$a="ebx";
    147 	$w="ebp";
    148 	$r="edi";
    149 	$c="esi";
    150 
    151 	&xor($c,$c);		# clear carry
    152 	&mov($r,&wparam(0));	#
    153 
    154 	&mov("ecx",&wparam(2));	#
    155 	&mov($a,&wparam(1));	#
    156 
    157 	&and("ecx",0xfffffff8);	# num / 8
    158 	&mov($w,&wparam(3));	#
    159 
    160 	&push("ecx");		# Up the stack for a tmp variable
    161 
    162 	&jz(&label("maw_finish"));
    163 
    164 	&set_label("maw_loop",16);
    165 
    166 	for ($i=0; $i<32; $i+=4)
    167 		{
    168 		&comment("Round $i");
    169 
    170 		 &mov("eax",&DWP($i,$a)); 	# *a
    171 		&mul($w);			# *a * w
    172 		&add("eax",$c);			# L(t)+= c
    173 		&adc("edx",0);			# H(t)+=carry
    174 		 &add("eax",&DWP($i,$r));	# L(t)+= *r
    175 		&adc("edx",0);			# H(t)+=carry
    176 		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
    177 		&mov($c,"edx");			# c=  H(t);
    178 		}
    179 
    180 	&comment("");
    181 	&sub("ecx",8);
    182 	&lea($a,&DWP(32,$a));
    183 	&lea($r,&DWP(32,$r));
    184 	&jnz(&label("maw_loop"));
    185 
    186 	&set_label("maw_finish",0);
    187 	&mov("ecx",&wparam(2));	# get num
    188 	&and("ecx",7);
    189 	&jnz(&label("maw_finish2"));	# helps branch prediction
    190 	&jmp(&label("maw_end"));
    191 
    192 	&set_label("maw_finish2",1);
    193 	for ($i=0; $i<7; $i++)
    194 		{
    195 		&comment("Tail Round $i");
    196 		 &mov("eax",&DWP($i*4,$a));	# *a
    197 		&mul($w);			# *a * w
    198 		&add("eax",$c);			# L(t)+=c
    199 		&adc("edx",0);			# H(t)+=carry
    200 		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
    201 		&adc("edx",0);			# H(t)+=carry
    202 		 &dec("ecx") if ($i != 7-1);
    203 		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
    204 		 &mov($c,"edx");		# c=  H(t);
    205 		&jz(&label("maw_end")) if ($i != 7-1);
    206 		}
    207 	&set_label("maw_end",0);
    208 	&mov("eax",$c);
    209 
    210 	&pop("ecx");	# clear variable from
    211 
    212 	&function_end($name);
    213 	}
    214 
    215 sub bn_mul_words
    216 	{
    217 	local($name)=@_;
    218 
    219 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    220 
    221 	$r="eax";
    222 	$a="edx";
    223 	$c="ecx";
    224 
    225 	if ($sse2) {
    226 		&picmeup("eax","OPENSSL_ia32cap_P");
    227 		&bt(&DWP(0,"eax"),26);
    228 		&jnc(&label("mw_non_sse2"));
    229 
    230 		&mov($r,&wparam(0));
    231 		&mov($a,&wparam(1));
    232 		&mov($c,&wparam(2));
    233 		&movd("mm0",&wparam(3));	# mm0 = w
    234 		&pxor("mm1","mm1");		# mm1 = carry = 0
    235 
    236 	&set_label("mw_sse2_loop",16);
    237 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    238 		&pmuludq("mm2","mm0");		# a[i] *= w
    239 		&lea($a,&DWP(4,$a));
    240 		&paddq("mm1","mm2");		# carry += a[i]*w
    241 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    242 		&sub($c,1);
    243 		&psrlq("mm1",32);		# carry = carry_high
    244 		&lea($r,&DWP(4,$r));
    245 		&jnz(&label("mw_sse2_loop"));
    246 
    247 		&movd("eax","mm1");		# return carry
    248 		&emms();
    249 		&ret();
    250 	&set_label("mw_non_sse2",16);
    251 	}
    252 
    253 	# function_begin prologue
    254 	&push("ebp");
    255 	&push("ebx");
    256 	&push("esi");
    257 	&push("edi");
    258 
    259 	&comment("");
    260 	$Low="eax";
    261 	$High="edx";
    262 	$a="ebx";
    263 	$w="ecx";
    264 	$r="edi";
    265 	$c="esi";
    266 	$num="ebp";
    267 
    268 	&xor($c,$c);		# clear carry
    269 	&mov($r,&wparam(0));	#
    270 	&mov($a,&wparam(1));	#
    271 	&mov($num,&wparam(2));	#
    272 	&mov($w,&wparam(3));	#
    273 
    274 	&and($num,0xfffffff8);	# num / 8
    275 	&jz(&label("mw_finish"));
    276 
    277 	&set_label("mw_loop",0);
    278 	for ($i=0; $i<32; $i+=4)
    279 		{
    280 		&comment("Round $i");
    281 
    282 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
    283 		&mul($w);			# *a * w
    284 		&add("eax",$c);			# L(t)+=c
    285 		 # XXX
    286 
    287 		&adc("edx",0);			# H(t)+=carry
    288 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
    289 
    290 		&mov($c,"edx");			# c=  H(t);
    291 		}
    292 
    293 	&comment("");
    294 	&add($a,32);
    295 	&add($r,32);
    296 	&sub($num,8);
    297 	&jz(&label("mw_finish"));
    298 	&jmp(&label("mw_loop"));
    299 
    300 	&set_label("mw_finish",0);
    301 	&mov($num,&wparam(2));	# get num
    302 	&and($num,7);
    303 	&jnz(&label("mw_finish2"));
    304 	&jmp(&label("mw_end"));
    305 
    306 	&set_label("mw_finish2",1);
    307 	for ($i=0; $i<7; $i++)
    308 		{
    309 		&comment("Tail Round $i");
    310 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
    311 		&mul($w);			# *a * w
    312 		&add("eax",$c);			# L(t)+=c
    313 		 # XXX
    314 		&adc("edx",0);			# H(t)+=carry
    315 		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
    316 		&mov($c,"edx");			# c=  H(t);
    317 		 &dec($num) if ($i != 7-1);
    318 		&jz(&label("mw_end")) if ($i != 7-1);
    319 		}
    320 	&set_label("mw_end",0);
    321 	&mov("eax",$c);
    322 
    323 	&function_end($name);
    324 	}
    325 
    326 sub bn_sqr_words
    327 	{
    328 	local($name)=@_;
    329 
    330 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    331 
    332 	$r="eax";
    333 	$a="edx";
    334 	$c="ecx";
    335 
    336 	if ($sse2) {
    337 		&picmeup("eax","OPENSSL_ia32cap_P");
    338 		&bt(&DWP(0,"eax"),26);
    339 		&jnc(&label("sqr_non_sse2"));
    340 
    341 		&mov($r,&wparam(0));
    342 		&mov($a,&wparam(1));
    343 		&mov($c,&wparam(2));
    344 
    345 	&set_label("sqr_sse2_loop",16);
    346 		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
    347 		&pmuludq("mm0","mm0");		# a[i] *= a[i]
    348 		&lea($a,&DWP(4,$a));		# a++
    349 		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
    350 		&sub($c,1);
    351 		&lea($r,&DWP(8,$r));		# r += 2
    352 		&jnz(&label("sqr_sse2_loop"));
    353 
    354 		&emms();
    355 		&ret();
    356 	&set_label("sqr_non_sse2",16);
    357 	}
    358 
    359 	# function_begin prologue
    360 	&push("ebp");
    361 	&push("ebx");
    362 	&push("esi");
    363 	&push("edi");
    364 
    365 	&comment("");
    366 	$r="esi";
    367 	$a="edi";
    368 	$num="ebx";
    369 
    370 	&mov($r,&wparam(0));	#
    371 	&mov($a,&wparam(1));	#
    372 	&mov($num,&wparam(2));	#
    373 
    374 	&and($num,0xfffffff8);	# num / 8
    375 	&jz(&label("sw_finish"));
    376 
    377 	&set_label("sw_loop",0);
    378 	for ($i=0; $i<32; $i+=4)
    379 		{
    380 		&comment("Round $i");
    381 		&mov("eax",&DWP($i,$a,"",0)); 	# *a
    382 		 # XXX
    383 		&mul("eax");			# *a * *a
    384 		&mov(&DWP($i*2,$r,"",0),"eax");	#
    385 		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
    386 		}
    387 
    388 	&comment("");
    389 	&add($a,32);
    390 	&add($r,64);
    391 	&sub($num,8);
    392 	&jnz(&label("sw_loop"));
    393 
    394 	&set_label("sw_finish",0);
    395 	&mov($num,&wparam(2));	# get num
    396 	&and($num,7);
    397 	&jz(&label("sw_end"));
    398 
    399 	for ($i=0; $i<7; $i++)
    400 		{
    401 		&comment("Tail Round $i");
    402 		&mov("eax",&DWP($i*4,$a,"",0));	# *a
    403 		 # XXX
    404 		&mul("eax");			# *a * *a
    405 		&mov(&DWP($i*8,$r,"",0),"eax");	#
    406 		 &dec($num) if ($i != 7-1);
    407 		&mov(&DWP($i*8+4,$r,"",0),"edx");
    408 		 &jz(&label("sw_end")) if ($i != 7-1);
    409 		}
    410 	&set_label("sw_end",0);
    411 
    412 	&function_end($name);
    413 	}
    414 
    415 sub bn_div_words
    416 	{
    417 	local($name)=@_;
    418 
    419 	&function_begin_B($name,"");
    420 	&mov("edx",&wparam(0));	#
    421 	&mov("eax",&wparam(1));	#
    422 	&mov("ecx",&wparam(2));	#
    423 	&div("ecx");
    424 	&ret();
    425 	&function_end_B($name);
    426 	}
    427 
    428 sub bn_add_words
    429 	{
    430 	local($name)=@_;
    431 
    432 	&function_begin($name,"");
    433 
    434 	&comment("");
    435 	$a="esi";
    436 	$b="edi";
    437 	$c="eax";
    438 	$r="ebx";
    439 	$tmp1="ecx";
    440 	$tmp2="edx";
    441 	$num="ebp";
    442 
    443 	&mov($r,&wparam(0));	# get r
    444 	 &mov($a,&wparam(1));	# get a
    445 	&mov($b,&wparam(2));	# get b
    446 	 &mov($num,&wparam(3));	# get num
    447 	&xor($c,$c);		# clear carry
    448 	 &and($num,0xfffffff8);	# num / 8
    449 
    450 	&jz(&label("aw_finish"));
    451 
    452 	&set_label("aw_loop",0);
    453 	for ($i=0; $i<8; $i++)
    454 		{
    455 		&comment("Round $i");
    456 
    457 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    458 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    459 		&add($tmp1,$c);
    460 		 &mov($c,0);
    461 		&adc($c,$c);
    462 		 &add($tmp1,$tmp2);
    463 		&adc($c,0);
    464 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    465 		}
    466 
    467 	&comment("");
    468 	&add($a,32);
    469 	 &add($b,32);
    470 	&add($r,32);
    471 	 &sub($num,8);
    472 	&jnz(&label("aw_loop"));
    473 
    474 	&set_label("aw_finish",0);
    475 	&mov($num,&wparam(3));	# get num
    476 	&and($num,7);
    477 	 &jz(&label("aw_end"));
    478 
    479 	for ($i=0; $i<7; $i++)
    480 		{
    481 		&comment("Tail Round $i");
    482 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    483 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    484 		&add($tmp1,$c);
    485 		 &mov($c,0);
    486 		&adc($c,$c);
    487 		 &add($tmp1,$tmp2);
    488 		&adc($c,0);
    489 		 &dec($num) if ($i != 6);
    490 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    491 		 &jz(&label("aw_end")) if ($i != 6);
    492 		}
    493 	&set_label("aw_end",0);
    494 
    495 #	&mov("eax",$c);		# $c is "eax"
    496 
    497 	&function_end($name);
    498 	}
    499 
    500 sub bn_sub_words
    501 	{
    502 	local($name)=@_;
    503 
    504 	&function_begin($name,"");
    505 
    506 	&comment("");
    507 	$a="esi";
    508 	$b="edi";
    509 	$c="eax";
    510 	$r="ebx";
    511 	$tmp1="ecx";
    512 	$tmp2="edx";
    513 	$num="ebp";
    514 
    515 	&mov($r,&wparam(0));	# get r
    516 	 &mov($a,&wparam(1));	# get a
    517 	&mov($b,&wparam(2));	# get b
    518 	 &mov($num,&wparam(3));	# get num
    519 	&xor($c,$c);		# clear carry
    520 	 &and($num,0xfffffff8);	# num / 8
    521 
    522 	&jz(&label("aw_finish"));
    523 
    524 	&set_label("aw_loop",0);
    525 	for ($i=0; $i<8; $i++)
    526 		{
    527 		&comment("Round $i");
    528 
    529 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    530 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    531 		&sub($tmp1,$c);
    532 		 &mov($c,0);
    533 		&adc($c,$c);
    534 		 &sub($tmp1,$tmp2);
    535 		&adc($c,0);
    536 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    537 		}
    538 
    539 	&comment("");
    540 	&add($a,32);
    541 	 &add($b,32);
    542 	&add($r,32);
    543 	 &sub($num,8);
    544 	&jnz(&label("aw_loop"));
    545 
    546 	&set_label("aw_finish",0);
    547 	&mov($num,&wparam(3));	# get num
    548 	&and($num,7);
    549 	 &jz(&label("aw_end"));
    550 
    551 	for ($i=0; $i<7; $i++)
    552 		{
    553 		&comment("Tail Round $i");
    554 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    555 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    556 		&sub($tmp1,$c);
    557 		 &mov($c,0);
    558 		&adc($c,$c);
    559 		 &sub($tmp1,$tmp2);
    560 		&adc($c,0);
    561 		 &dec($num) if ($i != 6);
    562 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    563 		 &jz(&label("aw_end")) if ($i != 6);
    564 		}
    565 	&set_label("aw_end",0);
    566 
    567 #	&mov("eax",$c);		# $c is "eax"
    568 
    569 	&function_end($name);
    570 	}
    571 
    572 sub bn_sub_part_words
    573 	{
    574 	local($name)=@_;
    575 
    576 	&function_begin($name,"");
    577 
    578 	&comment("");
    579 	$a="esi";
    580 	$b="edi";
    581 	$c="eax";
    582 	$r="ebx";
    583 	$tmp1="ecx";
    584 	$tmp2="edx";
    585 	$num="ebp";
    586 
    587 	&mov($r,&wparam(0));	# get r
    588 	 &mov($a,&wparam(1));	# get a
    589 	&mov($b,&wparam(2));	# get b
    590 	 &mov($num,&wparam(3));	# get num
    591 	&xor($c,$c);		# clear carry
    592 	 &and($num,0xfffffff8);	# num / 8
    593 
    594 	&jz(&label("aw_finish"));
    595 
    596 	&set_label("aw_loop",0);
    597 	for ($i=0; $i<8; $i++)
    598 		{
    599 		&comment("Round $i");
    600 
    601 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    602 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    603 		&sub($tmp1,$c);
    604 		 &mov($c,0);
    605 		&adc($c,$c);
    606 		 &sub($tmp1,$tmp2);
    607 		&adc($c,0);
    608 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    609 		}
    610 
    611 	&comment("");
    612 	&add($a,32);
    613 	 &add($b,32);
    614 	&add($r,32);
    615 	 &sub($num,8);
    616 	&jnz(&label("aw_loop"));
    617 
    618 	&set_label("aw_finish",0);
    619 	&mov($num,&wparam(3));	# get num
    620 	&and($num,7);
    621 	 &jz(&label("aw_end"));
    622 
    623 	for ($i=0; $i<7; $i++)
    624 		{
    625 		&comment("Tail Round $i");
    626 		&mov($tmp1,&DWP(0,$a,"",0));	# *a
    627 		 &mov($tmp2,&DWP(0,$b,"",0));# *b
    628 		&sub($tmp1,$c);
    629 		 &mov($c,0);
    630 		&adc($c,$c);
    631 		 &sub($tmp1,$tmp2);
    632 		&adc($c,0);
    633 		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
    634 		&add($a, 4);
    635 		&add($b, 4);
    636 		&add($r, 4);
    637 		 &dec($num) if ($i != 6);
    638 		 &jz(&label("aw_end")) if ($i != 6);
    639 		}
    640 	&set_label("aw_end",0);
    641 
    642 	&cmp(&wparam(4),0);
    643 	&je(&label("pw_end"));
    644 
    645 	&mov($num,&wparam(4));	# get dl
    646 	&cmp($num,0);
    647 	&je(&label("pw_end"));
    648 	&jge(&label("pw_pos"));
    649 
    650 	&comment("pw_neg");
    651 	&mov($tmp2,0);
    652 	&sub($tmp2,$num);
    653 	&mov($num,$tmp2);
    654 	&and($num,0xfffffff8);	# num / 8
    655 	&jz(&label("pw_neg_finish"));
    656 
    657 	&set_label("pw_neg_loop",0);
    658 	for ($i=0; $i<8; $i++)
    659 	{
    660 	    &comment("dl<0 Round $i");
    661 
    662 	    &mov($tmp1,0);
    663 	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    664 	    &sub($tmp1,$c);
    665 	    &mov($c,0);
    666 	    &adc($c,$c);
    667 	    &sub($tmp1,$tmp2);
    668 	    &adc($c,0);
    669 	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    670 	}
    671 
    672 	&comment("");
    673 	&add($b,32);
    674 	&add($r,32);
    675 	&sub($num,8);
    676 	&jnz(&label("pw_neg_loop"));
    677 
    678 	&set_label("pw_neg_finish",0);
    679 	&mov($tmp2,&wparam(4));	# get dl
    680 	&mov($num,0);
    681 	&sub($num,$tmp2);
    682 	&and($num,7);
    683 	&jz(&label("pw_end"));
    684 
    685 	for ($i=0; $i<7; $i++)
    686 	{
    687 	    &comment("dl<0 Tail Round $i");
    688 	    &mov($tmp1,0);
    689 	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    690 	    &sub($tmp1,$c);
    691 	    &mov($c,0);
    692 	    &adc($c,$c);
    693 	    &sub($tmp1,$tmp2);
    694 	    &adc($c,0);
    695 	    &dec($num) if ($i != 6);
    696 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    697 	    &jz(&label("pw_end")) if ($i != 6);
    698 	}
    699 
    700 	&jmp(&label("pw_end"));
    701 
    702 	&set_label("pw_pos",0);
    703 
    704 	&and($num,0xfffffff8);	# num / 8
    705 	&jz(&label("pw_pos_finish"));
    706 
    707 	&set_label("pw_pos_loop",0);
    708 
    709 	for ($i=0; $i<8; $i++)
    710 	{
    711 	    &comment("dl>0 Round $i");
    712 
    713 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    714 	    &sub($tmp1,$c);
    715 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    716 	    &jnc(&label("pw_nc".$i));
    717 	}
    718 
    719 	&comment("");
    720 	&add($a,32);
    721 	&add($r,32);
    722 	&sub($num,8);
    723 	&jnz(&label("pw_pos_loop"));
    724 
    725 	&set_label("pw_pos_finish",0);
    726 	&mov($num,&wparam(4));	# get dl
    727 	&and($num,7);
    728 	&jz(&label("pw_end"));
    729 
    730 	for ($i=0; $i<7; $i++)
    731 	{
    732 	    &comment("dl>0 Tail Round $i");
    733 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    734 	    &sub($tmp1,$c);
    735 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    736 	    &jnc(&label("pw_tail_nc".$i));
    737 	    &dec($num) if ($i != 6);
    738 	    &jz(&label("pw_end")) if ($i != 6);
    739 	}
    740 	&mov($c,1);
    741 	&jmp(&label("pw_end"));
    742 
    743 	&set_label("pw_nc_loop",0);
    744 	for ($i=0; $i<8; $i++)
    745 	{
    746 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    747 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    748 	    &set_label("pw_nc".$i,0);
    749 	}
    750 
    751 	&comment("");
    752 	&add($a,32);
    753 	&add($r,32);
    754 	&sub($num,8);
    755 	&jnz(&label("pw_nc_loop"));
    756 
    757 	&mov($num,&wparam(4));	# get dl
    758 	&and($num,7);
    759 	&jz(&label("pw_nc_end"));
    760 
    761 	for ($i=0; $i<7; $i++)
    762 	{
    763 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    764 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    765 	    &set_label("pw_tail_nc".$i,0);
    766 	    &dec($num) if ($i != 6);
    767 	    &jz(&label("pw_nc_end")) if ($i != 6);
    768 	}
    769 
    770 	&set_label("pw_nc_end",0);
    771 	&mov($c,0);
    772 
    773 	&set_label("pw_end",0);
    774 
    775 #	&mov("eax",$c);		# $c is "eax"
    776 
    777 	&function_end($name);
    778 	}
    779