Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
      4 push(@INC,"${dir}","${dir}../../perlasm");
      5 require "x86asm.pl";
      6 
      7 &asm_init($ARGV[0],$0);
      8 
      9 $sse2=0;
     10 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
     11 
     12 &external_label("OPENSSL_ia32cap_P") if ($sse2);
     13 
     14 &bn_mul_add_words("bn_mul_add_words");
     15 &bn_mul_words("bn_mul_words");
     16 &bn_sqr_words("bn_sqr_words");
     17 &bn_div_words("bn_div_words");
     18 &bn_add_words("bn_add_words");
     19 &bn_sub_words("bn_sub_words");
     20 &bn_sub_part_words("bn_sub_part_words");
     21 
     22 &asm_finish();
     23 
     24 sub bn_mul_add_words
     25 	{
     26 	local($name)=@_;
     27 
     28 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
     29 
     30 	$r="eax";
     31 	$a="edx";
     32 	$c="ecx";
     33 
     34 	if ($sse2) {
     35 		&picmeup("eax","OPENSSL_ia32cap_P");
     36 		&bt(&DWP(0,"eax"),26);
     37 		&jnc(&label("maw_non_sse2"));
     38 
     39 		&mov($r,&wparam(0));
     40 		&mov($a,&wparam(1));
     41 		&mov($c,&wparam(2));
     42 		&movd("mm0",&wparam(3));	# mm0 = w
     43 		&pxor("mm1","mm1");		# mm1 = carry_in
     44 		&jmp(&label("maw_sse2_entry"));
     45 		
     46 	&set_label("maw_sse2_unrolled",16);
     47 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
     48 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
     49 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
     50 		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
     51 		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
     52 		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
     53 		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
     54 		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
     55 		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
     56 		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
     57 		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
     58 		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
     59 		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
     60 		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
     61 		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
     62 		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
     63 		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
     64 		&movd(&DWP(0,$r,"",0),"mm1");
     65 		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
     66 		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
     67 		&psrlq("mm1",32);		# mm1 = carry0
     68 		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
     69 		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
     70 		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
     71 		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
     72 		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
     73 		&movd(&DWP(4,$r,"",0),"mm1");
     74 		&psrlq("mm1",32);		# mm1 = carry1
     75 		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
     76 		&add($a,32);
     77 		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
     78 		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
     79 		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
     80 		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
     81 		&movd(&DWP(8,$r,"",0),"mm1");
     82 		&psrlq("mm1",32);		# mm1 = carry2
     83 		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
     84 		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
     85 		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
     86 		&movd(&DWP(12,$r,"",0),"mm1");
     87 		&psrlq("mm1",32);		# mm1 = carry3
     88 		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
     89 		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
     90 		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
     91 		&movd(&DWP(16,$r,"",0),"mm1");
     92 		&psrlq("mm1",32);		# mm1 = carry4
     93 		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
     94 		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
     95 		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
     96 		&movd(&DWP(20,$r,"",0),"mm1");
     97 		&psrlq("mm1",32);		# mm1 = carry5
     98 		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
     99 		&movd(&DWP(24,$r,"",0),"mm1");
    100 		&psrlq("mm1",32);		# mm1 = carry6
    101 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
    102 		&movd(&DWP(28,$r,"",0),"mm1");
    103 		&lea($r,&DWP(32,$r));
    104 		&psrlq("mm1",32);		# mm1 = carry_out
    105 
    106 		&sub($c,8);
    107 		&jz(&label("maw_sse2_exit"));
    108 	&set_label("maw_sse2_entry");
    109 		&test($c,0xfffffff8);
    110 		&jnz(&label("maw_sse2_unrolled"));
    111 
    112 	&set_label("maw_sse2_loop",4);
    113 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    114 		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
    115 		&pmuludq("mm2","mm0");		# a[i] *= w
    116 		&lea($a,&DWP(4,$a));
    117 		&paddq("mm1","mm3");		# carry += r[i]
    118 		&paddq("mm1","mm2");		# carry += a[i]*w
    119 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    120 		&sub($c,1);
    121 		&psrlq("mm1",32);		# carry = carry_high
    122 		&lea($r,&DWP(4,$r));
    123 		&jnz(&label("maw_sse2_loop"));
    124 	&set_label("maw_sse2_exit");
    125 		&movd("eax","mm1");		# c = carry_out
    126 		&emms();
    127 		&ret();
    128 
    129 	&set_label("maw_non_sse2",16);
    130 	}
    131 
    132 	# function_begin prologue
    133 	&push("ebp");
    134 	&push("ebx");
    135 	&push("esi");
    136 	&push("edi");
    137 
    138 	&comment("");
    139 	$Low="eax";
    140 	$High="edx";
    141 	$a="ebx";
    142 	$w="ebp";
    143 	$r="edi";
    144 	$c="esi";
    145 
    146 	&xor($c,$c);		# clear carry
    147 	&mov($r,&wparam(0));	#
    148 
    149 	&mov("ecx",&wparam(2));	#
    150 	&mov($a,&wparam(1));	#
    151 
    152 	&and("ecx",0xfffffff8);	# num / 8
    153 	&mov($w,&wparam(3));	#
    154 
    155 	&push("ecx");		# Up the stack for a tmp variable
    156 
    157 	&jz(&label("maw_finish"));
    158 
    159 	&set_label("maw_loop",16);
    160 
    161 	for ($i=0; $i<32; $i+=4)
    162 		{
    163 		&comment("Round $i");
    164 
    165 		 &mov("eax",&DWP($i,$a)); 	# *a
    166 		&mul($w);			# *a * w
    167 		&add("eax",$c);			# L(t)+= c
    168 		&adc("edx",0);			# H(t)+=carry
    169 		 &add("eax",&DWP($i,$r));	# L(t)+= *r
    170 		&adc("edx",0);			# H(t)+=carry
    171 		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
    172 		&mov($c,"edx");			# c=  H(t);
    173 		}
    174 
    175 	&comment("");
    176 	&sub("ecx",8);
    177 	&lea($a,&DWP(32,$a));
    178 	&lea($r,&DWP(32,$r));
    179 	&jnz(&label("maw_loop"));
    180 
    181 	&set_label("maw_finish",0);
    182 	&mov("ecx",&wparam(2));	# get num
    183 	&and("ecx",7);
    184 	&jnz(&label("maw_finish2"));	# helps branch prediction
    185 	&jmp(&label("maw_end"));
    186 
    187 	&set_label("maw_finish2",1);
    188 	for ($i=0; $i<7; $i++)
    189 		{
    190 		&comment("Tail Round $i");
    191 		 &mov("eax",&DWP($i*4,$a));	# *a
    192 		&mul($w);			# *a * w
    193 		&add("eax",$c);			# L(t)+=c
    194 		&adc("edx",0);			# H(t)+=carry
    195 		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
    196 		&adc("edx",0);			# H(t)+=carry
    197 		 &dec("ecx") if ($i != 7-1);
    198 		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
    199 		 &mov($c,"edx");		# c=  H(t);
    200 		&jz(&label("maw_end")) if ($i != 7-1);
    201 		}
    202 	&set_label("maw_end",0);
    203 	&mov("eax",$c);
    204 
    205 	&pop("ecx");	# clear variable from
    206 
    207 	&function_end($name);
    208 	}
    209 
    210 sub bn_mul_words
    211 	{
    212 	local($name)=@_;
    213 
    214 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    215 
    216 	$r="eax";
    217 	$a="edx";
    218 	$c="ecx";
    219 
    220 	if ($sse2) {
    221 		&picmeup("eax","OPENSSL_ia32cap_P");
    222 		&bt(&DWP(0,"eax"),26);
    223 		&jnc(&label("mw_non_sse2"));
    224 
    225 		&mov($r,&wparam(0));
    226 		&mov($a,&wparam(1));
    227 		&mov($c,&wparam(2));
    228 		&movd("mm0",&wparam(3));	# mm0 = w
    229 		&pxor("mm1","mm1");		# mm1 = carry = 0
    230 
    231 	&set_label("mw_sse2_loop",16);
    232 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
    233 		&pmuludq("mm2","mm0");		# a[i] *= w
    234 		&lea($a,&DWP(4,$a));
    235 		&paddq("mm1","mm2");		# carry += a[i]*w
    236 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
    237 		&sub($c,1);
    238 		&psrlq("mm1",32);		# carry = carry_high
    239 		&lea($r,&DWP(4,$r));
    240 		&jnz(&label("mw_sse2_loop"));
    241 
    242 		&movd("eax","mm1");		# return carry
    243 		&emms();
    244 		&ret();
    245 	&set_label("mw_non_sse2",16);
    246 	}
    247 
    248 	# function_begin prologue
    249 	&push("ebp");
    250 	&push("ebx");
    251 	&push("esi");
    252 	&push("edi");
    253 
    254 	&comment("");
    255 	$Low="eax";
    256 	$High="edx";
    257 	$a="ebx";
    258 	$w="ecx";
    259 	$r="edi";
    260 	$c="esi";
    261 	$num="ebp";
    262 
    263 	&xor($c,$c);		# clear carry
    264 	&mov($r,&wparam(0));	#
    265 	&mov($a,&wparam(1));	#
    266 	&mov($num,&wparam(2));	#
    267 	&mov($w,&wparam(3));	#
    268 
    269 	&and($num,0xfffffff8);	# num / 8
    270 	&jz(&label("mw_finish"));
    271 
    272 	&set_label("mw_loop",0);
    273 	for ($i=0; $i<32; $i+=4)
    274 		{
    275 		&comment("Round $i");
    276 
    277 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
    278 		&mul($w);			# *a * w
    279 		&add("eax",$c);			# L(t)+=c
    280 		 # XXX
    281 
    282 		&adc("edx",0);			# H(t)+=carry
    283 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
    284 
    285 		&mov($c,"edx");			# c=  H(t);
    286 		}
    287 
    288 	&comment("");
    289 	&add($a,32);
    290 	&add($r,32);
    291 	&sub($num,8);
    292 	&jz(&label("mw_finish"));
    293 	&jmp(&label("mw_loop"));
    294 
    295 	&set_label("mw_finish",0);
    296 	&mov($num,&wparam(2));	# get num
    297 	&and($num,7);
    298 	&jnz(&label("mw_finish2"));
    299 	&jmp(&label("mw_end"));
    300 
    301 	&set_label("mw_finish2",1);
    302 	for ($i=0; $i<7; $i++)
    303 		{
    304 		&comment("Tail Round $i");
    305 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
    306 		&mul($w);			# *a * w
    307 		&add("eax",$c);			# L(t)+=c
    308 		 # XXX
    309 		&adc("edx",0);			# H(t)+=carry
    310 		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
    311 		&mov($c,"edx");			# c=  H(t);
    312 		 &dec($num) if ($i != 7-1);
    313 		&jz(&label("mw_end")) if ($i != 7-1);
    314 		}
    315 	&set_label("mw_end",0);
    316 	&mov("eax",$c);
    317 
    318 	&function_end($name);
    319 	}
    320 
    321 sub bn_sqr_words
    322 	{
    323 	local($name)=@_;
    324 
    325 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
    326 
    327 	$r="eax";
    328 	$a="edx";
    329 	$c="ecx";
    330 
    331 	if ($sse2) {
    332 		&picmeup("eax","OPENSSL_ia32cap_P");
    333 		&bt(&DWP(0,"eax"),26);
    334 		&jnc(&label("sqr_non_sse2"));
    335 
    336 		&mov($r,&wparam(0));
    337 		&mov($a,&wparam(1));
    338 		&mov($c,&wparam(2));
    339 
    340 	&set_label("sqr_sse2_loop",16);
    341 		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
    342 		&pmuludq("mm0","mm0");		# a[i] *= a[i]
    343 		&lea($a,&DWP(4,$a));		# a++
    344 		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
    345 		&sub($c,1);
    346 		&lea($r,&DWP(8,$r));		# r += 2
    347 		&jnz(&label("sqr_sse2_loop"));
    348 
    349 		&emms();
    350 		&ret();
    351 	&set_label("sqr_non_sse2",16);
    352 	}
    353 
    354 	# function_begin prologue
    355 	&push("ebp");
    356 	&push("ebx");
    357 	&push("esi");
    358 	&push("edi");
    359 
    360 	&comment("");
    361 	$r="esi";
    362 	$a="edi";
    363 	$num="ebx";
    364 
    365 	&mov($r,&wparam(0));	#
    366 	&mov($a,&wparam(1));	#
    367 	&mov($num,&wparam(2));	#
    368 
    369 	&and($num,0xfffffff8);	# num / 8
    370 	&jz(&label("sw_finish"));
    371 
    372 	&set_label("sw_loop",0);
    373 	for ($i=0; $i<32; $i+=4)
    374 		{
    375 		&comment("Round $i");
    376 		&mov("eax",&DWP($i,$a,"",0)); 	# *a
    377 		 # XXX
    378 		&mul("eax");			# *a * *a
    379 		&mov(&DWP($i*2,$r,"",0),"eax");	#
    380 		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
    381 		}
    382 
    383 	&comment("");
    384 	&add($a,32);
    385 	&add($r,64);
    386 	&sub($num,8);
    387 	&jnz(&label("sw_loop"));
    388 
    389 	&set_label("sw_finish",0);
    390 	&mov($num,&wparam(2));	# get num
    391 	&and($num,7);
    392 	&jz(&label("sw_end"));
    393 
    394 	for ($i=0; $i<7; $i++)
    395 		{
    396 		&comment("Tail Round $i");
    397 		&mov("eax",&DWP($i*4,$a,"",0));	# *a
    398 		 # XXX
    399 		&mul("eax");			# *a * *a
    400 		&mov(&DWP($i*8,$r,"",0),"eax");	#
    401 		 &dec($num) if ($i != 7-1);
    402 		&mov(&DWP($i*8+4,$r,"",0),"edx");
    403 		 &jz(&label("sw_end")) if ($i != 7-1);
    404 		}
    405 	&set_label("sw_end",0);
    406 
    407 	&function_end($name);
    408 	}
    409 
    410 sub bn_div_words
    411 	{
    412 	local($name)=@_;
    413 
    414 	&function_begin_B($name,"");
    415 	&mov("edx",&wparam(0));	#
    416 	&mov("eax",&wparam(1));	#
    417 	&mov("ecx",&wparam(2));	#
    418 	&div("ecx");
    419 	&ret();
    420 	&function_end_B($name);
    421 	}
    422 
    423 sub bn_add_words
    424 	{
    425 	local($name)=@_;
    426 
    427 	&function_begin($name,"");
    428 
    429 	&comment("");
    430 	$a="esi";
    431 	$b="edi";
    432 	$c="eax";
    433 	$r="ebx";
    434 	$tmp1="ecx";
    435 	$tmp2="edx";
    436 	$num="ebp";
    437 
    438 	&mov($r,&wparam(0));	# get r
    439 	 &mov($a,&wparam(1));	# get a
    440 	&mov($b,&wparam(2));	# get b
    441 	 &mov($num,&wparam(3));	# get num
    442 	&xor($c,$c);		# clear carry
    443 	 &and($num,0xfffffff8);	# num / 8
    444 
    445 	&jz(&label("aw_finish"));
    446 
    447 	&set_label("aw_loop",0);
    448 	for ($i=0; $i<8; $i++)
    449 		{
    450 		&comment("Round $i");
    451 
    452 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    453 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    454 		&add($tmp1,$c);
    455 		 &mov($c,0);
    456 		&adc($c,$c);
    457 		 &add($tmp1,$tmp2);
    458 		&adc($c,0);
    459 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    460 		}
    461 
    462 	&comment("");
    463 	&add($a,32);
    464 	 &add($b,32);
    465 	&add($r,32);
    466 	 &sub($num,8);
    467 	&jnz(&label("aw_loop"));
    468 
    469 	&set_label("aw_finish",0);
    470 	&mov($num,&wparam(3));	# get num
    471 	&and($num,7);
    472 	 &jz(&label("aw_end"));
    473 
    474 	for ($i=0; $i<7; $i++)
    475 		{
    476 		&comment("Tail Round $i");
    477 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    478 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    479 		&add($tmp1,$c);
    480 		 &mov($c,0);
    481 		&adc($c,$c);
    482 		 &add($tmp1,$tmp2);
    483 		&adc($c,0);
    484 		 &dec($num) if ($i != 6);
    485 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    486 		 &jz(&label("aw_end")) if ($i != 6);
    487 		}
    488 	&set_label("aw_end",0);
    489 
    490 #	&mov("eax",$c);		# $c is "eax"
    491 
    492 	&function_end($name);
    493 	}
    494 
    495 sub bn_sub_words
    496 	{
    497 	local($name)=@_;
    498 
    499 	&function_begin($name,"");
    500 
    501 	&comment("");
    502 	$a="esi";
    503 	$b="edi";
    504 	$c="eax";
    505 	$r="ebx";
    506 	$tmp1="ecx";
    507 	$tmp2="edx";
    508 	$num="ebp";
    509 
    510 	&mov($r,&wparam(0));	# get r
    511 	 &mov($a,&wparam(1));	# get a
    512 	&mov($b,&wparam(2));	# get b
    513 	 &mov($num,&wparam(3));	# get num
    514 	&xor($c,$c);		# clear carry
    515 	 &and($num,0xfffffff8);	# num / 8
    516 
    517 	&jz(&label("aw_finish"));
    518 
    519 	&set_label("aw_loop",0);
    520 	for ($i=0; $i<8; $i++)
    521 		{
    522 		&comment("Round $i");
    523 
    524 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    525 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    526 		&sub($tmp1,$c);
    527 		 &mov($c,0);
    528 		&adc($c,$c);
    529 		 &sub($tmp1,$tmp2);
    530 		&adc($c,0);
    531 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    532 		}
    533 
    534 	&comment("");
    535 	&add($a,32);
    536 	 &add($b,32);
    537 	&add($r,32);
    538 	 &sub($num,8);
    539 	&jnz(&label("aw_loop"));
    540 
    541 	&set_label("aw_finish",0);
    542 	&mov($num,&wparam(3));	# get num
    543 	&and($num,7);
    544 	 &jz(&label("aw_end"));
    545 
    546 	for ($i=0; $i<7; $i++)
    547 		{
    548 		&comment("Tail Round $i");
    549 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    550 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    551 		&sub($tmp1,$c);
    552 		 &mov($c,0);
    553 		&adc($c,$c);
    554 		 &sub($tmp1,$tmp2);
    555 		&adc($c,0);
    556 		 &dec($num) if ($i != 6);
    557 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    558 		 &jz(&label("aw_end")) if ($i != 6);
    559 		}
    560 	&set_label("aw_end",0);
    561 
    562 #	&mov("eax",$c);		# $c is "eax"
    563 
    564 	&function_end($name);
    565 	}
    566 
    567 sub bn_sub_part_words
    568 	{
    569 	local($name)=@_;
    570 
    571 	&function_begin($name,"");
    572 
    573 	&comment("");
    574 	$a="esi";
    575 	$b="edi";
    576 	$c="eax";
    577 	$r="ebx";
    578 	$tmp1="ecx";
    579 	$tmp2="edx";
    580 	$num="ebp";
    581 
    582 	&mov($r,&wparam(0));	# get r
    583 	 &mov($a,&wparam(1));	# get a
    584 	&mov($b,&wparam(2));	# get b
    585 	 &mov($num,&wparam(3));	# get num
    586 	&xor($c,$c);		# clear carry
    587 	 &and($num,0xfffffff8);	# num / 8
    588 
    589 	&jz(&label("aw_finish"));
    590 
    591 	&set_label("aw_loop",0);
    592 	for ($i=0; $i<8; $i++)
    593 		{
    594 		&comment("Round $i");
    595 
    596 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
    597 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    598 		&sub($tmp1,$c);
    599 		 &mov($c,0);
    600 		&adc($c,$c);
    601 		 &sub($tmp1,$tmp2);
    602 		&adc($c,0);
    603 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    604 		}
    605 
    606 	&comment("");
    607 	&add($a,32);
    608 	 &add($b,32);
    609 	&add($r,32);
    610 	 &sub($num,8);
    611 	&jnz(&label("aw_loop"));
    612 
    613 	&set_label("aw_finish",0);
    614 	&mov($num,&wparam(3));	# get num
    615 	&and($num,7);
    616 	 &jz(&label("aw_end"));
    617 
    618 	for ($i=0; $i<7; $i++)
    619 		{
    620 		&comment("Tail Round $i");
    621 		&mov($tmp1,&DWP(0,$a,"",0));	# *a
    622 		 &mov($tmp2,&DWP(0,$b,"",0));# *b
    623 		&sub($tmp1,$c);
    624 		 &mov($c,0);
    625 		&adc($c,$c);
    626 		 &sub($tmp1,$tmp2);
    627 		&adc($c,0);
    628 		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
    629 		&add($a, 4);
    630 		&add($b, 4);
    631 		&add($r, 4);
    632 		 &dec($num) if ($i != 6);
    633 		 &jz(&label("aw_end")) if ($i != 6);
    634 		}
    635 	&set_label("aw_end",0);
    636 
    637 	&cmp(&wparam(4),0);
    638 	&je(&label("pw_end"));
    639 
    640 	&mov($num,&wparam(4));	# get dl
    641 	&cmp($num,0);
    642 	&je(&label("pw_end"));
    643 	&jge(&label("pw_pos"));
    644 
    645 	&comment("pw_neg");
    646 	&mov($tmp2,0);
    647 	&sub($tmp2,$num);
    648 	&mov($num,$tmp2);
    649 	&and($num,0xfffffff8);	# num / 8
    650 	&jz(&label("pw_neg_finish"));
    651 
    652 	&set_label("pw_neg_loop",0);
    653 	for ($i=0; $i<8; $i++)
    654 	{
    655 	    &comment("dl<0 Round $i");
    656 
    657 	    &mov($tmp1,0);
    658 	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
    659 	    &sub($tmp1,$c);
    660 	    &mov($c,0);
    661 	    &adc($c,$c);
    662 	    &sub($tmp1,$tmp2);
    663 	    &adc($c,0);
    664 	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
    665 	}
    666 	    
    667 	&comment("");
    668 	&add($b,32);
    669 	&add($r,32);
    670 	&sub($num,8);
    671 	&jnz(&label("pw_neg_loop"));
    672 	    
    673 	&set_label("pw_neg_finish",0);
    674 	&mov($tmp2,&wparam(4));	# get dl
    675 	&mov($num,0);
    676 	&sub($num,$tmp2);
    677 	&and($num,7);
    678 	&jz(&label("pw_end"));
    679 	    
    680 	for ($i=0; $i<7; $i++)
    681 	{
    682 	    &comment("dl<0 Tail Round $i");
    683 	    &mov($tmp1,0);
    684 	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
    685 	    &sub($tmp1,$c);
    686 	    &mov($c,0);
    687 	    &adc($c,$c);
    688 	    &sub($tmp1,$tmp2);
    689 	    &adc($c,0);
    690 	    &dec($num) if ($i != 6);
    691 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    692 	    &jz(&label("pw_end")) if ($i != 6);
    693 	}
    694 
    695 	&jmp(&label("pw_end"));
    696 	
    697 	&set_label("pw_pos",0);
    698 	
    699 	&and($num,0xfffffff8);	# num / 8
    700 	&jz(&label("pw_pos_finish"));
    701 
    702 	&set_label("pw_pos_loop",0);
    703 
    704 	for ($i=0; $i<8; $i++)
    705 	{
    706 	    &comment("dl>0 Round $i");
    707 
    708 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    709 	    &sub($tmp1,$c);
    710 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    711 	    &jnc(&label("pw_nc".$i));
    712 	}
    713 	    
    714 	&comment("");
    715 	&add($a,32);
    716 	&add($r,32);
    717 	&sub($num,8);
    718 	&jnz(&label("pw_pos_loop"));
    719 	    
    720 	&set_label("pw_pos_finish",0);
    721 	&mov($num,&wparam(4));	# get dl
    722 	&and($num,7);
    723 	&jz(&label("pw_end"));
    724 	    
    725 	for ($i=0; $i<7; $i++)
    726 	{
    727 	    &comment("dl>0 Tail Round $i");
    728 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    729 	    &sub($tmp1,$c);
    730 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    731 	    &jnc(&label("pw_tail_nc".$i));
    732 	    &dec($num) if ($i != 6);
    733 	    &jz(&label("pw_end")) if ($i != 6);
    734 	}
    735 	&mov($c,1);
    736 	&jmp(&label("pw_end"));
    737 
    738 	&set_label("pw_nc_loop",0);
    739 	for ($i=0; $i<8; $i++)
    740 	{
    741 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    742 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    743 	    &set_label("pw_nc".$i,0);
    744 	}
    745 	    
    746 	&comment("");
    747 	&add($a,32);
    748 	&add($r,32);
    749 	&sub($num,8);
    750 	&jnz(&label("pw_nc_loop"));
    751 	    
    752 	&mov($num,&wparam(4));	# get dl
    753 	&and($num,7);
    754 	&jz(&label("pw_nc_end"));
    755 	    
    756 	for ($i=0; $i<7; $i++)
    757 	{
    758 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
    759 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
    760 	    &set_label("pw_tail_nc".$i,0);
    761 	    &dec($num) if ($i != 6);
    762 	    &jz(&label("pw_nc_end")) if ($i != 6);
    763 	}
    764 
    765 	&set_label("pw_nc_end",0);
    766 	&mov($c,0);
    767 
    768 	&set_label("pw_end",0);
    769 
    770 #	&mov("eax",$c);		# $c is "eax"
    771 
    772 	&function_end($name);
    773 	}
    774 
    775