Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # SHA256 block transform for x86. September 2007.
     11 #
     12 # Performance in clock cycles per processed byte (less is better):
     13 #
     14 #		Pentium	PIII	P4	AMD K8	Core2
     15 # gcc		46	36	41	27	26
     16 # icc		57	33	38	25	23	
     17 # x86 asm	40	30	33	20	18
     18 # x86_64 asm(*)	-	-	21	16	16
     19 #
     20 # (*) x86_64 assembler performance is presented for reference
     21 #     purposes.
     22 #
     23 # Performance improvement over compiler generated code varies from
     24 # 10% to 40% [see above]. Not very impressive on some -archs, but
     25 # it's 5 times smaller and optimizes the amount of writes.
     26 
     27 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of this script's path
     28 push(@INC,"${dir}","${dir}../../perlasm");	# both directories are searched by the require below
     29 require "x86asm.pl";
     30 # NOTE(review): the 2nd argument names sha512-586.pl although this module generates sha256_block_data_order - presumably a copy-paste leftover; confirm how asm_init uses it.
     31 &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
     32 # Working registers, followed by esp-relative operands for the eight state words A..H.
     33 $A="eax";
     34 $E="edx";
     35 $T="ebx";
     36 $Aoff=&DWP(0,"esp");
     37 $Boff=&DWP(4,"esp");
     38 $Coff=&DWP(8,"esp");
     39 $Doff=&DWP(12,"esp");
     40 $Eoff=&DWP(16,"esp");
     41 $Foff=&DWP(20,"esp");
     42 $Goff=&DWP(24,"esp");
     43 $Hoff=&DWP(28,"esp");
     44 $Xoff=&DWP(32,"esp");	# first dword above the A..H slots
     45 $K256="ebp";	# register used as pointer into the K256 table
     46 
     47 sub BODY_00_15() {	# generates one round; a true argument selects the rounds 16..63 variant (the &-calls used in this file bypass the empty prototype)
     48     my $in_16_63=shift;
     49 	# Sigma1(e): the chained rotates below add up to (e ror 25)^(e ror 11)^(e ror 6)
     50 	&mov	("ecx",$E);
     51 	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
     52 	&ror	("ecx",25-11);
     53 	 &mov	("esi",$Foff);
     54 	&xor	("ecx",$E);
     55 	&ror	("ecx",11-6);
     56 	 &mov	(&DWP(4*(8+15),"esp"),$T)	if ($in_16_63);	# save X[0]
     57 	&xor	("ecx",$E);
     58 	&ror	("ecx",6);	# Sigma1(e)
     59 	 &mov	("edi",$Goff);
     60 	&add	($T,"ecx");	# T += Sigma1(e)
     61 	# Ch(e,f,g) is formed as ((f ^ g) & e) ^ g with esi=f, edi=g
     62 	&xor	("esi","edi");
     63 	 &mov	($Eoff,$E);	# modulo-scheduled
     64 	 &mov	("ecx",$A);
     65 	&and	("esi",$E);
     66 	 &mov	($E,$Doff);	# e becomes d, which is e in next iteration
     67 	&xor	("esi","edi");	# Ch(e,f,g)
     68 	 &mov	("edi",$A);
     69 	&add	($T,"esi");	# T += Ch(e,f,g)
     70 	# Sigma0(a): the chained rotates below add up to (a ror 22)^(a ror 13)^(a ror 2)
     71 	&ror	("ecx",22-13);
     72 	 &add	($T,$Hoff);	# T += h
     73 	&xor	("ecx",$A);
     74 	&ror	("ecx",13-2);
     75 	 &mov	("esi",$Boff);
     76 	&xor	("ecx",$A);
     77 	&ror	("ecx",2);	# Sigma0(a)
     78 	 &add	($E,$T);	# d += T
     79 	 &mov	("edi",$Coff);
     80 
     81 	&add	($T,"ecx");	# T += Sigma0(a)
     82 	 &mov	($Aoff,$A);	# modulo-scheduled
     83 	# Maj(a,b,c) is formed as ((a | b) & c) | (a & b) with esi=b, edi=c
     84 	&mov	("ecx",$A);
     85 	 &sub	("esp",4);	# slide the frame one dword: every stored word is addressed under the next name in the following round
     86 	&or	($A,"esi");	# a becomes h, which is a in next iteration
     87 	&and	("ecx","esi");
     88 	&and	($A,"edi");
     89 	 &mov	("esi",&DWP(0,$K256));
     90 	&or	($A,"ecx");	# h=Maj(a,b,c)
     91 	# esi keeps this round's K256 word after the pointer advances; the callers compare it to detect the last round
     92 	&add	($K256,4);
     93 	&add	($A,$T);	# h += T
     94 	 &mov	($T,&DWP(4*(8+15+16-1),"esp"))	if ($in_16_63);	# preload T
     95 	&add	($E,"esi");	# d += K256[i]
     96 	&add	($A,"esi");	# h += K256[i]
     97 }
     98 
     99 &function_begin("sha256_block_data_order");
    100 	&mov	("esi",wparam(0));	# ctx
    101 	&mov	("edi",wparam(1));	# inp
    102 	&mov	("eax",wparam(2));	# num
    103 	&mov	("ebx","esp");		# saved sp
    104 	# compute the run-time address of K256 relative to pic_point
    105 	&call	(&label("pic_point"));	# make it PIC!
    106 &set_label("pic_point");
    107 	&blindpop($K256);	# presumably pops the return address pushed by the call, i.e. pic_point - see x86asm.pl
    108 	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
    109 	# reserve 16 bytes and align esp down to a multiple of 64
    110 	&sub	("esp",16);
    111 	&and	("esp",-64);
    112 	# eax = inp + (num << 6), i.e. the end of the input
    113 	&shl	("eax",6);
    114 	&add	("eax","edi");
    115 	&mov	(&DWP(0,"esp"),"esi");	# ctx
    116 	&mov	(&DWP(4,"esp"),"edi");	# inp
    117 	&mov	(&DWP(8,"esp"),"eax");	# inp+num*64
    118 	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
    119 
    120 &set_label("loop",16);
    121     # copy input block to stack reversing byte and dword order
    122     for($i=0;$i<4;$i++) {
    123 	&mov	("eax",&DWP($i*16+0,"edi"));
    124 	&mov	("ebx",&DWP($i*16+4,"edi"));
    125 	&mov	("ecx",&DWP($i*16+8,"edi"));
    126 	&mov	("edx",&DWP($i*16+12,"edi"));
    127 	&bswap	("eax");
    128 	&bswap	("ebx");
    129 	&bswap	("ecx");
    130 	&bswap	("edx");
    131 	&push	("eax");
    132 	&push	("ebx");
    133 	&push	("ecx");
    134 	&push	("edx");
    135     }
    136 	&add	("edi",64);	# advance inp past the 64 bytes just copied
    137 	&sub	("esp",4*8);		# place for A,B,C,D,E,F,G,H
    138 	&mov	(&DWP(4*(8+16)+4,"esp"),"edi");	# store the advanced inp back into its slot, now above 8 state and 16 input dwords
    139 	# A and E stay in registers ($A, $E); BODY_00_15 stores them to the frame itself
    140 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
    141 	&mov	($A,&DWP(0,"esi"));
    142 	&mov	("ebx",&DWP(4,"esi"));
    143 	&mov	("ecx",&DWP(8,"esi"));
    144 	&mov	("edi",&DWP(12,"esi"));
    145 	# &mov	($Aoff,$A);
    146 	&mov	($Boff,"ebx");
    147 	&mov	($Coff,"ecx");
    148 	&mov	($Doff,"edi");
    149 	&mov	($E,&DWP(16,"esi"));	
    150 	&mov	("ebx",&DWP(20,"esi"));
    151 	&mov	("ecx",&DWP(24,"esi"));
    152 	&mov	("edi",&DWP(28,"esi"));
    153 	# &mov	($Eoff,$E);
    154 	&mov	($Foff,"ebx");
    155 	&mov	($Goff,"ecx");
    156 	&mov	($Hoff,"edi");
    157 
    158 &set_label("00_15",16);
    159 	&mov	($T,&DWP(4*(8+15),"esp"));	# T = next input word (the frame slides 4 bytes per round, so the offset stays fixed)
    160 
    161 	&BODY_00_15();
    162 	# esi holds the K256 word just used; 0xc19bf174 is the 16th table entry
    163 	&cmp	("esi",0xc19bf174);
    164 	&jne	(&label("00_15"));
    165 	# rounds 16..63: extend the message schedule, then run the round body
    166 	&mov	($T,&DWP(4*(8+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
    167 &set_label("16_63",16);
    168 	&mov	("esi",$T);	# sigma0 = (x ror 18)^(x ror 7)^(x >> 3), built with chained rotates
    169 	 &mov	("ecx",&DWP(4*(8+15+16-14),"esp"));	# X[-2]
    170 	&ror	("esi",18-7);
    171 	 &mov	("edi","ecx");
    172 	&xor	("esi",$T);
    173 	&ror	("esi",7);
    174 	&shr	($T,3);
    175 	# sigma1 = (x ror 19)^(x ror 17)^(x >> 10), accumulated in edi
    176 	&ror	("edi",19-17);
    177 	 &xor	($T,"esi");			# T = sigma0(X[-15])
    178 	&xor	("edi","ecx");
    179 	&ror	("edi",17);
    180 	&shr	("ecx",10);
    181 	 &add	($T,&DWP(4*(8+15+16),"esp"));	# T += X[-16]
    182 	&xor	("edi","ecx");			# sigma1(X[-2])
    183 
    184 	 &add	($T,&DWP(4*(8+15+16-9),"esp"));	# T += X[-7]
    185 	# &add	($T,"edi");			# T += sigma1(X[-2])
    186 	# &mov	(&DWP(4*(8+15),"esp"),$T);	# save X[0]
    187 	# the two steps commented out above are performed inside BODY_00_15(1)
    188 	&BODY_00_15(1);
    189 	# 0xc67178f2 is the last (64th) K256 entry
    190 	&cmp	("esi",0xc67178f2);
    191 	&jne	(&label("16_63"));
    192 
    193 	&mov	("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
    194 	# &mov	($A,$Aoff);
    195 	&mov	("ebx",$Boff);
    196 	&mov	("ecx",$Coff);
    197 	&mov	("edi",$Doff);
    198 	&add	($A,&DWP(0,"esi"));
    199 	&add	("ebx",&DWP(4,"esi"));
    200 	&add	("ecx",&DWP(8,"esi"));
    201 	&add	("edi",&DWP(12,"esi"));
    202 	&mov	(&DWP(0,"esi"),$A);
    203 	&mov	(&DWP(4,"esi"),"ebx");
    204 	&mov	(&DWP(8,"esi"),"ecx");
    205 	&mov	(&DWP(12,"esi"),"edi");
    206 	# &mov	($E,$Eoff);
    207 	&mov	("eax",$Foff);
    208 	&mov	("ebx",$Goff);
    209 	&mov	("ecx",$Hoff);
    210 	&mov	("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
    211 	&add	($E,&DWP(16,"esi"));
    212 	&add	("eax",&DWP(20,"esi"));
    213 	&add	("ebx",&DWP(24,"esi"));
    214 	&add	("ecx",&DWP(28,"esi"));
    215 	&mov	(&DWP(16,"esi"),$E);
    216 	&mov	(&DWP(20,"esi"),"eax");
    217 	&mov	(&DWP(24,"esi"),"ebx");
    218 	&mov	(&DWP(28,"esi"),"ecx");
    219 	# the 64 rounds moved esp down by 64 dwords, hence 8+16+64 dwords to release
    220 	&add	("esp",4*(8+16+64));		# destroy frame
    221 	&sub	($K256,4*64);			# rewind K
    222 	# 8(esp) holds the end-of-input pointer stored before the loop
    223 	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
    224 	&jb	(&label("loop"));
    225 
    226 	&mov	("esp",&DWP(12,"esp"));		# restore sp
    227 &function_end_A();	# presumably emits the epilogue and return; function_end_B after the table closes the function - see x86asm.pl
    228 
    229 &set_label("K256",64);	# Yes! I keep it in the code segment!
    230 	&data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);	# 64 round constants, four per line
    231 	&data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
    232 	&data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
    233 	&data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);	# last word ends the 00_15 loop
    234 	&data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
    235 	&data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
    236 	&data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
    237 	&data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
    238 	&data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
    239 	&data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
    240 	&data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
    241 	&data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
    242 	&data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
    243 	&data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
    244 	&data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
    245 	&data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);	# last word ends the 16_63 loop
    246 &function_end_B("sha256_block_data_order");
    247 &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
    248 
    249 &asm_finish();
    250