Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # May 2011
     11 #
     12 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
     13 # in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
     14 # the time being... Except that it has three code paths: pure integer
     15 # code suitable for any x86 CPU, MMX code suitable for PIII and later
     16 # and PCLMULQDQ suitable for Westmere and later. Improvement varies
     17 # from one benchmark and -arch to another. Below are interval values
     18 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated
     19 # code:
     20 #
     21 # PIII		16%-30%
     22 # P4		12%-12%
     23 # Opteron	18%-40%
     24 # Core2		19%-44%
     25 # Atom		38%-64%
     26 # Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
     27 # Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
     28 #
     29 # Note that the above improvement coefficients are not coefficients for
     30 # bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the result
     31 # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the benchmark
     32 # is more and more dominated by other subroutines, most notably by
     33 # BN_GF2m_mod[_mul]_arr...
     34 
     35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
        # The line above captures this script's path up to and including the last
        # '/' or '\' into $dir; the two @INC entries below let "require" find
        # x86asm.pl either next to the script or in ../../perlasm relative to it.
     36 push(@INC,"${dir}","${dir}../../perlasm");
     37 require "x86asm.pl";
     38 
        # $x86only becomes true when the last command-line argument is exactly "386";
        # it is passed to asm_init and later suppresses the MMX/PCLMULQDQ code.
     39 &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
     40 
        # $sse2 is set when any argument contains -DOPENSSL_IA32_SSE2; it gates the
        # external_label declaration below and the PCLMULQDQ path in bn_GF2m_mul_2x2.
     41 $sse2=0;
     42 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
     43 
     44 &external_label("OPENSSL_ia32cap_P") if ($sse2);
     45 
        # Integer register names shared by both 1x1 multipliers: $a/$b are the two
        # 32-bit inputs, $a1/$a2/$a4 hold the multiples of $a used to build the table.
        # NOTE: $a and $b are also the package variables Perl's sort() uses.
     46 $a="eax";
     47 $b="ebx";
     48 ($a1,$a2,$a4)=("ecx","edx","ebp");
     49 
        # MMX register names for _mul_1x1_mmx: $R is the result accumulator, @T the
        # two temporaries. $A and $T[1] are both "mm2", so mm2 is shared between them.
        # @i names the two integer registers used as table indices.
     50 $R="mm0";
     51 @T=("mm1","mm2");
     52 ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
     53 @i=("esi","edi");
     54 
     55 					if (!$x86only) {
        # _mul_1x1_mmx: 32x32->64-bit polynomial (XOR-accumulating) multiply, MMX flavour.
        # In:  $a (eax) and $b (ebx).  Out: 64-bit product in $R (mm0).
        # Writes ecx, edx, ebp, esi, edi and mm1-mm5, and shifts ebx right as it
        # consumes $b three bits at a time. 32+4 bytes of stack are reserved; the
        # first 32 hold an 8-entry dword table for the duration of the call.
        # Only emitted when $x86only is false.
     56 &function_begin_B("_mul_1x1_mmx");
     57 	&sub	("esp",32+4);
        # Table build: tab[k] at (esp+4*k) is the XOR of a1/a2/a4 selected by the bits
        # of k, where a1 = a & 0x3fffffff, a2 = (a+a) & 0x7fffffff and a4 = (a+a)+(a+a).
        # Bits 30 and 31 of $a are therefore left out of the table and are handled
        # separately through $B30/$B31 below.
     58 	 &mov	($a1,$a);
     59 	 &lea	($a2,&DWP(0,$a,$a));
     60 	 &and	($a1,0x3fffffff);
     61 	 &lea	($a4,&DWP(0,$a2,$a2));
     62 	 &mov	(&DWP(0*4,"esp"),0);
     63 	 &and	($a2,0x7fffffff);
     64 	&movd	($A,$a);
     65 	&movd	($B,$b);
     66 	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
     67 	 &xor	($a1,$a2);		# a1^a2
     68 	&pxor	($B31,$B31);
     69 	&pxor	($B30,$B30);
     70 	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
     71 	 &xor	($a2,$a4);		# a2^a4
     72 	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
     73 	&pcmpgtd($B31,$A);		# broadcast 31st bit
     74 	&paddd	($A,$A);		# $A<<=1
     75 	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
     76 	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
     77 	 &xor	($a4,$a2);		# a2=a4^a2^a4
     78 	&pand	($B31,$B);
     79 	&pcmpgtd($B30,$A);		# broadcast 30th bit
     80 	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
     81 	 &xor	($a4,$a1);		# a1^a2^a4
     82 	&psllq	($B31,31);
     83 	&pand	($B30,$B);
     84 	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
     85 	&mov	(@i[0],0x7);
     86 	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
        # $a4 has just been stored as tab[7]; from here on it holds the constant 0x7
        # used to mask out each 3-bit window of $b.
     87 	 &mov	($a4,@i[0]);
     88 	&and	(@i[0],$b);
     89 	&shr	($b,3);
     90 	&mov	(@i[1],$a4);
     91 	&psllq	($B30,30);
     92 	&and	(@i[1],$b);
     93 	&shr	($b,3);
     94 	&movd	($R,&DWP(0,"esp",@i[0],4));
     95 	&mov	(@i[0],$a4);
     96 	&and	(@i[0],$b);
     97 	&shr	($b,3);
        # Windows 1..8 of $b: $R ^= tab[window] << (3*n). The push/shift pair rotates
        # @i and @T at generation time, so successive iterations alternate between
        # the two index registers and the two MMX temporaries.
     98 	for($n=1;$n<9;$n++) {
     99 		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
    100 		&mov	(@i[1],$a4);
    101 		&psllq	(@T[1],3*$n);
    102 		&and	(@i[1],$b);
    103 		&shr	($b,3);
    104 		&pxor	($R,@T[1]);
    105 
    106 		push(@i,shift(@i)); push(@T,shift(@T));
    107 	}
        # Last two windows ($n is 9, then 10 after the post-increment), interleaved
        # with folding in the bit-30 and bit-31 contributions and releasing the stack.
    108 	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
    109 	&pxor	($R,$B30);
    110 	&psllq	(@T[1],3*$n++);
    111 	&pxor	($R,@T[1]);
    112 
    113 	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
    114 	&pxor	($R,$B31);
    115 	&psllq	(@T[0],3*$n);
    116 	&add	("esp",32+4);
    117 	&pxor	($R,@T[0]);
    118 	&ret	();
    119 &function_end_B("_mul_1x1_mmx");
    120 					}
    121 
    122 ($lo,$hi)=("eax","edx");
    123 @T=("ecx","ebp");
    124 
        # _mul_1x1_ialu: 32x32->64-bit polynomial (XOR-accumulating) multiply using
        # integer registers only.
        # In:  $a (eax) and $b (ebx).  Out: product in $hi:$lo (edx:eax).
        # Writes ecx, ebp, esi, edi and shifts ebx right as it consumes $b.
        # Register names are aliased: $lo is $a (eax), $hi is $a2 (edx), and @T is
        # ($a1,$a4) = (ecx,ebp); each alias is first written only after the table
        # value previously held in that register has been stored.
    125 &function_begin_B("_mul_1x1_ialu");
    126 	&sub	("esp",32+4);
        # Table build, as in the MMX version: tab[k] at (esp+4*k) is the XOR of
        # a1 = a & 0x3fffffff, a2 = (a+a) & 0x7fffffff and a4 = a*4 selected by the
        # bits of k. Bits 31 and 30 of $a are turned into all-ones/zero masks in
        # $lo and @i[1] instead.
    127 	 &mov	($a1,$a);
    128 	 &lea	($a2,&DWP(0,$a,$a));
    129 	 &lea	($a4,&DWP(0,"",$a,4));
    130 	 &and	($a1,0x3fffffff);
    131 	&lea	(@i[1],&DWP(0,$lo,$lo));
    132 	&sar	($lo,31);		# broadcast 31st bit
    133 	 &mov	(&DWP(0*4,"esp"),0);
    134 	 &and	($a2,0x7fffffff);
    135 	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
    136 	 &xor	($a1,$a2);		# a1^a2
    137 	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
    138 	 &xor	($a2,$a4);		# a2^a4
    139 	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
    140 	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
    141 	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
    142 	 &xor	($a4,$a2);		# a2=a4^a2^a4
    143 	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
    144 	 &xor	($a4,$a1);		# a1^a2^a4
    145 	&sar	(@i[1],31);		# broadcast 30th bit
    146 	&and	($lo,$b);
    147 	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
    148 	&and	(@i[1],$b);
    149 	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
        # Seed $hi:$lo with the contributions of bits 31 and 30 of $a:
        # (masked b) << 31 and (masked b) << 30, each split across the two halves
        # with a shl/shr pair.
    150 	&mov	($hi,$lo);
    151 	&shl	($lo,31);
    152 	&mov	(@T[0],@i[1]);
    153 	&shr	($hi,1);
    154 
    155 	 &mov	(@i[0],0x7);
    156 	&shl	(@i[1],30);
    157 	 &and	(@i[0],$b);
    158 	&shr	(@T[0],2);
    159 	&xor	($lo,@i[1]);
    160 
    161 	&shr	($b,3);
    162 	&mov	(@i[1],0x7);		# 5-byte instruction!?
    163 	&and	(@i[1],$b);
    164 	&shr	($b,3);
    165 	 &xor	($hi,@T[0]);
    166 	&xor	($lo,&DWP(0,"esp",@i[0],4));
    167 	&mov	(@i[0],0x7);
    168 	&and	(@i[0],$b);
    169 	&shr	($b,3);
        # Windows 1..8 of $b: the table entry is shifted left by 3*n into $lo and the
        # bits that fall off the top (entry >> (32-3*n)) go into $hi. The push/shift
        # pair rotates @i and @T at generation time so iterations alternate registers.
    170 	for($n=1;$n<9;$n++) {
    171 		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
    172 		&mov	(@i[1],0x7);
    173 		&mov	(@T[0],@T[1]);
    174 		&shl	(@T[1],3*$n);
    175 		&and	(@i[1],$b);
    176 		&shr	(@T[0],32-3*$n);
    177 		&xor	($lo,@T[1]);
    178 		&shr	($b,3);
    179 		&xor	($hi,@T[0]);
    180 
    181 		push(@i,shift(@i)); push(@T,shift(@T));
    182 	}
        # Last two windows ($n is 9, then 10). No further indices are needed, so the
        # index registers @i double as the temporaries for the final window.
    183 	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
    184 	&mov	(@T[0],@T[1]);
    185 	&shl	(@T[1],3*$n);
    186 	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
    187 	&shr	(@T[0],32-3*$n);	$n++;
    188 	&mov	(@i[0],@i[1]);
    189 	&xor	($lo,@T[1]);
    190 	&shl	(@i[1],3*$n);
    191 	&xor	($hi,@T[0]);
    192 	&shr	(@i[0],32-3*$n);
    193 	&xor	($lo,@i[1]);
    194 	&xor	($hi,@i[0]);
    195 
    196 	&add	("esp",32+4);
    197 	&ret	();
    198 &function_end_B("_mul_1x1_ialu");
    199 
    200 # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
        # Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the four-word
        # result at r[0..3], least significant word first.
        # Unless generated for "386", the code path is picked at run time from
        # OPENSSL_ia32cap_P: PCLMULQDQ (only when -DOPENSSL_IA32_SSE2 was given),
        # otherwise MMX, otherwise plain integer code. The MMX and integer paths use
        # three 1x1 multiplies: a1*b1, a0*b0 and (a0^a1)*(b0^b1).
    201 &function_begin_B("bn_GF2m_mul_2x2");
    202 if (!$x86only) {
        # eax = first capability word, edx = second capability word.
    203 	&picmeup("edx","OPENSSL_ia32cap_P");
    204 	&mov	("eax",&DWP(0,"edx"));
    205 	&mov	("edx",&DWP(4,"edx"));
    206 	&test	("eax",1<<23);		# check MMX bit
    207 	&jz	(&label("ialu"));
    208 if ($sse2) {
    209 	&test	("eax",1<<24);		# check FXSR bit
    210 	&jz	(&label("mmx"));
    211 	&test	("edx",1<<1);		# check PCLMULQDQ bit
    212 	&jz	(&label("mmx"));
    213 
        # Nothing has been pushed yet, so 4(%esp) is r and 8(%esp) starts the 16 bytes
        # a1,a0,b1,b0. The shuffle swaps the dwords inside each 64-bit half (giving
        # a0,a1,b0,b1), and immediate 1 multiplies the register's high qword by its
        # low qword; the 128-bit product is stored straight to r.
    214 	&movups		("xmm0",&QWP(8,"esp"));
    215 	&shufps		("xmm0","xmm0",0b10110001);
    216 	&pclmulqdq	("xmm0","xmm0",1);
    217 	&mov		("eax",&DWP(4,"esp"));
    218 	&movups		(&QWP(0,"eax"),"xmm0");
    219 	&ret	();
    220 
    221 &set_label("mmx",16);
    222 }
        # MMX path: the four pushes save the registers that _mul_1x1_mmx writes.
    223 	&push	("ebp");
    224 	&push	("ebx");
    225 	&push	("esi");
    226 	&push	("edi");
    227 	&mov	($a,&wparam(1));
    228 	&mov	($b,&wparam(3));
    229 	&call	("_mul_1x1_mmx");	# a1b1
    230 	&movq	("mm7",$R);
    231 
    232 	&mov	($a,&wparam(2));
    233 	&mov	($b,&wparam(4));
    234 	&call	("_mul_1x1_mmx");	# a0b0
    235 	&movq	("mm6",$R);
    236 
    237 	&mov	($a,&wparam(1));
    238 	&mov	($b,&wparam(3));
    239 	&xor	($a,&wparam(2));
    240 	&xor	($b,&wparam(4));
    241 	&call	("_mul_1x1_mmx");	# (a0+a1)(b0+b1)
    242 	&pxor	($R,"mm7");
    243 	&mov	($a,&wparam(0));
    244 	&pxor	($R,"mm6");		# (a0+a1)(b0+b1)-a1b1-a0b0
    245 
        # Split the middle term: its low half shifted up by 32 is folded into a0b0
        # (stored at r[0..1]), its high half shifted down by 32 is folded into a1b1
        # (stored at r[2..3]). Register restores are interleaved with the MMX ops.
    246 	&movq	($A,$R);
    247 	&psllq	($R,32);
    248 	&pop	("edi");
    249 	&psrlq	($A,32);
    250 	&pop	("esi");
    251 	&pxor	($R,"mm6");
    252 	&pop	("ebx");
    253 	&pxor	($A,"mm7");
    254 	&movq	(&QWP(0,$a),$R);
    255 	&pop	("ebp");
    256 	&movq	(&QWP(8,$a),$A);
    257 	&emms	();
    258 	&ret	();
    259 &set_label("ialu",16);
    260 }
        # Integer path (the only path when $x86only is set).
    261 	&push	("ebp");
    262 	&push	("ebx");
    263 	&push	("esi");
    264 	&push	("edi");
        # NOTE(review): stack_push is defined in x86asm.pl; presumably it reserves
        # 4+1 dwords of scratch and keeps wparam() offsets in step - confirm there.
        # Only offsets 0..12 from esp are used below.
    265 	&stack_push(4+1);
    266 
    267 	&mov	($a,&wparam(1));
    268 	&mov	($b,&wparam(3));
    269 	&call	("_mul_1x1_ialu");	# a1b1
    270 	&mov	(&DWP(8,"esp"),$lo);
    271 	&mov	(&DWP(12,"esp"),$hi);
    272 
    273 	&mov	($a,&wparam(2));
    274 	&mov	($b,&wparam(4));
    275 	&call	("_mul_1x1_ialu");	# a0b0
    276 	&mov	(&DWP(0,"esp"),$lo);
    277 	&mov	(&DWP(4,"esp"),$hi);
    278 
    279 	&mov	($a,&wparam(1));
    280 	&mov	($b,&wparam(3));
    281 	&xor	($a,&wparam(2));
    282 	&xor	($b,&wparam(4));
    283 	&call	("_mul_1x1_ialu");	# (a0+a1)(b0+b1)
    284 
        # @r[0..3] = a0b0.lo, a0b0.hi, a1b1.lo, a1b1.hi; $hi:$lo = middle product m.
        # The xor sequence below leaves
        #   $lo = m.lo ^ @r[0] ^ @r[1] ^ @r[2]   -> stored to r[1]
        #   $hi = m.hi ^ @r[1] ^ @r[2] ^ @r[3]   -> stored to r[2]
        # while @r[0] and @r[3] are stored unchanged as r[0] and r[3].
    285 	&mov	("ebp",&wparam(0));
    286 		 @r=("ebx","ecx","edi","esi");
    287 	&mov	(@r[0],&DWP(0,"esp"));
    288 	&mov	(@r[1],&DWP(4,"esp"));
    289 	&mov	(@r[2],&DWP(8,"esp"));
    290 	&mov	(@r[3],&DWP(12,"esp"));
    291 
    292 	&xor	($lo,$hi);
    293 	&xor	($hi,@r[1]);
    294 	&xor	($lo,@r[0]);
    295 	&mov	(&DWP(0,"ebp"),@r[0]);
    296 	&xor	($hi,@r[2]);
    297 	&mov	(&DWP(12,"ebp"),@r[3]);
    298 	&xor	($lo,@r[3]);
    299 	&stack_pop(4+1);
    300 	&xor	($hi,@r[3]);
    301 	&pop	("edi");
    302 	&xor	($lo,$hi);
    303 	&pop	("esi");
    304 	&mov	(&DWP(8,"ebp"),$hi);
    305 	&pop	("ebx");
    306 	&mov	(&DWP(4,"ebp"),$lo);
    307 	&pop	("ebp");
    308 	&ret	();
    309 &function_end_B("bn_GF2m_mul_2x2");
    310 
    311 &asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
        # The call above passes an identification string to asciz.
        # NOTE(review): asciz and asm_finish come from x86asm.pl (not visible here);
        # presumably the string is embedded in the generated assembly and asm_finish
        # finalizes the output - confirm against x86asm.pl.
    312 
    313 &asm_finish();
    314