#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and -arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII		16%-30%
# P4		12%-12%
# Opteron	18%-40%
# Core2		19%-44%
# Atom		38%-64%
# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that the above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the
# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
# benchmark is more and more dominated by other subroutines, most
# notably by BN_GF2m_mod[_mul]_arr...

# Make the perlasm helper loadable from this script's own directory or
# from ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# A trailing "386" argument selects the integer-only build: the MMX
# helper and the run-time capability dispatch are then not generated.
$x86only = ($ARGV[$#ARGV] eq "386");
&asm_init($ARGV[0],$0,$x86only);

# The PCLMULQDQ path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# bn_GF2m_mul_2x2 below references OPENSSL_ia32cap_P whenever
# !$x86only, not just when SSE2 is enabled, so declare the symbol in
# both cases.
&external_label("OPENSSL_ia32cap_P") if ($sse2 || !$x86only);

# Register aliases shared by both 1x1 helpers: multiplicand in $a,
# multiplier in $b, and scratch for a, a<<1, a<<2 while the lookup
# table is being built.
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");

# MMX aliases. Note that @T[1] and $A both name mm2.
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");

# _mul_1x1_mmx: multiplies $a by $b as polynomials and leaves the
# product in $R (mm0).
#
# An 8-entry dword table is built at 0..7*4(%esp) holding every XOR
# combination of a1, a2 and a4 (a with its top two bits masked off, and
# that value shifted left by one and by two). $b is then consumed three
# bits at a time; each 3-bit window indexes the table and the entry is
# shifted into position and XORed into $R. Bits 30 and 31 of $a, which
# the masking dropped, are handled separately through $B30 and $B31.
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
	&sub	("esp",32+4);
	&mov	($a1,$a);
	&lea	($a2,&DWP(0,$a,$a));
	&and	($a1,0x3fffffff);
	&lea	($a4,&DWP(0,$a2,$a2));
	&mov	(&DWP(0*4,"esp"),0);
	&and	($a2,0x7fffffff);
	&movd	($A,$a);
	&movd	($B,$b);
	&mov	(&DWP(1*4,"esp"),$a1);	# a1
	&xor	($a1,$a2);		# a1^a2
	&pxor	($B31,$B31);
	&pxor	($B30,$B30);
	&mov	(&DWP(2*4,"esp"),$a2);	# a2
	&xor	($a2,$a4);		# a2^a4
	&mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	&pcmpgtd($B31,$A);		# broadcast 31st bit
	&paddd	($A,$A);		# $A<<=1
	&xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	&mov	(&DWP(4*4,"esp"),$a4);	# a4
	&xor	($a4,$a2);		# a2=a4^a2^a4
	&pand	($B31,$B);
	&pcmpgtd($B30,$A);		# broadcast 30th bit
	&mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	&xor	($a4,$a1);		# a1^a2^a4
	&psllq	($B31,31);
	&pand	($B30,$B);
	&mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&mov	(@i[0],0x7);
	&mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	&mov	($a4,@i[0]);		# table complete, $a4 now keeps the 0x7 mask
	&and	(@i[0],$b);
	&shr	($b,3);
	&mov	(@i[1],$a4);
	&psllq	($B30,30);
	&and	(@i[1],$b);
	&shr	($b,3);
	&movd	($R,&DWP(0,"esp",@i[0],4));
	&mov	(@i[0],$a4);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Windows 1..8. @i and @T are rotated after every step so that
	# consecutive steps alternate between the two index and the two
	# temporary registers.
	for($n=1;$n<9;$n++) {
		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],$a4);
		&psllq	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	($b,3);
		&pxor	($R,@T[1]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Last two windows (shifts by 27 and 30), interleaved with folding
	# in the bit-30 and bit-31 corrections.
	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
	&pxor	($R,$B30);
	&psllq	(@T[1],3*$n++);
	&pxor	($R,@T[1]);

	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
	&pxor	($R,$B31);
	&psllq	(@T[0],3*$n);
	&add	("esp",32+4);
	&pxor	($R,@T[0]);
	&ret	();
&function_end_B("_mul_1x1_mmx");
}

# Integer-only aliases: the 64-bit product is returned in $lo:$hi.
# These, like @T, reuse registers that held $a and the table-building
# scratch values.
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");

# _mul_1x1_ialu: same table-driven multiplication as _mul_1x1_mmx, but
# using general-purpose registers only. Every table entry is split
# into a low part (shifted left into $lo) and a high part (shifted
# right into $hi).
&function_begin_B("_mul_1x1_ialu");
	&sub	("esp",32+4);
	&mov	($a1,$a);
	&lea	($a2,&DWP(0,$a,$a));
	&lea	($a4,&DWP(0,"",$a,4));
	&and	($a1,0x3fffffff);
	&lea	(@i[1],&DWP(0,$lo,$lo));
	&sar	($lo,31);		# broadcast 31st bit
	&mov	(&DWP(0*4,"esp"),0);
	&and	($a2,0x7fffffff);
	&mov	(&DWP(1*4,"esp"),$a1);	# a1
	&xor	($a1,$a2);		# a1^a2
	&mov	(&DWP(2*4,"esp"),$a2);	# a2
	&xor	($a2,$a4);		# a2^a4
	&mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
	&xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
	&mov	(&DWP(4*4,"esp"),$a4);	# a4
	&xor	($a4,$a2);		# a2=a4^a2^a4
	&mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
	&xor	($a4,$a1);		# a1^a2^a4
	&sar	(@i[1],31);		# broadcast 30th bit
	&and	($lo,$b);
	&mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
	&and	(@i[1],$b);
	&mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
	&mov	($hi,$lo);
	&shl	($lo,31);
	&mov	(@T[0],@i[1]);
	&shr	($hi,1);

	&mov	(@i[0],0x7);
	&shl	(@i[1],30);
	&and	(@i[0],$b);
	&shr	(@T[0],2);
	&xor	($lo,@i[1]);

	&shr	($b,3);
	&mov	(@i[1],0x7);		# 5-byte instruction!?
	&and	(@i[1],$b);
	&shr	($b,3);
	&xor	($hi,@T[0]);
	&xor	($lo,&DWP(0,"esp",@i[0],4));
	&mov	(@i[0],0x7);
	&and	(@i[0],$b);
	&shr	($b,3);
	# Windows 1..8, with the same register rotation as in the MMX
	# helper.
	for($n=1;$n<9;$n++) {
		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
		&mov	(@i[1],0x7);
		&mov	(@T[0],@T[1]);
		&shl	(@T[1],3*$n);
		&and	(@i[1],$b);
		&shr	(@T[0],32-3*$n);
		&xor	($lo,@T[1]);
		&shr	($b,3);
		&xor	($hi,@T[0]);

		push(@i,shift(@i)); push(@T,shift(@T));
	}
	# Last two windows; the index registers double as temporaries for
	# the final one.
	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
	&mov	(@T[0],@T[1]);
	&shl	(@T[1],3*$n);
	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
	&shr	(@T[0],32-3*$n);	$n++;
	&mov	(@i[0],@i[1]);
	&xor	($lo,@T[1]);
	&shl	(@i[1],3*$n);
	&xor	($hi,@T[0]);
	&shr	(@i[0],32-3*$n);
	&xor	($lo,@i[1]);
	&xor	($hi,@i[0]);

	&add	("esp",32+4);
	&ret	();
&function_end_B("_mul_1x1_ialu");

# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
#
# Writes four words to r. Unless built for 386, the code path is picked
# at run time from OPENSSL_ia32cap_P: PCLMULQDQ (only if generated with
# SSE2 support), MMX, or plain integer. The MMX and integer paths use
# three 1x1 multiplications, Karatsuba-style:
# a1b1, a0b0 and (a0+a1)(b0+b1).
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
	&picmeup("edx","OPENSSL_ia32cap_P");
	&mov	("eax",&DWP(0,"edx"));
	&mov	("edx",&DWP(4,"edx"));
	&test	("eax",1<<23);		# check MMX bit
	&jz	(&label("ialu"));
if ($sse2) {
	&test	("eax",1<<24);		# check FXSR bit
	&jz	(&label("mmx"));
	&test	("edx",1<<1);		# check PCLMULQDQ bit
	&jz	(&label("mmx"));

	# Nothing has been pushed yet, so 8(%esp) is where the four
	# word arguments following r start.
	&movups		("xmm0",&QWP(8,"esp"));
	&shufps		("xmm0","xmm0",0b10110001);	# swap dwords within each 64-bit half
	&pclmulqdq	("xmm0","xmm0",1);		# high 64-bit half times low one
	&mov		("eax",&DWP(4,"esp"));		# r
	&movups		(&QWP(0,"eax"),"xmm0");
	&ret	();

&set_label("mmx",16);
}
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_mmx");	# a1b1
	&movq	("mm7",$R);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# a0b0
	&movq	("mm6",$R);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_mmx");	# (a0+a1)(b0+b1)
	&pxor	($R,"mm7");
	&mov	($a,&wparam(0));
	&pxor	($R,"mm6");		# (a0+a1)(b0+b1)-a1b1-a0b0

	# Split the middle term across the 64-bit boundary and merge it
	# with a0b0 (low half of the result) and a1b1 (high half).
	&movq	($A,$R);
	&psllq	($R,32);
	&pop	("edi");
	&psrlq	($A,32);
	&pop	("esi");
	&pxor	($R,"mm6");
	&pop	("ebx");
	&pxor	($A,"mm7");
	&movq	(&QWP(0,$a),$R);
	&pop	("ebp");
	&movq	(&QWP(8,$a),$A);
	&emms	();
	&ret	();
&set_label("ialu",16);
}
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");
	&stack_push(4+1);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&call	("_mul_1x1_ialu");	# a1b1
	&mov	(&DWP(8,"esp"),$lo);
	&mov	(&DWP(12,"esp"),$hi);

	&mov	($a,&wparam(2));
	&mov	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# a0b0
	&mov	(&DWP(0,"esp"),$lo);
	&mov	(&DWP(4,"esp"),$hi);

	&mov	($a,&wparam(1));
	&mov	($b,&wparam(3));
	&xor	($a,&wparam(2));
	&xor	($b,&wparam(4));
	&call	("_mul_1x1_ialu");	# (a0+a1)(b0+b1)

	# Reload a0b0 into @r[0..1] and a1b1 into @r[2..3], then fold the
	# middle term (still in $lo:$hi) into the two inner result words.
	&mov	("ebp",&wparam(0));
	@r=("ebx","ecx","edi","esi");
	&mov	(@r[0],&DWP(0,"esp"));
	&mov	(@r[1],&DWP(4,"esp"));
	&mov	(@r[2],&DWP(8,"esp"));
	&mov	(@r[3],&DWP(12,"esp"));

	&xor	($lo,$hi);
	&xor	($hi,@r[1]);
	&xor	($lo,@r[0]);
	&mov	(&DWP(0,"ebp"),@r[0]);
	&xor	($hi,@r[2]);
	&mov	(&DWP(12,"ebp"),@r[3]);
	&xor	($lo,@r[3]);
	&stack_pop(4+1);
	&xor	($hi,@r[3]);
	&pop	("edi");
	&xor	($lo,$hi);
	&pop	("esi");
	&mov	(&DWP(8,"ebp"),$hi);
	&pop	("ebx");
	&mov	(&DWP(4,"ebp"),$lo);
	&pop	("ebp");
	&ret	();
&function_end_B("bn_GF2m_mul_2x2");

&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();