#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# Command line: [flavour] output.  A first argument containing a dot is
# taken to be the output file name, in which case no flavour is passed on.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# The Win64 exception-handling tables are emitted only for these targets.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed below is piped through the translator.  A failure to
# start it must abort the run instead of being silently ignored.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$xi="%eax";
$t0="%ebx";
$t1="%ecx";
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

# Six registers take turns being the five working variables plus the
# destination of the current round; the list is rotated after every round.
@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE($func): emit the entry sequence of function $func.
# Pushes %rbx, %rbp and %r12, copies the three arguments into
# $ctx/$inp/$num, reserves 8+16*4 bytes and aligns %rsp down to 64 bytes.
# The %rsp value taken right after the pushes is stored at 16*4(%rsp),
# directly above the 16-word schedule buffer at 0..60(%rsp).  Finally the
# five 32-bit state words are loaded from $ctx into $A..$E.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# EPILOGUE($func): emit the exit sequence matching PROLOGUE.
# Reloads the stack pointer saved at 16*4(%rsp), restores %r12, %rbp and
# %rbx from it, pops the three slots and returns.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
___
}

# BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]): emit round $i of rounds 0..19.
#   $f = 0x5a827999 + $xi + $e + rol($a,5) + ((($c ^ $d) & $b) ^ $d)
#   $b = rol($b,30); $e is used as scratch once it has been consumed.
# For $i<15 the next input word is fetched from $inp (byte-swapped unless
# $host is defined) and saved in the schedule buffer; round 0 additionally
# fetches word 0.  For $i>=15 the next schedule word is formed by XOR-ing
# buffer slots j, j+2, j+8 and j+13 (mod 16, j=$i+1) and rotating left by 1.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	`"bswap	$xi"	if(!defined($host))`
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	`"bswap	$xi"	if(!defined($host))`
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f): emit round $i of rounds 20..39 and,
# with the other constant, rounds 60..79 (selected by $i<40).
#   $f = K + $xi + $e + rol($a,5) + ($b ^ $c ^ $d);  $b = rol($b,30)
# The next schedule word is computed for $i<79 but written back to the
# buffer only for $i<76: the words produced by rounds 76..78 are consumed
# from $xi and never re-read from memory.  Round 79 computes no new word.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f): emit round $i of rounds 40..59.
#   $f = 0x8f1bbcdc + $xi + $e + rol($a,5) + (($b & $c) | (($b | $c) & $d))
#   $b = rol($b,30)
# The next schedule word is computed and stored as in BODY_00_19 ($i>=15).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

$code=".text\n";

&PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
# Unroll all 80 rounds.  Rotating @V right by one after each round makes
# the register just written ($f) the next round's $a.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of the six-entry list leave the working variables in
# ($E,$T,$A,$B,$C).  They are added into the context and then shuffled
# back into ($A,$B,$C,$D,$E) for the next block.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
&EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Between .Lprologue and .Lepilogue the handler recovers the caller's
# stack pointer from the slot PROLOGUE wrote at 16*4(%rsp) and restores
# %rbx, %rbp and %r12 in the CONTEXT record; outside that range it starts
# from context->Rax instead.  It then copies the CONTEXT record into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
# NOTE(review): the .LSEH_begin_/.LSEH_end_ labels referenced from .pdata
# are not defined in this file; presumably x86_64-xlate.pl emits them --
# confirm against the translator.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	24(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order

.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
}

####################################################################

# Evaluate every `...` span left in the generated text as a Perl
# expression (offset arithmetic, conditional bswap) and emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a piped handle waits for the translator; close() is false when a
# flushed write failed or the child exited non-zero ($! is 0 in that case).
close STDOUT
	or die $! ? "error closing pipe to $xlate: $!"
		  : "$xlate exited with status $?";