#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
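	@ the low byte of the K256 word just loaded is compared with 0xf2:
	@ 0xc67178f2 is the last entry of the K256 table above, so equality
	@ marks the final pass through .Lrounds_16_xx (the flags set here
	@ are consumed by the ldreq/bne pair right after the loop)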
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
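	@ note: loads from the context pointer are interleaved with the adds
	@ below, presumably to hide load latency on dual-issue cores (cf. the
	@ July 2010 rescheduling note in the header of this file)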
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
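	# Dlo(@X[0]) now holds the freshly updated words, called X[16..17] in
	# the comments below; the surrounding shifts compute their sigma1 so
	# that X[2..3] can be completed as well.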
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
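	@ the four quads being stored at [sp] via $Xfer stage X[i]+K256[i];
	@ the round code (body_00_15) reloads them from the stack as its
	@ "h+=X[i]+K[i]" operand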
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_ARCH__>=7
.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,r3,#sha256_block_data_order-K256

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
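	@ the message schedule is complete by now, so the remaining rounds
	@ issue only sha256h/sha256h2 (no more sha256su0/su1)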
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm   OPENSSL_armcap_P,4,4
___

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush
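# Usage sketch (an assumption inferred from the argument handling at the top
# of this file, not an official interface): the first argument matching a
# file-name pattern becomes the output, e.g.
#
#	perl sha256-armv4.pl sha256-armv4.S
#
# The generated .S is then preprocessed/assembled with arm_arch.h from the
# OpenSSL tree on the include path.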