#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# XScale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
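#
# For illustration (an added example, not part of the original contract):
# with h[0] = 0x6a09e667f3bcc908 kept in native byte order, the 32-bit
# word at byte offset LO holds 0xf3bcc908 and the word at offset HI
# holds 0x6a09e667 on a little-endian build; on big-endian the two
# offsets swap. That is all the HI/LO parameters below encode.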
$hi="HI";
$lo="LO";
# ====================================================================

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
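
# For reference, the single SHA-512 round that BODY_00_15 computes on
# 32-bit register pairs, spelled out on whole 64-bit values. This sub is
# an illustrative sketch (it assumes a 64-bit perl) and is never called
# by the generator:
sub sha512_round_ref {
my ($a,$b,$c,$d,$e,$f,$g,$h,$K,$W)=@_;	# 64-bit scalars
my $M=0xffffffffffffffff;
my $rotr=sub { my ($x,$n)=@_; (($x>>$n)|($x<<(64-$n)))&$M; };
my $S1=$rotr->($e,14)^$rotr->($e,18)^$rotr->($e,41);	# Sigma1(e)
my $ch=($e&$f)^(~$e&$g);				# Ch(e,f,g)
my $T1=($h+$S1+$ch+$K+$W)&$M;
my $S0=$rotr->($a,28)^$rotr->($a,34)^$rotr->($a,39);	# Sigma0(a)
my $maj=($a&$b)^($a&$c)^($b&$c);			# Maj(a,b,c)
return (($T1+$S0+$maj)&$M,$a,$b,$c,($d+$T1)&$M,$e,$f,$g);
}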
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7 || defined(__APPLE__)
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
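
@ Note: the .word above stores the offset of OPENSSL_armcap_P relative to
@ .Lsha512_block_data_order rather than an absolute address; the entry code
@ below recovers the variable with ldr r12,[r3,r12], where r3 holds the
@ label's runtime address, keeping the capability probe position-independent.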

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1
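
	@ Note: the loops above manage termination without a round counter.
	@ BODY_00_15 masks the low byte of K[i] and compares it with a magic
	@ value (0x94, the low byte of K[15], for .L00_15; 0x17, the low byte
	@ of K[79], for .L16_79), setting bit 0 of the K512 pointer as an exit
	@ flag, which the surrounding tst/bic instructions test and clear.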

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
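
# A note on the NEON idioms above: each 64-bit rotation is assembled from
# vshr.u64 (x>>n) and vsli.64, which shifts x left by 64-n bits and
# inserts the result into the same register, together yielding ROTR64(x,n).
# Ch and Maj each collapse into one vbsl (bitwise select): with selector e,
# vbsl computes (e&f)|(~e&g) = Ch(e,f,g); with selector a^b it picks c
# where a and b differ and b where they agree, i.e. Maj(a,b,c).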
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
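
# The order of the two substitutions above matters: any "bx lr" already
# present in the integer code is first replaced by its raw opcode so the
# file still assembles with -march=armv4, and only then is "ret" (used in
# the ARMv7-only NEON path) rewritten into a genuine "bx lr".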

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT;			# enforce flush
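
# Typical invocations (illustrative; the recognized flavours are defined
# by arm-xlate.pl, not in this file):
#   perl sha512-armv4.pl linux32 sha512-armv4.S	# pipe through arm-xlate.pl
#   perl sha512-armv4.pl sha512-armv4.S		# raw output, no flavour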