#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
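	@ (decomposition sketch: for a rotate count n<32 the LO/HI lines
	@ above follow from ROTR64(x,n).lo = lo>>n ^ hi<<(32-n) and
	@ ROTR64(x,n).hi = hi>>n ^ lo<<(32-n); for n>=32 the two halves
	@ swap roles, e.g. ROTR64(x,41) contributes hi>>9^lo<<23 to LO.)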
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8
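	@ (frame sketch: these 9*8 bytes hold the working-variable copies
	@ a..h used by BODY_00_15 plus the slot where each round stores its
	@ X[i]; every round then does "sub sp,sp,#8", so the 640 bytes
	@ accumulated over 80 rounds are released by "add sp,sp,#640" below.)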

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
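	@ (at this point T = sigma0(X[i-15]) + sigma1(X[i-2]) + X[i-7];
	@ the pair of adds below folds in X[i-16], completing the new
	@ message-schedule word X[i], which BODY_00_15 then consumes.)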
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
	vshr.u64	$t2,$e,#@Sigma1[2]
___
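# A note on the idiom that follows: each 64-bit rotate is built from a
# vshr.u64 giving x>>n (emitted just above, or interleaved into NEON_16_79
# for even rounds past 16) plus a vsli.64 that inserts x<<(64-n), so every
# ROTR(x,n) costs one shift-and-insert pair.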
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	vadd.i64	$T1,$K,$h
	veor		$Ch,$f,$g
	veor		$t0,$t1
	vand		$Ch,$e
	veor		$t0,$t2			@ Sigma1(e)
	veor		$Ch,$g			@ Ch(e,f,g)
	vadd.i64	$T1,$t0
	vshr.u64	$t0,$a,#@Sigma0[0]
	vadd.i64	$T1,$Ch
	vshr.u64	$t1,$a,#@Sigma0[1]
	vshr.u64	$t2,$a,#@Sigma0[2]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	vadd.i64	$T1,@X[$i%16]
	vorr		$Maj,$a,$c
	vand		$Ch,$a,$c
	veor		$h,$t0,$t1
	vand		$Maj,$b
	veor		$h,$t2			@ Sigma0(a)
	vorr		$Maj,$Ch		@ Maj(a,b,c)
	vadd.i64	$h,$T1
	vadd.i64	$d,$T1
	vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640		@ rewind K512
	bne	.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	bx	lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush