#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute terms
# it's 22.6 cycles per byte, which is a disappointing result. Technical
# writers asserted that the 3-way S4 pipeline can sustain multiple NEON
# instructions per cycle, but dual NEON issue could not be observed;
# see http://www.openssl.org/~appro/Snapdragon-S4.html for further
# details. On a side note, Cortex-A15 processes one byte in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below
# as 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
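# For illustration only: rotr64_ref below is a hypothetical helper, not
# called anywhere by the generator, sketching how ROTR64(x,r) decomposes
# over the 32-bit halves (hi,lo) of x for 1<=r<=31. This is the pattern
# spelled out by the LO/HI annotations in BODY_00_15; for 33<=r<=63,
# rotate by r-32 with the halves swapped.
sub rotr64_ref {
	my ($hi,$lo,$r) = @_;			# (hi,lo) halves, 1<=$r<=31
	my $m = 0xffffffff;
	return ((($hi>>$r)|($lo<<(32-$r)))&$m,	# high word of the result
		(($lo>>$r)|($hi<<(32-$r)))&$m);	# low word of the result
}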
$hi="HI";
$lo="LO";
# ====================================================================

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
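# Every 64-bit addition above is an "adds" on the low words followed by
# an "adc" on the high words. A minimal sketch of that carry chain over
# 32-bit unsigned halves (add64_ref is a hypothetical helper, not called
# by the generator):
sub add64_ref {
	my ($ahi,$alo,$bhi,$blo) = @_;
	my $lo = ($alo+$blo)&0xffffffff;	# adds: low words
	my $carry = ($lo<$alo) ? 1 : 0;		# models the CPSR carry flag
	my $hi = ($ahi+$bhi+$carry)&0xffffffff;	# adc: high words plus carry
	return ($hi,$lo);
}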
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
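# NEON_16_79 below evaluates the message schedule two rounds at a time
# by viewing @X as 128-bit q registers. For reference, the recurrence it
# implements per 64-bit lane is
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# with sigma0(x) = ROTR(x,1) ^ ROTR(x,8) ^ (x>>7) and
#      sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x>>6).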
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT;	# enforce flush
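# Example invocation (file names are illustrative; flavour names follow
# the usual perlasm conventions, e.g. "linux32", and are handed to
# arm-xlate.pl, while "void" writes the untranslated output directly):
#
#	perl sha512-armv4.pl linux32 sha512-armv4.S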