#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the caller
# is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
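# To illustrate: on a little-endian build a 64-bit h[i] stored at address p
# keeps its least significant word at p+0, so $lo resolves to offset 0 and
# $hi to offset 4; on big-endian the two words swap, which is exactly what
# the #ifdef __ARMEL__ definitions of LO, HI and WORD64 emit further down.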

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";
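# $ctx points at the eight 64-bit state words, $inp at the message and $len
# carries the block count (each block is 128 bytes); $len is converted to an
# end-of-input pointer on entry with "add $len,$inp,$len,lsl#7".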

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
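# $Aoff..$Hoff are sp-relative slots for the eight working variables (a and e
# are additionally cached in registers), and $Xoff marks the start of the
# 16-entry message schedule. BODY_00_15 lowers sp by 8 every round, so these
# fixed offsets keep tracking the rotating variables; the 80*8=640 bytes
# consumed per block are released again with "add sp,sp,#640".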

sub BODY_00_15() {
my $magic = shift;
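# $magic is the least significant byte of the last round constant processed
# by the enclosing loop: 0x94 for K[15] and 0x17 for K[79]. When it matches
# the low byte of K[i], bit 0 of $Ktbl is set ("orreq" below) and later
# tested with "tst $Ktbl,#1" to leave the loop.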
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
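	@ Each 64-bit rotation is assembled from 32-bit halves: for r<32,
	@ ROTR(x,r).lo = lo>>r ^ hi<<(32-r) and ROTR(x,r).hi = hi>>r ^ lo<<(32-r),
	@ while for r>=32 the halves swap roles with r-32, e.g. rotation by 41
	@ contributes hi>>9 ^ lo<<23 to the low word. The same decomposition is
	@ used for Sigma0, sigma0 and sigma1 below.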
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7 || defined(__APPLE__)
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
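# Odd rounds fall straight through to NEON_00_15 above. Even rounds first
# update a 128-bit pair of message schedule words (sigma0/sigma1 for two X[]
# entries at once) and pre-compute the round's initial Sigma1 shifts in
# d24-d26 so NEON_00_15 can skip them; see the "from NEON_00_15" comments.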
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush
    667