#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# XScale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex-A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex-A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex-A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
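# For example, on a little-endian target h[0].lo is loaded from offset 0
# and h[0].hi from offset 4 (see the LO/HI defines emitted below), while
# a big-endian target uses the opposite pair; the instruction stream
# itself is identical either way.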
# ====================================================================

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
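# (An argument that looks like a file name, i.e. ends in an extension, is
# taken as the output; anything else is taken as the perlasm flavour,
# e.g. linux32 or ios32, which selects the arm-xlate.pl translation.)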

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
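# The working variables a..h live on the stack at the offsets above and
# the expanded message schedule X[] starts at $Xoff; each round pushes
# one more X dword (sub sp,sp,#8 in BODY_00_15) and the resulting
# 80*8-byte growth is popped again at the end of every block.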

sub BODY_00_15() {
my $magic = shift;
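# $magic is the low byte of the last K[i] consumed by the calling loop:
# 0x94 from K[15].lo=0xcf692694 for rounds 0-15 and 0x17 from
# K[79].lo=0x4a475817 for rounds 16-79. When it matches, bit 0 of $Ktbl
# is set (orreq below), which the loops test as their exit condition.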
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
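	@ (a 64-bit ROTR by n<32 gives lo' = lo>>n ^ hi<<(32-n) plus the
	@ symmetric expression for hi'; for n>=32 the halves swap roles and
	@ the shift amount becomes n-32, which is where 9/23 comes from)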
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
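@ d8-d15 are callee-saved under the AAPCS, hence VFP_ABI_PUSH/POP around
@ the NEON code path; the kernel build defines them away.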

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
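@ .LOPENSSL_armcap stores the distance from .Lsha512_block_data_order to
@ OPENSSL_armcap_P, so adding it to the runtime address of that label
@ (r3 below) locates the capability word without a text relocation.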

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
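	@ (in ARM state pc reads as the address of the current instruction
	@ plus 8, so r3 ends up pointing at the sub itself)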
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
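	@ (the pre-ARMv7 path above assembles each big-endian 64-bit word a
	@ byte at a time, so it tolerates unaligned input and needs no rev)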
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640
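	@ (each of the 80 rounds grew the frame by 8 bytes and advanced the
	@ K512 pointer by 8, so both are rewound by 80*8=640 here)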

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
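# Each 64-bit rotation below is synthesized from two instructions:
# vshr.u64 puts x>>n into a temp, then vsli.64 (shift left by 64-n and
# insert) fills the vacated top bits, yielding ROTR(x,n) in the temp.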

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
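# The interleaved vshr.u64 $d0-$d2 lines below start the Sigma1 shifts of
# the following NEON_00_15 round early (that round skips them for even
# i>=16), hiding their latency behind the message-schedule arithmetic.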
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
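# The loops below rotate the register *names* through @V once per round
# (unshift/pop) instead of moving data, so a..h stay in d16-d23 across
# all 80 rounds.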
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
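# Note the ordering of the last two substitutions: pre-existing "bx lr"
# is first encoded as a raw .word so that -march=armv4 tools accept it,
# and only then is "ret" rewritten as "bx lr".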
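# Reproduce this script's leading comment block (licence and notes) at
# the top of the output, converting '#' comments to the assembler's '@'.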
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush