#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited to 1:-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# find yourself striving, as I did here, to achieve IPC comparable
# to that delivered by Cortex A8 [for reference, it's 0.5 for an ILP
# of 1, and 1 for higher ILPs].
#
# (*) ILP, instruction-level parallelism, is how many instructions
#     *can* execute at the same time; IPC, instructions per cycle,
#     is how many instructions actually do execute.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below
# being 0 and 4. Now the caller is expected to maintain native byte
# order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
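# A minimal illustration (commented out; assumes a perl with 64-bit support,
# not used by the generator): under native little-endian storage the low 32
# bits of a 64-bit value sit at offset 0 and the high 32 bits at offset 4,
# which is what the LO/HI macros in the generated code select for __ARMEL__.
#
#	my ($lo32,$hi32) = unpack("V2", pack("Q<", 0x0123456789abcdef));
#	printf "lo=%08x hi=%08x\n", $lo32, $hi32;  # lo=89abcdef hi=01234567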

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
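# The first command-line argument that looks like a file name is taken as the
# output, e.g. (hypothetical) "perl sha512-armv4.pl sha512-armv4.S"; anything
# before it is skipped.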

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
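# Stack frame layout (relative to sp): copies of the eight 64-bit working
# variables a..h live at $Aoff..$Hoff, and the most recently stored message
# word at $Xoff.  Each round lowers sp by 8, so older words end up at
# $Xoff+8, $Xoff+16, ... which is how .L16_79 reaches back 2, 7, 15 and 16
# words when expanding the schedule.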

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
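
# Cross-check sketch (an addition for illustration, not part of the original
# module and never called by the generator): Sigma1 computed on 32-bit halves,
# mirroring the LO/HI decomposition commented at the top of BODY_00_15.
sub ref_Sigma1 {
	my ($hi,$lo) = @_;			# 32-bit halves of e, high half first
	my $m = 0xffffffff;
	my $rlo = ($lo>>14 ^ ($hi<<18 & $m)) ^	# ROTR(e,14).lo
		  ($lo>>18 ^ ($hi<<14 & $m)) ^	# ROTR(e,18).lo
		  ($hi>>9  ^ ($lo<<23 & $m));	# ROTR(e,41).lo
	my $rhi = ($hi>>14 ^ ($lo<<18 & $m)) ^
		  ($hi>>18 ^ ($lo<<14 & $m)) ^
		  ($lo>>9  ^ ($hi<<23 & $m));	# ROTR(e,41).hi
	return ($rhi,$rlo);
}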
$code=<<___;
#if defined(__arm__)
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4

.global	sha512_block_data_order
.hidden	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
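# The "magic" argument below is the low byte of the last K[i].lo handled by
# each loop: 0x94 matches K[15] (0xcf692694) and 0x17 matches K[79]
# (0x4a475817).  BODY_00_15 compares it with K[i]&0xff and sets bit 0 of
# $Ktbl to flag the final iteration.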
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
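
# Companion sketch to ref_Sigma1 above (illustration only, never called):
# sigma0 of the message schedule on 32-bit halves, matching the comments
# at .L16_79.
sub ref_sigma0 {
	my ($hi,$lo) = @_;			# 32-bit halves of the word fed to sigma0
	my $m = 0xffffffff;
	my $rlo = ($lo>>1 ^ ($hi<<31 & $m)) ^	# ROTR(x,1).lo
		  ($lo>>8 ^ ($hi<<24 & $m)) ^	# ROTR(x,8).lo
		  ($lo>>7 ^ ($hi<<25 & $m));	# (x>>7).lo
	my $rhi = ($hi>>1 ^ ($lo<<31 & $m)) ^
		  ($hi>>8 ^ ($lo<<24 & $m)) ^
		  ($hi>>7);			# (x>>7).hi
	return ($rhi,$rlo);
}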

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
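# NEON register allocation: d0-d15 hold the 16 schedule words X[], d16-d23 the
# working variables a..h, and d24-d31 are scratch inside NEON_00_15.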

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
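# Note that the final "h += Maj(a,b,c)" is left commented out above; it is
# instead carried out as the "h+=Maj from the past" addition at the start of
# the next round (and once more before the epilogue).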

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
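# NEON_16_79 does the actual schedule update only on even rounds: the message
# words are viewed as 128-bit q registers, so one pass produces two new words,
# and odd rounds are delegated straight to NEON_00_15 at the top of the sub.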

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
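# Perl-level register rotation: after each generated round @V is rotated with
# unshift(@V,pop(@V)), so the same NEON_00_15 body serves every round while no
# data ever moves between NEON registers.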
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	bx	lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4

#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush