#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower*
# address, which was reflected in the two parameters below as 0 and 4.
# Now the caller is expected to maintain native byte order for whole
# 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
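# For instance, with h[0]=0x6a09e667f3bcc908 (the SHA-512 IV) kept in
# native byte order on a little-endian CPU, the 32-bit word at offset 0
# is 0xf3bcc908 and the one at offset 4 is 0x6a09e667; LO/HI are 0/4
# there and 4/0 on big-endian, so [reg,#LO] and [reg,#HI] always select
# the low and high halves of the value.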

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
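# Byte offsets of the a-h working-variable copies and of the expanded
# message schedule X[] within the scratch frame the integer code keeps
# at sp; BODY_00_15 slides sp down by 8 bytes per round, so after 80
# rounds the 640 bytes of X[] are dropped with a single "add sp,sp,#640".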

sub BODY_00_15() {
my $magic = shift;
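# $magic is the low byte of the last K[i] in the caller's range (0x94 for
# K[15], 0x17 for K[79]); it is compared against K[i].lo below and, on a
# match, bit 0 of $Ktbl is set so the .L00_15/.L16_79 loops terminate.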
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
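
# The lo/hi formulas in the Sigma/sigma comments above are 64-bit rotations
# spelled out on 32-bit halves; the eor in the code is equivalent to an orr
# because the two shifted fields never overlap. A minimal reference sketch
# of that decomposition (illustration only, never called by the generator;
# assumes 0 < n%32 < 32, which holds for every rotation used here):
sub ROTR64_hi_lo {
my ($hi,$lo,$n)=@_;				# x = hi:lo, rotate right by n
($hi,$lo,$n)=($lo,$hi,$n-32) if ($n>=32);	# n>=32: swap halves, rotate by n-32
my $rlo=(($lo>>$n)|(($hi<<(32-$n))&0xffffffff))&0xffffffff;
my $rhi=(($hi>>$n)|(($lo<<(32-$n))&0xffffffff))&0xffffffff;
return ($rhi,$rlo);				# e.g. n=14: rlo = lo>>14^hi<<18
}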
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	vadd.i64	$T1,$K,$h
	veor		$Ch,$f,$g
	veor		$t0,$t1
	vand		$Ch,$e
	veor		$t0,$t2			@ Sigma1(e)
	veor		$Ch,$g			@ Ch(e,f,g)
	vadd.i64	$T1,$t0
	vshr.u64	$t0,$a,#@Sigma0[0]
	vadd.i64	$T1,$Ch
	vshr.u64	$t1,$a,#@Sigma0[1]
	vshr.u64	$t2,$a,#@Sigma0[2]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	vadd.i64	$T1,@X[$i%16]
	vorr		$Maj,$a,$c
	vand		$Ch,$a,$c
	veor		$h,$t0,$t1
	vand		$Maj,$b
	veor		$h,$t2			@ Sigma0(a)
	vorr		$Maj,$Ch		@ Maj(a,b,c)
	vadd.i64	$h,$T1
	vadd.i64	$d,$T1
	vadd.i64	$h,$Maj
___
}
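
# In NEON_00_15 each 64-bit rotation is built from the vshr.u64/vsli.64
# pair above: vshr.u64 t,x,#n leaves x>>n in t, and vsli.64 t,x,#(64-n)
# shifts x left by 64-n and inserts it above those surviving low bits,
# so t ends up holding ROTR(x,n) without a 64-bit rotate instruction.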

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
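
# NEON_16_79 handles the message schedule recurrence
#	X[i] += sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15])
# over the 16-entry ring, where the negative offsets become +14, +9 and +1
# (mod 16). Viewing the ring as eight q registers updates two entries per
# call, hence one schedule step every 2nd round; the d24-d26 shifts marked
# "from NEON_00_15" are Sigma1(e) work hoisted out of the round body that
# follows, so it can be interleaved with the schedule update.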

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	bx	lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4
___

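# evaluate the backticked compile-time expressions (stack offsets etc.) left in $code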
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush
    583