#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
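# SHA-256 rotation/shift amounts: Sigma0(x)=ROTR2^ROTR13^ROTR22,
# Sigma1(x)=ROTR6^ROTR11^ROTR25, sigma0(x)=ROTR7^ROTR18^SHR3,
# sigma1(x)=ROTR17^ROTR19^SHR10. The integer code below folds the first
# rotation into the final accumulation, e.g. Sigma1(e) is produced as
# ror(e ^ ror(e,11-6) ^ ror(e,25-6), 6), which is why the eor operands
# use differences of these constants and the add uses ror#$Sigma1[0].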

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
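# Note: Maj(a,b,c) is evaluated as (((b^c)&(a^b))^b), and its addition to h
# is deferred to the "h+=Maj(a,b,c) from the past" instruction at the top of
# the following round; swapping $t2 and $t3 here carries the value over.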

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
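# Message schedule expansion: the block above computes
# X[i+16] = X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]) in the 16-word
# circular buffer on the stack, leaves the result in $t1 and falls through
# to the common round body.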

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
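# The first 16 rounds are fully unrolled (BODY_00_15); rounds 16..63 reuse a
# 16-round loop body (.Lrounds_16_xx) generated from BODY_16_XX and executed
# three times. The loop exits when the K256 word fetched in its last round
# has low byte 0xf2, i.e. 0xc67178f2, the final constant.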
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
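# Dlo/Dhi name the low and high 64-bit halves of a quad register
# (NEON aliasing: qN maps onto d(2N) and d(2N+1), e.g. q1 -> d2/d3).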

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
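# The AUTOLOAD thunk turns any undefined sub call into an assembly line: the
# sub name becomes the mnemonic (underscores turned into dots) and the last
# argument gets a '#' prefix when it is numeric. For example, with $T0="q8",
# &vext_8($T0,@X[0],@X[1],4) appends "vext.8 q8,q0,q1,#4" to $code.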

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
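# Xupdate advances the NEON message schedule by four words (one q register)
# per call: 32-bit rotates are synthesized with vshr/vsli pairs, and the
# scalar instructions for four rounds (produced by the four &$body calls)
# are interleaved between the vector ones via eval(shift(@insns)) to keep
# both pipelines busy. The matching K256 quad is loaded and added up front,
# and the X+K quad is parked at $Xfer for the integer round code to fetch
# from the stack.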

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
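# body_00_15 returns the round as a list of strings so the caller can eval
# them one scalar instruction at a time between NEON instructions. $j tracks
# the generated round number and selects what to prefetch into $t1: the next
# X[]+K[] word from the stack, the K256 terminator-check word at the end of
# .L_00_48, or the ctx pointer for the final accumulation. The same deferred
# Maj() trick as in the integer path is applied by swapping $t2/$t3 at the
# end of each round.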

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
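# NEON stack frame after the prologue above: sp+0..sp+63 hold the current
# X[i]+K[i] quads ($Xfer cycles through this area), sp+64 the ctx pointer,
# sp+68 the input pointer, sp+72 the end-of-input pointer and sp+76 the
# caller's original (unaligned) sp.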
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_ARCH__>=7
.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,r3,#sha256_block_data_order-K256

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
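# Each iteration above retires four rounds: sha256h/sha256h2 consume one quad
# of W+K while sha256su0/sha256su1 extend the message schedule. Twelve such
# iterations plus the four schedule-free groups below cover all 64 rounds
# (16 quads of K256).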
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm   OPENSSL_armcap_P,4,4
___

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte order below. The correct solution is to use
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
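# unsha256() above hand-assembles the SHA-2 mnemonics for assemblers that do
# not know them: it packs the q-register operands into the opcode word and
# emits it as four .byte values, keeping the original mnemonic in a trailing
# comment. The output filter below evaluates backticked Perl expressions,
# routes sha256* instructions through unsha256(), rewrites "ret" to "bx lr"
# and literal "bx lr" to ".word 0xe12fff1e" so the result still assembles
# with -march=armv4.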

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush