#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
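
# The four arrays above hold the rotate/shift amounts of the SHA-256
# Sigma/sigma functions (FIPS 180-4). The helpers below are a purely
# illustrative reference sketch -- they are never called by the code
# generator -- spelling out the 32-bit operations the assembly implements.
sub _ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }	# reference only
sub _Sigma0 { my $x=shift; _ror32($x,$Sigma0[0])^_ror32($x,$Sigma0[1])^_ror32($x,$Sigma0[2]); }
sub _Sigma1 { my $x=shift; _ror32($x,$Sigma1[0])^_ror32($x,$Sigma1[1])^_ror32($x,$Sigma1[2]); }
sub _sigma0 { my $x=shift; _ror32($x,$sigma0[0])^_ror32($x,$sigma0[1])^($x>>$sigma0[2]); }
sub _sigma1 { my $x=shift; _ror32($x,$sigma1[0])^_ror32($x,$sigma1[1])^($x>>$sigma1[2]); }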

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
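
# For reference only: one round of the SHA-256 compression function that
# the assembly emitted by BODY_00_15 computes, written as plain Perl with
# the illustrative helpers defined after the rotation tables above. The
# generator never calls this; it exists solely to document the dataflow.
sub _sha256_round { my ($w,$k,@s)=@_;		# X[i], K256[i], state (a..h)
	my ($a,$b,$c,$d,$e,$f,$g,$h)=@s;
	my $T1=($h+_Sigma1($e)+(($e&$f)^(~$e&$g))+$k+$w)&0xffffffff;	# h+Sigma1(e)+Ch(e,f,g)+K[i]+X[i]
	my $T2=(_Sigma0($a)+(($a&$b)^($a&$c)^($b&$c)))&0xffffffff;	# Sigma0(a)+Maj(a,b,c)
	(($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);	# new (a..h)
}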

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
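
# For reference only: the message-schedule expansion that BODY_16_XX
# performs before falling through to the common round body, written as
# plain Perl over a 16-word circular buffer (uses the illustrative
# helpers above; never called by the generator).
sub _sha256_expand { my ($i,$X)=@_;		# $X: ref to 16-element word array
	$X->[$i&15]=($X->[$i&15]+_sigma0($X->[($i+1)&15])
	                        +_sigma1($X->[($i+14)&15])
	                        +$X->[($i+9)&15])&0xffffffff;
}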

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# if defined(__thumb2__) && !defined(__APPLE__)
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef	__APPLE__
	sub	$Ktbl,$Ktbl,#256+32
# elif	defined(__thumb2__)
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte-by-byte emission; the correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush