Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # SHA256/512 for ARMv8.
     11 #
     12 # Performance in cycles per processed byte and improvement coefficient
     13 # over code generated with "default" compiler:
     14 #
     15 #		SHA256-hw	SHA256(*)	SHA512
     16 # Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
     17 # Cortex-A5x	n/a		n/a		n/a
     18 # 
     19 # (*)	Software SHA256 results are of lesser relevance, presented
     20 #	mostly for informational purposes.
     21 # (**)	The result is a trade-off: it's possible to improve it by
     22 #	10%, but at the cost of 20% loss on Cortex-A5x.
     23 
     24 $flavour=shift;
     25 $output=shift;
     26 open STDOUT,">$output";
     27 
     28 if ($output =~ /512/) {
     29 	$BITS=512;
     30 	$SZ=8;
     31 	@Sigma0=(28,34,39);
     32 	@Sigma1=(14,18,41);
     33 	@sigma0=(1,  8, 7);
     34 	@sigma1=(19,61, 6);
     35 	$rounds=80;
     36 	$reg_t="x";
     37 } else {
     38 	$BITS=256;
     39 	$SZ=4;
     40 	@Sigma0=( 2,13,22);
     41 	@Sigma1=( 6,11,25);
     42 	@sigma0=( 7,18, 3);
     43 	@sigma1=(17,19,10);
     44 	$rounds=64;
     45 	$reg_t="w";
     46 }
     47 
     48 $func="sha${BITS}_block_data_order";
     49 
     50 ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
     51 
     52 @X=map("$reg_t$_",(3..15,0..2));
     53 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
     54 ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
     55 
     56 sub BODY_00_xx {
     57 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
     58 my $j=($i+1)&15;
     59 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
     60    $T0=@X[$i+3] if ($i<11);
     61 
     62 $code.=<<___	if ($i<16);
     63 #ifndef	__ARMEB__
     64 	rev	@X[$i],@X[$i]			// $i
     65 #endif
     66 ___
     67 $code.=<<___	if ($i<13 && ($i&1));
     68 	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
     69 ___
     70 $code.=<<___	if ($i==13);
     71 	ldp	@X[14],@X[15],[$inp]
     72 ___
     73 $code.=<<___	if ($i>=14);
     74 	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
     75 ___
     76 $code.=<<___	if ($i>0 && $i<16);
     77 	add	$a,$a,$t1			// h+=Sigma0(a)
     78 ___
     79 $code.=<<___	if ($i>=11);
     80 	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
     81 ___
     82 # While ARMv8 specifies merged rotate-n-logical operation such as
     83 # 'eor x,y,z,ror#n', it was found to negatively affect performance
     84 # on Apple A7. The reason seems to be that it requires even 'y' to
     85 # be available earlier. This means that such merged instruction is
     86 # not necessarily best choice on critical path... On the other hand
     87 # Cortex-A5x handles merged instructions much better than disjoint
     88 # rotate and logical... See (**) footnote above.
     89 $code.=<<___	if ($i<15);
     90 	ror	$t0,$e,#$Sigma1[0]
     91 	add	$h,$h,$t2			// h+=K[i]
     92 	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
     93 	and	$t1,$f,$e
     94 	bic	$t2,$g,$e
     95 	add	$h,$h,@X[$i&15]			// h+=X[i]
     96 	orr	$t1,$t1,$t2			// Ch(e,f,g)
     97 	eor	$t2,$a,$b			// a^b, b^c in next round
     98 	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
     99 	ror	$T0,$a,#$Sigma0[0]
    100 	add	$h,$h,$t1			// h+=Ch(e,f,g)
    101 	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
    102 	add	$h,$h,$t0			// h+=Sigma1(e)
    103 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
    104 	add	$d,$d,$h			// d+=h
    105 	eor	$t3,$t3,$b			// Maj(a,b,c)
    106 	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
    107 	add	$h,$h,$t3			// h+=Maj(a,b,c)
    108 	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
    109 	//add	$h,$h,$t1			// h+=Sigma0(a)
    110 ___
    111 $code.=<<___	if ($i>=15);
    112 	ror	$t0,$e,#$Sigma1[0]
    113 	add	$h,$h,$t2			// h+=K[i]
    114 	ror	$T1,@X[($j+1)&15],#$sigma0[0]
    115 	and	$t1,$f,$e
    116 	ror	$T2,@X[($j+14)&15],#$sigma1[0]
    117 	bic	$t2,$g,$e
    118 	ror	$T0,$a,#$Sigma0[0]
    119 	add	$h,$h,@X[$i&15]			// h+=X[i]
    120 	eor	$t0,$t0,$e,ror#$Sigma1[1]
    121 	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
    122 	orr	$t1,$t1,$t2			// Ch(e,f,g)
    123 	eor	$t2,$a,$b			// a^b, b^c in next round
    124 	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
    125 	eor	$T0,$T0,$a,ror#$Sigma0[1]
    126 	add	$h,$h,$t1			// h+=Ch(e,f,g)
    127 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
    128 	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
    129 	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
    130 	add	$h,$h,$t0			// h+=Sigma1(e)
    131 	eor	$t3,$t3,$b			// Maj(a,b,c)
    132 	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
    133 	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
    134 	add	@X[$j],@X[$j],@X[($j+9)&15]
    135 	add	$d,$d,$h			// d+=h
    136 	add	$h,$h,$t3			// h+=Maj(a,b,c)
    137 	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
    138 	add	@X[$j],@X[$j],$T1
    139 	add	$h,$h,$t1			// h+=Sigma0(a)
    140 	add	@X[$j],@X[$j],$T2
    141 ___
    142 	($t2,$t3)=($t3,$t2);
    143 }
    144 
    145 $code.=<<___;
    146 #include "arm_arch.h"
    147 
    148 .text
    149 
    150 .globl	$func
    151 .type	$func,%function
    152 .align	6
    153 $func:
    154 ___
    155 $code.=<<___	if ($SZ==4);
    156 	ldr	x16,.LOPENSSL_armcap_P
    157 	adr	x17,.LOPENSSL_armcap_P
    158 	add	x16,x16,x17
    159 	ldr	w16,[x16]
    160 	tst	w16,#ARMV8_SHA256
    161 	b.ne	.Lv8_entry
    162 ___
    163 $code.=<<___;
    164 	stp	x29,x30,[sp,#-128]!
    165 	add	x29,sp,#0
    166 
    167 	stp	x19,x20,[sp,#16]
    168 	stp	x21,x22,[sp,#32]
    169 	stp	x23,x24,[sp,#48]
    170 	stp	x25,x26,[sp,#64]
    171 	stp	x27,x28,[sp,#80]
    172 	sub	sp,sp,#4*$SZ
    173 
    174 	ldp	$A,$B,[$ctx]				// load context
    175 	ldp	$C,$D,[$ctx,#2*$SZ]
    176 	ldp	$E,$F,[$ctx,#4*$SZ]
    177 	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
    178 	ldp	$G,$H,[$ctx,#6*$SZ]
    179 	adr	$Ktbl,K$BITS
    180 	stp	$ctx,$num,[x29,#96]
    181 
    182 .Loop:
    183 	ldp	@X[0],@X[1],[$inp],#2*$SZ
    184 	ldr	$t2,[$Ktbl],#$SZ			// *K++
    185 	eor	$t3,$B,$C				// magic seed
    186 	str	$inp,[x29,#112]
    187 ___
    188 for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
    189 $code.=".Loop_16_xx:\n";
    190 for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
    191 $code.=<<___;
    192 	cbnz	$t2,.Loop_16_xx
    193 
    194 	ldp	$ctx,$num,[x29,#96]
    195 	ldr	$inp,[x29,#112]
    196 	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
    197 
    198 	ldp	@X[0],@X[1],[$ctx]
    199 	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
    200 	add	$inp,$inp,#14*$SZ			// advance input pointer
    201 	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
    202 	add	$A,$A,@X[0]
    203 	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
    204 	add	$B,$B,@X[1]
    205 	add	$C,$C,@X[2]
    206 	add	$D,$D,@X[3]
    207 	stp	$A,$B,[$ctx]
    208 	add	$E,$E,@X[4]
    209 	add	$F,$F,@X[5]
    210 	stp	$C,$D,[$ctx,#2*$SZ]
    211 	add	$G,$G,@X[6]
    212 	add	$H,$H,@X[7]
    213 	cmp	$inp,$num
    214 	stp	$E,$F,[$ctx,#4*$SZ]
    215 	stp	$G,$H,[$ctx,#6*$SZ]
    216 	b.ne	.Loop
    217 
    218 	ldp	x19,x20,[x29,#16]
    219 	add	sp,sp,#4*$SZ
    220 	ldp	x21,x22,[x29,#32]
    221 	ldp	x23,x24,[x29,#48]
    222 	ldp	x25,x26,[x29,#64]
    223 	ldp	x27,x28,[x29,#80]
    224 	ldp	x29,x30,[sp],#128
    225 	ret
    226 .size	$func,.-$func
    227 
    228 .align	6
    229 .type	K$BITS,%object
    230 K$BITS:
    231 ___
    232 $code.=<<___ if ($SZ==8);
    233 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    234 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    235 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    236 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    237 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    238 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    239 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    240 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    241 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    242 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    243 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    244 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    245 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    246 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    247 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    248 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    249 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    250 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    251 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    252 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    253 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    254 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    255 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    256 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    257 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    258 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    259 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    260 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    261 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    262 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    263 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    264 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    265 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    266 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    267 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    268 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    269 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    270 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    271 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    272 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    273 	.quad	0	// terminator
    274 ___
    275 $code.=<<___ if ($SZ==4);
    276 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    277 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    278 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    279 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    280 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    281 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    282 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    283 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    284 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    285 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    286 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    287 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    288 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    289 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    290 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    291 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    292 	.long	0	//terminator
    293 ___
    294 $code.=<<___;
    295 .size	K$BITS,.-K$BITS
    296 .align	3
    297 .LOPENSSL_armcap_P:
    298 	.quad	OPENSSL_armcap_P-.
    299 .asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
    300 .align	2
    301 ___
    302 
    303 if ($SZ==4) {
    304 my $Ktbl="x3";
    305 
    306 my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
    307 my @MSG=map("v$_.16b",(4..7));
    308 my ($W0,$W1)=("v16.4s","v17.4s");
    309 my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
    310 
    311 $code.=<<___;
    312 .type	sha256_block_armv8,%function
    313 .align	6
    314 sha256_block_armv8:
    315 .Lv8_entry:
    316 	stp		x29,x30,[sp,#-16]!
    317 	add		x29,sp,#0
    318 
    319 	ld1.32		{$ABCD,$EFGH},[$ctx]
    320 	adr		$Ktbl,K256
    321 
    322 .Loop_hw:
    323 	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
    324 	sub		$num,$num,#1
    325 	ld1.32		{$W0},[$Ktbl],#16
    326 	rev32		@MSG[0],@MSG[0]
    327 	rev32		@MSG[1],@MSG[1]
    328 	rev32		@MSG[2],@MSG[2]
    329 	rev32		@MSG[3],@MSG[3]
    330 	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
    331 	orr		$EFGH_SAVE,$EFGH,$EFGH
    332 ___
    333 for($i=0;$i<12;$i++) {
    334 $code.=<<___;
    335 	ld1.32		{$W1},[$Ktbl],#16
    336 	add.i32		$W0,$W0,@MSG[0]
    337 	sha256su0	@MSG[0],@MSG[1]
    338 	orr		$abcd,$ABCD,$ABCD
    339 	sha256h		$ABCD,$EFGH,$W0
    340 	sha256h2	$EFGH,$abcd,$W0
    341 	sha256su1	@MSG[0],@MSG[2],@MSG[3]
    342 ___
    343 	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
    344 }
    345 $code.=<<___;
    346 	ld1.32		{$W1},[$Ktbl],#16
    347 	add.i32		$W0,$W0,@MSG[0]
    348 	orr		$abcd,$ABCD,$ABCD
    349 	sha256h		$ABCD,$EFGH,$W0
    350 	sha256h2	$EFGH,$abcd,$W0
    351 
    352 	ld1.32		{$W0},[$Ktbl],#16
    353 	add.i32		$W1,$W1,@MSG[1]
    354 	orr		$abcd,$ABCD,$ABCD
    355 	sha256h		$ABCD,$EFGH,$W1
    356 	sha256h2	$EFGH,$abcd,$W1
    357 
    358 	ld1.32		{$W1},[$Ktbl]
    359 	add.i32		$W0,$W0,@MSG[2]
    360 	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
    361 	orr		$abcd,$ABCD,$ABCD
    362 	sha256h		$ABCD,$EFGH,$W0
    363 	sha256h2	$EFGH,$abcd,$W0
    364 
    365 	add.i32		$W1,$W1,@MSG[3]
    366 	orr		$abcd,$ABCD,$ABCD
    367 	sha256h		$ABCD,$EFGH,$W1
    368 	sha256h2	$EFGH,$abcd,$W1
    369 
    370 	add.i32		$ABCD,$ABCD,$ABCD_SAVE
    371 	add.i32		$EFGH,$EFGH,$EFGH_SAVE
    372 
    373 	cbnz		$num,.Loop_hw
    374 
    375 	st1.32		{$ABCD,$EFGH},[$ctx]
    376 
    377 	ldr		x29,[sp],#16
    378 	ret
    379 .size	sha256_block_armv8,.-sha256_block_armv8
    380 ___
    381 }
    382 
    383 $code.=<<___;
    384 .comm	OPENSSL_armcap_P,4,4
    385 ___
    386 
    387 {   my  %opcode = (
    388 	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
    389 	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
    390 
    391     sub unsha256 {
    392 	my ($mnemonic,$arg)=@_;
    393 
    394 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    395 	&&
    396 	sprintf ".inst\t0x%08x\t//%s %s",
    397 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
    398 			$mnemonic,$arg;
    399     }
    400 }
    401 
    402 foreach(split("\n",$code)) {
    403 
    404 	s/\`([^\`]*)\`/eval($1)/geo;
    405 
    406 	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
    407 
    408 	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
    409 	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
    410 
    411 	print $_,"\n";
    412 }
    413 
    414 close STDOUT;
    415