Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # SHA1 for ARMv8.
     11 #
     12 # Performance in cycles per processed byte and improvement coefficient
     13 # over code generated with "default" compiler:
     14 #
     15 #		hardware-assisted	software(*)
     16 # Apple A7	2.31			4.13 (+14%)
     17 # Cortex-A53	2.24			8.03 (+97%)
     18 # Cortex-A57	2.35			7.88 (+74%)
     19 # Denver	2.13			3.97 (+0%)(**)
     20 # X-Gene				8.80 (+200%)
     21 #
     22 # (*)	Software results are presented mostly for reference purposes.
     23 # (**)	Keep in mind that Denver relies on binary translation, which
     24 #	optimizes compiler output at run-time.
     25 
     26 $flavour = shift;	# first argument; handed to arm-xlate.pl unchanged
     27 $output  = shift;	# second argument; handed to arm-xlate.pl unchanged
     28 
        # Look for arm-xlate.pl beside this script, then in ../../perlasm/
        # relative to it; give up if neither file exists.
     29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     30 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     31 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
     32 die "can't locate arm-xlate.pl";
     33 
        # From here on everything printed to STDOUT is piped through
        # arm-xlate.pl.  Abort if the pipe cannot be set up instead of
        # carrying on and silently producing no output.
     34 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
     35 *STDOUT=*OUT;
     36 
        # Register names interpolated into the assembly templates below:
        #   $ctx,$inp,$num - x0..x2: state pointer, input pointer, block counter
        #   @Xw / @Xx      - 32-bit / 64-bit views of the 16 registers 3..17,19
        #                    used as the message-word window (18 is skipped)
        #   @V = A..E      - w20..w24, the five working variables
        #   $t0..$t2, $K   - w25..w27 scratch, w28 current round constant
     37 ($ctx,$inp,$num)=("x0","x1","x2");
     38 @Xw=map("w$_",(3..17,19));
     39 @Xx=map("x$_",(3..17,19));
     40 @V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
     41 ($t0,$t1,$t2,$K)=map("w$_",(25..28));
     42 
     43 
     44 sub BODY_00_19 {
        # Appends the assembly for one of rounds 0..19 to $code.
        #   $i      - round number
        #   $a..$e  - register names holding the five working variables in
        #             their roles for this round
        # Round function emitted: F = (d & ~b) | (c & b)   (bic/and/orr).
        # Besides e += rot(a,5) + F, each round already adds K and the next
        # message word into $d; the caller rotates @V after every round, so
        # that register is the next round's $e ("future e" in the templates).
     45 my ($i,$a,$b,$c,$d,$e)=@_;
     46 my $j=($i+2)&15;	# window slot recomputed when $i>=14
     47 
        # Even rounds below 15: each 64-bit load holds two message words;
        # move the upper 32 bits of X[i] into X[i+1].
     48 $code.=<<___ if ($i<15 && !($i&1));
     49 	lsr	@Xx[$i+1],@Xx[$i],#32
     50 ___
        # Even rounds below 14: fetch the next 8 input bytes.  The offset is
        # negative because the caller post-increments $inp by 64 at the top
        # of the loop; the backquoted expression is evaluated by the
        # post-processing loop at the end of this script.
     51 $code.=<<___ if ($i<14 && !($i&1));
     52 	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
     53 ___
        # Odd rounds below 14: fix up the register loaded in the previous
        # round (word swap on big-endian builds, rev32 otherwise).
     54 $code.=<<___ if ($i<14 && ($i&1));
     55 #ifdef	__ARMEB__
     56 	ror	@Xx[$i+1],@Xx[$i+1],#32
     57 #else
     58 	rev32	@Xx[$i+1],@Xx[$i+1]
     59 #endif
     60 ___
        # Rounds 0..13: plain round, message words come straight from input.
     61 $code.=<<___ if ($i<14);
     62 	bic	$t0,$d,$b
     63 	and	$t1,$c,$b
     64 	ror	$t2,$a,#27
     65 	add	$d,$d,$K		// future e+=K
     66 	orr	$t0,$t0,$t1
     67 	add	$e,$e,$t2		// e+=rot(a,5)
     68 	ror	$b,$b,#2
     69 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     70 	add	$e,$e,$t0		// e+=F(b,c,d)
     71 ___
        # Round 19: load the constant used by the rounds that follow
        # (0x6ed9eba1), emitted ahead of this round's own instructions.
     72 $code.=<<___ if ($i==19);
     73 	movz	$K,#0xeba1
     74 	movk	$K,#0x6ed9,lsl#16
     75 ___
        # Rounds 14..19: same round, interleaved with the update of window
        # slot $j: X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16), then
        # ror #31.
     76 $code.=<<___ if ($i>=14);
     77 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
     78 	bic	$t0,$d,$b
     79 	and	$t1,$c,$b
     80 	ror	$t2,$a,#27
     81 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
     82 	add	$d,$d,$K		// future e+=K
     83 	orr	$t0,$t0,$t1
     84 	add	$e,$e,$t2		// e+=rot(a,5)
     85 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
     86 	ror	$b,$b,#2
     87 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     88 	add	$e,$e,$t0		// e+=F(b,c,d)
     89 	 ror	@Xw[$j],@Xw[$j],#31
     90 ___
     91 }
     92 
     93 sub BODY_40_59 {
        # Appends the assembly for one of rounds 40..59 to $code.
        #   $i      - round number
        #   $a..$e  - register names holding the five working variables in
        #             their roles for this round
        # Round function emitted: F = ((b | c) & d) | (b & c).
        # Every round also updates window slot $j:
        # X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16), then ror #31,
        # and adds K and the next message word into $d ("future e").
     94 my ($i,$a,$b,$c,$d,$e)=@_;
     95 my $j=($i+2)&15;	# window slot recomputed in this round
     96 
        # Round 59: load the constant used by the rounds that follow
        # (0xca62c1d6), emitted ahead of this round's own instructions.
     97 $code.=<<___ if ($i==59);
     98 	movz	$K,#0xc1d6
     99 	movk	$K,#0xca62,lsl#16
    100 ___
    101 $code.=<<___;
    102 	orr	$t0,$b,$c
    103 	and	$t1,$b,$c
    104 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
    105 	ror	$t2,$a,#27
    106 	and	$t0,$t0,$d
    107 	add	$d,$d,$K		// future e+=K
    108 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
    109 	add	$e,$e,$t2		// e+=rot(a,5)
    110 	orr	$t0,$t0,$t1
    111 	ror	$b,$b,#2
    112 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
    113 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
    114 	add	$e,$e,$t0		// e+=F(b,c,d)
    115 	 ror	@Xw[$j],@Xw[$j],#31
    116 ___
    117 }
    118 
    119 sub BODY_20_39 {
        # Appends the assembly for one round using F = b ^ c ^ d to $code.
        # The driver loops call it for rounds 20..39 and again for 60..79.
        #   $i      - round number
        #   $a..$e  - register names holding the five working variables in
        #             their roles for this round
     120 my ($i,$a,$b,$c,$d,$e)=@_;
     121 my $j=($i+2)&15;	# window slot recomputed when $i<78
     122 
        # Round 39: load the constant used by the rounds that follow
        # (0x8f1bbcdc), emitted ahead of this round's own instructions.
     123 $code.=<<___ if ($i==39);
     124 	movz	$K,#0xbcdc
     125 	movk	$K,#0x8f1b,lsl#16
     126 ___
        # Regular round: window update X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13]
        # (indices mod 16, then ror #31) interleaved with the round itself.
     127 $code.=<<___ if ($i<78);
     128 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
     129 	eor	$t0,$d,$b
     130 	ror	$t2,$a,#27
     131 	add	$d,$d,$K		// future e+=K
     132 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
     133 	eor	$t0,$t0,$c
     134 	add	$e,$e,$t2		// e+=rot(a,5)
     135 	ror	$b,$b,#2
     136 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
     137 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     138 	add	$e,$e,$t0		// e+=F(b,c,d)
     139 	 ror	@Xw[$j],@Xw[$j],#31
     140 ___
        # Round 78: no window update; instead start reloading the saved
        # state from $ctx (first two words) into X[1] and X[2].
     141 $code.=<<___ if ($i==78);
     142 	ldp	@Xw[1],@Xw[2],[$ctx]
     143 	eor	$t0,$d,$b
     144 	ror	$t2,$a,#27
     145 	add	$d,$d,$K		// future e+=K
     146 	eor	$t0,$t0,$c
     147 	add	$e,$e,$t2		// e+=rot(a,5)
     148 	ror	$b,$b,#2
     149 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     150 	add	$e,$e,$t0		// e+=F(b,c,d)
     151 ___
        # Round 79: last round, so no "future e" additions; load the
        # remaining three state words from $ctx into X[3], X[4] and X[5].
     152 $code.=<<___ if ($i==79);
     153 	ldp	@Xw[3],@Xw[4],[$ctx,#8]
     154 	eor	$t0,$d,$b
     155 	ror	$t2,$a,#27
     156 	eor	$t0,$t0,$c
     157 	add	$e,$e,$t2		// e+=rot(a,5)
     158 	ror	$b,$b,#2
     159 	ldr	@Xw[5],[$ctx,#16]
     160 	add	$e,$e,$t0		// e+=F(b,c,d)
     161 ___
     162 }
    163 
    164 $code.=<<___;	# prologue of sha1_block_data_order and top of the per-block loop
    165 #include "arm_arch.h"
    166 
    167 .text
    168 
    169 .extern	OPENSSL_armcap_P
    170 .globl	sha1_block_data_order
    171 .type	sha1_block_data_order,%function
    172 .align	6
    173 sha1_block_data_order:
    174 	ldr	x16,.LOPENSSL_armcap_P
    175 	adr	x17,.LOPENSSL_armcap_P
    176 	add	x16,x16,x17
    177 	ldr	w16,[x16]
    178 	tst	w16,#ARMV8_SHA1
    179 	b.ne	.Lv8_entry
    180 
    181 	stp	x29,x30,[sp,#-96]!
    182 	add	x29,sp,#0
    183 	stp	x19,x20,[sp,#16]
    184 	stp	x21,x22,[sp,#32]
    185 	stp	x23,x24,[sp,#48]
    186 	stp	x25,x26,[sp,#64]
    187 	stp	x27,x28,[sp,#80]
    188 
    189 	ldp	$A,$B,[$ctx]
    190 	ldp	$C,$D,[$ctx,#8]
    191 	ldr	$E,[$ctx,#16]
    192 
    193 .Loop:
    194 	ldr	@Xx[0],[$inp],#64
    195 	movz	$K,#0x7999
    196 	sub	$num,$num,#1
    197 	movk	$K,#0x5a82,lsl#16
    198 #ifdef	__ARMEB__
    199 	ror	$Xx[0],@Xx[0],#32
    200 #else
    201 	rev32	@Xx[0],@Xx[0]
    202 #endif
    203 	add	$E,$E,$K		// warm it up
    204 	add	$E,$E,@Xw[0]
    205 ___
        # The template above reads OPENSSL_armcap_P through the PC-relative
        # offset stored at .LOPENSSL_armcap_P and branches to .Lv8_entry
        # when the ARMV8_SHA1 bit is set; otherwise it saves x19..x28 and
        # x29/x30, loads the five state words and starts the block loop.
        # The big-endian branch spells its destination $Xx[0] where the
        # other operands use @Xx[0]; both interpolate the first element
        # of @Xx.
        #
        # Emit the 80 rounds.  After each round @V is rotated right by one,
        # so the register that was $d becomes the next round's $e and so on.
        # 80 rotations of a five-element list restore the original order,
        # which is why $A..$E can be used directly in the epilogue below.
    206 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    207 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    208 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    209 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
        # Epilogue: add the previous state (reloaded into X[1]..X[5] by
        # rounds 78 and 79), store it back to $ctx, loop while $num is
        # non-zero, then restore the saved registers and return.
    210 $code.=<<___;
    211 	add	$B,$B,@Xw[2]
    212 	add	$C,$C,@Xw[3]
    213 	add	$A,$A,@Xw[1]
    214 	add	$D,$D,@Xw[4]
    215 	add	$E,$E,@Xw[5]
    216 	stp	$A,$B,[$ctx]
    217 	stp	$C,$D,[$ctx,#8]
    218 	str	$E,[$ctx,#16]
    219 	cbnz	$num,.Loop
    220 
    221 	ldp	x19,x20,[sp,#16]
    222 	ldp	x21,x22,[sp,#32]
    223 	ldp	x23,x24,[sp,#48]
    224 	ldp	x25,x26,[sp,#64]
    225 	ldp	x27,x28,[sp,#80]
    226 	ldr	x29,[sp],#96
    227 	ret
    228 .size	sha1_block_data_order,.-sha1_block_data_order
    229 ___
    230 {{{	# scope for the code path that uses the sha1* crypto instructions
        # Vector register names for this path.  Note that the lexical $E
        # declared here shadows the global $E (w24) used by the scalar code.
        #   $ABCD, $E        - v0, v1: hash state
        #   $E0, $E1         - v2, v3: alternating sha1h results
        #   @MSG             - v4..v7: the four 16-byte message quarters
        #   @Kxx             - v16..v19: round constants loaded from .Lconst
        #   $W0, $W1         - v20, v21: alternating K+message sums
        #   $ABCD_SAVE       - v22: copy of $ABCD taken at the top of each block
    231 my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
    232 my @MSG=map("v$_.16b",(4..7));
    233 my @Kxx=map("v$_.4s",(16..19));
    234 my ($W0,$W1)=("v20.4s","v21.4s");
    235 my $ABCD_SAVE="v22.16b";
    236 
        # Function entry, state/constant loads, and the first sha1c step.
        # NOTE(review): the template interpolates @Kxx[$j] before the loop
        # below assigns $j; with $j undefined the index evaluates to 0,
        # i.e. @Kxx[0] - confirm that nothing else is expected to set $j.
    237 $code.=<<___;
    238 .type	sha1_block_armv8,%function
    239 .align	6
    240 sha1_block_armv8:
    241 .Lv8_entry:
    242 	stp	x29,x30,[sp,#-16]!
    243 	add	x29,sp,#0
    244 
    245 	adr	x4,.Lconst
    246 	eor	$E,$E,$E
    247 	ld1.32	{$ABCD},[$ctx],#16
    248 	ld1.32	{$E}[0],[$ctx]
    249 	sub	$ctx,$ctx,#16
    250 	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
    251 
    252 .Loop_hw:
    253 	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
    254 	sub	$num,$num,#1
    255 	rev32	@MSG[0],@MSG[0]
    256 	rev32	@MSG[1],@MSG[1]
    257 
    258 	add.i32	$W0,@Kxx[0],@MSG[0]
    259 	rev32	@MSG[2],@MSG[2]
    260 	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
    261 
    262 	add.i32	$W1,@Kxx[0],@MSG[1]
    263 	rev32	@MSG[3],@MSG[3]
    264 	sha1h	$E1,$ABCD
    265 	sha1c	$ABCD,$E,$W0		// 0
    266 	add.i32	$W0,@Kxx[$j],@MSG[2]
    267 	sha1su0	@MSG[0],@MSG[1],@MSG[2]
    268 ___
        # Steps 1..16.  The mnemonic suffix comes from the list indexed by
        # $i/5 (truncated): c for 1..4, p for 5..9, m for 10..14, p for
        # 15..16.  After each step the E and W register pairs swap roles,
        # @MSG rotates left by one, and $j advances to the next constant
        # after steps 2, 7 and 12.
    269 for ($j=0,$i=1;$i<20-3;$i++) {
    270 my $f=("c","p","m","p")[$i/5];
    271 $code.=<<___;
    272 	sha1h	$E0,$ABCD		// $i
    273 	sha1$f	$ABCD,$E1,$W1
    274 	add.i32	$W1,@Kxx[$j],@MSG[3]
    275 	sha1su1	@MSG[0],@MSG[3]
    276 ___
        # The last iteration (step 16) omits the sha1su0.
    277 $code.=<<___ if ($i<20-4);
    278 	sha1su0	@MSG[1],@MSG[2],@MSG[3]
    279 ___
    280 	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);
    281 	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);
    282 }
        # Steps 17..19 (no further message-schedule instructions), the
        # final additions into the state, loop control, state store,
        # return, and the constant table.
    283 $code.=<<___;
    284 	sha1h	$E0,$ABCD		// $i
    285 	sha1p	$ABCD,$E1,$W1
    286 	add.i32	$W1,@Kxx[$j],@MSG[3]
    287 
    288 	sha1h	$E1,$ABCD		// 18
    289 	sha1p	$ABCD,$E0,$W0
    290 
    291 	sha1h	$E0,$ABCD		// 19
    292 	sha1p	$ABCD,$E1,$W1
    293 
    294 	add.i32	$E,$E,$E0
    295 	add.i32	$ABCD,$ABCD,$ABCD_SAVE
    296 
    297 	cbnz	$num,.Loop_hw
    298 
    299 	st1.32	{$ABCD},[$ctx],#16
    300 	st1.32	{$E}[0],[$ctx]
    301 
    302 	ldr	x29,[sp],#16
    303 	ret
    304 .size	sha1_block_armv8,.-sha1_block_armv8
    305 .align	6
    306 .Lconst:
    307 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
    308 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
    309 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
    310 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
    311 .LOPENSSL_armcap_P:
    312 .quad	OPENSSL_armcap_P-.
    313 .asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
    314 .align	2
    315 .comm	OPENSSL_armcap_P,4,4
    316 ___
    317 }}}
    318 
    319 {   my	%opcode = (	# base instruction word for each sha1* mnemonic
    320 	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000,
    321 	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000,
    322 	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	);
    323 
        # unsha1($mnemonic,$arg): turn one sha1* instruction into a raw
        # ".inst" directive.
        #   $mnemonic - key into %opcode
        #   $arg      - operand text holding two or three q/v registers
        # Returns ".inst\t0x<word>\t//<mnemonic> <operands>", where the word
        # is the base opcode OR-ed with the first register number, the
        # second shifted left by 5 and the third shifted left by 16.  With
        # only two operands the third capture is undefined and adds
        # nothing.  If the operand text does not match, the false result of
        # the match is returned instead of a directive.
    324     sub unsha1 {
    325 	my ($mnemonic,$arg)=@_;
    326 
    327 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    328 	&&
    329 	sprintf ".inst\t0x%08x\t//%s %s",
    330 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
    331 			$mnemonic,$arg;
    332     }
    333 }
    334 
    335 foreach(split("\n",$code)) {	# post-process and print $code line by line
    336 
        	# Replace each backquoted expression with its evaluated value
        	# (used for the computed load offsets in BODY_00_19).
    337 	s/\`([^\`]*)\`/eval($1)/geo;
    338 
        	# Replace sha1* instructions on q/v registers with the .inst
        	# directive produced by unsha1().
    339 	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
    340 
        	# Drop a ".32"/".i32"-style suffix; on lines that had one, turn
        	# every ".16b" arrangement into ".4s".
    341 	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
        	# ld1/st1 lines that address element [0]: turn ".4s" into ".s".
    342 	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
    343 
    344 	print $_,"\n";
    345 }
    346 
        # STDOUT is the pipe into arm-xlate.pl, so close() also reports a
        # failing translator; die instead of exiting as if all went well.
    347 close STDOUT or die "error closing STDOUT: $! (exit status $?)";
    348