#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#		hardware-assisted	software(*)
# Apple A7	2.31			4.13 (+14%)
# Cortex-A5x	n/a			n/a
#
# (*)	Software results are presented mostly for reference purposes.
     20 
     21 $flavour = shift;
     22 open STDOUT,">".shift;
     23 
     24 ($ctx,$inp,$num)=("x0","x1","x2");
     25 @Xw=map("w$_",(3..17,19));
     26 @Xx=map("x$_",(3..17,19));
     27 @V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
     28 ($t0,$t1,$t2,$K)=map("w$_",(25..28));
     29 
     30 
     31 sub BODY_00_19 {
     32 my ($i,$a,$b,$c,$d,$e)=@_;
     33 my $j=($i+2)&15;
     34 
     35 $code.=<<___ if ($i<15 && !($i&1));
     36 	lsr	@Xx[$i+1],@Xx[$i],#32
     37 ___
     38 $code.=<<___ if ($i<14 && !($i&1));
     39 	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
     40 ___
     41 $code.=<<___ if ($i<14 && ($i&1));
     42 #ifdef	__ARMEB__
     43 	ror	@Xx[$i+1],@Xx[$i+1],#32
     44 #else
     45 	rev32	@Xx[$i+1],@Xx[$i+1]
     46 #endif
     47 ___
     48 $code.=<<___ if ($i<14);
     49 	bic	$t0,$d,$b
     50 	and	$t1,$c,$b
     51 	ror	$t2,$a,#27
     52 	add	$d,$d,$K		// future e+=K
     53 	orr	$t0,$t0,$t1
     54 	add	$e,$e,$t2		// e+=rot(a,5)
     55 	ror	$b,$b,#2
     56 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     57 	add	$e,$e,$t0		// e+=F(b,c,d)
     58 ___
     59 $code.=<<___ if ($i==19);
     60 	movz	$K,#0xeba1
     61 	movk	$K,#0x6ed9,lsl#16
     62 ___
     63 $code.=<<___ if ($i>=14);
     64 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
     65 	bic	$t0,$d,$b
     66 	and	$t1,$c,$b
     67 	ror	$t2,$a,#27
     68 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
     69 	add	$d,$d,$K		// future e+=K
     70 	orr	$t0,$t0,$t1
     71 	add	$e,$e,$t2		// e+=rot(a,5)
     72 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
     73 	ror	$b,$b,#2
     74 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
     75 	add	$e,$e,$t0		// e+=F(b,c,d)
     76 	 ror	@Xw[$j],@Xw[$j],#31
     77 ___
     78 }
     79 
     80 sub BODY_40_59 {
     81 my ($i,$a,$b,$c,$d,$e)=@_;
     82 my $j=($i+2)&15;
     83 
     84 $code.=<<___ if ($i==59);
     85 	movz	$K,#0xc1d6
     86 	movk	$K,#0xca62,lsl#16
     87 ___
     88 $code.=<<___;
     89 	orr	$t0,$b,$c
     90 	and	$t1,$b,$c
     91 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
     92 	ror	$t2,$a,#27
     93 	and	$t0,$t0,$d
     94 	add	$d,$d,$K		// future e+=K
     95 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
     96 	add	$e,$e,$t2		// e+=rot(a,5)
     97 	orr	$t0,$t0,$t1
     98 	ror	$b,$b,#2
     99 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
    100 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
    101 	add	$e,$e,$t0		// e+=F(b,c,d)
    102 	 ror	@Xw[$j],@Xw[$j],#31
    103 ___
    104 }
    105 
    106 sub BODY_20_39 {
    107 my ($i,$a,$b,$c,$d,$e)=@_;
    108 my $j=($i+2)&15;
    109 
    110 $code.=<<___ if ($i==39);
    111 	movz	$K,#0xbcdc
    112 	movk	$K,#0x8f1b,lsl#16
    113 ___
    114 $code.=<<___ if ($i<78);
    115 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
    116 	eor	$t0,$d,$b
    117 	ror	$t2,$a,#27
    118 	add	$d,$d,$K		// future e+=K
    119 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
    120 	eor	$t0,$t0,$c
    121 	add	$e,$e,$t2		// e+=rot(a,5)
    122 	ror	$b,$b,#2
    123 	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
    124 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
    125 	add	$e,$e,$t0		// e+=F(b,c,d)
    126 	 ror	@Xw[$j],@Xw[$j],#31
    127 ___
    128 $code.=<<___ if ($i==78);
    129 	ldp	@Xw[1],@Xw[2],[$ctx]
    130 	eor	$t0,$d,$b
    131 	ror	$t2,$a,#27
    132 	add	$d,$d,$K		// future e+=K
    133 	eor	$t0,$t0,$c
    134 	add	$e,$e,$t2		// e+=rot(a,5)
    135 	ror	$b,$b,#2
    136 	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
    137 	add	$e,$e,$t0		// e+=F(b,c,d)
    138 ___
    139 $code.=<<___ if ($i==79);
    140 	ldp	@Xw[3],@Xw[4],[$ctx,#8]
    141 	eor	$t0,$d,$b
    142 	ror	$t2,$a,#27
    143 	eor	$t0,$t0,$c
    144 	add	$e,$e,$t2		// e+=rot(a,5)
    145 	ror	$b,$b,#2
    146 	ldr	@Xw[5],[$ctx,#16]
    147 	add	$e,$e,$t0		// e+=F(b,c,d)
    148 ___
    149 }
    150 
    151 $code.=<<___;
    152 #include "arm_arch.h"
    153 
    154 .text
    155 
    156 .globl	sha1_block_data_order
    157 .type	sha1_block_data_order,%function
    158 .align	6
    159 sha1_block_data_order:
    160 	ldr	x16,.LOPENSSL_armcap_P
    161 	adr	x17,.LOPENSSL_armcap_P
    162 	add	x16,x16,x17
    163 	ldr	w16,[x16]
    164 	tst	w16,#ARMV8_SHA1
    165 	b.ne	.Lv8_entry
    166 
    167 	stp	x29,x30,[sp,#-96]!
    168 	add	x29,sp,#0
    169 	stp	x19,x20,[sp,#16]
    170 	stp	x21,x22,[sp,#32]
    171 	stp	x23,x24,[sp,#48]
    172 	stp	x25,x26,[sp,#64]
    173 	stp	x27,x28,[sp,#80]
    174 
    175 	ldp	$A,$B,[$ctx]
    176 	ldp	$C,$D,[$ctx,#8]
    177 	ldr	$E,[$ctx,#16]
    178 
    179 .Loop:
    180 	ldr	@Xx[0],[$inp],#64
    181 	movz	$K,#0x7999
    182 	sub	$num,$num,#1
    183 	movk	$K,#0x5a82,lsl#16
    184 #ifdef	__ARMEB__
    185 	ror	$Xx[0],@Xx[0],#32
    186 #else
    187 	rev32	@Xx[0],@Xx[0]
    188 #endif
    189 	add	$E,$E,$K		// warm it up
    190 	add	$E,$E,@Xw[0]
    191 ___
    192 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    193 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    194 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    195 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    196 $code.=<<___;
    197 	add	$B,$B,@Xw[2]
    198 	add	$C,$C,@Xw[3]
    199 	add	$A,$A,@Xw[1]
    200 	add	$D,$D,@Xw[4]
    201 	add	$E,$E,@Xw[5]
    202 	stp	$A,$B,[$ctx]
    203 	stp	$C,$D,[$ctx,#8]
    204 	str	$E,[$ctx,#16]
    205 	cbnz	$num,.Loop
    206 
    207 	ldp	x19,x20,[sp,#16]
    208 	ldp	x21,x22,[sp,#32]
    209 	ldp	x23,x24,[sp,#48]
    210 	ldp	x25,x26,[sp,#64]
    211 	ldp	x27,x28,[sp,#80]
    212 	ldr	x29,[sp],#96
    213 	ret
    214 .size	sha1_block_data_order,.-sha1_block_data_order
    215 ___
    216 {{{
    217 my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
    218 my @MSG=map("v$_.16b",(4..7));
    219 my @Kxx=map("v$_.4s",(16..19));
    220 my ($W0,$W1)=("v20.4s","v21.4s");
    221 my $ABCD_SAVE="v22.16b";
    222 
    223 $code.=<<___;
    224 .type	sha1_block_armv8,%function
    225 .align	6
    226 sha1_block_armv8:
    227 .Lv8_entry:
    228 	stp	x29,x30,[sp,#-16]!
    229 	add	x29,sp,#0
    230 
    231 	adr	x4,.Lconst
    232 	eor	$E,$E,$E
    233 	ld1.32	{$ABCD},[$ctx],#16
    234 	ld1.32	{$E}[0],[$ctx]
    235 	sub	$ctx,$ctx,#16
    236 	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
    237 
    238 .Loop_hw:
    239 	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
    240 	sub	$num,$num,#1
    241 	rev32	@MSG[0],@MSG[0]
    242 	rev32	@MSG[1],@MSG[1]
    243 
    244 	add.i32	$W0,@Kxx[0],@MSG[0]
    245 	rev32	@MSG[2],@MSG[2]
    246 	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
    247 
    248 	add.i32	$W1,@Kxx[0],@MSG[1]
    249 	rev32	@MSG[3],@MSG[3]
    250 	sha1h	$E1,$ABCD
    251 	sha1c	$ABCD,$E,$W0		// 0
    252 	add.i32	$W0,@Kxx[$j],@MSG[2]
    253 	sha1su0	@MSG[0],@MSG[1],@MSG[2]
    254 ___
    255 for ($j=0,$i=1;$i<20-3;$i++) {
    256 my $f=("c","p","m","p")[$i/5];
    257 $code.=<<___;
    258 	sha1h	$E0,$ABCD		// $i
    259 	sha1$f	$ABCD,$E1,$W1
    260 	add.i32	$W1,@Kxx[$j],@MSG[3]
    261 	sha1su1	@MSG[0],@MSG[3]
    262 ___
    263 $code.=<<___ if ($i<20-4);
    264 	sha1su0	@MSG[1],@MSG[2],@MSG[3]
    265 ___
    266 	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);
    267 	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);
    268 }
    269 $code.=<<___;
    270 	sha1h	$E0,$ABCD		// $i
    271 	sha1p	$ABCD,$E1,$W1
    272 	add.i32	$W1,@Kxx[$j],@MSG[3]
    273 
    274 	sha1h	$E1,$ABCD		// 18
    275 	sha1p	$ABCD,$E0,$W0
    276 
    277 	sha1h	$E0,$ABCD		// 19
    278 	sha1p	$ABCD,$E1,$W1
    279 
    280 	add.i32	$E,$E,$E0
    281 	add.i32	$ABCD,$ABCD,$ABCD_SAVE
    282 
    283 	cbnz	$num,.Loop_hw
    284 
    285 	st1.32	{$ABCD},[$ctx],#16
    286 	st1.32	{$E}[0],[$ctx]
    287 
    288 	ldr	x29,[sp],#16
    289 	ret
    290 .size	sha1_block_armv8,.-sha1_block_armv8
    291 .align	6
    292 .Lconst:
    293 .long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
    294 .long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
    295 .long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
    296 .long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
    297 .LOPENSSL_armcap_P:
    298 .quad	OPENSSL_armcap_P-.
    299 .asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
    300 .align	2
    301 .comm	OPENSSL_armcap_P,4,4
    302 ___
    303 }}}
    304 
    305 {   my	%opcode = (
    306 	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000,
    307 	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000,
    308 	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	);
    309 
    310     sub unsha1 {
    311 	my ($mnemonic,$arg)=@_;
    312 
    313 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    314 	&&
    315 	sprintf ".inst\t0x%08x\t//%s %s",
    316 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
    317 			$mnemonic,$arg;
    318     }
    319 }
    320 
    321 foreach(split("\n",$code)) {
    322 
    323 	s/\`([^\`]*)\`/eval($1)/geo;
    324 
    325 	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
    326 
    327 	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
    328 	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
    329 
    330 	print $_,"\n";
    331 }
    332 
    333 close STDOUT;
    334