Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # SHA256 block procedure for ARMv4. May 2007.
     11 
     12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
     13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
     14 # byte [on single-issue Xscale PXA250 core].
     15 
     16 # July 2010.
     17 #
     18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
     19 # Cortex A8 core and ~20 cycles per processed byte.
     20 
     21 # February 2011.
     22 #
     23 # Profiler-assisted and platform-specific optimization resulted in 16%
     24 # improvement on Cortex A8 core and ~17 cycles per processed byte.
     25 
     26 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     27 open STDOUT,">$output";
     28 
     29 $ctx="r0";	$t0="r0";
     30 $inp="r1";	$t3="r1";
     31 $len="r2";	$t1="r2";
     32 $T1="r3";
     33 $A="r4";
     34 $B="r5";
     35 $C="r6";
     36 $D="r7";
     37 $E="r8";
     38 $F="r9";
     39 $G="r10";
     40 $H="r11";
     41 @V=($A,$B,$C,$D,$E,$F,$G,$H);
     42 $t2="r12";
     43 $Ktbl="r14";
     44 
     45 @Sigma0=( 2,13,22);
     46 @Sigma1=( 6,11,25);
     47 @sigma0=( 7,18, 3);
     48 @sigma1=(17,19,10);
     49 
     50 sub BODY_00_15 {
     51 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
     52 
     53 $code.=<<___ if ($i<16);
     54 #if __ARM_ARCH__>=7
     55 	ldr	$T1,[$inp],#4
     56 #else
     57 	ldrb	$T1,[$inp,#3]			@ $i
     58 	ldrb	$t2,[$inp,#2]
     59 	ldrb	$t1,[$inp,#1]
     60 	ldrb	$t0,[$inp],#4
     61 	orr	$T1,$T1,$t2,lsl#8
     62 	orr	$T1,$T1,$t1,lsl#16
     63 	orr	$T1,$T1,$t0,lsl#24
     64 #endif
     65 ___
     66 $code.=<<___;
     67 	mov	$t0,$e,ror#$Sigma1[0]
     68 	ldr	$t2,[$Ktbl],#4			@ *K256++
     69 	eor	$t0,$t0,$e,ror#$Sigma1[1]
     70 	eor	$t1,$f,$g
     71 #if $i>=16
     72 	add	$T1,$T1,$t3			@ from BODY_16_xx
     73 #elif __ARM_ARCH__>=7 && defined(__ARMEL__)
     74 	rev	$T1,$T1
     75 #endif
     76 #if $i==15
     77 	str	$inp,[sp,#17*4]			@ leave room for $t3
     78 #endif
     79 	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
     80 	and	$t1,$t1,$e
     81 	str	$T1,[sp,#`$i%16`*4]
     82 	add	$T1,$T1,$t0
     83 	eor	$t1,$t1,$g			@ Ch(e,f,g)
     84 	add	$T1,$T1,$h
     85 	mov	$h,$a,ror#$Sigma0[0]
     86 	add	$T1,$T1,$t1
     87 	eor	$h,$h,$a,ror#$Sigma0[1]
     88 	add	$T1,$T1,$t2
     89 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
     90 #if $i>=15
     91 	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
     92 #endif
     93 	orr	$t0,$a,$b
     94 	and	$t1,$a,$b
     95 	and	$t0,$t0,$c
     96 	add	$h,$h,$T1
     97 	orr	$t0,$t0,$t1			@ Maj(a,b,c)
     98 	add	$d,$d,$T1
     99 	add	$h,$h,$t0
    100 ___
    101 }
    102 
    103 sub BODY_16_XX {
    104 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    105 
    106 $code.=<<___;
    107 	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
    108 	ldr	$t2,[sp,#`($i+14)%16`*4]
    109 	mov	$t0,$t3,ror#$sigma0[0]
    110 	ldr	$T1,[sp,#`($i+0)%16`*4]
    111 	eor	$t0,$t0,$t3,ror#$sigma0[1]
    112 	ldr	$t1,[sp,#`($i+9)%16`*4]
    113 	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
    114 	mov	$t3,$t2,ror#$sigma1[0]
    115 	add	$T1,$T1,$t0
    116 	eor	$t3,$t3,$t2,ror#$sigma1[1]
    117 	add	$T1,$T1,$t1
    118 	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
    119 	@ add	$T1,$T1,$t3
    120 ___
    121 	&BODY_00_15(@_);
    122 }
    123 
    124 $code=<<___;
    125 #include "arm_arch.h"
    126 
    127 .text
    128 .code	32
    129 
    130 .type	K256,%object
    131 .align	5
    132 K256:
    133 .word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    134 .word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    135 .word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    136 .word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    137 .word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    138 .word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    139 .word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    140 .word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    141 .word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    142 .word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    143 .word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    144 .word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    145 .word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    146 .word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    147 .word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    148 .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    149 .size	K256,.-K256
    150 
    151 .global	sha256_block_data_order
    152 .type	sha256_block_data_order,%function
    153 sha256_block_data_order:
    154 	sub	r3,pc,#8		@ sha256_block_data_order
    155 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
    156 	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
    157 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
    158 	sub	$Ktbl,r3,#256		@ K256
    159 	sub	sp,sp,#16*4		@ alloca(X[16])
    160 .Loop:
    161 ___
    162 for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
    163 $code.=".Lrounds_16_xx:\n";
    164 for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
    165 $code.=<<___;
    166 	and	$t2,$t2,#0xff
    167 	cmp	$t2,#0xf2
    168 	bne	.Lrounds_16_xx
    169 
    170 	ldr	$T1,[sp,#16*4]		@ pull ctx
    171 	ldr	$t0,[$T1,#0]
    172 	ldr	$t1,[$T1,#4]
    173 	ldr	$t2,[$T1,#8]
    174 	add	$A,$A,$t0
    175 	ldr	$t0,[$T1,#12]
    176 	add	$B,$B,$t1
    177 	ldr	$t1,[$T1,#16]
    178 	add	$C,$C,$t2
    179 	ldr	$t2,[$T1,#20]
    180 	add	$D,$D,$t0
    181 	ldr	$t0,[$T1,#24]
    182 	add	$E,$E,$t1
    183 	ldr	$t1,[$T1,#28]
    184 	add	$F,$F,$t2
    185 	ldr	$inp,[sp,#17*4]		@ pull inp
    186 	ldr	$t2,[sp,#18*4]		@ pull inp+len
    187 	add	$G,$G,$t0
    188 	add	$H,$H,$t1
    189 	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
    190 	cmp	$inp,$t2
    191 	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
    192 	bne	.Loop
    193 
    194 	add	sp,sp,#`16+3`*4	@ destroy frame
    195 #if __ARM_ARCH__>=5
    196 	ldmia	sp!,{r4-r11,pc}
    197 #else
    198 	ldmia	sp!,{r4-r11,lr}
    199 	tst	lr,#1
    200 	moveq	pc,lr			@ be binary compatible with V4, yet
    201 	bx	lr			@ interoperable with Thumb ISA:-)
    202 #endif
    203 .size   sha256_block_data_order,.-sha256_block_data_order
    204 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
    205 .align	2
    206 ___
    207 
    208 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    209 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
    210 print $code;
    211 close STDOUT; # enforce flush
    212