Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # January 2007.
     11 
     12 # Montgomery multiplication for ARMv4.
     13 #
     14 # Performance improvement naturally varies among CPU implementations
     15 # and compilers. The code was observed to provide +65-35% improvement
     16 # [depending on key length, less for longer keys] on ARM920T, and
     17 # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
     18 # base and compiler generated code with in-lined umull and even umlal
     19 # instructions. The latter means that this code didn't really have an 
     20 # "advantage" of utilizing some "secret" instruction.
     21 #
     22 # The code is interoperable with Thumb ISA and is rather compact, less
     23 # than 1/2KB. A Windows CE port would be trivial, as it's exclusively
     24 # about decorations; ABI and instruction syntax are identical.
     25 
     # Consume command-line arguments until one looks like a plain output file
     # name: a word character, then word characters or '-', a single dot and
     # an extension.  Arguments ahead of it are discarded.  When the argument
     # list runs out without a match, $output is left false.
     26 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     # Redirect STDOUT only if a file name was actually found, using the
     # three-argument open so the name is never parsed as a mode.  Abort on
     # failure: an unchecked open would let the script carry on writing to the
     # previous STDOUT and exit successfully without creating the file.
     27 $output and (open(STDOUT,">",$output) or die "can't open $output: $!");
     28 
     # Symbolic names for the ARM registers used by the assembly template;
     # each variable is interpolated into the heredoc that follows.
     29 $num = 'r0';	# carries the num argument at first, later re-aimed at &tp[num-1]
     30 $ap  = 'r1';
     31 $bp = $bi = $rp = 'r2';	# one register shared by three names
     32 $np  = 'r3';
     33 $tp  = 'r4';
     34 $aj  = 'r5';
     35 $nj  = 'r6';
     36 $tj  = 'r7';
     37 $n0  = 'r8';
     38 # -- r9 is skipped: reserved by ELF as platform specific, e.g. TLS pointer
     39 $alo = 'r10';	# a.k.a. sl; gcc uses it to keep @GOT
     40 $ahi = 'r11';	# a.k.a. fp
     41 $nlo = 'r12';	# a.k.a. ip
     42 # -- r13 is skipped: stack pointer
     43 $nhi = 'r14';	# a.k.a. lr
     44 # -- r15 is skipped: program counter
     45 
     46 # Argument block slots, written as "base,#offset" operands relative to $num (&tp[num-1])
     47 $_rp  = $num . ',#12*4';
     48 # (no slot for ap: it permanently resides in r1)
     49 $_bp  = $num . ',#13*4';
     50 # (no slot for np: it permanently resides in r3)
     51 $_n0  = $num . ',#14*4';
     52 $_bpend = $_num = $num . ',#15*4';	# the num slot doubles as storage for the end-of-bp pointer
     53 
     # Assembly source for bn_mul_mont, held in a heredoc so that the
     # $-prefixed register names and argument-slot strings defined earlier
     # in this script are interpolated into the text.
     #
     # As read from the code below:
     #  - on entry r0=rp, r1=ap, r2=bp, r3=np; a pointer to n0 and the word
     #    count num are read from the caller's stack (presumably the C side is
     #    bn_mul_mont(rp,ap,bp,np,n0,num) -- confirm against the prototype);
     #  - r0 is set to 0 and nothing else is done when num<2, otherwise the
     #    routine finishes with r0=1;
     #  - r0 and r2 are pushed so they sit right below the stacked arguments,
     #    r4-r12 and lr are saved, and a scratch vector tp of num+1 words is
     #    carved out of the stack;
     #  - .L1st does the first pass with bp[0]; .Louter/.Linner repeat it for
     #    each following bp[i], adding the previous contents of tp;
     #  - .Lsub stores tp[j]-np[j] into rp while propagating the borrow;
     #  - .Lcopy reads from tp or rp depending on the final borrow, writes the
     #    words to rp and overwrites every tp word with the value of sp;
     #  - the exit path tests bit 0 of lr: when clear it returns with
     #    "mov pc,lr", otherwise with "bx lr" (that mnemonic is replaced by a
     #    literal .word after the heredoc).
     # The "\@" in the .asciz line keeps Perl from treating it as an array.
     54 $code=<<___;
     55 .text
     56 
     57 .global	bn_mul_mont
     58 .type	bn_mul_mont,%function
     59 
     60 .align	2
     61 bn_mul_mont:
     62 	stmdb	sp!,{r0,r2}		@ sp points at argument block
     63 	ldr	$num,[sp,#3*4]		@ load num
     64 	cmp	$num,#2
     65 	movlt	r0,#0
     66 	addlt	sp,sp,#2*4
     67 	blt	.Labrt
     68 
     69 	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
     70 
     71 	mov	$num,$num,lsl#2		@ rescale $num for byte count
     72 	sub	sp,sp,$num		@ alloca(4*num)
     73 	sub	sp,sp,#4		@ +extra dword
     74 	sub	$num,$num,#4		@ "num=num-1"
     75 	add	$tp,$bp,$num		@ &bp[num-1]
     76 
     77 	add	$num,sp,$num		@ $num to point at &tp[num-1]
     78 	ldr	$n0,[$_n0]		@ &n0
     79 	ldr	$bi,[$bp]		@ bp[0]
     80 	ldr	$aj,[$ap],#4		@ ap[0],ap++
     81 	ldr	$nj,[$np],#4		@ np[0],np++
     82 	ldr	$n0,[$n0]		@ *n0
     83 	str	$tp,[$_bpend]		@ save &bp[num]
     84 
     85 	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
     86 	str	$n0,[$_n0]		@ save n0 value
     87 	mul	$n0,$alo,$n0		@ "tp[0]"*n0
     88 	mov	$nlo,#0
     89 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
     90 	mov	$tp,sp
     91 
     92 .L1st:
     93 	ldr	$aj,[$ap],#4		@ ap[j],ap++
     94 	mov	$alo,$ahi
     95 	ldr	$nj,[$np],#4		@ np[j],np++
     96 	mov	$ahi,#0
     97 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
     98 	mov	$nhi,#0
     99 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
    100 	adds	$nlo,$nlo,$alo
    101 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
    102 	adc	$nlo,$nhi,#0
    103 	cmp	$tp,$num
    104 	bne	.L1st
    105 
    106 	adds	$nlo,$nlo,$ahi
    107 	ldr	$tp,[$_bp]		@ restore bp
    108 	mov	$nhi,#0
    109 	ldr	$n0,[$_n0]		@ restore n0
    110 	adc	$nhi,$nhi,#0
    111 	str	$nlo,[$num]		@ tp[num-1]=
    112 	str	$nhi,[$num,#4]		@ tp[num]=
    113 
    115 .Louter:
    116 	sub	$tj,$num,sp		@ "original" $num-1 value
    117 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
    118 	ldr	$bi,[$tp,#4]!		@ *(++bp)
    119 	sub	$np,$np,$tj		@ "rewind" np to &np[1]
    120 	ldr	$aj,[$ap,#-4]		@ ap[0]
    121 	ldr	$alo,[sp]		@ tp[0]
    122 	ldr	$nj,[$np,#-4]		@ np[0]
    123 	ldr	$tj,[sp,#4]		@ tp[1]
    124 
    125 	mov	$ahi,#0
    126 	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
    127 	str	$tp,[$_bp]		@ save bp
    128 	mul	$n0,$alo,$n0
    129 	mov	$nlo,#0
    130 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
    131 	mov	$tp,sp
    132 
    133 .Linner:
    134 	ldr	$aj,[$ap],#4		@ ap[j],ap++
    135 	adds	$alo,$ahi,$tj		@ +=tp[j]
    136 	ldr	$nj,[$np],#4		@ np[j],np++
    137 	mov	$ahi,#0
    138 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
    139 	mov	$nhi,#0
    140 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
    141 	adc	$ahi,$ahi,#0
    142 	ldr	$tj,[$tp,#8]		@ tp[j+1]
    143 	adds	$nlo,$nlo,$alo
    144 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
    145 	adc	$nlo,$nhi,#0
    146 	cmp	$tp,$num
    147 	bne	.Linner
    148 
    149 	adds	$nlo,$nlo,$ahi
    150 	mov	$nhi,#0
    151 	ldr	$tp,[$_bp]		@ restore bp
    152 	adc	$nhi,$nhi,#0
    153 	ldr	$n0,[$_n0]		@ restore n0
    154 	adds	$nlo,$nlo,$tj
    155 	ldr	$tj,[$_bpend]		@ restore &bp[num]
    156 	adc	$nhi,$nhi,#0
    157 	str	$nlo,[$num]		@ tp[num-1]=
    158 	str	$nhi,[$num,#4]		@ tp[num]=
    159 
    160 	cmp	$tp,$tj
    161 	bne	.Louter
    162 
    164 	ldr	$rp,[$_rp]		@ pull rp
    165 	add	$num,$num,#4		@ $num to point at &tp[num]
    166 	sub	$aj,$num,sp		@ "original" num value
    167 	mov	$tp,sp			@ "rewind" $tp
    168 	mov	$ap,$tp			@ "borrow" $ap
    169 	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
    170 
    171 	subs	$tj,$tj,$tj		@ "clear" carry flag
    172 .Lsub:	ldr	$tj,[$tp],#4
    173 	ldr	$nj,[$np],#4
    174 	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
    175 	str	$tj,[$rp],#4		@ rp[j]=
    176 	teq	$tp,$num		@ preserve carry
    177 	bne	.Lsub
    178 	sbcs	$nhi,$nhi,#0		@ upmost carry
    179 	mov	$tp,sp			@ "rewind" $tp
    180 	sub	$rp,$rp,$aj		@ "rewind" $rp
    181 
    182 	and	$ap,$tp,$nhi
    183 	bic	$np,$rp,$nhi
    184 	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
    185 
    186 .Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
    187 	str	sp,[$tp],#4		@ zap tp
    188 	str	$tj,[$rp],#4
    189 	cmp	$tp,$num
    190 	bne	.Lcopy
    191 
    192 	add	sp,$num,#4		@ skip over tp[num+1]
    193 	ldmia	sp!,{r4-r12,lr}		@ restore registers
    194 	add	sp,sp,#2*4		@ skip over {r0,r2}
    195 	mov	r0,#1
    196 .Labrt:	tst	lr,#1
    197 	moveq	pc,lr			@ be binary compatible with V4, yet
    198 	bx	lr			@ interoperable with Thumb ISA:-)
    199 .size	bn_mul_mont,.-bn_mul_mont
    200 .asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
    201 .align	2
    202 ___
    203 
    # Final stage: patch the generated text, write it out, and make sure the
    # write really succeeded.
    #
    # Every "bx lr" mnemonic is swapped for a literal .word holding the
    # instruction's encoding (0xe12fff1e), so the assembler never has to
    # recognise the mnemonic itself.
    204 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
    205 print $code;
    # Output is buffered, so a failed write (disk full, I/O error) may only
    # surface when the handle is flushed on close.  Check it, otherwise a
    # truncated assembly file would be left behind with a zero exit status.
    206 close STDOUT or die "error closing STDOUT: $!";
    207