Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # January 2007.
     11 
     12 # Montgomery multiplication for ARMv4.
     13 #
     14 # Performance improvement naturally varies among CPU implementations
     15 # and compilers. The code was observed to provide +65-35% improvement
     16 # [depending on key length, less for longer keys] on ARM920T, and
     17 # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
     18 # base and compiler generated code with in-lined umull and even umlal
     19 # instructions. The latter means that this code didn't really have an 
     20 # "advantage" of utilizing some "secret" instruction.
     21 #
     22 # The code is interoperable with Thumb ISA and is rather compact, less
     23 # than 1/2KB. A Windows CE port would be trivial, as it is exclusively
     24 # about decorations; ABI and instruction syntax are identical.
     25 
     26 $num="r0";	# rp on entry (spilled at once), then num scaled to bytes, finally &tp[num-1]
     27 $ap="r1";	# walks ap[], rewound at the top of each outer iteration; borrowed as copy source at the end
     28 $bp="r2"; $bi="r2"; $rp="r2";	# one register, three roles: bp on entry, current bp[i] word in the loops, rp in the tail
     29 $np="r3";	# walks np[], rewound like $ap
     30 $tp="r4";	# walks tp[] inside the loops; carries the bp pointer between them
     31 $aj="r5";	# ap[j]; byte length of the vectors in the tail
     32 $nj="r6";	# np[j]
     33 $tj="r7";	# tp[j] and general scratch
     34 $n0="r8";	# &n0, then *n0, then tp[0]*n0 for the current iteration
     35 ###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
     36 $alo="r10";	# sl, gcc uses it to keep @GOT; low word of the ap[j]*bp[i] accumulator
     37 $ahi="r11";	# fp; high word of the ap[j]*bp[i] accumulator
     38 $nlo="r12";	# ip; low word of the np[j]*n0 accumulator
     39 ###########	# r13 is stack pointer
     40 $nhi="r14";	# lr; high word of the np[j]*n0 accumulator, top carry word at loop exit
     41 ###########	# r15 is program counter
     42 
     43 #### argument block layout relative to &tp[num-1], a.k.a. $num
        # Byte offsets from $num once it points at &tp[num-1]:
        #    +0       tp[num-1]
        #    +4       tp[num] (the "+extra dword" of the alloca)
        #    +8..+44  r4-r12,lr saved by the second stmdb (10 words)
        #    +48,+52  r0 (rp) and r2 (bp) pushed by the first stmdb
        #    +56,+60  the two words that sat at the top of the stack on entry
        # Each string below is spliced between [ ] in a load/store operand.
     44 $_rp="$num,#12*4";	# saved rp
     45 # ap permanently resides in r1
     46 $_bp="$num,#13*4";	# saved bp, rewritten as the outer loop advances
     47 # np permanently resides in r3
     48 $_n0="$num,#14*4";	# holds &n0 on entry, overwritten with the n0 value itself
     49 $_num="$num,#15*4";	$_bpend=$_num;	# num on entry; slot is reused as the end-of-bp marker
     50 
        # The heredoc below is interpolated: every $name is replaced by the
        # register (or offset string) assigned above, and a literal '@' that
        # is followed by a word character has to be written as '\@'.
        #
        # Emitted routine, bn_mul_mont:
        #   - rp, ap, bp, np arrive in r0-r3; &n0 and num are read from the stack.
        #   - num < 2 (signed compare): undo the first push, return r0 = 0.
        #   - otherwise 4*num+4 bytes are carved out of the stack for tp[0..num].
        #   - .L1st          first pass using bp[0]; results are stored one word
        #                    lower than they are computed (tp[j-1]).
        #   - .Louter/.Linner same pass for each further word of bp, adding the
        #                    previous tp[] in.  The stored end marker is
        #                    bp + 4*(num-1), i.e. &bp[num-1]: the pointer is
        #                    pre-incremented before each pass and compared after.
        #   - .Lsub          rp[] = tp[] - np[] with borrow propagation (teq keeps
        #                    the carry flag intact across the loop test).
        #   - .Lcopy         source pointer is chosen with and/bic/orr on $nhi
        #                    ("ap=borrow?tp:rp"), copied to rp[], while every
        #                    tp[] word is overwritten with the value of sp.
        #   - the frame is unwound and r0 = 1 is returned.
     51 $code=<<___;
     52 .text
     53 
     54 .global	bn_mul_mont
     55 .type	bn_mul_mont,%function
     56 
     57 .align	2
     58 bn_mul_mont:
     59 	stmdb	sp!,{r0,r2}		@ sp points at argument block
     60 	ldr	$num,[sp,#3*4]		@ load num
     61 	cmp	$num,#2
     62 	movlt	r0,#0
     63 	addlt	sp,sp,#2*4
     64 	blt	.Labrt
     65 
     66 	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
     67 
     68 	mov	$num,$num,lsl#2		@ rescale $num for byte count
     69 	sub	sp,sp,$num		@ alloca(4*num)
     70 	sub	sp,sp,#4		@ +extra dword
     71 	sub	$num,$num,#4		@ "num=num-1"
     72 	add	$tp,$bp,$num		@ &bp[num-1]
     73 
     74 	add	$num,sp,$num		@ $num to point at &tp[num-1]
     75 	ldr	$n0,[$_n0]		@ &n0
     76 	ldr	$bi,[$bp]		@ bp[0]
     77 	ldr	$aj,[$ap],#4		@ ap[0],ap++
     78 	ldr	$nj,[$np],#4		@ np[0],np++
     79 	ldr	$n0,[$n0]		@ *n0
     80 	str	$tp,[$_bpend]		@ save &bp[num]
     81 
     82 	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
     83 	str	$n0,[$_n0]		@ save n0 value
     84 	mul	$n0,$alo,$n0		@ "tp[0]"*n0
     85 	mov	$nlo,#0
     86 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
     87 	mov	$tp,sp
     88 
     89 .L1st:
     90 	ldr	$aj,[$ap],#4		@ ap[j],ap++
     91 	mov	$alo,$ahi
     92 	mov	$ahi,#0
     93 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
     94 	ldr	$nj,[$np],#4		@ np[j],np++
     95 	mov	$nhi,#0
     96 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
     97 	adds	$nlo,$nlo,$alo
     98 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
     99 	adc	$nlo,$nhi,#0
    100 	cmp	$tp,$num
    101 	bne	.L1st
    102 
    103 	adds	$nlo,$nlo,$ahi
    104 	mov	$nhi,#0
    105 	adc	$nhi,$nhi,#0
    106 	ldr	$tp,[$_bp]		@ restore bp
    107 	str	$nlo,[$num]		@ tp[num-1]=
    108 	ldr	$n0,[$_n0]		@ restore n0
    109 	str	$nhi,[$num,#4]		@ tp[num]=
    110 
    112 .Louter:
    113 	sub	$tj,$num,sp		@ "original" $num-1 value
    114 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
    115 	sub	$np,$np,$tj		@ "rewind" np to &np[1]
    116 	ldr	$bi,[$tp,#4]!		@ *(++bp)
    117 	ldr	$aj,[$ap,#-4]		@ ap[0]
    118 	ldr	$nj,[$np,#-4]		@ np[0]
    119 	ldr	$alo,[sp]		@ tp[0]
    120 	ldr	$tj,[sp,#4]		@ tp[1]
    121 
    122 	mov	$ahi,#0
    123 	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
    124 	str	$tp,[$_bp]		@ save bp
    125 	mul	$n0,$alo,$n0
    126 	mov	$nlo,#0
    127 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
    128 	mov	$tp,sp
    129 
    130 .Linner:
    131 	ldr	$aj,[$ap],#4		@ ap[j],ap++
    132 	adds	$alo,$ahi,$tj		@ +=tp[j]
    133 	mov	$ahi,#0
    134 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
    135 	ldr	$nj,[$np],#4		@ np[j],np++
    136 	mov	$nhi,#0
    137 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
    138 	ldr	$tj,[$tp,#8]		@ tp[j+1]
    139 	adc	$ahi,$ahi,#0
    140 	adds	$nlo,$nlo,$alo
    141 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
    142 	adc	$nlo,$nhi,#0
    143 	cmp	$tp,$num
    144 	bne	.Linner
    145 
    146 	adds	$nlo,$nlo,$ahi
    147 	mov	$nhi,#0
    148 	adc	$nhi,$nhi,#0
    149 	adds	$nlo,$nlo,$tj
    150 	adc	$nhi,$nhi,#0
    151 	ldr	$tp,[$_bp]		@ restore bp
    152 	ldr	$tj,[$_bpend]		@ restore &bp[num]
    153 	str	$nlo,[$num]		@ tp[num-1]=
    154 	ldr	$n0,[$_n0]		@ restore n0
    155 	str	$nhi,[$num,#4]		@ tp[num]=
    156 
    157 	cmp	$tp,$tj
    158 	bne	.Louter
    159 
    161 	ldr	$rp,[$_rp]		@ pull rp
    162 	add	$num,$num,#4		@ $num to point at &tp[num]
    163 	sub	$aj,$num,sp		@ "original" num value
    164 	mov	$tp,sp			@ "rewind" $tp
    165 	mov	$ap,$tp			@ "borrow" $ap
    166 	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
    167 
    168 	subs	$tj,$tj,$tj		@ "clear" carry flag
    169 .Lsub:	ldr	$tj,[$tp],#4
    170 	ldr	$nj,[$np],#4
    171 	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
    172 	str	$tj,[$rp],#4		@ rp[j]=
    173 	teq	$tp,$num		@ preserve carry
    174 	bne	.Lsub
    175 	sbcs	$nhi,$nhi,#0		@ upmost carry
    176 	mov	$tp,sp			@ "rewind" $tp
    177 	sub	$rp,$rp,$aj		@ "rewind" $rp
    178 
    179 	and	$ap,$tp,$nhi
    180 	bic	$np,$rp,$nhi
    181 	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
    182 
    183 .Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
    184 	str	sp,[$tp],#4		@ zap tp
    185 	str	$tj,[$rp],#4
    186 	cmp	$tp,$num
    187 	bne	.Lcopy
    188 
    189 	add	sp,$num,#4		@ skip over tp[num+1]
    190 	ldmia	sp!,{r4-r12,lr}		@ restore registers
    191 	add	sp,sp,#2*4		@ skip over {r0,r2}
    192 	mov	r0,#1
    193 .Labrt:	tst	lr,#1
    194 	moveq	pc,lr			@ be binary compatible with V4, yet
    195 	bx	lr			@ interoperable with Thumb ISA:-)
    196 .size	bn_mul_mont,.-bn_mul_mont
    197 .asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
    198 .align	2
    199 ___
    200 
    201 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
        # 0xe12fff1e is the machine encoding of "bx lr"; emitting it as data
        # keeps the output acceptable to an assembler limited to ARMv4, which
        # does not accept the mnemonic itself.
    202 print $code;
        # Output is buffered, so a failed write (full disk, closed pipe) may
        # only be reported when the handle is closed. Abort with a non-zero
        # status in that case, so the build cannot continue with truncated
        # assembly.
    203 close STDOUT or die "error closing STDOUT: $!";
    204