#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# This script is a generator: it assembles the text of the bn_mul_mont
# routine in $code and prints it.
#
# Usage: the command-line arguments are scanned left to right and the
# first one that looks like a file name ("name.ext") becomes the output
# file.  If no argument qualifies, the assembly is written to the
# STDOUT inherited from the caller.
#
# The generated routine returns 0 in r0 without doing any work when its
# num argument is less than 2, and 1 in r0 otherwise.

use strict;
use warnings;

my $output;
while ($output = shift @ARGV) {
    last if $output =~ /^\w[\w\-]*\.\w+$/;
}

# Fail loudly if the output cannot be created; a silently closed STDOUT
# would make the final print a no-op and leave the build without its
# assembly source.
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Register allocation.  r2 carries three names because bp (pointer),
# bi (current bp[i] word) and rp (result pointer) are never needed at
# the same time; the pointers that must survive are parked in the
# argument block on the stack (see below).
my $num = "r0";    # starts as num argument, but holds &tp[num-1]
my $ap  = "r1";
my $bp  = "r2";
my $bi  = "r2";
my $rp  = "r2";
my $np  = "r3";
my $tp  = "r4";
my $aj  = "r5";
my $nj  = "r6";
my $tj  = "r7";
my $n0  = "r8";
###########        # r9 is reserved by ELF as platform specific, e.g. TLS pointer
my $alo = "r10";   # sl, gcc uses it to keep @GOT
my $ahi = "r11";   # fp
my $nlo = "r12";   # ip
###########        # r13 is stack pointer
my $nhi = "r14";   # lr
###########        # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
#
# Above &tp[num-1] the frame holds, in 32-bit words:
#   +1        tp[num]
#   +2..+11   the ten registers saved by "stmdb sp!,{r4-r12,lr}"
#   +12,+13   r0 (rp) and r2 (bp) saved by "stmdb sp!,{r0,r2}"
#   +14,+15   the two stack-passed arguments (n0 pointer, num)
# The n0 slot is overwritten with the value it points to, and the num
# slot is reused to hold the end-of-bp marker, hence $_bpend.
my $_rp    = "$num,#12*4";
# ap permanently resides in r1
my $_bp    = "$num,#13*4";
# np permanently resides in r3
my $_n0    = "$num,#14*4";
my $_num   = "$num,#15*4";
my $_bpend = $_num;

my $code = <<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=


.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter


	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit "bx lr" as a raw instruction word so the output still assembles
# when the assembler is restricted to plain ARMv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;

# Buffered write errors only surface at close; report them instead of
# leaving a truncated output file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";