1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 10 # April 2006 11 12 # "Teaser" Montgomery multiplication module for PowerPC. It's possible 13 # to gain a bit more by modulo-scheduling outer loop, then dedicated 14 # squaring procedure should give further 20% and code can be adapted 15 # for 32-bit application running on 64-bit CPU. As for the latter. 16 # It won't be able to achieve "native" 64-bit performance, because in 17 # 32-bit application context every addc instruction will have to be 18 # expanded as addc, twice right shift by 32 and finally adde, etc. 19 # So far RSA *sign* performance improvement over pre-bn_mul_mont asm 20 # for 64-bit application running on PPC970/G5 is: 21 # 22 # 512-bit +65% 23 # 1024-bit +35% 24 # 2048-bit +18% 25 # 4096-bit +4% 26 27 $flavour = shift; 28 29 if ($flavour =~ /32/) { 30 $BITS= 32; 31 $BNSZ= $BITS/8; 32 $SIZE_T=4; 33 $RZONE= 224; 34 $FRAME= $SIZE_T*16; 35 36 $LD= "lwz"; # load 37 $LDU= "lwzu"; # load and update 38 $LDX= "lwzx"; # load indexed 39 $ST= "stw"; # store 40 $STU= "stwu"; # store and update 41 $STX= "stwx"; # store indexed 42 $STUX= "stwux"; # store indexed and update 43 $UMULL= "mullw"; # unsigned multiply low 44 $UMULH= "mulhwu"; # unsigned multiply high 45 $UCMP= "cmplw"; # unsigned compare 46 $SHRI= "srwi"; # unsigned shift right by immediate 47 $PUSH= $ST; 48 $POP= $LD; 49 } elsif ($flavour =~ /64/) { 50 $BITS= 64; 51 $BNSZ= $BITS/8; 52 $SIZE_T=8; 53 $RZONE= 288; 54 $FRAME= $SIZE_T*16; 55 56 # same as above, but 64-bit mnemonics... 57 $LD= "ld"; # load 58 $LDU= "ldu"; # load and update 59 $LDX= "ldx"; # load indexed 60 $ST= "std"; # store 61 $STU= "stdu"; # store and update 62 $STX= "stdx"; # store indexed 63 $STUX= "stdux"; # store indexed and update 64 $UMULL= "mulld"; # unsigned multiply low 65 $UMULH= "mulhdu"; # unsigned multiply high 66 $UCMP= "cmpld"; # unsigned compare 67 $SHRI= "srdi"; # unsigned shift right by immediate 68 $PUSH= $ST; 69 $POP= $LD; 70 } else { die "nonsense $flavour"; } 71 72 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 74 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 75 die "can't locate ppc-xlate.pl"; 76 77 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 78 79 $sp="r1"; 80 $toc="r2"; 81 $rp="r3"; $ovf="r3"; 82 $ap="r4"; 83 $bp="r5"; 84 $np="r6"; 85 $n0="r7"; 86 $num="r8"; 87 $rp="r9"; # $rp is reassigned 88 $aj="r10"; 89 $nj="r11"; 90 $tj="r12"; 91 # non-volatile registers 92 $i="r14"; 93 $j="r15"; 94 $tp="r16"; 95 $m0="r17"; 96 $m1="r18"; 97 $lo0="r19"; 98 $hi0="r20"; 99 $lo1="r21"; 100 $hi1="r22"; 101 $alo="r23"; 102 $ahi="r24"; 103 $nlo="r25"; 104 # 105 $nhi="r0"; 106 107 $code=<<___; 108 .machine "any" 109 .text 110 111 .globl .bn_mul_mont 112 .align 4 113 .bn_mul_mont: 114 cmpwi $num,4 115 mr $rp,r3 ; $rp is reassigned 116 li r3,0 117 bltlr 118 119 slwi $num,$num,`log($BNSZ)/log(2)` 120 li $tj,-4096 121 addi $ovf,$num,`$FRAME+$RZONE` 122 subf $ovf,$ovf,$sp ; $sp-$ovf 123 and $ovf,$ovf,$tj ; minimize TLB usage 124 subf $ovf,$sp,$ovf ; $ovf-$sp 125 srwi $num,$num,`log($BNSZ)/log(2)` 126 $STUX $sp,$sp,$ovf 127 128 $PUSH r14,`4*$SIZE_T`($sp) 129 $PUSH r15,`5*$SIZE_T`($sp) 130 $PUSH r16,`6*$SIZE_T`($sp) 131 $PUSH r17,`7*$SIZE_T`($sp) 132 $PUSH r18,`8*$SIZE_T`($sp) 133 $PUSH r19,`9*$SIZE_T`($sp) 134 $PUSH r20,`10*$SIZE_T`($sp) 135 $PUSH r21,`11*$SIZE_T`($sp) 136 $PUSH r22,`12*$SIZE_T`($sp) 137 $PUSH r23,`13*$SIZE_T`($sp) 138 $PUSH r24,`14*$SIZE_T`($sp) 139 $PUSH r25,`15*$SIZE_T`($sp) 140 141 $LD $n0,0($n0) ; pull n0[0] value 142 addi $num,$num,-2 ; adjust $num for counter register 143 145 $LD $m0,0($bp) ; m0=bp[0] 146 $LD $aj,0($ap) ; ap[0] 147 addi $tp,$sp,$FRAME 148 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] 149 $UMULH $hi0,$aj,$m0 150 151 $LD $aj,$BNSZ($ap) ; ap[1] 152 $LD $nj,0($np) ; np[0] 153 154 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 155 156 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] 157 $UMULH $ahi,$aj,$m0 158 159 $UMULL $lo1,$nj,$m1 ; np[0]*m1 160 $UMULH $hi1,$nj,$m1 161 $LD $nj,$BNSZ($np) ; np[1] 162 addc $lo1,$lo1,$lo0 163 addze $hi1,$hi1 164 165 $UMULL $nlo,$nj,$m1 ; np[1]*m1 166 $UMULH $nhi,$nj,$m1 167 168 mtctr $num 169 li $j,`2*$BNSZ` 170 .align 4 171 L1st: 172 $LDX $aj,$ap,$j ; ap[j] 173 addc $lo0,$alo,$hi0 174 $LDX $nj,$np,$j ; np[j] 175 addze $hi0,$ahi 176 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] 177 addc $lo1,$nlo,$hi1 178 $UMULH $ahi,$aj,$m0 179 addze $hi1,$nhi 180 $UMULL $nlo,$nj,$m1 ; np[j]*m1 181 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 182 $UMULH $nhi,$nj,$m1 183 addze $hi1,$hi1 184 $ST $lo1,0($tp) ; tp[j-1] 185 186 addi $j,$j,$BNSZ ; j++ 187 addi $tp,$tp,$BNSZ ; tp++ 188 bdnz- L1st 189 ;L1st 190 addc $lo0,$alo,$hi0 191 addze $hi0,$ahi 192 193 addc $lo1,$nlo,$hi1 194 addze $hi1,$nhi 195 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 196 addze $hi1,$hi1 197 $ST $lo1,0($tp) ; tp[j-1] 198 199 li $ovf,0 200 addc $hi1,$hi1,$hi0 201 addze $ovf,$ovf ; upmost overflow bit 202 $ST $hi1,$BNSZ($tp) 203 205 li $i,$BNSZ 206 .align 4 207 Louter: 208 $LDX $m0,$bp,$i ; m0=bp[i] 209 $LD $aj,0($ap) ; ap[0] 210 addi $tp,$sp,$FRAME 211 $LD $tj,$FRAME($sp) ; tp[0] 212 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] 213 $UMULH $hi0,$aj,$m0 214 $LD $aj,$BNSZ($ap) ; ap[1] 215 $LD $nj,0($np) ; np[0] 216 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] 217 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 218 addze $hi0,$hi0 219 $UMULL $m1,$lo0,$n0 ; tp[0]*n0 220 $UMULH $ahi,$aj,$m0 221 $UMULL $lo1,$nj,$m1 ; np[0]*m1 222 $UMULH $hi1,$nj,$m1 223 $LD $nj,$BNSZ($np) ; np[1] 224 addc $lo1,$lo1,$lo0 225 $UMULL $nlo,$nj,$m1 ; np[1]*m1 226 addze $hi1,$hi1 227 $UMULH $nhi,$nj,$m1 228 230 mtctr $num 231 li $j,`2*$BNSZ` 232 .align 4 233 Linner: 234 $LDX $aj,$ap,$j ; ap[j] 235 addc $lo0,$alo,$hi0 236 $LD $tj,$BNSZ($tp) ; tp[j] 237 addze $hi0,$ahi 238 $LDX $nj,$np,$j ; np[j] 239 addc $lo1,$nlo,$hi1 240 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 241 addze $hi1,$nhi 242 $UMULH $ahi,$aj,$m0 243 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 244 $UMULL $nlo,$nj,$m1 ; np[j]*m1 245 addze $hi0,$hi0 246 $UMULH $nhi,$nj,$m1 247 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 248 addi $j,$j,$BNSZ ; j++ 249 addze $hi1,$hi1 250 $ST $lo1,0($tp) ; tp[j-1] 251 addi $tp,$tp,$BNSZ ; tp++ 252 bdnz- Linner 253 ;Linner 254 $LD $tj,$BNSZ($tp) ; tp[j] 255 addc $lo0,$alo,$hi0 256 addze $hi0,$ahi 257 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 258 addze $hi0,$hi0 259 260 addc $lo1,$nlo,$hi1 261 addze $hi1,$nhi 262 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 263 addze $hi1,$hi1 264 $ST $lo1,0($tp) ; tp[j-1] 265 266 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] 267 li $ovf,0 268 adde $hi1,$hi1,$hi0 269 addze $ovf,$ovf 270 $ST $hi1,$BNSZ($tp) 271 ; 272 slwi $tj,$num,`log($BNSZ)/log(2)` 273 $UCMP $i,$tj 274 addi $i,$i,$BNSZ 275 ble- Louter 276 278 addi $num,$num,2 ; restore $num 279 subfc $j,$j,$j ; j=0 and "clear" XER[CA] 280 addi $tp,$sp,$FRAME 281 mtctr $num 282 283 .align 4 284 Lsub: $LDX $tj,$tp,$j 285 $LDX $nj,$np,$j 286 subfe $aj,$nj,$tj ; tp[j]-np[j] 287 $STX $aj,$rp,$j 288 addi $j,$j,$BNSZ 289 bdnz- Lsub 290 291 li $j,0 292 mtctr $num 293 subfe $ovf,$j,$ovf ; handle upmost overflow bit 294 and $ap,$tp,$ovf 295 andc $np,$rp,$ovf 296 or $ap,$ap,$np ; ap=borrow?tp:rp 297 298 .align 4 299 Lcopy: ; copy or in-place refresh 300 $LDX $tj,$ap,$j 301 $STX $tj,$rp,$j 302 $STX $j,$tp,$j ; zap at once 303 addi $j,$j,$BNSZ 304 bdnz- Lcopy 305 306 $POP r14,`4*$SIZE_T`($sp) 307 $POP r15,`5*$SIZE_T`($sp) 308 $POP r16,`6*$SIZE_T`($sp) 309 $POP r17,`7*$SIZE_T`($sp) 310 $POP r18,`8*$SIZE_T`($sp) 311 $POP r19,`9*$SIZE_T`($sp) 312 $POP r20,`10*$SIZE_T`($sp) 313 $POP r21,`11*$SIZE_T`($sp) 314 $POP r22,`12*$SIZE_T`($sp) 315 $POP r23,`13*$SIZE_T`($sp) 316 $POP r24,`14*$SIZE_T`($sp) 317 $POP r25,`15*$SIZE_T`($sp) 318 $POP $sp,0($sp) 319 li r3,1 320 blr 321 .long 0 322 .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" 323 ___ 324 325 $code =~ s/\`([^\`]*)\`/eval $1/gem; 326 print $code; 327 close STDOUT; 328