#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop, a dedicated
# squaring procedure should give a further 20%, and the code can be
# adapted for a 32-bit application running on a 64-bit CPU. The latter
# won't achieve "native" 64-bit performance, because in a 32-bit
# application context every addc instruction has to be expanded into
# addc, two right shifts by 32 and a final adde, etc.
# So far the RSA *sign* performance improvement over pre-bn_mul_mont
# assembler for a 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

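# The generated bn_mul_mont routine implements word-serial Montgomery
# multiplication: for every word bp[i] it accumulates ap[]*bp[i] into a
# temporary tp[], computes m1 = tp[0]*n0 mod 2^BITS (which makes
# tp[]+np[]*m1 divisible by 2^BITS), adds np[]*m1, drops the lowest
# word, and finishes with one conditional subtraction of the modulus.
# The reference model below is only an illustrative sketch of that
# algorithm; it is never called by this generator, and its name,
# argument layout and use of Math::BigInt are assumptions made for the
# sake of the example, not part of the original module.
sub mont_mul_ref {
	my ($a,$b,$n,$n0,$num,$w) = @_;	# Math::BigInt a,b,n; n0 = -n^-1 mod 2^w
	require Math::BigInt;
	my $mask = (Math::BigInt->new(1) << $w) - 1;
	my $t = Math::BigInt->bzero();
	for my $i (0..$num-1) {
		$t += $a * (($b >> ($w*$i)) & $mask);	# t += a*b[i]
		my $m = ($t * $n0) & $mask;		# cf. "tp[0]"*n0 below
		$t = ($t + $n * $m) >> $w;		# add n*m, divide by 2^w
	}
	$t -= $n if $t >= $n;			# cf. Lsub/Lcopy below
	return $t;				# == a*b*2^(-w*num) mod n
}
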
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$SIZE_T=4;
	$RZONE=	224;
	$FRAME=	$SIZE_T*16;

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$SIZE_T=8;
	$RZONE=	288;
	$FRAME=	$SIZE_T*16;

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

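# all output is piped through ppc-xlate.pl, which translates the
# size-neutral mnemonics above into the flavour-specific assembler syntax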
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r14";
$j="r15";
$tp="r16";
$m0="r17";
$m1="r18";
$lo0="r19";
$hi0="r20";
$lo1="r21";
$hi1="r22";
$alo="r23";
$ahi="r24";
$nlo="r25";
#
$nhi="r0";

$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont
.align	4
.bn_mul_mont:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr

	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,`$FRAME+$RZONE`
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf
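				; the old $sp is saved at the new frame base
				; by the store-with-update above, so the back
				; chain survives for the epilogue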

	$PUSH	r14,`4*$SIZE_T`($sp)
	$PUSH	r15,`5*$SIZE_T`($sp)
	$PUSH	r16,`6*$SIZE_T`($sp)
	$PUSH	r17,`7*$SIZE_T`($sp)
	$PUSH	r18,`8*$SIZE_T`($sp)
	$PUSH	r19,`9*$SIZE_T`($sp)
	$PUSH	r20,`10*$SIZE_T`($sp)
	$PUSH	r21,`11*$SIZE_T`($sp)
	$PUSH	r22,`12*$SIZE_T`($sp)
	$PUSH	r23,`13*$SIZE_T`($sp)
	$PUSH	r24,`14*$SIZE_T`($sp)
	$PUSH	r25,`15*$SIZE_T`($sp)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$FRAME
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

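;
; outer loop: for i=1..num-1 accumulate ap[]*bp[i] and np[]*m1 into tp[],
; dropping the lowest word of tp[] on every iteration
;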
	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$FRAME
	$LD	$tj,$FRAME($sp)	; tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

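;
; tp[] and the overflow bit now hold the reduced product; subtract np[]
; once (Lsub), then use the final borrow and the overflow bit to select
; either tp[] or tp[]-np[] as the value copied to rp[] (Lcopy), zeroing
; tp[] on the way
;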
	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$FRAME
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	r14,`4*$SIZE_T`($sp)
	$POP	r15,`5*$SIZE_T`($sp)
	$POP	r16,`6*$SIZE_T`($sp)
	$POP	r17,`7*$SIZE_T`($sp)
	$POP	r18,`8*$SIZE_T`($sp)
	$POP	r19,`9*$SIZE_T`($sp)
	$POP	r20,`10*$SIZE_T`($sp)
	$POP	r21,`11*$SIZE_T`($sp)
	$POP	r22,`12*$SIZE_T`($sp)
	$POP	r23,`13*$SIZE_T`($sp)
	$POP	r24,`14*$SIZE_T`($sp)
	$POP	r25,`15*$SIZE_T`($sp)
	$POP	$sp,0($sp)
	li	r3,1
	blr
	.long	0
.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;