Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # April 2006
     11 
     12 # "Teaser" Montgomery multiplication module for PowerPC. It's possible
     13 # to gain a bit more by modulo-scheduling outer loop, then dedicated
     14 # squaring procedure should give further 20% and code can be adapted
     15 # for 32-bit application running on 64-bit CPU. As for the latter,
     16 # it won't be able to achieve "native" 64-bit performance, because in
     17 # 32-bit application context every addc instruction will have to be
     18 # expanded as addc, twice right shift by 32 and finally adde, etc.
     19 # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
     20 # for 64-bit application running on PPC970/G5 is:
     21 #
     22 # 512-bit	+65%	
     23 # 1024-bit	+35%
     24 # 2048-bit	+18%
     25 # 4096-bit	+4%
     26 
     27 $flavour = shift;	# first command-line argument; must contain "32" or "64"
     28 
     29 if ($flavour =~ /32/) {
     30 	$BITS=	32;		# width of one bignum word in bits
     31 	$BNSZ=	$BITS/8;	# width of one bignum word in bytes (stride for ap[], bp[], np[], tp[])
     32 	$SIZE_T=4;		# slot size used for the register save area and frame layout
     33 	$RZONE=	224;		# extra bytes added to the frame size; NOTE(review): presumably the ABI red zone - confirm
     34 
     35 	$LD=	"lwz";		# load
     36 	$LDU=	"lwzu";		# load and update
     37 	$LDX=	"lwzx";		# load indexed
     38 	$ST=	"stw";		# store
     39 	$STU=	"stwu";		# store and update
     40 	$STX=	"stwx";		# store indexed
     41 	$STUX=	"stwux";	# store indexed and update
     42 	$UMULL=	"mullw";	# unsigned multiply low
     43 	$UMULH=	"mulhwu";	# unsigned multiply high
     44 	$UCMP=	"cmplw";	# unsigned compare
     45 	$SHRI=	"srwi";		# unsigned shift right by immediate	
     46 	$PUSH=	$ST;		# register save uses the plain store
     47 	$POP=	$LD;		# register restore uses the plain load
     48 } elsif ($flavour =~ /64/) {
     49 	$BITS=	64;		# width of one bignum word in bits
     50 	$BNSZ=	$BITS/8;	# width of one bignum word in bytes
     51 	$SIZE_T=8;		# slot size used for the register save area and frame layout
     52 	$RZONE=	288;		# extra bytes added to the frame size; NOTE(review): presumably the ABI red zone - confirm
     53 
     54 	# same as above, but 64-bit mnemonics...
     55 	$LD=	"ld";		# load
     56 	$LDU=	"ldu";		# load and update
     57 	$LDX=	"ldx";		# load indexed
     58 	$ST=	"std";		# store
     59 	$STU=	"stdu";		# store and update
     60 	$STX=	"stdx";		# store indexed
     61 	$STUX=	"stdux";	# store indexed and update
     62 	$UMULL=	"mulld";	# unsigned multiply low
     63 	$UMULH=	"mulhdu";	# unsigned multiply high
     64 	$UCMP=	"cmpld";	# unsigned compare
     65 	$SHRI=	"srdi";		# unsigned shift right by immediate	
     66 	$PUSH=	$ST;		# register save uses the plain store
     67 	$POP=	$LD;		# register restore uses the plain load
     68 } else { die "nonsense $flavour"; }	# neither "32" nor "64" found in the flavour string
     69 
     70 $FRAME=8*$SIZE_T+$RZONE;	# fixed part of the stack frame; num*$BNSZ is added to it at run time
     71 $LOCALS=8*$SIZE_T;		# offset of tp[] from the stack pointer
     72 
     73 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's path, trailing separator included
     74 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
     75 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
     76 die "can't locate ppc-xlate.pl";
     77 
     78 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";	# low-precedence "or" so the die guards open(); "||" would test the always-true command string instead
     79 
     80 $sp="r1";	# stack pointer
     81 $toc="r2";
     82 $rp="r3";	$ovf="r3";	# r3 holds rp on entry; it is reused for the overflow bit and the return value
     83 $ap="r4";	# argument: pointer to a[]
     84 $bp="r5";	# argument: pointer to b[]
     85 $np="r6";	# argument: pointer to n[]
     86 $n0="r7";	# argument: pointer to n0; overwritten with the word it points to
     87 $num="r8";	# argument: number of words
     88 $rp="r9";	# $rp is reassigned
     89 $aj="r10";	# current ap[j]
     90 $nj="r11";	# current np[j]
     91 $tj="r12";	# current tp[j]; also general scratch (old stack pointer, masks, loop bound)
     92 # non-volatile registers
     93 $i="r20";	# outer loop byte offset into bp[]
     94 $j="r21";	# inner loop byte offset into ap[]/np[]
     95 $tp="r22";	# running pointer into the tp[] vector kept on the stack
     96 $m0="r23";	# bp[i]
     97 $m1="r24";	# tp[0]*n0
     98 $lo0="r25";	# low word of the running ap[j]*bp[i] sum
     99 $hi0="r26";	# carry word of the running ap[j]*bp[i] sum
    100 $lo1="r27";	# low word of the running np[j]*m1 sum
    101 $hi1="r28";	# carry word of the running np[j]*m1 sum
    102 $alo="r29";	# low half of ap[j]*m0
    103 $ahi="r30";	# high half of ap[j]*m0
    104 $nlo="r31";	# low half of np[j]*m1
    105 # r0 is used for the remaining temporary
    106 $nhi="r0";	# high half of np[j]*m1
    107 
    108 $code=<<___;	# function entry: copy rp from r3 to $rp, preset return value 0, return if num<4
    109 .machine "any"
    110 .text
    111 
    112 .globl	.bn_mul_mont_int
    113 .align	4
    114 .bn_mul_mont_int:
    115 	cmpwi	$num,4
    116 	mr	$rp,r3		; $rp is reassigned
    117 	li	r3,0
    118 	bltlr
    119 ___
    120 $code.=<<___ if ($BNSZ==4);	# 32-bit words only: also return early when num>=32
    121 	cmpwi	$num,32		; longer key performance is not better
    122 	bgelr
    123 ___
    124 $code.=<<___;	# main body: allocate frame and save r20-r31, first pass with bp[0] (L1st), passes for the remaining bp[i] (Louter/Linner), tp-np subtraction (Lsub), select-and-copy into rp while zeroing tp (Lcopy), restore registers and return 1
    125 	slwi	$num,$num,`log($BNSZ)/log(2)`
    126 	li	$tj,-4096
    127 	addi	$ovf,$num,$FRAME
    128 	subf	$ovf,$ovf,$sp	; $sp-$ovf
    129 	and	$ovf,$ovf,$tj	; minimize TLB usage
    130 	subf	$ovf,$sp,$ovf	; $ovf-$sp
    131 	mr	$tj,$sp
    132 	srwi	$num,$num,`log($BNSZ)/log(2)`
    133 	$STUX	$sp,$sp,$ovf
    134 
    135 	$PUSH	r20,`-12*$SIZE_T`($tj)
    136 	$PUSH	r21,`-11*$SIZE_T`($tj)
    137 	$PUSH	r22,`-10*$SIZE_T`($tj)
    138 	$PUSH	r23,`-9*$SIZE_T`($tj)
    139 	$PUSH	r24,`-8*$SIZE_T`($tj)
    140 	$PUSH	r25,`-7*$SIZE_T`($tj)
    141 	$PUSH	r26,`-6*$SIZE_T`($tj)
    142 	$PUSH	r27,`-5*$SIZE_T`($tj)
    143 	$PUSH	r28,`-4*$SIZE_T`($tj)
    144 	$PUSH	r29,`-3*$SIZE_T`($tj)
    145 	$PUSH	r30,`-2*$SIZE_T`($tj)
    146 	$PUSH	r31,`-1*$SIZE_T`($tj)
    147 
    148 	$LD	$n0,0($n0)	; pull n0[0] value
    149 	addi	$num,$num,-2	; adjust $num for counter register
    150 
    152 	$LD	$m0,0($bp)	; m0=bp[0]
    153 	$LD	$aj,0($ap)	; ap[0]
    154 	addi	$tp,$sp,$LOCALS
    155 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
    156 	$UMULH	$hi0,$aj,$m0
    157 
    158 	$LD	$aj,$BNSZ($ap)	; ap[1]
    159 	$LD	$nj,0($np)	; np[0]
    160 
    161 	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
    162 
    163 	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
    164 	$UMULH	$ahi,$aj,$m0
    165 
    166 	$UMULL	$lo1,$nj,$m1	; np[0]*m1
    167 	$UMULH	$hi1,$nj,$m1
    168 	$LD	$nj,$BNSZ($np)	; np[1]
    169 	addc	$lo1,$lo1,$lo0
    170 	addze	$hi1,$hi1
    171 
    172 	$UMULL	$nlo,$nj,$m1	; np[1]*m1
    173 	$UMULH	$nhi,$nj,$m1
    174 
    175 	mtctr	$num
    176 	li	$j,`2*$BNSZ`
    177 .align	4
    178 L1st:
    179 	$LDX	$aj,$ap,$j	; ap[j]
    180 	addc	$lo0,$alo,$hi0
    181 	$LDX	$nj,$np,$j	; np[j]
    182 	addze	$hi0,$ahi
    183 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
    184 	addc	$lo1,$nlo,$hi1
    185 	$UMULH	$ahi,$aj,$m0
    186 	addze	$hi1,$nhi
    187 	$UMULL	$nlo,$nj,$m1	; np[j]*m1
    188 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
    189 	$UMULH	$nhi,$nj,$m1
    190 	addze	$hi1,$hi1
    191 	$ST	$lo1,0($tp)	; tp[j-1]
    192 
    193 	addi	$j,$j,$BNSZ	; j++
    194 	addi	$tp,$tp,$BNSZ	; tp++
    195 	bdnz-	L1st
    196 ;L1st
    197 	addc	$lo0,$alo,$hi0
    198 	addze	$hi0,$ahi
    199 
    200 	addc	$lo1,$nlo,$hi1
    201 	addze	$hi1,$nhi
    202 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
    203 	addze	$hi1,$hi1
    204 	$ST	$lo1,0($tp)	; tp[j-1]
    205 
    206 	li	$ovf,0
    207 	addc	$hi1,$hi1,$hi0
    208 	addze	$ovf,$ovf	; upmost overflow bit
    209 	$ST	$hi1,$BNSZ($tp)
    210 
    212 	li	$i,$BNSZ
    213 .align	4
    214 Louter:
    215 	$LDX	$m0,$bp,$i	; m0=bp[i]
    216 	$LD	$aj,0($ap)	; ap[0]
    217 	addi	$tp,$sp,$LOCALS
    218 	$LD	$tj,$LOCALS($sp); tp[0]
    219 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
    220 	$UMULH	$hi0,$aj,$m0
    221 	$LD	$aj,$BNSZ($ap)	; ap[1]
    222 	$LD	$nj,0($np)	; np[0]
    223 	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
    224 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
    225 	addze	$hi0,$hi0
    226 	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
    227 	$UMULH	$ahi,$aj,$m0
    228 	$UMULL	$lo1,$nj,$m1	; np[0]*m1
    229 	$UMULH	$hi1,$nj,$m1
    230 	$LD	$nj,$BNSZ($np)	; np[1]
    231 	addc	$lo1,$lo1,$lo0
    232 	$UMULL	$nlo,$nj,$m1	; np[1]*m1
    233 	addze	$hi1,$hi1
    234 	$UMULH	$nhi,$nj,$m1
    235 
    237 	mtctr	$num
    238 	li	$j,`2*$BNSZ`
    239 .align	4
    240 Linner:
    241 	$LDX	$aj,$ap,$j	; ap[j]
    242 	addc	$lo0,$alo,$hi0
    243 	$LD	$tj,$BNSZ($tp)	; tp[j]
    244 	addze	$hi0,$ahi
    245 	$LDX	$nj,$np,$j	; np[j]
    246 	addc	$lo1,$nlo,$hi1
    247 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
    248 	addze	$hi1,$nhi
    249 	$UMULH	$ahi,$aj,$m0
    250 	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
    251 	$UMULL	$nlo,$nj,$m1	; np[j]*m1
    252 	addze	$hi0,$hi0
    253 	$UMULH	$nhi,$nj,$m1
    254 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
    255 	addi	$j,$j,$BNSZ	; j++
    256 	addze	$hi1,$hi1
    257 	$ST	$lo1,0($tp)	; tp[j-1]
    258 	addi	$tp,$tp,$BNSZ	; tp++
    259 	bdnz-	Linner
    260 ;Linner
    261 	$LD	$tj,$BNSZ($tp)	; tp[j]
    262 	addc	$lo0,$alo,$hi0
    263 	addze	$hi0,$ahi
    264 	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
    265 	addze	$hi0,$hi0
    266 
    267 	addc	$lo1,$nlo,$hi1
    268 	addze	$hi1,$nhi
    269 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
    270 	addze	$hi1,$hi1
    271 	$ST	$lo1,0($tp)	; tp[j-1]
    272 
    273 	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
    274 	li	$ovf,0
    275 	adde	$hi1,$hi1,$hi0
    276 	addze	$ovf,$ovf
    277 	$ST	$hi1,$BNSZ($tp)
    278 ;
    279 	slwi	$tj,$num,`log($BNSZ)/log(2)`
    280 	$UCMP	$i,$tj
    281 	addi	$i,$i,$BNSZ
    282 	ble-	Louter
    283 
    285 	addi	$num,$num,2	; restore $num
    286 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
    287 	addi	$tp,$sp,$LOCALS
    288 	mtctr	$num
    289 
    290 .align	4
    291 Lsub:	$LDX	$tj,$tp,$j
    292 	$LDX	$nj,$np,$j
    293 	subfe	$aj,$nj,$tj	; tp[j]-np[j]
    294 	$STX	$aj,$rp,$j
    295 	addi	$j,$j,$BNSZ
    296 	bdnz-	Lsub
    297 
    298 	li	$j,0
    299 	mtctr	$num
    300 	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
    301 	and	$ap,$tp,$ovf
    302 	andc	$np,$rp,$ovf
    303 	or	$ap,$ap,$np	; ap=borrow?tp:rp
    304 
    305 .align	4
    306 Lcopy:				; copy or in-place refresh
    307 	$LDX	$tj,$ap,$j
    308 	$STX	$tj,$rp,$j
    309 	$STX	$j,$tp,$j	; zap at once
    310 	addi	$j,$j,$BNSZ
    311 	bdnz-	Lcopy
    312 
    313 	$POP	$tj,0($sp)
    314 	li	r3,1
    315 	$POP	r20,`-12*$SIZE_T`($tj)
    316 	$POP	r21,`-11*$SIZE_T`($tj)
    317 	$POP	r22,`-10*$SIZE_T`($tj)
    318 	$POP	r23,`-9*$SIZE_T`($tj)
    319 	$POP	r24,`-8*$SIZE_T`($tj)
    320 	$POP	r25,`-7*$SIZE_T`($tj)
    321 	$POP	r26,`-6*$SIZE_T`($tj)
    322 	$POP	r27,`-5*$SIZE_T`($tj)
    323 	$POP	r28,`-4*$SIZE_T`($tj)
    324 	$POP	r29,`-3*$SIZE_T`($tj)
    325 	$POP	r30,`-2*$SIZE_T`($tj)
    326 	$POP	r31,`-1*$SIZE_T`($tj)
    327 	mr	$sp,$tj
    328 	blr
    329 	.long	0
    330 	.byte	0,12,4,0,0x80,12,6,0
    331 	.long	0
    332 
    333 .asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
    334 ___
    335 
    336 $code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace every `...` span with the value of the Perl expression inside it
    337 print $code;				# STDOUT is the pipe into the ppc-xlate.pl translator
    338 close STDOUT or die "error closing STDOUT: $!";	# closing the pipe waits for the translator; fail loudly if it reported an error
    339