Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # On PA-7100LC this module performs ~90-50% better, less for longer
     11 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
     12 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
     13 # multiplication, which in turn means that "baseline" performance was
     14 # optimal in respect to instruction set capabilities. Fair comparison
     15 # with vendor compiler is problematic, because OpenSSL doesn't define
     16 # BN_LLONG [presumably] for historical reasons, which drives compiler
     17 # toward 4 times 16x16=32-bit multiplicatons [plus complementary
     18 # shifts and additions] instead. This means that you should observe
     19 # several times improvement over code generated by vendor compiler
     20 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
     21 # improvement coefficient was never collected on PA-7100LC, or any
     22 # other 1.1 CPU, because I don't have access to such machine with
     23 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
     24 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
     25 # of ~5x on PA-8600.
     26 #
     27 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
     28 # reportedly ~2x faster than vendor compiler generated code [according
     29 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
     30 # this implementation is actually 32-bit one, in the sense that it
     31 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
     32 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
     33 # module picks halves of 64-bit values in reverse order and pretends
     34 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
     35 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
     36 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
     37 # i.e. there is no "wider" multiplication like on most other 64-bit
     38 # platforms. This means that even being effectively 32-bit, this
     39 # implementation performs "64-bit" computational task in same amount
     40 # of arithmetic operations, most notably multiplications. It requires
     41 # more memory references, most notably to tp[num], but this doesn't
     42 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
     43 # 2.0 code path, provides virtually same performance as pa-risc2[W].s:
     44 # it's ~10% better for shortest key length and ~10% worse for longest
     45 # one.
     46 #
     47 # In case it wasn't clear. The module has two distinct code paths:
     48 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
     49 # additions and 64-bit integer loads, not to mention specific
     50 # instruction scheduling. In 64-bit build naturally only 2.0 code path
     51 # is assembled. In 32-bit application context both code paths are
     52 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
     53 # is taken automatically. Also, in 32-bit build the module imposes
     54 # couple of limitations: vector lengths has to be even and vector
     55 # addresses has to be 64-bit aligned. Normally neither is a problem:
     56 # most common key lengths are even and vectors are commonly malloc-ed,
     57 # which ensures alignment.
     58 #
     59 # Special thanks to polarhome.com for providing HP-UX account on
     60 # PA-RISC 1.1 machine, and to correspondent who chose to remain
     61 # anonymous for testing the code on PA-RISC 2.0 machine.
     62 
     64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     65 
     66 $flavour = shift;
     67 $output = shift;
     68 
     69 open STDOUT,">$output";
     70 
     71 if ($flavour =~ /64/) {
     72 	$LEVEL		="2.0W";
     73 	$SIZE_T		=8;
     74 	$FRAME_MARKER	=80;
     75 	$SAVED_RP	=16;
     76 	$PUSH		="std";
     77 	$PUSHMA		="std,ma";
     78 	$POP		="ldd";
     79 	$POPMB		="ldd,mb";
     80 	$BN_SZ		=$SIZE_T;
     81 } else {
     82 	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
     83 	$SIZE_T		=4;
     84 	$FRAME_MARKER	=48;
     85 	$SAVED_RP	=20;
     86 	$PUSH		="stw";
     87 	$PUSHMA		="stwm";
     88 	$POP		="ldw";
     89 	$POPMB		="ldwm";
     90 	$BN_SZ		=$SIZE_T;
     91 	if (open CONF,"<${dir}../../opensslconf.h") {
     92 	    while(<CONF>) {
     93 		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
     94 		    $BN_SZ=8;
     95 		    $LEVEL="2.0";
     96 		    last;
     97 		}
     98 	    }
     99 	    close CONF;
    100 	}
    101 }
    102 
    103 $FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
    104 				#                [+ argument transfer]
    105 $LOCALS=$FRAME-$FRAME_MARKER;
    106 $FRAME+=32;			# local variables
    107 
    108 $tp="%r31";
    109 $ti1="%r29";
    110 $ti0="%r28";
    111 
    112 $rp="%r26";
    113 $ap="%r25";
    114 $bp="%r24";
    115 $np="%r23";
    116 $n0="%r22";	# passed through stack in 32-bit
    117 $num="%r21";	# passed through stack in 32-bit
    118 $idx="%r20";
    119 $arrsz="%r19";
    120 
    121 $nm1="%r7";
    122 $nm0="%r6";
    123 $ab1="%r5";
    124 $ab0="%r4";
    125 
    126 $fp="%r3";
    127 $hi1="%r2";
    128 $hi0="%r1";
    129 
    130 $xfer=$n0;	# accomodates [-16..15] offset in fld[dw]s
    131 
    132 $fm0="%fr4";	$fti=$fm0;
    133 $fbi="%fr5L";
    134 $fn0="%fr5R";
    135 $fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
    136 $fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
    137 
    138 $code=<<___;
    139 	.LEVEL	$LEVEL
    140 	.SPACE	\$TEXT\$
    141 	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
    142 
    143 	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
    144 	.ALIGN	64
    145 bn_mul_mont
    146 	.PROC
    147 	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
    148 	.ENTRY
    149 	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
    150 	$PUSHMA	%r3,$FRAME(%sp)
    151 	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
    152 	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
    153 	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
    154 	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
    155 	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
    156 	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
    157 	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
    158 	ldo	-$FRAME(%sp),$fp
    159 ___
    160 $code.=<<___ if ($SIZE_T==4);
    161 	ldw	`-$FRAME_MARKER-4`($fp),$n0
    162 	ldw	`-$FRAME_MARKER-8`($fp),$num
    163 	nop
    164 	nop					; alignment
    165 ___
    166 $code.=<<___ if ($BN_SZ==4);
    167 	comiclr,<=	6,$num,%r0		; are vectors long enough?
    168 	b		L\$abort
    169 	ldi		0,%r28			; signal "unhandled"
    170 	add,ev		%r0,$num,$num		; is $num even?
    171 	b		L\$abort
    172 	nop
    173 	or		$ap,$np,$ti1
    174 	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
    175 	b		L\$abort
    176 	nop
    177 	nop					; alignment
    178 	nop
    179 
    180 	fldws		0($n0),${fn0}
    181 	fldws,ma	4($bp),${fbi}		; bp[0]
    182 ___
    183 $code.=<<___ if ($BN_SZ==8);
    184 	comib,>		3,$num,L\$abort		; are vectors long enough?
    185 	ldi		0,%r28			; signal "unhandled"
    186 	addl		$num,$num,$num		; I operate on 32-bit values
    187 
    188 	fldws		4($n0),${fn0}		; only low part of n0
    189 	fldws		4($bp),${fbi}		; bp[0] in flipped word order
    190 ___
    191 $code.=<<___;
    192 	fldds		0($ap),${fai}		; ap[0,1]
    193 	fldds		0($np),${fni}		; np[0,1]
    194 
    195 	sh2addl		$num,%r0,$arrsz
    196 	ldi		31,$hi0
    197 	ldo		36($arrsz),$hi1		; space for tp[num+1]
    198 	andcm		$hi1,$hi0,$hi1		; align
    199 	addl		$hi1,%sp,%sp
    200 	$PUSH		$fp,-$SIZE_T(%sp)
    201 
    202 	ldo		`$LOCALS+16`($fp),$xfer
    203 	ldo		`$LOCALS+32+4`($fp),$tp
    204 
    205 	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
    206 	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
    207 	xmpyu		${fn0},${fab0}R,${fm0}
    208 
    209 	addl		$arrsz,$ap,$ap		; point at the end
    210 	addl		$arrsz,$np,$np
    211 	subi		0,$arrsz,$idx		; j=0
    212 	ldo		8($idx),$idx		; j++++
    213 
    214 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
    215 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
    216 	fstds		${fab0},-16($xfer)
    217 	fstds		${fnm0},-8($xfer)
    218 	fstds		${fab1},0($xfer)
    219 	fstds		${fnm1},8($xfer)
    220 	 flddx		$idx($ap),${fai}	; ap[2,3]
    221 	 flddx		$idx($np),${fni}	; np[2,3]
    222 ___
    223 $code.=<<___ if ($BN_SZ==4);
    224 	mtctl		$hi0,%cr11		; $hi0 still holds 31
    225 	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
    226 	b		L\$parisc11
    227 	nop
    228 ___
    229 $code.=<<___;					# PA-RISC 2.0 code-path
    230 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
    231 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    232 	ldd		-16($xfer),$ab0
    233 	fstds		${fab0},-16($xfer)
    234 
    235 	extrd,u		$ab0,31,32,$hi0
    236 	extrd,u		$ab0,63,32,$ab0
    237 	ldd		-8($xfer),$nm0
    238 	fstds		${fnm0},-8($xfer)
    239 	 ldo		8($idx),$idx		; j++++
    240 	 addl		$ab0,$nm0,$nm0		; low part is discarded
    241 	 extrd,u	$nm0,31,32,$hi1
    242 
    244 L\$1st
    245 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
    246 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
    247 	ldd		0($xfer),$ab1
    248 	fstds		${fab1},0($xfer)
    249 	 addl		$hi0,$ab1,$ab1
    250 	 extrd,u	$ab1,31,32,$hi0
    251 	ldd		8($xfer),$nm1
    252 	fstds		${fnm1},8($xfer)
    253 	 extrd,u	$ab1,63,32,$ab1
    254 	 addl		$hi1,$nm1,$nm1
    255 	flddx		$idx($ap),${fai}	; ap[j,j+1]
    256 	flddx		$idx($np),${fni}	; np[j,j+1]
    257 	 addl		$ab1,$nm1,$nm1
    258 	 extrd,u	$nm1,31,32,$hi1
    259 
    260 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
    261 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    262 	ldd		-16($xfer),$ab0
    263 	fstds		${fab0},-16($xfer)
    264 	 addl		$hi0,$ab0,$ab0
    265 	 extrd,u	$ab0,31,32,$hi0
    266 	ldd		-8($xfer),$nm0
    267 	fstds		${fnm0},-8($xfer)
    268 	 extrd,u	$ab0,63,32,$ab0
    269 	 addl		$hi1,$nm0,$nm0
    270 	stw		$nm1,-4($tp)		; tp[j-1]
    271 	 addl		$ab0,$nm0,$nm0
    272 	 stw,ma		$nm0,8($tp)		; tp[j-1]
    273 	addib,<>	8,$idx,L\$1st		; j++++
    274 	 extrd,u	$nm0,31,32,$hi1
    275 
    276 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
    277 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
    278 	ldd		0($xfer),$ab1
    279 	fstds		${fab1},0($xfer)
    280 	 addl		$hi0,$ab1,$ab1
    281 	 extrd,u	$ab1,31,32,$hi0
    282 	ldd		8($xfer),$nm1
    283 	fstds		${fnm1},8($xfer)
    284 	 extrd,u	$ab1,63,32,$ab1
    285 	 addl		$hi1,$nm1,$nm1
    286 	ldd		-16($xfer),$ab0
    287 	 addl		$ab1,$nm1,$nm1
    288 	ldd		-8($xfer),$nm0
    289 	 extrd,u	$nm1,31,32,$hi1
    290 
    291 	 addl		$hi0,$ab0,$ab0
    292 	 extrd,u	$ab0,31,32,$hi0
    293 	stw		$nm1,-4($tp)		; tp[j-1]
    294 	 extrd,u	$ab0,63,32,$ab0
    295 	 addl		$hi1,$nm0,$nm0
    296 	ldd		0($xfer),$ab1
    297 	 addl		$ab0,$nm0,$nm0
    298 	ldd,mb		8($xfer),$nm1
    299 	 extrd,u	$nm0,31,32,$hi1
    300 	stw,ma		$nm0,8($tp)		; tp[j-1]
    301 
    302 	ldo		-1($num),$num		; i--
    303 	subi		0,$arrsz,$idx		; j=0
    304 ___
    305 $code.=<<___ if ($BN_SZ==4);
    306 	fldws,ma	4($bp),${fbi}		; bp[1]
    307 ___
    308 $code.=<<___ if ($BN_SZ==8);
    309 	fldws		0($bp),${fbi}		; bp[1] in flipped word order
    310 ___
    311 $code.=<<___;
    312 	 flddx		$idx($ap),${fai}	; ap[0,1]
    313 	 flddx		$idx($np),${fni}	; np[0,1]
    314 	 fldws		8($xfer),${fti}R	; tp[0]
    315 	addl		$hi0,$ab1,$ab1
    316 	 extrd,u	$ab1,31,32,$hi0
    317 	 extrd,u	$ab1,63,32,$ab1
    318 	 ldo		8($idx),$idx		; j++++
    319 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
    320 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
    321 	addl		$hi1,$nm1,$nm1
    322 	addl		$ab1,$nm1,$nm1
    323 	extrd,u		$nm1,31,32,$hi1
    324 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
    325 	stw		$nm1,-4($tp)		; tp[j-1]
    326 
    327 	 fcpy,sgl	%fr0,${fti}L		; zero high part
    328 	 fcpy,sgl	%fr0,${fab0}L
    329 	addl		$hi1,$hi0,$hi0
    330 	extrd,u		$hi0,31,32,$hi1
    331 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
    332 	 fcnvxf,dbl,dbl	${fab0},${fab0}
    333 	stw		$hi0,0($tp)
    334 	stw		$hi1,4($tp)
    335 
    336 	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
    337 	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
    338 	xmpyu		${fn0},${fab0}R,${fm0}
    339 	ldo		`$LOCALS+32+4`($fp),$tp
    340 L\$outer
    341 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
    342 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
    343 	fstds		${fab0},-16($xfer)	; 33-bit value
    344 	fstds		${fnm0},-8($xfer)
    345 	 flddx		$idx($ap),${fai}	; ap[2]
    346 	 flddx		$idx($np),${fni}	; np[2]
    347 	 ldo		8($idx),$idx		; j++++
    348 	ldd		-16($xfer),$ab0		; 33-bit value
    349 	ldd		-8($xfer),$nm0
    350 	ldw		0($xfer),$hi0		; high part
    351 
    352 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
    353 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    354 	 extrd,u	$ab0,31,32,$ti0		; carry bit
    355 	 extrd,u	$ab0,63,32,$ab0
    356 	fstds		${fab1},0($xfer)
    357 	 addl		$ti0,$hi0,$hi0		; account carry bit
    358 	fstds		${fnm1},8($xfer)
    359 	 addl		$ab0,$nm0,$nm0		; low part is discarded
    360 	ldw		0($tp),$ti1		; tp[1]
    361 	 extrd,u	$nm0,31,32,$hi1
    362 	fstds		${fab0},-16($xfer)
    363 	fstds		${fnm0},-8($xfer)
    364 
    366 L\$inner
    367 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
    368 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
    369 	ldd		0($xfer),$ab1
    370 	fstds		${fab1},0($xfer)
    371 	 addl		$hi0,$ti1,$ti1
    372 	 addl		$ti1,$ab1,$ab1
    373 	ldd		8($xfer),$nm1
    374 	fstds		${fnm1},8($xfer)
    375 	 extrd,u	$ab1,31,32,$hi0
    376 	 extrd,u	$ab1,63,32,$ab1
    377 	flddx		$idx($ap),${fai}	; ap[j,j+1]
    378 	flddx		$idx($np),${fni}	; np[j,j+1]
    379 	 addl		$hi1,$nm1,$nm1
    380 	 addl		$ab1,$nm1,$nm1
    381 	ldw		4($tp),$ti0		; tp[j]
    382 	stw		$nm1,-4($tp)		; tp[j-1]
    383 
    384 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
    385 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    386 	ldd		-16($xfer),$ab0
    387 	fstds		${fab0},-16($xfer)
    388 	 addl		$hi0,$ti0,$ti0
    389 	 addl		$ti0,$ab0,$ab0
    390 	ldd		-8($xfer),$nm0
    391 	fstds		${fnm0},-8($xfer)
    392 	 extrd,u	$ab0,31,32,$hi0
    393 	 extrd,u	$nm1,31,32,$hi1
    394 	ldw		8($tp),$ti1		; tp[j]
    395 	 extrd,u	$ab0,63,32,$ab0
    396 	 addl		$hi1,$nm0,$nm0
    397 	 addl		$ab0,$nm0,$nm0
    398 	 stw,ma		$nm0,8($tp)		; tp[j-1]
    399 	addib,<>	8,$idx,L\$inner		; j++++
    400 	 extrd,u	$nm0,31,32,$hi1
    401 
    402 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
    403 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
    404 	ldd		0($xfer),$ab1
    405 	fstds		${fab1},0($xfer)
    406 	 addl		$hi0,$ti1,$ti1
    407 	 addl		$ti1,$ab1,$ab1
    408 	ldd		8($xfer),$nm1
    409 	fstds		${fnm1},8($xfer)
    410 	 extrd,u	$ab1,31,32,$hi0
    411 	 extrd,u	$ab1,63,32,$ab1
    412 	ldw		4($tp),$ti0		; tp[j]
    413 	 addl		$hi1,$nm1,$nm1
    414 	 addl		$ab1,$nm1,$nm1
    415 	ldd		-16($xfer),$ab0
    416 	ldd		-8($xfer),$nm0
    417 	 extrd,u	$nm1,31,32,$hi1
    418 
    419 	addl		$hi0,$ab0,$ab0
    420 	 addl		$ti0,$ab0,$ab0
    421 	 stw		$nm1,-4($tp)		; tp[j-1]
    422 	 extrd,u	$ab0,31,32,$hi0
    423 	ldw		8($tp),$ti1		; tp[j]
    424 	 extrd,u	$ab0,63,32,$ab0
    425 	 addl		$hi1,$nm0,$nm0
    426 	ldd		0($xfer),$ab1
    427 	 addl		$ab0,$nm0,$nm0
    428 	ldd,mb		8($xfer),$nm1
    429 	 extrd,u	$nm0,31,32,$hi1
    430 	 stw,ma		$nm0,8($tp)		; tp[j-1]
    431 
    432 	addib,=		-1,$num,L\$outerdone	; i--
    433 	subi		0,$arrsz,$idx		; j=0
    434 ___
    435 $code.=<<___ if ($BN_SZ==4);
    436 	fldws,ma	4($bp),${fbi}		; bp[i]
    437 ___
    438 $code.=<<___ if ($BN_SZ==8);
    439 	ldi		12,$ti0			; bp[i] in flipped word order
    440 	addl,ev		%r0,$num,$num
    441 	ldi		-4,$ti0
    442 	addl		$ti0,$bp,$bp
    443 	fldws		0($bp),${fbi}
    444 ___
    445 $code.=<<___;
    446 	 flddx		$idx($ap),${fai}	; ap[0]
    447 	addl		$hi0,$ab1,$ab1
    448 	 flddx		$idx($np),${fni}	; np[0]
    449 	 fldws		8($xfer),${fti}R	; tp[0]
    450 	addl		$ti1,$ab1,$ab1
    451 	extrd,u		$ab1,31,32,$hi0
    452 	extrd,u		$ab1,63,32,$ab1
    453 
    454 	 ldo		8($idx),$idx		; j++++
    455 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
    456 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
    457 	ldw		4($tp),$ti0		; tp[j]
    458 
    459 	addl		$hi1,$nm1,$nm1
    460 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
    461 	addl		$ab1,$nm1,$nm1
    462 	extrd,u		$nm1,31,32,$hi1
    463 	 fcpy,sgl	%fr0,${fti}L		; zero high part
    464 	 fcpy,sgl	%fr0,${fab0}L
    465 	stw		$nm1,-4($tp)		; tp[j-1]
    466 
    467 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
    468 	 fcnvxf,dbl,dbl	${fab0},${fab0}
    469 	addl		$hi1,$hi0,$hi0
    470 	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
    471 	addl		$ti0,$hi0,$hi0
    472 	extrd,u		$hi0,31,32,$hi1
    473 	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
    474 	stw		$hi0,0($tp)
    475 	stw		$hi1,4($tp)
    476 	 xmpyu		${fn0},${fab0}R,${fm0}
    477 
    478 	b		L\$outer
    479 	ldo		`$LOCALS+32+4`($fp),$tp
    480 
    482 L\$outerdone
    483 	addl		$hi0,$ab1,$ab1
    484 	addl		$ti1,$ab1,$ab1
    485 	extrd,u		$ab1,31,32,$hi0
    486 	extrd,u		$ab1,63,32,$ab1
    487 
    488 	ldw		4($tp),$ti0		; tp[j]
    489 
    490 	addl		$hi1,$nm1,$nm1
    491 	addl		$ab1,$nm1,$nm1
    492 	extrd,u		$nm1,31,32,$hi1
    493 	stw		$nm1,-4($tp)		; tp[j-1]
    494 
    495 	addl		$hi1,$hi0,$hi0
    496 	addl		$ti0,$hi0,$hi0
    497 	extrd,u		$hi0,31,32,$hi1
    498 	stw		$hi0,0($tp)
    499 	stw		$hi1,4($tp)
    500 
    501 	ldo		`$LOCALS+32`($fp),$tp
    502 	sub		%r0,%r0,%r0		; clear borrow
    503 ___
    504 $code.=<<___ if ($BN_SZ==4);
    505 	ldws,ma		4($tp),$ti0
    506 	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
    507 	b		L\$sub_pa11
    508 	addl		$tp,$arrsz,$tp
    509 L\$sub
    510 	ldwx		$idx($np),$hi0
    511 	subb		$ti0,$hi0,$hi1
    512 	ldwx		$idx($tp),$ti0
    513 	addib,<>	4,$idx,L\$sub
    514 	stws,ma		$hi1,4($rp)
    515 
    516 	subb		$ti0,%r0,$hi1
    517 	ldo		-4($tp),$tp
    518 ___
    519 $code.=<<___ if ($BN_SZ==8);
    520 	ldd,ma		8($tp),$ti0
    521 L\$sub
    522 	ldd		$idx($np),$hi0
    523 	shrpd		$ti0,$ti0,32,$ti0	; flip word order
    524 	std		$ti0,-8($tp)		; save flipped value
    525 	sub,db		$ti0,$hi0,$hi1
    526 	ldd,ma		8($tp),$ti0
    527 	addib,<>	8,$idx,L\$sub
    528 	std,ma		$hi1,8($rp)
    529 
    530 	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
    531 	sub,db		$ti0,%r0,$hi1
    532 	ldo		-8($tp),$tp
    533 ___
    534 $code.=<<___;
    535 	and		$tp,$hi1,$ap
    536 	andcm		$rp,$hi1,$bp
    537 	or		$ap,$bp,$np
    538 
    539 	sub		$rp,$arrsz,$rp		; rewind rp
    540 	subi		0,$arrsz,$idx
    541 	ldo		`$LOCALS+32`($fp),$tp
    542 L\$copy
    543 	ldd		$idx($np),$hi0
    544 	std,ma		%r0,8($tp)
    545 	addib,<>	8,$idx,.-8		; L\$copy
    546 	std,ma		$hi0,8($rp)	
    547 ___
    548 
    549 if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
    550 $ablo=$ab0;
    551 $abhi=$ab1;
    552 $nmlo0=$nm0;
    553 $nmhi0=$nm1;
    554 $nmlo1="%r9";
    555 $nmhi1="%r8";
    556 
    557 $code.=<<___;
    558 	b		L\$done
    559 	nop
    560 
    561 	.ALIGN		8
    562 L\$parisc11
    563 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
    564 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    565 	ldw		-12($xfer),$ablo
    566 	ldw		-16($xfer),$hi0
    567 	ldw		-4($xfer),$nmlo0
    568 	ldw		-8($xfer),$nmhi0
    569 	fstds		${fab0},-16($xfer)
    570 	fstds		${fnm0},-8($xfer)
    571 
    572 	 ldo		8($idx),$idx		; j++++
    573 	 add		$ablo,$nmlo0,$nmlo0	; discarded
    574 	 addc		%r0,$nmhi0,$hi1
    575 	ldw		4($xfer),$ablo
    576 	ldw		0($xfer),$abhi
    577 	nop
    578 
    580 L\$1st_pa11
    581 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
    582 	flddx		$idx($ap),${fai}	; ap[j,j+1]
    583 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
    584 	flddx		$idx($np),${fni}	; np[j,j+1]
    585 	 add		$hi0,$ablo,$ablo
    586 	ldw		12($xfer),$nmlo1
    587 	 addc		%r0,$abhi,$hi0
    588 	ldw		8($xfer),$nmhi1
    589 	 add		$ablo,$nmlo1,$nmlo1
    590 	fstds		${fab1},0($xfer)
    591 	 addc		%r0,$nmhi1,$nmhi1
    592 	fstds		${fnm1},8($xfer)
    593 	 add		$hi1,$nmlo1,$nmlo1
    594 	ldw		-12($xfer),$ablo
    595 	 addc		%r0,$nmhi1,$hi1
    596 	ldw		-16($xfer),$abhi
    597 
    598 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
    599 	ldw		-4($xfer),$nmlo0
    600 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    601 	ldw		-8($xfer),$nmhi0
    602 	 add		$hi0,$ablo,$ablo
    603 	stw		$nmlo1,-4($tp)		; tp[j-1]
    604 	 addc		%r0,$abhi,$hi0
    605 	fstds		${fab0},-16($xfer)
    606 	 add		$ablo,$nmlo0,$nmlo0
    607 	fstds		${fnm0},-8($xfer)
    608 	 addc		%r0,$nmhi0,$nmhi0
    609 	ldw		0($xfer),$abhi
    610 	 add		$hi1,$nmlo0,$nmlo0
    611 	ldw		4($xfer),$ablo
    612 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
    613 	addib,<>	8,$idx,L\$1st_pa11	; j++++
    614 	 addc		%r0,$nmhi0,$hi1
    615 
    616 	 ldw		8($xfer),$nmhi1
    617 	 ldw		12($xfer),$nmlo1
    618 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
    619 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
    620 	 add		$hi0,$ablo,$ablo
    621 	fstds		${fab1},0($xfer)
    622 	 addc		%r0,$abhi,$hi0
    623 	fstds		${fnm1},8($xfer)
    624 	 add		$ablo,$nmlo1,$nmlo1
    625 	ldw		-16($xfer),$abhi
    626 	 addc		%r0,$nmhi1,$nmhi1
    627 	ldw		-12($xfer),$ablo
    628 	 add		$hi1,$nmlo1,$nmlo1
    629 	ldw		-8($xfer),$nmhi0
    630 	 addc		%r0,$nmhi1,$hi1
    631 	ldw		-4($xfer),$nmlo0
    632 
    633 	 add		$hi0,$ablo,$ablo
    634 	stw		$nmlo1,-4($tp)		; tp[j-1]
    635 	 addc		%r0,$abhi,$hi0
    636 	ldw		0($xfer),$abhi
    637 	 add		$ablo,$nmlo0,$nmlo0
    638 	ldw		4($xfer),$ablo
    639 	 addc		%r0,$nmhi0,$nmhi0
    640 	ldws,mb		8($xfer),$nmhi1
    641 	 add		$hi1,$nmlo0,$nmlo0
    642 	ldw		4($xfer),$nmlo1
    643 	 addc		%r0,$nmhi0,$hi1
    644 	stws,ma		$nmlo0,8($tp)		; tp[j-1]
    645 
    646 	ldo		-1($num),$num		; i--
    647 	subi		0,$arrsz,$idx		; j=0
    648 
    649 	 fldws,ma	4($bp),${fbi}		; bp[1]
    650 	 flddx		$idx($ap),${fai}	; ap[0,1]
    651 	 flddx		$idx($np),${fni}	; np[0,1]
    652 	 fldws		8($xfer),${fti}R	; tp[0]
    653 	add		$hi0,$ablo,$ablo
    654 	addc		%r0,$abhi,$hi0
    655 	 ldo		8($idx),$idx		; j++++
    656 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
    657 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
    658 	add		$hi1,$nmlo1,$nmlo1
    659 	addc		%r0,$nmhi1,$nmhi1
    660 	add		$ablo,$nmlo1,$nmlo1
    661 	addc		%r0,$nmhi1,$hi1
    662 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
    663 	stw		$nmlo1,-4($tp)		; tp[j-1]
    664 
    665 	 fcpy,sgl	%fr0,${fti}L		; zero high part
    666 	 fcpy,sgl	%fr0,${fab0}L
    667 	add		$hi1,$hi0,$hi0
    668 	addc		%r0,%r0,$hi1
    669 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
    670 	 fcnvxf,dbl,dbl	${fab0},${fab0}
    671 	stw		$hi0,0($tp)
    672 	stw		$hi1,4($tp)
    673 
    674 	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
    675 	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
    676 	xmpyu		${fn0},${fab0}R,${fm0}
    677 	ldo		`$LOCALS+32+4`($fp),$tp
    678 L\$outer_pa11
    679 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
    680 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
    681 	fstds		${fab0},-16($xfer)	; 33-bit value
    682 	fstds		${fnm0},-8($xfer)
    683 	 flddx		$idx($ap),${fai}	; ap[2,3]
    684 	 flddx		$idx($np),${fni}	; np[2,3]
    685 	ldw		-16($xfer),$abhi	; carry bit actually
    686 	 ldo		8($idx),$idx		; j++++
    687 	ldw		-12($xfer),$ablo
    688 	ldw		-8($xfer),$nmhi0
    689 	ldw		-4($xfer),$nmlo0
    690 	ldw		0($xfer),$hi0		; high part
    691 
    692 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
    693 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    694 	fstds		${fab1},0($xfer)
    695 	 addl		$abhi,$hi0,$hi0		; account carry bit
    696 	fstds		${fnm1},8($xfer)
    697 	 add		$ablo,$nmlo0,$nmlo0	; discarded
    698 	ldw		0($tp),$ti1		; tp[1]
    699 	 addc		%r0,$nmhi0,$hi1
    700 	fstds		${fab0},-16($xfer)
    701 	fstds		${fnm0},-8($xfer)
    702 	ldw		4($xfer),$ablo
    703 	ldw		0($xfer),$abhi
    704 
    706 L\$inner_pa11
    707 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
    708 	flddx		$idx($ap),${fai}	; ap[j,j+1]
    709 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
    710 	flddx		$idx($np),${fni}	; np[j,j+1]
    711 	 add		$hi0,$ablo,$ablo
    712 	ldw		4($tp),$ti0		; tp[j]
    713 	 addc		%r0,$abhi,$abhi
    714 	ldw		12($xfer),$nmlo1
    715 	 add		$ti1,$ablo,$ablo
    716 	ldw		8($xfer),$nmhi1
    717 	 addc		%r0,$abhi,$hi0
    718 	fstds		${fab1},0($xfer)
    719 	 add		$ablo,$nmlo1,$nmlo1
    720 	fstds		${fnm1},8($xfer)
    721 	 addc		%r0,$nmhi1,$nmhi1
    722 	ldw		-12($xfer),$ablo
    723 	 add		$hi1,$nmlo1,$nmlo1
    724 	ldw		-16($xfer),$abhi
    725 	 addc		%r0,$nmhi1,$hi1
    726 
    727 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
    728 	ldw		8($tp),$ti1		; tp[j]
    729 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
    730 	ldw		-4($xfer),$nmlo0
    731 	 add		$hi0,$ablo,$ablo
    732 	ldw		-8($xfer),$nmhi0
    733 	 addc		%r0,$abhi,$abhi
    734 	stw		$nmlo1,-4($tp)		; tp[j-1]
    735 	 add		$ti0,$ablo,$ablo
    736 	fstds		${fab0},-16($xfer)
    737 	 addc		%r0,$abhi,$hi0
    738 	fstds		${fnm0},-8($xfer)
    739 	 add		$ablo,$nmlo0,$nmlo0
    740 	ldw		4($xfer),$ablo
    741 	 addc		%r0,$nmhi0,$nmhi0
    742 	ldw		0($xfer),$abhi
    743 	 add		$hi1,$nmlo0,$nmlo0
    744 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
    745 	addib,<>	8,$idx,L\$inner_pa11	; j++++
    746 	 addc		%r0,$nmhi0,$hi1
    747 
    748 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
    749 	ldw		12($xfer),$nmlo1
    750 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
    751 	ldw		8($xfer),$nmhi1
    752 	 add		$hi0,$ablo,$ablo
    753 	ldw		4($tp),$ti0		; tp[j]
    754 	 addc		%r0,$abhi,$abhi
    755 	fstds		${fab1},0($xfer)
    756 	 add		$ti1,$ablo,$ablo
    757 	fstds		${fnm1},8($xfer)
    758 	 addc		%r0,$abhi,$hi0
    759 	ldw		-16($xfer),$abhi
    760 	 add		$ablo,$nmlo1,$nmlo1
    761 	ldw		-12($xfer),$ablo
    762 	 addc		%r0,$nmhi1,$nmhi1
    763 	ldw		-8($xfer),$nmhi0
    764 	 add		$hi1,$nmlo1,$nmlo1
    765 	ldw		-4($xfer),$nmlo0
    766 	 addc		%r0,$nmhi1,$hi1
    767 
    768 	add		$hi0,$ablo,$ablo
    769 	 stw		$nmlo1,-4($tp)		; tp[j-1]
    770 	addc		%r0,$abhi,$abhi
    771 	 add		$ti0,$ablo,$ablo
    772 	ldw		8($tp),$ti1		; tp[j]
    773 	 addc		%r0,$abhi,$hi0
    774 	ldw		0($xfer),$abhi
    775 	 add		$ablo,$nmlo0,$nmlo0
    776 	ldw		4($xfer),$ablo
    777 	 addc		%r0,$nmhi0,$nmhi0
    778 	ldws,mb		8($xfer),$nmhi1
    779 	 add		$hi1,$nmlo0,$nmlo0
    780 	ldw		4($xfer),$nmlo1
    781 	 addc		%r0,$nmhi0,$hi1
    782 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
    783 
    784 	addib,=		-1,$num,L\$outerdone_pa11; i--
    785 	subi		0,$arrsz,$idx		; j=0
    786 
    787 	 fldws,ma	4($bp),${fbi}		; bp[i]
    788 	 flddx		$idx($ap),${fai}	; ap[0]
    789 	add		$hi0,$ablo,$ablo
    790 	addc		%r0,$abhi,$abhi
    791 	 flddx		$idx($np),${fni}	; np[0]
    792 	 fldws		8($xfer),${fti}R	; tp[0]
    793 	add		$ti1,$ablo,$ablo
    794 	addc		%r0,$abhi,$hi0
    795 
    796 	 ldo		8($idx),$idx		; j++++
    797 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
    798 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
    799 	ldw		4($tp),$ti0		; tp[j]
    800 
    801 	add		$hi1,$nmlo1,$nmlo1
    802 	addc		%r0,$nmhi1,$nmhi1
    803 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
    804 	add		$ablo,$nmlo1,$nmlo1
    805 	addc		%r0,$nmhi1,$hi1
    806 	 fcpy,sgl	%fr0,${fti}L		; zero high part
    807 	 fcpy,sgl	%fr0,${fab0}L
    808 	stw		$nmlo1,-4($tp)		; tp[j-1]
    809 
    810 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
    811 	 fcnvxf,dbl,dbl	${fab0},${fab0}
    812 	add		$hi1,$hi0,$hi0
    813 	addc		%r0,%r0,$hi1
    814 	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
    815 	add		$ti0,$hi0,$hi0
    816 	addc		%r0,$hi1,$hi1
    817 	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
    818 	stw		$hi0,0($tp)
    819 	stw		$hi1,4($tp)
    820 	 xmpyu		${fn0},${fab0}R,${fm0}
    821 
    822 	b		L\$outer_pa11
    823 	ldo		`$LOCALS+32+4`($fp),$tp
    824 
    826 L\$outerdone_pa11
    827 	add		$hi0,$ablo,$ablo
    828 	addc		%r0,$abhi,$abhi
    829 	add		$ti1,$ablo,$ablo
    830 	addc		%r0,$abhi,$hi0
    831 
    832 	ldw		4($tp),$ti0		; tp[j]
    833 
    834 	add		$hi1,$nmlo1,$nmlo1
    835 	addc		%r0,$nmhi1,$nmhi1
    836 	add		$ablo,$nmlo1,$nmlo1
    837 	addc		%r0,$nmhi1,$hi1
    838 	stw		$nmlo1,-4($tp)		; tp[j-1]
    839 
    840 	add		$hi1,$hi0,$hi0
    841 	addc		%r0,%r0,$hi1
    842 	add		$ti0,$hi0,$hi0
    843 	addc		%r0,$hi1,$hi1
    844 	stw		$hi0,0($tp)
    845 	stw		$hi1,4($tp)
    846 
    847 	ldo		`$LOCALS+32+4`($fp),$tp
    848 	sub		%r0,%r0,%r0		; clear borrow
    849 	ldw		-4($tp),$ti0
    850 	addl		$tp,$arrsz,$tp
    851 L\$sub_pa11
    852 	ldwx		$idx($np),$hi0
    853 	subb		$ti0,$hi0,$hi1
    854 	ldwx		$idx($tp),$ti0
    855 	addib,<>	4,$idx,L\$sub_pa11
    856 	stws,ma		$hi1,4($rp)
    857 
    858 	subb		$ti0,%r0,$hi1
    859 	ldo		-4($tp),$tp
    860 	and		$tp,$hi1,$ap
    861 	andcm		$rp,$hi1,$bp
    862 	or		$ap,$bp,$np
    863 
    864 	sub		$rp,$arrsz,$rp		; rewind rp
    865 	subi		0,$arrsz,$idx
    866 	ldo		`$LOCALS+32`($fp),$tp
    867 L\$copy_pa11
    868 	ldwx		$idx($np),$hi0
    869 	stws,ma		%r0,4($tp)
    870 	addib,<>	4,$idx,L\$copy_pa11
    871 	stws,ma		$hi0,4($rp)	
    872 
    873 	nop					; alignment
    874 L\$done
    875 ___
    876 }
    877 
    879 $code.=<<___;
    880 	ldi		1,%r28			; signal "handled"
    881 	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
    882 
    883 	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
    884 	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
    885 	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
    886 	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
    887 	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
    888 	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
    889 	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
    890 	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
    891 L\$abort
    892 	bv	(%r2)
    893 	.EXIT
    894 	$POPMB	-$FRAME(%sp),%r3
    895 	.PROCEND
    896 	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
    897 ___
    898 
    900 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
    901 # that it can be compiled with .LEVEL 1.0. It should be noted that I
    902 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
    903 # directive...
    904 
    905 my $ldd = sub {
    906   my ($mod,$args) = @_;
    907   my $orig = "ldd$mod\t$args";
    908 
    909     if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    910     {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
    911 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    912     }
    913     elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    914     {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
    915 	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
    916 	$opcode|=(1<<5)  if ($mod =~ /^,m/);
    917 	$opcode|=(1<<13) if ($mod =~ /^,mb/);
    918 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    919     }
    920     else { "\t".$orig; }
    921 };
    922 
    923 my $std = sub {
    924   my ($mod,$args) = @_;
    925   my $orig = "std$mod\t$args";
    926 
    927     if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
    928     {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
    929 	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
    930 	$opcode|=(1<<5)  if ($mod =~ /^,m/);
    931 	$opcode|=(1<<13) if ($mod =~ /^,mb/);
    932 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    933     }
    934     else { "\t".$orig; }
    935 };
    936 
    937 my $extrd = sub {
    938   my ($mod,$args) = @_;
    939   my $orig = "extrd$mod\t$args";
    940 
    941     # I only have ",u" completer, it's implicitly encoded...
    942     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    943     {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
    944 	my $len=32-$3;
    945 	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
    946 	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    947 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    948     }
    949     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    950     {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
    951 	my $len=32-$2;
    952 	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
    953 	$opcode |= (1<<13) if ($mod =~ /,\**=/);
    954 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    955     }
    956     else { "\t".$orig; }
    957 };
    958 
    959 my $shrpd = sub {
    960   my ($mod,$args) = @_;
    961   my $orig = "shrpd$mod\t$args";
    962 
    963     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    964     {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
    965 	my $cpos=63-$3;
    966 	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    967 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    968     }
    969     else { "\t".$orig; }
    970 };
    971 
    972 my $sub = sub {
    973   my ($mod,$args) = @_;
    974   my $orig = "sub$mod\t$args";
    975 
    976     if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
    977 	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
    978 	$opcode|=(1<<10);	# e1
    979 	$opcode|=(1<<8);	# e2
    980 	$opcode|=(1<<5);	# d
    981 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
    982     }
    983     else { "\t".$orig; }
    984 };
    985 
    986 sub assemble {
    987   my ($mnemonic,$mod,$args)=@_;
    988   my $opcode = eval("\$$mnemonic");
    989 
    990     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
    991 }
    992 
    993 foreach (split("\n",$code)) {
    994 	s/\`([^\`]*)\`/eval $1/ge;
    995 	# flip word order in 64-bit mode...
    996 	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
    997 	# assemble 2.0 instructions in 32-bit mode...
    998 	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
    999 
   1000 	print $_,"\n";
   1001 }
   1002 close STDOUT;
   1003