Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # April 2010
     11 #
     12 # The module implements "4-bit" GCM GHASH function and underlying
     13 # single multiplication operation in GF(2^128). "4-bit" means that it
     14 # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
     15 # it processes one byte in 19.6 cycles, which is more than twice as
     16 # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
     17 # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
     18 # processed byte. This is ~2.2x faster than 64-bit code generated by
     19 # vendor compiler (which used to be very hard to beat:-).
     20 #
     21 # Special thanks to polarhome.com for providing HP-UX account.
     22 
     23 $flavour = shift;
     24 $output = shift;
     25 open STDOUT,">$output";
     26 
     27 if ($flavour =~ /64/) {
     28 	$LEVEL		="2.0W";
     29 	$SIZE_T		=8;
     30 	$FRAME_MARKER	=80;
     31 	$SAVED_RP	=16;
     32 	$PUSH		="std";
     33 	$PUSHMA		="std,ma";
     34 	$POP		="ldd";
     35 	$POPMB		="ldd,mb";
     36 	$NREGS		=6;
     37 } else {
     38 	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
     39 	$SIZE_T		=4;
     40 	$FRAME_MARKER	=48;
     41 	$SAVED_RP	=20;
     42 	$PUSH		="stw";
     43 	$PUSHMA		="stwm";
     44 	$POP		="ldw";
     45 	$POPMB		="ldwm";
     46 	$NREGS		=11;
     47 }
     48 
     49 $FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
     50 				#                 [+ argument transfer]
     51 
     52 ################# volatile registers
     53 $Xi="%r26";	# argument block
     54 $Htbl="%r25";
     55 $inp="%r24";
     56 $len="%r23";
     57 $Hhh=$Htbl;	# variables
     58 $Hll="%r22";
     59 $Zhh="%r21";
     60 $Zll="%r20";
     61 $cnt="%r19";
     62 $rem_4bit="%r28";
     63 $rem="%r29";
     64 $mask0xf0="%r31";
     65 
     66 ################# preserved registers
     67 $Thh="%r1";
     68 $Tll="%r2";
     69 $nlo="%r3";
     70 $nhi="%r4";
     71 $byte="%r5";
     72 if ($SIZE_T==4) {
     73 	$Zhl="%r6";
     74 	$Zlh="%r7";
     75 	$Hhl="%r8";
     76 	$Hlh="%r9";
     77 	$Thl="%r10";
     78 	$Tlh="%r11";
     79 }
     80 $rem2="%r6";	# used in PA-RISC 2.0 code
     81 
# gcm_gmult_4bit, part 1: assembler directives, symbol export and the
# prologue.  %r2 is stored at -$SAVED_RP(%sp), then %r3 with the
# store-and-modify form ($PUSHMA, displacement $FRAME), then %r4-%r6 at
# offsets relative to -$FRAME.  The `...` spans are evaluated as Perl
# expressions by the output loop at the end of this file.
$code.=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# 32-bit flavour only: also save %r7-%r11, the registers assigned to
# $Zlh, $Hhl, $Hlh, $Thl and $Tlh above.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Compute the address of the L$rem_4bit table relative to the local
# label L$pic_gmult and load the 0xf0 nibble mask.
# NOTE(review): presumably blr leaves the address of L$pic_gmult (plus
# privilege bits in the two low bits, cleared by andcm with 3) in
# $rem_4bit - confirm against the PA-RISC manual.
# The addl of $inp and $len matches the gcm_ghash_4bit sequence; $len is
# not referenced again by the gmult code in this file.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: choose between the PA-RISC 2.0 body that follows
# and the L$parisc1_gmult body.  The extrd,u,*= is turned into a .WORD
# by the $extrd closure near the end of this file.
# NOTE(review): presumably the extract nullifies the branch on 2.0
# hardware and is a harmless operation on 1.x - confirm.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___
    122 
# gcm_gmult_4bit, PA-RISC 2.0 body (64-bit loads, shifts and stores).
# For the 32-bit flavour the ldd/std/extrd/shrpd/depd mnemonics used
# here are replaced by .WORD encodings in the output loop.
#
# Xi is read from byte 15 downwards: bytes 15 and 14 are fetched ahead
# of the loop and $cnt, starting at 13, indexes the remaining bytes via
# ldbx.  From each byte $nhi = byte & 0xf0 and $nlo is formed by depd,z;
# both index the table halves at $Hll (= $Htbl+8) and $Hhh (= $Htbl).
# The value deposited into $rem from $Zll indexes L$rem_4bit and the
# loaded entry is XORed into $Zhh.  The result is stored to 8($Xi) and
# 0($Xi).
# NOTE(review): presumably $nlo is the low nibble times 16 and $rem the
# shifted-out nibble times 8 - confirm the depd,z operand semantics.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv -1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___
    209 
# gcm_gmult_4bit, 32-bit flavour only.  The leading branch lets the
# PA-RISC 2.0 body above skip to L$done_gmult; L$parisc1_gmult is the
# alternative body built from word-sized operations (ldwx, shrpw, zdep,
# extru, stw).
#
# The 128-bit value is kept in four words $Zhh:$Zhl:$Zlh:$Zll, with the
# table pointers $Hhl, $Hlh and $Hll set to $Htbl+4, +8 and +12 ($Hhh
# is $Htbl itself).  Byte order, nibble indexing and the use of
# L$rem_4bit follow the same scheme as the 2.0 body: bytes 15 and 14
# are fetched ahead of the loop, $cnt starts at 13, and the result is
# stored to Xi at offsets 12, 8, 4 and 0.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit epilogue: reload %r2 and %r4-%r6 from the slots the
# prologue wrote.  %sp has been advanced by $FRAME since %r2 was saved,
# hence the -$FRAME-$SAVED_RP offset.  %r3 is restored by the $POPMB in
# the heredoc that follows.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# 32-bit flavour only: reload %r7-%r11, mirroring the extra saves in the
# prologue.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
    336 $code.=<<___;
    337 	bv	(%r2)
    338 	.EXIT
    339 	$POPMB	-$FRAME(%sp),%r3
    340 	.PROCEND
    341 
    342 	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
    343 	.ALIGN	64
    344 gcm_ghash_4bit
    345 	.PROC
    346 	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
    347 	.ENTRY
    348 	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
    349 	$PUSHMA	%r3,$FRAME(%sp)
    350 	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
    351 	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
    352 	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
    353 ___
# gcm_ghash_4bit, 32-bit flavour only: also save %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Compute the address of L$rem_4bit relative to the local label
# L$pic_ghash, load the 0xf0 nibble mask, and turn $len into the end
# pointer $inp+$len that terminates the outer loops below.
# NOTE(review): presumably blr leaves the address of L$pic_ghash (plus
# privilege bits cleared by the andcm with 3) in $rem_4bit - confirm.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: choose between the PA-RISC 2.0 body that follows
# and the L$parisc1_ghash body, using the same probe as gcm_gmult_4bit.
# NOTE(review): presumably the extract nullifies the branch on 2.0
# hardware - confirm.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___
    377 
# gcm_ghash_4bit, PA-RISC 2.0 body.  Same multiplication scheme as the
# gmult 2.0 body, except that every Xi byte is XORed with the byte at
# the same offset of the input before it is split into nibbles, and a
# second remainder register ($rem2) is used inside the loop.
#
# L$outer_ghash_pa2 runs once per 16 bytes of input: after the result
# is stored to Xi, $inp is advanced by 16 and compared with the end
# pointer in $len.  The trailing "copy $Zll,$nlo" follows the branch.
# NOTE(review): presumably it executes in the delay slot so that the
# next iteration finds the low byte of the new Xi in $nlo - confirm.
# For the 32-bit flavour the output loop rewrites "cmpb,*" to "comb,".
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv -1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___
    476 
# gcm_ghash_4bit, 32-bit flavour only.  The leading branch lets the
# PA-RISC 2.0 body above skip to L$done_ghash; L$parisc1_ghash is the
# alternative body built from word-sized operations.
#
# As in the gmult 1.x body the value lives in $Zhh:$Zhl:$Zlh:$Zll and
# the table pointers are $Htbl+0/+4/+8/+12.  Each Xi byte is XORed with
# the input byte at the same offset before the nibble split.
# L$outer_ghash_pa1 runs once per 16 bytes of input: the result is
# stored to Xi, $inp is advanced by 16 and compared with the end
# pointer in $len.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit epilogue: reload %r2 and %r4-%r6 from the slots the
# prologue wrote.  %r3 is restored by the $POPMB in the heredoc that
# follows.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# 32-bit flavour only: reload %r7-%r11, mirroring the extra saves in the
# prologue.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
    613 $code.=<<___;
    614 	bv	(%r2)
    615 	.EXIT
    616 	$POPMB	-$FRAME(%sp),%r3
    617 	.PROCEND
    618 
    619 	.ALIGN	64
    620 L\$rem_4bit
    621 	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
    622 	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
    623 	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
    624 	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
    625 	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
    626 	.ALIGN	64
    627 ___
    628 
    629 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
    630 # that it can be compiled with .LEVEL 1.0. It should be noted that I
    631 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
    632 # directive...
    633 
    634 my $ldd = sub {
    635   my ($mod,$args) = @_;
    636   my $orig = "ldd$mod\t$args";
    637 
    638     if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    639     {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
    640 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    641     }
    642     elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    643     {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
    644 	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
    645 	$opcode|=(1<<5)  if ($mod =~ /^,m/);
    646 	$opcode|=(1<<13) if ($mod =~ /^,mb/);
    647 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    648     }
    649     else { "\t".$orig; }
    650 };
    651 
    652 my $std = sub {
    653   my ($mod,$args) = @_;
    654   my $orig = "std$mod\t$args";
    655 
    656     if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    657     {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
    658 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    659     }
    660     else { "\t".$orig; }
    661 };
    662 
    663 my $extrd = sub {
    664   my ($mod,$args) = @_;
    665   my $orig = "extrd$mod\t$args";
    666 
    667     # I only have ",u" completer, it's implicitly encoded...
    668     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    669     {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
    670 	my $len=32-$3;
    671 	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
    672 	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    673 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    674     }
    675     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    676     {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
    677 	my $len=32-$2;
    678 	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
    679 	$opcode |= (1<<13) if ($mod =~ /,\**=/);
    680 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    681     }
    682     else { "\t".$orig; }
    683 };
    684 
    685 my $shrpd = sub {
    686   my ($mod,$args) = @_;
    687   my $orig = "shrpd$mod\t$args";
    688 
    689     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    690     {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
    691 	my $cpos=63-$3;
    692 	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    693 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    694     }
    695     elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    696     {	sprintf "\t.WORD\t0x%08x\t; %s",
    697 		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
    698     }
    699     else { "\t".$orig; }
    700 };
    701 
    702 my $depd = sub {
    703   my ($mod,$args) = @_;
    704   my $orig = "depd$mod\t$args";
    705 
    706     # I only have ",z" completer, it's impicitly encoded...
    707     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
    708     {	my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
    709     	my $cpos=63-$2;
    710 	my $len=32-$3;
    711 	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
    712 	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    713 	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    714     }
    715     else { "\t".$orig; }
    716 };
    717 
    718 sub assemble {
    719   my ($mnemonic,$mod,$args)=@_;
    720   my $opcode = eval("\$$mnemonic");
    721 
    722     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
    723 }
    724 
    725 foreach (split("\n",$code)) {
    726 	s/\`([^\`]*)\`/eval $1/ge;
    727 	if ($SIZE_T==4) {
    728 		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
    729 		s/cmpb,\*/comb,/;
    730 		s/,\*/,/;
    731 	}
    732 	print $_,"\n";
    733 }
    734 
    735 close STDOUT;
    736