#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a 128-byte shared
# table]. Streamed GHASH performance was measured to be 6.67 cycles
# per processed byte on Itanium 2, which is >90% better than Microsoft
# compiler-generated code. For reference, the sha1-ia64.pl module
# processes one byte in 5.7 cycles. On Itanium, GHASH should run at
# ~8.5 cycles per byte.

# September 2010
#
# It was originally thought that implementing the "528B" variant on
# Itanium 2 made less sense, for the following reason: because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5, so
# the theoretical improvement could not exceed 20%. But occasionally
# you prove yourself wrong:-) I figured out a way to fold a couple of
# instructions and, having freed yet another instruction slot by
# unrolling the loop, ended up at 4.45 cycles per processed byte,
# 50% better than the "256B" version. On the original Itanium,
# performance should remain the same as the "256B" version, i.e.
# ~8.5 cycles.

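# For orientation, below is a plain-Perl model of the one-block "4-bit"
# multiplication that gcm_gmult_4bit implements: Z = Xi*H in GF(2^128),
# processed a nibble at a time from Xi[15] down to Xi[0], with the bits
# shifted out on each step folded back in through rem_4bit. It is only
# a reader's sketch and is never called by this generator; it assumes a
# 64-bit perl, and the argument layouts (Xi as a 16-byte string, Htable
# as a reference to 16 [hi,lo] pairs) are this sketch's own convention.
sub _gcm_gmult_4bit_model {
    my ($Xi, $Htable) = @_;
    my @rem_4bit = map { $_<<48 }
	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
    my ($Zhi,$Zlo,$first) = (0,0,1);

    for (my $cnt=15; $cnt>=0; $cnt--) {
	my $byte = ord(substr($Xi,$cnt,1));
	for my $nibble (($byte&0xf), ($byte>>4)) {
	    unless ($first) {
		# multiply accumulated Z by x^4: shift right by 4 and
		# reduce via the 4 bits that fall off the low end
		my $rem = $Zlo & 0xf;
		$Zlo = (($Zhi<<60) | ($Zlo>>4)) & 0xFFFFFFFFFFFFFFFF;
		$Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];
	    }
	    $first = 0;
	    $Zhi ^= $Htable->[$nibble][0];	# Z ^= H*nibble, precomputed
	    $Zlo ^= $Htable->[$nibble][1];
	}
    }
    return ($Zhi,$Zlo);	# the real code byte-swaps Z back into Xi
}
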
$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in a scalable manner;-) Naturally assuming data in L1 cache...
# Special note about the 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a
# 128-byte boundary and the lower 7 bits of its address are guaranteed
# to be zero.
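# Concretely, "dep rem=Zlo,rem_4bitp,3,4" deposits the low nibble of Zlo
# into bits 3..6 of rem_4bitp, i.e. it computes rem_4bitp|((Zlo&0xf)<<3).
# Since the table entries are 8 bytes each and bits 0..6 of rem_4bitp
# are zero, that is exactly rem_4bitp+(Zlo&0xf)*8, the address of
# rem_4bit[Zlo&0xf]. For example, with rem_4bitp=...4080 and Zlo&0xf=0xa
# the result is ...40d0, i.e. &rem_4bit[10].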
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
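# The streamed variant builds a 512-byte, 256-byte-aligned scratch area
# on the stack: Hshr4[16] at sp+0, holding the Htable entries shifted
# right by 4 bits, and a copy of Htable[16] at sp+256, whose alignment
# lets the 'dep' instructions below form &Htable[nlo] by depositing the
# nibble straight into the table base. Each input byte is then, in
# effect, two "4-bit" steps folded together, as spelled out in the loop
# comments below:
#
#	Z.lo ^= Htable[nlo].lo;	Z.hi ^= Htable[nlo].hi;
#	rem   = (Z.lo ^ (Htable[nhi].lo<<4)) & 0xff;
#	Z     = (Z>>8) ^ Hshr4[nhi];
#	Z.hi ^= rem_8bit[rem]<<48;
#
# with rem_8bit supplying the combined reduction for the 8 bits
# shifted out.
#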
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
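# On little-endian systems gcm_ghash_4bit temporarily sets the UM.be bit
# with "sum 1<<1" so that the 8-byte loads and stores in the loop run
# big-endian, and clears it again with "rum 1<<1" on the way out; when
# the code is built for a big-endian target both turn into nop.m.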

sub load_htable() {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# The fragments above and below are derived from this one by removing
# the (p??) instructions that do not apply in those stages.
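# Each iteration advances a software-pipelined schedule: the (p16)
# instructions fetch new inp/Xi bytes, (p17) xors them and forms the
# table pointers, (p18) performs the Htable lookups and the 8-bit shift,
# and (p19) completes the reduction with rem_8bit; the prologue above
# and the epilogue below simply omit the stages that have no data yet
# or none left.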
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	//	&Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;