#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

# Usage: perl <this script> [output-file] [compiler flags...]
#
# The first argument, when true, names the file the generated assembly
# is written to; otherwise the inherited STDOUT is used. The remaining
# arguments are scanned for the flags recognised below.
#
# NOTE: the script is not strict-clean on purpose. It relies on package
# globals, and comment text inside the here-documents (e.g. $xi[i]) is
# interpolated with a bareword subscript, so enabling strictures would
# need a wider rewrite of the generator.

$output=shift;
if ($output) {
    # Three-argument open: the file name is never parsed for mode
    # characters or surrounding whitespace.
    open STDOUT,'>',$output or die "can't open $output: $!";
}

if ($^O eq "hpux") {
    $ADDP="addp4";
    # Switch to plain "add" only when +DD64 or -mlp64 is passed
    # (presumably the 64-bit data model flags - confirm against the
    # build configuration). An alternation is required here: the former
    # character class [\+DD|\-mlp] matched any single one of those
    # characters followed by "64".
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Explicit -DB_ENDIAN/-DL_ENDIAN wins (the later of the two if both are
# given); otherwise probe the host: pack 'N' is big-endian, unpack 'L'
# is native, so the round trip yields 1 only on a big-endian host.
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
              $big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
    { $big_endian=(unpack('L',pack('N',1))==1); }

# loop(LABEL, MASK_INP)
#
# Appends one copy of the rotating-predicate loop body, labelled LABEL,
# to $code. When MASK_INP is true the two instructions that reference
# inp/in[] are predicated on p63 instead of p16/p17 ("mask references
# to inp"); gcm_gmult_4bit uses that form.
sub loop {
    my $label=shift;
    my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p19)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p19)   xor     Zhi=Zhi,Hhi
        ($p17)  xor     xi[1]=xi[1],in[1]       };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p19)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p19)   ld8     rem=[rem]
        (p18)   and     Hi[1]=mask0xf0,xi[2]    };;
{ .mmi; ($p16)  ld1     in[0]=[inp],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p19)   shr.u   Zhi=Zhi,4               }
{ .mib; (p19)   xor     Hhi=Hhi,rem
        (p18)   add     Hi[1]=Htbl,Hi[1]        };;

{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p18)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p17)   shladd  Hi[0]=xi[1],4,r0
        (p18)   xor     Zhi=Zhi,Hhi             };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p18)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p18)   ld8     rem=[rem]
        (p17)   and     Hi[0]=mask0xf0,Hi[0]    };;
{ .mmi; (p16)   ld1     xi[0]=[Xi],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p18)   shr.u   Zhi=Zhi,4               }
{ .mib; (p18)   xor     Hhi=Hhi,rem
        (p17)   add     Hi[0]=Htbl,Hi[0]
        br.ctop.sptk    $label                  };;
___
}

# Register aliases shared by both routines, followed by the prologue
# and setup of gcm_gmult_4bit.
$code=<<___;
.explicit
.text

prevfs=r2;      prevlc=r3;      prevpr=r8;
mask0xf0=r21;
rem=r22;        rem_4bitp=r23;
Xi=r24;         Htbl=r25;
inp=r26;        end=r27;
Hhi=r28;        Hlo=r29;
Zhi=r30;        Zlo=r31;

.align  128
.skip   16                                      // aligns loop body
.global gcm_gmult_4bit#
.proc   gcm_gmult_4bit#
gcm_gmult_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,2,6,0,8
        $ADDP   Xi=15,in0                       // &Xi[15]
        mov     rem_4bitp=ip            }
{ .mii; $ADDP   Htbl=8,in1                      // &Htbl[0].lo
        .save   ar.lc,prevlc
        mov     prevlc=ar.lc
        .save   pr,prevpr
        mov     prevpr=pr               };;

        .body
        .rotr   in[3],xi[3],Hi[2]

{ .mib; ld1     xi[2]=[Xi],-1                   // Xi[15]
        mov     mask0xf0=0xf0
        brp.loop.imp    .Loop1,.Lend1-16};;
{ .mmi; ld1     xi[1]=[Xi],-1                   // Xi[14]
                                        };;
{ .mii; shladd  Hi[1]=xi[2],4,r0
        mov     pr.rot=0x7<<16
        mov     ar.lc=13                };;
{ .mii; and     Hi[1]=mask0xf0,Hi[1]
        mov     ar.ec=3
        xor     Zlo=Zlo,Zlo             };;
{ .mii; add     Hi[1]=Htbl,Hi[1]                // &Htbl[nlo].lo
        add     rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
        xor     Zhi=Zhi,Zhi             };;
___
        loop(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor     Zhi=Zhi,Hhi             };;     // modulo-scheduling artefact
{ .mib; mux1    Zlo=Zlo,\@rev           };;
{ .mib; mux1    Zhi=Zhi,\@rev           };;
{ .mmi; add     Hlo=9,Xi;;                      // ;; is here to prevent
        add     Hhi=1,Xi                };;     // pipeline flush on Itanium
{ .mib; st8     [Hlo]=Zlo
        mov     pr=prevpr,0x1ffff       };;
{ .mib; st8     [Hhi]=Zhi
        mov     ar.lc=prevlc
        br.ret.sptk.many        b0      };;
.endp   gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# On big-endian builds the sum/rum instructions become nop.m.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(INSN, ...)
#
# Appends eight bundle pairs to $code: Htable[0..7] is loaded into
# r16..r31 through r8/r9 and Htable[8..15] into f32..f47 through
# r10/r11. Each argument is one instruction string placed in the third
# slot of the second bundle; arguments are consumed one per iteration
# and aligned so that the last argument lands in the last iteration
# (with eight arguments every iteration receives one).
sub load_htable {
    for (my $i=0;$i<8;$i++) {
$code.=<<___;
{ .mmi; ld8     r`16+2*$i+1`=[r8],16            // Htable[$i].hi
        ld8     r`16+2*$i`=[r9],16      }       // Htable[$i].lo
{ .mmi; ldf8    f`32+2*$i+1`=[r10],16           // Htable[`8+$i`].hi
        ldf8    f`32+2*$i`=[r11],16             // Htable[`8+$i`].lo
___
        $code.=shift    if (($i+$#_)==7);
        $code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align  32
.skip   16                                      // aligns loop body
.global gcm_ghash_4bit#
.proc   gcm_ghash_4bit#
gcm_ghash_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,4,2,0,0
        .vframe prevsp
        mov     prevsp=sp
        mov     $rem_8bit=ip            };;
        .body
{ .mfi; $ADDP   r8=0+0,$Htbl
        $ADDP   r9=0+8,$Htbl            }
{ .mfi; $ADDP   r10=128+0,$Htbl
        $ADDP   r11=128+8,$Htbl         };;
___
        load_htable(
        "       $ADDP   $Xip=15,$Xip",          # &Xi[15]
        "       $ADDP   $len=$len,$inp",        # &inp[len]
        "       $ADDP   $inp=15,$inp",          # &inp[15]
        "       mov     $mask0xff=0xff",
        "       add     sp=-512,sp",
        "       andcm   sp=sp,$mask0xff",       # align stack frame
        "       add     r14=0,sp",
        "       add     r15=8,sp");
$code.=<<___;
{ .mmi; $sum    1<<1                            // go big-endian
        add     r8=256+0,sp
        add     r9=256+8,sp             }
{ .mmi; add     r10=256+128+0,sp
        add     r11=256+128+8,sp
        add     $len=-17,$len           };;
___
for($i=0;$i<8;$i++) {   # generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8     [r8]=$rlo,16                    // Htable[$i].lo
        st8     [r9]=$rhi,16                    // Htable[$i].hi
        shrp    $rlo=$rhi,$rlo,4        }//;;
{ .mmi; stf8    [r10]=f`32+2*$i`,16             // Htable[`8+$i`].lo
        stf8    [r11]=f`32+2*$i+1`,16           // Htable[`8+$i`].hi
        shr.u   $rhi=$rhi,4             };;
{ .mmi; st8     [r14]=$rlo,16                   // Htable[$i].lo>>4
        st8     [r15]=$rhi,16           }//;;   // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8     r16=[r8],16                     // Htable[8].lo
        ld8     r17=[r9],16             };;     // Htable[8].hi
{ .mmi; ld8     r18=[r8],16                     // Htable[9].lo
        ld8     r19=[r9],16             }       // Htable[9].hi
{ .mmi; rum     1<<5                            // clear um.mfh
        shrp    r16=r17,r16,4           };;
___
# $i must stay a global here: the fragment after this loop reuses its
# final value (6) to emit the tail of the second half.
for($i=0;$i<6;$i++) {   # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8     r`20+2*$i`=[r8],16              // Htable[`10+$i`].lo
        ld8     r`20+2*$i+1`=[r9],16            // Htable[`10+$i`].hi
        shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
___
}
$code.=<<___;
{ .mmi; shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
{ .mmi; add     $Htbl=256,sp                    // &Htable[0]
        add     $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
        shr.u   r`18+2*$i+1`=r`18+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`18+2*$i`                // Htable[`8+$i`].lo>>4
        st8     [r15]=r`18+2*$i+1`      }       // Htable[`8+$i`].hi>>4
___

# Register assignment for the unrolled main loop. @xi and @rem are
# "rotated" between fragments, so [0] and [1] swap roles from one
# emitted stage to the next.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;   # (p16)
{ .mmi; ld1     $in=[$inp],-1                   //(p16) *inp--
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        cmp.eq  p0,p6=r0,r0             };;     //      clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p16),(p17)
{ .mmi; ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1     $in=[$inp],-1                   //(p16) *inp--
        dep     $Atbl=$xi[1],$Htbl,4,4          //(p17) &Htable[nlo].lo
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
.align  32
.LOOP:
{ .mmi;
(p6)    st8     [$Xip]=$Zhi,13
        xor     $Zlo=$Zlo,$Zlo
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p16),(p17),(p18)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mfi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo          };;     //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1           }       //(p16) *inp--
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        mov     $Zhi=$Ahi                       //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;   # (p16),(p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1                   //(p16) *inp--
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}

$code.=<<___;   # (p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  };;     //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p18),(p19)
{ .mfi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mfi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo          };;     //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        xor     $Zlo=$Zlo,$Alo          }       //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8     $Blo=[$Btbl],8                  //(p18) Htable[nhi].lo,&Htable[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mfi; shladd  $rem[0]=$Zlo,4,r0               //(p18) Z.lo<<4
        xor     $Zhi=$Zhi,$Ahi          };;     //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8     $Bhi=[$Btbl]                    //(p18) Htable[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,4        }       //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]       };;     //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___;   # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
        add     $inp=32,$inp
        shr.u   $Zhi=$Zhi,4             }       //(p19) Z.hi>>=4
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        add     $Xip=9,$Xip             };;     //      &Xi.lo
{ .mmi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
(p6)    extr.u  $xi[1]=$Zlo,8,8         }       //[p17] Xi[14]
{ .mmi; xor     $Zhi=$Zhi,$Bhi                  //(p19) Z.hi^=Hshr4[nhi].hi
(p6)    and     $xi[0]=$Zlo,$mask0xff   };;     //[p16] Xi[15]
{ .mmi; st8     [$Xip]=$Zlo,-8
(p6)    xor     $xi[0]=$xi[0],$in               //[p17] xi=$xi[i]^inp[i]
        shl     $rem[1]=$rem[1],48      };;     //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
(p6)    dep     $Atbl=$xi[0],$Htbl,4,4  }       //[p17] &Htable[nlo].lo
{ .mib;
(p6)    and     $xi[0]=-16,$xi[0]               //[p17] nhi=xi&0xf0
(p6)    br.cond.dptk.many       .LOOP   };;

{ .mib; st8     [$Xip]=$Zhi             };;
{ .mib; $rum    1<<1                            // return to little-endian
        .restore        sp
        mov     sp=prevsp
        br.ret.sptk.many        b0      };;
.endp   gcm_ghash_4bit#
___
# Shared reduction tables. rem_4bit must stay aligned at 128 bytes: the
# 'dep' in the 4-bit loop builds &rem_4bit[Zlo&0xf] on that assumption.
$code.=<<___;
.align  128
.type   rem_4bit#,\@object
rem_4bit:
        data8   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size   rem_4bit#,128
.type   rem_8bit#,\@object
rem_8bit:
        data1   0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
        data1   0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
        data1   0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
        data1   0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
        data1   0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
        data1   0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
        data1   0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
        data1   0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
        data1   0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
        data1   0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
        data1   0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
        data1   0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
        data1   0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
        data1   0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
        data1   0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
        data1   0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
        data1   0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
        data1   0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
        data1   0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
        data1   0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
        data1   0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
        data1   0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
        data1   0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
        data1   0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
        data1   0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
        data1   0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
        data1   0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
        data1   0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
        data1   0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
        data1   0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
        data1   0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
        data1   0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size   rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Big-endian builds need no byte reversal: turn each "mux1 ...@rev"
# into a nop. Then evaluate every `expr` left in the text (register
# numbers and table indices computed from loop counters).
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close; fail loudly rather than
# leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";