#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ a 128-byte shared
# table]. Performance results are for the streamed GHASH subroutine
# on an UltraSPARC pre-Tx CPU and are expressed in cycles per
# processed byte, lower is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see C code) even in the 32-bit
# build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) Loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates in order to achieve the 12-cycle timing. For comparison,
# sha1-sparcv9.pl spends 11.6 cycles to process one byte on an
# UltraSPARC pre-Tx CPU and ~24 on T1.

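# A minimal, optional cross-check sketch (not used for code generation; the
# GHASH_SELFTEST switch and all names below are illustrative only): the
# rem_4bit table emitted further down folds the nibble shifted out of Z back
# into its top bits, and its sixteen constants can be rebuilt from the
# bit-reflected GHASH reduction byte 0xE1, with each set bit b of a nibble
# contributing 0xE1<<(5+b).
if ($ENV{GHASH_SELFTEST}) {
	for my $i (0..15) {
		my $r = 0;
		# accumulate the contribution of every set bit of the nibble
		$r ^= (0xE1 << (5 + $_)) for grep { ($i >> $_) & 1 } (0..3);
		printf STDERR "rem_4bit[%2d] = 0x%04X\n", $i, $r;
	}
}
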
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	! 4-bit reduction constants; each .long pair forms a big-endian 64-bit
	! entry with the constant in its most significant 16 bits
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit	! PC-relative address of rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# substitute the backticked expressions (table constants and the
# %icc/%xcc selection) with their evaluated results, then emit the code
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;