#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128-byte shared
# table]. Even though the loops are aggressively modulo-scheduled,
# with the Htbl references and Z.hi updates arranged for 8 cycles per
# byte, measured performance is ~12 cycles per processed byte on a
# 21264 CPU. This appears to be a dynamic-scheduling "glitch":
# uprofile(1) shows a uniform sample distribution, as if every
# instruction bundle executed in 1.5 cycles, meaning the code could
# have been even faster. Even so, 12 cycles is ~60% better than
# gcc-generated code and ~80% better than code generated by the
# vendor compiler.
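#
# For reference, below is a minimal pure-Perl model of the single
# multiplication that the emitted assembly implements. It is an
# illustrative sketch only and is never called by this script: it
# assumes a 64-bit perl, and the name ref_gmult_4bit as well as its
# arguments are hypothetical -- @$Hhi/@$Hlo stand for the 16-entry
# per-key table (high/low 64-bit halves of nibble*H) that the caller
# is assumed to have precomputed, and @$rem_4bit for the shared
# reduction table emitted at the bottom of this file, one 64-bit
# value per entry.

sub ref_gmult_4bit {
	my ($Xi, $Hhi, $Hlo, $rem_4bit) = @_;	# $Xi is a 16-byte string
	my @x = unpack("C16", $Xi);		# Xi as bytes, in memory order
	my ($Zhi, $Zlo) = (0, 0);
	my $first = 1;
	for (my $cnt = 15; $cnt >= 0; $cnt--) {
		# low nibble first, then high nibble of each byte
		for my $n ($x[$cnt] & 0xf, $x[$cnt] >> 4) {
			if (!$first) {
				my $rem = $Zlo & 0xf;	# 4 bits shifted out of Z.lo
				$Zlo = (($Zhi & 0xf) << 60) | ($Zlo >> 4);
				$Zhi = ($Zhi >> 4) ^ $rem_4bit->[$rem];
			}
			$Zhi ^= $Hhi->[$n];	# accumulate table entry for nibble
			$Zlo ^= $Hlo->[$n];
			$first = 0;
		}
	}
	return pack("Q>2", $Zhi, $Zlo);	# big-endian, as stored back to Xi
}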

$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

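# loop() emits one multiply-and-reduce pass over the 16 bytes of Xi:
# bytes 7..0 of $Xlo (i.e. Xi[15..8], the .Looplo loop), then bytes
# 7..0 of $Xhi (Xi[7..0], the .Loophi loop). Each byte costs two Htbl
# lookups -- low nibble first, then high nibble -- with a 4-bit shift
# of Z between them, the shifted-out bits being folded back in through
# the rem_4bit table. $N makes the labels unique, as the loop is
# instantiated once per caller.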
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

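	# picmeup computes the address of the rem_4bit table into AT
	# without touching the GOT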
	bsr	$t0,picmeup
	nop
___

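# emit the multiply-and-reduce pass over Xi (see loop() above)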
	&loop();

$code.=<<___;
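	# Alpha has no byte-swap instruction, so reverse Zlo and Zhi by
	# hand: byte-swap each 32-bit half with shift/zapnot pairs (zapnot
	# keeps only the bytes selected by its mask and zeroes the rest),
	# then exchange the two halves with 32-bit shifts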
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

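	# inp may be arbitrarily aligned: each ldq_u pair fetches the
	# aligned quadwords covering eight bytes of input, and the
	# extql/extqh sequences in .Louter below splice them into whole
	# quadwords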
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

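# emit the multiply-and-reduce pass for this block (see loop() above)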
	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
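	# br writes the address of .Lpic into the register standing for
	# rem_4bit (AT), and lda then adds the twelve bytes (lda, ret,
	# nop) separating .Lpic from the table, all without a GOT lookup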
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
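	# rem_4bit[n] holds the reduction, modulo the GHASH polynomial, of
	# the four bits n shifted out of the low end of Z at each step;
	# each pair of .long values forms one little-endian quadword with
	# the constant in its top 16 bits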
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;