#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# that AArch64 cannot compute over the upper halves of SIMD registers. In
# 32-bit NEON, the low and high halves of the 128-bit register q0 are
# accessible as 64-bit registers d0 and d1, respectively. In AArch64, dN is the
# lower half of vN. Where the 32-bit version would use the upper half, this
# file must keep halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in
# the instruction name, while AArch64 uses suffixes on the registers. For
# instance, left-shifting the 64-bit lanes of a SIMD register in 32-bit would
# be written:
#
#     vshl.i64 q0, q0, #1
#
# In 64-bit, it would be written:
#
#     shl v0.2d, v0.2d, #1
#
# See the Programmer's Guide for ARMv8-A, section 7, for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note that the 8-bit and 64-bit polynomial multipliers in AArch64
# differ only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit
# polynomials and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a
# 64-bit polynomial and is conditioned on the PMULL extension. This file
# emulates the latter with the former.
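#
# As a rough reference, the 64-bit carry-less multiplication that
# pmull vR.1q, vA.1d, vB.1d performs (and that this file emulates) can be
# sketched in C-like pseudocode as:
#
#     // hi:lo = a * b over GF(2)[x]: additions are XORs, so nothing carries.
#     uint64_t lo = 0, hi = 0;
#     for (int i = 0; i < 64; i++) {
#         if ((b >> i) & 1) {
#             lo ^= a << i;
#             hi ^= (i == 0) ? 0 : a >> (64 - i);
#         }
#     }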

use strict;

my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    my $dir = $1;
    my $xlate;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));

my $code = "";

# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
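#
# The emitted code follows the paper above. A single pmull over the 8-bit
# lanes produces the aligned products D = A*B; pmulls over byte-rotated
# copies of A and B produce the partial products named E through K in the
# comments below, which are folded into L = E+F, M = G+H and N = I+J. Each
# term is masked so that the byte rotation used for shifting cannot wrap
# stray bits around, and the 128-bit result is accumulated, roughly, as
#
#     r = D ^ (L << 8) ^ (M << 16) ^ (N << 24) ^ (K << 32)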
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
	ext	$r.8b, $b.8b, $b.8b, #1		// B1
	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
	ext	$r.8b, $b.8b, $b.8b, #3		// B3
	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
	//     vand	\$t0#hi, \$t0#hi, \$k48
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
	//
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
	//     vand	\$t1#hi, \$t1#hi, \$k32
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
	//
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
	//     vand	\$t2#hi, \$t2#hi, \$k16
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
	//
	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	\$t3#hi, #0
	//
	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// the upper halves of SIMD registers, so we must keep the two halves of
	// each value in separate registers. To compensate, we pair the
	// computations up and parallelize.

	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d	// {t0.lo, t1.lo}
	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d	// {t2.lo, t3.lo}
	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d	// {t0.hi, t1.hi}
	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d	// {t2.hi, t3.hi}
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b	// lo ^= hi
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b	// {t0.hi & k48, t1.hi & k32}
	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b		// {t2.hi & k16, 0}
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b	// lo ^= masked hi
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d	// unzip back into t0-t3
	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
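	// t0-t3 now hold the same folded-and-masked values that the 32-bit
	// sequence quoted above leaves in each tN register pair.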

	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
	eor	$t0.16b, $t0.16b, $t1.16b
	eor	$t2.16b, $t2.16b, $t3.16b
	eor	$r.16b, $r.16b, $t0.16b
	eor	$r.16b, $r.16b, $t2.16b
___
}

$code .= <<___;
.text

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
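	// Roughly: the 64-bit halves of H are swapped, H is shifted left by
	// one bit (t2 carries the bit that crosses the halves), and the
	// 0xc2....01 constant is folded in when the carry bit broadcast below
	// is set, giving the "twisted" H that gcm_gmult_neon and
	// gcm_ghash_neon load.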
	ld1	{$t1.2d}, [x1]			// load H
	movi	$t3.16b, #0xe1
	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
	ext	$INlo.16b, $t1.16b, $t1.16b, #8
	ushr	$t2.2d, $t3.2d, #63
	dup	$t1.4s, $t1.s[1]
	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
	ushr	$t2.2d, $INlo.2d, #63
	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
	and	$t2.16b, $t2.16b, $t0.16b
	shl	$INlo.2d, $INlo.2d, #1
	ext	$t2.16b, $t2.16b, $t2.16b, #8
	and	$t0.16b, $t0.16b, $t1.16b
	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
	st1	{$Hlo.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	ld1	{$INlo.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

	mov	$len, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	ld1	{$Xl.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{$INlo.16b}, [$inp], #16	// load inp
	rev64	$INlo.16b, $INlo.16b		// byteswap inp
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into $INlo and $INhi. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	$INhi.d[0], $INlo.d[1]
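	// The 128-bit product is then assembled Karatsuba-style from three
	// 64-bit carry-less multiplies emitted by clmul64x64: H.lo*Xi.lo,
	// H.hi*Xi.hi and (H.lo+H.hi)*(Xi.lo+Xi.hi).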
___
&clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo
$code .= <<___;
	eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing
___
&clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi
$code .= <<___;
	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
	eor	$Xm.16b, $Xm.16b, $Xh.16b
	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	$Xh.d[0], $Xm.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
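	// Roughly: Xh|Xl holds the 255-bit carry-less product, which is
	// reduced here modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1
	// (in its bit-reflected representation); the left shifts by 57, 62 and
	// 63 and the later right shifts account for the x^7, x^2 and x terms.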
	shl	$t1.2d, $Xl.2d, #57		// 1st phase
	shl	$t2.2d, $Xl.2d, #62
	eor	$t2.16b, $t2.16b, $t1.16b	//
	shl	$t1.2d, $Xl.2d, #63
	eor	$t2.16b, $t2.16b, $t1.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	$t2.16b, $t2.16b, $Xm.16b
	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
	eor	$Xh.16b, $Xh.16b,$Xl.16b
	eor	$Xl.16b, $Xl.16b,$t2.16b	//
	ushr	$t2.2d, $t2.2d, #6
	ushr	$Xl.2d, $Xl.2d, #1		//
	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
	eor	$Xl.16b, $Xl.16b, $t2.16b	//

	subs	$len, $len, #16
	bne	.Loop_neon

	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	st1	{$Xl.16b}, [$Xi]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align  2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	print $_,"\n";
}
close STDOUT; # enforce flush