// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon
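// For reference, the C prototype (roughly as declared in BoringSSL's
// crypto/fipsmodule/modes/internal.h) is:
//     void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
// so on entry x0 = Htable and x1 = H.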

.align	4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
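	// In outline (summarizing the code below): the two 64-bit halves of
	// H are swapped into host order, H is shifted left by one bit as a
	// 128-bit value, and the reflected constant 0xc2...01 is folded back
	// in when the carry bit is set. The result is the "twisted H" form
	// that gcm_gmult_neon and gcm_ghash_neon expect in Htable[0].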
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon
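// For reference, the C prototype (roughly as declared in BoringSSL's
// crypto/fipsmodule/modes/internal.h) is:
//     void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
// so on entry x0 = Xi and x1 = Htable (holding the twisted H).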

.align	4
_gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
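	// v5/v6 hold the low/high halves of the twisted H and v7 their XOR.
	// Since addition in GF(2) is XOR, Karatsuba needs only three 64x64
	// carry-less products per block: H.lo*B.lo, H.hi*B.hi and
	// (H.lo^H.hi)*(B.lo^B.hi); see Lgmult_neon below.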

	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon
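// For reference, the C prototype (roughly as declared in BoringSSL's
// crypto/fipsmodule/modes/internal.h) is:
//     void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
//                         const uint8_t *inp, size_t len);
// so x0 = Xi, x1 = Htable, x2 = inp and x3 = len. Loop_neon consumes
// 16 bytes per iteration, so len is expected to be a multiple of 16.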

.align	4
_gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
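	// After the ins below, v3 holds the low and v4 the high 64 bits of
	// the byte-reversed block. Each 64x64 carry-less product is then
	// built from 8x8-bit pmull instructions (multiplication by parts),
	// since this fallback cannot assume the 64-bit PMULL instruction
	// from the crypto extensions.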
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
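	//
	// Concretely, in the zip/and/eor block below: v24 = {k48, k32} and
	// v25 = {k16, k0} were loaded from Lmasks. zip1/zip2 gather the low
	// and high 64-bit halves of two t registers into one vector each,
	// so a single 128-bit EOR/AND/EOR sequence performs the
	// veor/vand/veor above for two t registers at once, and zip1/zip2
	// then split the halves back out.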

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]
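	// At this point the 256-bit product is notionally Xh:Xl with
	// v0 = Xl and v2 = Xh; Xh.d[0] is left unwritten here because the
	// "ins v2.d[0], v18.d[1]" in the first reduction phase below
	// overwrites it anyway.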

	// equivalent of reduction_avx from ghash-x86_64.pl
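	// Background (informal summary): GHASH multiplies in GF(2^128)
	// modulo x^128 + x^7 + x^2 + x + 1, kept in a bit-reflected
	// representation. The first phase below folds Xl scaled by
	// x^63 + x^62 + x^57 (the shl by 57/62/63) into the high half; the
	// second phase folds the result back down with right shifts of
	// 1, 2 and 7 (ushr #1 followed by a further #6), mirroring
	// reduction_avx in ghash-x86_64.pl.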
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM