// Home | History | Annotate | Download | only in modes
// (code-viewer navigation residue from the original export, kept as a comment)
      1 #if defined(__aarch64__)
      2 #include <openssl/arm_arch.h>
      3 
      4 .text
// Allow the ARMv8 Crypto Extension mnemonics (pmull/pmull2) used below.
// clang accepts them without ".arch" unless it advertises support for the
// directive (BORINGSSL_CLANG_SUPPORTS_DOT_ARCH), hence the guard.
// NOTE(review): the leading column numbers on each line look like artifacts
// from a code-viewer export of this perlasm-generated file — confirm against
// the pristine generated source before assembling.
      5 #if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
      6 .arch	armv8-a+crypto
      7 #endif
      8 .globl	gcm_init_v8
      9 .hidden	gcm_init_v8
     10 .type	gcm_init_v8,%function
     11 .align	4
//
// void gcm_init_v8(u128 Htable[], const u64 H[2])
//
// In:  x0 = Htable (output), x1 = H (raw GHASH key, two 64-bit lanes)
// Out: Htable[0] = "twisted" H (H shifted left by 1 and folded with the
//                  0xc2... reduction constant, per the eor below),
//      Htable[1] = packed Karatsuba pre-processed halves (for H and H^2),
//      Htable[2] = twisted H^2
//      — 48 bytes written in total (st1 at Htable[0], then Htable[1..2]).
// Clobbers: v0-v3, v16-v22 (all caller-saved under AAPCS64); no stack use.
     12 gcm_init_v8:
     13 	ld1	{v17.2d},[x1]		//load input H
     14 	movi	v19.16b,#0xe1
     15 	shl	v19.2d,v19.2d,#57		//0xc2.0
     16 	ext	v3.16b,v17.16b,v17.16b,#8
     17 	ushr	v18.2d,v19.2d,#63
     18 	dup	v17.4s,v17.s[1]
     19 	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
     20 	ushr	v18.2d,v3.2d,#63
     21 	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
     22 	and	v18.16b,v18.16b,v16.16b
     23 	shl	v3.2d,v3.2d,#1
     24 	ext	v18.16b,v18.16b,v18.16b,#8
     25 	and	v16.16b,v16.16b,v17.16b
     26 	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
     27 	eor	v20.16b,v3.16b,v16.16b		//twisted H
     28 	st1	{v20.2d},[x0],#16		//store Htable[0]
     29 
     30 	//calculate H^2 (square twisted H with the same pmull+reduce sequence
     30 	//used by gcm_gmult_v8 below)
     31 	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
     32 	pmull	v0.1q,v20.1d,v20.1d
     33 	eor	v16.16b,v16.16b,v20.16b
     34 	pmull2	v2.1q,v20.2d,v20.2d
     35 	pmull	v1.1q,v16.1d,v16.1d
     36 
     37 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     38 	eor	v18.16b,v0.16b,v2.16b
     39 	eor	v1.16b,v1.16b,v17.16b
     40 	eor	v1.16b,v1.16b,v18.16b
     41 	pmull	v18.1q,v0.1d,v19.1d		//1st phase
     42 
     43 	ins	v2.d[0],v1.d[1]
     44 	ins	v1.d[1],v0.d[0]
     45 	eor	v0.16b,v1.16b,v18.16b
     46 
     47 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
     48 	pmull	v0.1q,v0.1d,v19.1d
     49 	eor	v18.16b,v18.16b,v2.16b
     50 	eor	v22.16b,v0.16b,v18.16b		//twisted H^2
     51 
     52 	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
     53 	eor	v17.16b,v17.16b,v22.16b
     54 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
     55 	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
     56 
     57 	ret
     58 .size	gcm_init_v8,.-gcm_init_v8
     59 .globl	gcm_gmult_v8
     60 .hidden	gcm_gmult_v8
     61 .type	gcm_gmult_v8,%function
     62 .align	4
//
// void gcm_gmult_v8(u64 Xi[2], const u128 Htable[])
//
// Multiplies the hash accumulator Xi (at x0) by H in GF(2^128), using the
// pre-computed twisted H and packed Karatsuba halves at x1 (Htable[0..1]
// as produced by gcm_init_v8), reduces modulo the GHASH polynomial, and
// writes the result back to [x0] in place.
// Clobbers: v0-v3, v17-v21 (caller-saved); no stack use.
     63 gcm_gmult_v8:
     64 	ld1	{v17.2d},[x0]		//load Xi
     65 	movi	v19.16b,#0xe1
     66 	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
     67 	shl	v19.2d,v19.2d,#57		//compose the 0xc2.0 reduction constant
     68 #ifndef __ARMEB__
     69 	rev64	v17.16b,v17.16b		//byte-swap Xi on little-endian
     70 #endif
     71 	ext	v3.16b,v17.16b,v17.16b,#8
     72 
     73 	pmull	v0.1q,v20.1d,v3.1d		//H.lo*Xi.lo
     74 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
     75 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi*Xi.hi
     76 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)*(Xi.lo+Xi.hi)
     77 
     78 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     79 	eor	v18.16b,v0.16b,v2.16b
     80 	eor	v1.16b,v1.16b,v17.16b
     81 	eor	v1.16b,v1.16b,v18.16b
     82 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
     83 
     84 	ins	v2.d[0],v1.d[1]
     85 	ins	v1.d[1],v0.d[0]
     86 	eor	v0.16b,v1.16b,v18.16b
     87 
     88 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
     89 	pmull	v0.1q,v0.1d,v19.1d
     90 	eor	v18.16b,v18.16b,v2.16b
     91 	eor	v0.16b,v0.16b,v18.16b
     92 
     93 #ifndef __ARMEB__
     94 	rev64	v0.16b,v0.16b		//swap back to memory byte order
     95 #endif
     96 	ext	v0.16b,v0.16b,v0.16b,#8
     97 	st1	{v0.2d},[x0]		//write out Xi
     98 
     99 	ret
    100 .size	gcm_gmult_v8,.-gcm_gmult_v8
    101 .globl	gcm_ghash_v8
    102 .hidden	gcm_ghash_v8
    103 .type	gcm_ghash_v8,%function
    104 .align	4
//
// void gcm_ghash_v8(u64 Xi[2], const u128 Htable[], const u8 *inp, size_t len)
//
// In:  x0 = Xi (hash accumulator, updated in place)
//      x1 = Htable (twisted H, packed Karatsuba halves, twisted H^2
//                   — layout produced by gcm_init_v8)
//      x2 = inp (data to hash), x3 = len in bytes
// The main loop (.Loop_mod2x_v8) processes two 16-byte blocks per iteration
// using H^2, modulo-scheduled; a single trailing block is handled by the
// classic one-block path at .Lodd_tail_v8.
// NOTE(review): len is presumably a non-zero multiple of 16 — confirm with
// the C caller; nothing here guards against len == 0.
// Clobbers: v0-v7, v16-v22, x12 (all caller-saved); no stack use.
    105 gcm_ghash_v8:
    106 	ld1	{v0.2d},[x0]		//load [rotated] Xi
    107 						//"[rotated]" means that
    108 						//loaded value would have
    109 						//to be rotated in order to
    110 						//make it appear as in
    111 						//algorithm specification
    112 	subs	x3,x3,#32		//see if x3 is 32 or larger
    113 	mov	x12,#16		//x12 is used as post-
    114 						//increment for input pointer;
    115 						//as loop is modulo-scheduled
    116 						//x12 is zeroed just in time
    117 						//to preclude overstepping
    118 						//inp[len], which means that
    119 						//last block[s] are actually
    120 						//loaded twice, but last
    121 						//copy is not processed
    122 	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
    123 	movi	v19.16b,#0xe1
    124 	ld1	{v22.2d},[x1]
    125 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    126 	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
    127 	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
    128 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    129 #ifndef __ARMEB__
    130 	rev64	v16.16b,v16.16b
    131 	rev64	v0.16b,v0.16b
    132 #endif
    133 	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
    134 	b.lo	.Lodd_tail_v8		//x3 was less than 32
// Two or more blocks remain: prime the software pipeline with H*I[1] so the
// first .Loop_mod2x_v8 iteration can overlap it with the H^2 multiply.
    135 	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
    136 #ifndef __ARMEB__
    137 	rev64	v17.16b,v17.16b
    138 #endif
    139 	ext	v7.16b,v17.16b,v17.16b,#8
    140 	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
    141 	pmull	v4.1q,v20.1d,v7.1d		//H*Ii+1
    142 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    143 	pmull2	v6.1q,v20.2d,v7.2d
    144 	b	.Loop_mod2x_v8
    145 
    146 .align	4
// Main loop: each pass consumes two blocks, computing
// Xi = (Xi ^ I[i])*H^2 ^ I[i+1]*H, while prefetching I[i+2]/I[i+3] and
// starting their H multiply for the next iteration.
    147 .Loop_mod2x_v8:
    148 	ext	v18.16b,v3.16b,v3.16b,#8
    149 	subs	x3,x3,#32		//is there more data?
    150 	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo*Xi.lo
    151 	csel	x12,xzr,x12,lo			//is it time to zero x12?
    152 
    153 	pmull	v5.1q,v21.1d,v17.1d
    154 	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
    155 	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi*Xi.hi
    156 	eor	v0.16b,v0.16b,v4.16b		//accumulate
    157 	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)*(Xi.lo+Xi.hi)
    158 	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
    159 
    160 	eor	v2.16b,v2.16b,v6.16b
    161 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    162 	eor	v1.16b,v1.16b,v5.16b
    163 
    164 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    165 	eor	v18.16b,v0.16b,v2.16b
    166 	eor	v1.16b,v1.16b,v17.16b
    167 	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
    168 #ifndef __ARMEB__
    169 	rev64	v16.16b,v16.16b
    170 #endif
    171 	eor	v1.16b,v1.16b,v18.16b
    172 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    173 
    174 #ifndef __ARMEB__
    175 	rev64	v17.16b,v17.16b
    176 #endif
    177 	ins	v2.d[0],v1.d[1]
    178 	ins	v1.d[1],v0.d[0]
    179 	ext	v7.16b,v17.16b,v17.16b,#8
    180 	ext	v3.16b,v16.16b,v16.16b,#8
    181 	eor	v0.16b,v1.16b,v18.16b
    182 	pmull	v4.1q,v20.1d,v7.1d		//H*Ii+1
    183 	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
    184 
    185 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    186 	pmull	v0.1q,v0.1d,v19.1d
    187 	eor	v3.16b,v3.16b,v18.16b
    188 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    189 	eor	v3.16b,v3.16b,v0.16b
    190 	pmull2	v6.1q,v20.2d,v7.2d
    191 	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
    192 
// Fell out of the loop: undo the speculative early-accumulation done above
// so v0/v3/x3 reflect the true post-loop state.
    193 	eor	v2.16b,v2.16b,v18.16b
    194 	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
    195 	adds	x3,x3,#32		//re-construct x3
    196 	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
    197 	b.eq	.Ldone_v8		//is x3 zero?
// One 16-byte block left: single multiply by H, same shape as gcm_gmult_v8.
    198 .Lodd_tail_v8:
    199 	ext	v18.16b,v0.16b,v0.16b,#8
    200 	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
    201 	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
    202 
    203 	pmull	v0.1q,v20.1d,v3.1d		//H.lo*Xi.lo
    204 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
    205 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi*Xi.hi
    206 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)*(Xi.lo+Xi.hi)
    207 
    208 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    209 	eor	v18.16b,v0.16b,v2.16b
    210 	eor	v1.16b,v1.16b,v17.16b
    211 	eor	v1.16b,v1.16b,v18.16b
    212 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    213 
    214 	ins	v2.d[0],v1.d[1]
    215 	ins	v1.d[1],v0.d[0]
    216 	eor	v0.16b,v1.16b,v18.16b
    217 
    218 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    219 	pmull	v0.1q,v0.1d,v19.1d
    220 	eor	v18.16b,v18.16b,v2.16b
    221 	eor	v0.16b,v0.16b,v18.16b
    222 
    223 .Ldone_v8:
    224 #ifndef __ARMEB__
    225 	rev64	v0.16b,v0.16b		//swap back to memory byte order
    226 #endif
    227 	ext	v0.16b,v0.16b,v0.16b,#8
    228 	st1	{v0.2d},[x0]		//write out Xi
    229 
    230 	ret
    231 .size	gcm_ghash_v8,.-gcm_ghash_v8
// Embedded attribution string (NUL-terminated ASCII):
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
    232 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
    233 .align	2
    234 .align	2
// Closes "#if defined(__aarch64__)" at the top of the file.
    235 #endif
    236