      1 #! /usr/bin/env perl
      2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 ######################################################################
     11 ## Constant-time SSSE3 AES core implementation.
     12 ## version 0.1
     13 ##
     14 ## By Mike Hamburg (Stanford University), 2009
     15 ## Public domain.
     16 ##
     17 ## For details see http://shiftleft.org/papers/vector_aes/ and
     18 ## http://crypto.stanford.edu/vpaes/.
     19 ##
     20 ######################################################################
      21 # ARMv8 NEON adaptation by <appro@openssl.org>
     22 #
      23 # The reason for undertaking this effort is that there is at least one
      24 # popular SoC based on Cortex-A53 that doesn't have the crypto extensions.
     25 #
     26 #                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
     27 # Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
     28 # Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
     29 # X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
     30 # Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
     31 # Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
     32 # Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
     33 #
     34 # (*)	ECB denotes approximate result for parallelizable modes
     35 #	such as CBC decrypt, CTR, etc.;
     36 # (**)	these results are worse than scalar compiler-generated
     37 #	code, but it's constant-time and therefore preferred;
     38 # (***)	presented for reference/comparison purposes;
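#
# Editorial note on the technique (a summary of the papers referenced
# above, not part of the original header): the AES table lookups are
# performed as pairs of 16-byte permutes (SSSE3 pshufb / NEON tbl) over
# the low and high 4-bit nibbles of each byte, so no secret-dependent
# memory address is ever formed.  Roughly, for a byte x and a pair of
# 16-entry tables:
#
#	lo = x & 0x0F;  hi = x >> 4;
#	y  = tbl_lo[lo] ^ tbl_hi[hi];	# each table fits in one vector register
#
# which is the and/ushr/tbl/eor pattern used throughout the code below
# (v17 holds the 0x0F mask).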
     39 
     40 $flavour = shift;
     41 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
     42 
     43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     44 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     45 ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
     46 die "can't locate arm-xlate.pl";
     47 
     48 open OUT,"| \"$^X\" $xlate $flavour $output";
     49 *STDOUT=*OUT;
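# The script is normally driven by the build system as
#
#	perl vpaes-armv8.pl <flavour> <output file>
#
# e.g. something like "perl vpaes-armv8.pl linux64 vpaes-armv8.S"; the set
# of accepted flavour strings is defined by arm-xlate.pl, so treat that
# flavour name as illustrative only.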
     50 
     51 $code.=<<___;
     52 .section	.rodata
     53 
     54 .type	_vpaes_consts,%object
     55 .align	7	// totally strategic alignment
     56 _vpaes_consts:
     57 .Lk_mc_forward:	// mc_forward
     58 	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
     59 	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
     60 	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
     61 	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
     62 .Lk_mc_backward:// mc_backward
     63 	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
     64 	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
     65 	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
     66 	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
     67 .Lk_sr:		// sr
     68 	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
     69 	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
     70 	.quad	0x0F060D040B020900, 0x070E050C030A0108
     71 	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
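//
// Editorial note: each of the three tables above holds four 16-byte
// entries selected by the round number mod 4.  The encryption core walks
// .Lk_mc_forward with a pointer that advances by 16 per round and wraps
// via "and x11, x11, #~(1<<6)"; that trick relies on _vpaes_consts being
// 128-byte aligned (the "totally strategic" .align 7 above) with
// .Lk_mc_forward at offset 0, so that .Lk_mc_backward and .Lk_sr can be
// reached at fixed offsets of +0x40 and +0x80 from the same pointer.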
     72 
     73 //
     74 // "Hot" constants
     75 //
     76 .Lk_inv:	// inv, inva
     77 	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
     78 	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
     79 .Lk_ipt:	// input transform (lo, hi)
     80 	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
     81 	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
     82 .Lk_sbo:	// sbou, sbot
     83 	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
     84 	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
     85 .Lk_sb1:	// sb1u, sb1t
     86 	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
     87 	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
     88 .Lk_sb2:	// sb2u, sb2t
     89 	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
     90 	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
     91 
     92 //
     93 //  Decryption stuff
     94 //
     95 .Lk_dipt:	// decryption input transform
     96 	.quad	0x0F505B040B545F00, 0x154A411E114E451A
     97 	.quad	0x86E383E660056500, 0x12771772F491F194
     98 .Lk_dsbo:	// decryption sbox final output
     99 	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
    100 	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
    101 .Lk_dsb9:	// decryption sbox output *9*u, *9*t
    102 	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
    103 	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
    104 .Lk_dsbd:	// decryption sbox output *D*u, *D*t
    105 	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
    106 	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
    107 .Lk_dsbb:	// decryption sbox output *B*u, *B*t
    108 	.quad	0xD022649296B44200, 0x602646F6B0F2D404
    109 	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
    110 .Lk_dsbe:	// decryption sbox output *E*u, *E*t
    111 	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
    112 	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
    113 
    114 //
    115 //  Key schedule constants
    116 //
    117 .Lk_dksd:	// decryption key schedule: invskew x*D
    118 	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
    119 	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
    120 .Lk_dksb:	// decryption key schedule: invskew x*B
    121 	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
    122 	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
    123 .Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
    124 	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
    125 	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
    126 .Lk_dks9:	// decryption key schedule: invskew x*9
    127 	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
    128 	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
    129 
    130 .Lk_rcon:	// rcon
    131 	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
    132 
    133 .Lk_opt:	// output transform
    134 	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
    135 	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
    136 .Lk_deskew:	// deskew tables: inverts the sbox's "skew"
    137 	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
    138 	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
    139 
    140 .asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
    141 .size	_vpaes_consts,.-_vpaes_consts
    142 .align	6
    143 
    144 .text
    145 ___
    146 
    148 {
    149 my ($inp,$out,$key) = map("x$_",(0..2));
    150 
    151 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
    152 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
    153 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
    154 
    155 $code.=<<___;
    156 ##
     157 ##  _vpaes_encrypt_preheat
    158 ##
    159 ##  Fills register %r10 -> .aes_consts (so you can -fPIC)
    160 ##  and %xmm9-%xmm15 as specified below.
    161 ##
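##  (ARMv8 port note: x10 ends up pointing at .Lk_inv, v17 is filled with
##  the 0x0F nibble mask, and v18-v27 receive .Lk_inv, .Lk_ipt, .Lk_sbo,
##  .Lk_sb1 and .Lk_sb2, matching the loads below.)
##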
    162 .type	_vpaes_encrypt_preheat,%function
    163 .align	4
    164 _vpaes_encrypt_preheat:
    165 	adrp	x10, :pg_hi21:.Lk_inv
    166 	add	x10, x10, :lo12:.Lk_inv
    167 	movi	v17.16b, #0x0f
    168 	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
    169 	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
    170 	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
    171 	ret
    172 .size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
    173 
    174 ##
     175 ##  _vpaes_encrypt_core
    176 ##
    177 ##  AES-encrypt %xmm0.
    178 ##
    179 ##  Inputs:
    180 ##     %xmm0 = input
    181 ##     %xmm9-%xmm15 as in _vpaes_preheat
    182 ##    (%rdx) = scheduled keys
    183 ##
    184 ##  Output in %xmm0
    185 ##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
    186 ##  Preserves %xmm6 - %xmm8 so you get some local vectors
    187 ##
    188 ##
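##  (ARMv8 port note: the block arrives in v7 and is returned in v0; the
##  key schedule pointer arrives in x2 and is copied to x9; the constants
##  are expected in v17-v27 as set up by _vpaes_encrypt_preheat.  v1-v5,
##  v16, w8 and x9-x11 are clobbered.)
##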
    189 .type	_vpaes_encrypt_core,%function
    190 .align 4
    191 _vpaes_encrypt_core:
    192 	mov	x9, $key
    193 	ldr	w8, [$key,#240]			// pull rounds
    194 	adrp	x11, :pg_hi21:.Lk_mc_forward+16
    195 	add	x11, x11, :lo12:.Lk_mc_forward+16
    196 						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
    197 	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
    198 	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
    199 	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
    200 	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
    201 						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
    202 	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
    203 	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
    204 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
    205 	b	.Lenc_entry
    206 
    207 .align 4
    208 .Lenc_loop:
    209 	// middle of middle round
    210 	add	x10, x11, #0x40
    211 	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
    212 	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
    213 	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
    214 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
    215 	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
    216 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
    217 	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
    218 	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
    219 	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
    220 	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
    221 	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
    222 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
    223 	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
    224 	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
    225 	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
    226 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
    227 	sub	w8, w8, #1			// nr--
    228 
    229 .Lenc_entry:
    230 	// top of round
    231 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
    232 	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
    233 	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
    234 	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
    235 	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
    236 	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
    237 	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
    238 	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
    239 	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
    240 	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
    241 	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
    242 	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
    243 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
    244 	cbnz	w8, .Lenc_loop
    245 
    246 	// middle of last round
    247 	add	x10, x11, #0x80
    248 						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
    249 						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
    250 	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
    251 	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
    252 	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
    253 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
    254 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
    255 	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
    256 	ret
    257 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
    258 
    259 .globl	vpaes_encrypt
    260 .type	vpaes_encrypt,%function
    261 .align	4
    262 vpaes_encrypt:
    263 	stp	x29,x30,[sp,#-16]!
    264 	add	x29,sp,#0
    265 
    266 	ld1	{v7.16b}, [$inp]
    267 	bl	_vpaes_encrypt_preheat
    268 	bl	_vpaes_encrypt_core
    269 	st1	{v0.16b}, [$out]
    270 
    271 	ldp	x29,x30,[sp],#16
    272 	ret
    273 .size	vpaes_encrypt,.-vpaes_encrypt
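// The C prototype this is expected to satisfy (informational only,
// mirroring the other single-block AES primitives) is presumably
//	void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
// with vpaes_decrypt below taking the same arguments.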
    274 
    275 .type	_vpaes_encrypt_2x,%function
    276 .align 4
    277 _vpaes_encrypt_2x:
    278 	mov	x9, $key
    279 	ldr	w8, [$key,#240]			// pull rounds
    280 	adrp	x11, :pg_hi21:.Lk_mc_forward+16
    281 	add	x11, x11, :lo12:.Lk_mc_forward+16
    282 						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
    283 	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
    284 	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
    285 	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
    286 	 and	v9.16b,  v15.16b,  v17.16b
    287 	 ushr	v8.16b,  v15.16b,  #4
    288 	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
    289 	 tbl	v9.16b,  {$iptlo}, v9.16b
    290 						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
    291 	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
    292 	 tbl	v10.16b, {$ipthi}, v8.16b
    293 	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
    294 	 eor	v8.16b,  v9.16b,   v16.16b
    295 	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
    296 	 eor	v8.16b,  v8.16b,   v10.16b
    297 	b	.Lenc_2x_entry
    298 
    299 .align 4
    300 .Lenc_2x_loop:
    301 	// middle of middle round
    302 	add	x10, x11, #0x40
    303 	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
    304 	 tbl	v12.16b, {$sb1t}, v10.16b
    305 	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
    306 	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
    307 	 tbl	v8.16b,  {$sb1u}, v11.16b
    308 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
    309 	 eor	v12.16b, v12.16b, v16.16b
    310 	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
    311 	 tbl	v13.16b, {$sb2t}, v10.16b
    312 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
    313 	 eor	v8.16b,  v8.16b,  v12.16b
    314 	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
    315 	 tbl	v10.16b, {$sb2u}, v11.16b
    316 	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
    317 	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
    318 	 tbl	v11.16b, {v8.16b}, v1.16b
    319 	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
    320 	 eor	v10.16b, v10.16b, v13.16b
    321 	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
    322 	 tbl	v8.16b,  {v8.16b}, v4.16b
    323 	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
    324 	 eor	v11.16b, v11.16b, v10.16b
    325 	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
    326 	 tbl	v12.16b, {v11.16b},v1.16b
    327 	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
    328 	 eor	v8.16b,  v8.16b,  v11.16b
    329 	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
    330 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
    331 	 eor	v8.16b,  v8.16b,  v12.16b
    332 	sub	w8, w8, #1			// nr--
    333 
    334 .Lenc_2x_entry:
    335 	// top of round
    336 	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
    337 	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
    338 	 and	v9.16b,  v8.16b, v17.16b
    339 	 ushr	v8.16b,  v8.16b, #4
    340 	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
    341 	 tbl	v13.16b, {$invhi},v9.16b
    342 	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
    343 	 eor	v9.16b,  v9.16b,  v8.16b
    344 	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
    345 	 tbl	v11.16b, {$invlo},v8.16b
    346 	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
    347 	 tbl	v12.16b, {$invlo},v9.16b
    348 	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
    349 	 eor	v11.16b, v11.16b, v13.16b
    350 	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
    351 	 eor	v12.16b, v12.16b, v13.16b
    352 	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
    353 	 tbl	v10.16b, {$invlo},v11.16b
    354 	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
    355 	 tbl	v11.16b, {$invlo},v12.16b
    356 	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
    357 	 eor	v10.16b, v10.16b, v9.16b
    358 	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
    359 	 eor	v11.16b, v11.16b, v8.16b
    360 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
    361 	cbnz	w8, .Lenc_2x_loop
    362 
    363 	// middle of last round
    364 	add	x10, x11, #0x80
    365 						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
    366 						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
    367 	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
    368 	 tbl	v12.16b, {$sbou}, v10.16b
    369 	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
    370 	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
    371 	 tbl	v8.16b,  {$sbot}, v11.16b
    372 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
    373 	 eor	v12.16b, v12.16b, v16.16b
    374 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
    375 	 eor	v8.16b,  v8.16b,  v12.16b
    376 	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
    377 	 tbl	v1.16b,  {v8.16b},v1.16b
    378 	ret
    379 .size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
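// _vpaes_encrypt_2x repeats the _vpaes_encrypt_core dataflow, manually
// interleaved over two independent blocks (inputs in v14/v15, outputs in
// v0/v1, with the extra-indented instructions forming the second block's
// stream) so that the two long tbl/eor dependency chains can overlap.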
    380 
    381 .type	_vpaes_decrypt_preheat,%function
    382 .align	4
    383 _vpaes_decrypt_preheat:
    384 	adrp	x10, :pg_hi21:.Lk_inv
    385 	add	x10, x10, :lo12:.Lk_inv
    386 	movi	v17.16b, #0x0f
    387 	adrp	x11, :pg_hi21:.Lk_dipt
    388 	add	x11, x11, :lo12:.Lk_dipt
    389 	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
    390 	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
    391 	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
    392 	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
    393 	ret
    394 .size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
    395 
    396 ##
    397 ##  Decryption core
    398 ##
    399 ##  Same API as encryption core.
    400 ##
    401 .type	_vpaes_decrypt_core,%function
    402 .align	4
    403 _vpaes_decrypt_core:
    404 	mov	x9, $key
    405 	ldr	w8, [$key,#240]			// pull rounds
    406 
    407 						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
    408 	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
    409 	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
    410 	adrp	x10, :pg_hi21:.Lk_sr
    411 	add	x10, x10, :lo12:.Lk_sr
    412 	and	x11, x11, #0x30			// and		\$0x30,	%r11
    413 	add	x11, x11, x10
    414 	adrp	x10, :pg_hi21:.Lk_mc_forward+48
    415 	add	x10, x10, :lo12:.Lk_mc_forward+48
    416 
    417 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
    418 	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
    419 	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
    420 	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
    421 	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
    422 						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
    423 	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
    424 	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
    425 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
    426 	b	.Ldec_entry
    427 
    428 .align 4
    429 .Ldec_loop:
    430 //
    431 //  Inverse mix columns
    432 //
    433 						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
    434 						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
    435 	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
    436 	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
    437 	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
    438 						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
    439 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    440 						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
    441 
    442 	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
    443 	tbl 	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    444 	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
    445 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    446 						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
    447 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    448 						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
    449 
    450 	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
    451 	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    452 	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
    453 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    454 						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
    455 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    456 						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
    457 
    458 	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
    459 	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    460 	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
    461 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    462 	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
    463 	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    464 	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
    465 
    466 .Ldec_entry:
    467 	// top of round
    468 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
    469 	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
    470 	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
    471 	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
    472 	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
    473 	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
    474 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
    475 	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
    476 	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
    477 	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
    478 	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
    479 	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
    480 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
    481 	cbnz	w8, .Ldec_loop
    482 
    483 	// middle of last round
    484 						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
    485 	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
    486 						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
    487 	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
    488 	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
    489 	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
    490 	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
    491 	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
    492 	ret
    493 .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
    494 
    495 .globl	vpaes_decrypt
    496 .type	vpaes_decrypt,%function
    497 .align	4
    498 vpaes_decrypt:
    499 	stp	x29,x30,[sp,#-16]!
    500 	add	x29,sp,#0
    501 
    502 	ld1	{v7.16b}, [$inp]
    503 	bl	_vpaes_decrypt_preheat
    504 	bl	_vpaes_decrypt_core
    505 	st1	{v0.16b}, [$out]
    506 
    507 	ldp	x29,x30,[sp],#16
    508 	ret
    509 .size	vpaes_decrypt,.-vpaes_decrypt
    510 
    511 // v14-v15 input, v0-v1 output
    512 .type	_vpaes_decrypt_2x,%function
    513 .align	4
    514 _vpaes_decrypt_2x:
    515 	mov	x9, $key
    516 	ldr	w8, [$key,#240]			// pull rounds
    517 
    518 						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
    519 	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
    520 	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
    521 	adrp	x10, :pg_hi21:.Lk_sr
    522 	add	x10, x10, :lo12:.Lk_sr
    523 	and	x11, x11, #0x30			// and		\$0x30,	%r11
    524 	add	x11, x11, x10
    525 	adrp	x10, :pg_hi21:.Lk_mc_forward+48
    526 	add	x10, x10, :lo12:.Lk_mc_forward+48
    527 
    528 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
    529 	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
    530 	ushr	v0.16b,  v14.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
    531 	 and	v9.16b,  v15.16b, v17.16b
    532 	 ushr	v8.16b,  v15.16b, #4
    533 	tbl	v2.16b,  {$iptlo},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
    534 	 tbl	v10.16b, {$iptlo},v9.16b
    535 	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
    536 						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
    537 	tbl	v0.16b,  {$ipthi},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
    538 	 tbl	v8.16b,  {$ipthi},v8.16b
    539 	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
    540 	 eor	v10.16b, v10.16b, v16.16b
    541 	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
    542 	 eor	v8.16b,  v8.16b,  v10.16b
    543 	b	.Ldec_2x_entry
    544 
    545 .align 4
    546 .Ldec_2x_loop:
    547 //
    548 //  Inverse mix columns
    549 //
    550 						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
    551 						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
    552 	tbl	v4.16b,  {$sb9u}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
    553 	 tbl	v12.16b, {$sb9u}, v10.16b
    554 	tbl	v1.16b,  {$sb9t}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
    555 	 tbl	v9.16b,  {$sb9t}, v11.16b
    556 	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
    557 	 eor	v8.16b,  v12.16b, v16.16b
    558 						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
    559 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    560 	 eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    561 						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
    562 
    563 	tbl	v4.16b,  {$sbdu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
    564 	 tbl	v12.16b, {$sbdu}, v10.16b
    565 	tbl 	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    566 	 tbl 	v8.16b,  {v8.16b},v5.16b
    567 	tbl	v1.16b,  {$sbdt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
    568 	 tbl	v9.16b,  {$sbdt}, v11.16b
    569 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    570 	 eor	v8.16b,  v8.16b,  v12.16b
    571 						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
    572 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    573 	 eor	v8.16b,  v8.16b,  v9.16b
    574 						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
    575 
    576 	tbl	v4.16b,  {$sbbu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
    577 	 tbl	v12.16b, {$sbbu}, v10.16b
    578 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    579 	 tbl	v8.16b,  {v8.16b},v5.16b
    580 	tbl	v1.16b,  {$sbbt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
    581 	 tbl	v9.16b,  {$sbbt}, v11.16b
    582 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    583 	 eor	v8.16b,  v8.16b,  v12.16b
    584 						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
    585 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    586 	 eor	v8.16b,  v8.16b,  v9.16b
    587 						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
    588 
    589 	tbl	v4.16b,  {$sbeu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
    590 	 tbl	v12.16b, {$sbeu}, v10.16b
    591 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    592 	 tbl	v8.16b,  {v8.16b},v5.16b
    593 	tbl	v1.16b,  {$sbet}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
    594 	 tbl	v9.16b,  {$sbet}, v11.16b
    595 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    596 	 eor	v8.16b,  v8.16b,  v12.16b
    597 	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
    598 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    599 	 eor	v8.16b,  v8.16b,  v9.16b
    600 	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
    601 
    602 .Ldec_2x_entry:
    603 	// top of round
    604 	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
    605 	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
    606 	 and	v9.16b,  v8.16b,  v17.16b
    607 	 ushr	v8.16b,  v8.16b,  #4
    608 	tbl	v2.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
    609 	 tbl	v10.16b, {$invhi},v9.16b
    610 	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
    611 	 eor	v9.16b,	 v9.16b,  v8.16b
    612 	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
    613 	 tbl	v11.16b, {$invlo},v8.16b
    614 	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
    615 	 tbl	v12.16b, {$invlo},v9.16b
    616 	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
    617 	 eor	v11.16b, v11.16b, v10.16b
    618 	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
    619 	 eor	v12.16b, v12.16b, v10.16b
    620 	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
    621 	 tbl	v10.16b, {$invlo},v11.16b
    622 	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
    623 	 tbl	v11.16b, {$invlo},v12.16b
    624 	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
    625 	 eor	v10.16b, v10.16b, v9.16b
    626 	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
    627 	 eor	v11.16b, v11.16b, v8.16b
    628 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
    629 	cbnz	w8, .Ldec_2x_loop
    630 
    631 	// middle of last round
    632 						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
    633 	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
    634 	 tbl	v12.16b, {$sbou}, v10.16b
    635 						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
    636 	tbl	v1.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
    637 	 tbl	v9.16b,  {$sbot}, v11.16b
    638 	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
    639 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
    640 	 eor	v12.16b, v12.16b, v16.16b
    641 	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
    642 	 eor	v8.16b,  v9.16b,  v12.16b
    643 	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
    644 	 tbl	v1.16b,  {v8.16b},v2.16b
    645 	ret
    646 .size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
    647 ___
    648 }
    650 {
    651 my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
    652 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
    653 
    654 $code.=<<___;
    655 ########################################################
    656 ##                                                    ##
    657 ##                  AES key schedule                  ##
    658 ##                                                    ##
    659 ########################################################
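##  (ARMv8 port notes: \$inp/\$bits/\$out/\$dir live in x0/w1/x2/w3; the
##  running rcon value and .Lk_mc_forward[0] are kept in v8/v9, which is
##  why the public entry points below save d8/d9; AAPCS64 makes d8-d15
##  callee-saved.)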
    660 .type	_vpaes_key_preheat,%function
    661 .align	4
    662 _vpaes_key_preheat:
    663 	adrp	x10, :pg_hi21:.Lk_inv
    664 	add	x10, x10, :lo12:.Lk_inv
    665 	movi	v16.16b, #0x5b			// .Lk_s63
    666 	adrp	x11, :pg_hi21:.Lk_sb1
    667 	add	x11, x11, :lo12:.Lk_sb1
    668 	movi	v17.16b, #0x0f			// .Lk_s0F
    669 	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
    670 	adrp	x10, :pg_hi21:.Lk_dksd
    671 	add	x10, x10, :lo12:.Lk_dksd
    672 	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
    673 	adrp	x11, :pg_hi21:.Lk_mc_forward
    674 	add	x11, x11, :lo12:.Lk_mc_forward
    675 	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
    676 	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
    677 	ld1	{v8.2d}, [x10]			// .Lk_rcon
    678 	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
    679 	ret
    680 .size	_vpaes_key_preheat,.-_vpaes_key_preheat
    681 
    682 .type	_vpaes_schedule_core,%function
    683 .align	4
    684 _vpaes_schedule_core:
    685 	stp	x29, x30, [sp,#-16]!
    686 	add	x29,sp,#0
    687 
    688 	bl	_vpaes_key_preheat		// load the tables
    689 
    690 	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
    691 
    692 	// input transform
    693 	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
    694 	bl	_vpaes_schedule_transform
    695 	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
    696 
    697 	adrp	x10, :pg_hi21:.Lk_sr		// lea	.Lk_sr(%rip),%r10
    698 	add	x10, x10, :lo12:.Lk_sr
    699 
    700 	add	x8, x8, x10
    701 	cbnz	$dir, .Lschedule_am_decrypting
    702 
    703 	// encrypting, output zeroth round key after transform
    704 	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
    705 	b	.Lschedule_go
    706 
    707 .Lschedule_am_decrypting:
    708 	// decrypting, output zeroth round key after shiftrows
    709 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
    710 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
    711 	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
    712 	eor	x8, x8, #0x30			// xor	\$0x30, %r8
    713 
    714 .Lschedule_go:
    715 	cmp	$bits, #192			// cmp	\$192,	%esi
    716 	b.hi	.Lschedule_256
    717 	b.eq	.Lschedule_192
     718 	// 128: fall through
    719 
    720 ##
    721 ##  .schedule_128
    722 ##
    723 ##  128-bit specific part of key schedule.
    724 ##
    725 ##  This schedule is really simple, because all its parts
    726 ##  are accomplished by the subroutines.
    727 ##
    728 .Lschedule_128:
    729 	mov	$inp, #10			// mov	\$10, %esi
    730 
    731 .Loop_schedule_128:
    732 	sub	$inp, $inp, #1			// dec	%esi
    733 	bl 	_vpaes_schedule_round
    734 	cbz 	$inp, .Lschedule_mangle_last
    735 	bl	_vpaes_schedule_mangle		// write output
    736 	b 	.Loop_schedule_128
    737 
    738 ##
    739 ##  .aes_schedule_192
    740 ##
    741 ##  192-bit specific part of key schedule.
    742 ##
    743 ##  The main body of this schedule is the same as the 128-bit
    744 ##  schedule, but with more smearing.  The long, high side is
    745 ##  stored in %xmm7 as before, and the short, low side is in
    746 ##  the high bits of %xmm6.
    747 ##
    748 ##  This schedule is somewhat nastier, however, because each
    749 ##  round produces 192 bits of key material, or 1.5 round keys.
    750 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
    751 ##  keys.
    752 ##
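##  (Four passes of .Loop_schedule_192, together with the zeroth key
##  stored earlier and the final .Lschedule_mangle_last, account for the
##  13 round keys an AES-192 schedule needs.)
##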
    753 .align	4
    754 .Lschedule_192:
    755 	sub	$inp, $inp, #8
    756 	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
    757 	bl	_vpaes_schedule_transform	// input transform
    758 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
    759 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
    760 	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
    761 	mov	$inp, #4			// mov	\$4,	%esi
    762 
    763 .Loop_schedule_192:
    764 	sub	$inp, $inp, #1			// dec	%esi
    765 	bl	_vpaes_schedule_round
    766 	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
    767 	bl	_vpaes_schedule_mangle		// save key n
    768 	bl	_vpaes_schedule_192_smear
    769 	bl	_vpaes_schedule_mangle		// save key n+1
    770 	bl	_vpaes_schedule_round
    771 	cbz 	$inp, .Lschedule_mangle_last
    772 	bl	_vpaes_schedule_mangle		// save key n+2
    773 	bl	_vpaes_schedule_192_smear
    774 	b	.Loop_schedule_192
    775 
    776 ##
    777 ##  .aes_schedule_256
    778 ##
    779 ##  256-bit specific part of key schedule.
    780 ##
    781 ##  The structure here is very similar to the 128-bit
    782 ##  schedule, but with an additional "low side" in
    783 ##  %xmm6.  The low side's rounds are the same as the
    784 ##  high side's, except no rcon and no rotation.
    785 ##
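##  (Seven passes of .Loop_schedule_256, the last one cut short at
##  .Lschedule_mangle_last, plus the zeroth key stored earlier, produce
##  the 15 round keys an AES-256 schedule needs.)
##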
    786 .align	4
    787 .Lschedule_256:
    788 	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
    789 	bl	_vpaes_schedule_transform	// input transform
    790 	mov	$inp, #7			// mov	\$7, %esi
    791 
    792 .Loop_schedule_256:
    793 	sub	$inp, $inp, #1			// dec	%esi
    794 	bl	_vpaes_schedule_mangle		// output low result
    795 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
    796 
    797 	// high round
    798 	bl	_vpaes_schedule_round
    799 	cbz 	$inp, .Lschedule_mangle_last
    800 	bl	_vpaes_schedule_mangle
    801 
    802 	// low round. swap xmm7 and xmm6
    803 	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
    804 	movi	v4.16b, #0
    805 	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
    806 	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
    807 	bl	_vpaes_schedule_low_round
    808 	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
    809 
    810 	b	.Loop_schedule_256
    811 
    812 ##
    813 ##  .aes_schedule_mangle_last
    814 ##
    815 ##  Mangler for last round of key schedule
    816 ##  Mangles %xmm0
    817 ##    when encrypting, outputs out(%xmm0) ^ 63
    818 ##    when decrypting, outputs unskew(%xmm0)
    819 ##
    820 ##  Always called right before return... jumps to cleanup and exits
    821 ##
    822 .align	4
    823 .Lschedule_mangle_last:
    824 	// schedule last round key from xmm0
    825 	adrp	x11, :pg_hi21:.Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
    826 	add	x11, x11, :lo12:.Lk_deskew
    827 
    828 	cbnz	$dir, .Lschedule_mangle_last_dec
    829 
    830 	// encrypting
    831 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
    832 	adrp	x11, :pg_hi21:.Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
    833 	add	x11, x11, :lo12:.Lk_opt
    834 	add	$out, $out, #32			// add	\$32,	%rdx
    835 	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
    836 
    837 .Lschedule_mangle_last_dec:
    838 	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
    839 	sub	$out, $out, #16			// add	\$-16,	%rdx
    840 	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
    841 	bl	_vpaes_schedule_transform	// output transform
    842 	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key
    843 
    844 	// cleanup
    845 	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
    846 	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
    847 	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
    848 	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
    849 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
    850 	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
    851 	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
    852 	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
    853 	ldp	x29, x30, [sp],#16
    854 	ret
    855 .size	_vpaes_schedule_core,.-_vpaes_schedule_core
    856 
    857 ##
    858 ##  .aes_schedule_192_smear
    859 ##
    860 ##  Smear the short, low side in the 192-bit key schedule.
    861 ##
    862 ##  Inputs:
    863 ##    %xmm7: high side, b  a  x  y
    864 ##    %xmm6:  low side, d  c  0  0
    865 ##    %xmm13: 0
    866 ##
    867 ##  Outputs:
    868 ##    %xmm6: b+c+d  b+c  0  0
    869 ##    %xmm0: b+c+d  b+c  b  a
    870 ##
    871 .type	_vpaes_schedule_192_smear,%function
    872 .align	4
    873 _vpaes_schedule_192_smear:
    874 	movi	v1.16b, #0
    875 	dup	v0.4s, v7.s[3]
    876 	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
    877 	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
    878 	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
    879 	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
    880 	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
    881 	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
    882 	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
    883 	ret
    884 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
    885 
    886 ##
    887 ##  .aes_schedule_round
    888 ##
    889 ##  Runs one main round of the key schedule on %xmm0, %xmm7
    890 ##
    891 ##  Specifically, runs subbytes on the high dword of %xmm0
    892 ##  then rotates it by one byte and xors into the low dword of
    893 ##  %xmm7.
    894 ##
    895 ##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
    896 ##  next rcon.
    897 ##
    898 ##  Smears the dwords of %xmm7 by xoring the low into the
    899 ##  second low, result into third, result into highest.
    900 ##
    901 ##  Returns results in %xmm7 = %xmm0.
    902 ##  Clobbers %xmm1-%xmm4, %r11.
    903 ##
    904 .type	_vpaes_schedule_round,%function
    905 .align	4
    906 _vpaes_schedule_round:
    907 	// extract rcon from xmm8
    908 	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
    909 	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
    910 	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
    911 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
    912 
    913 	// rotate
    914 	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
    915 	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
    916 
    917 	// fall through...
    918 
    919 	// low round: same as high round, but no rotation and no rcon.
    920 _vpaes_schedule_low_round:
    921 	// smear xmm7
    922 	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
    923 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
    924 	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4
    925 
    926 	// subbytes
    927 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
    928 	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
    929 	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
    930 	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
    931 	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
    932 	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
    933 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
    934 	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
    935 	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
    936 	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
    937 	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
    938 	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
    939 	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
    940 	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
    941 	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
    942 	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
    943 	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
    944 
    945 	// add in smeared stuff
    946 	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
    947 	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
    948 	ret
    949 .size	_vpaes_schedule_round,.-_vpaes_schedule_round
    950 
    951 ##
    952 ##  .aes_schedule_transform
    953 ##
    954 ##  Linear-transform %xmm0 according to tables at (%r11)
    955 ##
    956 ##  Requires that %xmm9 = 0x0F0F... as in preheat
    957 ##  Output in %xmm0
    958 ##  Clobbers %xmm1, %xmm2
    959 ##
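##  (ARMv8 port note: the two tables are taken from v20/v21, i.e.
##  \$iptlo/\$ipthi, which _vpaes_key_preheat loads with .Lk_ipt and which
##  .Lschedule_mangle_last reloads with .Lk_opt or .Lk_deskew before the
##  final call.)
##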
    960 .type	_vpaes_schedule_transform,%function
    961 .align	4
    962 _vpaes_schedule_transform:
    963 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
    964 	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
    965 						// vmovdqa	(%r11),	%xmm2 	# lo
    966 	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
    967 						// vmovdqa	16(%r11),	%xmm1 # hi
    968 	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
    969 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
    970 	ret
    971 .size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
    972 
    973 ##
    974 ##  .aes_schedule_mangle
    975 ##
    976 ##  Mangle xmm0 from (basis-transformed) standard version
    977 ##  to our version.
    978 ##
    979 ##  On encrypt,
    980 ##    xor with 0x63
    981 ##    multiply by circulant 0,1,1,1
    982 ##    apply shiftrows transform
    983 ##
    984 ##  On decrypt,
    985 ##    xor with 0x63
    986 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
    987 ##    deskew
    988 ##    apply shiftrows transform
    989 ##
    990 ##
    991 ##  Writes out to (%rdx), and increments or decrements it
    992 ##  Keeps track of round number mod 4 in %r8
    993 ##  Preserves xmm0
    994 ##  Clobbers xmm1-xmm5
    995 ##
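##  (ARMv8 port note: the output pointer is \$out; the round number mod 4
##  is tracked in x8, which indexes .Lk_sr; v9 holds .Lk_mc_forward[0] and
##  v16 the 0x5b ".Lk_s63" constant, both set up by _vpaes_key_preheat.)
##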
    996 .type	_vpaes_schedule_mangle,%function
    997 .align	4
    998 _vpaes_schedule_mangle:
    999 	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
   1000 						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
   1001 	cbnz	$dir, .Lschedule_mangle_dec
   1002 
   1003 	// encrypting
   1004 	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
   1005 	add	$out, $out, #16			// add	\$16,	%rdx
   1006 	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
   1007 	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
   1008 	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
   1009 	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
   1010 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
   1011 	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
   1012 
   1013 	b	.Lschedule_mangle_both
   1014 .align	4
   1015 .Lschedule_mangle_dec:
   1016 	// inverse mix columns
   1017 						// lea	.Lk_dksd(%rip),%r11
   1018 	ushr	v1.16b, v4.16b, #4		// vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
   1019 	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
   1020 
   1021 						// vmovdqa	0x00(%r11),	%xmm2
   1022 	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
   1023 						// vmovdqa	0x10(%r11),	%xmm3
   1024 	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
   1025 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
   1026 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
   1027 
   1028 						// vmovdqa	0x20(%r11),	%xmm2
   1029 	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
   1030 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
   1031 						// vmovdqa	0x30(%r11),	%xmm3
   1032 	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
   1033 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
   1034 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
   1035 
   1036 						// vmovdqa	0x40(%r11),	%xmm2
   1037 	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
   1038 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
   1039 						// vmovdqa	0x50(%r11),	%xmm3
   1040 	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
   1041 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
   1042 
   1043 						// vmovdqa	0x60(%r11),	%xmm2
   1044 	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
   1045 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
   1046 						// vmovdqa	0x70(%r11),	%xmm4
   1047 	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
   1048 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
   1049 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
   1050 	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
   1051 
   1052 	sub	$out, $out, #16			// add	\$-16,	%rdx
   1053 
   1054 .Lschedule_mangle_both:
   1055 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
   1056 	add	x8, x8, #64-16			// add	\$-16,	%r8
   1057 	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
   1058 	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
   1059 	ret
   1060 .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
   1061 
   1062 .globl	vpaes_set_encrypt_key
   1063 .type	vpaes_set_encrypt_key,%function
   1064 .align	4
   1065 vpaes_set_encrypt_key:
   1066 	stp	x29,x30,[sp,#-16]!
   1067 	add	x29,sp,#0
   1068 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1069 
   1070 	lsr	w9, $bits, #5		// shr	\$5,%eax
   1071 	add	w9, w9, #5		// \$5,%eax
   1072 	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
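	// For 128-, 192- and 256-bit keys this stores 9, 11 and 13 respectively;
	// that is the value the _vpaes_*_core routines reload from offset 240 and
	// use as their loop count (the initial key add and the final round are
	// handled outside the loop).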
   1073 
   1074 	mov	$dir, #0		// mov	\$0,%ecx
   1075 	mov	x8, #0x30		// mov	\$0x30,%r8d
   1076 	bl	_vpaes_schedule_core
   1077 	eor	x0, x0, x0
   1078 
   1079 	ldp	d8,d9,[sp],#16
   1080 	ldp	x29,x30,[sp],#16
   1081 	ret
   1082 .size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
   1083 
   1084 .globl	vpaes_set_decrypt_key
   1085 .type	vpaes_set_decrypt_key,%function
   1086 .align	4
   1087 vpaes_set_decrypt_key:
   1088 	stp	x29,x30,[sp,#-16]!
   1089 	add	x29,sp,#0
   1090 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1091 
   1092 	lsr	w9, $bits, #5		// shr	\$5,%eax
   1093 	add	w9, w9, #5		// \$5,%eax
   1094 	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
   1095 	lsl	w9, w9, #4		// shl	\$4,%eax
   1096 	add	$out, $out, #16		// lea	16(%rdx,%rax),%rdx
   1097 	add	$out, $out, x9
   1098 
   1099 	mov	$dir, #1		// mov	\$1,%ecx
   1100 	lsr	w8, $bits, #1		// shr	\$1,%r8d
   1101 	and	x8, x8, #32		// and	\$32,%r8d
   1102 	eor	x8, x8, #32		// xor	\$32,%r8d	# nbits==192?0:32
   1103 	bl	_vpaes_schedule_core
   1104 
   1105 	ldp	d8,d9,[sp],#16
   1106 	ldp	x29,x30,[sp],#16
   1107 	ret
   1108 .size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
   1109 ___
   1110 }
   1111 {
   1112 my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
   1113 
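# void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
#                        const AES_KEY *key, uint8_t *ivec, const int enc);
# (presumed prototype, noted here in the same spirit as the
# vpaes_ctr32_encrypt_blocks comment further down; enc == 0 selects the
# decryption path.)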
   1114 $code.=<<___;
   1115 .globl	vpaes_cbc_encrypt
   1116 .type	vpaes_cbc_encrypt,%function
   1117 .align	4
   1118 vpaes_cbc_encrypt:
   1119 	cbz	$len, .Lcbc_abort
   1120 	cmp	w5, #0			// check direction
   1121 	b.eq	vpaes_cbc_decrypt
   1122 
   1123 	stp	x29,x30,[sp,#-16]!
   1124 	add	x29,sp,#0
   1125 
   1126 	mov	x17, $len		// reassign
   1127 	mov	x2,  $key		// reassign
   1128 
   1129 	ld1	{v0.16b}, [$ivec]	// load ivec
   1130 	bl	_vpaes_encrypt_preheat
   1131 	b	.Lcbc_enc_loop
   1132 
   1133 .align	4
   1134 .Lcbc_enc_loop:
   1135 	ld1	{v7.16b}, [$inp],#16	// load input
   1136 	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
   1137 	bl	_vpaes_encrypt_core
   1138 	st1	{v0.16b}, [$out],#16	// save output
   1139 	subs	x17, x17, #16
   1140 	b.hi	.Lcbc_enc_loop
   1141 
   1142 	st1	{v0.16b}, [$ivec]	// write ivec
   1143 
   1144 	ldp	x29,x30,[sp],#16
   1145 .Lcbc_abort:
   1146 	ret
   1147 .size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
   1148 
   1149 .type	vpaes_cbc_decrypt,%function
   1150 .align	4
   1151 vpaes_cbc_decrypt:
   1152 	stp	x29,x30,[sp,#-16]!
   1153 	add	x29,sp,#0
   1154 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1155 	stp	d10,d11,[sp,#-16]!
   1156 	stp	d12,d13,[sp,#-16]!
   1157 	stp	d14,d15,[sp,#-16]!
   1158 
   1159 	mov	x17, $len		// reassign
   1160 	mov	x2,  $key		// reassign
   1161 	ld1	{v6.16b}, [$ivec]	// load ivec
   1162 	bl	_vpaes_decrypt_preheat
   1163 	tst	x17, #16
   1164 	b.eq	.Lcbc_dec_loop2x
   1165 
   1166 	ld1	{v7.16b}, [$inp], #16	// load input
   1167 	bl	_vpaes_decrypt_core
   1168 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
   1169 	orr	v6.16b, v7.16b, v7.16b	// next ivec value
   1170 	st1	{v0.16b}, [$out], #16
   1171 	subs	x17, x17, #16
   1172 	b.ls	.Lcbc_dec_done
   1173 
   1174 .align	4
   1175 .Lcbc_dec_loop2x:
   1176 	ld1	{v14.16b,v15.16b}, [$inp], #32
   1177 	bl	_vpaes_decrypt_2x
   1178 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
   1179 	eor	v1.16b, v1.16b, v14.16b
   1180 	orr	v6.16b, v15.16b, v15.16b
   1181 	st1	{v0.16b,v1.16b}, [$out], #32
   1182 	subs	x17, x17, #32
   1183 	b.hi	.Lcbc_dec_loop2x
   1184 
   1185 .Lcbc_dec_done:
   1186 	st1	{v6.16b}, [$ivec]
   1187 
   1188 	ldp	d14,d15,[sp],#16
   1189 	ldp	d12,d13,[sp],#16
   1190 	ldp	d10,d11,[sp],#16
   1191 	ldp	d8,d9,[sp],#16
   1192 	ldp	x29,x30,[sp],#16
   1193 	ret
   1194 .size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
   1195 ___
   1196 # We omit vpaes_ecb_* in BoringSSL. They are unused.
   1197 if (0) {
   1198 $code.=<<___;
   1199 .globl	vpaes_ecb_encrypt
   1200 .type	vpaes_ecb_encrypt,%function
   1201 .align	4
   1202 vpaes_ecb_encrypt:
   1203 	stp	x29,x30,[sp,#-16]!
   1204 	add	x29,sp,#0
   1205 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1206 	stp	d10,d11,[sp,#-16]!
   1207 	stp	d12,d13,[sp,#-16]!
   1208 	stp	d14,d15,[sp,#-16]!
   1209 
   1210 	mov	x17, $len
   1211 	mov	x2,  $key
   1212 	bl	_vpaes_encrypt_preheat
   1213 	tst	x17, #16
   1214 	b.eq	.Lecb_enc_loop
   1215 
   1216 	ld1	{v7.16b}, [$inp],#16
   1217 	bl	_vpaes_encrypt_core
   1218 	st1	{v0.16b}, [$out],#16
   1219 	subs	x17, x17, #16
   1220 	b.ls	.Lecb_enc_done
   1221 
   1222 .align	4
   1223 .Lecb_enc_loop:
   1224 	ld1	{v14.16b,v15.16b}, [$inp], #32
   1225 	bl	_vpaes_encrypt_2x
   1226 	st1	{v0.16b,v1.16b}, [$out], #32
   1227 	subs	x17, x17, #32
   1228 	b.hi	.Lecb_enc_loop
   1229 
   1230 .Lecb_enc_done:
   1231 	ldp	d14,d15,[sp],#16
   1232 	ldp	d12,d13,[sp],#16
   1233 	ldp	d10,d11,[sp],#16
   1234 	ldp	d8,d9,[sp],#16
   1235 	ldp	x29,x30,[sp],#16
   1236 	ret
   1237 .size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
   1238 
   1239 .globl	vpaes_ecb_decrypt
   1240 .type	vpaes_ecb_decrypt,%function
   1241 .align	4
   1242 vpaes_ecb_decrypt:
   1243 	stp	x29,x30,[sp,#-16]!
   1244 	add	x29,sp,#0
   1245 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1246 	stp	d10,d11,[sp,#-16]!
   1247 	stp	d12,d13,[sp,#-16]!
   1248 	stp	d14,d15,[sp,#-16]!
   1249 
   1250 	mov	x17, $len
   1251 	mov	x2,  $key
   1252 	bl	_vpaes_decrypt_preheat
   1253 	tst	x17, #16
   1254 	b.eq	.Lecb_dec_loop
   1255 
   1256 	ld1	{v7.16b}, [$inp],#16
    1257 	bl	_vpaes_decrypt_core
   1258 	st1	{v0.16b}, [$out],#16
   1259 	subs	x17, x17, #16
   1260 	b.ls	.Lecb_dec_done
   1261 
   1262 .align	4
   1263 .Lecb_dec_loop:
   1264 	ld1	{v14.16b,v15.16b}, [$inp], #32
   1265 	bl	_vpaes_decrypt_2x
   1266 	st1	{v0.16b,v1.16b}, [$out], #32
   1267 	subs	x17, x17, #32
   1268 	b.hi	.Lecb_dec_loop
   1269 
   1270 .Lecb_dec_done:
   1271 	ldp	d14,d15,[sp],#16
   1272 	ldp	d12,d13,[sp],#16
   1273 	ldp	d10,d11,[sp],#16
   1274 	ldp	d8,d9,[sp],#16
   1275 	ldp	x29,x30,[sp],#16
   1276 	ret
   1277 .size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
   1278 ___
   1279 }
   1280 
   1281 my ($ctr, $ctr_tmp) = ("w6", "w7");
   1282 
   1283 # void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
   1284 #                                 const AES_KEY *key, const uint8_t ivec[16]);
   1285 $code.=<<___;
   1286 .globl	vpaes_ctr32_encrypt_blocks
   1287 .type	vpaes_ctr32_encrypt_blocks,%function
   1288 .align	4
   1289 vpaes_ctr32_encrypt_blocks:
   1290 	stp	x29,x30,[sp,#-16]!
   1291 	add	x29,sp,#0
   1292 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1293 	stp	d10,d11,[sp,#-16]!
   1294 	stp	d12,d13,[sp,#-16]!
   1295 	stp	d14,d15,[sp,#-16]!
   1296 
   1297 	cbz	$len, .Lctr32_done
   1298 
   1299 	// Note, unlike the other functions, $len here is measured in blocks,
   1300 	// not bytes.
   1301 	mov	x17, $len
   1302 	mov	x2,  $key
   1303 
   1304 	// Load the IV and counter portion.
   1305 	ldr	$ctr, [$ivec, #12]
   1306 	ld1	{v7.16b}, [$ivec]
   1307 
   1308 	bl	_vpaes_encrypt_preheat
   1309 	tst	x17, #1
   1310 	rev	$ctr, $ctr		// The counter is big-endian.
   1311 	b.eq	.Lctr32_prep_loop
   1312 
   1313 	// Handle one block so the remaining block count is even for
   1314 	// _vpaes_encrypt_2x.
   1315 	ld1	{v6.16b}, [$inp], #16	// Load input ahead of time
   1316 	bl	_vpaes_encrypt_core
   1317 	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
   1318 	st1	{v0.16b}, [$out], #16
   1319 	subs	x17, x17, #1
   1320 	// Update the counter.
   1321 	add	$ctr, $ctr, #1
   1322 	rev	$ctr_tmp, $ctr
   1323 	mov	v7.s[3], $ctr_tmp
   1324 	b.ls	.Lctr32_done
   1325 
   1326 .Lctr32_prep_loop:
   1327 	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
   1328 	// uses v14 and v15.
   1329 	mov	v15.16b, v7.16b
   1330 	mov	v14.16b, v7.16b
   1331 	add	$ctr, $ctr, #1
   1332 	rev	$ctr_tmp, $ctr
   1333 	mov	v15.s[3], $ctr_tmp
   1334 
   1335 .Lctr32_loop:
   1336 	ld1	{v6.16b,v7.16b}, [$inp], #32	// Load input ahead of time
   1337 	bl	_vpaes_encrypt_2x
   1338 	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
   1339 	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
   1340 	st1	{v0.16b,v1.16b}, [$out], #32
   1341 	subs	x17, x17, #2
   1342 	// Update the counter.
   1343 	add	$ctr_tmp, $ctr, #1
   1344 	add	$ctr, $ctr, #2
   1345 	rev	$ctr_tmp, $ctr_tmp
   1346 	mov	v14.s[3], $ctr_tmp
   1347 	rev	$ctr_tmp, $ctr
   1348 	mov	v15.s[3], $ctr_tmp
   1349 	b.hi	.Lctr32_loop
   1350 
   1351 .Lctr32_done:
   1352 	ldp	d14,d15,[sp],#16
   1353 	ldp	d12,d13,[sp],#16
   1354 	ldp	d10,d11,[sp],#16
   1355 	ldp	d8,d9,[sp],#16
   1356 	ldp	x29,x30,[sp],#16
   1357 	ret
   1358 .size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
   1359 ___
   1360 }
   1361 
   1362 print $code;
   1363 
   1364 close STDOUT;
   1365