Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # Version 2.1.
     11 #
     12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
     13 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
     14 # [you'll notice a lot of resemblance], such as compressed S-boxes
     15 # in little-endian byte order, prefetch of these tables in CBC mode,
     16 # as well as avoiding L1 cache aliasing between stack frame and key
     17 # schedule and already mentioned tables, compressed Td4...
     18 #
     19 # Performance in number of cycles per processed byte for 128-bit key:
     20 #
     21 #		ECB encrypt	ECB decrypt	CBC large chunk
     22 # AMD64		33		43		13.0
     23 # EM64T		38		56		18.6(*)
     24 # Core 2	30		42		14.5(*)
     25 # Atom		65		86		32.1(*)
     26 #
     27 # (*) with hyper-threading off
     28 
     # Command line: [flavour] output.  A first argument containing a dot is
     # taken to be the output file name, i.e. no flavour was given.
     $flavour = shift;
     $output  = shift;
     if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

     # $win64 is set for nasm/masm/mingw64 flavours or when the output ends in .asm.
     $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

     # Look for the perlasm translator beside this script, then in ../../perlasm.
     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     die "can't locate x86_64-xlate.pl";

     # Everything printed to STDOUT is piped through the translator.  The
     # translator path and the output name are quoted so that paths containing
     # spaces survive the shell; the output argument is passed only when one
     # was given, and a failure to start the pipe is fatal instead of being
     # silently ignored.
     $xlate_cmd="\"$^X\" \"$xlate\" $flavour";
     $xlate_cmd.=" \"$output\"" if (defined($output) && $output ne "");
     open OUT,"| $xlate_cmd" or die "can't call $xlate: $!";
     *STDOUT=*OUT;
     42 
     # Code-generation knobs.  Unlike in the 32-bit version, the "vertical
     # spin" round layout performs ~15% better on both AMD and Intel cores;
     # see aes-586.pl for details on $speed_limit.
     ($verticalspin,$speed_limit)=(1,512);

     # Accumulated assembler text.
     $code=".text\n";

     # Register allocation shared by all code emitters below.
     ($s0,$s1,$s2,$s3)=("%eax","%ebx","%ecx","%edx");	# cipher state words
     ($acc0,$acc1,$acc2)=("%esi","%edi","%ebp");		# 32-bit scratch
     ($mask80,$maskfe,$mask1b)=("%rsi","%rdi","%rbp");	# 64-bit names of the same registers
     ($inp,$out)=("%r8","%r9");
     ($t0,$t1,$t2)=("%r10d","%r11d","%r12d");		# per-round temporaries
     ($rnds,$sbox,$key)=("%r13d","%r14","%r15");
     64 
     # Map a 32/64-bit a/b/c/d register name to its high-byte form, e.g.
     # "%eax" -> "%ah"; any other string is returned unchanged.
     # Declared without the former empty prototype, which contradicted the
     # one argument the sub consumes; existing &hi(...) calls are unaffected.
     sub hi { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }
     # Map a register name to its low-byte form: "%eax" -> "%al",
     # "%esi" -> "%sil", "%r10d"/"%r10" -> "%r10b".  Names matching none of
     # the three patterns (e.g. "%ebp") are returned unchanged.
     # Declared without the former empty prototype, which contradicted the
     # one argument the sub consumes; existing &lo(...) calls are unaffected.
     sub lo {
         my $r=shift;
         $r =~ s/%[er]([a-d])x/%${1}l/;		# a/b/c/d registers
         $r =~ s/%[er]([sd]i)/%${1}l/;		# si/di
         $r =~ s/%(r[0-9]+)[d]?/%${1}b/;	# r8..r15, with or without "d" suffix
         $r;
     }
     # Map a 64-bit register name to its 32-bit form: "%rax" -> "%eax",
     # "%r8" -> "%r8d".  Intended for 64-bit names only.
     # Declared without the former empty prototype, which contradicted the
     # one argument the sub consumes; existing &LO(...) calls are unaffected.
     sub LO {
         my $r=shift;
         $r =~ s/%r([a-z]+)/%e${1}/;		# legacy registers
         $r =~ s/%r([0-9]+)/%r${1}d/;		# r8..r15
         $r;
     }
     # Append one ".long" line per argument to $code, with the 32-bit value
     # written twice on that line.  Processing stops at the first undefined
     # argument, as before.
     # Declared without the former empty prototype, which contradicted the
     # argument list the sub consumes; &-style calls are unaffected.
     sub _data_word {
         foreach my $w (@_) {
             last unless defined($w);
             $code.=sprintf(".long\t0x%08x,0x%08x\n",$w,$w);
         }
     }
     # Append a single ".long" line to $code holding every argument as a
     # zero-padded 32-bit hex constant, comma separated.
     # With no arguments nothing is emitted (previously a spurious
     # "0x00000000" word was written).  Declared without the former empty
     # prototype, which contradicted the argument list the sub consumes.
     sub data_word {
         return unless @_;
         $code.=".long\t".join(",",map { sprintf("0x%08x",$_) } @_)."\n";
     }
     82 
     # Append a single ".byte" line to $code holding the low 8 bits of every
     # argument as a two-digit hex constant, comma separated.
     # With no arguments nothing is emitted (previously a spurious "0x00"
     # byte was written).  Declared without the former empty prototype, which
     # contradicted the argument list the sub consumes.
     sub data_byte {
         return unless @_;
         $code.=".byte\t".join(",",map { sprintf("0x%02x",$_&0xff) } @_)."\n";
     }
     90 
     91 sub encvert()
     92 { my $t3="%r8d";	# zaps $inp!
        # Appends the code for one full encryption round ("vertical" layout:
        # all four output words are computed interleaved) to $code.
        # Each output word $t0..$t3 is assembled from four 32-bit table
        # accesses at byte offsets 0, 3, 2 and 1 of the 8-byte-scaled entry
        # ($sbox,index,8), indexed by bytes taken from $s0..$s3.  $key is
        # advanced by 16 and the next round key words at 0/4/8/12($key) are
        # combined with the temporaries back into $s0..$s3.
        # The fourth temporary lives in %r8d, so the emitted code clobbers $inp.
     93 
     94 $code.=<<___;
     95 	# favor 3-way issue Opteron pipeline...
     96 	movzb	`&lo("$s0")`,$acc0
     97 	movzb	`&lo("$s1")`,$acc1
     98 	movzb	`&lo("$s2")`,$acc2
     99 	mov	0($sbox,$acc0,8),$t0
    100 	mov	0($sbox,$acc1,8),$t1
    101 	mov	0($sbox,$acc2,8),$t2
    102 
    103 	movzb	`&hi("$s1")`,$acc0
    104 	movzb	`&hi("$s2")`,$acc1
    105 	movzb	`&lo("$s3")`,$acc2
    106 	xor	3($sbox,$acc0,8),$t0
    107 	xor	3($sbox,$acc1,8),$t1
    108 	mov	0($sbox,$acc2,8),$t3
    109 
    110 	movzb	`&hi("$s3")`,$acc0
    111 	shr	\$16,$s2
    112 	movzb	`&hi("$s0")`,$acc2
    113 	xor	3($sbox,$acc0,8),$t2
    114 	shr	\$16,$s3
    115 	xor	3($sbox,$acc2,8),$t3
    116 
    117 	shr	\$16,$s1
    118 	lea	16($key),$key
    119 	shr	\$16,$s0
    120 
    121 	movzb	`&lo("$s2")`,$acc0
    122 	movzb	`&lo("$s3")`,$acc1
    123 	movzb	`&lo("$s0")`,$acc2
    124 	xor	2($sbox,$acc0,8),$t0
    125 	xor	2($sbox,$acc1,8),$t1
    126 	xor	2($sbox,$acc2,8),$t2
    127 
    128 	movzb	`&hi("$s3")`,$acc0
    129 	movzb	`&hi("$s0")`,$acc1
    130 	movzb	`&lo("$s1")`,$acc2
    131 	xor	1($sbox,$acc0,8),$t0
    132 	xor	1($sbox,$acc1,8),$t1
    133 	xor	2($sbox,$acc2,8),$t3
    134 
    135 	mov	12($key),$s3
    136 	movzb	`&hi("$s1")`,$acc1
    137 	movzb	`&hi("$s2")`,$acc2
    138 	mov	0($key),$s0
    139 	xor	1($sbox,$acc1,8),$t2
    140 	xor	1($sbox,$acc2,8),$t3
    141 
    142 	mov	4($key),$s1
    143 	mov	8($key),$s2
    144 	xor	$t0,$s0
    145 	xor	$t1,$s1
    146 	xor	$t2,$s2
    147 	xor	$t3,$s3
    148 ___
    149 }
    150 
    151 sub enclastvert()
    152 { my $t3="%r8d";	# zaps $inp!
        # Appends the code for the final encryption round ("vertical" layout)
        # to $code.  Every output byte is picked out of the same 8-byte-scaled
        # table: the low byte with a movzb at offset 2, the others with 32-bit
        # loads at offset 0 or 2 masked down to a single byte lane
        # (0x0000ff00, 0x00ff0000, 0xff000000).  The result is combined with
        # the round key at 16+0..16+12($key); $key itself is not advanced.
        # The fourth temporary lives in %r8d, so the emitted code clobbers $inp.
    153 
    154 $code.=<<___;
    155 	movzb	`&lo("$s0")`,$acc0
    156 	movzb	`&lo("$s1")`,$acc1
    157 	movzb	`&lo("$s2")`,$acc2
    158 	movzb	2($sbox,$acc0,8),$t0
    159 	movzb	2($sbox,$acc1,8),$t1
    160 	movzb	2($sbox,$acc2,8),$t2
    161 
    162 	movzb	`&lo("$s3")`,$acc0
    163 	movzb	`&hi("$s1")`,$acc1
    164 	movzb	`&hi("$s2")`,$acc2
    165 	movzb	2($sbox,$acc0,8),$t3
    166 	mov	0($sbox,$acc1,8),$acc1	#$t0
    167 	mov	0($sbox,$acc2,8),$acc2	#$t1
    168 
    169 	and	\$0x0000ff00,$acc1
    170 	and	\$0x0000ff00,$acc2
    171 
    172 	xor	$acc1,$t0
    173 	xor	$acc2,$t1
    174 	shr	\$16,$s2
    175 
    176 	movzb	`&hi("$s3")`,$acc0
    177 	movzb	`&hi("$s0")`,$acc1
    178 	shr	\$16,$s3
    179 	mov	0($sbox,$acc0,8),$acc0	#$t2
    180 	mov	0($sbox,$acc1,8),$acc1	#$t3
    181 
    182 	and	\$0x0000ff00,$acc0
    183 	and	\$0x0000ff00,$acc1
    184 	shr	\$16,$s1
    185 	xor	$acc0,$t2
    186 	xor	$acc1,$t3
    187 	shr	\$16,$s0
    188 
    189 	movzb	`&lo("$s2")`,$acc0
    190 	movzb	`&lo("$s3")`,$acc1
    191 	movzb	`&lo("$s0")`,$acc2
    192 	mov	0($sbox,$acc0,8),$acc0	#$t0
    193 	mov	0($sbox,$acc1,8),$acc1	#$t1
    194 	mov	0($sbox,$acc2,8),$acc2	#$t2
    195 
    196 	and	\$0x00ff0000,$acc0
    197 	and	\$0x00ff0000,$acc1
    198 	and	\$0x00ff0000,$acc2
    199 
    200 	xor	$acc0,$t0
    201 	xor	$acc1,$t1
    202 	xor	$acc2,$t2
    203 
    204 	movzb	`&lo("$s1")`,$acc0
    205 	movzb	`&hi("$s3")`,$acc1
    206 	movzb	`&hi("$s0")`,$acc2
    207 	mov	0($sbox,$acc0,8),$acc0	#$t3
    208 	mov	2($sbox,$acc1,8),$acc1	#$t0
    209 	mov	2($sbox,$acc2,8),$acc2	#$t1
    210 
    211 	and	\$0x00ff0000,$acc0
    212 	and	\$0xff000000,$acc1
    213 	and	\$0xff000000,$acc2
    214 
    215 	xor	$acc0,$t3
    216 	xor	$acc1,$t0
    217 	xor	$acc2,$t1
    218 
    219 	movzb	`&hi("$s1")`,$acc0
    220 	movzb	`&hi("$s2")`,$acc1
    221 	mov	16+12($key),$s3
    222 	mov	2($sbox,$acc0,8),$acc0	#$t2
    223 	mov	2($sbox,$acc1,8),$acc1	#$t3
    224 	mov	16+0($key),$s0
    225 
    226 	and	\$0xff000000,$acc0
    227 	and	\$0xff000000,$acc1
    228 
    229 	xor	$acc0,$t2
    230 	xor	$acc1,$t3
    231 
    232 	mov	16+4($key),$s1
    233 	mov	16+8($key),$s2
    234 	xor	$t0,$s0
    235 	xor	$t1,$s1
    236 	xor	$t2,$s2
    237 	xor	$t3,$s3
    238 ___
    239 }
    240 
    # Emit the code computing output word $i of one full encryption round
    # ("horizontal" layout, used when $verticalspin is off).
    #   $i  - index of the output word, 0..3
    #   @s  - the four state register names, rotated so that $s[0] supplies
    #         the low byte, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
    # Words 0..2 are produced in $t0..$t2.  Word 3 is produced in $s[0]
    # itself, using $s[1..3] as scratch, after which $t0..$t2 are moved into
    # $s[1..3].  The round key word at 4*$i($key) is folded in; $key is
    # advanced by 16 while emitting word 0.
    # Declared without the former empty prototype, which contradicted the
    # five arguments the sub consumes; existing &encstep(...) calls are
    # unaffected.
    sub encstep {
        my ($i,@s) = @_;
        my $last = ($i==3);
        my ($tmp0,$tmp1,$tmp2) = $last ? @s[1..3] : ($acc0,$acc1,$acc2);
        my $out = ($t0,$t1,$t2,$s[0])[$i];

        $code.="	movzb	".&lo($s[0]).",$out\n";
        $code.="	mov	$s[2],$tmp1\n"		unless ($last);
        $code.="	lea	16($key),$key\n"	if ($i==0);

        $code.="	movzb	".&hi($s[1]).",$tmp0\n";
        $code.="	mov	0($sbox,$out,8),$out\n";

        $code.="	shr	\$16,$tmp1\n";
        $code.="	mov	$s[3],$tmp2\n"		unless ($last);
        $code.="	xor	3($sbox,$tmp0,8),$out\n";

        $code.="	movzb	".&lo($tmp1).",$tmp1\n";
        $code.="	shr	\$24,$tmp2\n";
        $code.="	xor	4*$i($key),$out\n";

        $code.="	xor	2($sbox,$tmp1,8),$out\n";
        $code.="	xor	1($sbox,$tmp2,8),$out\n";

        # After the last word, move the buffered words back into the state.
        if ($last) {
            $code.="	mov	$t0,$s[1]\n";
            $code.="	mov	$t1,$s[2]\n";
            $code.="	mov	$t2,$s[3]\n";
        }
        $code.="\n";
    }
    276 
    # Emit the code computing output word $i of the final encryption round
    # ("horizontal" layout, used when $verticalspin is off).
    #   $i  - index of the output word, 0..3
    #   @s  - the four state register names, rotated as for encstep
    # Each byte is fetched with a 32-bit table load and masked down to its
    # lane (0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000).  Words 0..2 are
    # produced in $t0..$t2; word 3 is produced in $s[0] with $s[1..3] as
    # scratch, after which $t0..$t2 are moved into $s[1..3].  No round key
    # is applied here.
    # Declared without the former empty prototype, which contradicted the
    # five arguments the sub consumes; existing &enclast(...) calls are
    # unaffected.
    sub enclast {
        my ($i,@s) = @_;
        my $last = ($i==3);
        my ($tmp0,$tmp1,$tmp2) = $last ? @s[1..3] : ($acc0,$acc1,$acc2);
        my $out = ($t0,$t1,$t2,$s[0])[$i];

        $code.="	movzb	".&lo($s[0]).",$out\n";
        $code.="	mov	$s[2],$tmp1\n"		unless ($last);

        $code.="	mov	2($sbox,$out,8),$out\n";
        $code.="	shr	\$16,$tmp1\n";
        $code.="	mov	$s[3],$tmp2\n"		unless ($last);

        $code.="	and	\$0x000000ff,$out\n";
        $code.="	movzb	".&hi($s[1]).",$tmp0\n";
        $code.="	movzb	".&lo($tmp1).",$tmp1\n";
        $code.="	shr	\$24,$tmp2\n";

        $code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
        $code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
        $code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

        $code.="	and	\$0x0000ff00,$tmp0\n";
        $code.="	and	\$0x00ff0000,$tmp1\n";
        $code.="	and	\$0xff000000,$tmp2\n";

        # The moves restoring $s[1..3] are interleaved with the final xors.
        $code.="	xor	$tmp0,$out\n";
        $code.="	mov	$t0,$s[1]\n"		if ($last);
        $code.="	xor	$tmp1,$out\n";
        $code.="	mov	$t1,$s[2]\n"		if ($last);
        $code.="	xor	$tmp2,$out\n";
        $code.="	mov	$t2,$s[3]\n"		if ($last);
        $code.="\n";
    }
    317 
        # Emit _x86_64_AES_encrypt.  The generated routine combines the state
        # in $s0..$s3 with the first round key, loads the round count from
        # 240($key), decrements it and runs .Lenc_loop once per remaining full
        # round, then falls through into the final round and returns.
        # $verticalspin selects the round bodies: encvert/enclastvert, or four
        # encstep/enclast calls with rotated state registers.  In the latter
        # case the last round key is applied by the extra xor sequence below.
    318 $code.=<<___;
    319 .type	_x86_64_AES_encrypt,\@abi-omnipotent
    320 .align	16
    321 _x86_64_AES_encrypt:
    322 	xor	0($key),$s0			# xor with key
    323 	xor	4($key),$s1
    324 	xor	8($key),$s2
    325 	xor	12($key),$s3
    326 
    327 	mov	240($key),$rnds			# load key->rounds
    328 	sub	\$1,$rnds
    329 	jmp	.Lenc_loop
    330 .align	16
    331 .Lenc_loop:
    332 ___
    333 	if ($verticalspin) { &encvert(); }
    334 	else {	&encstep(0,$s0,$s1,$s2,$s3);
    335 		&encstep(1,$s1,$s2,$s3,$s0);
    336 		&encstep(2,$s2,$s3,$s0,$s1);
    337 		&encstep(3,$s3,$s0,$s1,$s2);
    338 	}
    339 $code.=<<___;
    340 	sub	\$1,$rnds
    341 	jnz	.Lenc_loop
    342 ___
    343 	if ($verticalspin) { &enclastvert(); }
    344 	else {	&enclast(0,$s0,$s1,$s2,$s3);
    345 		&enclast(1,$s1,$s2,$s3,$s0);
    346 		&enclast(2,$s2,$s3,$s0,$s1);
    347 		&enclast(3,$s3,$s0,$s1,$s2);
    348 		$code.=<<___;
    349 		xor	16+0($key),$s0		# xor with key
    350 		xor	16+4($key),$s1
    351 		xor	16+8($key),$s2
    352 		xor	16+12($key),$s3
    353 ___
    354 	}
    355 $code.=<<___;
    356 	.byte	0xf3,0xc3			# rep ret
    357 .size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
    358 ___
    359 
    360 # it's possible to implement this by shifting tN by 8, filling least
    361 # significant byte with byte load and finally bswap-ing at the end,
    362 # but such partial register load kills Core 2...
        #
        # enccompactvert appends to $code the byte-substitution part of one
        # "compact" encryption round: every state byte is replaced through a
        # byte-wide table at ($sbox,index,1) and the results are shifted into
        # place (by 8, 16 and 24 bits) to form the new $s0..$s3.
        # Besides $t0..$t2 and $acc0..$acc2 it uses %r8d, %r9d and %r13d as
        # temporaries, so the emitted code clobbers $inp, $out and $rnds.
    363 sub enccompactvert()
    364 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    365 
    366 $code.=<<___;
    367 	movzb	`&lo("$s0")`,$t0
    368 	movzb	`&lo("$s1")`,$t1
    369 	movzb	`&lo("$s2")`,$t2
    370 	movzb	`&lo("$s3")`,$t3
    371 	movzb	`&hi("$s1")`,$acc0
    372 	movzb	`&hi("$s2")`,$acc1
    373 	shr	\$16,$s2
    374 	movzb	`&hi("$s3")`,$acc2
    375 	movzb	($sbox,$t0,1),$t0
    376 	movzb	($sbox,$t1,1),$t1
    377 	movzb	($sbox,$t2,1),$t2
    378 	movzb	($sbox,$t3,1),$t3
    379 
    380 	movzb	($sbox,$acc0,1),$t4	#$t0
    381 	movzb	`&hi("$s0")`,$acc0
    382 	movzb	($sbox,$acc1,1),$t5	#$t1
    383 	movzb	`&lo("$s2")`,$acc1
    384 	movzb	($sbox,$acc2,1),$acc2	#$t2
    385 	movzb	($sbox,$acc0,1),$acc0	#$t3
    386 
    387 	shl	\$8,$t4
    388 	shr	\$16,$s3
    389 	shl	\$8,$t5
    390 	xor	$t4,$t0
    391 	shr	\$16,$s0
    392 	movzb	`&lo("$s3")`,$t4
    393 	shr	\$16,$s1
    394 	xor	$t5,$t1
    395 	shl	\$8,$acc2
    396 	movzb	`&lo("$s0")`,$t5
    397 	movzb	($sbox,$acc1,1),$acc1	#$t0
    398 	xor	$acc2,$t2
    399 
    400 	shl	\$8,$acc0
    401 	movzb	`&lo("$s1")`,$acc2
    402 	shl	\$16,$acc1
    403 	xor	$acc0,$t3
    404 	movzb	($sbox,$t4,1),$t4	#$t1
    405 	movzb	`&hi("$s3")`,$acc0
    406 	movzb	($sbox,$t5,1),$t5	#$t2
    407 	xor	$acc1,$t0
    408 
    409 	shr	\$8,$s2
    410 	movzb	`&hi("$s0")`,$acc1
    411 	shl	\$16,$t4
    412 	shr	\$8,$s1
    413 	shl	\$16,$t5
    414 	xor	$t4,$t1
    415 	movzb	($sbox,$acc2,1),$acc2	#$t3
    416 	movzb	($sbox,$acc0,1),$acc0	#$t0
    417 	movzb	($sbox,$acc1,1),$acc1	#$t1
    418 	movzb	($sbox,$s2,1),$s3	#$t3
    419 	movzb	($sbox,$s1,1),$s2	#$t2
    420 
    421 	shl	\$16,$acc2
    422 	xor	$t5,$t2
    423 	shl	\$24,$acc0
    424 	xor	$acc2,$t3
    425 	shl	\$24,$acc1
    426 	xor	$acc0,$t0
    427 	shl	\$24,$s3
    428 	xor	$acc1,$t1
    429 	shl	\$24,$s2
    430 	mov	$t0,$s0
    431 	mov	$t1,$s1
    432 	xor	$t2,$s2
    433 	xor	$t3,$s3
    434 ___
    435 }
    436 
    437 sub enctransform_ref()
    438 { my $sn = shift;
    439   my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
        # Appends the column-mixing transform for a single state word to
        # $code, operating in place on the 32-bit register named by $sn.
        # $r2 receives the per-byte doubling of $sn: bytes are shifted left by
        # one (lea + 0xfefefefe mask) and 0x1b is folded into every byte whose
        # top bit (0x80) was set.  The result is then built from $sn, $r2 and
        # rotated copies of the original word.
        # Uses %r8d, %r9d and %r13d as scratch.
        # NOTE(review): declared with an empty prototype although it reads one
        # argument, so it can only be invoked &-style; no call is visible in
        # this part of the file.
    440 
    441 $code.=<<___;
    442 	mov	$sn,$acc
    443 	and	\$0x80808080,$acc
    444 	mov	$acc,$tmp
    445 	shr	\$7,$tmp
    446 	lea	($sn,$sn),$r2
    447 	sub	$tmp,$acc
    448 	and	\$0xfefefefe,$r2
    449 	and	\$0x1b1b1b1b,$acc
    450 	mov	$sn,$tmp
    451 	xor	$acc,$r2
    452 
    453 	xor	$r2,$sn
    454 	rol	\$24,$sn
    455 	xor	$r2,$sn
    456 	ror	\$16,$tmp
    457 	xor	$tmp,$sn
    458 	ror	\$8,$tmp
    459 	xor	$tmp,$sn
    460 ___
    461 }
    462 
    463 # unlike decrypt case it does not pay off to parallelize enctransform
        #
        # enctransform appends to $code the column-mixing transform for all
        # four state words $s0..$s3.  The words are handled as two interleaved
        # pairs ($s0/$s1, then $s2/$s3; the second pair's set-up instructions
        # carry an extra leading space), each following the same steps as
        # enctransform_ref.  Towards the end, loads from 0, 64, 128 and
        # 192($sbox) are mixed in to prefetch the table for the next round.
        # Uses $t0..$t2, $acc0..$acc2, %r8d and %r9d as scratch.
    464 sub enctransform()
    465 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
    466 
    467 $code.=<<___;
    468 	mov	\$0x80808080,$t0
    469 	mov	\$0x80808080,$t1
    470 	and	$s0,$t0
    471 	and	$s1,$t1
    472 	mov	$t0,$acc0
    473 	mov	$t1,$acc1
    474 	shr	\$7,$t0
    475 	lea	($s0,$s0),$r20
    476 	shr	\$7,$t1
    477 	lea	($s1,$s1),$r21
    478 	sub	$t0,$acc0
    479 	sub	$t1,$acc1
    480 	and	\$0xfefefefe,$r20
    481 	and	\$0xfefefefe,$r21
    482 	and	\$0x1b1b1b1b,$acc0
    483 	and	\$0x1b1b1b1b,$acc1
    484 	mov	$s0,$t0
    485 	mov	$s1,$t1
    486 	xor	$acc0,$r20
    487 	xor	$acc1,$r21
    488 
    489 	xor	$r20,$s0
    490 	xor	$r21,$s1
    491 	 mov	\$0x80808080,$t2
    492 	rol	\$24,$s0
    493 	 mov	\$0x80808080,$t3
    494 	rol	\$24,$s1
    495 	 and	$s2,$t2
    496 	 and	$s3,$t3
    497 	xor	$r20,$s0
    498 	xor	$r21,$s1
    499 	 mov	$t2,$acc0
    500 	ror	\$16,$t0
    501 	 mov	$t3,$acc1
    502 	ror	\$16,$t1
    503 	 lea	($s2,$s2),$r20
    504 	 shr	\$7,$t2
    505 	xor	$t0,$s0
    506 	 shr	\$7,$t3
    507 	xor	$t1,$s1
    508 	ror	\$8,$t0
    509 	 lea	($s3,$s3),$r21
    510 	ror	\$8,$t1
    511 	 sub	$t2,$acc0
    512 	 sub	$t3,$acc1
    513 	xor	$t0,$s0
    514 	xor	$t1,$s1
    515 
    516 	and	\$0xfefefefe,$r20
    517 	and	\$0xfefefefe,$r21
    518 	and	\$0x1b1b1b1b,$acc0
    519 	and	\$0x1b1b1b1b,$acc1
    520 	mov	$s2,$t2
    521 	mov	$s3,$t3
    522 	xor	$acc0,$r20
    523 	xor	$acc1,$r21
    524 
    525 	ror	\$16,$t2
    526 	xor	$r20,$s2
    527 	ror	\$16,$t3
    528 	xor	$r21,$s3
    529 	rol	\$24,$s2
    530 	mov	0($sbox),$acc0			# prefetch Te4
    531 	rol	\$24,$s3
    532 	xor	$r20,$s2
    533 	mov	64($sbox),$acc1
    534 	xor	$r21,$s3
    535 	mov	128($sbox),$r20
    536 	xor	$t2,$s2
    537 	ror	\$8,$t2
    538 	xor	$t3,$s3
    539 	ror	\$8,$t3
    540 	xor	$t2,$s2
    541 	mov	192($sbox),$r21
    542 	xor	$t3,$s3
    543 ___
    544 }
    545 
        # Emit _x86_64_AES_encrypt_compact.  The generated routine first
        # touches the table at 32-byte steps (results are discarded; this is
        # the "prefetch Te4" below), then loops: combine the state with the
        # round key at $key, advance $key by 16, substitute bytes
        # (enccompactvert), and, unless $key has reached the value stored at
        # 16(%rsp), apply the column transform (enctransform) and repeat.
        # On exit the last round key is applied.
        # NOTE(review): 16(%rsp) is presumably the caller's "end of key
        # schedule" slot, shifted by the pushed return address - confirm
        # against the caller's frame layout.
    546 $code.=<<___;
    547 .type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
    548 .align	16
    549 _x86_64_AES_encrypt_compact:
    550 	lea	128($sbox),$inp			# size optimization
    551 	mov	0-128($inp),$acc1		# prefetch Te4
    552 	mov	32-128($inp),$acc2
    553 	mov	64-128($inp),$t0
    554 	mov	96-128($inp),$t1
    555 	mov	128-128($inp),$acc1
    556 	mov	160-128($inp),$acc2
    557 	mov	192-128($inp),$t0
    558 	mov	224-128($inp),$t1
    559 	jmp	.Lenc_loop_compact
    560 .align	16
    561 .Lenc_loop_compact:
    562 		xor	0($key),$s0		# xor with key
    563 		xor	4($key),$s1
    564 		xor	8($key),$s2
    565 		xor	12($key),$s3
    566 		lea	16($key),$key
    567 ___
    568 		&enccompactvert();
    569 $code.=<<___;
    570 		cmp	16(%rsp),$key
    571 		je	.Lenc_compact_done
    572 ___
    573 		&enctransform();
    574 $code.=<<___;
    575 	jmp	.Lenc_loop_compact
    576 .align	16
    577 .Lenc_compact_done:
    578 	xor	0($key),$s0
    579 	xor	4($key),$s1
    580 	xor	8($key),$s2
    581 	xor	12($key),$s3
    582 	.byte	0xf3,0xc3			# rep ret
    583 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
    584 ___
    585 
    586 # void asm_AES_encrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Emits the public entry point.  The generated function:
        #  - saves %rbx, %rbp and %r12-%r15;
        #  - builds a 64-byte aligned, 32-byte frame whose position is derived
        #    from the key address (mask 0x3c0), keeping the output pointer at
        #    16(%rsp) and the original stack pointer at 24(%rsp);
        #  - loads the four input words, and stores the key pointer at (%rsp)
        #    and key + 16*rounds ("end of key schedule") at 8(%rsp);
        #  - points $sbox at one of the table copies starting at
        #    .LAES_Te+2048, selected in 256-byte steps (mask 0x300) from the
        #    frame address;
        #  - calls _x86_64_AES_encrypt_compact, writes the four result words to
        #    the output buffer and restores the saved registers and stack.
    587 $code.=<<___;
    588 .align	16
    589 .globl	asm_AES_encrypt
    590 .type	asm_AES_encrypt,\@function,3
    591 .hidden	asm_AES_encrypt
    592 asm_AES_encrypt:
    593 	push	%rbx
    594 	push	%rbp
    595 	push	%r12
    596 	push	%r13
    597 	push	%r14
    598 	push	%r15
    599 
    600 	# allocate frame "above" key schedule
    601 	mov	%rsp,%r10
    602 	lea	-63(%rdx),%rcx	# %rdx is key argument
    603 	and	\$-64,%rsp
    604 	sub	%rsp,%rcx
    605 	neg	%rcx
    606 	and	\$0x3c0,%rcx
    607 	sub	%rcx,%rsp
    608 	sub	\$32,%rsp
    609 
    610 	mov	%rsi,16(%rsp)	# save out
    611 	mov	%r10,24(%rsp)	# save real stack pointer
    612 .Lenc_prologue:
    613 
    614 	mov	%rdx,$key
    615 	mov	240($key),$rnds	# load rounds
    616 
    617 	mov	0(%rdi),$s0	# load input vector
    618 	mov	4(%rdi),$s1
    619 	mov	8(%rdi),$s2
    620 	mov	12(%rdi),$s3
    621 
    622 	shl	\$4,$rnds
    623 	lea	($key,$rnds),%rbp
    624 	mov	$key,(%rsp)	# key schedule
    625 	mov	%rbp,8(%rsp)	# end of key schedule
    626 
    627 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
    628 	lea	.LAES_Te+2048(%rip),$sbox
    629 	lea	768(%rsp),%rbp
    630 	sub	$sbox,%rbp
    631 	and	\$0x300,%rbp
    632 	lea	($sbox,%rbp),$sbox
    633 
    634 	call	_x86_64_AES_encrypt_compact
    635 
    636 	mov	16(%rsp),$out	# restore out
    637 	mov	24(%rsp),%rsi	# restore saved stack pointer
    638 	mov	$s0,0($out)	# write output vector
    639 	mov	$s1,4($out)
    640 	mov	$s2,8($out)
    641 	mov	$s3,12($out)
    642 
    643 	mov	(%rsi),%r15
    644 	mov	8(%rsi),%r14
    645 	mov	16(%rsi),%r13
    646 	mov	24(%rsi),%r12
    647 	mov	32(%rsi),%rbp
    648 	mov	40(%rsi),%rbx
    649 	lea	48(%rsi),%rsp
    650 .Lenc_epilogue:
    651 	ret
    652 .size	asm_AES_encrypt,.-asm_AES_encrypt
    653 ___
    654 
    655 #------------------------------------------------------------------#
    656 
    657 sub decvert()
    658 { my $t3="%r8d";	# zaps $inp!
        # Appends the code for one full decryption round ("vertical" layout)
        # to $code.  It mirrors encvert: each output word $t0..$t3 is
        # assembled from four 32-bit table accesses at byte offsets 0, 3, 2
        # and 1 of ($sbox,index,8), but the source bytes are taken from the
        # state words in the opposite rotation.  $key is advanced by 16 and
        # the round key words at 0/4/8/12($key) are combined with the
        # temporaries back into $s0..$s3.
        # The fourth temporary lives in %r8d, so the emitted code clobbers $inp.
    659 
    660 $code.=<<___;
    661 	# favor 3-way issue Opteron pipeline...
    662 	movzb	`&lo("$s0")`,$acc0
    663 	movzb	`&lo("$s1")`,$acc1
    664 	movzb	`&lo("$s2")`,$acc2
    665 	mov	0($sbox,$acc0,8),$t0
    666 	mov	0($sbox,$acc1,8),$t1
    667 	mov	0($sbox,$acc2,8),$t2
    668 
    669 	movzb	`&hi("$s3")`,$acc0
    670 	movzb	`&hi("$s0")`,$acc1
    671 	movzb	`&lo("$s3")`,$acc2
    672 	xor	3($sbox,$acc0,8),$t0
    673 	xor	3($sbox,$acc1,8),$t1
    674 	mov	0($sbox,$acc2,8),$t3
    675 
    676 	movzb	`&hi("$s1")`,$acc0
    677 	shr	\$16,$s0
    678 	movzb	`&hi("$s2")`,$acc2
    679 	xor	3($sbox,$acc0,8),$t2
    680 	shr	\$16,$s3
    681 	xor	3($sbox,$acc2,8),$t3
    682 
    683 	shr	\$16,$s1
    684 	lea	16($key),$key
    685 	shr	\$16,$s2
    686 
    687 	movzb	`&lo("$s2")`,$acc0
    688 	movzb	`&lo("$s3")`,$acc1
    689 	movzb	`&lo("$s0")`,$acc2
    690 	xor	2($sbox,$acc0,8),$t0
    691 	xor	2($sbox,$acc1,8),$t1
    692 	xor	2($sbox,$acc2,8),$t2
    693 
    694 	movzb	`&hi("$s1")`,$acc0
    695 	movzb	`&hi("$s2")`,$acc1
    696 	movzb	`&lo("$s1")`,$acc2
    697 	xor	1($sbox,$acc0,8),$t0
    698 	xor	1($sbox,$acc1,8),$t1
    699 	xor	2($sbox,$acc2,8),$t3
    700 
    701 	movzb	`&hi("$s3")`,$acc0
    702 	mov	12($key),$s3
    703 	movzb	`&hi("$s0")`,$acc2
    704 	xor	1($sbox,$acc0,8),$t2
    705 	mov	0($key),$s0
    706 	xor	1($sbox,$acc2,8),$t3
    707 
    708 	xor	$t0,$s0
    709 	mov	4($key),$s1
    710 	mov	8($key),$s2
    711 	xor	$t2,$s2
    712 	xor	$t1,$s1
    713 	xor	$t3,$s3
    714 ___
    715 }
    716 
    717 sub declastvert()
    718 { my $t3="%r8d";	# zaps $inp!
        # Appends the code for the final decryption round ("vertical" layout)
        # to $code.  $sbox is temporarily moved forward by 2048 so that the
        # byte-wide table there can be addressed as ($sbox,index,1); each
        # output byte is looked up with movzb and shifted into its lane
        # (8, 16 or 24 bits).  The result is combined with the round key at
        # 16+0..16+12($key) and $sbox is moved back by 2048 before the end.
        # The fourth temporary lives in %r8d, so the emitted code clobbers $inp.
    719 
    720 $code.=<<___;
    721 	lea	2048($sbox),$sbox	# size optimization
    722 	movzb	`&lo("$s0")`,$acc0
    723 	movzb	`&lo("$s1")`,$acc1
    724 	movzb	`&lo("$s2")`,$acc2
    725 	movzb	($sbox,$acc0,1),$t0
    726 	movzb	($sbox,$acc1,1),$t1
    727 	movzb	($sbox,$acc2,1),$t2
    728 
    729 	movzb	`&lo("$s3")`,$acc0
    730 	movzb	`&hi("$s3")`,$acc1
    731 	movzb	`&hi("$s0")`,$acc2
    732 	movzb	($sbox,$acc0,1),$t3
    733 	movzb	($sbox,$acc1,1),$acc1	#$t0
    734 	movzb	($sbox,$acc2,1),$acc2	#$t1
    735 
    736 	shl	\$8,$acc1
    737 	shl	\$8,$acc2
    738 
    739 	xor	$acc1,$t0
    740 	xor	$acc2,$t1
    741 	shr	\$16,$s3
    742 
    743 	movzb	`&hi("$s1")`,$acc0
    744 	movzb	`&hi("$s2")`,$acc1
    745 	shr	\$16,$s0
    746 	movzb	($sbox,$acc0,1),$acc0	#$t2
    747 	movzb	($sbox,$acc1,1),$acc1	#$t3
    748 
    749 	shl	\$8,$acc0
    750 	shl	\$8,$acc1
    751 	shr	\$16,$s1
    752 	xor	$acc0,$t2
    753 	xor	$acc1,$t3
    754 	shr	\$16,$s2
    755 
    756 	movzb	`&lo("$s2")`,$acc0
    757 	movzb	`&lo("$s3")`,$acc1
    758 	movzb	`&lo("$s0")`,$acc2
    759 	movzb	($sbox,$acc0,1),$acc0	#$t0
    760 	movzb	($sbox,$acc1,1),$acc1	#$t1
    761 	movzb	($sbox,$acc2,1),$acc2	#$t2
    762 
    763 	shl	\$16,$acc0
    764 	shl	\$16,$acc1
    765 	shl	\$16,$acc2
    766 
    767 	xor	$acc0,$t0
    768 	xor	$acc1,$t1
    769 	xor	$acc2,$t2
    770 
    771 	movzb	`&lo("$s1")`,$acc0
    772 	movzb	`&hi("$s1")`,$acc1
    773 	movzb	`&hi("$s2")`,$acc2
    774 	movzb	($sbox,$acc0,1),$acc0	#$t3
    775 	movzb	($sbox,$acc1,1),$acc1	#$t0
    776 	movzb	($sbox,$acc2,1),$acc2	#$t1
    777 
    778 	shl	\$16,$acc0
    779 	shl	\$24,$acc1
    780 	shl	\$24,$acc2
    781 
    782 	xor	$acc0,$t3
    783 	xor	$acc1,$t0
    784 	xor	$acc2,$t1
    785 
    786 	movzb	`&hi("$s3")`,$acc0
    787 	movzb	`&hi("$s0")`,$acc1
    788 	mov	16+12($key),$s3
    789 	movzb	($sbox,$acc0,1),$acc0	#$t2
    790 	movzb	($sbox,$acc1,1),$acc1	#$t3
    791 	mov	16+0($key),$s0
    792 
    793 	shl	\$24,$acc0
    794 	shl	\$24,$acc1
    795 
    796 	xor	$acc0,$t2
    797 	xor	$acc1,$t3
    798 
    799 	mov	16+4($key),$s1
    800 	mov	16+8($key),$s2
    801 	lea	-2048($sbox),$sbox
    802 	xor	$t0,$s0
    803 	xor	$t1,$s1
    804 	xor	$t2,$s2
    805 	xor	$t3,$s3
    806 ___
    807 }
    808 
    # Emit the code computing output word $i of one full decryption round
    # ("horizontal" layout, used when $verticalspin is off).
    #   $i  - index of the output word, 0..3
    #   @s  - the four state register names, rotated so that $s[0] supplies
    #         the low byte, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
    # Words 0..2 are produced in $t0..$t2.  Word 3 is produced in $s[0]
    # itself, using $s[1..3] as scratch, after which $t2, $t1 and $t0 are
    # moved into $s[1], $s[2] and $s[3] respectively.  No round key is
    # applied and $key is not advanced here.
    # Declared without the former empty prototype, which contradicted the
    # five arguments the sub consumes; existing &decstep(...) calls are
    # unaffected.
    sub decstep {
        my ($i,@s) = @_;
        my $last = ($i==3);
        my ($tmp0,$tmp1,$tmp2) = $last ? @s[1..3] : ($acc0,$acc1,$acc2);
        my $out = ($t0,$t1,$t2,$s[0])[$i];

        $code.="	mov	$s[0],$out\n"		unless ($last);
        $code.="	mov	$s[2],$tmp1\n"		unless ($last);
        $code.="	and	\$0xFF,$out\n";

        $code.="	mov	0($sbox,$out,8),$out\n";
        $code.="	shr	\$16,$tmp1\n";
        $code.="	mov	$s[3],$tmp2\n"		unless ($last);

        $code.="	movzb	".&hi($s[1]).",$tmp0\n";
        $code.="	and	\$0xFF,$tmp1\n";
        $code.="	shr	\$24,$tmp2\n";

        $code.="	xor	3($sbox,$tmp0,8),$out\n";
        $code.="	xor	2($sbox,$tmp1,8),$out\n";
        $code.="	xor	1($sbox,$tmp2,8),$out\n";

        # After the last word, move the buffered words back into the state.
        if ($last) {
            $code.="	mov	$t2,$s[1]\n";
            $code.="	mov	$t1,$s[2]\n";
            $code.="	mov	$t0,$s[3]\n";
        }
        $code.="\n";
    }
    840 
    # Emit the code computing output word $i of the final decryption round
    # ("horizontal" layout, used when $verticalspin is off).
    #   $i  - index of the output word, 0..3
    #   @s  - the four state register names, rotated as for decstep
    # Every byte is looked up in the byte-wide table at 2048($sbox,index,1)
    # and shifted into its lane (8, 16 or 24 bits).  Words 0..2 are produced
    # in $t0..$t2; word 3 is produced in $s[0] with $s[1..3] as scratch,
    # after which $t2, $t1 and $t0 are moved into $s[1], $s[2] and $s[3].
    # No round key is applied here.
    # Declared without the former empty prototype, which contradicted the
    # five arguments the sub consumes; existing &declast(...) calls are
    # unaffected.
    sub declast {
        my ($i,@s) = @_;
        my $last = ($i==3);
        my ($tmp0,$tmp1,$tmp2) = $last ? @s[1..3] : ($acc0,$acc1,$acc2);
        my $out = ($t0,$t1,$t2,$s[0])[$i];

        $code.="	mov	$s[0],$out\n"		unless ($last);
        $code.="	mov	$s[2],$tmp1\n"		unless ($last);
        $code.="	and	\$0xFF,$out\n";

        $code.="	movzb	2048($sbox,$out,1),$out\n";
        $code.="	shr	\$16,$tmp1\n";
        $code.="	mov	$s[3],$tmp2\n"		unless ($last);

        $code.="	movzb	".&hi($s[1]).",$tmp0\n";
        $code.="	and	\$0xFF,$tmp1\n";
        $code.="	shr	\$24,$tmp2\n";

        $code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
        $code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
        $code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";

        $code.="	shl	\$8,$tmp0\n";
        $code.="	shl	\$16,$tmp1\n";
        $code.="	shl	\$24,$tmp2\n";

        # The moves restoring $s[1..3] are interleaved with the final xors.
        $code.="	xor	$tmp0,$out\n";
        $code.="	mov	$t2,$s[1]\n"		if ($last);
        $code.="	xor	$tmp1,$out\n";
        $code.="	mov	$t1,$s[2]\n"		if ($last);
        $code.="	xor	$tmp2,$out\n";
        $code.="	mov	$t0,$s[3]\n"		if ($last);
        $code.="\n";
    }
    879 
        # Emit _x86_64_AES_decrypt.  The generated routine combines the state
        # in $s0..$s3 with the first round key, loads the round count from
        # 240($key), decrements it and runs .Ldec_loop once per remaining full
        # round, then falls through into the final round and returns.
        # $verticalspin selects the round bodies: decvert/declastvert, or four
        # decstep/declast calls with rotated state registers.  The step-wise
        # bodies do not touch the key, so in that case $key is advanced and
        # the round key applied by the extra sequences emitted here.
    880 $code.=<<___;
    881 .type	_x86_64_AES_decrypt,\@abi-omnipotent
    882 .align	16
    883 _x86_64_AES_decrypt:
    884 	xor	0($key),$s0			# xor with key
    885 	xor	4($key),$s1
    886 	xor	8($key),$s2
    887 	xor	12($key),$s3
    888 
    889 	mov	240($key),$rnds			# load key->rounds
    890 	sub	\$1,$rnds
    891 	jmp	.Ldec_loop
    892 .align	16
    893 .Ldec_loop:
    894 ___
    895 	if ($verticalspin) { &decvert(); }
    896 	else {	&decstep(0,$s0,$s3,$s2,$s1);
    897 		&decstep(1,$s1,$s0,$s3,$s2);
    898 		&decstep(2,$s2,$s1,$s0,$s3);
    899 		&decstep(3,$s3,$s2,$s1,$s0);
    900 		$code.=<<___;
    901 		lea	16($key),$key
    902 		xor	0($key),$s0			# xor with key
    903 		xor	4($key),$s1
    904 		xor	8($key),$s2
    905 		xor	12($key),$s3
    906 ___
    907 	}
    908 $code.=<<___;
    909 	sub	\$1,$rnds
    910 	jnz	.Ldec_loop
    911 ___
    912 	if ($verticalspin) { &declastvert(); }
    913 	else {	&declast(0,$s0,$s3,$s2,$s1);
    914 		&declast(1,$s1,$s0,$s3,$s2);
    915 		&declast(2,$s2,$s1,$s0,$s3);
    916 		&declast(3,$s3,$s2,$s1,$s0);
    917 		$code.=<<___;
    918 		xor	16+0($key),$s0			# xor with key
    919 		xor	16+4($key),$s1
    920 		xor	16+8($key),$s2
    921 		xor	16+12($key),$s3
    922 ___
    923 	}
    924 $code.=<<___;
    925 	.byte	0xf3,0xc3			# rep ret
    926 .size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
    927 ___
    928 
    929 sub deccompactvert()
    930 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
        # Appends to $code the byte-substitution part of one "compact"
        # decryption round: every state byte is replaced through a byte-wide
        # table at ($sbox,index,1) and the results are shifted into place
        # (by 8, 16 and 24 bits) to form the new $s0..$s3.  It mirrors
        # enccompactvert with the source bytes taken in the opposite rotation.
        # Besides $t0..$t2 and $acc0..$acc2 it uses %r8d, %r9d and %r13d as
        # temporaries, so the emitted code clobbers $inp, $out and $rnds.
    931 
    932 $code.=<<___;
    933 	movzb	`&lo("$s0")`,$t0
    934 	movzb	`&lo("$s1")`,$t1
    935 	movzb	`&lo("$s2")`,$t2
    936 	movzb	`&lo("$s3")`,$t3
    937 	movzb	`&hi("$s3")`,$acc0
    938 	movzb	`&hi("$s0")`,$acc1
    939 	shr	\$16,$s3
    940 	movzb	`&hi("$s1")`,$acc2
    941 	movzb	($sbox,$t0,1),$t0
    942 	movzb	($sbox,$t1,1),$t1
    943 	movzb	($sbox,$t2,1),$t2
    944 	movzb	($sbox,$t3,1),$t3
    945 
    946 	movzb	($sbox,$acc0,1),$t4	#$t0
    947 	movzb	`&hi("$s2")`,$acc0
    948 	movzb	($sbox,$acc1,1),$t5	#$t1
    949 	movzb	($sbox,$acc2,1),$acc2	#$t2
    950 	movzb	($sbox,$acc0,1),$acc0	#$t3
    951 
    952 	shr	\$16,$s2
    953 	shl	\$8,$t5
    954 	shl	\$8,$t4
    955 	movzb	`&lo("$s2")`,$acc1
    956 	shr	\$16,$s0
    957 	xor	$t4,$t0
    958 	shr	\$16,$s1
    959 	movzb	`&lo("$s3")`,$t4
    960 
    961 	shl	\$8,$acc2
    962 	xor	$t5,$t1
    963 	shl	\$8,$acc0
    964 	movzb	`&lo("$s0")`,$t5
    965 	movzb	($sbox,$acc1,1),$acc1	#$t0
    966 	xor	$acc2,$t2
    967 	movzb	`&lo("$s1")`,$acc2
    968 
    969 	shl	\$16,$acc1
    970 	xor	$acc0,$t3
    971 	movzb	($sbox,$t4,1),$t4	#$t1
    972 	movzb	`&hi("$s1")`,$acc0
    973 	movzb	($sbox,$acc2,1),$acc2	#$t3
    974 	xor	$acc1,$t0
    975 	movzb	($sbox,$t5,1),$t5	#$t2
    976 	movzb	`&hi("$s2")`,$acc1
    977 
    978 	shl	\$16,$acc2
    979 	shl	\$16,$t4
    980 	shl	\$16,$t5
    981 	xor	$acc2,$t3
    982 	movzb	`&hi("$s3")`,$acc2
    983 	xor	$t4,$t1
    984 	shr	\$8,$s0
    985 	xor	$t5,$t2
    986 
    987 	movzb	($sbox,$acc0,1),$acc0	#$t0
    988 	movzb	($sbox,$acc1,1),$s1	#$t1
    989 	movzb	($sbox,$acc2,1),$s2	#$t2
    990 	movzb	($sbox,$s0,1),$s3	#$t3
    991 
    992 	mov	$t0,$s0
    993 	shl	\$24,$acc0
    994 	shl	\$24,$s1
    995 	shl	\$24,$s2
    996 	xor	$acc0,$s0
    997 	shl	\$24,$s3
    998 	xor	$t1,$s1
    999 	xor	$t2,$s2
   1000 	xor	$t3,$s3
   1001 ___
   1002 }
   1003 
   1004 # parallelized version! input is pair of 64-bit values: %rax=s1.s0
   1005 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
   1006 # %ecx=s2 and %edx=s3.
        #
        # dectransform appends the inverse column transform to $code.  Two
        # state words are processed per 64-bit register, in two parallel lanes
        # (names ending in 0 and in 8).  tp2, tp4 and tp8 are obtained by
        # repeated per-byte doubling using the $mask80/$maskfe/$mask1b
        # registers; the emitted code only reads these masks, so the caller is
        # expected to have loaded them.  The pieces are then combined with 8-,
        # 24- and 16-bit rotations of the 32-bit halves.
        #   $prefetch - when true, loads from 0, 64, 128, 192 and 256($sbox)
        #               are interleaved into the tail; they overwrite
        #               $mask80, $maskfe, $mask1b, %r10 and %r13.
        # The local $acc0 ("%rbx") shadows the file-level $acc0 inside this
        # sub.  The backquoted fragments are left in the text as written here.
        # NOTE(review): presumably they are evaluated by a later pass outside
        # this view - confirm.
        # NOTE(review): declared with an empty prototype although it reads one
        # argument, so it can only be invoked &-style.
   1007 sub dectransform()
   1008 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
   1009   my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
   1010   my $prefetch = shift;
   1011 
   1012 $code.=<<___;
   1013 	mov	$mask80,$tp40
   1014 	mov	$mask80,$tp48
   1015 	and	$tp10,$tp40
   1016 	and	$tp18,$tp48
   1017 	mov	$tp40,$acc0
   1018 	mov	$tp48,$acc8
   1019 	shr	\$7,$tp40
   1020 	lea	($tp10,$tp10),$tp20
   1021 	shr	\$7,$tp48
   1022 	lea	($tp18,$tp18),$tp28
   1023 	sub	$tp40,$acc0
   1024 	sub	$tp48,$acc8
   1025 	and	$maskfe,$tp20
   1026 	and	$maskfe,$tp28
   1027 	and	$mask1b,$acc0
   1028 	and	$mask1b,$acc8
   1029 	xor	$acc0,$tp20
   1030 	xor	$acc8,$tp28
   1031 	mov	$mask80,$tp80
   1032 	mov	$mask80,$tp88
   1033 
   1034 	and	$tp20,$tp80
   1035 	and	$tp28,$tp88
   1036 	mov	$tp80,$acc0
   1037 	mov	$tp88,$acc8
   1038 	shr	\$7,$tp80
   1039 	lea	($tp20,$tp20),$tp40
   1040 	shr	\$7,$tp88
   1041 	lea	($tp28,$tp28),$tp48
   1042 	sub	$tp80,$acc0
   1043 	sub	$tp88,$acc8
   1044 	and	$maskfe,$tp40
   1045 	and	$maskfe,$tp48
   1046 	and	$mask1b,$acc0
   1047 	and	$mask1b,$acc8
   1048 	xor	$acc0,$tp40
   1049 	xor	$acc8,$tp48
   1050 	mov	$mask80,$tp80
   1051 	mov	$mask80,$tp88
   1052 
   1053 	and	$tp40,$tp80
   1054 	and	$tp48,$tp88
   1055 	mov	$tp80,$acc0
   1056 	mov	$tp88,$acc8
   1057 	shr	\$7,$tp80
   1058 	 xor	$tp10,$tp20		# tp2^=tp1
   1059 	shr	\$7,$tp88
   1060 	 xor	$tp18,$tp28		# tp2^=tp1
   1061 	sub	$tp80,$acc0
   1062 	sub	$tp88,$acc8
   1063 	lea	($tp40,$tp40),$tp80
   1064 	lea	($tp48,$tp48),$tp88
   1065 	 xor	$tp10,$tp40		# tp4^=tp1
   1066 	 xor	$tp18,$tp48		# tp4^=tp1
   1067 	and	$maskfe,$tp80
   1068 	and	$maskfe,$tp88
   1069 	and	$mask1b,$acc0
   1070 	and	$mask1b,$acc8
   1071 	xor	$acc0,$tp80
   1072 	xor	$acc8,$tp88
   1073 
   1074 	xor	$tp80,$tp10		# tp1^=tp8
   1075 	xor	$tp88,$tp18		# tp1^=tp8
   1076 	xor	$tp80,$tp20		# tp2^tp1^=tp8
   1077 	xor	$tp88,$tp28		# tp2^tp1^=tp8
   1078 	mov	$tp10,$acc0
   1079 	mov	$tp18,$acc8
   1080 	xor	$tp80,$tp40		# tp4^tp1^=tp8
   1081 	shr	\$32,$acc0
   1082 	xor	$tp88,$tp48		# tp4^tp1^=tp8
   1083 	shr	\$32,$acc8
   1084 	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
   1085 	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
   1086 	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
   1087 	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
   1088 	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1089 	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
   1090 	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1091 
   1092 	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
   1093 	xor	`&LO("$tp80")`,`&LO("$tp10")`
   1094 	shr	\$32,$tp80
   1095 	xor	`&LO("$tp88")`,`&LO("$tp18")`
   1096 	shr	\$32,$tp88
   1097 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1098 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1099 
   1100 	mov	$tp20,$tp80
   1101 	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
   1102 	mov	$tp28,$tp88
   1103 	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
   1104 	shr	\$32,$tp80
   1105 	xor	`&LO("$tp20")`,`&LO("$tp10")`
   1106 	shr	\$32,$tp88
   1107 	xor	`&LO("$tp28")`,`&LO("$tp18")`
   1108 	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
   1109 	mov	$tp40,$tp20
   1110 	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
   1111 	mov	$tp48,$tp28
   1112 	shr	\$32,$tp20
   1113 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1114 	shr	\$32,$tp28
   1115 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1116 
   1117 	`"mov	0($sbox),$mask80"	if ($prefetch)`
   1118 	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
   1119 	`"mov	64($sbox),$maskfe"	if ($prefetch)`
   1120 	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
   1121 	`"mov	128($sbox),$mask1b"	if ($prefetch)`
   1122 	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
   1123 	`"mov	192($sbox),$tp80"	if ($prefetch)`
   1124 	xor	`&LO("$tp40")`,`&LO("$tp10")`
   1125 	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
   1126 	xor	`&LO("$tp48")`,`&LO("$tp18")`
   1127 	`"mov	256($sbox),$tp88"	if ($prefetch)`
   1128 	xor	`&LO("$tp20")`,`&LO("$acc0")`
   1129 	xor	`&LO("$tp28")`,`&LO("$acc8")`
   1130 ___
   1131 }
   1132 
   1133 $code.=<<___;
        	# Compact-table decryption core, declared abi-omnipotent (internal helper).
        	# It works on the state words, key pointer and table pointer held in the
        	# registers this script binds to s0-s3, key and sbox.  16(%rsp) must hold
        	# the end-of-key-schedule pointer: the callers in this file store it at
        	# 8(%rsp) before the call instruction pushes the return address.
        	# Every loop iteration xors in one round key and emits deccompactvert();
        	# all iterations but the last also emit dectransform(1).  The final round
        	# key is xored in at .Ldec_compact_done.
   1134 .type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
   1135 .align	16
   1136 _x86_64_AES_decrypt_compact:
   1137 	lea	128($sbox),$inp			# size optimization
   1138 	mov	0-128($inp),$acc1		# prefetch Td4
   1139 	mov	32-128($inp),$acc2
   1140 	mov	64-128($inp),$t0
   1141 	mov	96-128($inp),$t1
   1142 	mov	128-128($inp),$acc1
   1143 	mov	160-128($inp),$acc2
   1144 	mov	192-128($inp),$t0
   1145 	mov	224-128($inp),$t1		# last of eight loads at 32-byte stride
   1146 	jmp	.Ldec_loop_compact
   1147 
   1148 .align	16
   1149 .Ldec_loop_compact:
   1150 		xor	0($key),$s0		# xor with key
   1151 		xor	4($key),$s1
   1152 		xor	8($key),$s2
   1153 		xor	12($key),$s3
   1154 		lea	16($key),$key		# advance to next round key
   1155 ___
   1156 		&deccompactvert();
   1157 $code.=<<___;
   1158 		cmp	16(%rsp),$key		# reached end of key schedule?
   1159 		je	.Ldec_compact_done
   1160 
   1161 		mov	256+0($sbox),$mask80	# three 64-bit constants stored right
   1162 		shl	\$32,%rbx
   1163 		shl	\$32,%rdx
   1164 		mov	256+8($sbox),$maskfe	# after the 256 table bytes
   1165 		or	%rbx,%rax		# %rax = (%rbx<<32)|%rax
   1166 		or	%rdx,%rcx		# %rcx = (%rdx<<32)|%rcx
   1167 		mov	256+16($sbox),$mask1b
   1168 ___
   1169 		&dectransform(1);
   1170 $code.=<<___;
   1171 	jmp	.Ldec_loop_compact
   1172 .align	16
   1173 .Ldec_compact_done:
   1174 	xor	0($key),$s0			# xor with last round key
   1175 	xor	4($key),$s1
   1176 	xor	8($key),$s2
   1177 	xor	12($key),$s3
   1178 	.byte	0xf3,0xc3			# rep ret
   1179 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
   1180 ___
   1181 
   1182 # void asm_AES_decrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Single-block decryption entry point.  Saves %rbx, %rbp and %r12-%r15,
        # builds a small 64-byte-aligned frame whose position is adjusted relative
        # to the key schedule address, loads the 16-byte input block and the round
        # count from offset 240 of the key, selects one of the Td4 copies inside
        # .LAES_Td and calls _x86_64_AES_decrypt_compact.  The resulting four
        # words are written to out and the saved registers are reloaded from the
        # original stack.
        #
        # Frame layout after the prologue:
        #   0(%rsp)   key schedule pointer
        #   8(%rsp)   end of key schedule (key + 16*rounds)
        #   16(%rsp)  saved out pointer
        #   24(%rsp)  stack pointer as it was right after the six pushes
   1183 $code.=<<___;
   1184 .align	16
   1185 .globl	asm_AES_decrypt
   1186 .type	asm_AES_decrypt,\@function,3
   1187 .hidden	asm_AES_decrypt
   1188 asm_AES_decrypt:
   1189 	push	%rbx
   1190 	push	%rbp
   1191 	push	%r12
   1192 	push	%r13
   1193 	push	%r14
   1194 	push	%r15
   1195 
   1196 	# allocate frame "above" key schedule
   1197 	mov	%rsp,%r10
   1198 	lea	-63(%rdx),%rcx	# %rdx is key argument
   1199 	and	\$-64,%rsp
   1200 	sub	%rsp,%rcx
   1201 	neg	%rcx
   1202 	and	\$0x3c0,%rcx
   1203 	sub	%rcx,%rsp
   1204 	sub	\$32,%rsp	# room for the four 8-byte slots listed above
   1205 
   1206 	mov	%rsi,16(%rsp)	# save out
   1207 	mov	%r10,24(%rsp)	# save real stack pointer
   1208 .Ldec_prologue:
   1209 
   1210 	mov	%rdx,$key
   1211 	mov	240($key),$rnds	# load rounds
   1212 
   1213 	mov	0(%rdi),$s0	# load input vector
   1214 	mov	4(%rdi),$s1
   1215 	mov	8(%rdi),$s2
   1216 	mov	12(%rdi),$s3
   1217 
   1218 	shl	\$4,$rnds	# rounds*16 = byte offset of last round key
   1219 	lea	($key,$rnds),%rbp
   1220 	mov	$key,(%rsp)	# key schedule
   1221 	mov	%rbp,8(%rsp)	# end of key schedule
   1222 
   1223 	# pick Td4 copy which can't "overlap" with stack frame or key schedule
   1224 	lea	.LAES_Td+2048(%rip),$sbox
   1225 	lea	768(%rsp),%rbp
   1226 	sub	$sbox,%rbp
   1227 	and	\$0x300,%rbp
   1228 	lea	($sbox,%rbp),$sbox
   1229 	shr	\$3,%rbp	# recall "magic" constants!
   1230 	add	%rbp,$sbox
   1231 
   1232 	call	_x86_64_AES_decrypt_compact
   1233 
   1234 	mov	16(%rsp),$out	# restore out
   1235 	mov	24(%rsp),%rsi	# restore saved stack pointer
   1236 	mov	$s0,0($out)	# write output vector
   1237 	mov	$s1,4($out)
   1238 	mov	$s2,8($out)
   1239 	mov	$s3,12($out)
   1240 
   1241 	mov	(%rsi),%r15	# reload registers in reverse push order
   1242 	mov	8(%rsi),%r14
   1243 	mov	16(%rsi),%r13
   1244 	mov	24(%rsi),%r12
   1245 	mov	32(%rsi),%rbp
   1246 	mov	40(%rsi),%rbx
   1247 	lea	48(%rsi),%rsp	# drop the six saved registers
   1248 .Ldec_epilogue:
   1249 	ret
   1250 .size	asm_AES_decrypt,.-asm_AES_decrypt
   1251 ___
   1252 #------------------------------------------------------------------#
   1253 
   1254 sub enckey()
   1255 {
        # Appends one key-expansion step to $code.  The emitted code looks up each
        # byte of %edx in the byte table at -128(%rbp), xors the four results into
        # %eax at bit positions 24, 0, 8 and 16 (the substituted word rotated right
        # by one byte), and finally xors in the 32-bit constant at
        # 1024-128(%rbp,%rcx,4), labelled rcon below.  %ebx and %esi are scratch;
        # %edx is left shifted right by 16.
        # Takes no arguments; the key-setup generators invoke it as "&enckey ();".
   1256 $code.=<<___;
   1257 	movz	%dl,%esi		# rk[i]>>0
   1258 	movzb	-128(%rbp,%rsi),%ebx
   1259 	movz	%dh,%esi		# rk[i]>>8
   1260 	shl	\$24,%ebx
   1261 	xor	%ebx,%eax
   1262 
   1263 	movzb	-128(%rbp,%rsi),%ebx
   1264 	shr	\$16,%edx
   1265 	movz	%dl,%esi		# rk[i]>>16
   1266 	xor	%ebx,%eax
   1267 
   1268 	movzb	-128(%rbp,%rsi),%ebx
   1269 	movz	%dh,%esi		# rk[i]>>24
   1270 	shl	\$8,%ebx
   1271 	xor	%ebx,%eax
   1272 
   1273 	movzb	-128(%rbp,%rsi),%ebx
   1274 	shl	\$16,%ebx
   1275 	xor	%ebx,%eax
   1276 
   1277 	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
   1278 ___
   1279 }
   1280 
   1281 # int asm_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
        #
        # Returns 0 on success, -1 if userKey or key is NULL and -2 if bits is not
        # 128, 192 or 256.  On success the expanded schedule is written to key and
        # the round count (10, 12 or 14) is stored at byte offset 240 of key.
        #
        # asm_AES_set_encrypt_key is a thin wrapper that saves registers and calls
        # _x86_64_AES_set_encrypt_key, which does the work and is also called by
        # asm_AES_set_decrypt_key.
   1282 $code.=<<___;
   1283 .align	16
   1284 .globl asm_AES_set_encrypt_key
   1285 .type  asm_AES_set_encrypt_key,\@function,3
   1286 asm_AES_set_encrypt_key:
   1287 	push	%rbx
   1288 	push	%rbp
   1289 	push	%r12			# redundant, but allows to share 
   1290 	push	%r13			# exception handler...
   1291 	push	%r14
   1292 	push	%r15
   1293 	sub	\$8,%rsp
   1294 .Lenc_key_prologue:
   1295 
   1296 	call	_x86_64_AES_set_encrypt_key
   1297 
   1298 	mov	40(%rsp),%rbp		# the callee below never writes %r12-%r15,
   1299 	mov	48(%rsp),%rbx		# so only %rbp and %rbx are reloaded
   1300 	add	\$56,%rsp		# 6 pushes + 8 bytes of padding
   1301 .Lenc_key_epilogue:
   1302 	ret
   1303 .size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key
   1304 
   1305 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
   1306 .align	16
   1307 _x86_64_AES_set_encrypt_key:
   1308 	mov	%esi,%ecx			# %ecx=bits
   1309 	mov	%rdi,%rsi			# %rsi=userKey
   1310 	mov	%rdx,%rdi			# %rdi=key
   1311 
   1312 	test	\$-1,%rsi			# NULL userKey?
   1313 	jz	.Lbadpointer
   1314 	test	\$-1,%rdi			# NULL key?
   1315 	jz	.Lbadpointer
   1316 
   1317 	lea	.LAES_Te(%rip),%rbp
   1318 	lea	2048+128(%rbp),%rbp		# table lookups below use -128(%rbp,...)
   1319 
   1320 	# prefetch Te4
   1321 	mov	0-128(%rbp),%eax
   1322 	mov	32-128(%rbp),%ebx
   1323 	mov	64-128(%rbp),%r8d
   1324 	mov	96-128(%rbp),%edx
   1325 	mov	128-128(%rbp),%eax
   1326 	mov	160-128(%rbp),%ebx
   1327 	mov	192-128(%rbp),%r8d
   1328 	mov	224-128(%rbp),%edx
   1329 
   1330 	cmp	\$128,%ecx
   1331 	je	.L10rounds
   1332 	cmp	\$192,%ecx
   1333 	je	.L12rounds
   1334 	cmp	\$256,%ecx
   1335 	je	.L14rounds
   1336 	mov	\$-2,%rax			# invalid number of bits
   1337 	jmp	.Lexit
   1338 
   1339 .L10rounds:
   1340 	mov	0(%rsi),%rax			# copy first 4 dwords
   1341 	mov	8(%rsi),%rdx
   1342 	mov	%rax,0(%rdi)
   1343 	mov	%rdx,8(%rdi)
   1344 
   1345 	shr	\$32,%rdx			# %edx = rk[3], %eax = rk[0]
   1346 	xor	%ecx,%ecx			# %ecx = rcon index / iteration count
   1347 	jmp	.L10shortcut
   1348 .align	4
   1349 .L10loop:
   1350 		mov	0(%rdi),%eax			# rk[0]
   1351 		mov	12(%rdi),%edx			# rk[3]
   1352 .L10shortcut:
   1353 ___
   1354 		&enckey	();
   1355 $code.=<<___;
   1356 		mov	%eax,16(%rdi)			# rk[4]
   1357 		xor	4(%rdi),%eax
   1358 		mov	%eax,20(%rdi)			# rk[5]
   1359 		xor	8(%rdi),%eax
   1360 		mov	%eax,24(%rdi)			# rk[6]
   1361 		xor	12(%rdi),%eax
   1362 		mov	%eax,28(%rdi)			# rk[7]
   1363 		add	\$1,%ecx
   1364 		lea	16(%rdi),%rdi
   1365 		cmp	\$10,%ecx
   1366 	jl	.L10loop
   1367 
   1368 	movl	\$10,80(%rdi)			# setup number of rounds
   1369 	xor	%rax,%rax
   1370 	jmp	.Lexit
   1371 
   1372 .L12rounds:
   1373 	mov	0(%rsi),%rax			# copy first 6 dwords
   1374 	mov	8(%rsi),%rbx
   1375 	mov	16(%rsi),%rdx
   1376 	mov	%rax,0(%rdi)
   1377 	mov	%rbx,8(%rdi)
   1378 	mov	%rdx,16(%rdi)
   1379 
   1380 	shr	\$32,%rdx			# %edx = rk[5], %eax = rk[0]
   1381 	xor	%ecx,%ecx
   1382 	jmp	.L12shortcut
   1383 .align	4
   1384 .L12loop:
   1385 		mov	0(%rdi),%eax			# rk[0]
   1386 		mov	20(%rdi),%edx			# rk[5]
   1387 .L12shortcut:
   1388 ___
   1389 		&enckey	();
   1390 $code.=<<___;
   1391 		mov	%eax,24(%rdi)			# rk[6]
   1392 		xor	4(%rdi),%eax
   1393 		mov	%eax,28(%rdi)			# rk[7]
   1394 		xor	8(%rdi),%eax
   1395 		mov	%eax,32(%rdi)			# rk[8]
   1396 		xor	12(%rdi),%eax
   1397 		mov	%eax,36(%rdi)			# rk[9]
   1398 
   1399 		cmp	\$7,%ecx			# last pass writes only four words
   1400 		je	.L12break
   1401 		add	\$1,%ecx
   1402 
   1403 		xor	16(%rdi),%eax
   1404 		mov	%eax,40(%rdi)			# rk[10]
   1405 		xor	20(%rdi),%eax
   1406 		mov	%eax,44(%rdi)			# rk[11]
   1407 
   1408 		lea	24(%rdi),%rdi
   1409 	jmp	.L12loop
   1410 .L12break:
   1411 	movl	\$12,72(%rdi)		# setup number of rounds
   1412 	xor	%rax,%rax
   1413 	jmp	.Lexit
   1414 
   1415 .L14rounds:		
   1416 	mov	0(%rsi),%rax			# copy first 8 dwords
   1417 	mov	8(%rsi),%rbx
   1418 	mov	16(%rsi),%rcx
   1419 	mov	24(%rsi),%rdx
   1420 	mov	%rax,0(%rdi)
   1421 	mov	%rbx,8(%rdi)
   1422 	mov	%rcx,16(%rdi)
   1423 	mov	%rdx,24(%rdi)
   1424 
   1425 	shr	\$32,%rdx			# %edx = rk[7], %eax = rk[0]
   1426 	xor	%ecx,%ecx
   1427 	jmp	.L14shortcut
   1428 .align	4
   1429 .L14loop:
   1430 		mov	0(%rdi),%eax			# rk[0]
   1431 		mov	28(%rdi),%edx			# rk[7]
   1432 .L14shortcut:
   1433 ___
   1434 		&enckey	();
   1435 $code.=<<___;
   1436 		mov	%eax,32(%rdi)			# rk[8]
   1437 		xor	4(%rdi),%eax
   1438 		mov	%eax,36(%rdi)			# rk[9]
   1439 		xor	8(%rdi),%eax
   1440 		mov	%eax,40(%rdi)			# rk[10]
   1441 		xor	12(%rdi),%eax
   1442 		mov	%eax,44(%rdi)			# rk[11]
   1443 
   1444 		cmp	\$6,%ecx			# last pass writes only four words
   1445 		je	.L14break
   1446 		add	\$1,%ecx
   1447 
        		# second half of the pass: byte substitution of rk[11] without
        		# rotation and without rcon, xored into rk[4]
   1448 		mov	%eax,%edx
   1449 		mov	16(%rdi),%eax			# rk[4]
   1450 		movz	%dl,%esi			# rk[11]>>0
   1451 		movzb	-128(%rbp,%rsi),%ebx
   1452 		movz	%dh,%esi			# rk[11]>>8
   1453 		xor	%ebx,%eax
   1454 
   1455 		movzb	-128(%rbp,%rsi),%ebx
   1456 		shr	\$16,%edx
   1457 		shl	\$8,%ebx
   1458 		movz	%dl,%esi			# rk[11]>>16
   1459 		xor	%ebx,%eax
   1460 
   1461 		movzb	-128(%rbp,%rsi),%ebx
   1462 		movz	%dh,%esi			# rk[11]>>24
   1463 		shl	\$16,%ebx
   1464 		xor	%ebx,%eax
   1465 
   1466 		movzb	-128(%rbp,%rsi),%ebx
   1467 		shl	\$24,%ebx
   1468 		xor	%ebx,%eax
   1469 
   1470 		mov	%eax,48(%rdi)			# rk[12]
   1471 		xor	20(%rdi),%eax
   1472 		mov	%eax,52(%rdi)			# rk[13]
   1473 		xor	24(%rdi),%eax
   1474 		mov	%eax,56(%rdi)			# rk[14]
   1475 		xor	28(%rdi),%eax
   1476 		mov	%eax,60(%rdi)			# rk[15]
   1477 
   1478 		lea	32(%rdi),%rdi
   1479 	jmp	.L14loop
   1480 .L14break:
   1481 	movl	\$14,48(%rdi)		# setup number of rounds
   1482 	xor	%rax,%rax
   1483 	jmp	.Lexit
   1484 
   1485 .Lbadpointer:
   1486 	mov	\$-1,%rax			# NULL userKey or key
   1487 .Lexit:
   1488 	.byte	0xf3,0xc3			# rep ret
   1489 .size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
   1490 ___
   1491 
   1492 sub deckey_ref()
   1493 { my ($i,$ptr,$te,$td) = @_;
   1494   my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
        # Appends code to $code that transforms, in place, the 32-bit word at byte
        # offset $i from register $ptr.  The word (tp1) is doubled three times in
        # GF(2^8) on its four packed bytes (giving tp2, tp4, tp8; each doubling uses
        # the 0x80808080 / 0xfefefefe / 0x1b1b1b1b mask sequence), and the stored
        # result is
        #   tp1 ^ tp2 ^ tp4 ^ tp8 ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp1^tp2^tp8,24)
        #       ^ ROTATE(tp1^tp4^tp8,16)
        # as spelled out by the inline comments.
        # $te and $td are accepted but not used in this body.
        # The sub is declared with an empty prototype, so arguments can only be
        # passed with the "&name(...)" call form used for the other generators
        # in this file.
        # NOTE(review): no call to this sub is visible in this part of the file;
        # it looks like a reference version of the key transform - confirm.
   1495 $code.=<<___;
   1496 	mov	$i($ptr),$tp1
   1497 	mov	$tp1,$acc
   1498 	and	\$0x80808080,$acc
   1499 	mov	$acc,$tp4
   1500 	shr	\$7,$tp4
   1501 	lea	0($tp1,$tp1),$tp2
   1502 	sub	$tp4,$acc
   1503 	and	\$0xfefefefe,$tp2
   1504 	and	\$0x1b1b1b1b,$acc
   1505 	xor	$tp2,$acc
   1506 	mov	$acc,$tp2
   1507 
   1508 	and	\$0x80808080,$acc
   1509 	mov	$acc,$tp8
   1510 	shr	\$7,$tp8
   1511 	lea	0($tp2,$tp2),$tp4
   1512 	sub	$tp8,$acc
   1513 	and	\$0xfefefefe,$tp4
   1514 	and	\$0x1b1b1b1b,$acc
   1515 	 xor	$tp1,$tp2		# tp2^tp1
   1516 	xor	$tp4,$acc
   1517 	mov	$acc,$tp4
   1518 
   1519 	and	\$0x80808080,$acc
   1520 	mov	$acc,$tp8
   1521 	shr	\$7,$tp8
   1522 	sub	$tp8,$acc
   1523 	lea	0($tp4,$tp4),$tp8
   1524 	 xor	$tp1,$tp4		# tp4^tp1
   1525 	and	\$0xfefefefe,$tp8
   1526 	and	\$0x1b1b1b1b,$acc
   1527 	xor	$acc,$tp8
   1528 
   1529 	xor	$tp8,$tp1		# tp1^tp8
   1530 	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
   1531 	xor	$tp8,$tp2		# tp2^tp1^tp8
   1532 	xor	$tp8,$tp4		# tp4^tp1^tp8
   1533 	xor	$tp2,$tp8
   1534 	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
   1535 
   1536 	xor	$tp8,$tp1
   1537 	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
   1538 	xor	$tp2,$tp1
   1539 	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
   1540 	xor	$tp4,$tp1
   1541 
   1542 	mov	$tp1,$i($ptr)
   1543 ___
   1544 }
   1545 
   1546 # int asm_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
        #
        # Builds the encryption schedule with _x86_64_AES_set_encrypt_key and, if
        # that returned 0, converts it in place: the 16-byte round keys are swapped
        # end-for-end (.Linvert) and every round key except the first and the last
        # is passed through dectransform() (.Lpermute).  Returns 0 on success, or
        # the non-zero status of the encryption key setup otherwise.
   1547 $code.=<<___;
   1548 .align	16
   1549 .globl asm_AES_set_decrypt_key
   1550 .type  asm_AES_set_decrypt_key,\@function,3
   1551 asm_AES_set_decrypt_key:
   1552 	push	%rbx
   1553 	push	%rbp
   1554 	push	%r12
   1555 	push	%r13
   1556 	push	%r14
   1557 	push	%r15
   1558 	push	%rdx			# save key schedule
   1559 .Ldec_key_prologue:
   1560 
   1561 	call	_x86_64_AES_set_encrypt_key
   1562 	mov	(%rsp),%r8		# restore key schedule
   1563 	cmp	\$0,%eax
   1564 	jne	.Labort			# propagate -1/-2 from the key setup
   1565 
   1566 	mov	240(%r8),%r14d		# pull number of rounds
   1567 	xor	%rdi,%rdi
   1568 	lea	(%rdi,%r14d,4),%rcx	# %rcx = rounds*4
   1569 	mov	%r8,%rsi
   1570 	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
   1571 .align	4
   1572 .Linvert:
   1573 		mov	0(%rsi),%rax	# swap the 16-byte chunks at %rsi and %rdi
   1574 		mov	8(%rsi),%rbx
   1575 		mov	0(%rdi),%rcx
   1576 		mov	8(%rdi),%rdx
   1577 		mov	%rax,0(%rdi)
   1578 		mov	%rbx,8(%rdi)
   1579 		mov	%rcx,0(%rsi)
   1580 		mov	%rdx,8(%rsi)
   1581 		lea	16(%rsi),%rsi
   1582 		lea	-16(%rdi),%rdi
   1583 		cmp	%rsi,%rdi	# rounds is even, so the pointers meet in the middle
   1584 	jne	.Linvert
   1585 
   1586 	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
   1587 
   1588 	mov	40(%rax),$mask80	# three 64-bit constants that follow
   1589 	mov	48(%rax),$maskfe	# the first 40 bytes at the rcon label
   1590 	mov	56(%rax),$mask1b
   1591 
   1592 	mov	%r8,$key
   1593 	sub	\$1,%r14d		# rounds-1 inner round keys to transform
   1594 .align	4
   1595 .Lpermute:
   1596 		lea	16($key),$key	# pre-increment: the first round key is skipped
   1597 		mov	0($key),%rax
   1598 		mov	8($key),%rcx
   1599 ___
   1600 		&dectransform ();
   1601 $code.=<<___;
   1602 		mov	%eax,0($key)
   1603 		mov	%ebx,4($key)
   1604 		mov	%ecx,8($key)
   1605 		mov	%edx,12($key)
   1606 		sub	\$1,%r14d
   1607 	jnz	.Lpermute
   1608 
   1609 	xor	%rax,%rax		# return 0
   1610 .Labort:
   1611 	mov	8(%rsp),%r15		# slot 0(%rsp) holds the pushed key pointer
   1612 	mov	16(%rsp),%r14
   1613 	mov	24(%rsp),%r13
   1614 	mov	32(%rsp),%r12
   1615 	mov	40(%rsp),%rbp
   1616 	mov	48(%rsp),%rbx
   1617 	add	\$56,%rsp		# seven 8-byte pushes
   1618 .Ldec_key_epilogue:
   1619 	ret
   1620 .size	asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key
   1621 ___
   1622 
   1623 # void asm_AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
   1624 #			    size_t length, const AES_KEY *key,
   1625 #			    unsigned char *ivp,const int enc);
   1626 {
        # CBC-mode driver.  A zero length returns immediately.  Otherwise one of
        # two code paths is taken:
        #   * fast path - length >= $speed_limit, length is a multiple of 16 and
        #     bit 28 of the first OPENSSL_ia32cap_P word is clear.  Uses the
        #     _x86_64_AES_encrypt/_x86_64_AES_decrypt cores, prefetches the whole
        #     table and may copy the key schedule into the stack frame (the copy
        #     is zeroed again at .Lcbc_fast_cleanup).
        #   * slow path - everything else.  Uses the *_compact cores and also
        #     handles a trailing partial block.
        # The last parameter (enc) selects encryption when non-zero and
        # decryption when zero.  The updated IV is written back through ivp.
   1627 # stack frame layout
   1628 # -8(%rsp)		return address
   1629 my $keyp="0(%rsp)";		# one to pass as $key
   1630 my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
   1631 my $_rsp="16(%rsp)";		# saved %rsp
   1632 my $_inp="24(%rsp)";		# copy of 1st parameter, inp
   1633 my $_out="32(%rsp)";		# copy of 2nd parameter, out
   1634 my $_len="40(%rsp)";		# copy of 3rd parameter, length
   1635 my $_key="48(%rsp)";		# copy of 4th parameter, key
   1636 my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
   1637 my $ivec="64(%rsp)";		# ivec[16]
   1638 my $aes_key="80(%rsp)";		# copy of aes_key
   1639 my $mark="80+240(%rsp)";	# copy of aes_key->rounds
   1640 
   1641 $code.=<<___;
   1642 .align	16
   1643 .globl	asm_AES_cbc_encrypt
   1644 .type	asm_AES_cbc_encrypt,\@function,6
   1645 .extern	OPENSSL_ia32cap_P
   1646 .hidden	asm_AES_cbc_encrypt
   1647 asm_AES_cbc_encrypt:
   1648 	cmp	\$0,%rdx	# check length
   1649 	je	.Lcbc_epilogue
   1650 	pushfq
   1651 	push	%rbx
   1652 	push	%rbp
   1653 	push	%r12
   1654 	push	%r13
   1655 	push	%r14
   1656 	push	%r15
   1657 .Lcbc_prologue:
   1658 
   1659 	cld
   1660 	mov	%r9d,%r9d	# clear upper half of enc
   1661 
   1662 	lea	.LAES_Te(%rip),$sbox
   1663 	cmp	\$0,%r9
   1664 	jne	.Lcbc_picked_te
   1665 	lea	.LAES_Td(%rip),$sbox
   1666 .Lcbc_picked_te:
   1667 
   1668 	mov	OPENSSL_ia32cap_P(%rip),%r10d
   1669 	cmp	\$$speed_limit,%rdx
   1670 	jb	.Lcbc_slow_prologue
   1671 	test	\$15,%rdx	# length not a multiple of 16?
   1672 	jnz	.Lcbc_slow_prologue
   1673 	bt	\$28,%r10d
   1674 	jc	.Lcbc_slow_prologue
   1675 
   1676 	# allocate aligned stack frame...
   1677 	lea	-88-248(%rsp),$key
   1678 	and	\$-64,$key
   1679 
   1680 	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
   1681 	mov	$sbox,%r10
   1682 	lea	2304($sbox),%r11
   1683 	mov	$key,%r12
   1684 	and	\$0xFFF,%r10	# s = $sbox&0xfff
   1685 	and	\$0xFFF,%r11	# e = ($sbox+2304)&0xfff
   1686 	and	\$0xFFF,%r12	# p = %rsp&0xfff
   1687 
   1688 	cmp	%r11,%r12	# if (p>=e) %rsp -= (p-e);
   1689 	jb	.Lcbc_te_break_out
   1690 	sub	%r11,%r12
   1691 	sub	%r12,$key
   1692 	jmp	.Lcbc_te_ok
   1693 .Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
   1694 	sub	%r10,%r12
   1695 	and	\$0xFFF,%r12
   1696 	add	\$320,%r12
   1697 	sub	%r12,$key
   1698 .align	4
   1699 .Lcbc_te_ok:
   1700 
   1701 	xchg	%rsp,$key
   1702 	#add	\$8,%rsp	# reserve for return address!
   1703 	mov	$key,$_rsp	# save %rsp
   1704 .Lcbc_fast_body:
   1705 	mov	%rdi,$_inp	# save copy of inp
   1706 	mov	%rsi,$_out	# save copy of out
   1707 	mov	%rdx,$_len	# save copy of len
   1708 	mov	%rcx,$_key	# save copy of key
   1709 	mov	%r8,$_ivp	# save copy of ivp
   1710 	movl	\$0,$mark	# copy of aes_key->rounds = 0;
   1711 	mov	%r8,%rbp	# rearrange input arguments
   1712 	mov	%r9,%rbx
   1713 	mov	%rsi,$out
   1714 	mov	%rdi,$inp
   1715 	mov	%rcx,$key
   1716 
   1717 	mov	240($key),%eax		# key->rounds
   1718 	# do we copy key schedule to stack?
   1719 	mov	$key,%r10
   1720 	sub	$sbox,%r10
   1721 	and	\$0xfff,%r10
   1722 	cmp	\$2304,%r10
   1723 	jb	.Lcbc_do_ecopy
   1724 	cmp	\$4096-248,%r10
   1725 	jb	.Lcbc_skip_ecopy
   1726 .align	4
   1727 .Lcbc_do_ecopy:
   1728 		mov	$key,%rsi
   1729 		lea	$aes_key,%rdi
   1730 		lea	$aes_key,$key
   1731 		mov	\$240/8,%ecx
   1732 		.long	0x90A548F3	# rep movsq
   1733 		mov	%eax,(%rdi)	# copy aes_key->rounds
   1734 .Lcbc_skip_ecopy:
   1735 	mov	$key,$keyp	# save key pointer
   1736 
   1737 	mov	\$18,%ecx	# 18 iterations x 128 bytes = 2304 bytes
   1738 .align	4
   1739 .Lcbc_prefetch_te:
   1740 		mov	0($sbox),%r10
   1741 		mov	32($sbox),%r11
   1742 		mov	64($sbox),%r12
   1743 		mov	96($sbox),%r13
   1744 		lea	128($sbox),$sbox
   1745 		sub	\$1,%ecx
   1746 	jnz	.Lcbc_prefetch_te
   1747 	lea	-2304($sbox),$sbox	# rewind table pointer
   1748 
   1749 	cmp	\$0,%rbx	# enc == 0 selects decryption
   1750 	je	.LFAST_DECRYPT
   1751 
   1752 #----------------------------- ENCRYPT -----------------------------#
   1753 	mov	0(%rbp),$s0		# load iv
   1754 	mov	4(%rbp),$s1
   1755 	mov	8(%rbp),$s2
   1756 	mov	12(%rbp),$s3
   1757 
   1758 .align	4
   1759 .Lcbc_fast_enc_loop:
   1760 		xor	0($inp),$s0
   1761 		xor	4($inp),$s1
   1762 		xor	8($inp),$s2
   1763 		xor	12($inp),$s3
   1764 		mov	$keyp,$key	# restore key
   1765 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1766 
   1767 		call	_x86_64_AES_encrypt
   1768 
   1769 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1770 		mov	$_len,%r10
   1771 		mov	$s0,0($out)
   1772 		mov	$s1,4($out)
   1773 		mov	$s2,8($out)
   1774 		mov	$s3,12($out)
   1775 
   1776 		lea	16($inp),$inp
   1777 		lea	16($out),$out
   1778 		sub	\$16,%r10
   1779 		test	\$-16,%r10
   1780 		mov	%r10,$_len
   1781 	jnz	.Lcbc_fast_enc_loop
   1782 	mov	$_ivp,%rbp	# restore ivp
   1783 	mov	$s0,0(%rbp)	# save ivec
   1784 	mov	$s1,4(%rbp)
   1785 	mov	$s2,8(%rbp)
   1786 	mov	$s3,12(%rbp)
   1787 
   1788 	jmp	.Lcbc_fast_cleanup
   1789 
   1790 #----------------------------- DECRYPT -----------------------------#
   1791 .align	16
   1792 .LFAST_DECRYPT:
   1793 	cmp	$inp,$out
   1794 	je	.Lcbc_fast_dec_in_place
   1795 
   1796 	mov	%rbp,$ivec	# out-of-place: this slot holds a pointer to the current iv
   1797 .align	4
   1798 .Lcbc_fast_dec_loop:
   1799 		mov	0($inp),$s0	# read input
   1800 		mov	4($inp),$s1
   1801 		mov	8($inp),$s2
   1802 		mov	12($inp),$s3
   1803 		mov	$keyp,$key	# restore key
   1804 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1805 
   1806 		call	_x86_64_AES_decrypt
   1807 
   1808 		mov	$ivec,%rbp	# load ivp
   1809 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1810 		mov	$_len,%r10	# load len
   1811 		xor	0(%rbp),$s0	# xor iv
   1812 		xor	4(%rbp),$s1
   1813 		xor	8(%rbp),$s2
   1814 		xor	12(%rbp),$s3
   1815 		mov	$inp,%rbp	# current input, next iv
   1816 
   1817 		sub	\$16,%r10
   1818 		mov	%r10,$_len	# update len
   1819 		mov	%rbp,$ivec	# update ivp
   1820 
   1821 		mov	$s0,0($out)	# write output
   1822 		mov	$s1,4($out)
   1823 		mov	$s2,8($out)
   1824 		mov	$s3,12($out)
   1825 
   1826 		lea	16($inp),$inp
   1827 		lea	16($out),$out
   1828 	jnz	.Lcbc_fast_dec_loop	# flags are still those of the sub above (mov/lea leave them intact)
   1829 	mov	$_ivp,%r12		# load user ivp
   1830 	mov	0(%rbp),%r10		# load iv
   1831 	mov	8(%rbp),%r11
   1832 	mov	%r10,0(%r12)		# copy back to user
   1833 	mov	%r11,8(%r12)
   1834 	jmp	.Lcbc_fast_cleanup
   1835 
   1836 .align	16
   1837 .Lcbc_fast_dec_in_place:
   1838 	mov	0(%rbp),%r10		# copy iv to stack
   1839 	mov	8(%rbp),%r11
   1840 	mov	%r10,0+$ivec
   1841 	mov	%r11,8+$ivec
   1842 .align	4
   1843 .Lcbc_fast_dec_in_place_loop:
   1844 		mov	0($inp),$s0	# load input
   1845 		mov	4($inp),$s1
   1846 		mov	8($inp),$s2
   1847 		mov	12($inp),$s3
   1848 		mov	$keyp,$key	# restore key
   1849 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1850 
   1851 		call	_x86_64_AES_decrypt
   1852 
   1853 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1854 		mov	$_len,%r10
   1855 		xor	0+$ivec,$s0
   1856 		xor	4+$ivec,$s1
   1857 		xor	8+$ivec,$s2
   1858 		xor	12+$ivec,$s3
   1859 
   1860 		mov	0($inp),%r11	# load input
   1861 		mov	8($inp),%r12
   1862 		sub	\$16,%r10
   1863 		jz	.Lcbc_fast_dec_in_place_done
   1864 
   1865 		mov	%r11,0+$ivec	# copy input to iv
   1866 		mov	%r12,8+$ivec
   1867 
   1868 		mov	$s0,0($out)	# save output [zaps input]
   1869 		mov	$s1,4($out)
   1870 		mov	$s2,8($out)
   1871 		mov	$s3,12($out)
   1872 
   1873 		lea	16($inp),$inp
   1874 		lea	16($out),$out
   1875 		mov	%r10,$_len
   1876 	jmp	.Lcbc_fast_dec_in_place_loop
   1877 .Lcbc_fast_dec_in_place_done:
   1878 	mov	$_ivp,%rdi
   1879 	mov	%r11,0(%rdi)	# copy iv back to user
   1880 	mov	%r12,8(%rdi)
   1881 
   1882 	mov	$s0,0($out)	# save output [zaps input]
   1883 	mov	$s1,4($out)
   1884 	mov	$s2,8($out)
   1885 	mov	$s3,12($out)
   1886 
   1887 .align	4
   1888 .Lcbc_fast_cleanup:
   1889 	cmpl	\$0,$mark	# was the key schedule copied?
   1890 	lea	$aes_key,%rdi
   1891 	je	.Lcbc_exit
   1892 		mov	\$240/8,%ecx	# zero the 240-byte key copy on the stack
   1893 		xor	%rax,%rax
   1894 		.long	0x90AB48F3	# rep stosq
   1895 
   1896 	jmp	.Lcbc_exit
   1897 
   1898 #--------------------------- SLOW ROUTINE ---------------------------#
   1899 .align	16
   1900 .Lcbc_slow_prologue:
   1901 	# allocate aligned stack frame...
   1902 	lea	-88(%rsp),%rbp
   1903 	and	\$-64,%rbp
   1904 	# ... just "above" key schedule
   1905 	lea	-88-63(%rcx),%r10
   1906 	sub	%rbp,%r10
   1907 	neg	%r10
   1908 	and	\$0x3c0,%r10
   1909 	sub	%r10,%rbp
   1910 
   1911 	xchg	%rsp,%rbp
   1912 	#add	\$8,%rsp	# reserve for return address!
   1913 	mov	%rbp,$_rsp	# save %rsp
   1914 .Lcbc_slow_body:
   1915 	#mov	%rdi,$_inp	# save copy of inp
   1916 	#mov	%rsi,$_out	# save copy of out
   1917 	#mov	%rdx,$_len	# save copy of len
   1918 	#mov	%rcx,$_key	# save copy of key
   1919 	mov	%r8,$_ivp	# save copy of ivp
   1920 	mov	%r8,%rbp	# rearrange input arguments
   1921 	mov	%r9,%rbx
   1922 	mov	%rsi,$out
   1923 	mov	%rdi,$inp
   1924 	mov	%rcx,$key
   1925 	mov	%rdx,%r10
   1926 
   1927 	mov	240($key),%eax
   1928 	mov	$key,$keyp	# save key pointer
   1929 	shl	\$4,%eax
   1930 	lea	($key,%rax),%rax
   1931 	mov	%rax,$keyend
   1932 
   1933 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
   1934 	lea	2048($sbox),$sbox
   1935 	lea	768-8(%rsp),%rax
   1936 	sub	$sbox,%rax
   1937 	and	\$0x300,%rax
   1938 	lea	($sbox,%rax),$sbox
   1939 
   1940 	cmp	\$0,%rbx	# enc == 0 selects decryption
   1941 	je	.LSLOW_DECRYPT
   1942 
   1943 #--------------------------- SLOW ENCRYPT ---------------------------#
   1944 	test	\$-16,%r10		# check upon length
   1945 	mov	0(%rbp),$s0		# load iv
   1946 	mov	4(%rbp),$s1
   1947 	mov	8(%rbp),$s2
   1948 	mov	12(%rbp),$s3
   1949 	jz	.Lcbc_slow_enc_tail	# short input...
   1950 
   1951 .align	4
   1952 .Lcbc_slow_enc_loop:
   1953 		xor	0($inp),$s0
   1954 		xor	4($inp),$s1
   1955 		xor	8($inp),$s2
   1956 		xor	12($inp),$s3
   1957 		mov	$keyp,$key	# restore key
   1958 		mov	$inp,$_inp	# save inp
   1959 		mov	$out,$_out	# save out
   1960 		mov	%r10,$_len	# save len
   1961 
   1962 		call	_x86_64_AES_encrypt_compact
   1963 
   1964 		mov	$_inp,$inp	# restore inp
   1965 		mov	$_out,$out	# restore out
   1966 		mov	$_len,%r10	# restore len
   1967 		mov	$s0,0($out)
   1968 		mov	$s1,4($out)
   1969 		mov	$s2,8($out)
   1970 		mov	$s3,12($out)
   1971 
   1972 		lea	16($inp),$inp
   1973 		lea	16($out),$out
   1974 		sub	\$16,%r10
   1975 		test	\$-16,%r10
   1976 	jnz	.Lcbc_slow_enc_loop
   1977 	test	\$15,%r10	# 1..15 bytes left over?
   1978 	jnz	.Lcbc_slow_enc_tail
   1979 	mov	$_ivp,%rbp	# restore ivp
   1980 	mov	$s0,0(%rbp)	# save ivec
   1981 	mov	$s1,4(%rbp)
   1982 	mov	$s2,8(%rbp)
   1983 	mov	$s3,12(%rbp)
   1984 
   1985 	jmp	.Lcbc_exit
   1986 
   1987 .align	4
   1988 .Lcbc_slow_enc_tail:
        	# copy the remaining bytes to out, zero-fill up to 16 bytes and run
        	# the loop once more with out as its input; %rax and %rcx are parked
        	# in %r11 and %r12 around the string instructions
   1989 	mov	%rax,%r11
   1990 	mov	%rcx,%r12
   1991 	mov	%r10,%rcx
   1992 	mov	$inp,%rsi
   1993 	mov	$out,%rdi
   1994 	.long	0x9066A4F3		# rep movsb
   1995 	mov	\$16,%rcx		# zero tail
   1996 	sub	%r10,%rcx
   1997 	xor	%rax,%rax
   1998 	.long	0x9066AAF3		# rep stosb
   1999 	mov	$out,$inp		# this is not a mistake!
   2000 	mov	\$16,%r10		# len=16
   2001 	mov	%r11,%rax
   2002 	mov	%r12,%rcx
   2003 	jmp	.Lcbc_slow_enc_loop	# one more spin...
   2004 #--------------------------- SLOW DECRYPT ---------------------------#
   2005 .align	16
   2006 .LSLOW_DECRYPT:
   2007 	shr	\$3,%rax
   2008 	add	%rax,$sbox		# recall "magic" constants!
   2009 
   2010 	mov	0(%rbp),%r11		# copy iv to stack
   2011 	mov	8(%rbp),%r12
   2012 	mov	%r11,0+$ivec
   2013 	mov	%r12,8+$ivec
   2014 
   2015 .align	4
   2016 .Lcbc_slow_dec_loop:
   2017 		mov	0($inp),$s0	# load input
   2018 		mov	4($inp),$s1
   2019 		mov	8($inp),$s2
   2020 		mov	12($inp),$s3
   2021 		mov	$keyp,$key	# restore key
   2022 		mov	$inp,$_inp	# save inp
   2023 		mov	$out,$_out	# save out
   2024 		mov	%r10,$_len	# save len
   2025 
   2026 		call	_x86_64_AES_decrypt_compact
   2027 
   2028 		mov	$_inp,$inp	# restore inp
   2029 		mov	$_out,$out	# restore out
   2030 		mov	$_len,%r10
   2031 		xor	0+$ivec,$s0
   2032 		xor	4+$ivec,$s1
   2033 		xor	8+$ivec,$s2
   2034 		xor	12+$ivec,$s3
   2035 
   2036 		mov	0($inp),%r11	# load input
   2037 		mov	8($inp),%r12
   2038 		sub	\$16,%r10
   2039 		jc	.Lcbc_slow_dec_partial	# fewer than 16 bytes were left
   2040 		jz	.Lcbc_slow_dec_done	# exactly 16 bytes were left
   2041 
   2042 		mov	%r11,0+$ivec	# copy input to iv
   2043 		mov	%r12,8+$ivec
   2044 
   2045 		mov	$s0,0($out)	# save output [can zap input]
   2046 		mov	$s1,4($out)
   2047 		mov	$s2,8($out)
   2048 		mov	$s3,12($out)
   2049 
   2050 		lea	16($inp),$inp
   2051 		lea	16($out),$out
   2052 	jmp	.Lcbc_slow_dec_loop
   2053 .Lcbc_slow_dec_done:
   2054 	mov	$_ivp,%rdi
   2055 	mov	%r11,0(%rdi)		# copy iv back to user
   2056 	mov	%r12,8(%rdi)
   2057 
   2058 	mov	$s0,0($out)		# save output [can zap input]
   2059 	mov	$s1,4($out)
   2060 	mov	$s2,8($out)
   2061 	mov	$s3,12($out)
   2062 
   2063 	jmp	.Lcbc_exit
   2064 
   2065 .align	4
   2066 .Lcbc_slow_dec_partial:
   2067 	mov	$_ivp,%rdi
   2068 	mov	%r11,0(%rdi)		# copy iv back to user
   2069 	mov	%r12,8(%rdi)
   2070 
   2071 	mov	$s0,0+$ivec		# save output to stack
   2072 	mov	$s1,4+$ivec
   2073 	mov	$s2,8+$ivec
   2074 	mov	$s3,12+$ivec
   2075 
   2076 	mov	$out,%rdi
   2077 	lea	$ivec,%rsi
   2078 	lea	16(%r10),%rcx	# %r10 is (bytes left - 16) here, so %rcx = bytes left
   2079 	.long	0x9066A4F3	# rep movsb
   2080 	jmp	.Lcbc_exit
   2081 
   2082 .align	16
   2083 .Lcbc_exit:
   2084 	mov	$_rsp,%rsi
   2085 	mov	(%rsi),%r15	# reload registers in reverse push order
   2086 	mov	8(%rsi),%r14
   2087 	mov	16(%rsi),%r13
   2088 	mov	24(%rsi),%r12
   2089 	mov	32(%rsi),%rbp
   2090 	mov	40(%rsi),%rbx
   2091 	lea	48(%rsi),%rsp	# leaves the flags word pushed by pushfq on top
   2092 .Lcbc_popfq:
   2093 	popfq
   2094 .Lcbc_epilogue:
   2095 	ret
   2096 .size	asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt
   2097 ___
   2098 }
   2099 
   2100 $code.=<<___;
   2101 .align	64
   2102 .LAES_Te:
   2103 ___
   2104 	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
   2105 	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
   2106 	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
   2107 	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
   2108 	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
   2109 	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
   2110 	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
   2111 	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
   2112 	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
   2113 	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
   2114 	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
   2115 	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
   2116 	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
   2117 	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
   2118 	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
   2119 	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
   2120 	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
   2121 	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
   2122 	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
   2123 	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
   2124 	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
   2125 	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
   2126 	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
   2127 	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
   2128 	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
   2129 	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
   2130 	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
   2131 	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
   2132 	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
   2133 	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
   2134 	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
   2135 	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
   2136 	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
   2137 	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
   2138 	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
   2139 	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
   2140 	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
   2141 	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
   2142 	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
   2143 	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
   2144 	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
   2145 	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
   2146 	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
   2147 	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
   2148 	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
   2149 	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
   2150 	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
   2151 	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
   2152 	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
   2153 	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
   2154 	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
   2155 	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
   2156 	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
   2157 	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
   2158 	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
   2159 	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
   2160 	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
   2161 	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
   2162 	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
   2163 	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
   2164 	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
   2165 	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
   2166 	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
   2167 	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
   2168 
   2169 #Te4	# four copies of Te4 to choose from to avoid L1 aliasing
   2170 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2171 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2172 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2173 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2174 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2175 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2176 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2177 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2178 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2179 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2180 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2181 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2182 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2183 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2184 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2185 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2186 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2187 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2188 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2189 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2190 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2191 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2192 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2193 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2194 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2195 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2196 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2197 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2198 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2199 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2200 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2201 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2202 
   2203 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2204 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2205 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2206 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2207 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2208 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2209 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2210 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2211 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2212 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2213 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2214 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2215 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2216 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2217 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2218 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2219 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2220 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2221 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2222 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2223 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2224 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2225 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2226 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2227 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2228 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2229 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2230 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2231 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2232 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2233 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2234 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2235 
   2236 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2237 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2238 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2239 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2240 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2241 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2242 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2243 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2244 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2245 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2246 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2247 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2248 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2249 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2250 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2251 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2252 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2253 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2254 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2255 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2256 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2257 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2258 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2259 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2260 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2261 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2262 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2263 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2264 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2265 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2266 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2267 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2268 
   2269 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2270 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2271 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2272 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2273 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2274 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2275 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2276 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2277 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2278 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2279 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2280 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2281 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2282 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2283 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2284 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2285 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2286 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2287 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2288 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2289 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2290 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2291 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2292 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2293 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2294 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2295 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2296 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2297 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2298 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2299 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2300 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2301 #rcon:
   2302 $code.=<<___;
   2303 	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
   2304 	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
   2305 	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
   2306 	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
   2307 ___
   2308 $code.=<<___;
   2309 .align	64
   2310 .LAES_Td:
   2311 ___
   2312 	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
   2313 	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
   2314 	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
   2315 	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
   2316 	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
   2317 	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
   2318 	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
   2319 	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
   2320 	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
   2321 	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
   2322 	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
   2323 	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
   2324 	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
   2325 	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
   2326 	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
   2327 	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
   2328 	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
   2329 	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
   2330 	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
   2331 	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
   2332 	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
   2333 	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
   2334 	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
   2335 	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
   2336 	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
   2337 	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
   2338 	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
   2339 	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
   2340 	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
   2341 	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
   2342 	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
   2343 	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
   2344 	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
   2345 	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
   2346 	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
   2347 	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
   2348 	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
   2349 	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
   2350 	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
   2351 	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
   2352 	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
   2353 	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
   2354 	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
   2355 	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
   2356 	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
   2357 	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
   2358 	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
   2359 	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
   2360 	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
   2361 	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
   2362 	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
   2363 	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
   2364 	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
   2365 	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
   2366 	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
   2367 	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
   2368 	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
   2369 	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
   2370 	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
   2371 	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
   2372 	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
   2373 	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
   2374 	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
   2375 	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
   2376 
   2377 #Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
   2378 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2379 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2380 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2381 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2382 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2383 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2384 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2385 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2386 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2387 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2388 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2389 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2390 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2391 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2392 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2393 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2394 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2395 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2396 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2397 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2398 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2399 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2400 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2401 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2402 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2403 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2404 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2405 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2406 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2407 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2408 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2409 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2410 $code.=<<___;
   2411 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2412 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2413 ___
   2414 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2415 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2416 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2417 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2418 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2419 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2420 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2421 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2422 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2423 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2424 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2425 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2426 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2427 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2428 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2429 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2430 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2431 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2432 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2433 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2434 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2435 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2436 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2437 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2438 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2439 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2440 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2441 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2442 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2443 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2444 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2445 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2446 $code.=<<___;
   2447 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2448 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2449 ___
   2450 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2451 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2452 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2453 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2454 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2455 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2456 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2457 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2458 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2459 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2460 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2461 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2462 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2463 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2464 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2465 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2466 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2467 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2468 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2469 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2470 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2471 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2472 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2473 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2474 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2475 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2476 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2477 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2478 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2479 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2480 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2481 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2482 $code.=<<___;
   2483 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2484 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2485 ___
   2486 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2487 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2488 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2489 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2490 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2491 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2492 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2493 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2494 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2495 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2496 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2497 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2498 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2499 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2500 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2501 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2502 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2503 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2504 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2505 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2506 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2507 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2508 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2509 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2510 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2511 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2512 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2513 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2514 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2515 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2516 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2517 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2518 $code.=<<___;
   2519 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2520 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2521 .asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   2522 .align	64
   2523 ___
   2524 
   2525 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2526 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
   2527 if ($win64) {
   2528 $rec="%rcx";
   2529 $frame="%rdx";
   2530 $context="%r8";
   2531 $disp="%r9";
   2532 
   2533 $code.=<<___;
   2534 .extern	__imp_RtlVirtualUnwind
   2535 .type	block_se_handler,\@abi-omnipotent
   2536 .align	16
   2537 block_se_handler:
   2538 	push	%rsi
   2539 	push	%rdi
   2540 	push	%rbx
   2541 	push	%rbp
   2542 	push	%r12
   2543 	push	%r13
   2544 	push	%r14
   2545 	push	%r15
   2546 	pushfq
   2547 	sub	\$64,%rsp
   2548 
   2549 	mov	120($context),%rax	# pull context->Rax
   2550 	mov	248($context),%rbx	# pull context->Rip
   2551 
   2552 	mov	8($disp),%rsi		# disp->ImageBase
   2553 	mov	56($disp),%r11		# disp->HandlerData
   2554 
   2555 	mov	0(%r11),%r10d		# HandlerData[0]
   2556 	lea	(%rsi,%r10),%r10	# prologue label
   2557 	cmp	%r10,%rbx		# context->Rip<prologue label
   2558 	jb	.Lin_block_prologue
   2559 
   2560 	mov	152($context),%rax	# pull context->Rsp
   2561 
   2562 	mov	4(%r11),%r10d		# HandlerData[1]
   2563 	lea	(%rsi,%r10),%r10	# epilogue label
   2564 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2565 	jae	.Lin_block_prologue
   2566 
   2567 	mov	24(%rax),%rax		# pull saved real stack pointer
   2568 	lea	48(%rax),%rax		# adjust...
   2569 
   2570 	mov	-8(%rax),%rbx
   2571 	mov	-16(%rax),%rbp
   2572 	mov	-24(%rax),%r12
   2573 	mov	-32(%rax),%r13
   2574 	mov	-40(%rax),%r14
   2575 	mov	-48(%rax),%r15
   2576 	mov	%rbx,144($context)	# restore context->Rbx
   2577 	mov	%rbp,160($context)	# restore context->Rbp
   2578 	mov	%r12,216($context)	# restore context->R12
   2579 	mov	%r13,224($context)	# restore context->R13
   2580 	mov	%r14,232($context)	# restore context->R14
   2581 	mov	%r15,240($context)	# restore context->R15
   2582 
   2583 .Lin_block_prologue:
   2584 	mov	8(%rax),%rdi
   2585 	mov	16(%rax),%rsi
   2586 	mov	%rax,152($context)	# restore context->Rsp
   2587 	mov	%rsi,168($context)	# restore context->Rsi
   2588 	mov	%rdi,176($context)	# restore context->Rdi
   2589 
   2590 	jmp	.Lcommon_seh_exit
   2591 .size	block_se_handler,.-block_se_handler
   2592 
   2593 .type	key_se_handler,\@abi-omnipotent
   2594 .align	16
   2595 key_se_handler:
   2596 	push	%rsi
   2597 	push	%rdi
   2598 	push	%rbx
   2599 	push	%rbp
   2600 	push	%r12
   2601 	push	%r13
   2602 	push	%r14
   2603 	push	%r15
   2604 	pushfq
   2605 	sub	\$64,%rsp
   2606 
   2607 	mov	120($context),%rax	# pull context->Rax
   2608 	mov	248($context),%rbx	# pull context->Rip
   2609 
   2610 	mov	8($disp),%rsi		# disp->ImageBase
   2611 	mov	56($disp),%r11		# disp->HandlerData
   2612 
   2613 	mov	0(%r11),%r10d		# HandlerData[0]
   2614 	lea	(%rsi,%r10),%r10	# prologue label
   2615 	cmp	%r10,%rbx		# context->Rip<prologue label
   2616 	jb	.Lin_key_prologue
   2617 
   2618 	mov	152($context),%rax	# pull context->Rsp
   2619 
   2620 	mov	4(%r11),%r10d		# HandlerData[1]
   2621 	lea	(%rsi,%r10),%r10	# epilogue label
   2622 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2623 	jae	.Lin_key_prologue
   2624 
   2625 	lea	56(%rax),%rax
   2626 
   2627 	mov	-8(%rax),%rbx
   2628 	mov	-16(%rax),%rbp
   2629 	mov	-24(%rax),%r12
   2630 	mov	-32(%rax),%r13
   2631 	mov	-40(%rax),%r14
   2632 	mov	-48(%rax),%r15
   2633 	mov	%rbx,144($context)	# restore context->Rbx
   2634 	mov	%rbp,160($context)	# restore context->Rbp
   2635 	mov	%r12,216($context)	# restore context->R12
   2636 	mov	%r13,224($context)	# restore context->R13
   2637 	mov	%r14,232($context)	# restore context->R14
   2638 	mov	%r15,240($context)	# restore context->R15
   2639 
   2640 .Lin_key_prologue:
   2641 	mov	8(%rax),%rdi
   2642 	mov	16(%rax),%rsi
   2643 	mov	%rax,152($context)	# restore context->Rsp
   2644 	mov	%rsi,168($context)	# restore context->Rsi
   2645 	mov	%rdi,176($context)	# restore context->Rdi
   2646 
   2647 	jmp	.Lcommon_seh_exit
   2648 .size	key_se_handler,.-key_se_handler
   2649 
   2650 .type	cbc_se_handler,\@abi-omnipotent
   2651 .align	16
   2652 cbc_se_handler:
   2653 	push	%rsi
   2654 	push	%rdi
   2655 	push	%rbx
   2656 	push	%rbp
   2657 	push	%r12
   2658 	push	%r13
   2659 	push	%r14
   2660 	push	%r15
   2661 	pushfq
   2662 	sub	\$64,%rsp
   2663 
   2664 	mov	120($context),%rax	# pull context->Rax
   2665 	mov	248($context),%rbx	# pull context->Rip
   2666 
   2667 	lea	.Lcbc_prologue(%rip),%r10
   2668 	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
   2669 	jb	.Lin_cbc_prologue
   2670 
   2671 	lea	.Lcbc_fast_body(%rip),%r10
   2672 	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
   2673 	jb	.Lin_cbc_frame_setup
   2674 
   2675 	lea	.Lcbc_slow_prologue(%rip),%r10
   2676 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
   2677 	jb	.Lin_cbc_body
   2678 
   2679 	lea	.Lcbc_slow_body(%rip),%r10
   2680 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
   2681 	jb	.Lin_cbc_frame_setup
   2682 
   2683 .Lin_cbc_body:
   2684 	mov	152($context),%rax	# pull context->Rsp
   2685 
   2686 	lea	.Lcbc_epilogue(%rip),%r10
   2687 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
   2688 	jae	.Lin_cbc_prologue
   2689 
   2690 	lea	8(%rax),%rax
   2691 
   2692 	lea	.Lcbc_popfq(%rip),%r10
   2693 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
   2694 	jae	.Lin_cbc_prologue
   2695 
   2696 	mov	`16-8`(%rax),%rax	# biased $_rsp
   2697 	lea	56(%rax),%rax
   2698 
   2699 .Lin_cbc_frame_setup:
   2700 	mov	-16(%rax),%rbx
   2701 	mov	-24(%rax),%rbp
   2702 	mov	-32(%rax),%r12
   2703 	mov	-40(%rax),%r13
   2704 	mov	-48(%rax),%r14
   2705 	mov	-56(%rax),%r15
   2706 	mov	%rbx,144($context)	# restore context->Rbx
   2707 	mov	%rbp,160($context)	# restore context->Rbp
   2708 	mov	%r12,216($context)	# restore context->R12
   2709 	mov	%r13,224($context)	# restore context->R13
   2710 	mov	%r14,232($context)	# restore context->R14
   2711 	mov	%r15,240($context)	# restore context->R15
   2712 
   2713 .Lin_cbc_prologue:
   2714 	mov	8(%rax),%rdi
   2715 	mov	16(%rax),%rsi
   2716 	mov	%rax,152($context)	# restore context->Rsp
   2717 	mov	%rsi,168($context)	# restore context->Rsi
   2718 	mov	%rdi,176($context)	# restore context->Rdi
   2719 
   2720 .Lcommon_seh_exit:
   2721 
   2722 	mov	40($disp),%rdi		# disp->ContextRecord
   2723 	mov	$context,%rsi		# context
   2724 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
   2725 	.long	0xa548f3fc		# cld; rep movsq
   2726 
   2727 	mov	$disp,%rsi
   2728 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2729 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2730 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2731 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2732 	mov	40(%rsi),%r10		# disp->ContextRecord
   2733 	lea	56(%rsi),%r11		# &disp->HandlerData
   2734 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2735 	mov	%r10,32(%rsp)		# arg5
   2736 	mov	%r11,40(%rsp)		# arg6
   2737 	mov	%r12,48(%rsp)		# arg7
   2738 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2739 	call	*__imp_RtlVirtualUnwind(%rip)
   2740 
   2741 	mov	\$1,%eax		# ExceptionContinueSearch
   2742 	add	\$64,%rsp
   2743 	popfq
   2744 	pop	%r15
   2745 	pop	%r14
   2746 	pop	%r13
   2747 	pop	%r12
   2748 	pop	%rbp
   2749 	pop	%rbx
   2750 	pop	%rdi
   2751 	pop	%rsi
   2752 	ret
   2753 .size	cbc_se_handler,.-cbc_se_handler
   2754 
   2755 .section	.pdata
   2756 .align	4
   2757 	.rva	.LSEH_begin_asm_AES_encrypt
   2758 	.rva	.LSEH_end_asm_AES_encrypt
   2759 	.rva	.LSEH_info_asm_AES_encrypt
   2760 
   2761 	.rva	.LSEH_begin_asm_AES_decrypt
   2762 	.rva	.LSEH_end_asm_AES_decrypt
   2763 	.rva	.LSEH_info_asm_AES_decrypt
   2764 
   2765 	.rva	.LSEH_begin_asm_AES_set_encrypt_key
   2766 	.rva	.LSEH_end_asm_AES_set_encrypt_key
   2767 	.rva	.LSEH_info_asm_AES_set_encrypt_key
   2768 
   2769 	.rva	.LSEH_begin_asm_AES_set_decrypt_key
   2770 	.rva	.LSEH_end_asm_AES_set_decrypt_key
   2771 	.rva	.LSEH_info_asm_AES_set_decrypt_key
   2772 
   2773 	.rva	.LSEH_begin_asm_AES_cbc_encrypt
   2774 	.rva	.LSEH_end_asm_AES_cbc_encrypt
   2775 	.rva	.LSEH_info_asm_AES_cbc_encrypt
   2776 
   2777 .section	.xdata
   2778 .align	8
   2779 .LSEH_info_asm_AES_encrypt:
   2780 	.byte	9,0,0,0
   2781 	.rva	block_se_handler
   2782 	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
   2783 .LSEH_info_asm_AES_decrypt:
   2784 	.byte	9,0,0,0
   2785 	.rva	block_se_handler
   2786 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
   2787 .LSEH_info_asm_AES_set_encrypt_key:
   2788 	.byte	9,0,0,0
   2789 	.rva	key_se_handler
   2790 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
   2791 .LSEH_info_asm_AES_set_decrypt_key:
   2792 	.byte	9,0,0,0
   2793 	.rva	key_se_handler
   2794 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
   2795 .LSEH_info_asm_AES_cbc_encrypt:
   2796 	.byte	9,0,0,0
   2797 	.rva	cbc_se_handler
   2798 ___
   2799 }
   2800 
   2801 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   2802 
   2803 print $code;
   2804 
   2805 close STDOUT;
   2806