Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # Version 2.1.
     11 #
     12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
     13 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
     14 # [you'll notice a lot of resemblance], such as compressed S-boxes
     15 # in little-endian byte order, prefetch of these tables in CBC mode,
     16 # as well as avoiding L1 cache aliasing between stack frame and key
     17 # schedule and already mentioned tables, compressed Td4...
     18 #
     19 # Performance in number of cycles per processed byte for 128-bit key:
     20 #
     21 #		ECB encrypt	ECB decrypt	CBC large chunk
     22 # AMD64		33		41		13.0
     23 # EM64T		38		59		18.6(*)
     24 # Core 2	30		43		14.5(*)
     25 #
     26 # (*) with hyper-threading off
     27 
     28 $flavour = shift;
     29 $output  = shift;
     30 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     31 
     32 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     33 
     34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     37 die "can't locate x86_64-xlate.pl";
     38 
     39 open OUT,"| \"$^X\" $xlate $flavour $output";
     40 *STDOUT=*OUT;
     41 
     42 $verticalspin=1;	# unlike 32-bit version $verticalspin performs
     43 			# ~15% better on both AMD and Intel cores
     44 $speed_limit=512;	# see aes-586.pl for details
     45 
     46 $code=".text\n";
     47 
     48 $s0="%eax";
     49 $s1="%ebx";
     50 $s2="%ecx";
     51 $s3="%edx";
     52 $acc0="%esi";	$mask80="%rsi";
     53 $acc1="%edi";	$maskfe="%rdi";
     54 $acc2="%ebp";	$mask1b="%rbp";
     55 $inp="%r8";
     56 $out="%r9";
     57 $t0="%r10d";
     58 $t1="%r11d";
     59 $t2="%r12d";
     60 $rnds="%r13d";
     61 $sbox="%r14";
     62 $key="%r15";
     63 
     64 sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
     65 sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
     66 			$r =~ s/%[er]([sd]i)/%\1l/;
     67 			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
     68 sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
     69 			$r =~ s/%r([0-9]+)/%r\1d/;	$r; }
     70 sub _data_word()
     71 { my $i;
     72     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
     73 }
     74 sub data_word()
     75 { my $i;
     76   my $last=pop(@_);
     77     $code.=".long\t";
     78     while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
     79     $code.=sprintf"0x%08x\n",$last;
     80 }
     81 
     82 sub data_byte()
     83 { my $i;
     84   my $last=pop(@_);
     85     $code.=".byte\t";
     86     while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
     87     $code.=sprintf"0x%02x\n",$last&0xff;
     88 }
     89 
        # Append one full table-driven encryption round to $code, with the four
        # output words computed side by side ("vertical" interleaving).
        # Output word i is the xor of four table reads at ($sbox,index,8) with
        # byte offsets 0, 3, 2 and 1, indexed by byte 0 of state word i, byte 1
        # of word i+1, byte 2 of word i+2 and byte 3 of word i+3 (mod 4).
        # The emitted code advances $key by 16, then xors the four results with
        # the key words at 0..12($key) and leaves them in $s0..$s3.
        # Scratch registers: $acc0-$acc2, $t0-$t2 and %r8d (low half of $inp).
     90 sub encvert()
     91 { my $t3="%r8d";	# zaps $inp!
     92 
     93 $code.=<<___;
     94 	# favor 3-way issue Opteron pipeline...
     95 	movzb	`&lo("$s0")`,$acc0
     96 	movzb	`&lo("$s1")`,$acc1
     97 	movzb	`&lo("$s2")`,$acc2
     98 	mov	0($sbox,$acc0,8),$t0
     99 	mov	0($sbox,$acc1,8),$t1
    100 	mov	0($sbox,$acc2,8),$t2
    101 
    102 	movzb	`&hi("$s1")`,$acc0
    103 	movzb	`&hi("$s2")`,$acc1
    104 	movzb	`&lo("$s3")`,$acc2
    105 	xor	3($sbox,$acc0,8),$t0
    106 	xor	3($sbox,$acc1,8),$t1
    107 	mov	0($sbox,$acc2,8),$t3
    108 
    109 	movzb	`&hi("$s3")`,$acc0
    110 	shr	\$16,$s2
    111 	movzb	`&hi("$s0")`,$acc2
    112 	xor	3($sbox,$acc0,8),$t2
    113 	shr	\$16,$s3
    114 	xor	3($sbox,$acc2,8),$t3
    115 
    116 	shr	\$16,$s1
    117 	lea	16($key),$key
    118 	shr	\$16,$s0
    119 
    120 	movzb	`&lo("$s2")`,$acc0
    121 	movzb	`&lo("$s3")`,$acc1
    122 	movzb	`&lo("$s0")`,$acc2
    123 	xor	2($sbox,$acc0,8),$t0
    124 	xor	2($sbox,$acc1,8),$t1
    125 	xor	2($sbox,$acc2,8),$t2
    126 
    127 	movzb	`&hi("$s3")`,$acc0
    128 	movzb	`&hi("$s0")`,$acc1
    129 	movzb	`&lo("$s1")`,$acc2
    130 	xor	1($sbox,$acc0,8),$t0
    131 	xor	1($sbox,$acc1,8),$t1
    132 	xor	2($sbox,$acc2,8),$t3
    133 
    134 	mov	12($key),$s3
    135 	movzb	`&hi("$s1")`,$acc1
    136 	movzb	`&hi("$s2")`,$acc2
    137 	mov	0($key),$s0
    138 	xor	1($sbox,$acc1,8),$t2
    139 	xor	1($sbox,$acc2,8),$t3
    140 
    141 	mov	4($key),$s1
    142 	mov	8($key),$s2
    143 	xor	$t0,$s0
    144 	xor	$t1,$s1
    145 	xor	$t2,$s2
    146 	xor	$t3,$s3
    147 ___
    148 }
    149 
        # Append the final encryption round (vertical form) to $code.
        # The state bytes are selected as in the regular vertical round (byte 0
        # of word i, byte 1 of word i+1, byte 2 of word i+2, byte 3 of word
        # i+3), but each table read contributes a single byte lane: lane 0 via
        # movzb at offset 2, lanes 1 and 2 via 32-bit loads at offset 0 masked
        # with 0x0000ff00 and 0x00ff0000, lane 3 via a load at offset 2 masked
        # with 0xff000000.
        # The result is xored with the key words at 16+0..16+12($key) and left
        # in $s0..$s3; $key itself is not advanced here.
        # Scratch registers: $acc0-$acc2, $t0-$t2 and %r8d (low half of $inp).
    150 sub enclastvert()
    151 { my $t3="%r8d";	# zaps $inp!
    152 
    153 $code.=<<___;
    154 	movzb	`&lo("$s0")`,$acc0
    155 	movzb	`&lo("$s1")`,$acc1
    156 	movzb	`&lo("$s2")`,$acc2
    157 	movzb	2($sbox,$acc0,8),$t0
    158 	movzb	2($sbox,$acc1,8),$t1
    159 	movzb	2($sbox,$acc2,8),$t2
    160 
    161 	movzb	`&lo("$s3")`,$acc0
    162 	movzb	`&hi("$s1")`,$acc1
    163 	movzb	`&hi("$s2")`,$acc2
    164 	movzb	2($sbox,$acc0,8),$t3
    165 	mov	0($sbox,$acc1,8),$acc1	#$t0
    166 	mov	0($sbox,$acc2,8),$acc2	#$t1
    167 
    168 	and	\$0x0000ff00,$acc1
    169 	and	\$0x0000ff00,$acc2
    170 
    171 	xor	$acc1,$t0
    172 	xor	$acc2,$t1
    173 	shr	\$16,$s2
    174 
    175 	movzb	`&hi("$s3")`,$acc0
    176 	movzb	`&hi("$s0")`,$acc1
    177 	shr	\$16,$s3
    178 	mov	0($sbox,$acc0,8),$acc0	#$t2
    179 	mov	0($sbox,$acc1,8),$acc1	#$t3
    180 
    181 	and	\$0x0000ff00,$acc0
    182 	and	\$0x0000ff00,$acc1
    183 	shr	\$16,$s1
    184 	xor	$acc0,$t2
    185 	xor	$acc1,$t3
    186 	shr	\$16,$s0
    187 
    188 	movzb	`&lo("$s2")`,$acc0
    189 	movzb	`&lo("$s3")`,$acc1
    190 	movzb	`&lo("$s0")`,$acc2
    191 	mov	0($sbox,$acc0,8),$acc0	#$t0
    192 	mov	0($sbox,$acc1,8),$acc1	#$t1
    193 	mov	0($sbox,$acc2,8),$acc2	#$t2
    194 
    195 	and	\$0x00ff0000,$acc0
    196 	and	\$0x00ff0000,$acc1
    197 	and	\$0x00ff0000,$acc2
    198 
    199 	xor	$acc0,$t0
    200 	xor	$acc1,$t1
    201 	xor	$acc2,$t2
    202 
    203 	movzb	`&lo("$s1")`,$acc0
    204 	movzb	`&hi("$s3")`,$acc1
    205 	movzb	`&hi("$s0")`,$acc2
    206 	mov	0($sbox,$acc0,8),$acc0	#$t3
    207 	mov	2($sbox,$acc1,8),$acc1	#$t0
    208 	mov	2($sbox,$acc2,8),$acc2	#$t1
    209 
    210 	and	\$0x00ff0000,$acc0
    211 	and	\$0xff000000,$acc1
    212 	and	\$0xff000000,$acc2
    213 
    214 	xor	$acc0,$t3
    215 	xor	$acc1,$t0
    216 	xor	$acc2,$t1
    217 
    218 	movzb	`&hi("$s1")`,$acc0
    219 	movzb	`&hi("$s2")`,$acc1
    220 	mov	16+12($key),$s3
    221 	mov	2($sbox,$acc0,8),$acc0	#$t2
    222 	mov	2($sbox,$acc1,8),$acc1	#$t3
    223 	mov	16+0($key),$s0
    224 
    225 	and	\$0xff000000,$acc0
    226 	and	\$0xff000000,$acc1
    227 
    228 	xor	$acc0,$t2
    229 	xor	$acc1,$t3
    230 
    231 	mov	16+4($key),$s1
    232 	mov	16+8($key),$s2
    233 	xor	$t0,$s0
    234 	xor	$t1,$s1
    235 	xor	$t2,$s2
    236 	xor	$t3,$s3
    237 ___
    238 }
    239 
        # Append the code for one output word of a regular encryption round
        # (the alternative used when $verticalspin is off).
        #   $i - index 0..3 of the word being produced
        #   @s - the four state registers, ordered so that $s[0] supplies
        #        byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
        # Words 0..2 are built in $t0..$t2 with $acc0-$acc2 as scratch.
        # Word 3 is built in place in $s[0], using $s[1..3] themselves as
        # scratch; afterwards $t0..$t2 are copied into $s[1..3].
        # The key word at 4*$i($key) is xored in, and the instruction that
        # advances $key by 16 is emitted as part of step 0.
        # The lexical $out below shadows the file-level $out register name.
    240 sub encstep()
    241 { my ($i,@s) = @_;
    242   my $tmp0=$acc0;
    243   my $tmp1=$acc1;
    244   my $tmp2=$acc2;
    245   my $out=($t0,$t1,$t2,$s[0])[$i];
    246 
    247 	if ($i==3) {
    248 		$tmp0=$s[1];
    249 		$tmp1=$s[2];
    250 		$tmp2=$s[3];
    251 	}
    252 	$code.="	movzb	".&lo($s[0]).",$out\n";
    253 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    254 	$code.="	lea	16($key),$key\n"	if ($i==0);
    255 
    256 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    257 	$code.="	mov	0($sbox,$out,8),$out\n";
    258 
    259 	$code.="	shr	\$16,$tmp1\n";
    260 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    261 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    262 
    263 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    264 	$code.="	shr	\$24,$tmp2\n";
    265 	$code.="	xor	4*$i($key),$out\n";
    266 
    267 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    268 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    269 
    270 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    271 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    272 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    273 	$code.="\n";
    274 }
    275 
        # Append the code for one output word of the final encryption round
        # (used when $verticalspin is off).
        #   $i - index 0..3 of the word being produced
        #   @s - the four state registers, ordered so that $s[0] supplies
        #        byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
        # Each table read is masked down to one byte lane (0x000000ff,
        # 0x0000ff00, 0x00ff0000, 0xff000000) before the lanes are xored
        # together.  Words 0..2 land in $t0..$t2; word 3 is built in place in
        # $s[0] with $s[1..3] as scratch, then $t0..$t2 are copied to $s[1..3].
        # No round key is applied and $key is not modified by this code.
        # The lexical $out below shadows the file-level $out register name.
    276 sub enclast()
    277 { my ($i,@s)=@_;
    278   my $tmp0=$acc0;
    279   my $tmp1=$acc1;
    280   my $tmp2=$acc2;
    281   my $out=($t0,$t1,$t2,$s[0])[$i];
    282 
    283 	if ($i==3) {
    284 		$tmp0=$s[1];
    285 		$tmp1=$s[2];
    286 		$tmp2=$s[3];
    287 	}
    288 	$code.="	movzb	".&lo($s[0]).",$out\n";
    289 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    290 
    291 	$code.="	mov	2($sbox,$out,8),$out\n";
    292 	$code.="	shr	\$16,$tmp1\n";
    293 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    294 
    295 	$code.="	and	\$0x000000ff,$out\n";
    296 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    297 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    298 	$code.="	shr	\$24,$tmp2\n";
    299 
    300 	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
    301 	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
    302 	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
    303 
    304 	$code.="	and	\$0x0000ff00,$tmp0\n";
    305 	$code.="	and	\$0x00ff0000,$tmp1\n";
    306 	$code.="	and	\$0xff000000,$tmp2\n";
    307 
    308 	$code.="	xor	$tmp0,$out\n";
    309 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    310 	$code.="	xor	$tmp1,$out\n";
    311 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    312 	$code.="	xor	$tmp2,$out\n";
    313 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    314 	$code.="\n";
    315 }
    316 
        # Emit _x86_64_AES_encrypt, the table-driven block encryption routine.
        # The generated code xors the state in $s0..$s3 with the key words at
        # 0..12($key), loads the round count from 240($key) and decrements it,
        # then repeats the regular round body until the counter reaches zero
        # and finishes with the last-round sequence.
        # Round bodies come from encvert/enclastvert when $verticalspin is set;
        # otherwise from four encstep/enclast calls with the state registers
        # rotated by one position per call, followed by an explicit xor with
        # the key words at 16+0..16+12($key).
    317 $code.=<<___;
    318 .type	_x86_64_AES_encrypt,\@abi-omnipotent
    319 .align	16
    320 _x86_64_AES_encrypt:
    321 	xor	0($key),$s0			# xor with key
    322 	xor	4($key),$s1
    323 	xor	8($key),$s2
    324 	xor	12($key),$s3
    325 
    326 	mov	240($key),$rnds			# load key->rounds
    327 	sub	\$1,$rnds
    328 	jmp	.Lenc_loop
    329 .align	16
    330 .Lenc_loop:
    331 ___
    332 	if ($verticalspin) { &encvert(); }
    333 	else {	&encstep(0,$s0,$s1,$s2,$s3);
    334 		&encstep(1,$s1,$s2,$s3,$s0);
    335 		&encstep(2,$s2,$s3,$s0,$s1);
    336 		&encstep(3,$s3,$s0,$s1,$s2);
    337 	}
    338 $code.=<<___;
    339 	sub	\$1,$rnds
    340 	jnz	.Lenc_loop
    341 ___
    342 	if ($verticalspin) { &enclastvert(); }
    343 	else {	&enclast(0,$s0,$s1,$s2,$s3);
    344 		&enclast(1,$s1,$s2,$s3,$s0);
    345 		&enclast(2,$s2,$s3,$s0,$s1);
    346 		&enclast(3,$s3,$s0,$s1,$s2);
    347 		$code.=<<___;
    348 		xor	16+0($key),$s0		# xor with key
    349 		xor	16+4($key),$s1
    350 		xor	16+8($key),$s2
    351 		xor	16+12($key),$s3
    352 ___
    353 	}
    354 $code.=<<___;
    355 	.byte	0xf3,0xc3			# rep ret
    356 .size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
    357 ___
    358 
    359 # it's possible to implement this by shifting tN by 8, filling least
    360 # significant byte with byte load and finally bswap-ing at the end,
    361 # but such partial register load kills Core 2...
        #
        # Append one "compact" encryption round to $code: sixteen single-byte
        # table reads at ($sbox,index,1), assembled into four words with
        # shl 8/16/24.  Output word i takes byte 0 of state word i, byte 1 of
        # word i+1, byte 2 of word i+2 and byte 3 of word i+3 (mod 4).
        # No round key is applied and $key is not modified by this code; the
        # results are left in $s0..$s3.
        # Scratch registers: $t0-$t2, $acc0-$acc2, %r8d, %r9d and %r13d.  The
        # last three are the low halves of $inp and $out and the register
        # named by $rnds, so those values do not survive this sequence.
    362 sub enccompactvert()
    363 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    364 
    365 $code.=<<___;
    366 	movzb	`&lo("$s0")`,$t0
    367 	movzb	`&lo("$s1")`,$t1
    368 	movzb	`&lo("$s2")`,$t2
    369 	movzb	($sbox,$t0,1),$t0
    370 	movzb	($sbox,$t1,1),$t1
    371 	movzb	($sbox,$t2,1),$t2
    372 
    373 	movzb	`&lo("$s3")`,$t3
    374 	movzb	`&hi("$s1")`,$acc0
    375 	movzb	`&hi("$s2")`,$acc1
    376 	movzb	($sbox,$t3,1),$t3
    377 	movzb	($sbox,$acc0,1),$t4	#$t0
    378 	movzb	($sbox,$acc1,1),$t5	#$t1
    379 
    380 	movzb	`&hi("$s3")`,$acc2
    381 	movzb	`&hi("$s0")`,$acc0
    382 	shr	\$16,$s2
    383 	movzb	($sbox,$acc2,1),$acc2	#$t2
    384 	movzb	($sbox,$acc0,1),$acc0	#$t3
    385 	shr	\$16,$s3
    386 
    387 	movzb	`&lo("$s2")`,$acc1
    388 	shl	\$8,$t4
    389 	shl	\$8,$t5
    390 	movzb	($sbox,$acc1,1),$acc1	#$t0
    391 	xor	$t4,$t0
    392 	xor	$t5,$t1
    393 
    394 	movzb	`&lo("$s3")`,$t4
    395 	shr	\$16,$s0
    396 	shr	\$16,$s1
    397 	movzb	`&lo("$s0")`,$t5
    398 	shl	\$8,$acc2
    399 	shl	\$8,$acc0
    400 	movzb	($sbox,$t4,1),$t4	#$t1
    401 	movzb	($sbox,$t5,1),$t5	#$t2
    402 	xor	$acc2,$t2
    403 	xor	$acc0,$t3
    404 
    405 	movzb	`&lo("$s1")`,$acc2
    406 	movzb	`&hi("$s3")`,$acc0
    407 	shl	\$16,$acc1
    408 	movzb	($sbox,$acc2,1),$acc2	#$t3
    409 	movzb	($sbox,$acc0,1),$acc0	#$t0
    410 	xor	$acc1,$t0
    411 
    412 	movzb	`&hi("$s0")`,$acc1
    413 	shr	\$8,$s2
    414 	shr	\$8,$s1
    415 	movzb	($sbox,$acc1,1),$acc1	#$t1
    416 	movzb	($sbox,$s2,1),$s3	#$t3
    417 	movzb	($sbox,$s1,1),$s2	#$t2
    418 	shl	\$16,$t4
    419 	shl	\$16,$t5
    420 	shl	\$16,$acc2
    421 	xor	$t4,$t1
    422 	xor	$t5,$t2
    423 	xor	$acc2,$t3
    424 
    425 	shl	\$24,$acc0
    426 	shl	\$24,$acc1
    427 	shl	\$24,$s3
    428 	xor	$acc0,$t0
    429 	shl	\$24,$s2
    430 	xor	$acc1,$t1
    431 	mov	$t0,$s0
    432 	mov	$t1,$s1
    433 	xor	$t2,$s2
    434 	xor	$t3,$s3
    435 ___
    436 }
    437 
        # Append code that applies the column transform to one state word.
        #   $sn - name of the register holding the word (updated in place)
        # First $r2 is set to the byte-wise doubling of $sn: each byte is
        # shifted left by one (mask 0xfefefefe) and bytes whose top bit was set
        # (mask 0x80808080) are additionally xored with 0x1b.
        # The word then becomes
        #   rol(sn ^ r2, 24) ^ r2 ^ ror(sn, 16) ^ ror(sn, 24)
        # where sn on the right-hand side is the original value.
        # Scratch registers: %r8d, %r9d, %r13d.
        # No call to this sub appears in the part of the file visible here.
    438 sub enctransform_ref()
    439 { my $sn = shift;
    440   my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
    441 
    442 $code.=<<___;
    443 	mov	$sn,$acc
    444 	and	\$0x80808080,$acc
    445 	mov	$acc,$tmp
    446 	shr	\$7,$tmp
    447 	lea	($sn,$sn),$r2
    448 	sub	$tmp,$acc
    449 	and	\$0xfefefefe,$r2
    450 	and	\$0x1b1b1b1b,$acc
    451 	mov	$sn,$tmp
    452 	xor	$acc,$r2
    453 
    454 	xor	$r2,$sn
    455 	rol	\$24,$sn
    456 	xor	$r2,$sn
    457 	ror	\$16,$tmp
    458 	xor	$tmp,$sn
    459 	ror	\$8,$tmp
    460 	xor	$tmp,$sn
    461 ___
    462 }
    463 
    464 # unlike decrypt case it does not pay off to parallelize enctransform
        #
        # Append code that applies the column transform to all four state
        # words $s0..$s3 in place, handled as two pairs ($s0/$s1, then $s2/$s3)
        # whose instructions are interleaved.  For each word the byte-wise
        # doubling r2 is formed with the 0x80808080 / 0xfefefefe / 0x1b1b1b1b
        # masks and the word becomes
        #   rol(s ^ r2, 24) ^ r2 ^ ror(s, 16) ^ ror(s, 24).
        # The tail also loads from 0, 64, 128 and 192($sbox) into scratch
        # registers (marked "prefetch Te4" below).
        # Scratch registers: $acc0, $acc1, $acc2 (used as $t3), $t0-$t2,
        # %r8d and %r9d.
    465 sub enctransform()
    466 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
    467 
    468 $code.=<<___;
    469 	mov	$s0,$acc0
    470 	mov	$s1,$acc1
    471 	and	\$0x80808080,$acc0
    472 	and	\$0x80808080,$acc1
    473 	mov	$acc0,$t0
    474 	mov	$acc1,$t1
    475 	shr	\$7,$t0
    476 	lea	($s0,$s0),$r20
    477 	shr	\$7,$t1
    478 	lea	($s1,$s1),$r21
    479 	sub	$t0,$acc0
    480 	sub	$t1,$acc1
    481 	and	\$0xfefefefe,$r20
    482 	and	\$0xfefefefe,$r21
    483 	and	\$0x1b1b1b1b,$acc0
    484 	and	\$0x1b1b1b1b,$acc1
    485 	mov	$s0,$t0
    486 	mov	$s1,$t1
    487 	xor	$acc0,$r20
    488 	xor	$acc1,$r21
    489 
    490 	xor	$r20,$s0
    491 	xor	$r21,$s1
    492 	 mov	$s2,$acc0
    493 	 mov	$s3,$acc1
    494 	rol	\$24,$s0
    495 	rol	\$24,$s1
    496 	 and	\$0x80808080,$acc0
    497 	 and	\$0x80808080,$acc1
    498 	xor	$r20,$s0
    499 	xor	$r21,$s1
    500 	 mov	$acc0,$t2
    501 	 mov	$acc1,$t3
    502 	ror	\$16,$t0
    503 	ror	\$16,$t1
    504 	 shr	\$7,$t2
    505 	 lea	($s2,$s2),$r20
    506 	xor	$t0,$s0
    507 	xor	$t1,$s1
    508 	 shr	\$7,$t3
    509 	 lea	($s3,$s3),$r21
    510 	ror	\$8,$t0
    511 	ror	\$8,$t1
    512 	 sub	$t2,$acc0
    513 	 sub	$t3,$acc1
    514 	xor	$t0,$s0
    515 	xor	$t1,$s1
    516 
    517 	and	\$0xfefefefe,$r20
    518 	and	\$0xfefefefe,$r21
    519 	and	\$0x1b1b1b1b,$acc0
    520 	and	\$0x1b1b1b1b,$acc1
    521 	mov	$s2,$t2
    522 	mov	$s3,$t3
    523 	xor	$acc0,$r20
    524 	xor	$acc1,$r21
    525 
    526 	xor	$r20,$s2
    527 	xor	$r21,$s3
    528 	rol	\$24,$s2
    529 	rol	\$24,$s3
    530 	xor	$r20,$s2
    531 	xor	$r21,$s3
    532 	mov	0($sbox),$acc0			# prefetch Te4
    533 	ror	\$16,$t2
    534 	ror	\$16,$t3
    535 	mov	64($sbox),$acc1
    536 	xor	$t2,$s2
    537 	xor	$t3,$s3
    538 	mov	128($sbox),$r20
    539 	ror	\$8,$t2
    540 	ror	\$8,$t3
    541 	mov	192($sbox),$r21
    542 	xor	$t2,$s2
    543 	xor	$t3,$s3
    544 ___
    545 }
    546 
        # Emit _x86_64_AES_encrypt_compact, the byte-table encryption routine.
        # The generated code first issues eight loads spaced 32 bytes apart
        # starting at $sbox (marked as prefetch; $inp is overwritten with
        # $sbox+128 to address them).  Each loop iteration then xors the state
        # with the key words at 0..12($key), advances $key by 16 and runs the
        # enccompactvert sequence.  The loop exits when $key equals the value
        # stored at 16(%rsp); otherwise the enctransform sequence is applied
        # and the loop repeats.  The exit path xors in the key words at
        # 0..12($key) and returns.
        # NOTE(review): 16(%rsp) is presumably the "end of key schedule" slot
        # that AES_encrypt fills at 8(%rsp) before its call (the pushed return
        # address would account for the extra 8 bytes) - confirm for any other
        # caller.
    547 $code.=<<___;
    548 .type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
    549 .align	16
    550 _x86_64_AES_encrypt_compact:
    551 	lea	128($sbox),$inp			# size optimization
    552 	mov	0-128($inp),$acc1		# prefetch Te4
    553 	mov	32-128($inp),$acc2
    554 	mov	64-128($inp),$t0
    555 	mov	96-128($inp),$t1
    556 	mov	128-128($inp),$acc1
    557 	mov	160-128($inp),$acc2
    558 	mov	192-128($inp),$t0
    559 	mov	224-128($inp),$t1
    560 	jmp	.Lenc_loop_compact
    561 .align	16
    562 .Lenc_loop_compact:
    563 		xor	0($key),$s0		# xor with key
    564 		xor	4($key),$s1
    565 		xor	8($key),$s2
    566 		xor	12($key),$s3
    567 		lea	16($key),$key
    568 ___
    569 		&enccompactvert();
    570 $code.=<<___;
    571 		cmp	16(%rsp),$key
    572 		je	.Lenc_compact_done
    573 ___
    574 		&enctransform();
    575 $code.=<<___;
    576 	jmp	.Lenc_loop_compact
    577 .align	16
    578 .Lenc_compact_done:
    579 	xor	0($key),$s0
    580 	xor	4($key),$s1
    581 	xor	8($key),$s2
    582 	xor	12($key),$s3
    583 	.byte	0xf3,0xc3			# rep ret
    584 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
    585 ___
    586 
    587 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Emit the public AES_encrypt entry point; the same address is also
        # labelled asm_AES_encrypt, declared .globl and .hidden.
        # Generated code, in order:
        #  - pushes %rbx, %rbp, %r12-%r15;
        #  - aligns %rsp down to 64 bytes, lowers it by a further multiple of
        #    64 (at most 0x3c0) computed from the key address in %rdx, then
        #    reserves 32 bytes;
        #  - fills the frame: 0(%rsp) key pointer, 8(%rsp) key + 16*rounds,
        #    16(%rsp) output pointer, 24(%rsp) the pre-alignment %rsp;
        #  - loads the 16-byte input at (%rdi) into $s0..$s3;
        #  - points $sbox at .LAES_Te+2048 plus an offset of 0..0x300 (multiple
        #    of 0x100) derived from the frame address;
        #  - calls _x86_64_AES_encrypt_compact, stores $s0..$s3 to the output
        #    and restores the pushed registers via the saved stack pointer.
    588 $code.=<<___;
    589 .globl	AES_encrypt
    590 .type	AES_encrypt,\@function,3
    591 .align	16
    592 .globl	asm_AES_encrypt
    593 .hidden	asm_AES_encrypt
    594 asm_AES_encrypt:
    595 AES_encrypt:
    596 	push	%rbx
    597 	push	%rbp
    598 	push	%r12
    599 	push	%r13
    600 	push	%r14
    601 	push	%r15
    602 
    603 	# allocate frame "above" key schedule
    604 	mov	%rsp,%r10
    605 	lea	-63(%rdx),%rcx	# %rdx is key argument
    606 	and	\$-64,%rsp
    607 	sub	%rsp,%rcx
    608 	neg	%rcx
    609 	and	\$0x3c0,%rcx
    610 	sub	%rcx,%rsp
    611 	sub	\$32,%rsp
    612 
    613 	mov	%rsi,16(%rsp)	# save out
    614 	mov	%r10,24(%rsp)	# save real stack pointer
    615 .Lenc_prologue:
    616 
    617 	mov	%rdx,$key
    618 	mov	240($key),$rnds	# load rounds
    619 
    620 	mov	0(%rdi),$s0	# load input vector
    621 	mov	4(%rdi),$s1
    622 	mov	8(%rdi),$s2
    623 	mov	12(%rdi),$s3
    624 
    625 	shl	\$4,$rnds
    626 	lea	($key,$rnds),%rbp
    627 	mov	$key,(%rsp)	# key schedule
    628 	mov	%rbp,8(%rsp)	# end of key schedule
    629 
    630 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
    631 	lea	.LAES_Te+2048(%rip),$sbox
    632 	lea	768(%rsp),%rbp
    633 	sub	$sbox,%rbp
    634 	and	\$0x300,%rbp
    635 	lea	($sbox,%rbp),$sbox
    636 
    637 	call	_x86_64_AES_encrypt_compact
    638 
    639 	mov	16(%rsp),$out	# restore out
    640 	mov	24(%rsp),%rsi	# restore saved stack pointer
    641 	mov	$s0,0($out)	# write output vector
    642 	mov	$s1,4($out)
    643 	mov	$s2,8($out)
    644 	mov	$s3,12($out)
    645 
    646 	mov	(%rsi),%r15
    647 	mov	8(%rsi),%r14
    648 	mov	16(%rsi),%r13
    649 	mov	24(%rsi),%r12
    650 	mov	32(%rsi),%rbp
    651 	mov	40(%rsi),%rbx
    652 	lea	48(%rsi),%rsp
    653 .Lenc_epilogue:
    654 	ret
    655 .size	AES_encrypt,.-AES_encrypt
    656 ___
    657 
    658 #------------------------------------------------------------------#
    659 
        # Append one full table-driven decryption round to $code, with the four
        # output words computed side by side ("vertical" interleaving).
        # Output word i is the xor of four table reads at ($sbox,index,8) with
        # byte offsets 0, 3, 2 and 1, indexed by byte 0 of state word i, byte 1
        # of word i-1, byte 2 of word i-2 and byte 3 of word i-3 (mod 4).
        # The emitted code advances $key by 16, then xors the four results with
        # the key words at 0..12($key) and leaves them in $s0..$s3.
        # Scratch registers: $acc0-$acc2, $t0-$t2 and %r8d (low half of $inp).
    660 sub decvert()
    661 { my $t3="%r8d";	# zaps $inp!
    662 
    663 $code.=<<___;
    664 	# favor 3-way issue Opteron pipeline...
    665 	movzb	`&lo("$s0")`,$acc0
    666 	movzb	`&lo("$s1")`,$acc1
    667 	movzb	`&lo("$s2")`,$acc2
    668 	mov	0($sbox,$acc0,8),$t0
    669 	mov	0($sbox,$acc1,8),$t1
    670 	mov	0($sbox,$acc2,8),$t2
    671 
    672 	movzb	`&hi("$s3")`,$acc0
    673 	movzb	`&hi("$s0")`,$acc1
    674 	movzb	`&lo("$s3")`,$acc2
    675 	xor	3($sbox,$acc0,8),$t0
    676 	xor	3($sbox,$acc1,8),$t1
    677 	mov	0($sbox,$acc2,8),$t3
    678 
    679 	movzb	`&hi("$s1")`,$acc0
    680 	shr	\$16,$s0
    681 	movzb	`&hi("$s2")`,$acc2
    682 	xor	3($sbox,$acc0,8),$t2
    683 	shr	\$16,$s3
    684 	xor	3($sbox,$acc2,8),$t3
    685 
    686 	shr	\$16,$s1
    687 	lea	16($key),$key
    688 	shr	\$16,$s2
    689 
    690 	movzb	`&lo("$s2")`,$acc0
    691 	movzb	`&lo("$s3")`,$acc1
    692 	movzb	`&lo("$s0")`,$acc2
    693 	xor	2($sbox,$acc0,8),$t0
    694 	xor	2($sbox,$acc1,8),$t1
    695 	xor	2($sbox,$acc2,8),$t2
    696 
    697 	movzb	`&hi("$s1")`,$acc0
    698 	movzb	`&hi("$s2")`,$acc1
    699 	movzb	`&lo("$s1")`,$acc2
    700 	xor	1($sbox,$acc0,8),$t0
    701 	xor	1($sbox,$acc1,8),$t1
    702 	xor	2($sbox,$acc2,8),$t3
    703 
    704 	movzb	`&hi("$s3")`,$acc0
    705 	mov	12($key),$s3
    706 	movzb	`&hi("$s0")`,$acc2
    707 	xor	1($sbox,$acc0,8),$t2
    708 	mov	0($key),$s0
    709 	xor	1($sbox,$acc2,8),$t3
    710 
    711 	xor	$t0,$s0
    712 	mov	4($key),$s1
    713 	mov	8($key),$s2
    714 	xor	$t2,$s2
    715 	xor	$t1,$s1
    716 	xor	$t3,$s3
    717 ___
    718 }
    719 
        # Append the final decryption round (vertical form) to $code.
        # $sbox is advanced by 2048 for the duration of the sequence and put
        # back at the end; all table reads are single bytes at ($sbox,index,1),
        # moved into place with shl 8/16/24.  Output word i takes byte 0 of
        # state word i, byte 1 of word i-1, byte 2 of word i-2 and byte 3 of
        # word i-3 (mod 4).
        # The result is xored with the key words at 16+0..16+12($key) and left
        # in $s0..$s3; $key itself is not advanced here.
        # Scratch registers: $acc0-$acc2, $t0-$t2 and %r8d (low half of $inp).
    720 sub declastvert()
    721 { my $t3="%r8d";	# zaps $inp!
    722 
    723 $code.=<<___;
    724 	lea	2048($sbox),$sbox	# size optimization
    725 	movzb	`&lo("$s0")`,$acc0
    726 	movzb	`&lo("$s1")`,$acc1
    727 	movzb	`&lo("$s2")`,$acc2
    728 	movzb	($sbox,$acc0,1),$t0
    729 	movzb	($sbox,$acc1,1),$t1
    730 	movzb	($sbox,$acc2,1),$t2
    731 
    732 	movzb	`&lo("$s3")`,$acc0
    733 	movzb	`&hi("$s3")`,$acc1
    734 	movzb	`&hi("$s0")`,$acc2
    735 	movzb	($sbox,$acc0,1),$t3
    736 	movzb	($sbox,$acc1,1),$acc1	#$t0
    737 	movzb	($sbox,$acc2,1),$acc2	#$t1
    738 
    739 	shl	\$8,$acc1
    740 	shl	\$8,$acc2
    741 
    742 	xor	$acc1,$t0
    743 	xor	$acc2,$t1
    744 	shr	\$16,$s3
    745 
    746 	movzb	`&hi("$s1")`,$acc0
    747 	movzb	`&hi("$s2")`,$acc1
    748 	shr	\$16,$s0
    749 	movzb	($sbox,$acc0,1),$acc0	#$t2
    750 	movzb	($sbox,$acc1,1),$acc1	#$t3
    751 
    752 	shl	\$8,$acc0
    753 	shl	\$8,$acc1
    754 	shr	\$16,$s1
    755 	xor	$acc0,$t2
    756 	xor	$acc1,$t3
    757 	shr	\$16,$s2
    758 
    759 	movzb	`&lo("$s2")`,$acc0
    760 	movzb	`&lo("$s3")`,$acc1
    761 	movzb	`&lo("$s0")`,$acc2
    762 	movzb	($sbox,$acc0,1),$acc0	#$t0
    763 	movzb	($sbox,$acc1,1),$acc1	#$t1
    764 	movzb	($sbox,$acc2,1),$acc2	#$t2
    765 
    766 	shl	\$16,$acc0
    767 	shl	\$16,$acc1
    768 	shl	\$16,$acc2
    769 
    770 	xor	$acc0,$t0
    771 	xor	$acc1,$t1
    772 	xor	$acc2,$t2
    773 
    774 	movzb	`&lo("$s1")`,$acc0
    775 	movzb	`&hi("$s1")`,$acc1
    776 	movzb	`&hi("$s2")`,$acc2
    777 	movzb	($sbox,$acc0,1),$acc0	#$t3
    778 	movzb	($sbox,$acc1,1),$acc1	#$t0
    779 	movzb	($sbox,$acc2,1),$acc2	#$t1
    780 
    781 	shl	\$16,$acc0
    782 	shl	\$24,$acc1
    783 	shl	\$24,$acc2
    784 
    785 	xor	$acc0,$t3
    786 	xor	$acc1,$t0
    787 	xor	$acc2,$t1
    788 
    789 	movzb	`&hi("$s3")`,$acc0
    790 	movzb	`&hi("$s0")`,$acc1
    791 	mov	16+12($key),$s3
    792 	movzb	($sbox,$acc0,1),$acc0	#$t2
    793 	movzb	($sbox,$acc1,1),$acc1	#$t3
    794 	mov	16+0($key),$s0
    795 
    796 	shl	\$24,$acc0
    797 	shl	\$24,$acc1
    798 
    799 	xor	$acc0,$t2
    800 	xor	$acc1,$t3
    801 
    802 	mov	16+4($key),$s1
    803 	mov	16+8($key),$s2
    804 	lea	-2048($sbox),$sbox
    805 	xor	$t0,$s0
    806 	xor	$t1,$s1
    807 	xor	$t2,$s2
    808 	xor	$t3,$s3
    809 ___
    810 }
    811 
        # Append the code for one output word of a regular decryption round
        # (the alternative used when $verticalspin is off).
        #   $i - index 0..3 of the word being produced
        #   @s - the four state registers, ordered so that $s[0] supplies
        #        byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
        # Words 0..2 are built in $t0..$t2 with $acc0-$acc2 as scratch.
        # Word 3 is built in place in $s[0], using $s[1..3] themselves as
        # scratch; afterwards $t2, $t1 and $t0 are copied into $s[1], $s[2]
        # and $s[3] respectively.
        # No round key is applied and $key is not modified by this code.
        # The lexical $out below shadows the file-level $out register name.
    812 sub decstep()
    813 { my ($i,@s) = @_;
    814   my $tmp0=$acc0;
    815   my $tmp1=$acc1;
    816   my $tmp2=$acc2;
    817   my $out=($t0,$t1,$t2,$s[0])[$i];
    818 
    819 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    820 			$tmp1=$s[2]			if ($i==3);
    821 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    822 	$code.="	and	\$0xFF,$out\n";
    823 
    824 	$code.="	mov	0($sbox,$out,8),$out\n";
    825 	$code.="	shr	\$16,$tmp1\n";
    826 			$tmp2=$s[3]			if ($i==3);
    827 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    828 
    829 			$tmp0=$s[1]			if ($i==3);
    830 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    831 	$code.="	and	\$0xFF,$tmp1\n";
    832 	$code.="	shr	\$24,$tmp2\n";
    833 
    834 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    835 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    836 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    837 
    838 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    839 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    840 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    841 	$code.="\n";
    842 }
    843 
        # Append the code for one output word of the final decryption round
        # (used when $verticalspin is off).
        #   $i - index 0..3 of the word being produced
        #   @s - the four state registers, ordered so that $s[0] supplies
        #        byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
        # All four table reads are single bytes at 2048($sbox,index,1), moved
        # into their lanes with shl 8/16/24 and xored together.
        # Words 0..2 land in $t0..$t2; word 3 is built in place in $s[0] with
        # $s[1..3] as scratch, then $t2, $t1 and $t0 are copied into $s[1],
        # $s[2] and $s[3] respectively.
        # No round key is applied and $key is not modified by this code.
        # The lexical $out below shadows the file-level $out register name.
    844 sub declast()
    845 { my ($i,@s)=@_;
    846   my $tmp0=$acc0;
    847   my $tmp1=$acc1;
    848   my $tmp2=$acc2;
    849   my $out=($t0,$t1,$t2,$s[0])[$i];
    850 
    851 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    852 			$tmp1=$s[2]			if ($i==3);
    853 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    854 	$code.="	and	\$0xFF,$out\n";
    855 
    856 	$code.="	movzb	2048($sbox,$out,1),$out\n";
    857 	$code.="	shr	\$16,$tmp1\n";
    858 			$tmp2=$s[3]			if ($i==3);
    859 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    860 
    861 			$tmp0=$s[1]			if ($i==3);
    862 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    863 	$code.="	and	\$0xFF,$tmp1\n";
    864 	$code.="	shr	\$24,$tmp2\n";
    865 
    866 	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
    867 	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
    868 	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
    869 
    870 	$code.="	shl	\$8,$tmp0\n";
    871 	$code.="	shl	\$16,$tmp1\n";
    872 	$code.="	shl	\$24,$tmp2\n";
    873 
    874 	$code.="	xor	$tmp0,$out\n";
    875 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    876 	$code.="	xor	$tmp1,$out\n";
    877 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    878 	$code.="	xor	$tmp2,$out\n";
    879 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    880 	$code.="\n";
    881 }
    882 
        # Emit _x86_64_AES_decrypt, the table-driven block decryption routine.
        # The generated code xors the state in $s0..$s3 with the key words at
        # 0..12($key), loads the round count from 240($key) and decrements it,
        # then repeats the regular round body until the counter reaches zero
        # and finishes with the last-round sequence.
        # Round bodies come from decvert/declastvert when $verticalspin is set.
        # Otherwise four decstep/declast calls are used, each passing the
        # state registers in descending order starting one position later, and
        # the key handling is emitted explicitly here: advance $key by 16 and
        # xor 0..12($key) after a regular round, xor 16+0..16+12($key) after
        # the last one.
    883 $code.=<<___;
    884 .type	_x86_64_AES_decrypt,\@abi-omnipotent
    885 .align	16
    886 _x86_64_AES_decrypt:
    887 	xor	0($key),$s0			# xor with key
    888 	xor	4($key),$s1
    889 	xor	8($key),$s2
    890 	xor	12($key),$s3
    891 
    892 	mov	240($key),$rnds			# load key->rounds
    893 	sub	\$1,$rnds
    894 	jmp	.Ldec_loop
    895 .align	16
    896 .Ldec_loop:
    897 ___
    898 	if ($verticalspin) { &decvert(); }
    899 	else {	&decstep(0,$s0,$s3,$s2,$s1);
    900 		&decstep(1,$s1,$s0,$s3,$s2);
    901 		&decstep(2,$s2,$s1,$s0,$s3);
    902 		&decstep(3,$s3,$s2,$s1,$s0);
    903 		$code.=<<___;
    904 		lea	16($key),$key
    905 		xor	0($key),$s0			# xor with key
    906 		xor	4($key),$s1
    907 		xor	8($key),$s2
    908 		xor	12($key),$s3
    909 ___
    910 	}
    911 $code.=<<___;
    912 	sub	\$1,$rnds
    913 	jnz	.Ldec_loop
    914 ___
    915 	if ($verticalspin) { &declastvert(); }
    916 	else {	&declast(0,$s0,$s3,$s2,$s1);
    917 		&declast(1,$s1,$s0,$s3,$s2);
    918 		&declast(2,$s2,$s1,$s0,$s3);
    919 		&declast(3,$s3,$s2,$s1,$s0);
    920 		$code.=<<___;
    921 		xor	16+0($key),$s0			# xor with key
    922 		xor	16+4($key),$s1
    923 		xor	16+8($key),$s2
    924 		xor	16+12($key),$s3
    925 ___
    926 	}
    927 $code.=<<___;
    928 	.byte	0xf3,0xc3			# rep ret
    929 .size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
    930 ___
    931 
        # Append one "compact" decryption round to $code: sixteen single-byte
        # table reads at ($sbox,index,1), assembled into four words with
        # shl 8/16/24.  Output word i takes byte 0 of state word i, byte 1 of
        # word i-1, byte 2 of word i-2 and byte 3 of word i-3 (mod 4).
        # No round key is applied and $key is not modified by this code; the
        # results are left in $s0..$s3.
        # Scratch registers: $t0-$t2, $acc0-$acc2, %r8d, %r9d and %r13d.  The
        # last three are the low halves of $inp and $out and the register
        # named by $rnds, so those values do not survive this sequence.
    932 sub deccompactvert()
    933 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    934 
    935 $code.=<<___;
    936 	movzb	`&lo("$s0")`,$t0
    937 	movzb	`&lo("$s1")`,$t1
    938 	movzb	`&lo("$s2")`,$t2
    939 	movzb	($sbox,$t0,1),$t0
    940 	movzb	($sbox,$t1,1),$t1
    941 	movzb	($sbox,$t2,1),$t2
    942 
    943 	movzb	`&lo("$s3")`,$t3
    944 	movzb	`&hi("$s3")`,$acc0
    945 	movzb	`&hi("$s0")`,$acc1
    946 	movzb	($sbox,$t3,1),$t3
    947 	movzb	($sbox,$acc0,1),$t4	#$t0
    948 	movzb	($sbox,$acc1,1),$t5	#$t1
    949 
    950 	movzb	`&hi("$s1")`,$acc2
    951 	movzb	`&hi("$s2")`,$acc0
    952 	shr	\$16,$s2
    953 	movzb	($sbox,$acc2,1),$acc2	#$t2
    954 	movzb	($sbox,$acc0,1),$acc0	#$t3
    955 	shr	\$16,$s3
    956 
    957 	movzb	`&lo("$s2")`,$acc1
    958 	shl	\$8,$t4
    959 	shl	\$8,$t5
    960 	movzb	($sbox,$acc1,1),$acc1	#$t0
    961 	xor	$t4,$t0
    962 	xor	$t5,$t1
    963 
    964 	movzb	`&lo("$s3")`,$t4
    965 	shr	\$16,$s0
    966 	shr	\$16,$s1
    967 	movzb	`&lo("$s0")`,$t5
    968 	shl	\$8,$acc2
    969 	shl	\$8,$acc0
    970 	movzb	($sbox,$t4,1),$t4	#$t1
    971 	movzb	($sbox,$t5,1),$t5	#$t2
    972 	xor	$acc2,$t2
    973 	xor	$acc0,$t3
    974 
    975 	movzb	`&lo("$s1")`,$acc2
    976 	movzb	`&hi("$s1")`,$acc0
    977 	shl	\$16,$acc1
    978 	movzb	($sbox,$acc2,1),$acc2	#$t3
    979 	movzb	($sbox,$acc0,1),$acc0	#$t0
    980 	xor	$acc1,$t0
    981 
    982 	movzb	`&hi("$s2")`,$acc1
    983 	shl	\$16,$t4
    984 	shl	\$16,$t5
    985 	movzb	($sbox,$acc1,1),$s1	#$t1
    986 	xor	$t4,$t1
    987 	xor	$t5,$t2
    988 
    989 	movzb	`&hi("$s3")`,$acc1
    990 	shr	\$8,$s0
    991 	shl	\$16,$acc2
    992 	movzb	($sbox,$acc1,1),$s2	#$t2
    993 	movzb	($sbox,$s0,1),$s3	#$t3
    994 	xor	$acc2,$t3
    995 
    996 	shl	\$24,$acc0
    997 	shl	\$24,$s1
    998 	shl	\$24,$s2
    999 	xor	$acc0,$t0
   1000 	shl	\$24,$s3
   1001 	xor	$t1,$s1
   1002 	mov	$t0,$s0
   1003 	xor	$t2,$s2
   1004 	xor	$t3,$s3
   1005 ___
   1006 }
   1007 
   1008 # parallelized version! input is pair of 64-bit values: %rax=s1.s0
   1009 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
   1010 # %ecx=s2 and %edx=s3.
        #
        # Append the decryption column transform to $code.
        #   $prefetch - interpolated into the "if (...)" guards of the five
        #               backquoted "mov N($sbox),reg" fragments near the end
        # Both 64-bit lanes are processed with interleaved instructions.  For
        # each lane three successive byte-wise doublings (tp2, tp4, tp8) are
        # formed using the masks held in $mask80, $maskfe and $mask1b, combined
        # as the inline comments describe, then rotated by 8, 24 and 16 bits
        # on 32-bit halves.
        # The emitted code reads those three mask registers but never sets
        # them.  The guarded fragments use $mask80, $maskfe, $mask1b, %r10 and
        # %r13 as load destinations.
        # The lexical $acc0 (%rbx) below shadows the file-level $acc0 (%esi).
        # Scratch registers: %r8-%r13 in addition to the four in/out registers.
        # NOTE(review): the backquoted fragments are stored in $code as text;
        # presumably a later pass outside this view evaluates them - confirm.
   1011 sub dectransform()
   1012 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
   1013   my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
   1014   my $prefetch = shift;
   1015 
   1016 $code.=<<___;
   1017 	mov	$tp10,$acc0
   1018 	mov	$tp18,$acc8
   1019 	and	$mask80,$acc0
   1020 	and	$mask80,$acc8
   1021 	mov	$acc0,$tp40
   1022 	mov	$acc8,$tp48
   1023 	shr	\$7,$tp40
   1024 	lea	($tp10,$tp10),$tp20
   1025 	shr	\$7,$tp48
   1026 	lea	($tp18,$tp18),$tp28
   1027 	sub	$tp40,$acc0
   1028 	sub	$tp48,$acc8
   1029 	and	$maskfe,$tp20
   1030 	and	$maskfe,$tp28
   1031 	and	$mask1b,$acc0
   1032 	and	$mask1b,$acc8
   1033 	xor	$tp20,$acc0
   1034 	xor	$tp28,$acc8
   1035 	mov	$acc0,$tp20
   1036 	mov	$acc8,$tp28
   1037 
   1038 	and	$mask80,$acc0
   1039 	and	$mask80,$acc8
   1040 	mov	$acc0,$tp80
   1041 	mov	$acc8,$tp88
   1042 	shr	\$7,$tp80
   1043 	lea	($tp20,$tp20),$tp40
   1044 	shr	\$7,$tp88
   1045 	lea	($tp28,$tp28),$tp48
   1046 	sub	$tp80,$acc0
   1047 	sub	$tp88,$acc8
   1048 	and	$maskfe,$tp40
   1049 	and	$maskfe,$tp48
   1050 	and	$mask1b,$acc0
   1051 	and	$mask1b,$acc8
   1052 	xor	$tp40,$acc0
   1053 	xor	$tp48,$acc8
   1054 	mov	$acc0,$tp40
   1055 	mov	$acc8,$tp48
   1056 
   1057 	and	$mask80,$acc0
   1058 	and	$mask80,$acc8
   1059 	mov	$acc0,$tp80
   1060 	mov	$acc8,$tp88
   1061 	shr	\$7,$tp80
   1062 	 xor	$tp10,$tp20		# tp2^=tp1
   1063 	shr	\$7,$tp88
   1064 	 xor	$tp18,$tp28		# tp2^=tp1
   1065 	sub	$tp80,$acc0
   1066 	sub	$tp88,$acc8
   1067 	lea	($tp40,$tp40),$tp80
   1068 	lea	($tp48,$tp48),$tp88
   1069 	 xor	$tp10,$tp40		# tp4^=tp1
   1070 	 xor	$tp18,$tp48		# tp4^=tp1
   1071 	and	$maskfe,$tp80
   1072 	and	$maskfe,$tp88
   1073 	and	$mask1b,$acc0
   1074 	and	$mask1b,$acc8
   1075 	xor	$acc0,$tp80
   1076 	xor	$acc8,$tp88
   1077 
   1078 	xor	$tp80,$tp10		# tp1^=tp8
   1079 	xor	$tp88,$tp18		# tp1^=tp8
   1080 	xor	$tp80,$tp20		# tp2^tp1^=tp8
   1081 	xor	$tp88,$tp28		# tp2^tp1^=tp8
   1082 	mov	$tp10,$acc0
   1083 	mov	$tp18,$acc8
   1084 	xor	$tp80,$tp40		# tp4^tp1^=tp8
   1085 	xor	$tp88,$tp48		# tp4^tp1^=tp8
   1086 	shr	\$32,$acc0
   1087 	shr	\$32,$acc8
   1088 	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
   1089 	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
   1090 	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
   1091 	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
   1092 	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1093 	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1094 
   1095 	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
   1096 	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
   1097 	xor	`&LO("$tp80")`,`&LO("$tp10")`
   1098 	xor	`&LO("$tp88")`,`&LO("$tp18")`
   1099 	shr	\$32,$tp80
   1100 	shr	\$32,$tp88
   1101 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1102 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1103 
   1104 	mov	$tp20,$tp80
   1105 	mov	$tp28,$tp88
   1106 	shr	\$32,$tp80
   1107 	shr	\$32,$tp88
   1108 	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
   1109 	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
   1110 	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
   1111 	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
   1112 	xor	`&LO("$tp20")`,`&LO("$tp10")`
   1113 	xor	`&LO("$tp28")`,`&LO("$tp18")`
   1114 	mov	$tp40,$tp20
   1115 	mov	$tp48,$tp28
   1116 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1117 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1118 
   1119 	`"mov	0($sbox),$mask80"	if ($prefetch)`
   1120 	shr	\$32,$tp20
   1121 	shr	\$32,$tp28
   1122 	`"mov	64($sbox),$maskfe"	if ($prefetch)`
   1123 	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
   1124 	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
   1125 	`"mov	128($sbox),$mask1b"	if ($prefetch)`
   1126 	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
   1127 	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
   1128 	`"mov	192($sbox),$tp80"	if ($prefetch)`
   1129 	xor	`&LO("$tp40")`,`&LO("$tp10")`
   1130 	xor	`&LO("$tp48")`,`&LO("$tp18")`
   1131 	`"mov	256($sbox),$tp88"	if ($prefetch)`
   1132 	xor	`&LO("$tp20")`,`&LO("$acc0")`
   1133 	xor	`&LO("$tp28")`,`&LO("$acc8")`
   1134 ___
   1135 }
   1136 
        # Emits _x86_64_AES_decrypt_compact, the internal decryption routine
        # called by AES_decrypt and by the slow CBC decrypt path.  The
        # generated code first touches the table at $sbox with eight loads
        # spaced 32 bytes apart (marked "prefetch Td4"), then loops: XOR the
        # state words $s0..$s3 with 16 bytes of round key, advance $key, run
        # the code appended by deccompactvert(), and leave the loop once $key
        # equals the value found at 16(%rsp) inside the routine; otherwise
        # continue with the code appended by dectransform(1).  After the loop
        # one more round key is XORed into the state before returning.
   1137 $code.=<<___;
   1138 .type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
   1139 .align	16
   1140 _x86_64_AES_decrypt_compact:
   1141 	lea	128($sbox),$inp			# size optimization
   1142 	mov	0-128($inp),$acc1		# prefetch Td4
   1143 	mov	32-128($inp),$acc2
   1144 	mov	64-128($inp),$t0
   1145 	mov	96-128($inp),$t1
   1146 	mov	128-128($inp),$acc1
   1147 	mov	160-128($inp),$acc2
   1148 	mov	192-128($inp),$t0
   1149 	mov	224-128($inp),$t1
   1150 	jmp	.Ldec_loop_compact
   1151 
   1152 .align	16
   1153 .Ldec_loop_compact:
   1154 		xor	0($key),$s0		# xor with key
   1155 		xor	4($key),$s1
   1156 		xor	8($key),$s2
   1157 		xor	12($key),$s3
   1158 		lea	16($key),$key
   1159 ___
        		# deccompactvert() is defined outside this part of the file.
   1160 		&deccompactvert();
   1161 $code.=<<___;
   1162 		cmp	16(%rsp),$key
   1163 		je	.Ldec_compact_done
   1164 
   1165 		mov	256+0($sbox),$mask80
   1166 		shl	\$32,%rbx
   1167 		shl	\$32,%rdx
   1168 		mov	256+8($sbox),$maskfe
   1169 		or	%rbx,%rax
   1170 		or	%rdx,%rcx
   1171 		mov	256+16($sbox),$mask1b
   1172 ___
        		# The visible tail of dectransform() emits extra
        		# "mov N($sbox),..." loads when its $prefetch flag is set;
        		# presumably the argument 1 selects that variant -- TODO
        		# confirm against the sub's header.
   1173 		&dectransform(1);
   1174 $code.=<<___;
   1175 	jmp	.Ldec_loop_compact
   1176 .align	16
   1177 .Ldec_compact_done:
   1178 	xor	0($key),$s0
   1179 	xor	4($key),$s1
   1180 	xor	8($key),$s2
   1181 	xor	12($key),$s3
   1182 	.byte	0xf3,0xc3			# rep ret
   1183 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
   1184 ___
   1185 
   1186 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Emits the public AES_decrypt entry point together with the hidden
        # alias asm_AES_decrypt.  The generated code pushes %rbx, %rbp and
        # %r12-%r15, builds a 64-byte aligned frame whose position is adjusted
        # relative to the key-schedule address, and fills it as follows:
        #    0(%rsp)  key schedule pointer
        #    8(%rsp)  end of key schedule (key + 16*rounds, rounds read from
        #             offset 240 of the key structure)
        #   16(%rsp)  saved out pointer
        #   24(%rsp)  stack pointer value right after the six pushes
        # It then selects an address inside .LAES_Td for $sbox, calls
        # _x86_64_AES_decrypt_compact, stores the four state words to out and
        # restores the saved registers from the original stack.
   1187 $code.=<<___;
   1188 .globl	AES_decrypt
   1189 .type	AES_decrypt,\@function,3
   1190 .align	16
   1191 .globl	asm_AES_decrypt
   1192 .hidden	asm_AES_decrypt
   1193 asm_AES_decrypt:
   1194 AES_decrypt:
   1195 	push	%rbx
   1196 	push	%rbp
   1197 	push	%r12
   1198 	push	%r13
   1199 	push	%r14
   1200 	push	%r15
   1201 
   1202 	# allocate frame "above" key schedule
   1203 	mov	%rsp,%r10
   1204 	lea	-63(%rdx),%rcx	# %rdx is key argument
   1205 	and	\$-64,%rsp
   1206 	sub	%rsp,%rcx
   1207 	neg	%rcx
   1208 	and	\$0x3c0,%rcx
   1209 	sub	%rcx,%rsp
   1210 	sub	\$32,%rsp
   1211 
   1212 	mov	%rsi,16(%rsp)	# save out
   1213 	mov	%r10,24(%rsp)	# save real stack pointer
   1214 .Ldec_prologue:
   1215 
   1216 	mov	%rdx,$key
   1217 	mov	240($key),$rnds	# load rounds
   1218 
   1219 	mov	0(%rdi),$s0	# load input vector
   1220 	mov	4(%rdi),$s1
   1221 	mov	8(%rdi),$s2
   1222 	mov	12(%rdi),$s3
   1223 
   1224 	shl	\$4,$rnds
   1225 	lea	($key,$rnds),%rbp
   1226 	mov	$key,(%rsp)	# key schedule
   1227 	mov	%rbp,8(%rsp)	# end of key schedule
   1228 
   1229 	# pick Td4 copy which can't "overlap" with stack frame or key schedule
   1230 	lea	.LAES_Td+2048(%rip),$sbox
   1231 	lea	768(%rsp),%rbp
   1232 	sub	$sbox,%rbp
   1233 	and	\$0x300,%rbp
   1234 	lea	($sbox,%rbp),$sbox
   1235 	shr	\$3,%rbp	# recall "magic" constants!
   1236 	add	%rbp,$sbox
   1237 
   1238 	call	_x86_64_AES_decrypt_compact
   1239 
   1240 	mov	16(%rsp),$out	# restore out
   1241 	mov	24(%rsp),%rsi	# restore saved stack pointer
   1242 	mov	$s0,0($out)	# write output vector
   1243 	mov	$s1,4($out)
   1244 	mov	$s2,8($out)
   1245 	mov	$s3,12($out)
   1246 
   1247 	mov	(%rsi),%r15
   1248 	mov	8(%rsi),%r14
   1249 	mov	16(%rsi),%r13
   1250 	mov	24(%rsi),%r12
   1251 	mov	32(%rsi),%rbp
   1252 	mov	40(%rsi),%rbx
   1253 	lea	48(%rsi),%rsp
   1254 .Ldec_epilogue:
   1255 	ret
   1256 .size	AES_decrypt,.-AES_decrypt
   1257 ___
   1258 #------------------------------------------------------------------#
   1259 
   1260 sub enckey()
   1261 {
        # Appends one key-expansion step to $code; takes no arguments.
        # The emitted code uses each byte of %edx as an index into the byte
        # table addressed as -128(%rbp,index), shifts the four looked-up
        # bytes left by 24, 0, 8 and 16 bits (for source bytes 0, 1, 2, 3
        # respectively) and XORs them into %eax, then XORs in the 32-bit word
        # at 1024-128(%rbp,%rcx,4), marked "rcon".  In the generated code
        # %ebx and %esi are overwritten and %edx is shifted right by 16.
   1262 $code.=<<___;
   1263 	movz	%dl,%esi		# rk[i]>>0
   1264 	movzb	-128(%rbp,%rsi),%ebx
   1265 	movz	%dh,%esi		# rk[i]>>8
   1266 	shl	\$24,%ebx
   1267 	xor	%ebx,%eax
   1268 
   1269 	movzb	-128(%rbp,%rsi),%ebx
   1270 	shr	\$16,%edx
   1271 	movz	%dl,%esi		# rk[i]>>16
   1272 	xor	%ebx,%eax
   1273 
   1274 	movzb	-128(%rbp,%rsi),%ebx
   1275 	movz	%dh,%esi		# rk[i]>>24
   1276 	shl	\$8,%ebx
   1277 	xor	%ebx,%eax
   1278 
   1279 	movzb	-128(%rbp,%rsi),%ebx
   1280 	shl	\$16,%ebx
   1281 	xor	%ebx,%eax
   1282 
   1283 	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
   1284 ___
   1285 }
   1286 
   1287 # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
   1288 #                        AES_KEY *key)
        #
        # Emits two routines.
        #
        # private_AES_set_encrypt_key is a wrapper: it pushes %rbx, %rbp and
        # %r12-%r15, reserves 8 more bytes, calls the worker and restores the
        # registers from the stack.
        #
        # _x86_64_AES_set_encrypt_key does the work.  It returns -1 in %rax
        # when either pointer is zero and -2 when bits is not 128, 192 or 256.
        # Otherwise it copies the user key into the schedule, expands it with
        # repeated enckey() steps (10, 8 and 7 passes through the respective
        # loop bodies), stores the round count 10, 12 or 14 and returns 0.
        # The round count lands at byte offset 240 from the start of the
        # schedule: %rdi has advanced by 10*16, 7*24 or 6*32 bytes when the
        # stores to 80(%rdi), 72(%rdi) and 48(%rdi) execute.
   1289 $code.=<<___;
   1290 .globl	private_AES_set_encrypt_key
   1291 .type	private_AES_set_encrypt_key,\@function,3
   1292 .align	16
   1293 private_AES_set_encrypt_key:
   1294 	push	%rbx
   1295 	push	%rbp
   1296 	push	%r12			# redundant, but allows to share 
   1297 	push	%r13			# exception handler...
   1298 	push	%r14
   1299 	push	%r15
   1300 	sub	\$8,%rsp
   1301 .Lenc_key_prologue:
   1302 
   1303 	call	_x86_64_AES_set_encrypt_key
   1304 
   1305 	mov	8(%rsp),%r15
   1306 	mov	16(%rsp),%r14
   1307 	mov	24(%rsp),%r13
   1308 	mov	32(%rsp),%r12
   1309 	mov	40(%rsp),%rbp
   1310 	mov	48(%rsp),%rbx
   1311 	add	\$56,%rsp
   1312 .Lenc_key_epilogue:
   1313 	ret
   1314 .size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
   1315 
   1316 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
   1317 .align	16
   1318 _x86_64_AES_set_encrypt_key:
   1319 	mov	%esi,%ecx			# %ecx=bits
   1320 	mov	%rdi,%rsi			# %rsi=userKey
   1321 	mov	%rdx,%rdi			# %rdi=key
   1322 
   1323 	test	\$-1,%rsi
   1324 	jz	.Lbadpointer
   1325 	test	\$-1,%rdi
   1326 	jz	.Lbadpointer
   1327 
   1328 	lea	.LAES_Te(%rip),%rbp
   1329 	lea	2048+128(%rbp),%rbp
   1330 
   1331 	# prefetch Te4
   1332 	mov	0-128(%rbp),%eax
   1333 	mov	32-128(%rbp),%ebx
   1334 	mov	64-128(%rbp),%r8d
   1335 	mov	96-128(%rbp),%edx
   1336 	mov	128-128(%rbp),%eax
   1337 	mov	160-128(%rbp),%ebx
   1338 	mov	192-128(%rbp),%r8d
   1339 	mov	224-128(%rbp),%edx
   1340 
   1341 	cmp	\$128,%ecx
   1342 	je	.L10rounds
   1343 	cmp	\$192,%ecx
   1344 	je	.L12rounds
   1345 	cmp	\$256,%ecx
   1346 	je	.L14rounds
   1347 	mov	\$-2,%rax			# invalid number of bits
   1348 	jmp	.Lexit
   1349 
   1350 .L10rounds:
   1351 	mov	0(%rsi),%rax			# copy first 4 dwords
   1352 	mov	8(%rsi),%rdx
   1353 	mov	%rax,0(%rdi)
   1354 	mov	%rdx,8(%rdi)
   1355 
   1356 	shr	\$32,%rdx
   1357 	xor	%ecx,%ecx
   1358 	jmp	.L10shortcut
   1359 .align	4
   1360 .L10loop:
   1361 		mov	0(%rdi),%eax			# rk[0]
   1362 		mov	12(%rdi),%edx			# rk[3]
   1363 .L10shortcut:
   1364 ___
        		# 128-bit key: one enckey() step per 16-byte round key.
   1365 		&enckey	();
   1366 $code.=<<___;
   1367 		mov	%eax,16(%rdi)			# rk[4]
   1368 		xor	4(%rdi),%eax
   1369 		mov	%eax,20(%rdi)			# rk[5]
   1370 		xor	8(%rdi),%eax
   1371 		mov	%eax,24(%rdi)			# rk[6]
   1372 		xor	12(%rdi),%eax
   1373 		mov	%eax,28(%rdi)			# rk[7]
   1374 		add	\$1,%ecx
   1375 		lea	16(%rdi),%rdi
   1376 		cmp	\$10,%ecx
   1377 	jl	.L10loop
   1378 
   1379 	movl	\$10,80(%rdi)			# setup number of rounds
   1380 	xor	%rax,%rax
   1381 	jmp	.Lexit
   1382 
   1383 .L12rounds:
   1384 	mov	0(%rsi),%rax			# copy first 6 dwords
   1385 	mov	8(%rsi),%rbx
   1386 	mov	16(%rsi),%rdx
   1387 	mov	%rax,0(%rdi)
   1388 	mov	%rbx,8(%rdi)
   1389 	mov	%rdx,16(%rdi)
   1390 
   1391 	shr	\$32,%rdx
   1392 	xor	%ecx,%ecx
   1393 	jmp	.L12shortcut
   1394 .align	4
   1395 .L12loop:
   1396 		mov	0(%rdi),%eax			# rk[0]
   1397 		mov	20(%rdi),%edx			# rk[5]
   1398 .L12shortcut:
   1399 ___
        		# 192-bit key: one enckey() step per 24 bytes of schedule.
   1400 		&enckey	();
   1401 $code.=<<___;
   1402 		mov	%eax,24(%rdi)			# rk[6]
   1403 		xor	4(%rdi),%eax
   1404 		mov	%eax,28(%rdi)			# rk[7]
   1405 		xor	8(%rdi),%eax
   1406 		mov	%eax,32(%rdi)			# rk[8]
   1407 		xor	12(%rdi),%eax
   1408 		mov	%eax,36(%rdi)			# rk[9]
   1409 
   1410 		cmp	\$7,%ecx
   1411 		je	.L12break
   1412 		add	\$1,%ecx
   1413 
   1414 		xor	16(%rdi),%eax
   1415 		mov	%eax,40(%rdi)			# rk[10]
   1416 		xor	20(%rdi),%eax
   1417 		mov	%eax,44(%rdi)			# rk[11]
   1418 
   1419 		lea	24(%rdi),%rdi
   1420 	jmp	.L12loop
   1421 .L12break:
   1422 	movl	\$12,72(%rdi)		# setup number of rounds
   1423 	xor	%rax,%rax
   1424 	jmp	.Lexit
   1425 
   1426 .L14rounds:		
   1427 	mov	0(%rsi),%rax			# copy first 8 dwords
   1428 	mov	8(%rsi),%rbx
   1429 	mov	16(%rsi),%rcx
   1430 	mov	24(%rsi),%rdx
   1431 	mov	%rax,0(%rdi)
   1432 	mov	%rbx,8(%rdi)
   1433 	mov	%rcx,16(%rdi)
   1434 	mov	%rdx,24(%rdi)
   1435 
   1436 	shr	\$32,%rdx
   1437 	xor	%ecx,%ecx
   1438 	jmp	.L14shortcut
   1439 .align	4
   1440 .L14loop:
   1441 		mov	0(%rdi),%eax			# rk[0]
   1442 		mov	28(%rdi),%edx			# rk[4]
   1443 .L14shortcut:
   1444 ___
        		# 256-bit key: an enckey() step, then (in the text that
        		# follows) a second table-lookup step without rotation or
        		# rcon for the upper half of each 32-byte group.
        		# NOTE(review): in the text just above, the load from
        		# 28(%rdi) is annotated "rk[4]"; byte offset 28 is word 7.
   1445 		&enckey	();
   1446 $code.=<<___;
   1447 		mov	%eax,32(%rdi)			# rk[8]
   1448 		xor	4(%rdi),%eax
   1449 		mov	%eax,36(%rdi)			# rk[9]
   1450 		xor	8(%rdi),%eax
   1451 		mov	%eax,40(%rdi)			# rk[10]
   1452 		xor	12(%rdi),%eax
   1453 		mov	%eax,44(%rdi)			# rk[11]
   1454 
   1455 		cmp	\$6,%ecx
   1456 		je	.L14break
   1457 		add	\$1,%ecx
   1458 
   1459 		mov	%eax,%edx
   1460 		mov	16(%rdi),%eax			# rk[4]
   1461 		movz	%dl,%esi			# rk[11]>>0
   1462 		movzb	-128(%rbp,%rsi),%ebx
   1463 		movz	%dh,%esi			# rk[11]>>8
   1464 		xor	%ebx,%eax
   1465 
   1466 		movzb	-128(%rbp,%rsi),%ebx
   1467 		shr	\$16,%edx
   1468 		shl	\$8,%ebx
   1469 		movz	%dl,%esi			# rk[11]>>16
   1470 		xor	%ebx,%eax
   1471 
   1472 		movzb	-128(%rbp,%rsi),%ebx
   1473 		movz	%dh,%esi			# rk[11]>>24
   1474 		shl	\$16,%ebx
   1475 		xor	%ebx,%eax
   1476 
   1477 		movzb	-128(%rbp,%rsi),%ebx
   1478 		shl	\$24,%ebx
   1479 		xor	%ebx,%eax
   1480 
   1481 		mov	%eax,48(%rdi)			# rk[12]
   1482 		xor	20(%rdi),%eax
   1483 		mov	%eax,52(%rdi)			# rk[13]
   1484 		xor	24(%rdi),%eax
   1485 		mov	%eax,56(%rdi)			# rk[14]
   1486 		xor	28(%rdi),%eax
   1487 		mov	%eax,60(%rdi)			# rk[15]
   1488 
   1489 		lea	32(%rdi),%rdi
   1490 	jmp	.L14loop
   1491 .L14break:
   1492 	movl	\$14,48(%rdi)		# setup number of rounds
   1493 	xor	%rax,%rax
   1494 	jmp	.Lexit
   1495 
   1496 .Lbadpointer:
   1497 	mov	\$-1,%rax
   1498 .Lexit:
   1499 	.byte	0xf3,0xc3			# rep ret
   1500 .size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
   1501 ___
   1502 
   1503 sub deckey_ref()
   1504 { my ($i,$ptr,$te,$td) = @_;
   1505   my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
        # Appends code that transforms, in place, the 32-bit word at
        # $i($ptr) using 32-bit registers only.  The emitted code derives
        # tp2, tp4 and tp8 from the loaded word by three successive packed
        # multiply-by-2 steps (the 0x80808080 / 0xfefefefe / 0x1b1b1b1b
        # masking sequence), combines them with rotations by 8, 24 and 16
        # bits as spelled out in the in-line remarks, and stores the result
        # back to $i($ptr).
        # NOTE(review): the sub is declared with an empty prototype "()" yet
        # unpacks four arguments, and $te/$td are not referenced in the
        # body.  No call to it is visible in this part of the file.
   1506 $code.=<<___;
   1507 	mov	$i($ptr),$tp1
   1508 	mov	$tp1,$acc
   1509 	and	\$0x80808080,$acc
   1510 	mov	$acc,$tp4
   1511 	shr	\$7,$tp4
   1512 	lea	0($tp1,$tp1),$tp2
   1513 	sub	$tp4,$acc
   1514 	and	\$0xfefefefe,$tp2
   1515 	and	\$0x1b1b1b1b,$acc
   1516 	xor	$tp2,$acc
   1517 	mov	$acc,$tp2
   1518 
   1519 	and	\$0x80808080,$acc
   1520 	mov	$acc,$tp8
   1521 	shr	\$7,$tp8
   1522 	lea	0($tp2,$tp2),$tp4
   1523 	sub	$tp8,$acc
   1524 	and	\$0xfefefefe,$tp4
   1525 	and	\$0x1b1b1b1b,$acc
   1526 	 xor	$tp1,$tp2		# tp2^tp1
   1527 	xor	$tp4,$acc
   1528 	mov	$acc,$tp4
   1529 
   1530 	and	\$0x80808080,$acc
   1531 	mov	$acc,$tp8
   1532 	shr	\$7,$tp8
   1533 	sub	$tp8,$acc
   1534 	lea	0($tp4,$tp4),$tp8
   1535 	 xor	$tp1,$tp4		# tp4^tp1
   1536 	and	\$0xfefefefe,$tp8
   1537 	and	\$0x1b1b1b1b,$acc
   1538 	xor	$acc,$tp8
   1539 
   1540 	xor	$tp8,$tp1		# tp1^tp8
   1541 	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
   1542 	xor	$tp8,$tp2		# tp2^tp1^tp8
   1543 	xor	$tp8,$tp4		# tp4^tp1^tp8
   1544 	xor	$tp2,$tp8
   1545 	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
   1546 
   1547 	xor	$tp8,$tp1
   1548 	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
   1549 	xor	$tp2,$tp1
   1550 	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
   1551 	xor	$tp4,$tp1
   1552 
   1553 	mov	$tp1,$i($ptr)
   1554 ___
   1555 }
   1556 
   1557 # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
   1558 #                        AES_KEY *key)
        #
        # Emits private_AES_set_decrypt_key.  The generated code saves %rbx,
        # %rbp, %r12-%r15 and the key pointer (%rdx), calls
        # _x86_64_AES_set_encrypt_key and jumps to .Labort, keeping that
        # routine's return value, when it is non-zero.  Otherwise it
        #   1. reads the round count from offset 240 of the schedule,
        #   2. swaps 16-byte round keys end-for-end until the two cursors
        #      meet (.Linvert),
        #   3. loads the three mask constants from .LAES_Te+2048+1024 at
        #      offsets 40, 48 and 56, and
        #   4. rewrites round keys 1 .. rounds-1 with the code appended by
        #      dectransform() (.Lpermute); the first and last round keys
        #      are left as they are,
        # then returns 0 in %rax.
   1559 $code.=<<___;
   1560 .globl	private_AES_set_decrypt_key
   1561 .type	private_AES_set_decrypt_key,\@function,3
   1562 .align	16
   1563 private_AES_set_decrypt_key:
   1564 	push	%rbx
   1565 	push	%rbp
   1566 	push	%r12
   1567 	push	%r13
   1568 	push	%r14
   1569 	push	%r15
   1570 	push	%rdx			# save key schedule
   1571 .Ldec_key_prologue:
   1572 
   1573 	call	_x86_64_AES_set_encrypt_key
   1574 	mov	(%rsp),%r8		# restore key schedule
   1575 	cmp	\$0,%eax
   1576 	jne	.Labort
   1577 
   1578 	mov	240(%r8),%r14d		# pull number of rounds
   1579 	xor	%rdi,%rdi
   1580 	lea	(%rdi,%r14d,4),%rcx
   1581 	mov	%r8,%rsi
   1582 	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
   1583 .align	4
   1584 .Linvert:
   1585 		mov	0(%rsi),%rax
   1586 		mov	8(%rsi),%rbx
   1587 		mov	0(%rdi),%rcx
   1588 		mov	8(%rdi),%rdx
   1589 		mov	%rax,0(%rdi)
   1590 		mov	%rbx,8(%rdi)
   1591 		mov	%rcx,0(%rsi)
   1592 		mov	%rdx,8(%rsi)
   1593 		lea	16(%rsi),%rsi
   1594 		lea	-16(%rdi),%rdi
   1595 		cmp	%rsi,%rdi
   1596 	jne	.Linvert
   1597 
   1598 	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
   1599 
   1600 	mov	40(%rax),$mask80
   1601 	mov	48(%rax),$maskfe
   1602 	mov	56(%rax),$mask1b
   1603 
   1604 	mov	%r8,$key
   1605 	sub	\$1,%r14d
   1606 .align	4
   1607 .Lpermute:
   1608 		lea	16($key),$key
   1609 		mov	0($key),%rax
   1610 		mov	8($key),%rcx
   1611 ___
        		# Called without an argument here, unlike the
        		# dectransform(1) call in the compact decrypt loop.
   1612 		&dectransform ();
   1613 $code.=<<___;
   1614 		mov	%eax,0($key)
   1615 		mov	%ebx,4($key)
   1616 		mov	%ecx,8($key)
   1617 		mov	%edx,12($key)
   1618 		sub	\$1,%r14d
   1619 	jnz	.Lpermute
   1620 
   1621 	xor	%rax,%rax
   1622 .Labort:
   1623 	mov	8(%rsp),%r15
   1624 	mov	16(%rsp),%r14
   1625 	mov	24(%rsp),%r13
   1626 	mov	32(%rsp),%r12
   1627 	mov	40(%rsp),%rbp
   1628 	mov	48(%rsp),%rbx
   1629 	add	\$56,%rsp
   1630 .Ldec_key_epilogue:
   1631 	ret
   1632 .size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
   1633 ___
   1634 
   1635 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
   1636 #			size_t length, const AES_KEY *key,
   1637 #			unsigned char *ivp,const int enc);
        #
        # Emits AES_cbc_encrypt (plus the hidden alias asm_AES_cbc_encrypt).
        # Outline of the generated code, as far as the text below shows:
        #  - length 0 returns immediately, before anything is pushed;
        #  - $sbox points at .LAES_Te when enc is non-zero, else at .LAES_Td;
        #  - the fast path is taken only when length >= $speed_limit, length
        #    is a multiple of 16 and bit 28 of OPENSSL_ia32cap_P is clear.
        #    It calls _x86_64_AES_encrypt / _x86_64_AES_decrypt and, depending
        #    on where the key schedule sits relative to the tables modulo
        #    4096, first copies the schedule into the frame ($aes_key); that
        #    copy is zeroed in .Lcbc_fast_cleanup when $mark is non-zero;
        #  - otherwise the slow path calls the *_compact routines.  On
        #    encryption a trailing partial block is copied to out and
        #    zero-padded to 16 bytes before one more pass; on decryption only
        #    the remaining bytes of the last block are copied to out;
        #  - all paths leave through .Lcbc_exit, which restores the saved
        #    registers and flags from the original stack.
   1638 {
   1639 # stack frame layout
   1640 # -8(%rsp)		return address
   1641 my $keyp="0(%rsp)";		# one to pass as $key
   1642 my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
   1643 my $_rsp="16(%rsp)";		# saved %rsp
   1644 my $_inp="24(%rsp)";		# copy of 1st parameter, inp
   1645 my $_out="32(%rsp)";		# copy of 2nd parameter, out
   1646 my $_len="40(%rsp)";		# copy of 3rd parameter, length
   1647 my $_key="48(%rsp)";		# copy of 4th parameter, key
   1648 my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
   1649 my $ivec="64(%rsp)";		# ivec[16]
   1650 my $aes_key="80(%rsp)";		# copy of aes_key
   1651 my $mark="80+240(%rsp)";	# copy of aes_key->rounds
   1652 
        # NOTE(review): two in-line remarks in the text below look stale.
        # The line masking %r11 says e = ($sbox+2048)&0xfff although %r11 is
        # loaded from 2304($sbox), and "if (p=>e) %rsp =- (p-e)" presumably
        # means "if (p>=e) %rsp -= (p-e)".
   1653 $code.=<<___;
   1654 .globl	AES_cbc_encrypt
   1655 .type	AES_cbc_encrypt,\@function,6
   1656 .align	16
   1657 .extern	OPENSSL_ia32cap_P
   1658 .globl	asm_AES_cbc_encrypt
   1659 .hidden	asm_AES_cbc_encrypt
   1660 asm_AES_cbc_encrypt:
   1661 AES_cbc_encrypt:
   1662 	cmp	\$0,%rdx	# check length
   1663 	je	.Lcbc_epilogue
   1664 	pushfq
   1665 	push	%rbx
   1666 	push	%rbp
   1667 	push	%r12
   1668 	push	%r13
   1669 	push	%r14
   1670 	push	%r15
   1671 .Lcbc_prologue:
   1672 
   1673 	cld
   1674 	mov	%r9d,%r9d	# clear upper half of enc
   1675 
   1676 	lea	.LAES_Te(%rip),$sbox
   1677 	cmp	\$0,%r9
   1678 	jne	.Lcbc_picked_te
   1679 	lea	.LAES_Td(%rip),$sbox
   1680 .Lcbc_picked_te:
   1681 
   1682 	mov	OPENSSL_ia32cap_P(%rip),%r10d
   1683 	cmp	\$$speed_limit,%rdx
   1684 	jb	.Lcbc_slow_prologue
   1685 	test	\$15,%rdx
   1686 	jnz	.Lcbc_slow_prologue
   1687 	bt	\$28,%r10d
   1688 	jc	.Lcbc_slow_prologue
   1689 
   1690 	# allocate aligned stack frame...
   1691 	lea	-88-248(%rsp),$key
   1692 	and	\$-64,$key
   1693 
   1694 	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
   1695 	mov	$sbox,%r10
   1696 	lea	2304($sbox),%r11
   1697 	mov	$key,%r12
   1698 	and	\$0xFFF,%r10	# s = $sbox&0xfff
   1699 	and	\$0xFFF,%r11	# e = ($sbox+2048)&0xfff
   1700 	and	\$0xFFF,%r12	# p = %rsp&0xfff
   1701 
   1702 	cmp	%r11,%r12	# if (p=>e) %rsp =- (p-e);
   1703 	jb	.Lcbc_te_break_out
   1704 	sub	%r11,%r12
   1705 	sub	%r12,$key
   1706 	jmp	.Lcbc_te_ok
   1707 .Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
   1708 	sub	%r10,%r12
   1709 	and	\$0xFFF,%r12
   1710 	add	\$320,%r12
   1711 	sub	%r12,$key
   1712 .align	4
   1713 .Lcbc_te_ok:
   1714 
   1715 	xchg	%rsp,$key
   1716 	#add	\$8,%rsp	# reserve for return address!
   1717 	mov	$key,$_rsp	# save %rsp
   1718 .Lcbc_fast_body:
   1719 	mov	%rdi,$_inp	# save copy of inp
   1720 	mov	%rsi,$_out	# save copy of out
   1721 	mov	%rdx,$_len	# save copy of len
   1722 	mov	%rcx,$_key	# save copy of key
   1723 	mov	%r8,$_ivp	# save copy of ivp
   1724 	movl	\$0,$mark	# copy of aes_key->rounds = 0;
   1725 	mov	%r8,%rbp	# rearrange input arguments
   1726 	mov	%r9,%rbx
   1727 	mov	%rsi,$out
   1728 	mov	%rdi,$inp
   1729 	mov	%rcx,$key
   1730 
   1731 	mov	240($key),%eax		# key->rounds
   1732 	# do we copy key schedule to stack?
   1733 	mov	$key,%r10
   1734 	sub	$sbox,%r10
   1735 	and	\$0xfff,%r10
   1736 	cmp	\$2304,%r10
   1737 	jb	.Lcbc_do_ecopy
   1738 	cmp	\$4096-248,%r10
   1739 	jb	.Lcbc_skip_ecopy
   1740 .align	4
   1741 .Lcbc_do_ecopy:
   1742 		mov	$key,%rsi
   1743 		lea	$aes_key,%rdi
   1744 		lea	$aes_key,$key
   1745 		mov	\$240/8,%ecx
   1746 		.long	0x90A548F3	# rep movsq
   1747 		mov	%eax,(%rdi)	# copy aes_key->rounds
   1748 .Lcbc_skip_ecopy:
   1749 	mov	$key,$keyp	# save key pointer
   1750 
   1751 	mov	\$18,%ecx
   1752 .align	4
   1753 .Lcbc_prefetch_te:
   1754 		mov	0($sbox),%r10
   1755 		mov	32($sbox),%r11
   1756 		mov	64($sbox),%r12
   1757 		mov	96($sbox),%r13
   1758 		lea	128($sbox),$sbox
   1759 		sub	\$1,%ecx
   1760 	jnz	.Lcbc_prefetch_te
   1761 	lea	-2304($sbox),$sbox
   1762 
   1763 	cmp	\$0,%rbx
   1764 	je	.LFAST_DECRYPT
   1765 
   1766 #----------------------------- ENCRYPT -----------------------------#
   1767 	mov	0(%rbp),$s0		# load iv
   1768 	mov	4(%rbp),$s1
   1769 	mov	8(%rbp),$s2
   1770 	mov	12(%rbp),$s3
   1771 
   1772 .align	4
   1773 .Lcbc_fast_enc_loop:
   1774 		xor	0($inp),$s0
   1775 		xor	4($inp),$s1
   1776 		xor	8($inp),$s2
   1777 		xor	12($inp),$s3
   1778 		mov	$keyp,$key	# restore key
   1779 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1780 
   1781 		call	_x86_64_AES_encrypt
   1782 
   1783 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1784 		mov	$_len,%r10
   1785 		mov	$s0,0($out)
   1786 		mov	$s1,4($out)
   1787 		mov	$s2,8($out)
   1788 		mov	$s3,12($out)
   1789 
   1790 		lea	16($inp),$inp
   1791 		lea	16($out),$out
   1792 		sub	\$16,%r10
   1793 		test	\$-16,%r10
   1794 		mov	%r10,$_len
   1795 	jnz	.Lcbc_fast_enc_loop
   1796 	mov	$_ivp,%rbp	# restore ivp
   1797 	mov	$s0,0(%rbp)	# save ivec
   1798 	mov	$s1,4(%rbp)
   1799 	mov	$s2,8(%rbp)
   1800 	mov	$s3,12(%rbp)
   1801 
   1802 	jmp	.Lcbc_fast_cleanup
   1803 
   1804 #----------------------------- DECRYPT -----------------------------#
   1805 .align	16
   1806 .LFAST_DECRYPT:
   1807 	cmp	$inp,$out
   1808 	je	.Lcbc_fast_dec_in_place
   1809 
   1810 	mov	%rbp,$ivec
   1811 .align	4
   1812 .Lcbc_fast_dec_loop:
   1813 		mov	0($inp),$s0	# read input
   1814 		mov	4($inp),$s1
   1815 		mov	8($inp),$s2
   1816 		mov	12($inp),$s3
   1817 		mov	$keyp,$key	# restore key
   1818 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1819 
   1820 		call	_x86_64_AES_decrypt
   1821 
   1822 		mov	$ivec,%rbp	# load ivp
   1823 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1824 		mov	$_len,%r10	# load len
   1825 		xor	0(%rbp),$s0	# xor iv
   1826 		xor	4(%rbp),$s1
   1827 		xor	8(%rbp),$s2
   1828 		xor	12(%rbp),$s3
   1829 		mov	$inp,%rbp	# current input, next iv
   1830 
   1831 		sub	\$16,%r10
   1832 		mov	%r10,$_len	# update len
   1833 		mov	%rbp,$ivec	# update ivp
   1834 
   1835 		mov	$s0,0($out)	# write output
   1836 		mov	$s1,4($out)
   1837 		mov	$s2,8($out)
   1838 		mov	$s3,12($out)
   1839 
   1840 		lea	16($inp),$inp
   1841 		lea	16($out),$out
   1842 	jnz	.Lcbc_fast_dec_loop
   1843 	mov	$_ivp,%r12		# load user ivp
   1844 	mov	0(%rbp),%r10		# load iv
   1845 	mov	8(%rbp),%r11
   1846 	mov	%r10,0(%r12)		# copy back to user
   1847 	mov	%r11,8(%r12)
   1848 	jmp	.Lcbc_fast_cleanup
   1849 
   1850 .align	16
   1851 .Lcbc_fast_dec_in_place:
   1852 	mov	0(%rbp),%r10		# copy iv to stack
   1853 	mov	8(%rbp),%r11
   1854 	mov	%r10,0+$ivec
   1855 	mov	%r11,8+$ivec
   1856 .align	4
   1857 .Lcbc_fast_dec_in_place_loop:
   1858 		mov	0($inp),$s0	# load input
   1859 		mov	4($inp),$s1
   1860 		mov	8($inp),$s2
   1861 		mov	12($inp),$s3
   1862 		mov	$keyp,$key	# restore key
   1863 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1864 
   1865 		call	_x86_64_AES_decrypt
   1866 
   1867 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1868 		mov	$_len,%r10
   1869 		xor	0+$ivec,$s0
   1870 		xor	4+$ivec,$s1
   1871 		xor	8+$ivec,$s2
   1872 		xor	12+$ivec,$s3
   1873 
   1874 		mov	0($inp),%r11	# load input
   1875 		mov	8($inp),%r12
   1876 		sub	\$16,%r10
   1877 		jz	.Lcbc_fast_dec_in_place_done
   1878 
   1879 		mov	%r11,0+$ivec	# copy input to iv
   1880 		mov	%r12,8+$ivec
   1881 
   1882 		mov	$s0,0($out)	# save output [zaps input]
   1883 		mov	$s1,4($out)
   1884 		mov	$s2,8($out)
   1885 		mov	$s3,12($out)
   1886 
   1887 		lea	16($inp),$inp
   1888 		lea	16($out),$out
   1889 		mov	%r10,$_len
   1890 	jmp	.Lcbc_fast_dec_in_place_loop
   1891 .Lcbc_fast_dec_in_place_done:
   1892 	mov	$_ivp,%rdi
   1893 	mov	%r11,0(%rdi)	# copy iv back to user
   1894 	mov	%r12,8(%rdi)
   1895 
   1896 	mov	$s0,0($out)	# save output [zaps input]
   1897 	mov	$s1,4($out)
   1898 	mov	$s2,8($out)
   1899 	mov	$s3,12($out)
   1900 
   1901 .align	4
   1902 .Lcbc_fast_cleanup:
   1903 	cmpl	\$0,$mark	# was the key schedule copied?
   1904 	lea	$aes_key,%rdi
   1905 	je	.Lcbc_exit
   1906 		mov	\$240/8,%ecx
   1907 		xor	%rax,%rax
   1908 		.long	0x90AB48F3	# rep stosq
   1909 
   1910 	jmp	.Lcbc_exit
   1911 
   1912 #--------------------------- SLOW ROUTINE ---------------------------#
   1913 .align	16
   1914 .Lcbc_slow_prologue:
   1915 	# allocate aligned stack frame...
   1916 	lea	-88(%rsp),%rbp
   1917 	and	\$-64,%rbp
   1918 	# ... just "above" key schedule
   1919 	lea	-88-63(%rcx),%r10
   1920 	sub	%rbp,%r10
   1921 	neg	%r10
   1922 	and	\$0x3c0,%r10
   1923 	sub	%r10,%rbp
   1924 
   1925 	xchg	%rsp,%rbp
   1926 	#add	\$8,%rsp	# reserve for return address!
   1927 	mov	%rbp,$_rsp	# save %rsp
   1928 .Lcbc_slow_body:
   1929 	#mov	%rdi,$_inp	# save copy of inp
   1930 	#mov	%rsi,$_out	# save copy of out
   1931 	#mov	%rdx,$_len	# save copy of len
   1932 	#mov	%rcx,$_key	# save copy of key
   1933 	mov	%r8,$_ivp	# save copy of ivp
   1934 	mov	%r8,%rbp	# rearrange input arguments
   1935 	mov	%r9,%rbx
   1936 	mov	%rsi,$out
   1937 	mov	%rdi,$inp
   1938 	mov	%rcx,$key
   1939 	mov	%rdx,%r10
   1940 
   1941 	mov	240($key),%eax
   1942 	mov	$key,$keyp	# save key pointer
   1943 	shl	\$4,%eax
   1944 	lea	($key,%rax),%rax
   1945 	mov	%rax,$keyend
   1946 
   1947 	# pick Te4 copy which can't "overlap" with stack frame or key scdedule
   1948 	lea	2048($sbox),$sbox
   1949 	lea	768-8(%rsp),%rax
   1950 	sub	$sbox,%rax
   1951 	and	\$0x300,%rax
   1952 	lea	($sbox,%rax),$sbox
   1953 
   1954 	cmp	\$0,%rbx
   1955 	je	.LSLOW_DECRYPT
   1956 
   1957 #--------------------------- SLOW ENCRYPT ---------------------------#
   1958 	test	\$-16,%r10		# check upon length
   1959 	mov	0(%rbp),$s0		# load iv
   1960 	mov	4(%rbp),$s1
   1961 	mov	8(%rbp),$s2
   1962 	mov	12(%rbp),$s3
   1963 	jz	.Lcbc_slow_enc_tail	# short input...
   1964 
   1965 .align	4
   1966 .Lcbc_slow_enc_loop:
   1967 		xor	0($inp),$s0
   1968 		xor	4($inp),$s1
   1969 		xor	8($inp),$s2
   1970 		xor	12($inp),$s3
   1971 		mov	$keyp,$key	# restore key
   1972 		mov	$inp,$_inp	# save inp
   1973 		mov	$out,$_out	# save out
   1974 		mov	%r10,$_len	# save len
   1975 
   1976 		call	_x86_64_AES_encrypt_compact
   1977 
   1978 		mov	$_inp,$inp	# restore inp
   1979 		mov	$_out,$out	# restore out
   1980 		mov	$_len,%r10	# restore len
   1981 		mov	$s0,0($out)
   1982 		mov	$s1,4($out)
   1983 		mov	$s2,8($out)
   1984 		mov	$s3,12($out)
   1985 
   1986 		lea	16($inp),$inp
   1987 		lea	16($out),$out
   1988 		sub	\$16,%r10
   1989 		test	\$-16,%r10
   1990 	jnz	.Lcbc_slow_enc_loop
   1991 	test	\$15,%r10
   1992 	jnz	.Lcbc_slow_enc_tail
   1993 	mov	$_ivp,%rbp	# restore ivp
   1994 	mov	$s0,0(%rbp)	# save ivec
   1995 	mov	$s1,4(%rbp)
   1996 	mov	$s2,8(%rbp)
   1997 	mov	$s3,12(%rbp)
   1998 
   1999 	jmp	.Lcbc_exit
   2000 
   2001 .align	4
   2002 .Lcbc_slow_enc_tail:
   2003 	mov	%rax,%r11
   2004 	mov	%rcx,%r12
   2005 	mov	%r10,%rcx
   2006 	mov	$inp,%rsi
   2007 	mov	$out,%rdi
   2008 	.long	0x9066A4F3		# rep movsb
   2009 	mov	\$16,%rcx		# zero tail
   2010 	sub	%r10,%rcx
   2011 	xor	%rax,%rax
   2012 	.long	0x9066AAF3		# rep stosb
   2013 	mov	$out,$inp		# this is not a mistake!
   2014 	mov	\$16,%r10		# len=16
   2015 	mov	%r11,%rax
   2016 	mov	%r12,%rcx
   2017 	jmp	.Lcbc_slow_enc_loop	# one more spin...
   2018 #--------------------------- SLOW DECRYPT ---------------------------#
   2019 .align	16
   2020 .LSLOW_DECRYPT:
   2021 	shr	\$3,%rax
   2022 	add	%rax,$sbox		# recall "magic" constants!
   2023 
   2024 	mov	0(%rbp),%r11		# copy iv to stack
   2025 	mov	8(%rbp),%r12
   2026 	mov	%r11,0+$ivec
   2027 	mov	%r12,8+$ivec
   2028 
   2029 .align	4
   2030 .Lcbc_slow_dec_loop:
   2031 		mov	0($inp),$s0	# load input
   2032 		mov	4($inp),$s1
   2033 		mov	8($inp),$s2
   2034 		mov	12($inp),$s3
   2035 		mov	$keyp,$key	# restore key
   2036 		mov	$inp,$_inp	# save inp
   2037 		mov	$out,$_out	# save out
   2038 		mov	%r10,$_len	# save len
   2039 
   2040 		call	_x86_64_AES_decrypt_compact
   2041 
   2042 		mov	$_inp,$inp	# restore inp
   2043 		mov	$_out,$out	# restore out
   2044 		mov	$_len,%r10
   2045 		xor	0+$ivec,$s0
   2046 		xor	4+$ivec,$s1
   2047 		xor	8+$ivec,$s2
   2048 		xor	12+$ivec,$s3
   2049 
   2050 		mov	0($inp),%r11	# load input
   2051 		mov	8($inp),%r12
   2052 		sub	\$16,%r10
   2053 		jc	.Lcbc_slow_dec_partial
   2054 		jz	.Lcbc_slow_dec_done
   2055 
   2056 		mov	%r11,0+$ivec	# copy input to iv
   2057 		mov	%r12,8+$ivec
   2058 
   2059 		mov	$s0,0($out)	# save output [can zap input]
   2060 		mov	$s1,4($out)
   2061 		mov	$s2,8($out)
   2062 		mov	$s3,12($out)
   2063 
   2064 		lea	16($inp),$inp
   2065 		lea	16($out),$out
   2066 	jmp	.Lcbc_slow_dec_loop
   2067 .Lcbc_slow_dec_done:
   2068 	mov	$_ivp,%rdi
   2069 	mov	%r11,0(%rdi)		# copy iv back to user
   2070 	mov	%r12,8(%rdi)
   2071 
   2072 	mov	$s0,0($out)		# save output [can zap input]
   2073 	mov	$s1,4($out)
   2074 	mov	$s2,8($out)
   2075 	mov	$s3,12($out)
   2076 
   2077 	jmp	.Lcbc_exit
   2078 
   2079 .align	4
   2080 .Lcbc_slow_dec_partial:
   2081 	mov	$_ivp,%rdi
   2082 	mov	%r11,0(%rdi)		# copy iv back to user
   2083 	mov	%r12,8(%rdi)
   2084 
   2085 	mov	$s0,0+$ivec		# save output to stack
   2086 	mov	$s1,4+$ivec
   2087 	mov	$s2,8+$ivec
   2088 	mov	$s3,12+$ivec
   2089 
   2090 	mov	$out,%rdi
   2091 	lea	$ivec,%rsi
   2092 	lea	16(%r10),%rcx
   2093 	.long	0x9066A4F3	# rep movsb
   2094 	jmp	.Lcbc_exit
   2095 
   2096 .align	16
   2097 .Lcbc_exit:
   2098 	mov	$_rsp,%rsi
   2099 	mov	(%rsi),%r15
   2100 	mov	8(%rsi),%r14
   2101 	mov	16(%rsi),%r13
   2102 	mov	24(%rsi),%r12
   2103 	mov	32(%rsi),%rbp
   2104 	mov	40(%rsi),%rbx
   2105 	lea	48(%rsi),%rsp
   2106 .Lcbc_popfq:
   2107 	popfq
   2108 .Lcbc_epilogue:
   2109 	ret
   2110 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
   2111 ___
   2112 }
   2113 
   2114 $code.=<<___;
   2115 .align	64
   2116 .LAES_Te:
   2117 ___
   2118 	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
   2119 	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
   2120 	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
   2121 	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
   2122 	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
   2123 	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
   2124 	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
   2125 	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
   2126 	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
   2127 	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
   2128 	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
   2129 	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
   2130 	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
   2131 	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
   2132 	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
   2133 	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
   2134 	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
   2135 	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
   2136 	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
   2137 	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
   2138 	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
   2139 	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
   2140 	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
   2141 	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
   2142 	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
   2143 	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
   2144 	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
   2145 	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
   2146 	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
   2147 	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
   2148 	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
   2149 	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
   2150 	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
   2151 	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
   2152 	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
   2153 	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
   2154 	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
   2155 	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
   2156 	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
   2157 	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
   2158 	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
   2159 	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
   2160 	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
   2161 	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
   2162 	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
   2163 	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
   2164 	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
   2165 	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
   2166 	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
   2167 	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
   2168 	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
   2169 	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
   2170 	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
   2171 	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
   2172 	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
   2173 	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
   2174 	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
   2175 	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
   2176 	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
   2177 	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
   2178 	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
   2179 	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
   2180 	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
   2181 	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
   2182 
   2183 #Te4	# four copies of Te4 to choose from to avoid L1 aliasing
   2184 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2185 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2186 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2187 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2188 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2189 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2190 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2191 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2192 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2193 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2194 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2195 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2196 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2197 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2198 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2199 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2200 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2201 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2202 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2203 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2204 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2205 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2206 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2207 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2208 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2209 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2210 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2211 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2212 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2213 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2214 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2215 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2216 
   2217 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2218 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2219 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2220 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2221 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2222 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2223 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2224 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2225 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2226 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2227 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2228 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2229 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2230 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2231 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2232 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2233 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2234 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2235 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2236 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2237 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2238 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2239 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2240 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2241 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2242 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2243 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2244 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2245 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2246 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2247 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2248 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2249 
   2250 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2251 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2252 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2253 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2254 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2255 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2256 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2257 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2258 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2259 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2260 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2261 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2262 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2263 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2264 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2265 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2266 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2267 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2268 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2269 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2270 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2271 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2272 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2273 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2274 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2275 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2276 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2277 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2278 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2279 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2280 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2281 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2282 
   2283 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2284 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2285 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2286 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2287 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2288 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2289 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2290 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2291 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2292 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2293 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2294 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2295 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2296 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2297 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2298 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2299 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2300 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2301 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2302 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2303 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2304 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2305 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2306 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2307 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2308 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2309 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2310 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2311 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2312 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2313 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2314 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2315 #rcon:
   2316 $code.=<<___;
   2317 	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
   2318 	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
   2319 	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
   2320 	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
   2321 ___
   2322 $code.=<<___;
   2323 .align	64
   2324 .LAES_Td:
   2325 ___
        # Contents of .LAES_Td: 64 calls x 4 arguments = 256 32-bit constants
        # handed to _data_word(), which is defined outside this section.
        # NOTE(review): by its name this is presumably the AES decryption
        # ("Td") lookup table, stored in the little-endian byte order that the
        # file header describes -- confirm against the code that indexes
        # .LAES_Td.  Treat the values as opaque constants: re-verify them
        # against a trusted reference after any hand edit.
   2326 	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
   2327 	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
   2328 	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
   2329 	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
   2330 	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
   2331 	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
   2332 	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
   2333 	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
   2334 	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
   2335 	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
   2336 	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
   2337 	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
   2338 	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
   2339 	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
   2340 	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
   2341 	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
   2342 	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
   2343 	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
   2344 	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
   2345 	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
   2346 	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
   2347 	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
   2348 	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
   2349 	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
   2350 	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
   2351 	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
   2352 	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
   2353 	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
   2354 	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
   2355 	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
   2356 	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
   2357 	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
   2358 	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
   2359 	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
   2360 	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
   2361 	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
   2362 	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
   2363 	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
   2364 	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
   2365 	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
   2366 	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
   2367 	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
   2368 	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
   2369 	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
   2370 	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
   2371 	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
   2372 	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
   2373 	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
   2374 	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
   2375 	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
   2376 	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
   2377 	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
   2378 	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
   2379 	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
   2380 	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
   2381 	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
   2382 	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
   2383 	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
   2384 	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
   2385 	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
   2386 	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
   2387 	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
   2388 	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
   2389 	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
   2390 
   2391 #Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
   2392 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2393 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2394 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2395 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2396 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2397 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2398 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2399 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2400 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2401 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2402 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2403 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2404 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2405 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2406 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2407 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2408 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2409 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2410 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2411 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2412 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2413 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2414 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2415 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2416 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2417 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2418 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2419 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2420 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2421 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2422 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2423 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2424 $code.=<<___;
   2425 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2426 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2427 ___
   2428 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2429 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2430 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2431 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2432 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2433 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2434 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2435 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2436 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2437 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2438 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2439 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2440 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2441 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2442 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2443 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2444 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2445 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2446 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2447 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2448 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2449 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2450 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2451 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2452 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2453 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2454 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2455 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2456 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2457 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2458 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2459 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2460 $code.=<<___;
   2461 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2462 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2463 ___
   2464 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2465 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2466 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2467 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2468 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2469 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2470 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2471 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2472 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2473 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2474 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2475 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2476 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2477 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2478 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2479 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2480 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2481 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2482 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2483 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2484 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2485 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2486 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2487 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2488 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2489 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2490 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2491 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2492 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2493 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2494 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2495 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2496 $code.=<<___;
   2497 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2498 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2499 ___
   2500 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2501 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2502 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2503 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2504 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2505 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2506 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2507 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2508 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2509 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2510 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2511 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2512 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2513 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2514 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2515 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2516 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2517 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2518 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2519 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2520 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2521 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2522 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2523 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2524 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2525 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2526 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2527 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2528 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2529 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2530 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2531 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2532 $code.=<<___;
   2533 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2534 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2535 .asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   2536 .align	64
   2537 ___
   2538 
   2539 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2540 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
        #
        # Win64-only tail of the module; skipped unless $win64 was set near
        # the top of the script.  The single heredoc below appends:
        #   * block_se_handler, key_se_handler and cbc_se_handler, written to
        #     the prototype above.  Each one stores recovered register values
        #     into *context and then runs the shared .Lcommon_seh_exit code,
        #     which physically lives inside cbc_se_handler (copy the context
        #     to disp->ContextRecord, call RtlVirtualUnwind, return 1 =
        #     ExceptionContinueSearch);
        #   * .pdata entries (begin/end/info RVAs) for AES_encrypt,
        #     AES_decrypt, private_AES_set_encrypt_key,
        #     private_AES_set_decrypt_key and AES_cbc_encrypt;
        #   * .xdata records naming the handler for each of those functions,
        #     plus the prologue/epilogue labels that block_se_handler and
        #     key_se_handler read back as HandlerData[0] and HandlerData[1].
        # Backtick-quoted arithmetic inside the heredoc (16-8, 1232/8) is left
        # as-is here; the substitution applied to $code after this block
        # evaluates it.
   2541 if ($win64) {
        # Registers carrying the four handler arguments, in prototype order.
        # Only $context and $disp are interpolated into the heredoc; $rec and
        # $frame are assigned but not referenced in it.
   2542 $rec="%rcx";
   2543 $frame="%rdx";
   2544 $context="%r8";
   2545 $disp="%r9";
   2546 
   2547 $code.=<<___;
   2548 .extern	__imp_RtlVirtualUnwind
   2549 .type	block_se_handler,\@abi-omnipotent
   2550 .align	16
   2551 block_se_handler:
   2552 	push	%rsi
   2553 	push	%rdi
   2554 	push	%rbx
   2555 	push	%rbp
   2556 	push	%r12
   2557 	push	%r13
   2558 	push	%r14
   2559 	push	%r15
   2560 	pushfq
   2561 	sub	\$64,%rsp
   2562 
   2563 	mov	120($context),%rax	# pull context->Rax
   2564 	mov	248($context),%rbx	# pull context->Rip
   2565 
   2566 	mov	8($disp),%rsi		# disp->ImageBase
   2567 	mov	56($disp),%r11		# disp->HandlerData
   2568 
   2569 	mov	0(%r11),%r10d		# HandlerData[0]
   2570 	lea	(%rsi,%r10),%r10	# prologue label
   2571 	cmp	%r10,%rbx		# context->Rip<prologue label
   2572 	jb	.Lin_block_prologue
   2573 
   2574 	mov	152($context),%rax	# pull context->Rsp
   2575 
   2576 	mov	4(%r11),%r10d		# HandlerData[1]
   2577 	lea	(%rsi,%r10),%r10	# epilogue label
   2578 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2579 	jae	.Lin_block_prologue
   2580 
   2581 	mov	24(%rax),%rax		# pull saved real stack pointer
   2582 	lea	48(%rax),%rax		# adjust...
   2583 
   2584 	mov	-8(%rax),%rbx
   2585 	mov	-16(%rax),%rbp
   2586 	mov	-24(%rax),%r12
   2587 	mov	-32(%rax),%r13
   2588 	mov	-40(%rax),%r14
   2589 	mov	-48(%rax),%r15
   2590 	mov	%rbx,144($context)	# restore context->Rbx
   2591 	mov	%rbp,160($context)	# restore context->Rbp
   2592 	mov	%r12,216($context)	# restore context->R12
   2593 	mov	%r13,224($context)	# restore context->R13
   2594 	mov	%r14,232($context)	# restore context->R14
   2595 	mov	%r15,240($context)	# restore context->R15
   2596 
   2597 .Lin_block_prologue:
   2598 	mov	8(%rax),%rdi
   2599 	mov	16(%rax),%rsi
   2600 	mov	%rax,152($context)	# restore context->Rsp
   2601 	mov	%rsi,168($context)	# restore context->Rsi
   2602 	mov	%rdi,176($context)	# restore context->Rdi
   2603 
   2604 	jmp	.Lcommon_seh_exit
   2605 .size	block_se_handler,.-block_se_handler
   2606 
   2607 .type	key_se_handler,\@abi-omnipotent
   2608 .align	16
   2609 key_se_handler:
   2610 	push	%rsi
   2611 	push	%rdi
   2612 	push	%rbx
   2613 	push	%rbp
   2614 	push	%r12
   2615 	push	%r13
   2616 	push	%r14
   2617 	push	%r15
   2618 	pushfq
   2619 	sub	\$64,%rsp
   2620 
   2621 	mov	120($context),%rax	# pull context->Rax
   2622 	mov	248($context),%rbx	# pull context->Rip
   2623 
   2624 	mov	8($disp),%rsi		# disp->ImageBase
   2625 	mov	56($disp),%r11		# disp->HandlerData
   2626 
   2627 	mov	0(%r11),%r10d		# HandlerData[0]
   2628 	lea	(%rsi,%r10),%r10	# prologue label
   2629 	cmp	%r10,%rbx		# context->Rip<prologue label
   2630 	jb	.Lin_key_prologue
   2631 
   2632 	mov	152($context),%rax	# pull context->Rsp
   2633 
   2634 	mov	4(%r11),%r10d		# HandlerData[1]
   2635 	lea	(%rsi,%r10),%r10	# epilogue label
   2636 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2637 	jae	.Lin_key_prologue
   2638 
   2639 	lea	56(%rax),%rax
   2640 
   2641 	mov	-8(%rax),%rbx
   2642 	mov	-16(%rax),%rbp
   2643 	mov	-24(%rax),%r12
   2644 	mov	-32(%rax),%r13
   2645 	mov	-40(%rax),%r14
   2646 	mov	-48(%rax),%r15
   2647 	mov	%rbx,144($context)	# restore context->Rbx
   2648 	mov	%rbp,160($context)	# restore context->Rbp
   2649 	mov	%r12,216($context)	# restore context->R12
   2650 	mov	%r13,224($context)	# restore context->R13
   2651 	mov	%r14,232($context)	# restore context->R14
   2652 	mov	%r15,240($context)	# restore context->R15
   2653 
   2654 .Lin_key_prologue:
   2655 	mov	8(%rax),%rdi
   2656 	mov	16(%rax),%rsi
   2657 	mov	%rax,152($context)	# restore context->Rsp
   2658 	mov	%rsi,168($context)	# restore context->Rsi
   2659 	mov	%rdi,176($context)	# restore context->Rdi
   2660 
   2661 	jmp	.Lcommon_seh_exit
   2662 .size	key_se_handler,.-key_se_handler
   2663 
   2664 .type	cbc_se_handler,\@abi-omnipotent
   2665 .align	16
   2666 cbc_se_handler:
   2667 	push	%rsi
   2668 	push	%rdi
   2669 	push	%rbx
   2670 	push	%rbp
   2671 	push	%r12
   2672 	push	%r13
   2673 	push	%r14
   2674 	push	%r15
   2675 	pushfq
   2676 	sub	\$64,%rsp
   2677 
   2678 	mov	120($context),%rax	# pull context->Rax
   2679 	mov	248($context),%rbx	# pull context->Rip
   2680 
   2681 	lea	.Lcbc_prologue(%rip),%r10
   2682 	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
   2683 	jb	.Lin_cbc_prologue
   2684 
   2685 	lea	.Lcbc_fast_body(%rip),%r10
   2686 	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
   2687 	jb	.Lin_cbc_frame_setup
   2688 
   2689 	lea	.Lcbc_slow_prologue(%rip),%r10
   2690 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
   2691 	jb	.Lin_cbc_body
   2692 
   2693 	lea	.Lcbc_slow_body(%rip),%r10
   2694 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
   2695 	jb	.Lin_cbc_frame_setup
   2696 
   2697 .Lin_cbc_body:
   2698 	mov	152($context),%rax	# pull context->Rsp
   2699 
   2700 	lea	.Lcbc_epilogue(%rip),%r10
   2701 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
   2702 	jae	.Lin_cbc_prologue
   2703 
   2704 	lea	8(%rax),%rax
   2705 
   2706 	lea	.Lcbc_popfq(%rip),%r10
   2707 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
   2708 	jae	.Lin_cbc_prologue
   2709 
   2710 	mov	`16-8`(%rax),%rax	# biased $_rsp
   2711 	lea	56(%rax),%rax
   2712 
   2713 .Lin_cbc_frame_setup:
   2714 	mov	-16(%rax),%rbx
   2715 	mov	-24(%rax),%rbp
   2716 	mov	-32(%rax),%r12
   2717 	mov	-40(%rax),%r13
   2718 	mov	-48(%rax),%r14
   2719 	mov	-56(%rax),%r15
   2720 	mov	%rbx,144($context)	# restore context->Rbx
   2721 	mov	%rbp,160($context)	# restore context->Rbp
   2722 	mov	%r12,216($context)	# restore context->R12
   2723 	mov	%r13,224($context)	# restore context->R13
   2724 	mov	%r14,232($context)	# restore context->R14
   2725 	mov	%r15,240($context)	# restore context->R15
   2726 
   2727 .Lin_cbc_prologue:
   2728 	mov	8(%rax),%rdi
   2729 	mov	16(%rax),%rsi
   2730 	mov	%rax,152($context)	# restore context->Rsp
   2731 	mov	%rsi,168($context)	# restore context->Rsi
   2732 	mov	%rdi,176($context)	# restore context->Rdi
   2733 
   2734 .Lcommon_seh_exit:
   2735 
   2736 	mov	40($disp),%rdi		# disp->ContextRecord
   2737 	mov	$context,%rsi		# context
   2738 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
   2739 	.long	0xa548f3fc		# cld; rep movsq
   2740 
   2741 	mov	$disp,%rsi
   2742 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2743 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2744 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2745 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2746 	mov	40(%rsi),%r10		# disp->ContextRecord
   2747 	lea	56(%rsi),%r11		# &disp->HandlerData
   2748 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2749 	mov	%r10,32(%rsp)		# arg5
   2750 	mov	%r11,40(%rsp)		# arg6
   2751 	mov	%r12,48(%rsp)		# arg7
   2752 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2753 	call	*__imp_RtlVirtualUnwind(%rip)
   2754 
   2755 	mov	\$1,%eax		# ExceptionContinueSearch
   2756 	add	\$64,%rsp
   2757 	popfq
   2758 	pop	%r15
   2759 	pop	%r14
   2760 	pop	%r13
   2761 	pop	%r12
   2762 	pop	%rbp
   2763 	pop	%rbx
   2764 	pop	%rdi
   2765 	pop	%rsi
   2766 	ret
   2767 .size	cbc_se_handler,.-cbc_se_handler
   2768 
   2769 .section	.pdata
   2770 .align	4
   2771 	.rva	.LSEH_begin_AES_encrypt
   2772 	.rva	.LSEH_end_AES_encrypt
   2773 	.rva	.LSEH_info_AES_encrypt
   2774 
   2775 	.rva	.LSEH_begin_AES_decrypt
   2776 	.rva	.LSEH_end_AES_decrypt
   2777 	.rva	.LSEH_info_AES_decrypt
   2778 
   2779 	.rva	.LSEH_begin_private_AES_set_encrypt_key
   2780 	.rva	.LSEH_end_private_AES_set_encrypt_key
   2781 	.rva	.LSEH_info_private_AES_set_encrypt_key
   2782 
   2783 	.rva	.LSEH_begin_private_AES_set_decrypt_key
   2784 	.rva	.LSEH_end_private_AES_set_decrypt_key
   2785 	.rva	.LSEH_info_private_AES_set_decrypt_key
   2786 
   2787 	.rva	.LSEH_begin_AES_cbc_encrypt
   2788 	.rva	.LSEH_end_AES_cbc_encrypt
   2789 	.rva	.LSEH_info_AES_cbc_encrypt
   2790 
   2791 .section	.xdata
   2792 .align	8
   2793 .LSEH_info_AES_encrypt:
   2794 	.byte	9,0,0,0
   2795 	.rva	block_se_handler
   2796 	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
   2797 .LSEH_info_AES_decrypt:
   2798 	.byte	9,0,0,0
   2799 	.rva	block_se_handler
   2800 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
   2801 .LSEH_info_private_AES_set_encrypt_key:
   2802 	.byte	9,0,0,0
   2803 	.rva	key_se_handler
   2804 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
   2805 .LSEH_info_private_AES_set_decrypt_key:
   2806 	.byte	9,0,0,0
   2807 	.rva	key_se_handler
   2808 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
   2809 .LSEH_info_AES_cbc_encrypt:
   2810 	.byte	9,0,0,0
   2811 	.rva	cbc_se_handler
   2812 ___
   2813 }
   2814 
   2815 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   2816 
   2817 print $code;
   2818 
   2819 close STDOUT;
   2820