      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
       4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # Version 2.1.
     11 #
      12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
      13 # Opteron 240 CPU], plus all the bells and whistles from the 32-bit version
      14 # [you'll notice a lot of resemblance], such as compressed S-boxes
      15 # in little-endian byte order, prefetch of these tables in CBC mode,
      16 # and avoidance of L1 cache aliasing between the stack frame, the key
      17 # schedule and the already mentioned tables, compressed Td4...
     18 #
     19 # Performance in number of cycles per processed byte for 128-bit key:
     20 #
     21 #		ECB encrypt	ECB decrypt	CBC large chunk
     22 # AMD64		33		41		13.0
     23 # EM64T		38		59		18.6(*)
     24 # Core 2	30		43		14.5(*)
     25 #
     26 # (*) with hyper-threading off
     27 
     28 $flavour = shift;
     29 $output  = shift;
     30 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     31 
     32 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     33 
     34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     37 die "can't locate x86_64-xlate.pl";
     38 
     39 open STDOUT,"| $^X $xlate $flavour $output";
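# (Typical invocation, as with the other perlasm modules, is assumed to be
#  "perl aes-x86_64.pl <flavour> [output-file]", e.g. "perl aes-x86_64.pl elf",
#  with the generated assembly passed through the x86_64-xlate.pl translator
#  located above.)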
     40 
      41 $verticalspin=1;	# unlike the 32-bit version, $verticalspin performs
      42 			# ~15% better on both AMD and Intel cores
     43 $speed_limit=512;	# see aes-586.pl for details
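			# ($speed_limit is the CBC byte-length threshold below
			# which AES_cbc_encrypt falls back to the slower,
			# compact-table code path.)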
     44 
     45 $code=".text\n";
     46 
     47 $s0="%eax";
     48 $s1="%ebx";
     49 $s2="%ecx";
     50 $s3="%edx";
     51 $acc0="%esi";	$mask80="%rsi";
     52 $acc1="%edi";	$maskfe="%rdi";
     53 $acc2="%ebp";	$mask1b="%rbp";
     54 $inp="%r8";
     55 $out="%r9";
     56 $t0="%r10d";
     57 $t1="%r11d";
     58 $t2="%r12d";
     59 $rnds="%r13d";
     60 $sbox="%r14";
     61 $key="%r15";
     62 
     63 sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
     64 sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
     65 			$r =~ s/%[er]([sd]i)/%\1l/;
     66 			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
     67 sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
     68 			$r =~ s/%r([0-9]+)/%r\1d/;	$r; }
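# e.g. &hi("%eax")->"%ah", &lo("%eax")->"%al", &lo("%esi")->"%sil",
#      &lo("%r10d")->"%r10b", &LO("%rax")->"%eax", &LO("%r10")->"%r10d"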
     69 sub _data_word()
     70 { my $i;
     71     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
     72 }
     73 sub data_word()
     74 { my $i;
     75   my $last=pop(@_);
     76     $code.=".long\t";
     77     while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
     78     $code.=sprintf"0x%08x\n",$last;
     79 }
     80 
     81 sub data_byte()
     82 { my $i;
     83   my $last=pop(@_);
     84     $code.=".byte\t";
     85     while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
     86     $code.=sprintf"0x%02x\n",$last&0xff;
     87 }
     88 
     89 sub encvert()
     90 { my $t3="%r8d";	# zaps $inp!
     91 
     92 $code.=<<___;
     93 	# favor 3-way issue Opteron pipeline...
     94 	movzb	`&lo("$s0")`,$acc0
     95 	movzb	`&lo("$s1")`,$acc1
     96 	movzb	`&lo("$s2")`,$acc2
     97 	mov	0($sbox,$acc0,8),$t0
     98 	mov	0($sbox,$acc1,8),$t1
     99 	mov	0($sbox,$acc2,8),$t2
    100 
    101 	movzb	`&hi("$s1")`,$acc0
    102 	movzb	`&hi("$s2")`,$acc1
    103 	movzb	`&lo("$s3")`,$acc2
    104 	xor	3($sbox,$acc0,8),$t0
    105 	xor	3($sbox,$acc1,8),$t1
    106 	mov	0($sbox,$acc2,8),$t3
    107 
    108 	movzb	`&hi("$s3")`,$acc0
    109 	shr	\$16,$s2
    110 	movzb	`&hi("$s0")`,$acc2
    111 	xor	3($sbox,$acc0,8),$t2
    112 	shr	\$16,$s3
    113 	xor	3($sbox,$acc2,8),$t3
    114 
    115 	shr	\$16,$s1
    116 	lea	16($key),$key
    117 	shr	\$16,$s0
    118 
    119 	movzb	`&lo("$s2")`,$acc0
    120 	movzb	`&lo("$s3")`,$acc1
    121 	movzb	`&lo("$s0")`,$acc2
    122 	xor	2($sbox,$acc0,8),$t0
    123 	xor	2($sbox,$acc1,8),$t1
    124 	xor	2($sbox,$acc2,8),$t2
    125 
    126 	movzb	`&hi("$s3")`,$acc0
    127 	movzb	`&hi("$s0")`,$acc1
    128 	movzb	`&lo("$s1")`,$acc2
    129 	xor	1($sbox,$acc0,8),$t0
    130 	xor	1($sbox,$acc1,8),$t1
    131 	xor	2($sbox,$acc2,8),$t3
    132 
    133 	mov	12($key),$s3
    134 	movzb	`&hi("$s1")`,$acc1
    135 	movzb	`&hi("$s2")`,$acc2
    136 	mov	0($key),$s0
    137 	xor	1($sbox,$acc1,8),$t2
    138 	xor	1($sbox,$acc2,8),$t3
    139 
    140 	mov	4($key),$s1
    141 	mov	8($key),$s2
    142 	xor	$t0,$s0
    143 	xor	$t1,$s1
    144 	xor	$t2,$s2
    145 	xor	$t3,$s3
    146 ___
    147 }
    148 
    149 sub enclastvert()
    150 { my $t3="%r8d";	# zaps $inp!
    151 
    152 $code.=<<___;
    153 	movzb	`&lo("$s0")`,$acc0
    154 	movzb	`&lo("$s1")`,$acc1
    155 	movzb	`&lo("$s2")`,$acc2
    156 	movzb	2($sbox,$acc0,8),$t0
    157 	movzb	2($sbox,$acc1,8),$t1
    158 	movzb	2($sbox,$acc2,8),$t2
    159 
    160 	movzb	`&lo("$s3")`,$acc0
    161 	movzb	`&hi("$s1")`,$acc1
    162 	movzb	`&hi("$s2")`,$acc2
    163 	movzb	2($sbox,$acc0,8),$t3
    164 	mov	0($sbox,$acc1,8),$acc1	#$t0
    165 	mov	0($sbox,$acc2,8),$acc2	#$t1
    166 
    167 	and	\$0x0000ff00,$acc1
    168 	and	\$0x0000ff00,$acc2
    169 
    170 	xor	$acc1,$t0
    171 	xor	$acc2,$t1
    172 	shr	\$16,$s2
    173 
    174 	movzb	`&hi("$s3")`,$acc0
    175 	movzb	`&hi("$s0")`,$acc1
    176 	shr	\$16,$s3
    177 	mov	0($sbox,$acc0,8),$acc0	#$t2
    178 	mov	0($sbox,$acc1,8),$acc1	#$t3
    179 
    180 	and	\$0x0000ff00,$acc0
    181 	and	\$0x0000ff00,$acc1
    182 	shr	\$16,$s1
    183 	xor	$acc0,$t2
    184 	xor	$acc1,$t3
    185 	shr	\$16,$s0
    186 
    187 	movzb	`&lo("$s2")`,$acc0
    188 	movzb	`&lo("$s3")`,$acc1
    189 	movzb	`&lo("$s0")`,$acc2
    190 	mov	0($sbox,$acc0,8),$acc0	#$t0
    191 	mov	0($sbox,$acc1,8),$acc1	#$t1
    192 	mov	0($sbox,$acc2,8),$acc2	#$t2
    193 
    194 	and	\$0x00ff0000,$acc0
    195 	and	\$0x00ff0000,$acc1
    196 	and	\$0x00ff0000,$acc2
    197 
    198 	xor	$acc0,$t0
    199 	xor	$acc1,$t1
    200 	xor	$acc2,$t2
    201 
    202 	movzb	`&lo("$s1")`,$acc0
    203 	movzb	`&hi("$s3")`,$acc1
    204 	movzb	`&hi("$s0")`,$acc2
    205 	mov	0($sbox,$acc0,8),$acc0	#$t3
    206 	mov	2($sbox,$acc1,8),$acc1	#$t0
    207 	mov	2($sbox,$acc2,8),$acc2	#$t1
    208 
    209 	and	\$0x00ff0000,$acc0
    210 	and	\$0xff000000,$acc1
    211 	and	\$0xff000000,$acc2
    212 
    213 	xor	$acc0,$t3
    214 	xor	$acc1,$t0
    215 	xor	$acc2,$t1
    216 
    217 	movzb	`&hi("$s1")`,$acc0
    218 	movzb	`&hi("$s2")`,$acc1
    219 	mov	16+12($key),$s3
    220 	mov	2($sbox,$acc0,8),$acc0	#$t2
    221 	mov	2($sbox,$acc1,8),$acc1	#$t3
    222 	mov	16+0($key),$s0
    223 
    224 	and	\$0xff000000,$acc0
    225 	and	\$0xff000000,$acc1
    226 
    227 	xor	$acc0,$t2
    228 	xor	$acc1,$t3
    229 
    230 	mov	16+4($key),$s1
    231 	mov	16+8($key),$s2
    232 	xor	$t0,$s0
    233 	xor	$t1,$s1
    234 	xor	$t2,$s2
    235 	xor	$t3,$s3
    236 ___
    237 }
    238 
    239 sub encstep()
    240 { my ($i,@s) = @_;
    241   my $tmp0=$acc0;
    242   my $tmp1=$acc1;
    243   my $tmp2=$acc2;
    244   my $out=($t0,$t1,$t2,$s[0])[$i];
    245 
    246 	if ($i==3) {
    247 		$tmp0=$s[1];
    248 		$tmp1=$s[2];
    249 		$tmp2=$s[3];
    250 	}
    251 	$code.="	movzb	".&lo($s[0]).",$out\n";
    252 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    253 	$code.="	lea	16($key),$key\n"	if ($i==0);
    254 
    255 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    256 	$code.="	mov	0($sbox,$out,8),$out\n";
    257 
    258 	$code.="	shr	\$16,$tmp1\n";
    259 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    260 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    261 
    262 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    263 	$code.="	shr	\$24,$tmp2\n";
    264 	$code.="	xor	4*$i($key),$out\n";
    265 
    266 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    267 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    268 
    269 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    270 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    271 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    272 	$code.="\n";
    273 }
    274 
    275 sub enclast()
    276 { my ($i,@s)=@_;
    277   my $tmp0=$acc0;
    278   my $tmp1=$acc1;
    279   my $tmp2=$acc2;
    280   my $out=($t0,$t1,$t2,$s[0])[$i];
    281 
    282 	if ($i==3) {
    283 		$tmp0=$s[1];
    284 		$tmp1=$s[2];
    285 		$tmp2=$s[3];
    286 	}
    287 	$code.="	movzb	".&lo($s[0]).",$out\n";
    288 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    289 
    290 	$code.="	mov	2($sbox,$out,8),$out\n";
    291 	$code.="	shr	\$16,$tmp1\n";
    292 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    293 
    294 	$code.="	and	\$0x000000ff,$out\n";
    295 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    296 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    297 	$code.="	shr	\$24,$tmp2\n";
    298 
    299 	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
    300 	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
    301 	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
    302 
    303 	$code.="	and	\$0x0000ff00,$tmp0\n";
    304 	$code.="	and	\$0x00ff0000,$tmp1\n";
    305 	$code.="	and	\$0xff000000,$tmp2\n";
    306 
    307 	$code.="	xor	$tmp0,$out\n";
    308 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    309 	$code.="	xor	$tmp1,$out\n";
    310 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    311 	$code.="	xor	$tmp2,$out\n";
    312 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    313 	$code.="\n";
    314 }
    315 
    316 $code.=<<___;
    317 .type	_x86_64_AES_encrypt,\@abi-omnipotent
    318 .align	16
    319 _x86_64_AES_encrypt:
    320 	xor	0($key),$s0			# xor with key
    321 	xor	4($key),$s1
    322 	xor	8($key),$s2
    323 	xor	12($key),$s3
    324 
    325 	mov	240($key),$rnds			# load key->rounds
    326 	sub	\$1,$rnds
    327 	jmp	.Lenc_loop
    328 .align	16
    329 .Lenc_loop:
    330 ___
    331 	if ($verticalspin) { &encvert(); }
    332 	else {	&encstep(0,$s0,$s1,$s2,$s3);
    333 		&encstep(1,$s1,$s2,$s3,$s0);
    334 		&encstep(2,$s2,$s3,$s0,$s1);
    335 		&encstep(3,$s3,$s0,$s1,$s2);
    336 	}
    337 $code.=<<___;
    338 	sub	\$1,$rnds
    339 	jnz	.Lenc_loop
    340 ___
    341 	if ($verticalspin) { &enclastvert(); }
    342 	else {	&enclast(0,$s0,$s1,$s2,$s3);
    343 		&enclast(1,$s1,$s2,$s3,$s0);
    344 		&enclast(2,$s2,$s3,$s0,$s1);
    345 		&enclast(3,$s3,$s0,$s1,$s2);
    346 		$code.=<<___;
    347 		xor	16+0($key),$s0		# xor with key
    348 		xor	16+4($key),$s1
    349 		xor	16+8($key),$s2
    350 		xor	16+12($key),$s3
    351 ___
    352 	}
    353 $code.=<<___;
    354 	.byte	0xf3,0xc3			# rep ret
    355 .size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
    356 ___
    357 
     358 # it's possible to implement this by shifting tN by 8, filling the least
     359 # significant byte with a byte load and finally bswap-ing at the end,
     360 # but such partial register loads kill Core 2...
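# (instead, whole bytes are fetched with movzb into full registers and the
#  result words are assembled with explicit shifts and xors below)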
    361 sub enccompactvert()
    362 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    363 
    364 $code.=<<___;
    365 	movzb	`&lo("$s0")`,$t0
    366 	movzb	`&lo("$s1")`,$t1
    367 	movzb	`&lo("$s2")`,$t2
    368 	movzb	($sbox,$t0,1),$t0
    369 	movzb	($sbox,$t1,1),$t1
    370 	movzb	($sbox,$t2,1),$t2
    371 
    372 	movzb	`&lo("$s3")`,$t3
    373 	movzb	`&hi("$s1")`,$acc0
    374 	movzb	`&hi("$s2")`,$acc1
    375 	movzb	($sbox,$t3,1),$t3
    376 	movzb	($sbox,$acc0,1),$t4	#$t0
    377 	movzb	($sbox,$acc1,1),$t5	#$t1
    378 
    379 	movzb	`&hi("$s3")`,$acc2
    380 	movzb	`&hi("$s0")`,$acc0
    381 	shr	\$16,$s2
    382 	movzb	($sbox,$acc2,1),$acc2	#$t2
    383 	movzb	($sbox,$acc0,1),$acc0	#$t3
    384 	shr	\$16,$s3
    385 
    386 	movzb	`&lo("$s2")`,$acc1
    387 	shl	\$8,$t4
    388 	shl	\$8,$t5
    389 	movzb	($sbox,$acc1,1),$acc1	#$t0
    390 	xor	$t4,$t0
    391 	xor	$t5,$t1
    392 
    393 	movzb	`&lo("$s3")`,$t4
    394 	shr	\$16,$s0
    395 	shr	\$16,$s1
    396 	movzb	`&lo("$s0")`,$t5
    397 	shl	\$8,$acc2
    398 	shl	\$8,$acc0
    399 	movzb	($sbox,$t4,1),$t4	#$t1
    400 	movzb	($sbox,$t5,1),$t5	#$t2
    401 	xor	$acc2,$t2
    402 	xor	$acc0,$t3
    403 
    404 	movzb	`&lo("$s1")`,$acc2
    405 	movzb	`&hi("$s3")`,$acc0
    406 	shl	\$16,$acc1
    407 	movzb	($sbox,$acc2,1),$acc2	#$t3
    408 	movzb	($sbox,$acc0,1),$acc0	#$t0
    409 	xor	$acc1,$t0
    410 
    411 	movzb	`&hi("$s0")`,$acc1
    412 	shr	\$8,$s2
    413 	shr	\$8,$s1
    414 	movzb	($sbox,$acc1,1),$acc1	#$t1
    415 	movzb	($sbox,$s2,1),$s3	#$t3
    416 	movzb	($sbox,$s1,1),$s2	#$t2
    417 	shl	\$16,$t4
    418 	shl	\$16,$t5
    419 	shl	\$16,$acc2
    420 	xor	$t4,$t1
    421 	xor	$t5,$t2
    422 	xor	$acc2,$t3
    423 
    424 	shl	\$24,$acc0
    425 	shl	\$24,$acc1
    426 	shl	\$24,$s3
    427 	xor	$acc0,$t0
    428 	shl	\$24,$s2
    429 	xor	$acc1,$t1
    430 	mov	$t0,$s0
    431 	mov	$t1,$s1
    432 	xor	$t2,$s2
    433 	xor	$t3,$s3
    434 ___
    435 }
    436 
    437 sub enctransform_ref()
    438 { my $sn = shift;
    439   my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
    440 
    441 $code.=<<___;
    442 	mov	$sn,$acc
    443 	and	\$0x80808080,$acc
    444 	mov	$acc,$tmp
    445 	shr	\$7,$tmp
    446 	lea	($sn,$sn),$r2
    447 	sub	$tmp,$acc
    448 	and	\$0xfefefefe,$r2
    449 	and	\$0x1b1b1b1b,$acc
    450 	mov	$sn,$tmp
    451 	xor	$acc,$r2
    452 
    453 	xor	$r2,$sn
    454 	rol	\$24,$sn
    455 	xor	$r2,$sn
    456 	ror	\$16,$tmp
    457 	xor	$tmp,$sn
    458 	ror	\$8,$tmp
    459 	xor	$tmp,$sn
    460 ___
    461 }
    462 
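# A minimal Perl sketch (illustrative only, never called by this generator,
# sub name made up) of the packed "xtime" trick enctransform below relies on:
# doubling four GF(2^8) bytes at once within a 32-bit word and reducing by
# 0x1b wherever the top bit of a byte was set.
sub _xtime4_sketch
{ my $s = shift;				# four state bytes packed in 32 bits
  my $msb = $s & 0x80808080;			# bytes that overflow on doubling
  my $red = ($msb - ($msb>>7)) & 0x1b1b1b1b;	# 0x1b in each such byte
    (($s<<1) & 0xfefefefe) ^ $red;
}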
     463 # unlike the decrypt case, it does not pay off to parallelize enctransform
    464 sub enctransform()
    465 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
    466 
    467 $code.=<<___;
    468 	mov	$s0,$acc0
    469 	mov	$s1,$acc1
    470 	and	\$0x80808080,$acc0
    471 	and	\$0x80808080,$acc1
    472 	mov	$acc0,$t0
    473 	mov	$acc1,$t1
    474 	shr	\$7,$t0
    475 	lea	($s0,$s0),$r20
    476 	shr	\$7,$t1
    477 	lea	($s1,$s1),$r21
    478 	sub	$t0,$acc0
    479 	sub	$t1,$acc1
    480 	and	\$0xfefefefe,$r20
    481 	and	\$0xfefefefe,$r21
    482 	and	\$0x1b1b1b1b,$acc0
    483 	and	\$0x1b1b1b1b,$acc1
    484 	mov	$s0,$t0
    485 	mov	$s1,$t1
    486 	xor	$acc0,$r20
    487 	xor	$acc1,$r21
    488 
    489 	xor	$r20,$s0
    490 	xor	$r21,$s1
    491 	 mov	$s2,$acc0
    492 	 mov	$s3,$acc1
    493 	rol	\$24,$s0
    494 	rol	\$24,$s1
    495 	 and	\$0x80808080,$acc0
    496 	 and	\$0x80808080,$acc1
    497 	xor	$r20,$s0
    498 	xor	$r21,$s1
    499 	 mov	$acc0,$t2
    500 	 mov	$acc1,$t3
    501 	ror	\$16,$t0
    502 	ror	\$16,$t1
    503 	 shr	\$7,$t2
    504 	 lea	($s2,$s2),$r20
    505 	xor	$t0,$s0
    506 	xor	$t1,$s1
    507 	 shr	\$7,$t3
    508 	 lea	($s3,$s3),$r21
    509 	ror	\$8,$t0
    510 	ror	\$8,$t1
    511 	 sub	$t2,$acc0
    512 	 sub	$t3,$acc1
    513 	xor	$t0,$s0
    514 	xor	$t1,$s1
    515 
    516 	and	\$0xfefefefe,$r20
    517 	and	\$0xfefefefe,$r21
    518 	and	\$0x1b1b1b1b,$acc0
    519 	and	\$0x1b1b1b1b,$acc1
    520 	mov	$s2,$t2
    521 	mov	$s3,$t3
    522 	xor	$acc0,$r20
    523 	xor	$acc1,$r21
    524 
    525 	xor	$r20,$s2
    526 	xor	$r21,$s3
    527 	rol	\$24,$s2
    528 	rol	\$24,$s3
    529 	xor	$r20,$s2
    530 	xor	$r21,$s3
    531 	mov	0($sbox),$acc0			# prefetch Te4
    532 	ror	\$16,$t2
    533 	ror	\$16,$t3
    534 	mov	64($sbox),$acc1
    535 	xor	$t2,$s2
    536 	xor	$t3,$s3
    537 	mov	128($sbox),$r20
    538 	ror	\$8,$t2
    539 	ror	\$8,$t3
    540 	mov	192($sbox),$r21
    541 	xor	$t2,$s2
    542 	xor	$t3,$s3
    543 ___
    544 }
    545 
    546 $code.=<<___;
    547 .type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
    548 .align	16
    549 _x86_64_AES_encrypt_compact:
    550 	lea	128($sbox),$inp			# size optimization
    551 	mov	0-128($inp),$acc1		# prefetch Te4
    552 	mov	32-128($inp),$acc2
    553 	mov	64-128($inp),$t0
    554 	mov	96-128($inp),$t1
    555 	mov	128-128($inp),$acc1
    556 	mov	160-128($inp),$acc2
    557 	mov	192-128($inp),$t0
    558 	mov	224-128($inp),$t1
    559 	jmp	.Lenc_loop_compact
    560 .align	16
    561 .Lenc_loop_compact:
    562 		xor	0($key),$s0		# xor with key
    563 		xor	4($key),$s1
    564 		xor	8($key),$s2
    565 		xor	12($key),$s3
    566 		lea	16($key),$key
    567 ___
    568 		&enccompactvert();
    569 $code.=<<___;
    570 		cmp	16(%rsp),$key
    571 		je	.Lenc_compact_done
    572 ___
    573 		&enctransform();
    574 $code.=<<___;
    575 	jmp	.Lenc_loop_compact
    576 .align	16
    577 .Lenc_compact_done:
    578 	xor	0($key),$s0
    579 	xor	4($key),$s1
    580 	xor	8($key),$s2
    581 	xor	12($key),$s3
    582 	.byte	0xf3,0xc3			# rep ret
    583 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
    584 ___
    585 
    586 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
    587 $code.=<<___;
    588 .globl	AES_encrypt
    589 .type	AES_encrypt,\@function,3
    590 .align	16
    591 .globl	asm_AES_encrypt
    592 .hidden	asm_AES_encrypt
    593 asm_AES_encrypt:
    594 AES_encrypt:
    595 	push	%rbx
    596 	push	%rbp
    597 	push	%r12
    598 	push	%r13
    599 	push	%r14
    600 	push	%r15
    601 
    602 	# allocate frame "above" key schedule
    603 	mov	%rsp,%r10
    604 	lea	-63(%rdx),%rcx	# %rdx is key argument
    605 	and	\$-64,%rsp
    606 	sub	%rsp,%rcx
    607 	neg	%rcx
    608 	and	\$0x3c0,%rcx
    609 	sub	%rcx,%rsp
    610 	sub	\$32,%rsp
    611 
    612 	mov	%rsi,16(%rsp)	# save out
    613 	mov	%r10,24(%rsp)	# save real stack pointer
    614 .Lenc_prologue:
    615 
    616 	mov	%rdx,$key
    617 	mov	240($key),$rnds	# load rounds
    618 
    619 	mov	0(%rdi),$s0	# load input vector
    620 	mov	4(%rdi),$s1
    621 	mov	8(%rdi),$s2
    622 	mov	12(%rdi),$s3
    623 
    624 	shl	\$4,$rnds
    625 	lea	($key,$rnds),%rbp
    626 	mov	$key,(%rsp)	# key schedule
    627 	mov	%rbp,8(%rsp)	# end of key schedule
    628 
    629 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
    630 	lea	.LAES_Te+2048(%rip),$sbox
    631 	lea	768(%rsp),%rbp
    632 	sub	$sbox,%rbp
    633 	and	\$0x300,%rbp
    634 	lea	($sbox,%rbp),$sbox
    635 
    636 	call	_x86_64_AES_encrypt_compact
    637 
    638 	mov	16(%rsp),$out	# restore out
    639 	mov	24(%rsp),%rsi	# restore saved stack pointer
    640 	mov	$s0,0($out)	# write output vector
    641 	mov	$s1,4($out)
    642 	mov	$s2,8($out)
    643 	mov	$s3,12($out)
    644 
    645 	mov	(%rsi),%r15
    646 	mov	8(%rsi),%r14
    647 	mov	16(%rsi),%r13
    648 	mov	24(%rsi),%r12
    649 	mov	32(%rsi),%rbp
    650 	mov	40(%rsi),%rbx
    651 	lea	48(%rsi),%rsp
    652 .Lenc_epilogue:
    653 	ret
    654 .size	AES_encrypt,.-AES_encrypt
    655 ___
    656 
    657 #------------------------------------------------------------------#
    658 
    659 sub decvert()
    660 { my $t3="%r8d";	# zaps $inp!
    661 
    662 $code.=<<___;
    663 	# favor 3-way issue Opteron pipeline...
    664 	movzb	`&lo("$s0")`,$acc0
    665 	movzb	`&lo("$s1")`,$acc1
    666 	movzb	`&lo("$s2")`,$acc2
    667 	mov	0($sbox,$acc0,8),$t0
    668 	mov	0($sbox,$acc1,8),$t1
    669 	mov	0($sbox,$acc2,8),$t2
    670 
    671 	movzb	`&hi("$s3")`,$acc0
    672 	movzb	`&hi("$s0")`,$acc1
    673 	movzb	`&lo("$s3")`,$acc2
    674 	xor	3($sbox,$acc0,8),$t0
    675 	xor	3($sbox,$acc1,8),$t1
    676 	mov	0($sbox,$acc2,8),$t3
    677 
    678 	movzb	`&hi("$s1")`,$acc0
    679 	shr	\$16,$s0
    680 	movzb	`&hi("$s2")`,$acc2
    681 	xor	3($sbox,$acc0,8),$t2
    682 	shr	\$16,$s3
    683 	xor	3($sbox,$acc2,8),$t3
    684 
    685 	shr	\$16,$s1
    686 	lea	16($key),$key
    687 	shr	\$16,$s2
    688 
    689 	movzb	`&lo("$s2")`,$acc0
    690 	movzb	`&lo("$s3")`,$acc1
    691 	movzb	`&lo("$s0")`,$acc2
    692 	xor	2($sbox,$acc0,8),$t0
    693 	xor	2($sbox,$acc1,8),$t1
    694 	xor	2($sbox,$acc2,8),$t2
    695 
    696 	movzb	`&hi("$s1")`,$acc0
    697 	movzb	`&hi("$s2")`,$acc1
    698 	movzb	`&lo("$s1")`,$acc2
    699 	xor	1($sbox,$acc0,8),$t0
    700 	xor	1($sbox,$acc1,8),$t1
    701 	xor	2($sbox,$acc2,8),$t3
    702 
    703 	movzb	`&hi("$s3")`,$acc0
    704 	mov	12($key),$s3
    705 	movzb	`&hi("$s0")`,$acc2
    706 	xor	1($sbox,$acc0,8),$t2
    707 	mov	0($key),$s0
    708 	xor	1($sbox,$acc2,8),$t3
    709 
    710 	xor	$t0,$s0
    711 	mov	4($key),$s1
    712 	mov	8($key),$s2
    713 	xor	$t2,$s2
    714 	xor	$t1,$s1
    715 	xor	$t3,$s3
    716 ___
    717 }
    718 
    719 sub declastvert()
    720 { my $t3="%r8d";	# zaps $inp!
    721 
    722 $code.=<<___;
    723 	lea	2048($sbox),$sbox	# size optimization
    724 	movzb	`&lo("$s0")`,$acc0
    725 	movzb	`&lo("$s1")`,$acc1
    726 	movzb	`&lo("$s2")`,$acc2
    727 	movzb	($sbox,$acc0,1),$t0
    728 	movzb	($sbox,$acc1,1),$t1
    729 	movzb	($sbox,$acc2,1),$t2
    730 
    731 	movzb	`&lo("$s3")`,$acc0
    732 	movzb	`&hi("$s3")`,$acc1
    733 	movzb	`&hi("$s0")`,$acc2
    734 	movzb	($sbox,$acc0,1),$t3
    735 	movzb	($sbox,$acc1,1),$acc1	#$t0
    736 	movzb	($sbox,$acc2,1),$acc2	#$t1
    737 
    738 	shl	\$8,$acc1
    739 	shl	\$8,$acc2
    740 
    741 	xor	$acc1,$t0
    742 	xor	$acc2,$t1
    743 	shr	\$16,$s3
    744 
    745 	movzb	`&hi("$s1")`,$acc0
    746 	movzb	`&hi("$s2")`,$acc1
    747 	shr	\$16,$s0
    748 	movzb	($sbox,$acc0,1),$acc0	#$t2
    749 	movzb	($sbox,$acc1,1),$acc1	#$t3
    750 
    751 	shl	\$8,$acc0
    752 	shl	\$8,$acc1
    753 	shr	\$16,$s1
    754 	xor	$acc0,$t2
    755 	xor	$acc1,$t3
    756 	shr	\$16,$s2
    757 
    758 	movzb	`&lo("$s2")`,$acc0
    759 	movzb	`&lo("$s3")`,$acc1
    760 	movzb	`&lo("$s0")`,$acc2
    761 	movzb	($sbox,$acc0,1),$acc0	#$t0
    762 	movzb	($sbox,$acc1,1),$acc1	#$t1
    763 	movzb	($sbox,$acc2,1),$acc2	#$t2
    764 
    765 	shl	\$16,$acc0
    766 	shl	\$16,$acc1
    767 	shl	\$16,$acc2
    768 
    769 	xor	$acc0,$t0
    770 	xor	$acc1,$t1
    771 	xor	$acc2,$t2
    772 
    773 	movzb	`&lo("$s1")`,$acc0
    774 	movzb	`&hi("$s1")`,$acc1
    775 	movzb	`&hi("$s2")`,$acc2
    776 	movzb	($sbox,$acc0,1),$acc0	#$t3
    777 	movzb	($sbox,$acc1,1),$acc1	#$t0
    778 	movzb	($sbox,$acc2,1),$acc2	#$t1
    779 
    780 	shl	\$16,$acc0
    781 	shl	\$24,$acc1
    782 	shl	\$24,$acc2
    783 
    784 	xor	$acc0,$t3
    785 	xor	$acc1,$t0
    786 	xor	$acc2,$t1
    787 
    788 	movzb	`&hi("$s3")`,$acc0
    789 	movzb	`&hi("$s0")`,$acc1
    790 	mov	16+12($key),$s3
    791 	movzb	($sbox,$acc0,1),$acc0	#$t2
    792 	movzb	($sbox,$acc1,1),$acc1	#$t3
    793 	mov	16+0($key),$s0
    794 
    795 	shl	\$24,$acc0
    796 	shl	\$24,$acc1
    797 
    798 	xor	$acc0,$t2
    799 	xor	$acc1,$t3
    800 
    801 	mov	16+4($key),$s1
    802 	mov	16+8($key),$s2
    803 	lea	-2048($sbox),$sbox
    804 	xor	$t0,$s0
    805 	xor	$t1,$s1
    806 	xor	$t2,$s2
    807 	xor	$t3,$s3
    808 ___
    809 }
    810 
    811 sub decstep()
    812 { my ($i,@s) = @_;
    813   my $tmp0=$acc0;
    814   my $tmp1=$acc1;
    815   my $tmp2=$acc2;
    816   my $out=($t0,$t1,$t2,$s[0])[$i];
    817 
    818 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    819 			$tmp1=$s[2]			if ($i==3);
    820 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    821 	$code.="	and	\$0xFF,$out\n";
    822 
    823 	$code.="	mov	0($sbox,$out,8),$out\n";
    824 	$code.="	shr	\$16,$tmp1\n";
    825 			$tmp2=$s[3]			if ($i==3);
    826 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    827 
    828 			$tmp0=$s[1]			if ($i==3);
    829 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    830 	$code.="	and	\$0xFF,$tmp1\n";
    831 	$code.="	shr	\$24,$tmp2\n";
    832 
    833 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    834 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    835 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    836 
    837 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    838 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    839 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    840 	$code.="\n";
    841 }
    842 
    843 sub declast()
    844 { my ($i,@s)=@_;
    845   my $tmp0=$acc0;
    846   my $tmp1=$acc1;
    847   my $tmp2=$acc2;
    848   my $out=($t0,$t1,$t2,$s[0])[$i];
    849 
    850 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    851 			$tmp1=$s[2]			if ($i==3);
    852 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    853 	$code.="	and	\$0xFF,$out\n";
    854 
    855 	$code.="	movzb	2048($sbox,$out,1),$out\n";
    856 	$code.="	shr	\$16,$tmp1\n";
    857 			$tmp2=$s[3]			if ($i==3);
    858 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    859 
    860 			$tmp0=$s[1]			if ($i==3);
    861 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    862 	$code.="	and	\$0xFF,$tmp1\n";
    863 	$code.="	shr	\$24,$tmp2\n";
    864 
    865 	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
    866 	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
    867 	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
    868 
    869 	$code.="	shl	\$8,$tmp0\n";
    870 	$code.="	shl	\$16,$tmp1\n";
    871 	$code.="	shl	\$24,$tmp2\n";
    872 
    873 	$code.="	xor	$tmp0,$out\n";
    874 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    875 	$code.="	xor	$tmp1,$out\n";
    876 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    877 	$code.="	xor	$tmp2,$out\n";
    878 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    879 	$code.="\n";
    880 }
    881 
    882 $code.=<<___;
    883 .type	_x86_64_AES_decrypt,\@abi-omnipotent
    884 .align	16
    885 _x86_64_AES_decrypt:
    886 	xor	0($key),$s0			# xor with key
    887 	xor	4($key),$s1
    888 	xor	8($key),$s2
    889 	xor	12($key),$s3
    890 
    891 	mov	240($key),$rnds			# load key->rounds
    892 	sub	\$1,$rnds
    893 	jmp	.Ldec_loop
    894 .align	16
    895 .Ldec_loop:
    896 ___
    897 	if ($verticalspin) { &decvert(); }
    898 	else {	&decstep(0,$s0,$s3,$s2,$s1);
    899 		&decstep(1,$s1,$s0,$s3,$s2);
    900 		&decstep(2,$s2,$s1,$s0,$s3);
    901 		&decstep(3,$s3,$s2,$s1,$s0);
    902 		$code.=<<___;
    903 		lea	16($key),$key
    904 		xor	0($key),$s0			# xor with key
    905 		xor	4($key),$s1
    906 		xor	8($key),$s2
    907 		xor	12($key),$s3
    908 ___
    909 	}
    910 $code.=<<___;
    911 	sub	\$1,$rnds
    912 	jnz	.Ldec_loop
    913 ___
    914 	if ($verticalspin) { &declastvert(); }
    915 	else {	&declast(0,$s0,$s3,$s2,$s1);
    916 		&declast(1,$s1,$s0,$s3,$s2);
    917 		&declast(2,$s2,$s1,$s0,$s3);
    918 		&declast(3,$s3,$s2,$s1,$s0);
    919 		$code.=<<___;
    920 		xor	16+0($key),$s0			# xor with key
    921 		xor	16+4($key),$s1
    922 		xor	16+8($key),$s2
    923 		xor	16+12($key),$s3
    924 ___
    925 	}
    926 $code.=<<___;
    927 	.byte	0xf3,0xc3			# rep ret
    928 .size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
    929 ___
    930 
    931 sub deccompactvert()
    932 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    933 
    934 $code.=<<___;
    935 	movzb	`&lo("$s0")`,$t0
    936 	movzb	`&lo("$s1")`,$t1
    937 	movzb	`&lo("$s2")`,$t2
    938 	movzb	($sbox,$t0,1),$t0
    939 	movzb	($sbox,$t1,1),$t1
    940 	movzb	($sbox,$t2,1),$t2
    941 
    942 	movzb	`&lo("$s3")`,$t3
    943 	movzb	`&hi("$s3")`,$acc0
    944 	movzb	`&hi("$s0")`,$acc1
    945 	movzb	($sbox,$t3,1),$t3
    946 	movzb	($sbox,$acc0,1),$t4	#$t0
    947 	movzb	($sbox,$acc1,1),$t5	#$t1
    948 
    949 	movzb	`&hi("$s1")`,$acc2
    950 	movzb	`&hi("$s2")`,$acc0
    951 	shr	\$16,$s2
    952 	movzb	($sbox,$acc2,1),$acc2	#$t2
    953 	movzb	($sbox,$acc0,1),$acc0	#$t3
    954 	shr	\$16,$s3
    955 
    956 	movzb	`&lo("$s2")`,$acc1
    957 	shl	\$8,$t4
    958 	shl	\$8,$t5
    959 	movzb	($sbox,$acc1,1),$acc1	#$t0
    960 	xor	$t4,$t0
    961 	xor	$t5,$t1
    962 
    963 	movzb	`&lo("$s3")`,$t4
    964 	shr	\$16,$s0
    965 	shr	\$16,$s1
    966 	movzb	`&lo("$s0")`,$t5
    967 	shl	\$8,$acc2
    968 	shl	\$8,$acc0
    969 	movzb	($sbox,$t4,1),$t4	#$t1
    970 	movzb	($sbox,$t5,1),$t5	#$t2
    971 	xor	$acc2,$t2
    972 	xor	$acc0,$t3
    973 
    974 	movzb	`&lo("$s1")`,$acc2
    975 	movzb	`&hi("$s1")`,$acc0
    976 	shl	\$16,$acc1
    977 	movzb	($sbox,$acc2,1),$acc2	#$t3
    978 	movzb	($sbox,$acc0,1),$acc0	#$t0
    979 	xor	$acc1,$t0
    980 
    981 	movzb	`&hi("$s2")`,$acc1
    982 	shl	\$16,$t4
    983 	shl	\$16,$t5
    984 	movzb	($sbox,$acc1,1),$s1	#$t1
    985 	xor	$t4,$t1
    986 	xor	$t5,$t2
    987 
    988 	movzb	`&hi("$s3")`,$acc1
    989 	shr	\$8,$s0
    990 	shl	\$16,$acc2
    991 	movzb	($sbox,$acc1,1),$s2	#$t2
    992 	movzb	($sbox,$s0,1),$s3	#$t3
    993 	xor	$acc2,$t3
    994 
    995 	shl	\$24,$acc0
    996 	shl	\$24,$s1
    997 	shl	\$24,$s2
    998 	xor	$acc0,$t0
    999 	shl	\$24,$s3
   1000 	xor	$t1,$s1
   1001 	mov	$t0,$s0
   1002 	xor	$t2,$s2
   1003 	xor	$t3,$s3
   1004 ___
   1005 }
   1006 
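# A single-column Perl sketch (illustrative only, never called, names made up)
# of the identity the parallelized dectransform below implements: with
# tp2=xtime(tp1), tp4=xtime(tp2) and tp8=xtime(tp4), InvMixColumn of a packed
# column tp1 is
#	(tp2^tp4^tp8) ^ ROL(tp1^tp8,8) ^ ROL(tp1^tp4^tp8,16) ^ ROL(tp1^tp2^tp8,24)
sub _invmixcolumn_sketch
{ my $tp1 = shift;
  my $rol = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff };
  my $xtime = sub { my $s = shift; my $m = $s & 0x80808080;
		    (($s<<1) & 0xfefefefe) ^ (($m-($m>>7)) & 0x1b1b1b1b) };
  my $tp2 = $xtime->($tp1);
  my $tp4 = $xtime->($tp2);
  my $tp8 = $xtime->($tp4);
    ($tp2^$tp4^$tp8) ^ $rol->($tp1^$tp8,8) ^
    $rol->($tp1^$tp4^$tp8,16) ^ $rol->($tp1^$tp2^$tp8,24);
}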
    1007 # parallelized version! input is a pair of 64-bit values: %rax=s1.s0
    1008 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
    1009 # %ecx=s2 and %edx=s3.
   1010 sub dectransform()
   1011 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
   1012   my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
   1013   my $prefetch = shift;
   1014 
   1015 $code.=<<___;
   1016 	mov	$tp10,$acc0
   1017 	mov	$tp18,$acc8
   1018 	and	$mask80,$acc0
   1019 	and	$mask80,$acc8
   1020 	mov	$acc0,$tp40
   1021 	mov	$acc8,$tp48
   1022 	shr	\$7,$tp40
   1023 	lea	($tp10,$tp10),$tp20
   1024 	shr	\$7,$tp48
   1025 	lea	($tp18,$tp18),$tp28
   1026 	sub	$tp40,$acc0
   1027 	sub	$tp48,$acc8
   1028 	and	$maskfe,$tp20
   1029 	and	$maskfe,$tp28
   1030 	and	$mask1b,$acc0
   1031 	and	$mask1b,$acc8
   1032 	xor	$tp20,$acc0
   1033 	xor	$tp28,$acc8
   1034 	mov	$acc0,$tp20
   1035 	mov	$acc8,$tp28
   1036 
   1037 	and	$mask80,$acc0
   1038 	and	$mask80,$acc8
   1039 	mov	$acc0,$tp80
   1040 	mov	$acc8,$tp88
   1041 	shr	\$7,$tp80
   1042 	lea	($tp20,$tp20),$tp40
   1043 	shr	\$7,$tp88
   1044 	lea	($tp28,$tp28),$tp48
   1045 	sub	$tp80,$acc0
   1046 	sub	$tp88,$acc8
   1047 	and	$maskfe,$tp40
   1048 	and	$maskfe,$tp48
   1049 	and	$mask1b,$acc0
   1050 	and	$mask1b,$acc8
   1051 	xor	$tp40,$acc0
   1052 	xor	$tp48,$acc8
   1053 	mov	$acc0,$tp40
   1054 	mov	$acc8,$tp48
   1055 
   1056 	and	$mask80,$acc0
   1057 	and	$mask80,$acc8
   1058 	mov	$acc0,$tp80
   1059 	mov	$acc8,$tp88
   1060 	shr	\$7,$tp80
   1061 	 xor	$tp10,$tp20		# tp2^=tp1
   1062 	shr	\$7,$tp88
   1063 	 xor	$tp18,$tp28		# tp2^=tp1
   1064 	sub	$tp80,$acc0
   1065 	sub	$tp88,$acc8
   1066 	lea	($tp40,$tp40),$tp80
   1067 	lea	($tp48,$tp48),$tp88
   1068 	 xor	$tp10,$tp40		# tp4^=tp1
   1069 	 xor	$tp18,$tp48		# tp4^=tp1
   1070 	and	$maskfe,$tp80
   1071 	and	$maskfe,$tp88
   1072 	and	$mask1b,$acc0
   1073 	and	$mask1b,$acc8
   1074 	xor	$acc0,$tp80
   1075 	xor	$acc8,$tp88
   1076 
   1077 	xor	$tp80,$tp10		# tp1^=tp8
   1078 	xor	$tp88,$tp18		# tp1^=tp8
   1079 	xor	$tp80,$tp20		# tp2^tp1^=tp8
   1080 	xor	$tp88,$tp28		# tp2^tp1^=tp8
   1081 	mov	$tp10,$acc0
   1082 	mov	$tp18,$acc8
   1083 	xor	$tp80,$tp40		# tp4^tp1^=tp8
   1084 	xor	$tp88,$tp48		# tp4^tp1^=tp8
   1085 	shr	\$32,$acc0
   1086 	shr	\$32,$acc8
   1087 	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
   1088 	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
   1089 	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
   1090 	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
   1091 	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1092 	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1093 
   1094 	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
   1095 	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
   1096 	xor	`&LO("$tp80")`,`&LO("$tp10")`
   1097 	xor	`&LO("$tp88")`,`&LO("$tp18")`
   1098 	shr	\$32,$tp80
   1099 	shr	\$32,$tp88
   1100 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1101 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1102 
   1103 	mov	$tp20,$tp80
   1104 	mov	$tp28,$tp88
   1105 	shr	\$32,$tp80
   1106 	shr	\$32,$tp88
   1107 	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
   1108 	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
   1109 	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
   1110 	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
   1111 	xor	`&LO("$tp20")`,`&LO("$tp10")`
   1112 	xor	`&LO("$tp28")`,`&LO("$tp18")`
   1113 	mov	$tp40,$tp20
   1114 	mov	$tp48,$tp28
   1115 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1116 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1117 
   1118 	`"mov	0($sbox),$mask80"	if ($prefetch)`
   1119 	shr	\$32,$tp20
   1120 	shr	\$32,$tp28
   1121 	`"mov	64($sbox),$maskfe"	if ($prefetch)`
   1122 	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
   1123 	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
   1124 	`"mov	128($sbox),$mask1b"	if ($prefetch)`
   1125 	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
   1126 	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
   1127 	`"mov	192($sbox),$tp80"	if ($prefetch)`
   1128 	xor	`&LO("$tp40")`,`&LO("$tp10")`
   1129 	xor	`&LO("$tp48")`,`&LO("$tp18")`
   1130 	`"mov	256($sbox),$tp88"	if ($prefetch)`
   1131 	xor	`&LO("$tp20")`,`&LO("$acc0")`
   1132 	xor	`&LO("$tp28")`,`&LO("$acc8")`
   1133 ___
   1134 }
   1135 
   1136 $code.=<<___;
   1137 .type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
   1138 .align	16
   1139 _x86_64_AES_decrypt_compact:
   1140 	lea	128($sbox),$inp			# size optimization
   1141 	mov	0-128($inp),$acc1		# prefetch Td4
   1142 	mov	32-128($inp),$acc2
   1143 	mov	64-128($inp),$t0
   1144 	mov	96-128($inp),$t1
   1145 	mov	128-128($inp),$acc1
   1146 	mov	160-128($inp),$acc2
   1147 	mov	192-128($inp),$t0
   1148 	mov	224-128($inp),$t1
   1149 	jmp	.Ldec_loop_compact
   1150 
   1151 .align	16
   1152 .Ldec_loop_compact:
   1153 		xor	0($key),$s0		# xor with key
   1154 		xor	4($key),$s1
   1155 		xor	8($key),$s2
   1156 		xor	12($key),$s3
   1157 		lea	16($key),$key
   1158 ___
   1159 		&deccompactvert();
   1160 $code.=<<___;
   1161 		cmp	16(%rsp),$key
   1162 		je	.Ldec_compact_done
   1163 
   1164 		mov	256+0($sbox),$mask80
   1165 		shl	\$32,%rbx
   1166 		shl	\$32,%rdx
   1167 		mov	256+8($sbox),$maskfe
   1168 		or	%rbx,%rax
   1169 		or	%rdx,%rcx
   1170 		mov	256+16($sbox),$mask1b
   1171 ___
   1172 		&dectransform(1);
   1173 $code.=<<___;
   1174 	jmp	.Ldec_loop_compact
   1175 .align	16
   1176 .Ldec_compact_done:
   1177 	xor	0($key),$s0
   1178 	xor	4($key),$s1
   1179 	xor	8($key),$s2
   1180 	xor	12($key),$s3
   1181 	.byte	0xf3,0xc3			# rep ret
   1182 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
   1183 ___
   1184 
   1185 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
   1186 $code.=<<___;
   1187 .globl	AES_decrypt
   1188 .type	AES_decrypt,\@function,3
   1189 .align	16
   1190 .globl	asm_AES_decrypt
   1191 .hidden	asm_AES_decrypt
   1192 asm_AES_decrypt:
   1193 AES_decrypt:
   1194 	push	%rbx
   1195 	push	%rbp
   1196 	push	%r12
   1197 	push	%r13
   1198 	push	%r14
   1199 	push	%r15
   1200 
   1201 	# allocate frame "above" key schedule
   1202 	mov	%rsp,%r10
   1203 	lea	-63(%rdx),%rcx	# %rdx is key argument
   1204 	and	\$-64,%rsp
   1205 	sub	%rsp,%rcx
   1206 	neg	%rcx
   1207 	and	\$0x3c0,%rcx
   1208 	sub	%rcx,%rsp
   1209 	sub	\$32,%rsp
   1210 
   1211 	mov	%rsi,16(%rsp)	# save out
   1212 	mov	%r10,24(%rsp)	# save real stack pointer
   1213 .Ldec_prologue:
   1214 
   1215 	mov	%rdx,$key
   1216 	mov	240($key),$rnds	# load rounds
   1217 
   1218 	mov	0(%rdi),$s0	# load input vector
   1219 	mov	4(%rdi),$s1
   1220 	mov	8(%rdi),$s2
   1221 	mov	12(%rdi),$s3
   1222 
   1223 	shl	\$4,$rnds
   1224 	lea	($key,$rnds),%rbp
   1225 	mov	$key,(%rsp)	# key schedule
   1226 	mov	%rbp,8(%rsp)	# end of key schedule
   1227 
   1228 	# pick Td4 copy which can't "overlap" with stack frame or key schedule
   1229 	lea	.LAES_Td+2048(%rip),$sbox
   1230 	lea	768(%rsp),%rbp
   1231 	sub	$sbox,%rbp
   1232 	and	\$0x300,%rbp
   1233 	lea	($sbox,%rbp),$sbox
   1234 	shr	\$3,%rbp	# recall "magic" constants!
   1235 	add	%rbp,$sbox
   1236 
   1237 	call	_x86_64_AES_decrypt_compact
   1238 
   1239 	mov	16(%rsp),$out	# restore out
   1240 	mov	24(%rsp),%rsi	# restore saved stack pointer
   1241 	mov	$s0,0($out)	# write output vector
   1242 	mov	$s1,4($out)
   1243 	mov	$s2,8($out)
   1244 	mov	$s3,12($out)
   1245 
   1246 	mov	(%rsi),%r15
   1247 	mov	8(%rsi),%r14
   1248 	mov	16(%rsi),%r13
   1249 	mov	24(%rsi),%r12
   1250 	mov	32(%rsi),%rbp
   1251 	mov	40(%rsi),%rbx
   1252 	lea	48(%rsi),%rsp
   1253 .Ldec_epilogue:
   1254 	ret
   1255 .size	AES_decrypt,.-AES_decrypt
   1256 ___
   1257 #------------------------------------------------------------------#
   1258 
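# enckey() emits one application of the key-expansion "g" function: with the
# previous round-key word in %edx and rk[0] in %eax it leaves
#	%eax = rk[0] ^ SubWord(RotWord(%edx)) ^ rcon
# looking the S-box bytes up in the Te4 copy at .LAES_Te+2048 (addressed as
# -128(%rbp,...) because %rbp points 128 bytes into it).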
   1259 sub enckey()
   1260 {
   1261 $code.=<<___;
   1262 	movz	%dl,%esi		# rk[i]>>0
   1263 	movzb	-128(%rbp,%rsi),%ebx
   1264 	movz	%dh,%esi		# rk[i]>>8
   1265 	shl	\$24,%ebx
   1266 	xor	%ebx,%eax
   1267 
   1268 	movzb	-128(%rbp,%rsi),%ebx
   1269 	shr	\$16,%edx
   1270 	movz	%dl,%esi		# rk[i]>>16
   1271 	xor	%ebx,%eax
   1272 
   1273 	movzb	-128(%rbp,%rsi),%ebx
   1274 	movz	%dh,%esi		# rk[i]>>24
   1275 	shl	\$8,%ebx
   1276 	xor	%ebx,%eax
   1277 
   1278 	movzb	-128(%rbp,%rsi),%ebx
   1279 	shl	\$16,%ebx
   1280 	xor	%ebx,%eax
   1281 
   1282 	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
   1283 ___
   1284 }
   1285 
   1286 # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
   1287 #                        AES_KEY *key)
   1288 $code.=<<___;
   1289 .globl	private_AES_set_encrypt_key
   1290 .type	private_AES_set_encrypt_key,\@function,3
   1291 .align	16
   1292 private_AES_set_encrypt_key:
   1293 	push	%rbx
   1294 	push	%rbp
    1295 	push	%r12			# redundant, but allows sharing the
    1296 	push	%r13			# exception handler...
   1297 	push	%r14
   1298 	push	%r15
   1299 	sub	\$8,%rsp
   1300 .Lenc_key_prologue:
   1301 
   1302 	call	_x86_64_AES_set_encrypt_key
   1303 
   1304 	mov	8(%rsp),%r15
   1305 	mov	16(%rsp),%r14
   1306 	mov	24(%rsp),%r13
   1307 	mov	32(%rsp),%r12
   1308 	mov	40(%rsp),%rbp
   1309 	mov	48(%rsp),%rbx
   1310 	add	\$56,%rsp
   1311 .Lenc_key_epilogue:
   1312 	ret
   1313 .size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
   1314 
   1315 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
   1316 .align	16
   1317 _x86_64_AES_set_encrypt_key:
   1318 	mov	%esi,%ecx			# %ecx=bits
   1319 	mov	%rdi,%rsi			# %rsi=userKey
   1320 	mov	%rdx,%rdi			# %rdi=key
   1321 
   1322 	test	\$-1,%rsi
   1323 	jz	.Lbadpointer
   1324 	test	\$-1,%rdi
   1325 	jz	.Lbadpointer
   1326 
   1327 	lea	.LAES_Te(%rip),%rbp
   1328 	lea	2048+128(%rbp),%rbp
   1329 
   1330 	# prefetch Te4
   1331 	mov	0-128(%rbp),%eax
   1332 	mov	32-128(%rbp),%ebx
   1333 	mov	64-128(%rbp),%r8d
   1334 	mov	96-128(%rbp),%edx
   1335 	mov	128-128(%rbp),%eax
   1336 	mov	160-128(%rbp),%ebx
   1337 	mov	192-128(%rbp),%r8d
   1338 	mov	224-128(%rbp),%edx
   1339 
   1340 	cmp	\$128,%ecx
   1341 	je	.L10rounds
   1342 	cmp	\$192,%ecx
   1343 	je	.L12rounds
   1344 	cmp	\$256,%ecx
   1345 	je	.L14rounds
   1346 	mov	\$-2,%rax			# invalid number of bits
   1347 	jmp	.Lexit
   1348 
   1349 .L10rounds:
   1350 	mov	0(%rsi),%rax			# copy first 4 dwords
   1351 	mov	8(%rsi),%rdx
   1352 	mov	%rax,0(%rdi)
   1353 	mov	%rdx,8(%rdi)
   1354 
   1355 	shr	\$32,%rdx
   1356 	xor	%ecx,%ecx
   1357 	jmp	.L10shortcut
   1358 .align	4
   1359 .L10loop:
   1360 		mov	0(%rdi),%eax			# rk[0]
   1361 		mov	12(%rdi),%edx			# rk[3]
   1362 .L10shortcut:
   1363 ___
   1364 		&enckey	();
   1365 $code.=<<___;
   1366 		mov	%eax,16(%rdi)			# rk[4]
   1367 		xor	4(%rdi),%eax
   1368 		mov	%eax,20(%rdi)			# rk[5]
   1369 		xor	8(%rdi),%eax
   1370 		mov	%eax,24(%rdi)			# rk[6]
   1371 		xor	12(%rdi),%eax
   1372 		mov	%eax,28(%rdi)			# rk[7]
   1373 		add	\$1,%ecx
   1374 		lea	16(%rdi),%rdi
   1375 		cmp	\$10,%ecx
   1376 	jl	.L10loop
   1377 
   1378 	movl	\$10,80(%rdi)			# setup number of rounds
   1379 	xor	%rax,%rax
   1380 	jmp	.Lexit
   1381 
   1382 .L12rounds:
   1383 	mov	0(%rsi),%rax			# copy first 6 dwords
   1384 	mov	8(%rsi),%rbx
   1385 	mov	16(%rsi),%rdx
   1386 	mov	%rax,0(%rdi)
   1387 	mov	%rbx,8(%rdi)
   1388 	mov	%rdx,16(%rdi)
   1389 
   1390 	shr	\$32,%rdx
   1391 	xor	%ecx,%ecx
   1392 	jmp	.L12shortcut
   1393 .align	4
   1394 .L12loop:
   1395 		mov	0(%rdi),%eax			# rk[0]
   1396 		mov	20(%rdi),%edx			# rk[5]
   1397 .L12shortcut:
   1398 ___
   1399 		&enckey	();
   1400 $code.=<<___;
   1401 		mov	%eax,24(%rdi)			# rk[6]
   1402 		xor	4(%rdi),%eax
   1403 		mov	%eax,28(%rdi)			# rk[7]
   1404 		xor	8(%rdi),%eax
   1405 		mov	%eax,32(%rdi)			# rk[8]
   1406 		xor	12(%rdi),%eax
   1407 		mov	%eax,36(%rdi)			# rk[9]
   1408 
   1409 		cmp	\$7,%ecx
   1410 		je	.L12break
   1411 		add	\$1,%ecx
   1412 
   1413 		xor	16(%rdi),%eax
   1414 		mov	%eax,40(%rdi)			# rk[10]
   1415 		xor	20(%rdi),%eax
   1416 		mov	%eax,44(%rdi)			# rk[11]
   1417 
   1418 		lea	24(%rdi),%rdi
   1419 	jmp	.L12loop
   1420 .L12break:
   1421 	movl	\$12,72(%rdi)		# setup number of rounds
   1422 	xor	%rax,%rax
   1423 	jmp	.Lexit
   1424 
   1425 .L14rounds:		
   1426 	mov	0(%rsi),%rax			# copy first 8 dwords
   1427 	mov	8(%rsi),%rbx
   1428 	mov	16(%rsi),%rcx
   1429 	mov	24(%rsi),%rdx
   1430 	mov	%rax,0(%rdi)
   1431 	mov	%rbx,8(%rdi)
   1432 	mov	%rcx,16(%rdi)
   1433 	mov	%rdx,24(%rdi)
   1434 
   1435 	shr	\$32,%rdx
   1436 	xor	%ecx,%ecx
   1437 	jmp	.L14shortcut
   1438 .align	4
   1439 .L14loop:
   1440 		mov	0(%rdi),%eax			# rk[0]
   1441 		mov	28(%rdi),%edx			# rk[4]
   1442 .L14shortcut:
   1443 ___
   1444 		&enckey	();
   1445 $code.=<<___;
   1446 		mov	%eax,32(%rdi)			# rk[8]
   1447 		xor	4(%rdi),%eax
   1448 		mov	%eax,36(%rdi)			# rk[9]
   1449 		xor	8(%rdi),%eax
   1450 		mov	%eax,40(%rdi)			# rk[10]
   1451 		xor	12(%rdi),%eax
   1452 		mov	%eax,44(%rdi)			# rk[11]
   1453 
   1454 		cmp	\$6,%ecx
   1455 		je	.L14break
   1456 		add	\$1,%ecx
   1457 
   1458 		mov	%eax,%edx
   1459 		mov	16(%rdi),%eax			# rk[4]
   1460 		movz	%dl,%esi			# rk[11]>>0
   1461 		movzb	-128(%rbp,%rsi),%ebx
   1462 		movz	%dh,%esi			# rk[11]>>8
   1463 		xor	%ebx,%eax
   1464 
   1465 		movzb	-128(%rbp,%rsi),%ebx
   1466 		shr	\$16,%edx
   1467 		shl	\$8,%ebx
   1468 		movz	%dl,%esi			# rk[11]>>16
   1469 		xor	%ebx,%eax
   1470 
   1471 		movzb	-128(%rbp,%rsi),%ebx
   1472 		movz	%dh,%esi			# rk[11]>>24
   1473 		shl	\$16,%ebx
   1474 		xor	%ebx,%eax
   1475 
   1476 		movzb	-128(%rbp,%rsi),%ebx
   1477 		shl	\$24,%ebx
   1478 		xor	%ebx,%eax
   1479 
   1480 		mov	%eax,48(%rdi)			# rk[12]
   1481 		xor	20(%rdi),%eax
   1482 		mov	%eax,52(%rdi)			# rk[13]
   1483 		xor	24(%rdi),%eax
   1484 		mov	%eax,56(%rdi)			# rk[14]
   1485 		xor	28(%rdi),%eax
   1486 		mov	%eax,60(%rdi)			# rk[15]
   1487 
   1488 		lea	32(%rdi),%rdi
   1489 	jmp	.L14loop
   1490 .L14break:
   1491 	movl	\$14,48(%rdi)		# setup number of rounds
   1492 	xor	%rax,%rax
   1493 	jmp	.Lexit
   1494 
   1495 .Lbadpointer:
   1496 	mov	\$-1,%rax
   1497 .Lexit:
   1498 	.byte	0xf3,0xc3			# rep ret
   1499 .size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
   1500 ___
   1501 
   1502 sub deckey_ref()
   1503 { my ($i,$ptr,$te,$td) = @_;
   1504   my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
   1505 $code.=<<___;
   1506 	mov	$i($ptr),$tp1
   1507 	mov	$tp1,$acc
   1508 	and	\$0x80808080,$acc
   1509 	mov	$acc,$tp4
   1510 	shr	\$7,$tp4
   1511 	lea	0($tp1,$tp1),$tp2
   1512 	sub	$tp4,$acc
   1513 	and	\$0xfefefefe,$tp2
   1514 	and	\$0x1b1b1b1b,$acc
   1515 	xor	$tp2,$acc
   1516 	mov	$acc,$tp2
   1517 
   1518 	and	\$0x80808080,$acc
   1519 	mov	$acc,$tp8
   1520 	shr	\$7,$tp8
   1521 	lea	0($tp2,$tp2),$tp4
   1522 	sub	$tp8,$acc
   1523 	and	\$0xfefefefe,$tp4
   1524 	and	\$0x1b1b1b1b,$acc
   1525 	 xor	$tp1,$tp2		# tp2^tp1
   1526 	xor	$tp4,$acc
   1527 	mov	$acc,$tp4
   1528 
   1529 	and	\$0x80808080,$acc
   1530 	mov	$acc,$tp8
   1531 	shr	\$7,$tp8
   1532 	sub	$tp8,$acc
   1533 	lea	0($tp4,$tp4),$tp8
   1534 	 xor	$tp1,$tp4		# tp4^tp1
   1535 	and	\$0xfefefefe,$tp8
   1536 	and	\$0x1b1b1b1b,$acc
   1537 	xor	$acc,$tp8
   1538 
   1539 	xor	$tp8,$tp1		# tp1^tp8
   1540 	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
   1541 	xor	$tp8,$tp2		# tp2^tp1^tp8
   1542 	xor	$tp8,$tp4		# tp4^tp1^tp8
   1543 	xor	$tp2,$tp8
   1544 	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
   1545 
   1546 	xor	$tp8,$tp1
   1547 	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
   1548 	xor	$tp2,$tp1
   1549 	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
   1550 	xor	$tp4,$tp1
   1551 
   1552 	mov	$tp1,$i($ptr)
   1553 ___
   1554 }
   1555 
   1556 # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
   1557 #                        AES_KEY *key)
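# The decrypt key schedule produced below is the encrypt schedule with the
# 16-byte round keys listed in reverse order and InvMixColumns (dectransform
# above) applied to every round key except the first and the last (the usual
# "equivalent inverse cipher" arrangement).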
   1558 $code.=<<___;
   1559 .globl	private_AES_set_decrypt_key
   1560 .type	private_AES_set_decrypt_key,\@function,3
   1561 .align	16
   1562 private_AES_set_decrypt_key:
   1563 	push	%rbx
   1564 	push	%rbp
   1565 	push	%r12
   1566 	push	%r13
   1567 	push	%r14
   1568 	push	%r15
   1569 	push	%rdx			# save key schedule
   1570 .Ldec_key_prologue:
   1571 
   1572 	call	_x86_64_AES_set_encrypt_key
   1573 	mov	(%rsp),%r8		# restore key schedule
   1574 	cmp	\$0,%eax
   1575 	jne	.Labort
   1576 
   1577 	mov	240(%r8),%r14d		# pull number of rounds
   1578 	xor	%rdi,%rdi
   1579 	lea	(%rdi,%r14d,4),%rcx
   1580 	mov	%r8,%rsi
   1581 	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
   1582 .align	4
   1583 .Linvert:
   1584 		mov	0(%rsi),%rax
   1585 		mov	8(%rsi),%rbx
   1586 		mov	0(%rdi),%rcx
   1587 		mov	8(%rdi),%rdx
   1588 		mov	%rax,0(%rdi)
   1589 		mov	%rbx,8(%rdi)
   1590 		mov	%rcx,0(%rsi)
   1591 		mov	%rdx,8(%rsi)
   1592 		lea	16(%rsi),%rsi
   1593 		lea	-16(%rdi),%rdi
   1594 		cmp	%rsi,%rdi
   1595 	jne	.Linvert
   1596 
   1597 	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
   1598 
   1599 	mov	40(%rax),$mask80
   1600 	mov	48(%rax),$maskfe
   1601 	mov	56(%rax),$mask1b
   1602 
   1603 	mov	%r8,$key
   1604 	sub	\$1,%r14d
   1605 .align	4
   1606 .Lpermute:
   1607 		lea	16($key),$key
   1608 		mov	0($key),%rax
   1609 		mov	8($key),%rcx
   1610 ___
   1611 		&dectransform ();
   1612 $code.=<<___;
   1613 		mov	%eax,0($key)
   1614 		mov	%ebx,4($key)
   1615 		mov	%ecx,8($key)
   1616 		mov	%edx,12($key)
   1617 		sub	\$1,%r14d
   1618 	jnz	.Lpermute
   1619 
   1620 	xor	%rax,%rax
   1621 .Labort:
   1622 	mov	8(%rsp),%r15
   1623 	mov	16(%rsp),%r14
   1624 	mov	24(%rsp),%r13
   1625 	mov	32(%rsp),%r12
   1626 	mov	40(%rsp),%rbp
   1627 	mov	48(%rsp),%rbx
   1628 	add	\$56,%rsp
   1629 .Ldec_key_epilogue:
   1630 	ret
   1631 .size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
   1632 ___
   1633 
    1634 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
   1635 #			size_t length, const AES_KEY *key,
   1636 #			unsigned char *ivp,const int enc);
   1637 {
   1638 # stack frame layout
   1639 # -8(%rsp)		return address
   1640 my $keyp="0(%rsp)";		# one to pass as $key
   1641 my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
   1642 my $_rsp="16(%rsp)";		# saved %rsp
   1643 my $_inp="24(%rsp)";		# copy of 1st parameter, inp
   1644 my $_out="32(%rsp)";		# copy of 2nd parameter, out
   1645 my $_len="40(%rsp)";		# copy of 3rd parameter, length
   1646 my $_key="48(%rsp)";		# copy of 4th parameter, key
   1647 my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
   1648 my $ivec="64(%rsp)";		# ivec[16]
   1649 my $aes_key="80(%rsp)";		# copy of aes_key
   1650 my $mark="80+240(%rsp)";	# copy of aes_key->rounds
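# The code below implements standard CBC chaining,
#	encrypt: C[i] = E_K(P[i] ^ C[i-1]),	C[-1] = ivec
#	decrypt: P[i] = D_K(C[i]) ^ C[i-1]
# with a "fast" path (large tables, key schedule optionally copied to the
# stack) and a "slow" compact-table path taken for short inputs, lengths that
# are not a multiple of 16, or when the OPENSSL_ia32cap_P check says so.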
   1651 
   1652 $code.=<<___;
   1653 .globl	AES_cbc_encrypt
   1654 .type	AES_cbc_encrypt,\@function,6
   1655 .align	16
   1656 .extern	OPENSSL_ia32cap_P
   1657 .globl	asm_AES_cbc_encrypt
   1658 .hidden	asm_AES_cbc_encrypt
   1659 asm_AES_cbc_encrypt:
   1660 AES_cbc_encrypt:
   1661 	cmp	\$0,%rdx	# check length
   1662 	je	.Lcbc_epilogue
   1663 	pushfq
   1664 	push	%rbx
   1665 	push	%rbp
   1666 	push	%r12
   1667 	push	%r13
   1668 	push	%r14
   1669 	push	%r15
   1670 .Lcbc_prologue:
   1671 
   1672 	cld
   1673 	mov	%r9d,%r9d	# clear upper half of enc
   1674 
   1675 	lea	.LAES_Te(%rip),$sbox
   1676 	cmp	\$0,%r9
   1677 	jne	.Lcbc_picked_te
   1678 	lea	.LAES_Td(%rip),$sbox
   1679 .Lcbc_picked_te:
   1680 
   1681 	mov	OPENSSL_ia32cap_P(%rip),%r10d
   1682 	cmp	\$$speed_limit,%rdx
   1683 	jb	.Lcbc_slow_prologue
   1684 	test	\$15,%rdx
   1685 	jnz	.Lcbc_slow_prologue
   1686 	bt	\$28,%r10d
   1687 	jc	.Lcbc_slow_prologue
   1688 
   1689 	# allocate aligned stack frame...
   1690 	lea	-88-248(%rsp),$key
   1691 	and	\$-64,$key
   1692 
   1693 	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
   1694 	mov	$sbox,%r10
   1695 	lea	2304($sbox),%r11
   1696 	mov	$key,%r12
   1697 	and	\$0xFFF,%r10	# s = $sbox&0xfff
    1698 	and	\$0xFFF,%r11	# e = ($sbox+2304)&0xfff
   1699 	and	\$0xFFF,%r12	# p = %rsp&0xfff
   1700 
    1701 	cmp	%r11,%r12	# if (p>=e) %rsp -= (p-e);
   1702 	jb	.Lcbc_te_break_out
   1703 	sub	%r11,%r12
   1704 	sub	%r12,$key
   1705 	jmp	.Lcbc_te_ok
   1706 .Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
   1707 	sub	%r10,%r12
   1708 	and	\$0xFFF,%r12
   1709 	add	\$320,%r12
   1710 	sub	%r12,$key
   1711 .align	4
   1712 .Lcbc_te_ok:
   1713 
   1714 	xchg	%rsp,$key
   1715 	#add	\$8,%rsp	# reserve for return address!
   1716 	mov	$key,$_rsp	# save %rsp
   1717 .Lcbc_fast_body:
   1718 	mov	%rdi,$_inp	# save copy of inp
   1719 	mov	%rsi,$_out	# save copy of out
   1720 	mov	%rdx,$_len	# save copy of len
   1721 	mov	%rcx,$_key	# save copy of key
   1722 	mov	%r8,$_ivp	# save copy of ivp
   1723 	movl	\$0,$mark	# copy of aes_key->rounds = 0;
   1724 	mov	%r8,%rbp	# rearrange input arguments
   1725 	mov	%r9,%rbx
   1726 	mov	%rsi,$out
   1727 	mov	%rdi,$inp
   1728 	mov	%rcx,$key
   1729 
   1730 	mov	240($key),%eax		# key->rounds
   1731 	# do we copy key schedule to stack?
   1732 	mov	$key,%r10
   1733 	sub	$sbox,%r10
   1734 	and	\$0xfff,%r10
   1735 	cmp	\$2304,%r10
   1736 	jb	.Lcbc_do_ecopy
   1737 	cmp	\$4096-248,%r10
   1738 	jb	.Lcbc_skip_ecopy
   1739 .align	4
   1740 .Lcbc_do_ecopy:
   1741 		mov	$key,%rsi
   1742 		lea	$aes_key,%rdi
   1743 		lea	$aes_key,$key
   1744 		mov	\$240/8,%ecx
   1745 		.long	0x90A548F3	# rep movsq
   1746 		mov	%eax,(%rdi)	# copy aes_key->rounds
   1747 .Lcbc_skip_ecopy:
   1748 	mov	$key,$keyp	# save key pointer
   1749 
   1750 	mov	\$18,%ecx
   1751 .align	4
   1752 .Lcbc_prefetch_te:
   1753 		mov	0($sbox),%r10
   1754 		mov	32($sbox),%r11
   1755 		mov	64($sbox),%r12
   1756 		mov	96($sbox),%r13
   1757 		lea	128($sbox),$sbox
   1758 		sub	\$1,%ecx
   1759 	jnz	.Lcbc_prefetch_te
   1760 	lea	-2304($sbox),$sbox
   1761 
   1762 	cmp	\$0,%rbx
   1763 	je	.LFAST_DECRYPT
   1764 
   1765 #----------------------------- ENCRYPT -----------------------------#
   1766 	mov	0(%rbp),$s0		# load iv
   1767 	mov	4(%rbp),$s1
   1768 	mov	8(%rbp),$s2
   1769 	mov	12(%rbp),$s3
   1770 
   1771 .align	4
   1772 .Lcbc_fast_enc_loop:
   1773 		xor	0($inp),$s0
   1774 		xor	4($inp),$s1
   1775 		xor	8($inp),$s2
   1776 		xor	12($inp),$s3
   1777 		mov	$keyp,$key	# restore key
   1778 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1779 
   1780 		call	_x86_64_AES_encrypt
   1781 
   1782 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1783 		mov	$_len,%r10
   1784 		mov	$s0,0($out)
   1785 		mov	$s1,4($out)
   1786 		mov	$s2,8($out)
   1787 		mov	$s3,12($out)
   1788 
   1789 		lea	16($inp),$inp
   1790 		lea	16($out),$out
   1791 		sub	\$16,%r10
   1792 		test	\$-16,%r10
   1793 		mov	%r10,$_len
   1794 	jnz	.Lcbc_fast_enc_loop
   1795 	mov	$_ivp,%rbp	# restore ivp
   1796 	mov	$s0,0(%rbp)	# save ivec
   1797 	mov	$s1,4(%rbp)
   1798 	mov	$s2,8(%rbp)
   1799 	mov	$s3,12(%rbp)
   1800 
   1801 	jmp	.Lcbc_fast_cleanup
   1802 
   1803 #----------------------------- DECRYPT -----------------------------#
   1804 .align	16
   1805 .LFAST_DECRYPT:
   1806 	cmp	$inp,$out
   1807 	je	.Lcbc_fast_dec_in_place
   1808 
   1809 	mov	%rbp,$ivec
   1810 .align	4
   1811 .Lcbc_fast_dec_loop:
   1812 		mov	0($inp),$s0	# read input
   1813 		mov	4($inp),$s1
   1814 		mov	8($inp),$s2
   1815 		mov	12($inp),$s3
   1816 		mov	$keyp,$key	# restore key
   1817 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1818 
   1819 		call	_x86_64_AES_decrypt
   1820 
   1821 		mov	$ivec,%rbp	# load ivp
   1822 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1823 		mov	$_len,%r10	# load len
   1824 		xor	0(%rbp),$s0	# xor iv
   1825 		xor	4(%rbp),$s1
   1826 		xor	8(%rbp),$s2
   1827 		xor	12(%rbp),$s3
   1828 		mov	$inp,%rbp	# current input, next iv
   1829 
   1830 		sub	\$16,%r10
   1831 		mov	%r10,$_len	# update len
   1832 		mov	%rbp,$ivec	# update ivp
   1833 
   1834 		mov	$s0,0($out)	# write output
   1835 		mov	$s1,4($out)
   1836 		mov	$s2,8($out)
   1837 		mov	$s3,12($out)
   1838 
   1839 		lea	16($inp),$inp
   1840 		lea	16($out),$out
   1841 	jnz	.Lcbc_fast_dec_loop
   1842 	mov	$_ivp,%r12		# load user ivp
   1843 	mov	0(%rbp),%r10		# load iv
   1844 	mov	8(%rbp),%r11
   1845 	mov	%r10,0(%r12)		# copy back to user
   1846 	mov	%r11,8(%r12)
   1847 	jmp	.Lcbc_fast_cleanup
   1848 
   1849 .align	16
   1850 .Lcbc_fast_dec_in_place:
   1851 	mov	0(%rbp),%r10		# copy iv to stack
   1852 	mov	8(%rbp),%r11
   1853 	mov	%r10,0+$ivec
   1854 	mov	%r11,8+$ivec
   1855 .align	4
   1856 .Lcbc_fast_dec_in_place_loop:
   1857 		mov	0($inp),$s0	# load input
   1858 		mov	4($inp),$s1
   1859 		mov	8($inp),$s2
   1860 		mov	12($inp),$s3
   1861 		mov	$keyp,$key	# restore key
   1862 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1863 
   1864 		call	_x86_64_AES_decrypt
   1865 
   1866 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1867 		mov	$_len,%r10
   1868 		xor	0+$ivec,$s0
   1869 		xor	4+$ivec,$s1
   1870 		xor	8+$ivec,$s2
   1871 		xor	12+$ivec,$s3
   1872 
   1873 		mov	0($inp),%r11	# load input
   1874 		mov	8($inp),%r12
   1875 		sub	\$16,%r10
   1876 		jz	.Lcbc_fast_dec_in_place_done
   1877 
   1878 		mov	%r11,0+$ivec	# copy input to iv
   1879 		mov	%r12,8+$ivec
   1880 
   1881 		mov	$s0,0($out)	# save output [zaps input]
   1882 		mov	$s1,4($out)
   1883 		mov	$s2,8($out)
   1884 		mov	$s3,12($out)
   1885 
   1886 		lea	16($inp),$inp
   1887 		lea	16($out),$out
   1888 		mov	%r10,$_len
   1889 	jmp	.Lcbc_fast_dec_in_place_loop
   1890 .Lcbc_fast_dec_in_place_done:
   1891 	mov	$_ivp,%rdi
   1892 	mov	%r11,0(%rdi)	# copy iv back to user
   1893 	mov	%r12,8(%rdi)
   1894 
   1895 	mov	$s0,0($out)	# save output [zaps input]
   1896 	mov	$s1,4($out)
   1897 	mov	$s2,8($out)
   1898 	mov	$s3,12($out)
   1899 
   1900 .align	4
   1901 .Lcbc_fast_cleanup:
   1902 	cmpl	\$0,$mark	# was the key schedule copied?
   1903 	lea	$aes_key,%rdi
   1904 	je	.Lcbc_exit
   1905 		mov	\$240/8,%ecx
   1906 		xor	%rax,%rax
   1907 		.long	0x90AB48F3	# rep stosq
   1908 
   1909 	jmp	.Lcbc_exit
   1910 
   1911 #--------------------------- SLOW ROUTINE ---------------------------#
   1912 .align	16
   1913 .Lcbc_slow_prologue:
   1914 	# allocate aligned stack frame...
   1915 	lea	-88(%rsp),%rbp
   1916 	and	\$-64,%rbp
   1917 	# ... just "above" key schedule
   1918 	lea	-88-63(%rcx),%r10
   1919 	sub	%rbp,%r10
   1920 	neg	%r10
   1921 	and	\$0x3c0,%r10
   1922 	sub	%r10,%rbp
   1923 
   1924 	xchg	%rsp,%rbp
   1925 	#add	\$8,%rsp	# reserve for return address!
   1926 	mov	%rbp,$_rsp	# save %rsp
   1927 .Lcbc_slow_body:
   1928 	#mov	%rdi,$_inp	# save copy of inp
   1929 	#mov	%rsi,$_out	# save copy of out
   1930 	#mov	%rdx,$_len	# save copy of len
   1931 	#mov	%rcx,$_key	# save copy of key
   1932 	mov	%r8,$_ivp	# save copy of ivp
   1933 	mov	%r8,%rbp	# rearrange input arguments
   1934 	mov	%r9,%rbx
   1935 	mov	%rsi,$out
   1936 	mov	%rdi,$inp
   1937 	mov	%rcx,$key
   1938 	mov	%rdx,%r10
   1939 
   1940 	mov	240($key),%eax
   1941 	mov	$key,$keyp	# save key pointer
   1942 	shl	\$4,%eax
   1943 	lea	($key,%rax),%rax
   1944 	mov	%rax,$keyend
   1945 
    1946 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
   1947 	lea	2048($sbox),$sbox
   1948 	lea	768-8(%rsp),%rax
   1949 	sub	$sbox,%rax
   1950 	and	\$0x300,%rax
   1951 	lea	($sbox,%rax),$sbox
   1952 
   1953 	cmp	\$0,%rbx
   1954 	je	.LSLOW_DECRYPT
   1955 
   1956 #--------------------------- SLOW ENCRYPT ---------------------------#
    1957 	test	\$-16,%r10		# check whether length is at least 16
   1958 	mov	0(%rbp),$s0		# load iv
   1959 	mov	4(%rbp),$s1
   1960 	mov	8(%rbp),$s2
   1961 	mov	12(%rbp),$s3
   1962 	jz	.Lcbc_slow_enc_tail	# short input...
   1963 
   1964 .align	4
   1965 .Lcbc_slow_enc_loop:
   1966 		xor	0($inp),$s0
   1967 		xor	4($inp),$s1
   1968 		xor	8($inp),$s2
   1969 		xor	12($inp),$s3
   1970 		mov	$keyp,$key	# restore key
   1971 		mov	$inp,$_inp	# save inp
   1972 		mov	$out,$_out	# save out
   1973 		mov	%r10,$_len	# save len
   1974 
   1975 		call	_x86_64_AES_encrypt_compact
   1976 
   1977 		mov	$_inp,$inp	# restore inp
   1978 		mov	$_out,$out	# restore out
   1979 		mov	$_len,%r10	# restore len
   1980 		mov	$s0,0($out)
   1981 		mov	$s1,4($out)
   1982 		mov	$s2,8($out)
   1983 		mov	$s3,12($out)
   1984 
   1985 		lea	16($inp),$inp
   1986 		lea	16($out),$out
   1987 		sub	\$16,%r10
   1988 		test	\$-16,%r10
   1989 	jnz	.Lcbc_slow_enc_loop
   1990 	test	\$15,%r10
   1991 	jnz	.Lcbc_slow_enc_tail
   1992 	mov	$_ivp,%rbp	# restore ivp
   1993 	mov	$s0,0(%rbp)	# save ivec
   1994 	mov	$s1,4(%rbp)
   1995 	mov	$s2,8(%rbp)
   1996 	mov	$s3,12(%rbp)
   1997 
   1998 	jmp	.Lcbc_exit
   1999 
   2000 .align	4
   2001 .Lcbc_slow_enc_tail:
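	# short final block: copy the remaining (<16) input bytes to the output
	# buffer, zero-pad to a full 16-byte block, then loop once more
	# encrypting that block in place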
   2002 	mov	%rax,%r11
   2003 	mov	%rcx,%r12
   2004 	mov	%r10,%rcx
   2005 	mov	$inp,%rsi
   2006 	mov	$out,%rdi
   2007 	.long	0x9066A4F3		# rep movsb
   2008 	mov	\$16,%rcx		# zero tail
   2009 	sub	%r10,%rcx
   2010 	xor	%rax,%rax
   2011 	.long	0x9066AAF3		# rep stosb
   2012 	mov	$out,$inp		# this is not a mistake!
   2013 	mov	\$16,%r10		# len=16
   2014 	mov	%r11,%rax
   2015 	mov	%r12,%rcx
   2016 	jmp	.Lcbc_slow_enc_loop	# one more spin...
   2017 #--------------------------- SLOW DECRYPT ---------------------------#
   2018 .align	16
   2019 .LSLOW_DECRYPT:
   2020 	shr	\$3,%rax
   2021 	add	%rax,$sbox		# recall "magic" constants!
   2022 
   2023 	mov	0(%rbp),%r11		# copy iv to stack
   2024 	mov	8(%rbp),%r12
   2025 	mov	%r11,0+$ivec
   2026 	mov	%r12,8+$ivec
   2027 
   2028 .align	4
   2029 .Lcbc_slow_dec_loop:
   2030 		mov	0($inp),$s0	# load input
   2031 		mov	4($inp),$s1
   2032 		mov	8($inp),$s2
   2033 		mov	12($inp),$s3
   2034 		mov	$keyp,$key	# restore key
   2035 		mov	$inp,$_inp	# save inp
   2036 		mov	$out,$_out	# save out
   2037 		mov	%r10,$_len	# save len
   2038 
   2039 		call	_x86_64_AES_decrypt_compact
   2040 
   2041 		mov	$_inp,$inp	# restore inp
   2042 		mov	$_out,$out	# restore out
   2043 		mov	$_len,%r10
   2044 		xor	0+$ivec,$s0
   2045 		xor	4+$ivec,$s1
   2046 		xor	8+$ivec,$s2
   2047 		xor	12+$ivec,$s3
   2048 
   2049 		mov	0($inp),%r11	# load input
   2050 		mov	8($inp),%r12
   2051 		sub	\$16,%r10
   2052 		jc	.Lcbc_slow_dec_partial
   2053 		jz	.Lcbc_slow_dec_done
   2054 
   2055 		mov	%r11,0+$ivec	# copy input to iv
   2056 		mov	%r12,8+$ivec
   2057 
   2058 		mov	$s0,0($out)	# save output [can zap input]
   2059 		mov	$s1,4($out)
   2060 		mov	$s2,8($out)
   2061 		mov	$s3,12($out)
   2062 
   2063 		lea	16($inp),$inp
   2064 		lea	16($out),$out
   2065 	jmp	.Lcbc_slow_dec_loop
   2066 .Lcbc_slow_dec_done:
   2067 	mov	$_ivp,%rdi
   2068 	mov	%r11,0(%rdi)		# copy iv back to user
   2069 	mov	%r12,8(%rdi)
   2070 
   2071 	mov	$s0,0($out)		# save output [can zap input]
   2072 	mov	$s1,4($out)
   2073 	mov	$s2,8($out)
   2074 	mov	$s3,12($out)
   2075 
   2076 	jmp	.Lcbc_exit
   2077 
   2078 .align	4
   2079 .Lcbc_slow_dec_partial:
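	# remaining length was not a multiple of 16: stage the decrypted block
	# on the stack and copy only the requested number of bytes (16+%r10,
	# with %r10 negative here) to the caller's buffer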
   2080 	mov	$_ivp,%rdi
   2081 	mov	%r11,0(%rdi)		# copy iv back to user
   2082 	mov	%r12,8(%rdi)
   2083 
   2084 	mov	$s0,0+$ivec		# save output to stack
   2085 	mov	$s1,4+$ivec
   2086 	mov	$s2,8+$ivec
   2087 	mov	$s3,12+$ivec
   2088 
   2089 	mov	$out,%rdi
   2090 	lea	$ivec,%rsi
   2091 	lea	16(%r10),%rcx
   2092 	.long	0x9066A4F3	# rep movsb
   2093 	jmp	.Lcbc_exit
   2094 
   2095 .align	16
   2096 .Lcbc_exit:
   2097 	mov	$_rsp,%rsi
   2098 	mov	(%rsi),%r15
   2099 	mov	8(%rsi),%r14
   2100 	mov	16(%rsi),%r13
   2101 	mov	24(%rsi),%r12
   2102 	mov	32(%rsi),%rbp
   2103 	mov	40(%rsi),%rbx
   2104 	lea	48(%rsi),%rsp
   2105 .Lcbc_popfq:
   2106 	popfq
   2107 .Lcbc_epilogue:
   2108 	ret
   2109 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
   2110 ___
   2111 }
   2112 
   2113 $code.=<<___;
   2114 .align	64
   2115 .LAES_Te:
   2116 ___
   2117 	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
   2118 	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
   2119 	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
   2120 	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
   2121 	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
   2122 	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
   2123 	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
   2124 	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
   2125 	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
   2126 	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
   2127 	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
   2128 	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
   2129 	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
   2130 	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
   2131 	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
   2132 	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
   2133 	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
   2134 	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
   2135 	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
   2136 	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
   2137 	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
   2138 	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
   2139 	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
   2140 	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
   2141 	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
   2142 	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
   2143 	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
   2144 	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
   2145 	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
   2146 	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
   2147 	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
   2148 	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
   2149 	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
   2150 	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
   2151 	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
   2152 	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
   2153 	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
   2154 	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
   2155 	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
   2156 	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
   2157 	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
   2158 	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
   2159 	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
   2160 	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
   2161 	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
   2162 	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
   2163 	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
   2164 	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
   2165 	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
   2166 	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
   2167 	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
   2168 	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
   2169 	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
   2170 	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
   2171 	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
   2172 	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
   2173 	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
   2174 	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
   2175 	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
   2176 	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
   2177 	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
   2178 	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
   2179 	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
   2180 	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
   2181 
   2182 #Te4	# four copies of Te4 to choose from to avoid L1 aliasing
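# (each copy below is 256 bytes -- 32 rows of 8 -- so the four copies span
#  offsets 0x000-0x3ff from the end of the main table)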
   2183 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2184 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2185 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2186 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2187 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2188 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2189 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2190 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2191 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2192 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2193 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2194 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2195 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2196 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2197 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2198 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2199 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2200 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2201 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2202 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2203 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2204 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2205 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2206 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2207 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2208 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2209 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2210 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2211 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2212 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2213 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2214 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2215 
   2216 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2217 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2218 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2219 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2220 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2221 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2222 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2223 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2224 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2225 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2226 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2227 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2228 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2229 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2230 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2231 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2232 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2233 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2234 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2235 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2236 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2237 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2238 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2239 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2240 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2241 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2242 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2243 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2244 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2245 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2246 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2247 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2248 
   2249 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2250 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2251 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2252 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2253 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2254 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2255 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2256 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2257 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2258 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2259 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2260 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2261 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2262 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2263 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2264 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2265 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2266 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2267 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2268 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2269 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2270 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2271 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2272 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2273 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2274 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2275 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2276 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2277 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2278 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2279 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2280 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2281 
   2282 	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
   2283 	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
   2284 	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
   2285 	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
   2286 	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
   2287 	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
   2288 	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
   2289 	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
   2290 	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
   2291 	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
   2292 	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
   2293 	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
   2294 	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
   2295 	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
   2296 	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
   2297 	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
   2298 	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
   2299 	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
   2300 	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
   2301 	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
   2302 	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
   2303 	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
   2304 	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
   2305 	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
   2306 	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
   2307 	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
   2308 	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
   2309 	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
   2310 	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
   2311 	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
   2312 	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
   2313 	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
   2314 #rcon:
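# (the ten AES key-schedule round constants, followed by byte-replicated
#  0x80/0xfe/0x1b masks used by the compact GF(2^8) doubling code)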
   2315 $code.=<<___;
   2316 	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
   2317 	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
   2318 	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
   2319 	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
   2320 ___
   2321 $code.=<<___;
   2322 .align	64
   2323 .LAES_Td:
   2324 ___
   2325 	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
   2326 	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
   2327 	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
   2328 	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
   2329 	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
   2330 	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
   2331 	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
   2332 	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
   2333 	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
   2334 	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
   2335 	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
   2336 	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
   2337 	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
   2338 	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
   2339 	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
   2340 	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
   2341 	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
   2342 	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
   2343 	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
   2344 	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
   2345 	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
   2346 	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
   2347 	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
   2348 	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
   2349 	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
   2350 	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
   2351 	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
   2352 	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
   2353 	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
   2354 	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
   2355 	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
   2356 	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
   2357 	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
   2358 	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
   2359 	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
   2360 	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
   2361 	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
   2362 	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
   2363 	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
   2364 	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
   2365 	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
   2366 	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
   2367 	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
   2368 	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
   2369 	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
   2370 	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
   2371 	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
   2372 	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
   2373 	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
   2374 	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
   2375 	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
   2376 	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
   2377 	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
   2378 	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
   2379 	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
   2380 	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
   2381 	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
   2382 	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
   2383 	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
   2384 	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
   2385 	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
   2386 	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
   2387 	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
   2388 	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
   2389 
   2390 #Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
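# (each 256-byte copy is followed by 32 bytes of mask constants, so
#  consecutive copies sit 0x120 bytes apart -- the "magic" spacing the
#  slow-decrypt path adjusts for)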
   2391 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2392 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2393 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2394 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2395 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2396 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2397 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2398 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2399 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2400 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2401 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2402 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2403 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2404 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2405 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2406 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2407 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2408 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2409 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2410 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2411 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2412 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2413 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2414 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2415 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2416 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2417 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2418 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2419 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2420 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2421 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2422 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2423 $code.=<<___;
   2424 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2425 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2426 ___
   2427 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2428 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2429 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2430 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2431 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2432 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2433 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2434 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2435 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2436 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2437 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2438 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2439 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2440 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2441 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2442 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2443 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2444 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2445 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2446 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2447 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2448 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2449 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2450 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2451 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2452 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2453 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2454 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2455 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2456 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2457 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2458 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2459 $code.=<<___;
   2460 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2461 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2462 ___
   2463 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2464 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2465 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2466 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2467 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2468 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2469 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2470 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2471 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2472 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2473 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2474 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2475 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2476 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2477 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2478 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2479 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2480 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2481 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2482 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2483 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2484 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2485 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2486 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2487 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2488 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2489 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2490 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2491 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2492 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2493 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2494 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2495 $code.=<<___;
   2496 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2497 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2498 ___
   2499 	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
   2500 	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
   2501 	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
   2502 	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
   2503 	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
   2504 	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
   2505 	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
   2506 	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
   2507 	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
   2508 	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
   2509 	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
   2510 	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
   2511 	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
   2512 	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
   2513 	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
   2514 	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
   2515 	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
   2516 	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
   2517 	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
   2518 	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
   2519 	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
   2520 	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
   2521 	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
   2522 	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
   2523 	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
   2524 	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
   2525 	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
   2526 	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
   2527 	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
   2528 	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
   2529 	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
   2530 	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
   2531 $code.=<<___;
   2532 	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
   2533 	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
   2534 .asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   2535 .align	64
   2536 ___
   2537 
   2538 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2539 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
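# Three handlers are registered below: block_se_handler (the AES_encrypt/
# AES_decrypt block routines), key_se_handler (key setup) and cbc_se_handler
# (AES_cbc_encrypt); each recovers the non-volatile registers from its frame
# layout and funnels into .Lcommon_seh_exit, which calls RtlVirtualUnwind.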
   2540 if ($win64) {
   2541 $rec="%rcx";
   2542 $frame="%rdx";
   2543 $context="%r8";
   2544 $disp="%r9";
   2545 
   2546 $code.=<<___;
   2547 .extern	__imp_RtlVirtualUnwind
   2548 .type	block_se_handler,\@abi-omnipotent
   2549 .align	16
   2550 block_se_handler:
   2551 	push	%rsi
   2552 	push	%rdi
   2553 	push	%rbx
   2554 	push	%rbp
   2555 	push	%r12
   2556 	push	%r13
   2557 	push	%r14
   2558 	push	%r15
   2559 	pushfq
   2560 	sub	\$64,%rsp
   2561 
   2562 	mov	120($context),%rax	# pull context->Rax
   2563 	mov	248($context),%rbx	# pull context->Rip
   2564 
   2565 	mov	8($disp),%rsi		# disp->ImageBase
   2566 	mov	56($disp),%r11		# disp->HandlerData
   2567 
   2568 	mov	0(%r11),%r10d		# HandlerData[0]
   2569 	lea	(%rsi,%r10),%r10	# prologue label
   2570 	cmp	%r10,%rbx		# context->Rip<prologue label
   2571 	jb	.Lin_block_prologue
   2572 
   2573 	mov	152($context),%rax	# pull context->Rsp
   2574 
   2575 	mov	4(%r11),%r10d		# HandlerData[1]
   2576 	lea	(%rsi,%r10),%r10	# epilogue label
   2577 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2578 	jae	.Lin_block_prologue
   2579 
   2580 	mov	24(%rax),%rax		# pull saved real stack pointer
   2581 	lea	48(%rax),%rax		# adjust...
   2582 
   2583 	mov	-8(%rax),%rbx
   2584 	mov	-16(%rax),%rbp
   2585 	mov	-24(%rax),%r12
   2586 	mov	-32(%rax),%r13
   2587 	mov	-40(%rax),%r14
   2588 	mov	-48(%rax),%r15
   2589 	mov	%rbx,144($context)	# restore context->Rbx
   2590 	mov	%rbp,160($context)	# restore context->Rbp
   2591 	mov	%r12,216($context)	# restore context->R12
   2592 	mov	%r13,224($context)	# restore context->R13
   2593 	mov	%r14,232($context)	# restore context->R14
   2594 	mov	%r15,240($context)	# restore context->R15
   2595 
   2596 .Lin_block_prologue:
   2597 	mov	8(%rax),%rdi
   2598 	mov	16(%rax),%rsi
   2599 	mov	%rax,152($context)	# restore context->Rsp
   2600 	mov	%rsi,168($context)	# restore context->Rsi
   2601 	mov	%rdi,176($context)	# restore context->Rdi
   2602 
   2603 	jmp	.Lcommon_seh_exit
   2604 .size	block_se_handler,.-block_se_handler
   2605 
   2606 .type	key_se_handler,\@abi-omnipotent
   2607 .align	16
   2608 key_se_handler:
   2609 	push	%rsi
   2610 	push	%rdi
   2611 	push	%rbx
   2612 	push	%rbp
   2613 	push	%r12
   2614 	push	%r13
   2615 	push	%r14
   2616 	push	%r15
   2617 	pushfq
   2618 	sub	\$64,%rsp
   2619 
   2620 	mov	120($context),%rax	# pull context->Rax
   2621 	mov	248($context),%rbx	# pull context->Rip
   2622 
   2623 	mov	8($disp),%rsi		# disp->ImageBase
   2624 	mov	56($disp),%r11		# disp->HandlerData
   2625 
   2626 	mov	0(%r11),%r10d		# HandlerData[0]
   2627 	lea	(%rsi,%r10),%r10	# prologue label
   2628 	cmp	%r10,%rbx		# context->Rip<prologue label
   2629 	jb	.Lin_key_prologue
   2630 
   2631 	mov	152($context),%rax	# pull context->Rsp
   2632 
   2633 	mov	4(%r11),%r10d		# HandlerData[1]
   2634 	lea	(%rsi,%r10),%r10	# epilogue label
   2635 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2636 	jae	.Lin_key_prologue
   2637 
   2638 	lea	56(%rax),%rax
   2639 
   2640 	mov	-8(%rax),%rbx
   2641 	mov	-16(%rax),%rbp
   2642 	mov	-24(%rax),%r12
   2643 	mov	-32(%rax),%r13
   2644 	mov	-40(%rax),%r14
   2645 	mov	-48(%rax),%r15
   2646 	mov	%rbx,144($context)	# restore context->Rbx
   2647 	mov	%rbp,160($context)	# restore context->Rbp
   2648 	mov	%r12,216($context)	# restore context->R12
   2649 	mov	%r13,224($context)	# restore context->R13
   2650 	mov	%r14,232($context)	# restore context->R14
   2651 	mov	%r15,240($context)	# restore context->R15
   2652 
   2653 .Lin_key_prologue:
   2654 	mov	8(%rax),%rdi
   2655 	mov	16(%rax),%rsi
   2656 	mov	%rax,152($context)	# restore context->Rsp
   2657 	mov	%rsi,168($context)	# restore context->Rsi
   2658 	mov	%rdi,176($context)	# restore context->Rdi
   2659 
   2660 	jmp	.Lcommon_seh_exit
   2661 .size	key_se_handler,.-key_se_handler
   2662 
   2663 .type	cbc_se_handler,\@abi-omnipotent
   2664 .align	16
   2665 cbc_se_handler:
   2666 	push	%rsi
   2667 	push	%rdi
   2668 	push	%rbx
   2669 	push	%rbp
   2670 	push	%r12
   2671 	push	%r13
   2672 	push	%r14
   2673 	push	%r15
   2674 	pushfq
   2675 	sub	\$64,%rsp
   2676 
   2677 	mov	120($context),%rax	# pull context->Rax
   2678 	mov	248($context),%rbx	# pull context->Rip
   2679 
   2680 	lea	.Lcbc_prologue(%rip),%r10
   2681 	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
   2682 	jb	.Lin_cbc_prologue
   2683 
   2684 	lea	.Lcbc_fast_body(%rip),%r10
   2685 	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
   2686 	jb	.Lin_cbc_frame_setup
   2687 
   2688 	lea	.Lcbc_slow_prologue(%rip),%r10
   2689 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
   2690 	jb	.Lin_cbc_body
   2691 
   2692 	lea	.Lcbc_slow_body(%rip),%r10
   2693 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
   2694 	jb	.Lin_cbc_frame_setup
   2695 
   2696 .Lin_cbc_body:
   2697 	mov	152($context),%rax	# pull context->Rsp
   2698 
   2699 	lea	.Lcbc_epilogue(%rip),%r10
   2700 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
   2701 	jae	.Lin_cbc_prologue
   2702 
   2703 	lea	8(%rax),%rax
   2704 
   2705 	lea	.Lcbc_popfq(%rip),%r10
   2706 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
   2707 	jae	.Lin_cbc_prologue
   2708 
   2709 	mov	`16-8`(%rax),%rax	# biased $_rsp
   2710 	lea	56(%rax),%rax
   2711 
   2712 .Lin_cbc_frame_setup:
   2713 	mov	-16(%rax),%rbx
   2714 	mov	-24(%rax),%rbp
   2715 	mov	-32(%rax),%r12
   2716 	mov	-40(%rax),%r13
   2717 	mov	-48(%rax),%r14
   2718 	mov	-56(%rax),%r15
   2719 	mov	%rbx,144($context)	# restore context->Rbx
   2720 	mov	%rbp,160($context)	# restore context->Rbp
   2721 	mov	%r12,216($context)	# restore context->R12
   2722 	mov	%r13,224($context)	# restore context->R13
   2723 	mov	%r14,232($context)	# restore context->R14
   2724 	mov	%r15,240($context)	# restore context->R15
   2725 
   2726 .Lin_cbc_prologue:
   2727 	mov	8(%rax),%rdi
   2728 	mov	16(%rax),%rsi
   2729 	mov	%rax,152($context)	# restore context->Rsp
   2730 	mov	%rsi,168($context)	# restore context->Rsi
   2731 	mov	%rdi,176($context)	# restore context->Rdi
   2732 
   2733 .Lcommon_seh_exit:
   2734 
   2735 	mov	40($disp),%rdi		# disp->ContextRecord
   2736 	mov	$context,%rsi		# context
   2737 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
   2738 	.long	0xa548f3fc		# cld; rep movsq
   2739 
   2740 	mov	$disp,%rsi
   2741 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2742 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2743 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2744 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2745 	mov	40(%rsi),%r10		# disp->ContextRecord
   2746 	lea	56(%rsi),%r11		# &disp->HandlerData
   2747 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2748 	mov	%r10,32(%rsp)		# arg5
   2749 	mov	%r11,40(%rsp)		# arg6
   2750 	mov	%r12,48(%rsp)		# arg7
   2751 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2752 	call	*__imp_RtlVirtualUnwind(%rip)
   2753 
   2754 	mov	\$1,%eax		# ExceptionContinueSearch
   2755 	add	\$64,%rsp
   2756 	popfq
   2757 	pop	%r15
   2758 	pop	%r14
   2759 	pop	%r13
   2760 	pop	%r12
   2761 	pop	%rbp
   2762 	pop	%rbx
   2763 	pop	%rdi
   2764 	pop	%rsi
   2765 	ret
   2766 .size	cbc_se_handler,.-cbc_se_handler
   2767 
   2768 .section	.pdata
   2769 .align	4
   2770 	.rva	.LSEH_begin_AES_encrypt
   2771 	.rva	.LSEH_end_AES_encrypt
   2772 	.rva	.LSEH_info_AES_encrypt
   2773 
   2774 	.rva	.LSEH_begin_AES_decrypt
   2775 	.rva	.LSEH_end_AES_decrypt
   2776 	.rva	.LSEH_info_AES_decrypt
   2777 
   2778 	.rva	.LSEH_begin_private_AES_set_encrypt_key
   2779 	.rva	.LSEH_end_private_AES_set_encrypt_key
   2780 	.rva	.LSEH_info_private_AES_set_encrypt_key
   2781 
   2782 	.rva	.LSEH_begin_private_AES_set_decrypt_key
   2783 	.rva	.LSEH_end_private_AES_set_decrypt_key
   2784 	.rva	.LSEH_info_private_AES_set_decrypt_key
   2785 
   2786 	.rva	.LSEH_begin_AES_cbc_encrypt
   2787 	.rva	.LSEH_end_AES_cbc_encrypt
   2788 	.rva	.LSEH_info_AES_cbc_encrypt
   2789 
   2790 .section	.xdata
   2791 .align	8
   2792 .LSEH_info_AES_encrypt:
   2793 	.byte	9,0,0,0
   2794 	.rva	block_se_handler
   2795 	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
   2796 .LSEH_info_AES_decrypt:
   2797 	.byte	9,0,0,0
   2798 	.rva	block_se_handler
   2799 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
   2800 .LSEH_info_private_AES_set_encrypt_key:
   2801 	.byte	9,0,0,0
   2802 	.rva	key_se_handler
   2803 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
   2804 .LSEH_info_private_AES_set_decrypt_key:
   2805 	.byte	9,0,0,0
   2806 	.rva	key_se_handler
   2807 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
   2808 .LSEH_info_AES_cbc_encrypt:
   2809 	.byte	9,0,0,0
   2810 	.rva	cbc_se_handler
   2811 ___
   2812 }
   2813 
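# resolve all `...` constructs (constant arithmetic and helper calls such as
# &lo()/&hi()) embedded in the generated code before it is emitted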
   2814 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
   2815 
   2816 print $code;
   2817 
   2818 close STDOUT;
   2819