Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # Version 2.1.
     11 #
     12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
     13 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
     14 # [you'll notice a lot of resemblance], such as compressed S-boxes
     15 # in little-endian byte order, prefetch of these tables in CBC mode,
     16 # as well as avoiding L1 cache aliasing between stack frame and key
     17 # schedule and already mentioned tables, compressed Td4...
     18 #
     19 # Performance in number of cycles per processed byte for 128-bit key:
     20 #
     21 #		ECB encrypt	ECB decrypt	CBC large chunk
     22 # AMD64		33		41		13.0
     23 # EM64T		38		59		18.6(*)
     24 # Core 2	30		43		14.5(*)
     25 #
     26 # (*) with hyper-threading off
     27 
     # Command line: [flavour] output. A single argument containing a dot is
     # taken to be the output file name and the flavour is left undefined.
     28 $flavour = shift;
     29 $output  = shift;
     30 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     31 
     # Win64 is selected by a nasm/masm/mingw64 flavour or a .asm output name.
     32 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     33 
     # Look for the x86_64-xlate.pl translator next to this script, then in
     # ../../perlasm relative to it.
     34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     37 die "can't locate x86_64-xlate.pl";
     38 
     # Everything printed to STDOUT from here on is piped through the
     # translator. Abort if the pipe cannot be started instead of silently
     # discarding the generated code.
     39 open STDOUT,"| $^X $xlate $flavour $output"
        	or die "can't call $xlate: $!";
     40 
     41 $verticalspin=1;	# unlike 32-bit version $verticalspin performs
     42 			# ~15% better on both AMD and Intel cores
     43 $speed_limit=512;	# see aes-586.pl for details
     44 
     # $code accumulates the whole assembly text emitted by this script.
     45 $code=".text\n";
     46 
     # Symbolic register names interpolated into the generated assembly.
     # $s0..$s3 hold the four 32-bit state words, $acc0..$acc2 and $t0..$t2
     # are scratch. $mask80/$maskfe/$mask1b are the 64-bit names of the
     # $acc0..$acc2 registers and are used by dectransform().
     47 $s0="%eax";
     48 $s1="%ebx";
     49 $s2="%ecx";
     50 $s3="%edx";
     51 $acc0="%esi";	$mask80="%rsi";
     52 $acc1="%edi";	$maskfe="%rdi";
     53 $acc2="%ebp";	$mask1b="%rbp";
     54 $inp="%r8";
     55 $out="%r9";
     56 $t0="%r10d";
     57 $t1="%r11d";
     58 $t2="%r12d";
     59 $rnds="%r13d";
     60 $sbox="%r14";
     61 $key="%r15";
     62 
     # hi(REG): map %eax/%rax .. %edx/%rdx to the high-byte register name
     # (%ah .. %dh). Any other register name is returned unchanged.
     # The replacement uses ${1} rather than the deprecated sed-style \1.
     63 sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }
     # lo(REG): map a register name to its low-byte name:
     #   %eax/%rax .. %edx/%rdx -> %al .. %dl
     #   %esi/%rsi, %edi/%rdi   -> %sil, %dil
     #   %rN or %rNd            -> %rNb
     # The replacements use ${1} rather than the deprecated sed-style \1.
     64 sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}l/;
     65 			$r =~ s/%[er]([sd]i)/%${1}l/;
     66 			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }
     # LO(REG): map a 64-bit register name to its 32-bit name:
     # %rax -> %eax (letter-named registers), %r8 -> %r8d (numbered registers).
     # The replacements use ${1} rather than the deprecated sed-style \1.
     67 sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e${1}/;
     68 			$r =~ s/%r([0-9]+)/%r${1}d/;	$r; }
     # _data_word(LIST): for each value append a ".long" line to $code that
     # contains the value twice, as 8-digit hex. Stops at the first undefined
     # element.
     69 sub _data_word()
     70 { foreach my $word (@_) {
     71 	last unless defined($word);
        	$code.=sprintf(".long\t0x%08x,0x%08x\n",$word,$word);
            }
     72 }
     # data_word(LIST): append one ".long" line to $code listing the values as
     # comma-separated 8-digit hex words. The last argument always terminates
     # the line; the leading ones are emitted up to the first undefined value.
     73 sub data_word()
     74 { my @words=@_;
     75   my $tail=pop(@words);
     76   my $line=".long\t";
     77     foreach my $word (@words) { last unless defined($word); $line.=sprintf("0x%08x,",$word); }
     78     $code.=$line.sprintf("0x%08x\n",$tail);
     79 }
     80 
     # data_byte(LIST): append one ".byte" line to $code listing the values,
     # each masked to 8 bits, as comma-separated 2-digit hex. The last argument
     # always terminates the line; the leading ones are emitted up to the first
     # undefined value.
     81 sub data_byte()
     82 { my @bytes=@_;
     83   my $tail=pop(@bytes);
     84   my $line=".byte\t";
     85     foreach my $byte (@bytes) { last unless defined($byte); $line.=sprintf("0x%02x,",$byte&0xff); }
     86     $code.=$line.sprintf("0x%02x\n",$tail&0xff);
     87 }
     88 
     # encvert: append to $code the body of one table-driven encryption round
     # that processes all four state words together. Each byte of $s0..$s3
     # indexes the table at $sbox with an 8-byte stride, read at byte offsets
     # 0, 3, 2 and 1; the lookups are XORed into $t0, $t1, $t2 and %r8d. $key
     # is advanced by 16 and the new state is the four key words at $key XORed
     # with those accumulators. %r8d is the fourth temporary, so $inp is lost.
     89 sub encvert()
     90 { my $t3="%r8d";	# zaps $inp!
     91 
     92 $code.=<<___;
     93 	# favor 3-way issue Opteron pipeline...
     94 	movzb	`&lo("$s0")`,$acc0
     95 	movzb	`&lo("$s1")`,$acc1
     96 	movzb	`&lo("$s2")`,$acc2
     97 	mov	0($sbox,$acc0,8),$t0
     98 	mov	0($sbox,$acc1,8),$t1
     99 	mov	0($sbox,$acc2,8),$t2
    100 
    101 	movzb	`&hi("$s1")`,$acc0
    102 	movzb	`&hi("$s2")`,$acc1
    103 	movzb	`&lo("$s3")`,$acc2
    104 	xor	3($sbox,$acc0,8),$t0
    105 	xor	3($sbox,$acc1,8),$t1
    106 	mov	0($sbox,$acc2,8),$t3
    107 
    108 	movzb	`&hi("$s3")`,$acc0
    109 	shr	\$16,$s2
    110 	movzb	`&hi("$s0")`,$acc2
    111 	xor	3($sbox,$acc0,8),$t2
    112 	shr	\$16,$s3
    113 	xor	3($sbox,$acc2,8),$t3
    114 
    115 	shr	\$16,$s1
    116 	lea	16($key),$key
    117 	shr	\$16,$s0
    118 
    119 	movzb	`&lo("$s2")`,$acc0
    120 	movzb	`&lo("$s3")`,$acc1
    121 	movzb	`&lo("$s0")`,$acc2
    122 	xor	2($sbox,$acc0,8),$t0
    123 	xor	2($sbox,$acc1,8),$t1
    124 	xor	2($sbox,$acc2,8),$t2
    125 
    126 	movzb	`&hi("$s3")`,$acc0
    127 	movzb	`&hi("$s0")`,$acc1
    128 	movzb	`&lo("$s1")`,$acc2
    129 	xor	1($sbox,$acc0,8),$t0
    130 	xor	1($sbox,$acc1,8),$t1
    131 	xor	2($sbox,$acc2,8),$t3
    132 
    133 	mov	12($key),$s3
    134 	movzb	`&hi("$s1")`,$acc1
    135 	movzb	`&hi("$s2")`,$acc2
    136 	mov	0($key),$s0
    137 	xor	1($sbox,$acc1,8),$t2
    138 	xor	1($sbox,$acc2,8),$t3
    139 
    140 	mov	4($key),$s1
    141 	mov	8($key),$s2
    142 	xor	$t0,$s0
    143 	xor	$t1,$s1
    144 	xor	$t2,$s2
    145 	xor	$t3,$s3
    146 ___
    147 }
    148 
    # enclastvert: append to $code the last encryption round, all four state
    # words together. Single bytes are taken from the same 8-byte-stride table
    # as in encvert(): either with movzb at offset 2, or with a 32-bit load at
    # offset 0 or 2 masked down to one byte lane. The lanes are merged in
    # $t0, $t1, $t2 and %r8d and XORed with the key words at 16($key)..28($key);
    # $key itself is not advanced. %r8d is the fourth temporary, so $inp is lost.
    149 sub enclastvert()
    150 { my $t3="%r8d";	# zaps $inp!
    151 
    152 $code.=<<___;
    153 	movzb	`&lo("$s0")`,$acc0
    154 	movzb	`&lo("$s1")`,$acc1
    155 	movzb	`&lo("$s2")`,$acc2
    156 	movzb	2($sbox,$acc0,8),$t0
    157 	movzb	2($sbox,$acc1,8),$t1
    158 	movzb	2($sbox,$acc2,8),$t2
    159 
    160 	movzb	`&lo("$s3")`,$acc0
    161 	movzb	`&hi("$s1")`,$acc1
    162 	movzb	`&hi("$s2")`,$acc2
    163 	movzb	2($sbox,$acc0,8),$t3
    164 	mov	0($sbox,$acc1,8),$acc1	#$t0
    165 	mov	0($sbox,$acc2,8),$acc2	#$t1
    166 
    167 	and	\$0x0000ff00,$acc1
    168 	and	\$0x0000ff00,$acc2
    169 
    170 	xor	$acc1,$t0
    171 	xor	$acc2,$t1
    172 	shr	\$16,$s2
    173 
    174 	movzb	`&hi("$s3")`,$acc0
    175 	movzb	`&hi("$s0")`,$acc1
    176 	shr	\$16,$s3
    177 	mov	0($sbox,$acc0,8),$acc0	#$t2
    178 	mov	0($sbox,$acc1,8),$acc1	#$t3
    179 
    180 	and	\$0x0000ff00,$acc0
    181 	and	\$0x0000ff00,$acc1
    182 	shr	\$16,$s1
    183 	xor	$acc0,$t2
    184 	xor	$acc1,$t3
    185 	shr	\$16,$s0
    186 
    187 	movzb	`&lo("$s2")`,$acc0
    188 	movzb	`&lo("$s3")`,$acc1
    189 	movzb	`&lo("$s0")`,$acc2
    190 	mov	0($sbox,$acc0,8),$acc0	#$t0
    191 	mov	0($sbox,$acc1,8),$acc1	#$t1
    192 	mov	0($sbox,$acc2,8),$acc2	#$t2
    193 
    194 	and	\$0x00ff0000,$acc0
    195 	and	\$0x00ff0000,$acc1
    196 	and	\$0x00ff0000,$acc2
    197 
    198 	xor	$acc0,$t0
    199 	xor	$acc1,$t1
    200 	xor	$acc2,$t2
    201 
    202 	movzb	`&lo("$s1")`,$acc0
    203 	movzb	`&hi("$s3")`,$acc1
    204 	movzb	`&hi("$s0")`,$acc2
    205 	mov	0($sbox,$acc0,8),$acc0	#$t3
    206 	mov	2($sbox,$acc1,8),$acc1	#$t0
    207 	mov	2($sbox,$acc2,8),$acc2	#$t1
    208 
    209 	and	\$0x00ff0000,$acc0
    210 	and	\$0xff000000,$acc1
    211 	and	\$0xff000000,$acc2
    212 
    213 	xor	$acc0,$t3
    214 	xor	$acc1,$t0
    215 	xor	$acc2,$t1
    216 
    217 	movzb	`&hi("$s1")`,$acc0
    218 	movzb	`&hi("$s2")`,$acc1
    219 	mov	16+12($key),$s3
    220 	mov	2($sbox,$acc0,8),$acc0	#$t2
    221 	mov	2($sbox,$acc1,8),$acc1	#$t3
    222 	mov	16+0($key),$s0
    223 
    224 	and	\$0xff000000,$acc0
    225 	and	\$0xff000000,$acc1
    226 
    227 	xor	$acc0,$t2
    228 	xor	$acc1,$t3
    229 
    230 	mov	16+4($key),$s1
    231 	mov	16+8($key),$s2
    232 	xor	$t0,$s0
    233 	xor	$t1,$s1
    234 	xor	$t2,$s2
    235 	xor	$t3,$s3
    236 ___
    237 }
    238 
    # encstep($i,@s): append to $code the instructions that compute output
    # word $i (0..3) of one encryption round. @s is the list of the four state
    # registers, rotated by the caller so that $s[0] supplies the low byte.
    # For $i 0..2 the result is built in $t0..$t2 with $acc0..$acc2 as scratch.
    # For $i==3 the result is built in $s[0] itself, $s[1..3] are used as
    # scratch and are finally reloaded from $t0..$t2.
    239 sub encstep()
    240 { my ($i,@s) = @_;
    241   my $tmp0=$acc0;
    242   my $tmp1=$acc1;
    243   my $tmp2=$acc2;
    244   my $out=($t0,$t1,$t2,$s[0])[$i];
    245 
        	# last word: the remaining state registers are free to be clobbered
    246 	if ($i==3) {
    247 		$tmp0=$s[1];
    248 		$tmp1=$s[2];
    249 		$tmp2=$s[3];
    250 	}
    251 	$code.="	movzb	".&lo($s[0]).",$out\n";
    252 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
        	# the key pointer is advanced once per round, by the first step
    253 	$code.="	lea	16($key),$key\n"	if ($i==0);
    254 
    255 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    256 	$code.="	mov	0($sbox,$out,8),$out\n";
    257 
    258 	$code.="	shr	\$16,$tmp1\n";
    259 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    260 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    261 
    262 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    263 	$code.="	shr	\$24,$tmp2\n";
        	# fold in word $i of the round key
    264 	$code.="	xor	4*$i($key),$out\n";
    265 
    266 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    267 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    268 
        	# after the last word, move the three buffered results into place
    269 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    270 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    271 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    272 	$code.="\n";
    273 }
    274 
    # enclast($i,@s): append to $code the instructions that compute output
    # word $i (0..3) of the last encryption round. Register roles are the same
    # as in encstep(): $t0..$t2 receive words 0..2, word 3 is built in $s[0]
    # with $s[1..3] as scratch. Each table value is masked to a single byte
    # lane before being merged. No key material is XORed in here.
    275 sub enclast()
    276 { my ($i,@s)=@_;
    277   my $tmp0=$acc0;
    278   my $tmp1=$acc1;
    279   my $tmp2=$acc2;
    280   my $out=($t0,$t1,$t2,$s[0])[$i];
    281 
        	# last word: the remaining state registers are free to be clobbered
    282 	if ($i==3) {
    283 		$tmp0=$s[1];
    284 		$tmp1=$s[2];
    285 		$tmp2=$s[3];
    286 	}
    287 	$code.="	movzb	".&lo($s[0]).",$out\n";
    288 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    289 
    290 	$code.="	mov	2($sbox,$out,8),$out\n";
    291 	$code.="	shr	\$16,$tmp1\n";
    292 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    293 
    294 	$code.="	and	\$0x000000ff,$out\n";
    295 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    296 	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
    297 	$code.="	shr	\$24,$tmp2\n";
    298 
    299 	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
    300 	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
    301 	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
    302 
        	# keep one byte lane of each looked-up value
    303 	$code.="	and	\$0x0000ff00,$tmp0\n";
    304 	$code.="	and	\$0x00ff0000,$tmp1\n";
    305 	$code.="	and	\$0xff000000,$tmp2\n";
    306 
        	# merge the lanes; for the last word the reloads of $s[1..3] from
        	# $t0..$t2 are interleaved with the merging
    307 	$code.="	xor	$tmp0,$out\n";
    308 	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
    309 	$code.="	xor	$tmp1,$out\n";
    310 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    311 	$code.="	xor	$tmp2,$out\n";
    312 	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
    313 	$code.="\n";
    314 }
    315 
    # Emit _x86_64_AES_encrypt. The generated routine XORs the state in
    # $s0..$s3 with the first four key words, loads the round count from
    # 240($key), and runs the .Lenc_loop body (count-1) times before the last
    # round. It expects $key and $sbox to be set up by its caller.
    316 $code.=<<___;
    317 .type	_x86_64_AES_encrypt,\@abi-omnipotent
    318 .align	16
    319 _x86_64_AES_encrypt:
    320 	xor	0($key),$s0			# xor with key
    321 	xor	4($key),$s1
    322 	xor	8($key),$s2
    323 	xor	12($key),$s3
    324 
    325 	mov	240($key),$rnds			# load key->rounds
    326 	sub	\$1,$rnds
    327 	jmp	.Lenc_loop
    328 .align	16
    329 .Lenc_loop:
    330 ___
        	# loop body: either the all-words-at-once variant or four per-word steps
    331 	if ($verticalspin) { &encvert(); }
    332 	else {	&encstep(0,$s0,$s1,$s2,$s3);
    333 		&encstep(1,$s1,$s2,$s3,$s0);
    334 		&encstep(2,$s2,$s3,$s0,$s1);
    335 		&encstep(3,$s3,$s0,$s1,$s2);
    336 	}
    337 $code.=<<___;
    338 	sub	\$1,$rnds
    339 	jnz	.Lenc_loop
    340 ___
        	# last round; enclast() does not apply the key, so it is XORed in here
    341 	if ($verticalspin) { &enclastvert(); }
    342 	else {	&enclast(0,$s0,$s1,$s2,$s3);
    343 		&enclast(1,$s1,$s2,$s3,$s0);
    344 		&enclast(2,$s2,$s3,$s0,$s1);
    345 		&enclast(3,$s3,$s0,$s1,$s2);
    346 		$code.=<<___;
    347 		xor	16+0($key),$s0		# xor with key
    348 		xor	16+4($key),$s1
    349 		xor	16+8($key),$s2
    350 		xor	16+12($key),$s3
    351 ___
    352 	}
    353 $code.=<<___;
    354 	.byte	0xf3,0xc3			# rep ret
    355 .size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
    356 ___
    357 
    358 # it's possible to implement this by shifting tN by 8, filling least
    359 # significant byte with byte load and finally bswap-ing at the end,
    360 # but such partial register load kills Core 2...
    # enccompactvert: append to $code the byte-substitution part of one
    # "compact" encryption round. Every byte of $s0..$s3 is looked up in the
    # byte table at $sbox (1-byte stride), shifted to its lane (8/16/24 bits)
    # and merged, leaving the new state in $s0..$s3. No key is applied here.
    # %r8d, %r9d and %r13d are used as extra temporaries, i.e. the registers
    # otherwise named $inp, $out and $rnds are overwritten.
    361 sub enccompactvert()
    362 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    363 
    364 $code.=<<___;
    365 	movzb	`&lo("$s0")`,$t0
    366 	movzb	`&lo("$s1")`,$t1
    367 	movzb	`&lo("$s2")`,$t2
    368 	movzb	($sbox,$t0,1),$t0
    369 	movzb	($sbox,$t1,1),$t1
    370 	movzb	($sbox,$t2,1),$t2
    371 
    372 	movzb	`&lo("$s3")`,$t3
    373 	movzb	`&hi("$s1")`,$acc0
    374 	movzb	`&hi("$s2")`,$acc1
    375 	movzb	($sbox,$t3,1),$t3
    376 	movzb	($sbox,$acc0,1),$t4	#$t0
    377 	movzb	($sbox,$acc1,1),$t5	#$t1
    378 
    379 	movzb	`&hi("$s3")`,$acc2
    380 	movzb	`&hi("$s0")`,$acc0
    381 	shr	\$16,$s2
    382 	movzb	($sbox,$acc2,1),$acc2	#$t2
    383 	movzb	($sbox,$acc0,1),$acc0	#$t3
    384 	shr	\$16,$s3
    385 
    386 	movzb	`&lo("$s2")`,$acc1
    387 	shl	\$8,$t4
    388 	shl	\$8,$t5
    389 	movzb	($sbox,$acc1,1),$acc1	#$t0
    390 	xor	$t4,$t0
    391 	xor	$t5,$t1
    392 
    393 	movzb	`&lo("$s3")`,$t4
    394 	shr	\$16,$s0
    395 	shr	\$16,$s1
    396 	movzb	`&lo("$s0")`,$t5
    397 	shl	\$8,$acc2
    398 	shl	\$8,$acc0
    399 	movzb	($sbox,$t4,1),$t4	#$t1
    400 	movzb	($sbox,$t5,1),$t5	#$t2
    401 	xor	$acc2,$t2
    402 	xor	$acc0,$t3
    403 
    404 	movzb	`&lo("$s1")`,$acc2
    405 	movzb	`&hi("$s3")`,$acc0
    406 	shl	\$16,$acc1
    407 	movzb	($sbox,$acc2,1),$acc2	#$t3
    408 	movzb	($sbox,$acc0,1),$acc0	#$t0
    409 	xor	$acc1,$t0
    410 
    411 	movzb	`&hi("$s0")`,$acc1
    412 	shr	\$8,$s2
    413 	shr	\$8,$s1
    414 	movzb	($sbox,$acc1,1),$acc1	#$t1
    415 	movzb	($sbox,$s2,1),$s3	#$t3
    416 	movzb	($sbox,$s1,1),$s2	#$t2
    417 	shl	\$16,$t4
    418 	shl	\$16,$t5
    419 	shl	\$16,$acc2
    420 	xor	$t4,$t1
    421 	xor	$t5,$t2
    422 	xor	$acc2,$t3
    423 
    424 	shl	\$24,$acc0
    425 	shl	\$24,$acc1
    426 	shl	\$24,$s3
    427 	xor	$acc0,$t0
    428 	shl	\$24,$s2
    429 	xor	$acc1,$t1
    430 	mov	$t0,$s0
    431 	mov	$t1,$s1
    432 	xor	$t2,$s2
    433 	xor	$t3,$s3
    434 ___
    435 }
    436 
    # enctransform_ref($sn): append to $code a one-word version of the linear
    # transform applied between compact rounds. With r2 being the per-byte
    # doubling of $sn (shift left by one, XOR 0x1b into every byte whose top
    # bit was set), the generated code leaves in $sn:
    #   ROL($sn^r2,24) ^ r2 ^ ROR($sn,16) ^ ROR($sn,24)
    # (presumably AES MixColumns on one column -- confirm). %r8d, %r9d and
    # %r13d are used as scratch. Not called anywhere in the visible part of
    # the file; enctransform() is what the compact loop uses.
    437 sub enctransform_ref()
    438 { my $sn = shift;
    439   my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
    440 
    441 $code.=<<___;
    442 	mov	$sn,$acc
    443 	and	\$0x80808080,$acc
    444 	mov	$acc,$tmp
    445 	shr	\$7,$tmp
    446 	lea	($sn,$sn),$r2
    447 	sub	$tmp,$acc
    448 	and	\$0xfefefefe,$r2
    449 	and	\$0x1b1b1b1b,$acc
    450 	mov	$sn,$tmp
    451 	xor	$acc,$r2
    452 
    453 	xor	$r2,$sn
    454 	rol	\$24,$sn
    455 	xor	$r2,$sn
    456 	ror	\$16,$tmp
    457 	xor	$tmp,$sn
    458 	ror	\$8,$tmp
    459 	xor	$tmp,$sn
    460 ___
    461 }
    462 
    463 # unlike decrypt case it does not pay off to parallelize enctransform
    # enctransform: append to $code the same per-word transform as
    # enctransform_ref(), applied to all four state words. $s0/$s1 are
    # processed as a pair and the start of the $s2/$s3 pair is interleaved
    # with the end of the first (the lines indented by one extra space).
    # %r8d/%r9d hold the doubled values; $acc0, $acc1, $t0..$t2 and $acc2
    # are scratch. The trailing loads from 0/64/128/192($sbox) only touch
    # the table; the loaded values are not used in this routine.
    464 sub enctransform()
    465 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
    466 
    467 $code.=<<___;
    468 	mov	$s0,$acc0
    469 	mov	$s1,$acc1
    470 	and	\$0x80808080,$acc0
    471 	and	\$0x80808080,$acc1
    472 	mov	$acc0,$t0
    473 	mov	$acc1,$t1
    474 	shr	\$7,$t0
    475 	lea	($s0,$s0),$r20
    476 	shr	\$7,$t1
    477 	lea	($s1,$s1),$r21
    478 	sub	$t0,$acc0
    479 	sub	$t1,$acc1
    480 	and	\$0xfefefefe,$r20
    481 	and	\$0xfefefefe,$r21
    482 	and	\$0x1b1b1b1b,$acc0
    483 	and	\$0x1b1b1b1b,$acc1
    484 	mov	$s0,$t0
    485 	mov	$s1,$t1
    486 	xor	$acc0,$r20
    487 	xor	$acc1,$r21
    488 
    489 	xor	$r20,$s0
    490 	xor	$r21,$s1
    491 	 mov	$s2,$acc0
    492 	 mov	$s3,$acc1
    493 	rol	\$24,$s0
    494 	rol	\$24,$s1
    495 	 and	\$0x80808080,$acc0
    496 	 and	\$0x80808080,$acc1
    497 	xor	$r20,$s0
    498 	xor	$r21,$s1
    499 	 mov	$acc0,$t2
    500 	 mov	$acc1,$t3
    501 	ror	\$16,$t0
    502 	ror	\$16,$t1
    503 	 shr	\$7,$t2
    504 	 lea	($s2,$s2),$r20
    505 	xor	$t0,$s0
    506 	xor	$t1,$s1
    507 	 shr	\$7,$t3
    508 	 lea	($s3,$s3),$r21
    509 	ror	\$8,$t0
    510 	ror	\$8,$t1
    511 	 sub	$t2,$acc0
    512 	 sub	$t3,$acc1
    513 	xor	$t0,$s0
    514 	xor	$t1,$s1
    515 
    516 	and	\$0xfefefefe,$r20
    517 	and	\$0xfefefefe,$r21
    518 	and	\$0x1b1b1b1b,$acc0
    519 	and	\$0x1b1b1b1b,$acc1
    520 	mov	$s2,$t2
    521 	mov	$s3,$t3
    522 	xor	$acc0,$r20
    523 	xor	$acc1,$r21
    524 
    525 	xor	$r20,$s2
    526 	xor	$r21,$s3
    527 	rol	\$24,$s2
    528 	rol	\$24,$s3
    529 	xor	$r20,$s2
    530 	xor	$r21,$s3
    531 	mov	0($sbox),$acc0			# prefetch Te4
    532 	ror	\$16,$t2
    533 	ror	\$16,$t3
    534 	mov	64($sbox),$acc1
    535 	xor	$t2,$s2
    536 	xor	$t3,$s3
    537 	mov	128($sbox),$r20
    538 	ror	\$8,$t2
    539 	ror	\$8,$t3
    540 	mov	192($sbox),$r21
    541 	xor	$t2,$s2
    542 	xor	$t3,$s3
    543 ___
    544 }
    545 
    # Emit _x86_64_AES_encrypt_compact. The generated routine first touches
    # the 256-byte table at $sbox in 32-byte steps (addressed through
    # $inp=$sbox+128), then loops: XOR the state with the next four key words,
    # advance $key, substitute bytes (enccompactvert). The loop exits to a
    # final key XOR when $key equals the value stored at 16(%rsp); otherwise
    # the linear transform (enctransform) is applied and the loop repeats.
    546 $code.=<<___;
    547 .type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
    548 .align	16
    549 _x86_64_AES_encrypt_compact:
    550 	lea	128($sbox),$inp			# size optimization
    551 	mov	0-128($inp),$acc1		# prefetch Te4
    552 	mov	32-128($inp),$acc2
    553 	mov	64-128($inp),$t0
    554 	mov	96-128($inp),$t1
    555 	mov	128-128($inp),$acc1
    556 	mov	160-128($inp),$acc2
    557 	mov	192-128($inp),$t0
    558 	mov	224-128($inp),$t1
    559 	jmp	.Lenc_loop_compact
    560 .align	16
    561 .Lenc_loop_compact:
    562 		xor	0($key),$s0		# xor with key
    563 		xor	4($key),$s1
    564 		xor	8($key),$s2
    565 		xor	12($key),$s3
    566 		lea	16($key),$key
    567 ___
    568 		&enccompactvert();
        		# 16(%rsp) as seen inside the callee is the slot that AES_encrypt
        		# fills as 8(%rsp) ("end of key schedule"), shifted by the return
        		# address pushed by its call instruction
    569 $code.=<<___;
    570 		cmp	16(%rsp),$key
    571 		je	.Lenc_compact_done
    572 ___
    573 		&enctransform();
    574 $code.=<<___;
    575 	jmp	.Lenc_loop_compact
    576 .align	16
    577 .Lenc_compact_done:
    578 	xor	0($key),$s0
    579 	xor	4($key),$s1
    580 	xor	8($key),$s2
    581 	xor	12($key),$s3
    582 	.byte	0xf3,0xc3			# rep ret
    583 .size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
    584 ___
    585 
    586 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
    # Emit the public AES_encrypt entry point (arguments in %rdi=inp,
    # %rsi=out, %rdx=key). The generated code:
    #  - pushes %rbx, %rbp, %r12-%r15 and builds a 64-byte aligned frame whose
    #    position is derived from the key address;
    #  - saves out at 16(%rsp) and the pre-alignment %rsp at 24(%rsp);
    #  - loads the 16-byte input into $s0..$s3 and stores the key pointer and
    #    key + 16*rounds at (%rsp) and 8(%rsp);
    #  - points $sbox at .LAES_Te+2048 plus a multiple of 256 bytes chosen
    #    from the stack address, then calls _x86_64_AES_encrypt_compact;
    #  - writes the result to out and restores the saved registers and %rsp.
    587 $code.=<<___;
    588 .globl	AES_encrypt
    589 .type	AES_encrypt,\@function,3
    590 .align	16
    591 AES_encrypt:
    592 	push	%rbx
    593 	push	%rbp
    594 	push	%r12
    595 	push	%r13
    596 	push	%r14
    597 	push	%r15
    598 
    599 	# allocate frame "above" key schedule
    600 	mov	%rsp,%r10
    601 	lea	-63(%rdx),%rcx	# %rdx is key argument
    602 	and	\$-64,%rsp
    603 	sub	%rsp,%rcx
    604 	neg	%rcx
    605 	and	\$0x3c0,%rcx
    606 	sub	%rcx,%rsp
    607 	sub	\$32,%rsp
    608 
    609 	mov	%rsi,16(%rsp)	# save out
    610 	mov	%r10,24(%rsp)	# save real stack pointer
    611 .Lenc_prologue:
    612 
    613 	mov	%rdx,$key
    614 	mov	240($key),$rnds	# load rounds
    615 
    616 	mov	0(%rdi),$s0	# load input vector
    617 	mov	4(%rdi),$s1
    618 	mov	8(%rdi),$s2
    619 	mov	12(%rdi),$s3
    620 
    621 	shl	\$4,$rnds
    622 	lea	($key,$rnds),%rbp
    623 	mov	$key,(%rsp)	# key schedule
    624 	mov	%rbp,8(%rsp)	# end of key schedule
    625 
    626 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
    627 	lea	.LAES_Te+2048(%rip),$sbox
    628 	lea	768(%rsp),%rbp
    629 	sub	$sbox,%rbp
    630 	and	\$0x300,%rbp
    631 	lea	($sbox,%rbp),$sbox
    632 
    633 	call	_x86_64_AES_encrypt_compact
    634 
    635 	mov	16(%rsp),$out	# restore out
    636 	mov	24(%rsp),%rsi	# restore saved stack pointer
    637 	mov	$s0,0($out)	# write output vector
    638 	mov	$s1,4($out)
    639 	mov	$s2,8($out)
    640 	mov	$s3,12($out)
    641 
    642 	mov	(%rsi),%r15
    643 	mov	8(%rsi),%r14
    644 	mov	16(%rsi),%r13
    645 	mov	24(%rsi),%r12
    646 	mov	32(%rsi),%rbp
    647 	mov	40(%rsi),%rbx
    648 	lea	48(%rsi),%rsp
    649 .Lenc_epilogue:
    650 	ret
    651 .size	AES_encrypt,.-AES_encrypt
    652 ___
    653 
    654 #------------------------------------------------------------------#
    655 
    # decvert: append to $code the body of one table-driven decryption round
    # that processes all four state words together. It mirrors encvert(): the
    # same 8-byte-stride lookups at byte offsets 0, 3, 2 and 1 accumulate in
    # $t0, $t1, $t2 and %r8d, but the source bytes are taken from the state
    # words in a different order. $key is advanced by 16 and the new state is
    # the four key words at $key XORed with the accumulators. %r8d is the
    # fourth temporary, so $inp is lost.
    656 sub decvert()
    657 { my $t3="%r8d";	# zaps $inp!
    658 
    659 $code.=<<___;
    660 	# favor 3-way issue Opteron pipeline...
    661 	movzb	`&lo("$s0")`,$acc0
    662 	movzb	`&lo("$s1")`,$acc1
    663 	movzb	`&lo("$s2")`,$acc2
    664 	mov	0($sbox,$acc0,8),$t0
    665 	mov	0($sbox,$acc1,8),$t1
    666 	mov	0($sbox,$acc2,8),$t2
    667 
    668 	movzb	`&hi("$s3")`,$acc0
    669 	movzb	`&hi("$s0")`,$acc1
    670 	movzb	`&lo("$s3")`,$acc2
    671 	xor	3($sbox,$acc0,8),$t0
    672 	xor	3($sbox,$acc1,8),$t1
    673 	mov	0($sbox,$acc2,8),$t3
    674 
    675 	movzb	`&hi("$s1")`,$acc0
    676 	shr	\$16,$s0
    677 	movzb	`&hi("$s2")`,$acc2
    678 	xor	3($sbox,$acc0,8),$t2
    679 	shr	\$16,$s3
    680 	xor	3($sbox,$acc2,8),$t3
    681 
    682 	shr	\$16,$s1
    683 	lea	16($key),$key
    684 	shr	\$16,$s2
    685 
    686 	movzb	`&lo("$s2")`,$acc0
    687 	movzb	`&lo("$s3")`,$acc1
    688 	movzb	`&lo("$s0")`,$acc2
    689 	xor	2($sbox,$acc0,8),$t0
    690 	xor	2($sbox,$acc1,8),$t1
    691 	xor	2($sbox,$acc2,8),$t2
    692 
    693 	movzb	`&hi("$s1")`,$acc0
    694 	movzb	`&hi("$s2")`,$acc1
    695 	movzb	`&lo("$s1")`,$acc2
    696 	xor	1($sbox,$acc0,8),$t0
    697 	xor	1($sbox,$acc1,8),$t1
    698 	xor	2($sbox,$acc2,8),$t3
    699 
    700 	movzb	`&hi("$s3")`,$acc0
    701 	mov	12($key),$s3
    702 	movzb	`&hi("$s0")`,$acc2
    703 	xor	1($sbox,$acc0,8),$t2
    704 	mov	0($key),$s0
    705 	xor	1($sbox,$acc2,8),$t3
    706 
    707 	xor	$t0,$s0
    708 	mov	4($key),$s1
    709 	mov	8($key),$s2
    710 	xor	$t2,$s2
    711 	xor	$t1,$s1
    712 	xor	$t3,$s3
    713 ___
    714 }
    715 
    # declastvert: append to $code the last decryption round, all four state
    # words together. $sbox is temporarily moved forward by 2048 bytes to the
    # byte-wide table stored there (1-byte stride); each substituted byte is
    # shifted into its lane and merged in $t0, $t1, $t2 and %r8d. The result
    # is XORed with the key words at 16($key)..28($key) and $sbox is restored.
    # $key is not advanced. %r8d is the fourth temporary, so $inp is lost.
    716 sub declastvert()
    717 { my $t3="%r8d";	# zaps $inp!
    718 
    719 $code.=<<___;
    720 	lea	2048($sbox),$sbox	# size optimization
    721 	movzb	`&lo("$s0")`,$acc0
    722 	movzb	`&lo("$s1")`,$acc1
    723 	movzb	`&lo("$s2")`,$acc2
    724 	movzb	($sbox,$acc0,1),$t0
    725 	movzb	($sbox,$acc1,1),$t1
    726 	movzb	($sbox,$acc2,1),$t2
    727 
    728 	movzb	`&lo("$s3")`,$acc0
    729 	movzb	`&hi("$s3")`,$acc1
    730 	movzb	`&hi("$s0")`,$acc2
    731 	movzb	($sbox,$acc0,1),$t3
    732 	movzb	($sbox,$acc1,1),$acc1	#$t0
    733 	movzb	($sbox,$acc2,1),$acc2	#$t1
    734 
    735 	shl	\$8,$acc1
    736 	shl	\$8,$acc2
    737 
    738 	xor	$acc1,$t0
    739 	xor	$acc2,$t1
    740 	shr	\$16,$s3
    741 
    742 	movzb	`&hi("$s1")`,$acc0
    743 	movzb	`&hi("$s2")`,$acc1
    744 	shr	\$16,$s0
    745 	movzb	($sbox,$acc0,1),$acc0	#$t2
    746 	movzb	($sbox,$acc1,1),$acc1	#$t3
    747 
    748 	shl	\$8,$acc0
    749 	shl	\$8,$acc1
    750 	shr	\$16,$s1
    751 	xor	$acc0,$t2
    752 	xor	$acc1,$t3
    753 	shr	\$16,$s2
    754 
    755 	movzb	`&lo("$s2")`,$acc0
    756 	movzb	`&lo("$s3")`,$acc1
    757 	movzb	`&lo("$s0")`,$acc2
    758 	movzb	($sbox,$acc0,1),$acc0	#$t0
    759 	movzb	($sbox,$acc1,1),$acc1	#$t1
    760 	movzb	($sbox,$acc2,1),$acc2	#$t2
    761 
    762 	shl	\$16,$acc0
    763 	shl	\$16,$acc1
    764 	shl	\$16,$acc2
    765 
    766 	xor	$acc0,$t0
    767 	xor	$acc1,$t1
    768 	xor	$acc2,$t2
    769 
    770 	movzb	`&lo("$s1")`,$acc0
    771 	movzb	`&hi("$s1")`,$acc1
    772 	movzb	`&hi("$s2")`,$acc2
    773 	movzb	($sbox,$acc0,1),$acc0	#$t3
    774 	movzb	($sbox,$acc1,1),$acc1	#$t0
    775 	movzb	($sbox,$acc2,1),$acc2	#$t1
    776 
    777 	shl	\$16,$acc0
    778 	shl	\$24,$acc1
    779 	shl	\$24,$acc2
    780 
    781 	xor	$acc0,$t3
    782 	xor	$acc1,$t0
    783 	xor	$acc2,$t1
    784 
    785 	movzb	`&hi("$s3")`,$acc0
    786 	movzb	`&hi("$s0")`,$acc1
    787 	mov	16+12($key),$s3
    788 	movzb	($sbox,$acc0,1),$acc0	#$t2
    789 	movzb	($sbox,$acc1,1),$acc1	#$t3
    790 	mov	16+0($key),$s0
    791 
    792 	shl	\$24,$acc0
    793 	shl	\$24,$acc1
    794 
    795 	xor	$acc0,$t2
    796 	xor	$acc1,$t3
    797 
    798 	mov	16+4($key),$s1
    799 	mov	16+8($key),$s2
    800 	lea	-2048($sbox),$sbox
    801 	xor	$t0,$s0
    802 	xor	$t1,$s1
    803 	xor	$t2,$s2
    804 	xor	$t3,$s3
    805 ___
    806 }
    807 
    # decstep($i,@s): append to $code the instructions that compute output
    # word $i (0..3) of one decryption round. @s is the list of the four state
    # registers in the order chosen by the caller; $s[0] supplies the low byte.
    # For $i 0..2 the result is built in $t0..$t2 with $acc0..$acc2 as scratch.
    # For $i==3 it is built in $s[0] itself, $s[1..3] serve as scratch and are
    # finally reloaded from $t2, $t1, $t0. Unlike encstep(), neither the key
    # XOR nor the $key advance is emitted here; the caller appends them.
    808 sub decstep()
    809 { my ($i,@s) = @_;
    810   my $tmp0=$acc0;
    811   my $tmp1=$acc1;
    812   my $tmp2=$acc2;
    813   my $out=($t0,$t1,$t2,$s[0])[$i];
    814 
    815 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    816 			$tmp1=$s[2]			if ($i==3);
    817 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    818 	$code.="	and	\$0xFF,$out\n";
    819 
    820 	$code.="	mov	0($sbox,$out,8),$out\n";
    821 	$code.="	shr	\$16,$tmp1\n";
    822 			$tmp2=$s[3]			if ($i==3);
    823 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    824 
    825 			$tmp0=$s[1]			if ($i==3);
    826 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    827 	$code.="	and	\$0xFF,$tmp1\n";
    828 	$code.="	shr	\$24,$tmp2\n";
    829 
    830 	$code.="	xor	3($sbox,$tmp0,8),$out\n";
    831 	$code.="	xor	2($sbox,$tmp1,8),$out\n";
    832 	$code.="	xor	1($sbox,$tmp2,8),$out\n";
    833 
        	# after the last word, move the three buffered results into place
    834 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    835 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    836 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    837 	$code.="\n";
    838 }
    839 
    # declast($i,@s): append to $code the instructions that compute output
    # word $i (0..3) of the last decryption round. Register roles are the same
    # as in decstep(). Each source byte is substituted through the byte-wide
    # table at 2048($sbox) (1-byte stride), shifted to its lane and merged.
    # No key material is XORed in here; the caller appends that.
    840 sub declast()
    841 { my ($i,@s)=@_;
    842   my $tmp0=$acc0;
    843   my $tmp1=$acc1;
    844   my $tmp2=$acc2;
    845   my $out=($t0,$t1,$t2,$s[0])[$i];
    846 
    847 	$code.="	mov	$s[0],$out\n"		if ($i!=3);
    848 			$tmp1=$s[2]			if ($i==3);
    849 	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
    850 	$code.="	and	\$0xFF,$out\n";
    851 
    852 	$code.="	movzb	2048($sbox,$out,1),$out\n";
    853 	$code.="	shr	\$16,$tmp1\n";
    854 			$tmp2=$s[3]			if ($i==3);
    855 	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
    856 
    857 			$tmp0=$s[1]			if ($i==3);
    858 	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
    859 	$code.="	and	\$0xFF,$tmp1\n";
    860 	$code.="	shr	\$24,$tmp2\n";
    861 
    862 	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
    863 	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
    864 	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
    865 
        	# move each substituted byte into its lane
    866 	$code.="	shl	\$8,$tmp0\n";
    867 	$code.="	shl	\$16,$tmp1\n";
    868 	$code.="	shl	\$24,$tmp2\n";
    869 
        	# merge the lanes; for the last word the reloads of $s[1..3] from
        	# $t2, $t1, $t0 are interleaved with the merging
    870 	$code.="	xor	$tmp0,$out\n";
    871 	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
    872 	$code.="	xor	$tmp1,$out\n";
    873 	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
    874 	$code.="	xor	$tmp2,$out\n";
    875 	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
    876 	$code.="\n";
    877 }
    878 
    # Emit _x86_64_AES_decrypt. The generated routine XORs the state in
    # $s0..$s3 with the first four key words, loads the round count from
    # 240($key), and runs the .Ldec_loop body (count-1) times before the last
    # round. It expects $key and $sbox to be set up by its caller.
    879 $code.=<<___;
    880 .type	_x86_64_AES_decrypt,\@abi-omnipotent
    881 .align	16
    882 _x86_64_AES_decrypt:
    883 	xor	0($key),$s0			# xor with key
    884 	xor	4($key),$s1
    885 	xor	8($key),$s2
    886 	xor	12($key),$s3
    887 
    888 	mov	240($key),$rnds			# load key->rounds
    889 	sub	\$1,$rnds
    890 	jmp	.Ldec_loop
    891 .align	16
    892 .Ldec_loop:
    893 ___
        	# loop body; decstep() leaves the key handling to the code appended here
    894 	if ($verticalspin) { &decvert(); }
    895 	else {	&decstep(0,$s0,$s3,$s2,$s1);
    896 		&decstep(1,$s1,$s0,$s3,$s2);
    897 		&decstep(2,$s2,$s1,$s0,$s3);
    898 		&decstep(3,$s3,$s2,$s1,$s0);
    899 		$code.=<<___;
    900 		lea	16($key),$key
    901 		xor	0($key),$s0			# xor with key
    902 		xor	4($key),$s1
    903 		xor	8($key),$s2
    904 		xor	12($key),$s3
    905 ___
    906 	}
    907 $code.=<<___;
    908 	sub	\$1,$rnds
    909 	jnz	.Ldec_loop
    910 ___
        	# last round; declast() does not apply the key, so it is XORed in here
    911 	if ($verticalspin) { &declastvert(); }
    912 	else {	&declast(0,$s0,$s3,$s2,$s1);
    913 		&declast(1,$s1,$s0,$s3,$s2);
    914 		&declast(2,$s2,$s1,$s0,$s3);
    915 		&declast(3,$s3,$s2,$s1,$s0);
    916 		$code.=<<___;
    917 		xor	16+0($key),$s0			# xor with key
    918 		xor	16+4($key),$s1
    919 		xor	16+8($key),$s2
    920 		xor	16+12($key),$s3
    921 ___
    922 	}
    923 $code.=<<___;
    924 	.byte	0xf3,0xc3			# rep ret
    925 .size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
    926 ___
    927 
    # deccompactvert: append to $code the byte-substitution part of one
    # "compact" decryption round. Every byte of $s0..$s3 is looked up in the
    # byte table at $sbox (1-byte stride), shifted to its lane (8/16/24 bits)
    # and merged, leaving the new state in $s0..$s3. No key is applied here.
    # %r8d, %r9d and %r13d are used as extra temporaries, i.e. the registers
    # otherwise named $inp, $out and $rnds are overwritten.
    928 sub deccompactvert()
    929 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
    930 
    931 $code.=<<___;
    932 	movzb	`&lo("$s0")`,$t0
    933 	movzb	`&lo("$s1")`,$t1
    934 	movzb	`&lo("$s2")`,$t2
    935 	movzb	($sbox,$t0,1),$t0
    936 	movzb	($sbox,$t1,1),$t1
    937 	movzb	($sbox,$t2,1),$t2
    938 
    939 	movzb	`&lo("$s3")`,$t3
    940 	movzb	`&hi("$s3")`,$acc0
    941 	movzb	`&hi("$s0")`,$acc1
    942 	movzb	($sbox,$t3,1),$t3
    943 	movzb	($sbox,$acc0,1),$t4	#$t0
    944 	movzb	($sbox,$acc1,1),$t5	#$t1
    945 
    946 	movzb	`&hi("$s1")`,$acc2
    947 	movzb	`&hi("$s2")`,$acc0
    948 	shr	\$16,$s2
    949 	movzb	($sbox,$acc2,1),$acc2	#$t2
    950 	movzb	($sbox,$acc0,1),$acc0	#$t3
    951 	shr	\$16,$s3
    952 
    953 	movzb	`&lo("$s2")`,$acc1
    954 	shl	\$8,$t4
    955 	shl	\$8,$t5
    956 	movzb	($sbox,$acc1,1),$acc1	#$t0
    957 	xor	$t4,$t0
    958 	xor	$t5,$t1
    959 
    960 	movzb	`&lo("$s3")`,$t4
    961 	shr	\$16,$s0
    962 	shr	\$16,$s1
    963 	movzb	`&lo("$s0")`,$t5
    964 	shl	\$8,$acc2
    965 	shl	\$8,$acc0
    966 	movzb	($sbox,$t4,1),$t4	#$t1
    967 	movzb	($sbox,$t5,1),$t5	#$t2
    968 	xor	$acc2,$t2
    969 	xor	$acc0,$t3
    970 
    971 	movzb	`&lo("$s1")`,$acc2
    972 	movzb	`&hi("$s1")`,$acc0
    973 	shl	\$16,$acc1
    974 	movzb	($sbox,$acc2,1),$acc2	#$t3
    975 	movzb	($sbox,$acc0,1),$acc0	#$t0
    976 	xor	$acc1,$t0
    977 
    978 	movzb	`&hi("$s2")`,$acc1
    979 	shl	\$16,$t4
    980 	shl	\$16,$t5
    981 	movzb	($sbox,$acc1,1),$s1	#$t1
    982 	xor	$t4,$t1
    983 	xor	$t5,$t2
    984 
    985 	movzb	`&hi("$s3")`,$acc1
    986 	shr	\$8,$s0
    987 	shl	\$16,$acc2
    988 	movzb	($sbox,$acc1,1),$s2	#$t2
    989 	movzb	($sbox,$s0,1),$s3	#$t3
    990 	xor	$acc2,$t3
    991 
    992 	shl	\$24,$acc0
    993 	shl	\$24,$s1
    994 	shl	\$24,$s2
    995 	xor	$acc0,$t0
    996 	shl	\$24,$s3
    997 	xor	$t1,$s1
    998 	mov	$t0,$s0
    999 	xor	$t2,$s2
   1000 	xor	$t3,$s3
   1001 ___
   1002 }
   1003 
   1004 # parallelized version! input is pair of 64-bit values: %rax=s1.s0
   1005 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
   1006 # %ecx=s2 and %edx=s3.
   # dectransform($prefetch): append to $code the linear transform used
   # between compact decryption rounds, working on two 64-bit register pairs
   # at once (the "0" names handle %rax, the "8" names handle %rcx).
   # Three successive per-byte doublings, using the constants held in
   # $mask80/$maskfe/$mask1b, produce tp2, tp4 and tp8 from tp1. These are
   # combined, and the 32-bit halves are rotated by 8, 24 and 16 bits and
   # XORed together, leaving the results in %eax, %ebx, %ecx and %edx.
   # $prefetch is interpolated into the heredoc text as the condition of the
   # back-quoted "mov N($sbox),..." fragments. When those loads are emitted
   # they overwrite $mask80, $maskfe, $mask1b, %r10 and %r13.
   1007 sub dectransform()
   1008 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
   1009   my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
   1010   my $prefetch = shift;
   1011 
   1012 $code.=<<___;
   1013 	mov	$tp10,$acc0
   1014 	mov	$tp18,$acc8
   1015 	and	$mask80,$acc0
   1016 	and	$mask80,$acc8
   1017 	mov	$acc0,$tp40
   1018 	mov	$acc8,$tp48
   1019 	shr	\$7,$tp40
   1020 	lea	($tp10,$tp10),$tp20
   1021 	shr	\$7,$tp48
   1022 	lea	($tp18,$tp18),$tp28
   1023 	sub	$tp40,$acc0
   1024 	sub	$tp48,$acc8
   1025 	and	$maskfe,$tp20
   1026 	and	$maskfe,$tp28
   1027 	and	$mask1b,$acc0
   1028 	and	$mask1b,$acc8
   1029 	xor	$tp20,$acc0
   1030 	xor	$tp28,$acc8
   1031 	mov	$acc0,$tp20
   1032 	mov	$acc8,$tp28
   1033 
   1034 	and	$mask80,$acc0
   1035 	and	$mask80,$acc8
   1036 	mov	$acc0,$tp80
   1037 	mov	$acc8,$tp88
   1038 	shr	\$7,$tp80
   1039 	lea	($tp20,$tp20),$tp40
   1040 	shr	\$7,$tp88
   1041 	lea	($tp28,$tp28),$tp48
   1042 	sub	$tp80,$acc0
   1043 	sub	$tp88,$acc8
   1044 	and	$maskfe,$tp40
   1045 	and	$maskfe,$tp48
   1046 	and	$mask1b,$acc0
   1047 	and	$mask1b,$acc8
   1048 	xor	$tp40,$acc0
   1049 	xor	$tp48,$acc8
   1050 	mov	$acc0,$tp40
   1051 	mov	$acc8,$tp48
   1052 
   1053 	and	$mask80,$acc0
   1054 	and	$mask80,$acc8
   1055 	mov	$acc0,$tp80
   1056 	mov	$acc8,$tp88
   1057 	shr	\$7,$tp80
   1058 	 xor	$tp10,$tp20		# tp2^=tp1
   1059 	shr	\$7,$tp88
   1060 	 xor	$tp18,$tp28		# tp2^=tp1
   1061 	sub	$tp80,$acc0
   1062 	sub	$tp88,$acc8
   1063 	lea	($tp40,$tp40),$tp80
   1064 	lea	($tp48,$tp48),$tp88
   1065 	 xor	$tp10,$tp40		# tp4^=tp1
   1066 	 xor	$tp18,$tp48		# tp4^=tp1
   1067 	and	$maskfe,$tp80
   1068 	and	$maskfe,$tp88
   1069 	and	$mask1b,$acc0
   1070 	and	$mask1b,$acc8
   1071 	xor	$acc0,$tp80
   1072 	xor	$acc8,$tp88
   1073 
   1074 	xor	$tp80,$tp10		# tp1^=tp8
   1075 	xor	$tp88,$tp18		# tp1^=tp8
   1076 	xor	$tp80,$tp20		# tp2^tp1^=tp8
   1077 	xor	$tp88,$tp28		# tp2^tp1^=tp8
   1078 	mov	$tp10,$acc0
   1079 	mov	$tp18,$acc8
   1080 	xor	$tp80,$tp40		# tp4^tp1^=tp8
   1081 	xor	$tp88,$tp48		# tp4^tp1^=tp8
   1082 	shr	\$32,$acc0
   1083 	shr	\$32,$acc8
   1084 	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
   1085 	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
   1086 	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
   1087 	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
   1088 	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1089 	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
   1090 
   1091 	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
   1092 	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
   1093 	xor	`&LO("$tp80")`,`&LO("$tp10")`
   1094 	xor	`&LO("$tp88")`,`&LO("$tp18")`
   1095 	shr	\$32,$tp80
   1096 	shr	\$32,$tp88
   1097 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1098 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1099 
   1100 	mov	$tp20,$tp80
   1101 	mov	$tp28,$tp88
   1102 	shr	\$32,$tp80
   1103 	shr	\$32,$tp88
   1104 	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
   1105 	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
   1106 	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
   1107 	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
   1108 	xor	`&LO("$tp20")`,`&LO("$tp10")`
   1109 	xor	`&LO("$tp28")`,`&LO("$tp18")`
   1110 	mov	$tp40,$tp20
   1111 	mov	$tp48,$tp28
   1112 	xor	`&LO("$tp80")`,`&LO("$acc0")`
   1113 	xor	`&LO("$tp88")`,`&LO("$acc8")`
   1114 
   1115 	`"mov	0($sbox),$mask80"	if ($prefetch)`
   1116 	shr	\$32,$tp20
   1117 	shr	\$32,$tp28
   1118 	`"mov	64($sbox),$maskfe"	if ($prefetch)`
   1119 	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
   1120 	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
   1121 	`"mov	128($sbox),$mask1b"	if ($prefetch)`
   1122 	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
   1123 	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
   1124 	`"mov	192($sbox),$tp80"	if ($prefetch)`
   1125 	xor	`&LO("$tp40")`,`&LO("$tp10")`
   1126 	xor	`&LO("$tp48")`,`&LO("$tp18")`
   1127 	`"mov	256($sbox),$tp88"	if ($prefetch)`
   1128 	xor	`&LO("$tp20")`,`&LO("$acc0")`
   1129 	xor	`&LO("$tp28")`,`&LO("$acc8")`
   1130 ___
   1131 }
   1132 
   1133 $code.=<<___;
        # Compact decryption core, called from AES_decrypt and from the slow
        # CBC decrypt loop. Each pass xors the four state words with the round
        # key at the key pointer and advances that pointer by 16 bytes. The
        # loop stops once the pointer equals the end-of-key-schedule value the
        # caller stored at 8(%rsp), which is seen here as 16(%rsp) because the
        # call pushed a return address.
   1134 .type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
   1135 .align	16
   1136 _x86_64_AES_decrypt_compact:
   1137 	lea	128($sbox),$inp			# size optimization
   1138 	mov	0-128($inp),$acc1		# prefetch Td4
   1139 	mov	32-128($inp),$acc2		# (eight loads, 32 bytes apart,
   1140 	mov	64-128($inp),$t0		#  span the 256-byte table; the
   1141 	mov	96-128($inp),$t1		#  loaded values are overwritten
   1142 	mov	128-128($inp),$acc1		#  and never used)
   1143 	mov	160-128($inp),$acc2
   1144 	mov	192-128($inp),$t0
   1145 	mov	224-128($inp),$t1
   1146 	jmp	.Ldec_loop_compact
   1147 
   1148 .align	16
   1149 .Ldec_loop_compact:
   1150 		xor	0($key),$s0		# xor with key
   1151 		xor	4($key),$s1
   1152 		xor	8($key),$s2
   1153 		xor	12($key),$s3
   1154 		lea	16($key),$key		# advance to next round key
   1155 ___
   1156 		&deccompactvert();	# round body; defined outside this part of the file
   1157 $code.=<<___;
   1158 		cmp	16(%rsp),$key		# reached end of key schedule?
   1159 		je	.Ldec_compact_done
   1160 
   1161 		mov	256+0($sbox),$mask80	# constants stored right after the 256-byte table
   1162 		shl	\$32,%rbx		# pack the state into two 64-bit
   1163 		shl	\$32,%rdx		# registers: %rax|=%rbx<<32 and
   1164 		mov	256+8($sbox),$maskfe
   1165 		or	%rbx,%rax		# %rcx|=%rdx<<32
   1166 		or	%rdx,%rcx
   1167 		mov	256+16($sbox),$mask1b
   1168 ___
   1169 		&dectransform(1);	# presumably 1 = enable the interleaved prefetch loads; confirm against the sub's header
   1170 $code.=<<___;
   1171 	jmp	.Ldec_loop_compact
   1172 .align	16
   1173 .Ldec_compact_done:
   1174 	xor	0($key),$s0			# final round key
   1175 	xor	4($key),$s1
   1176 	xor	8($key),$s2
   1177 	xor	12($key),$s3
   1178 	.byte	0xf3,0xc3			# rep ret
   1179 .size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
   1180 ___
   1181 
   1182 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
        #
        # Decrypts one 16-byte block via _x86_64_AES_decrypt_compact.
        # Frame built by the prologue:
        #   0(%rsp)  key schedule
        #   8(%rsp)  end of key schedule (key + 16*rounds)
        #   16(%rsp) out
        #   24(%rsp) %rsp as it was right after the six pushes
   1183 $code.=<<___;
   1184 .globl	AES_decrypt
   1185 .type	AES_decrypt,\@function,3
   1186 .align	16
   1187 AES_decrypt:
   1188 	push	%rbx
   1189 	push	%rbp
   1190 	push	%r12
   1191 	push	%r13
   1192 	push	%r14
   1193 	push	%r15
   1194 
   1195 	# allocate frame "above" key schedule
   1196 	mov	%rsp,%r10
   1197 	lea	-63(%rdx),%rcx	# %rdx is key argument
   1198 	and	\$-64,%rsp	# 64-byte align the frame
   1199 	sub	%rsp,%rcx
   1200 	neg	%rcx
   1201 	and	\$0x3c0,%rcx	# keep a multiple of 64 below 1024
   1202 	sub	%rcx,%rsp
   1203 	sub	\$32,%rsp	# four 8-byte slots, see layout above
   1204 
   1205 	mov	%rsi,16(%rsp)	# save out
   1206 	mov	%r10,24(%rsp)	# save real stack pointer
   1207 .Ldec_prologue:
   1208 
   1209 	mov	%rdx,$key
   1210 	mov	240($key),$rnds	# load rounds
   1211 
   1212 	mov	0(%rdi),$s0	# load input vector
   1213 	mov	4(%rdi),$s1
   1214 	mov	8(%rdi),$s2
   1215 	mov	12(%rdi),$s3
   1216 
   1217 	shl	\$4,$rnds	# 16 bytes per round key
   1218 	lea	($key,$rnds),%rbp
   1219 	mov	$key,(%rsp)	# key schedule
   1220 	mov	%rbp,8(%rsp)	# end of key schedule
   1221 
   1222 	# pick Td4 copy which can't "overlap" with stack frame or key schedule
   1223 	lea	.LAES_Td+2048(%rip),$sbox
   1224 	lea	768(%rsp),%rbp
   1225 	sub	$sbox,%rbp
   1226 	and	\$0x300,%rbp	# 0, 256, 512 or 768
   1227 	lea	($sbox,%rbp),$sbox
   1228 	shr	\$3,%rbp	# recall "magic" constants! (+32 bytes per skipped copy)
   1229 	add	%rbp,$sbox
   1230 
   1231 	call	_x86_64_AES_decrypt_compact
   1232 
   1233 	mov	16(%rsp),$out	# restore out
   1234 	mov	24(%rsp),%rsi	# restore saved stack pointer
   1235 	mov	$s0,0($out)	# write output vector
   1236 	mov	$s1,4($out)
   1237 	mov	$s2,8($out)
   1238 	mov	$s3,12($out)
   1239 
   1240 	mov	(%rsi),%r15	# reload the registers pushed in the prologue
   1241 	mov	8(%rsi),%r14
   1242 	mov	16(%rsi),%r13
   1243 	mov	24(%rsi),%r12
   1244 	mov	32(%rsi),%rbp
   1245 	mov	40(%rsi),%rbx
   1246 	lea	48(%rsi),%rsp	# drop the six saved slots
   1247 .Ldec_epilogue:
   1248 	ret
   1249 .size	AES_decrypt,.-AES_decrypt
   1250 ___
   1251 #------------------------------------------------------------------#
   1252 
   1253 sub enckey()
   1254 {
        # Appends one key-expansion step to $code. The emitted instructions
        # compute  %eax ^= SubWord(RotWord(%edx)) ^ rcon[%rcx] :
        #  - each byte of %edx is replaced through the byte table addressed
        #    as -128(%rbp,index) and moved one byte position down, with the
        #    lowest byte wrapping round to bits 24..31;
        #  - the %rcx-th 32-bit word at 1024-128(%rbp) is xored in last.
        # The emitted code clobbers %ebx, %esi and %edx. Takes no arguments.
   1255 $code.=<<___;
   1256 	movz	%dl,%esi		# rk[i]>>0
   1257 	movzb	-128(%rbp,%rsi),%ebx
   1258 	movz	%dh,%esi		# rk[i]>>8
   1259 	shl	\$24,%ebx		# byte 0 lands in byte 3
   1260 	xor	%ebx,%eax
   1261 
   1262 	movzb	-128(%rbp,%rsi),%ebx
   1263 	shr	\$16,%edx
   1264 	movz	%dl,%esi		# rk[i]>>16
   1265 	xor	%ebx,%eax		# byte 1 lands in byte 0
   1266 
   1267 	movzb	-128(%rbp,%rsi),%ebx
   1268 	movz	%dh,%esi		# rk[i]>>24
   1269 	shl	\$8,%ebx		# byte 2 lands in byte 1
   1270 	xor	%ebx,%eax
   1271 
   1272 	movzb	-128(%rbp,%rsi),%ebx
   1273 	shl	\$16,%ebx		# byte 3 lands in byte 2
   1274 	xor	%ebx,%eax
   1275 
   1276 	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
   1277 ___
   1278 }
   1279 
   1280 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
   1281 #                        AES_KEY *key)
        #
        # Public entry point. It only saves registers and forwards the three
        # arguments, untouched in %rdi/%rsi/%rdx, to _x86_64_AES_set_encrypt_key;
        # the value that routine leaves in %rax is returned unchanged.
   1282 $code.=<<___;
   1283 .globl	AES_set_encrypt_key
   1284 .type	AES_set_encrypt_key,\@function,3
   1285 .align	16
   1286 AES_set_encrypt_key:
   1287 	push	%rbx
   1288 	push	%rbp
   1289 	push	%r12			# redundant, but allows to share
   1290 	push	%r13			# exception handler...
   1291 	push	%r14
   1292 	push	%r15
   1293 	sub	\$8,%rsp		# pad: 6 pushes + 8 = the 56 bytes released below
   1294 .Lenc_key_prologue:
   1295 
   1296 	call	_x86_64_AES_set_encrypt_key
   1297 
   1298 	mov	8(%rsp),%r15		# skip the 8-byte pad at 0(%rsp)
   1299 	mov	16(%rsp),%r14
   1300 	mov	24(%rsp),%r13
   1301 	mov	32(%rsp),%r12
   1302 	mov	40(%rsp),%rbp
   1303 	mov	48(%rsp),%rbx
   1304 	add	\$56,%rsp
   1305 .Lenc_key_epilogue:
   1306 	ret
   1307 .size	AES_set_encrypt_key,.-AES_set_encrypt_key
   1308 
   1309 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
   1310 .align	16
   1311 _x86_64_AES_set_encrypt_key:
        	# In:  %rdi = userKey, %esi = bits, %rdx = key schedule to fill.
        	# Out: %rax = 0 on success, -1 if either pointer is zero,
        	#      -2 if bits is not 128, 192 or 256.
        	# On success the round count (10, 12 or 14) is stored at byte
        	# offset 240 of the key schedule.
   1312 	mov	%esi,%ecx			# %ecx=bits
   1313 	mov	%rdi,%rsi			# %rsi=userKey
   1314 	mov	%rdx,%rdi			# %rdi=key
   1315 
   1316 	test	\$-1,%rsi
   1317 	jz	.Lbadpointer
   1318 	test	\$-1,%rdi
   1319 	jz	.Lbadpointer
   1320 
   1321 	lea	.LAES_Te(%rip),%rbp
   1322 	lea	2048+128(%rbp),%rbp		# bias by 128 so lookups use -128(%rbp,index)
   1323 
   1324 	# prefetch Te4
   1325 	mov	0-128(%rbp),%eax
   1326 	mov	32-128(%rbp),%ebx
   1327 	mov	64-128(%rbp),%r8d
   1328 	mov	96-128(%rbp),%edx
   1329 	mov	128-128(%rbp),%eax
   1330 	mov	160-128(%rbp),%ebx
   1331 	mov	192-128(%rbp),%r8d
   1332 	mov	224-128(%rbp),%edx
   1333 
   1334 	cmp	\$128,%ecx
   1335 	je	.L10rounds
   1336 	cmp	\$192,%ecx
   1337 	je	.L12rounds
   1338 	cmp	\$256,%ecx
   1339 	je	.L14rounds
   1340 	mov	\$-2,%rax			# invalid number of bits
   1341 	jmp	.Lexit
   1342 
   1343 .L10rounds:
   1344 	mov	0(%rsi),%rax			# copy first 4 dwords
   1345 	mov	8(%rsi),%rdx
   1346 	mov	%rax,0(%rdi)
   1347 	mov	%rdx,8(%rdi)
   1348 
   1349 	shr	\$32,%rdx			# %edx = rk[3]; %eax already holds rk[0]
   1350 	xor	%ecx,%ecx			# rcon index and loop counter
   1351 	jmp	.L10shortcut
   1352 .align	4
   1353 .L10loop:
   1354 		mov	0(%rdi),%eax			# rk[0]
   1355 		mov	12(%rdi),%edx			# rk[3]
   1356 .L10shortcut:
   1357 ___
   1358 		&enckey	();
   1359 $code.=<<___;
   1360 		mov	%eax,16(%rdi)			# rk[4]
   1361 		xor	4(%rdi),%eax
   1362 		mov	%eax,20(%rdi)			# rk[5]
   1363 		xor	8(%rdi),%eax
   1364 		mov	%eax,24(%rdi)			# rk[6]
   1365 		xor	12(%rdi),%eax
   1366 		mov	%eax,28(%rdi)			# rk[7]
   1367 		add	\$1,%ecx
   1368 		lea	16(%rdi),%rdi
   1369 		cmp	\$10,%ecx
   1370 	jl	.L10loop
   1371 
   1372 	movl	\$10,80(%rdi)			# setup number of rounds (%rdi moved 10*16, so this is key+240)
   1373 	xor	%rax,%rax
   1374 	jmp	.Lexit
   1375 
   1376 .L12rounds:
   1377 	mov	0(%rsi),%rax			# copy first 6 dwords
   1378 	mov	8(%rsi),%rbx
   1379 	mov	16(%rsi),%rdx
   1380 	mov	%rax,0(%rdi)
   1381 	mov	%rbx,8(%rdi)
   1382 	mov	%rdx,16(%rdi)
   1383 
   1384 	shr	\$32,%rdx			# %edx = rk[5]
   1385 	xor	%ecx,%ecx			# rcon index and loop counter
   1386 	jmp	.L12shortcut
   1387 .align	4
   1388 .L12loop:
   1389 		mov	0(%rdi),%eax			# rk[0]
   1390 		mov	20(%rdi),%edx			# rk[5]
   1391 .L12shortcut:
   1392 ___
   1393 		&enckey	();
   1394 $code.=<<___;
   1395 		mov	%eax,24(%rdi)			# rk[6]
   1396 		xor	4(%rdi),%eax
   1397 		mov	%eax,28(%rdi)			# rk[7]
   1398 		xor	8(%rdi),%eax
   1399 		mov	%eax,32(%rdi)			# rk[8]
   1400 		xor	12(%rdi),%eax
   1401 		mov	%eax,36(%rdi)			# rk[9]
   1402 
   1403 		cmp	\$7,%ecx			# eighth pass only needs rk[6..9]
   1404 		je	.L12break
   1405 		add	\$1,%ecx
   1406 
   1407 		xor	16(%rdi),%eax
   1408 		mov	%eax,40(%rdi)			# rk[10]
   1409 		xor	20(%rdi),%eax
   1410 		mov	%eax,44(%rdi)			# rk[11]
   1411 
   1412 		lea	24(%rdi),%rdi
   1413 	jmp	.L12loop
   1414 .L12break:
   1415 	movl	\$12,72(%rdi)		# setup number of rounds (%rdi moved 7*24, so this is key+240)
   1416 	xor	%rax,%rax
   1417 	jmp	.Lexit
   1418 
   1419 .L14rounds:
   1420 	mov	0(%rsi),%rax			# copy first 8 dwords
   1421 	mov	8(%rsi),%rbx
   1422 	mov	16(%rsi),%rcx
   1423 	mov	24(%rsi),%rdx
   1424 	mov	%rax,0(%rdi)
   1425 	mov	%rbx,8(%rdi)
   1426 	mov	%rcx,16(%rdi)
   1427 	mov	%rdx,24(%rdi)
   1428 
   1429 	shr	\$32,%rdx			# %edx = rk[7]
   1430 	xor	%ecx,%ecx			# rcon index and loop counter
   1431 	jmp	.L14shortcut
   1432 .align	4
   1433 .L14loop:
   1434 		mov	0(%rdi),%eax			# rk[0]
   1435 		mov	28(%rdi),%edx			# rk[7]
   1436 .L14shortcut:
   1437 ___
   1438 		&enckey	();
   1439 $code.=<<___;
   1440 		mov	%eax,32(%rdi)			# rk[8]
   1441 		xor	4(%rdi),%eax
   1442 		mov	%eax,36(%rdi)			# rk[9]
   1443 		xor	8(%rdi),%eax
   1444 		mov	%eax,40(%rdi)			# rk[10]
   1445 		xor	12(%rdi),%eax
   1446 		mov	%eax,44(%rdi)			# rk[11]
   1447 
   1448 		cmp	\$6,%ecx			# seventh pass only needs rk[8..11]
   1449 		je	.L14break
   1450 		add	\$1,%ecx
   1451 
        		# rk[12] = rk[4] ^ SubWord(rk[11]): table lookup per byte,
        		# bytes stay in place, no rcon
   1452 		mov	%eax,%edx
   1453 		mov	16(%rdi),%eax			# rk[4]
   1454 		movz	%dl,%esi			# rk[11]>>0
   1455 		movzb	-128(%rbp,%rsi),%ebx
   1456 		movz	%dh,%esi			# rk[11]>>8
   1457 		xor	%ebx,%eax
   1458 
   1459 		movzb	-128(%rbp,%rsi),%ebx
   1460 		shr	\$16,%edx
   1461 		shl	\$8,%ebx
   1462 		movz	%dl,%esi			# rk[11]>>16
   1463 		xor	%ebx,%eax
   1464 
   1465 		movzb	-128(%rbp,%rsi),%ebx
   1466 		movz	%dh,%esi			# rk[11]>>24
   1467 		shl	\$16,%ebx
   1468 		xor	%ebx,%eax
   1469 
   1470 		movzb	-128(%rbp,%rsi),%ebx
   1471 		shl	\$24,%ebx
   1472 		xor	%ebx,%eax
   1473 
   1474 		mov	%eax,48(%rdi)			# rk[12]
   1475 		xor	20(%rdi),%eax
   1476 		mov	%eax,52(%rdi)			# rk[13]
   1477 		xor	24(%rdi),%eax
   1478 		mov	%eax,56(%rdi)			# rk[14]
   1479 		xor	28(%rdi),%eax
   1480 		mov	%eax,60(%rdi)			# rk[15]
   1481 
   1482 		lea	32(%rdi),%rdi
   1483 	jmp	.L14loop
   1484 .L14break:
   1485 	movl	\$14,48(%rdi)		# setup number of rounds (%rdi moved 6*32, so this is key+240)
   1486 	xor	%rax,%rax
   1487 	jmp	.Lexit
   1488 
   1489 .Lbadpointer:
   1490 	mov	\$-1,%rax
   1491 .Lexit:
   1492 	.byte	0xf3,0xc3			# rep ret
   1493 .size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
   1494 ___
   1495 
   1496 sub deckey_ref()
   1497 { my ($i,$ptr,$te,$td) = @_;
   1498   my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
        # Appends code that transforms, in place, the 32-bit word at $i($ptr).
        # The word is treated as four packed GF(2^8) bytes x; the code derives
        # 2x, 4x and 8x and stores
        #   (8x^4x^2x) ^ ROTATE(x^8x,8) ^ ROTATE(2x^x^8x,24) ^ ROTATE(4x^x^8x,16)
        # i.e. the per-byte multipliers 0x0e, 0x09, 0x0b and 0x0d.
        #
        #   $i   - displacement of the word (spliced into the operand text)
        #   $ptr - base register holding the address
        #   $te, $td - accepted but not used by the body
        #
        # The emitted code clobbers %eax, %ebx, %edi, %edx and %r8d.
        # NOTE(review): the sub is declared with an empty prototype yet reads
        # four arguments, so it only works when called as &deckey_ref(...);
        # no call is visible in this part of the file.
   1499 $code.=<<___;
   1500 	mov	$i($ptr),$tp1
   1501 	mov	$tp1,$acc
   1502 	and	\$0x80808080,$acc	# top bit of every byte
   1503 	mov	$acc,$tp4
   1504 	shr	\$7,$tp4
   1505 	lea	0($tp1,$tp1),$tp2	# tp1<<1
   1506 	sub	$tp4,$acc		# 0x80 -> 0x7f in bytes that had the top bit
   1507 	and	\$0xfefefefe,$tp2	# drop bits shifted across byte borders
   1508 	and	\$0x1b1b1b1b,$acc	# 0x1b reduction for those bytes
   1509 	xor	$tp2,$acc
   1510 	mov	$acc,$tp2		# tp2 = 2*tp1, bytewise
   1511 
   1512 	and	\$0x80808080,$acc
   1513 	mov	$acc,$tp8
   1514 	shr	\$7,$tp8
   1515 	lea	0($tp2,$tp2),$tp4
   1516 	sub	$tp8,$acc
   1517 	and	\$0xfefefefe,$tp4
   1518 	and	\$0x1b1b1b1b,$acc
   1519 	 xor	$tp1,$tp2		# tp2^tp1
   1520 	xor	$tp4,$acc
   1521 	mov	$acc,$tp4		# tp4 = 4*tp1, bytewise
   1522 
   1523 	and	\$0x80808080,$acc
   1524 	mov	$acc,$tp8
   1525 	shr	\$7,$tp8
   1526 	sub	$tp8,$acc
   1527 	lea	0($tp4,$tp4),$tp8
   1528 	 xor	$tp1,$tp4		# tp4^tp1
   1529 	and	\$0xfefefefe,$tp8
   1530 	and	\$0x1b1b1b1b,$acc
   1531 	xor	$acc,$tp8		# tp8 = 8*tp1, bytewise
   1532 
   1533 	xor	$tp8,$tp1		# tp1^tp8
   1534 	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
   1535 	xor	$tp8,$tp2		# tp2^tp1^tp8
   1536 	xor	$tp8,$tp4		# tp4^tp1^tp8
   1537 	xor	$tp2,$tp8
   1538 	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
   1539 
   1540 	xor	$tp8,$tp1
   1541 	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
   1542 	xor	$tp2,$tp1
   1543 	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
   1544 	xor	$tp4,$tp1
   1545 
   1546 	mov	$tp1,$i($ptr)
   1547 ___
   1548 }
   1549 
   1550 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
   1551 #                        AES_KEY *key)
        #
        # Builds the decryption key schedule in three steps:
        #  1. run _x86_64_AES_set_encrypt_key on the same arguments; a non-zero
        #     result is returned to the caller as is;
        #  2. reverse the order of the 16-byte round keys (.Linvert);
        #  3. run dectransform over round keys 1 .. rounds-1 (.Lpermute),
        #     leaving the first and the last round key untouched.
        # Returns 0 in %rax on success.
   1552 $code.=<<___;
   1553 .globl	AES_set_decrypt_key
   1554 .type	AES_set_decrypt_key,\@function,3
   1555 .align	16
   1556 AES_set_decrypt_key:
   1557 	push	%rbx
   1558 	push	%rbp
   1559 	push	%r12
   1560 	push	%r13
   1561 	push	%r14
   1562 	push	%r15
   1563 	push	%rdx			# save key schedule
   1564 .Ldec_key_prologue:
   1565 
   1566 	call	_x86_64_AES_set_encrypt_key
   1567 	mov	(%rsp),%r8		# restore key schedule
   1568 	cmp	\$0,%eax
   1569 	jne	.Labort			# pass the error code through
   1570 
   1571 	mov	240(%r8),%r14d		# pull number of rounds
   1572 	xor	%rdi,%rdi
   1573 	lea	(%rdi,%r14d,4),%rcx	# 4*rounds
   1574 	mov	%r8,%rsi
   1575 	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
   1576 .align	4
   1577 .Linvert:				# swap 16-byte round keys from both ends
   1578 		mov	0(%rsi),%rax
   1579 		mov	8(%rsi),%rbx
   1580 		mov	0(%rdi),%rcx
   1581 		mov	8(%rdi),%rdx
   1582 		mov	%rax,0(%rdi)
   1583 		mov	%rbx,8(%rdi)
   1584 		mov	%rcx,0(%rsi)
   1585 		mov	%rdx,8(%rsi)
   1586 		lea	16(%rsi),%rsi
   1587 		lea	-16(%rdi),%rdi
   1588 		cmp	%rsi,%rdi		# stop when the pointers meet
   1589 	jne	.Linvert
   1590 
   1591 	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
   1592 
        	# NOTE(review): presumably the byte masks kept after the rcon
        	# words; their definition is outside this part of the file
   1593 	mov	40(%rax),$mask80
   1594 	mov	48(%rax),$maskfe
   1595 	mov	56(%rax),$mask1b
   1596 
   1597 	mov	%r8,$key
   1598 	sub	\$1,%r14d		# rounds-1 keys to transform
   1599 .align	4
   1600 .Lpermute:
   1601 		lea	16($key),$key	# pre-increment, so key 0 is skipped
   1602 		mov	0($key),%rax
   1603 		mov	8($key),%rcx
   1604 ___
   1605 		&dectransform ();	# called without the argument used by the compact decrypt loop
   1606 $code.=<<___;
   1607 		mov	%eax,0($key)
   1608 		mov	%ebx,4($key)
   1609 		mov	%ecx,8($key)
   1610 		mov	%edx,12($key)
   1611 		sub	\$1,%r14d
   1612 	jnz	.Lpermute
   1613 
   1614 	xor	%rax,%rax
   1615 .Labort:
   1616 	mov	8(%rsp),%r15		# 0(%rsp) is the pushed key pointer
   1617 	mov	16(%rsp),%r14
   1618 	mov	24(%rsp),%r13
   1619 	mov	32(%rsp),%r12
   1620 	mov	40(%rsp),%rbp
   1621 	mov	48(%rsp),%rbx
   1622 	add	\$56,%rsp
   1623 .Ldec_key_epilogue:
   1624 	ret
   1625 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
   1626 ___
   1627 
   1628 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
   1629 #			size_t length, const AES_KEY *key,
   1630 #			unsigned char *ivp,const int enc);
   1631 {
   1632 # stack frame layout
   1633 # -8(%rsp)		return address
   1634 my $keyp="0(%rsp)";		# one to pass as $key
   1635 my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
   1636 my $_rsp="16(%rsp)";		# saved %rsp
   1637 my $_inp="24(%rsp)";		# copy of 1st parameter, inp
   1638 my $_out="32(%rsp)";		# copy of 2nd parameter, out
   1639 my $_len="40(%rsp)";		# copy of 3rd parameter, length
   1640 my $_key="48(%rsp)";		# copy of 4th parameter, key
   1641 my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
   1642 my $ivec="64(%rsp)";		# ivec[16]
   1643 my $aes_key="80(%rsp)";		# copy of aes_key
   1644 my $mark="80+240(%rsp)";	# copy of aes_key->rounds
   1645 
        # Entry and fast-path setup. A zero length returns immediately. The fast
        # (full table) path is taken only when length >= $speed_limit, length is
        # a multiple of 16 and bit 28 of OPENSSL_ia32cap_P is clear; every other
        # case goes to .Lcbc_slow_prologue. $mark doubles as the "key schedule
        # was copied to the stack" flag tested in .Lcbc_fast_cleanup.
   1646 $code.=<<___;
   1647 .globl	AES_cbc_encrypt
   1648 .type	AES_cbc_encrypt,\@function,6
   1649 .align	16
   1650 .extern	OPENSSL_ia32cap_P
   1651 AES_cbc_encrypt:
   1652 	cmp	\$0,%rdx	# check length
   1653 	je	.Lcbc_epilogue
   1654 	pushfq
   1655 	push	%rbx
   1656 	push	%rbp
   1657 	push	%r12
   1658 	push	%r13
   1659 	push	%r14
   1660 	push	%r15
   1661 .Lcbc_prologue:
   1662 
   1663 	cld
   1664 	mov	%r9d,%r9d	# clear upper half of enc
   1665 
   1666 	lea	.LAES_Te(%rip),$sbox
   1667 	cmp	\$0,%r9
   1668 	jne	.Lcbc_picked_te
   1669 	lea	.LAES_Td(%rip),$sbox	# enc==0: decrypting
   1670 .Lcbc_picked_te:
   1671 
   1672 	mov	OPENSSL_ia32cap_P(%rip),%r10d
   1673 	cmp	\$$speed_limit,%rdx
   1674 	jb	.Lcbc_slow_prologue
   1675 	test	\$15,%rdx
   1676 	jnz	.Lcbc_slow_prologue
   1677 	bt	\$28,%r10d	# NOTE(review): presumably the hyper-threading capability bit; confirm
   1678 	jc	.Lcbc_slow_prologue
   1679 
   1680 	# allocate aligned stack frame...
   1681 	lea	-88-248(%rsp),$key
   1682 	and	\$-64,$key
   1683 
   1684 	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
   1685 	mov	$sbox,%r10
   1686 	lea	2304($sbox),%r11
   1687 	mov	$key,%r12
   1688 	and	\$0xFFF,%r10	# s = $sbox&0xfff
   1689 	and	\$0xFFF,%r11	# e = ($sbox+2304)&0xfff
   1690 	and	\$0xFFF,%r12	# p = %rsp&0xfff
   1691 
   1692 	cmp	%r11,%r12	# if (p>=e) %rsp -= (p-e);
   1693 	jb	.Lcbc_te_break_out
   1694 	sub	%r11,%r12
   1695 	sub	%r12,$key
   1696 	jmp	.Lcbc_te_ok
   1697 .Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
   1698 	sub	%r10,%r12
   1699 	and	\$0xFFF,%r12
   1700 	add	\$320,%r12
   1701 	sub	%r12,$key
   1702 .align	4
   1703 .Lcbc_te_ok:
   1704 
   1705 	xchg	%rsp,$key
   1706 	#add	\$8,%rsp	# reserve for return address!
   1707 	mov	$key,$_rsp	# save %rsp
   1708 .Lcbc_fast_body:
   1709 	mov	%rdi,$_inp	# save copy of inp
   1710 	mov	%rsi,$_out	# save copy of out
   1711 	mov	%rdx,$_len	# save copy of len
   1712 	mov	%rcx,$_key	# save copy of key
   1713 	mov	%r8,$_ivp	# save copy of ivp
   1714 	movl	\$0,$mark	# copy of aes_key->rounds = 0;
   1715 	mov	%r8,%rbp	# rearrange input arguments
   1716 	mov	%r9,%rbx
   1717 	mov	%rsi,$out
   1718 	mov	%rdi,$inp
   1719 	mov	%rcx,$key
   1720 
   1721 	mov	240($key),%eax		# key->rounds
   1722 	# do we copy key schedule to stack?
   1723 	mov	$key,%r10
   1724 	sub	$sbox,%r10
   1725 	and	\$0xfff,%r10
   1726 	cmp	\$2304,%r10	# copy if the key lies within 2304 bytes after the table, mod 4096
   1727 	jb	.Lcbc_do_ecopy
   1728 	cmp	\$4096-248,%r10	# ... or within 248 bytes before it
   1729 	jb	.Lcbc_skip_ecopy
   1730 .align	4
   1731 .Lcbc_do_ecopy:
   1732 		mov	$key,%rsi
   1733 		lea	$aes_key,%rdi
   1734 		lea	$aes_key,$key
   1735 		mov	\$240/8,%ecx
   1736 		.long	0x90A548F3	# rep movsq
   1737 		mov	%eax,(%rdi)	# copy aes_key->rounds (non-zero, so it also marks the copy)
   1738 .Lcbc_skip_ecopy:
   1739 	mov	$key,$keyp	# save key pointer
   1740 
   1741 	mov	\$18,%ecx	# 18 passes of 128 bytes = 2304 bytes
   1742 .align	4
   1743 .Lcbc_prefetch_te:
   1744 		mov	0($sbox),%r10
   1745 		mov	32($sbox),%r11
   1746 		mov	64($sbox),%r12
   1747 		mov	96($sbox),%r13
   1748 		lea	128($sbox),$sbox
   1749 		sub	\$1,%ecx
   1750 	jnz	.Lcbc_prefetch_te
   1751 	lea	-2304($sbox),$sbox	# back to the start of the table
   1752 
   1753 	cmp	\$0,%rbx
   1754 	je	.LFAST_DECRYPT
   1755 
   1756 #----------------------------- ENCRYPT -----------------------------#
        	# Fast CBC encrypt: the running chaining value stays in the four
        	# state registers, is xored with each input block, encrypted and
        	# written out; after the last block it is stored back through ivp.
   1757 	mov	0(%rbp),$s0		# load iv
   1758 	mov	4(%rbp),$s1
   1759 	mov	8(%rbp),$s2
   1760 	mov	12(%rbp),$s3
   1761 
   1762 .align	4
   1763 .Lcbc_fast_enc_loop:
   1764 		xor	0($inp),$s0
   1765 		xor	4($inp),$s1
   1766 		xor	8($inp),$s2
   1767 		xor	12($inp),$s3
   1768 		mov	$keyp,$key	# restore key
   1769 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1770 
   1771 		call	_x86_64_AES_encrypt
   1772 
   1773 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1774 		mov	$_len,%r10
   1775 		mov	$s0,0($out)
   1776 		mov	$s1,4($out)
   1777 		mov	$s2,8($out)
   1778 		mov	$s3,12($out)
   1779 
   1780 		lea	16($inp),$inp
   1781 		lea	16($out),$out
   1782 		sub	\$16,%r10
   1783 		test	\$-16,%r10	# any whole block left? (mov below keeps the flags)
   1784 		mov	%r10,$_len
   1785 	jnz	.Lcbc_fast_enc_loop
   1786 	mov	$_ivp,%rbp	# restore ivp
   1787 	mov	$s0,0(%rbp)	# save ivec
   1788 	mov	$s1,4(%rbp)
   1789 	mov	$s2,8(%rbp)
   1790 	mov	$s3,12(%rbp)
   1791 
   1792 	jmp	.Lcbc_fast_cleanup
   1793 
   1794 #----------------------------- DECRYPT -----------------------------#
        	# Fast CBC decrypt. Two variants:
        	#  - out != inp: the ivec slot holds a POINTER to the current iv
        	#    (first the user's iv, then the previous ciphertext block);
        	#  - out == inp: the ivec slot holds a 16-byte COPY of the iv,
        	#    because the output overwrites the ciphertext.
   1795 .align	16
   1796 .LFAST_DECRYPT:
   1797 	cmp	$inp,$out
   1798 	je	.Lcbc_fast_dec_in_place
   1799 
   1800 	mov	%rbp,$ivec	# pointer to iv
   1801 .align	4
   1802 .Lcbc_fast_dec_loop:
   1803 		mov	0($inp),$s0	# read input
   1804 		mov	4($inp),$s1
   1805 		mov	8($inp),$s2
   1806 		mov	12($inp),$s3
   1807 		mov	$keyp,$key	# restore key
   1808 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1809 
   1810 		call	_x86_64_AES_decrypt
   1811 
   1812 		mov	$ivec,%rbp	# load ivp
   1813 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1814 		mov	$_len,%r10	# load len
   1815 		xor	0(%rbp),$s0	# xor iv
   1816 		xor	4(%rbp),$s1
   1817 		xor	8(%rbp),$s2
   1818 		xor	12(%rbp),$s3
   1819 		mov	$inp,%rbp	# current input, next iv
   1820 
   1821 		sub	\$16,%r10	# sets ZF for the jnz below; only mov/lea follow
   1822 		mov	%r10,$_len	# update len
   1823 		mov	%rbp,$ivec	# update ivp
   1824 
   1825 		mov	$s0,0($out)	# write output
   1826 		mov	$s1,4($out)
   1827 		mov	$s2,8($out)
   1828 		mov	$s3,12($out)
   1829 
   1830 		lea	16($inp),$inp
   1831 		lea	16($out),$out
   1832 	jnz	.Lcbc_fast_dec_loop
   1833 	mov	$_ivp,%r12		# load user ivp
   1834 	mov	0(%rbp),%r10		# load iv
   1835 	mov	8(%rbp),%r11
   1836 	mov	%r10,0(%r12)		# copy back to user
   1837 	mov	%r11,8(%r12)
   1838 	jmp	.Lcbc_fast_cleanup
   1839 
   1840 .align	16
   1841 .Lcbc_fast_dec_in_place:
   1842 	mov	0(%rbp),%r10		# copy iv to stack
   1843 	mov	8(%rbp),%r11
   1844 	mov	%r10,0+$ivec
   1845 	mov	%r11,8+$ivec
   1846 .align	4
   1847 .Lcbc_fast_dec_in_place_loop:
   1848 		mov	0($inp),$s0	# load input
   1849 		mov	4($inp),$s1
   1850 		mov	8($inp),$s2
   1851 		mov	12($inp),$s3
   1852 		mov	$keyp,$key	# restore key
   1853 		mov	$inp,$_inp	# if ($verticalspin) save inp
   1854 
   1855 		call	_x86_64_AES_decrypt
   1856 
   1857 		mov	$_inp,$inp	# if ($verticalspin) restore inp
   1858 		mov	$_len,%r10
   1859 		xor	0+$ivec,$s0
   1860 		xor	4+$ivec,$s1
   1861 		xor	8+$ivec,$s2
   1862 		xor	12+$ivec,$s3
   1863 
   1864 		mov	0($inp),%r11	# load input (the next iv) before it is overwritten
   1865 		mov	8($inp),%r12
   1866 		sub	\$16,%r10
   1867 		jz	.Lcbc_fast_dec_in_place_done
   1868 
   1869 		mov	%r11,0+$ivec	# copy input to iv
   1870 		mov	%r12,8+$ivec
   1871 
   1872 		mov	$s0,0($out)	# save output [zaps input]
   1873 		mov	$s1,4($out)
   1874 		mov	$s2,8($out)
   1875 		mov	$s3,12($out)
   1876 
   1877 		lea	16($inp),$inp
   1878 		lea	16($out),$out
   1879 		mov	%r10,$_len
   1880 	jmp	.Lcbc_fast_dec_in_place_loop
   1881 .Lcbc_fast_dec_in_place_done:
   1882 	mov	$_ivp,%rdi
   1883 	mov	%r11,0(%rdi)	# copy iv back to user
   1884 	mov	%r12,8(%rdi)
   1885 
   1886 	mov	$s0,0($out)	# save output [zaps input]
   1887 	mov	$s1,4($out)
   1888 	mov	$s2,8($out)
   1889 	mov	$s3,12($out)
   1890 
   1891 .align	4
   1892 .Lcbc_fast_cleanup:
   1893 	cmpl	\$0,$mark	# was the key schedule copied?
   1894 	lea	$aes_key,%rdi	# lea leaves the flags from cmpl intact
   1895 	je	.Lcbc_exit
   1896 		mov	\$240/8,%ecx	# wipe the on-stack key copy
   1897 		xor	%rax,%rax
   1898 		.long	0x90AB48F3	# rep stosq
   1899 
   1900 	jmp	.Lcbc_exit
   1901 
   1902 #--------------------------- SLOW ROUTINE ---------------------------#
        	# Slow path setup, using the compact routines. Only the slots
        	# for keyp, keyend, saved %rsp, ivp and (in the loops) inp/out/len
        	# and ivec are used; the length is carried in %r10.
   1903 .align	16
   1904 .Lcbc_slow_prologue:
   1905 	# allocate aligned stack frame...
   1906 	lea	-88(%rsp),%rbp
   1907 	and	\$-64,%rbp
   1908 	# ... just "above" key schedule
   1909 	lea	-88-63(%rcx),%r10
   1910 	sub	%rbp,%r10
   1911 	neg	%r10
   1912 	and	\$0x3c0,%r10
   1913 	sub	%r10,%rbp
   1914 
   1915 	xchg	%rsp,%rbp
   1916 	#add	\$8,%rsp	# reserve for return address!
   1917 	mov	%rbp,$_rsp	# save %rsp
   1918 .Lcbc_slow_body:
   1919 	#mov	%rdi,$_inp	# save copy of inp
   1920 	#mov	%rsi,$_out	# save copy of out
   1921 	#mov	%rdx,$_len	# save copy of len
   1922 	#mov	%rcx,$_key	# save copy of key
   1923 	mov	%r8,$_ivp	# save copy of ivp
   1924 	mov	%r8,%rbp	# rearrange input arguments
   1925 	mov	%r9,%rbx
   1926 	mov	%rsi,$out
   1927 	mov	%rdi,$inp
   1928 	mov	%rcx,$key
   1929 	mov	%rdx,%r10	# length
   1930 
   1931 	mov	240($key),%eax	# key->rounds
   1932 	mov	$key,$keyp	# save key pointer
   1933 	shl	\$4,%eax	# 16 bytes per round key
   1934 	lea	($key,%rax),%rax
   1935 	mov	%rax,$keyend
   1936 
   1937 	# pick Te4 copy which can't "overlap" with stack frame or key schedule
   1938 	lea	2048($sbox),$sbox
   1939 	lea	768-8(%rsp),%rax
   1940 	sub	$sbox,%rax
   1941 	and	\$0x300,%rax	# 0, 256, 512 or 768
   1942 	lea	($sbox,%rax),$sbox
   1943 
   1944 	cmp	\$0,%rbx
   1945 	je	.LSLOW_DECRYPT
   1946 
   1947 #--------------------------- SLOW ENCRYPT ---------------------------#
        	# Slow CBC encrypt. Whole blocks go through the loop; a trailing
        	# partial block is copied to out, zero padded to 16 bytes and then
        	# encrypted in place by one more pass of the loop.
   1948 	test	\$-16,%r10		# check upon length
   1949 	mov	0(%rbp),$s0		# load iv
   1950 	mov	4(%rbp),$s1
   1951 	mov	8(%rbp),$s2
   1952 	mov	12(%rbp),$s3
   1953 	jz	.Lcbc_slow_enc_tail	# short input...
   1954 
   1955 .align	4
   1956 .Lcbc_slow_enc_loop:
   1957 		xor	0($inp),$s0
   1958 		xor	4($inp),$s1
   1959 		xor	8($inp),$s2
   1960 		xor	12($inp),$s3
   1961 		mov	$keyp,$key	# restore key
   1962 		mov	$inp,$_inp	# save inp
   1963 		mov	$out,$_out	# save out
   1964 		mov	%r10,$_len	# save len
   1965 
   1966 		call	_x86_64_AES_encrypt_compact
   1967 
   1968 		mov	$_inp,$inp	# restore inp
   1969 		mov	$_out,$out	# restore out
   1970 		mov	$_len,%r10	# restore len
   1971 		mov	$s0,0($out)
   1972 		mov	$s1,4($out)
   1973 		mov	$s2,8($out)
   1974 		mov	$s3,12($out)
   1975 
   1976 		lea	16($inp),$inp
   1977 		lea	16($out),$out
   1978 		sub	\$16,%r10
   1979 		test	\$-16,%r10	# another whole block?
   1980 	jnz	.Lcbc_slow_enc_loop
   1981 	test	\$15,%r10		# leftover bytes?
   1982 	jnz	.Lcbc_slow_enc_tail
   1983 	mov	$_ivp,%rbp	# restore ivp
   1984 	mov	$s0,0(%rbp)	# save ivec
   1985 	mov	$s1,4(%rbp)
   1986 	mov	$s2,8(%rbp)
   1987 	mov	$s3,12(%rbp)
   1988 
   1989 	jmp	.Lcbc_exit
   1990 
   1991 .align	4
   1992 .Lcbc_slow_enc_tail:
   1993 	mov	%rax,%r11		# keep %rax and %rcx: the string
   1994 	mov	%rcx,%r12		# operations below overwrite them
   1995 	mov	%r10,%rcx
   1996 	mov	$inp,%rsi
   1997 	mov	$out,%rdi
   1998 	.long	0x9066A4F3		# rep movsb
   1999 	mov	\$16,%rcx		# zero tail
   2000 	sub	%r10,%rcx
   2001 	xor	%rax,%rax
   2002 	.long	0x9066AAF3		# rep stosb
   2003 	mov	$out,$inp		# this is not a mistake!
   2004 	mov	\$16,%r10		# len=16
   2005 	mov	%r11,%rax
   2006 	mov	%r12,%rcx
   2007 	jmp	.Lcbc_slow_enc_loop	# one more spin...
   2008 #--------------------------- SLOW DECRYPT ---------------------------#
        	# Slow CBC decrypt. The ivec slot always holds a 16-byte copy of the
        	# iv. Each pass decrypts one block; when fewer than 16 bytes of
        	# length remain, only that many plaintext bytes are written out.
        	# .Lcbc_exit is the common exit for every path that built a frame.
   2009 .align	16
   2010 .LSLOW_DECRYPT:
   2011 	shr	\$3,%rax
   2012 	add	%rax,$sbox		# recall "magic" constants!
   2013 
   2014 	mov	0(%rbp),%r11		# copy iv to stack
   2015 	mov	8(%rbp),%r12
   2016 	mov	%r11,0+$ivec
   2017 	mov	%r12,8+$ivec
   2018 
   2019 .align	4
   2020 .Lcbc_slow_dec_loop:
   2021 		mov	0($inp),$s0	# load input
   2022 		mov	4($inp),$s1
   2023 		mov	8($inp),$s2
   2024 		mov	12($inp),$s3
   2025 		mov	$keyp,$key	# restore key
   2026 		mov	$inp,$_inp	# save inp
   2027 		mov	$out,$_out	# save out
   2028 		mov	%r10,$_len	# save len
   2029 
   2030 		call	_x86_64_AES_decrypt_compact
   2031 
   2032 		mov	$_inp,$inp	# restore inp
   2033 		mov	$_out,$out	# restore out
   2034 		mov	$_len,%r10
   2035 		xor	0+$ivec,$s0
   2036 		xor	4+$ivec,$s1
   2037 		xor	8+$ivec,$s2
   2038 		xor	12+$ivec,$s3
   2039 
   2040 		mov	0($inp),%r11	# load input
   2041 		mov	8($inp),%r12
   2042 		sub	\$16,%r10
   2043 		jc	.Lcbc_slow_dec_partial	# fewer than 16 bytes were left
   2044 		jz	.Lcbc_slow_dec_done	# that was the last block
   2045 
   2046 		mov	%r11,0+$ivec	# copy input to iv
   2047 		mov	%r12,8+$ivec
   2048 
   2049 		mov	$s0,0($out)	# save output [can zap input]
   2050 		mov	$s1,4($out)
   2051 		mov	$s2,8($out)
   2052 		mov	$s3,12($out)
   2053 
   2054 		lea	16($inp),$inp
   2055 		lea	16($out),$out
   2056 	jmp	.Lcbc_slow_dec_loop
   2057 .Lcbc_slow_dec_done:
   2058 	mov	$_ivp,%rdi
   2059 	mov	%r11,0(%rdi)		# copy iv back to user
   2060 	mov	%r12,8(%rdi)
   2061 
   2062 	mov	$s0,0($out)		# save output [can zap input]
   2063 	mov	$s1,4($out)
   2064 	mov	$s2,8($out)
   2065 	mov	$s3,12($out)
   2066 
   2067 	jmp	.Lcbc_exit
   2068 
   2069 .align	4
   2070 .Lcbc_slow_dec_partial:
   2071 	mov	$_ivp,%rdi
   2072 	mov	%r11,0(%rdi)		# copy iv back to user
   2073 	mov	%r12,8(%rdi)
   2074 
   2075 	mov	$s0,0+$ivec		# save output to stack
   2076 	mov	$s1,4+$ivec
   2077 	mov	$s2,8+$ivec
   2078 	mov	$s3,12+$ivec
   2079 
   2080 	mov	$out,%rdi
   2081 	lea	$ivec,%rsi
   2082 	lea	16(%r10),%rcx	# %r10 is negative here: bytes that were left
   2083 	.long	0x9066A4F3	# rep movsb
   2084 	jmp	.Lcbc_exit
   2085 
   2086 .align	16
   2087 .Lcbc_exit:
   2088 	mov	$_rsp,%rsi	# %rsp as it was after the pushes
   2089 	mov	(%rsi),%r15
   2090 	mov	8(%rsi),%r14
   2091 	mov	16(%rsi),%r13
   2092 	mov	24(%rsi),%r12
   2093 	mov	32(%rsi),%rbp
   2094 	mov	40(%rsi),%rbx
   2095 	lea	48(%rsi),%rsp
   2096 .Lcbc_popfq:
   2097 	popfq
   2098 .Lcbc_epilogue:
   2099 	ret
   2100 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
   2101 ___
   2102 }
   2103 
   2104 $code.=<<___;
   2105 .align	64
   2106 .LAES_Te:
   2107 ___
   2108 	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
   2109 	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
   2110 	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
   2111 	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
   2112 	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
   2113 	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
   2114 	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
   2115 	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
   2116 	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
   2117 	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
   2118 	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
   2119 	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
   2120 	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
   2121 	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
   2122 	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
   2123 	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
   2124 	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
   2125 	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
   2126 	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
   2127 	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
   2128 	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
   2129 	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
   2130 	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
   2131 	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
   2132 	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
   2133 	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
   2134 	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
   2135 	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
   2136 	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
   2137 	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
   2138 	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
   2139 	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
   2140 	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
   2141 	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
   2142 	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
   2143 	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
   2144 	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
   2145 	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
   2146 	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
   2147 	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
   2148 	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
   2149 	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
   2150 	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
   2151 	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
   2152 	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
   2153 	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
   2154 	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
   2155 	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
   2156 	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
   2157 	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
   2158 	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
   2159 	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
   2160 	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
   2161 	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
   2162 	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
   2163 	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
   2164 	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
   2165 	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
   2166 	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
   2167 	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
   2168 	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
   2169 	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
   2170 	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
   2171 	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
   2172 
   # Te4 byte table (256 entries; the values are the AES S-box).  It is
   # emitted four times back to back: four copies of Te4 to choose from to
   # avoid L1 aliasing.  The bytes are listed once and replayed, so every
   # &data_byte call still receives the same eight values, in the same
   # order, as when each copy was spelled out by hand.  The &-call form is
   # kept deliberately: it bypasses any prototype data_byte may declare
   # (its definition is not in this part of the file).
   my @Te4_bytes = (
       0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
       0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
       0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
       0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
       0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
       0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
       0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
       0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
       0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
       0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
       0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
       0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
       0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
       0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
       0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
       0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
       0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
       0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
       0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
       0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
       0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
       0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
       0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
       0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
       0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
       0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
       0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
       0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
       0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
       0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
       0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
       0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
   );

   for my $copy (1 .. 4) {
       # Work on a scratch copy so the master list survives all four passes.
       my @pending = @Te4_bytes;
       while (my @row = splice(@pending, 0, 8)) {
           &data_byte(@row);
       }
   }

   # rcon: emitted straight after the fourth Te4 copy, without a label of
   # its own.  Ten round constants (0x01 .. 0x36) followed by three pairs of
   # replicated-byte words (0x80808080, 0xfefefefe, 0x1b1b1b1b) --
   # presumably masks for the packed-byte arithmetic elsewhere in the file;
   # confirm against the code that loads them.
   $code.=<<___;
	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
___
   # .LAES_Td: the label, aligned to 64 bytes, followed by 256 32-bit table
   # words.  The words are kept in one flat list and handed to &_data_word
   # four at a time, i.e. the same 64 calls with the same arguments in the
   # same order as before.  The &-call form is kept deliberately: it
   # bypasses any prototype _data_word may declare (its definition is not
   # in this part of the file).
   $code.=<<___;
.align	64
.LAES_Td:
___
   my @Td_words = (
       0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
       0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
       0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5,
       0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5,
       0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d,
       0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b,
       0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295,
       0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e,
       0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927,
       0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d,
       0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362,
       0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9,
       0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52,
       0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566,
       0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3,
       0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed,
       0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e,
       0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4,
       0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4,
       0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd,
       0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d,
       0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060,
       0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967,
       0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879,
       0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000,
       0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c,
       0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36,
       0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624,
       0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b,
       0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c,
       0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12,
       0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14,
       0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3,
       0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b,
       0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8,
       0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684,
       0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7,
       0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177,
       0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947,
       0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322,
       0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498,
       0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f,
       0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54,
       0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382,
       0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf,
       0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb,
       0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83,
       0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef,
       0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029,
       0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235,
       0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733,
       0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117,
       0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4,
       0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546,
       0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb,
       0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d,
       0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb,
       0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a,
       0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773,
       0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478,
       0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2,
       0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff,
       0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664,
       0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0,
   );

   # splice() hands back the next four words; the list assignment is false
   # only once the list is exhausted, so a zero-valued word is not a stop.
   while (my @quad = splice(@Td_words, 0, 4)) {
       &_data_word(@quad);
   }
   2380 
   # Td4 byte table (256 entries; the values are the AES inverse S-box).
   # Four copies of Td4 to choose from to avoid L1 aliasing.  Each copy is
   # followed by the same eight .long words; after the fourth copy the
   # identification string and a closing 64-byte alignment are appended.
   # The text accumulated in $code is exactly what the hand-unrolled form
   # produced.  The &-call form is kept deliberately: it bypasses any
   # prototype data_byte may declare (its definition is not in this part of
   # the file).
   my @Td4_bytes = (
       0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
       0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
       0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
       0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
       0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
       0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
       0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
       0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
       0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
       0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
       0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
       0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
       0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
       0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
       0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
       0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
       0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
       0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
       0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
       0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
       0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
       0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
       0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
       0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
       0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
       0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
       0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
       0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
       0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
       0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
       0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
       0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
   );

   for my $copy (1 .. 4) {
       # Work on a scratch copy so the master list survives all four passes.
       my @pending = @Td4_bytes;
       while (my @row = splice(@pending, 0, 8)) {
           &data_byte(@row);
       }
       # Words emitted after every copy of the table.
       $code.=<<___;
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
___
   }

   # Trailer that follows the last copy only.
   $code.=<<___;
.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
   2528 
   2529 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
   2530 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
        #
        # Win64-only structured exception handling support; appended to $code
        # only when $win64 is set.  The heredoc below emits three handlers with
        # the prototype shown above:
        #
        #   block_se_handler - named by the .xdata records for AES_encrypt and
        #                      AES_decrypt
        #   key_se_handler   - named by the records for AES_set_encrypt_key and
        #                      AES_set_decrypt_key
        #   cbc_se_handler   - named by the record for AES_cbc_encrypt
        #
        # block_se_handler and key_se_handler take two RVAs (prologue label and
        # epilogue label) from disp->HandlerData and add disp->ImageBase to
        # them; cbc_se_handler compares context->Rip against the fixed .Lcbc_*
        # labels instead.  When Rip lies inside the function body the handler
        # reloads rbx, rbp and r12-r15 from the function's stack frame and
        # stores them into the CONTEXT; in every case it then stores rsi, rdi
        # and the recovered stack pointer into the CONTEXT.
        #
        # All three handlers finish at .Lcommon_seh_exit, which is placed inside
        # cbc_se_handler: the CONTEXT is copied to disp->ContextRecord,
        # RtlVirtualUnwind is called, and 1 (ExceptionContinueSearch) is
        # returned.
        #
        # The .pdata section lists begin/end/info RVAs for the five public
        # functions; the .xdata section holds the matching info records (handler
        # RVA plus, for the block and key handlers, the HandlerData[] label
        # pair).
        #
        # Heredoc notes: the text is interpolated, hence the \$ and \@ escapes;
        # spans written between backticks (16-8, 1232/8) are left in $code here
        # and evaluated by the eval substitution applied to $code at the end of
        # the file.
   2531 if ($win64) {
        # Registers holding the four handler arguments, in prototype order.
        # $rec and $frame are set for completeness; the code below refers only
        # to $context and $disp.
   2532 $rec="%rcx";
   2533 $frame="%rdx";
   2534 $context="%r8";
   2535 $disp="%r9";
   2536 
   2537 $code.=<<___;
   2538 .extern	__imp_RtlVirtualUnwind
   2539 .type	block_se_handler,\@abi-omnipotent
   2540 .align	16
   2541 block_se_handler:
   2542 	push	%rsi
   2543 	push	%rdi
   2544 	push	%rbx
   2545 	push	%rbp
   2546 	push	%r12
   2547 	push	%r13
   2548 	push	%r14
   2549 	push	%r15
   2550 	pushfq
   2551 	sub	\$64,%rsp
   2552 
   2553 	mov	120($context),%rax	# pull context->Rax
   2554 	mov	248($context),%rbx	# pull context->Rip
   2555 
   2556 	mov	8($disp),%rsi		# disp->ImageBase
   2557 	mov	56($disp),%r11		# disp->HandlerData
   2558 
   2559 	mov	0(%r11),%r10d		# HandlerData[0]
   2560 	lea	(%rsi,%r10),%r10	# prologue label
   2561 	cmp	%r10,%rbx		# context->Rip<prologue label
   2562 	jb	.Lin_block_prologue
   2563 
   2564 	mov	152($context),%rax	# pull context->Rsp
   2565 
   2566 	mov	4(%r11),%r10d		# HandlerData[1]
   2567 	lea	(%rsi,%r10),%r10	# epilogue label
   2568 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2569 	jae	.Lin_block_prologue
   2570 
   2571 	mov	24(%rax),%rax		# pull saved real stack pointer
   2572 	lea	48(%rax),%rax		# adjust...
   2573 
   2574 	mov	-8(%rax),%rbx
   2575 	mov	-16(%rax),%rbp
   2576 	mov	-24(%rax),%r12
   2577 	mov	-32(%rax),%r13
   2578 	mov	-40(%rax),%r14
   2579 	mov	-48(%rax),%r15
   2580 	mov	%rbx,144($context)	# restore context->Rbx
   2581 	mov	%rbp,160($context)	# restore context->Rbp
   2582 	mov	%r12,216($context)	# restore context->R12
   2583 	mov	%r13,224($context)	# restore context->R13
   2584 	mov	%r14,232($context)	# restore context->R14
   2585 	mov	%r15,240($context)	# restore context->R15
   2586 
   2587 .Lin_block_prologue:
   2588 	mov	8(%rax),%rdi
   2589 	mov	16(%rax),%rsi
   2590 	mov	%rax,152($context)	# restore context->Rsp
   2591 	mov	%rsi,168($context)	# restore context->Rsi
   2592 	mov	%rdi,176($context)	# restore context->Rdi
   2593 
   2594 	jmp	.Lcommon_seh_exit
   2595 .size	block_se_handler,.-block_se_handler
   2596 
   2597 .type	key_se_handler,\@abi-omnipotent
   2598 .align	16
   2599 key_se_handler:
   2600 	push	%rsi
   2601 	push	%rdi
   2602 	push	%rbx
   2603 	push	%rbp
   2604 	push	%r12
   2605 	push	%r13
   2606 	push	%r14
   2607 	push	%r15
   2608 	pushfq
   2609 	sub	\$64,%rsp
   2610 
   2611 	mov	120($context),%rax	# pull context->Rax
   2612 	mov	248($context),%rbx	# pull context->Rip
   2613 
   2614 	mov	8($disp),%rsi		# disp->ImageBase
   2615 	mov	56($disp),%r11		# disp->HandlerData
   2616 
   2617 	mov	0(%r11),%r10d		# HandlerData[0]
   2618 	lea	(%rsi,%r10),%r10	# prologue label
   2619 	cmp	%r10,%rbx		# context->Rip<prologue label
   2620 	jb	.Lin_key_prologue
   2621 
   2622 	mov	152($context),%rax	# pull context->Rsp
   2623 
   2624 	mov	4(%r11),%r10d		# HandlerData[1]
   2625 	lea	(%rsi,%r10),%r10	# epilogue label
   2626 	cmp	%r10,%rbx		# context->Rip>=epilogue label
   2627 	jae	.Lin_key_prologue
   2628 
   2629 	lea	56(%rax),%rax
   2630 
   2631 	mov	-8(%rax),%rbx
   2632 	mov	-16(%rax),%rbp
   2633 	mov	-24(%rax),%r12
   2634 	mov	-32(%rax),%r13
   2635 	mov	-40(%rax),%r14
   2636 	mov	-48(%rax),%r15
   2637 	mov	%rbx,144($context)	# restore context->Rbx
   2638 	mov	%rbp,160($context)	# restore context->Rbp
   2639 	mov	%r12,216($context)	# restore context->R12
   2640 	mov	%r13,224($context)	# restore context->R13
   2641 	mov	%r14,232($context)	# restore context->R14
   2642 	mov	%r15,240($context)	# restore context->R15
   2643 
   2644 .Lin_key_prologue:
   2645 	mov	8(%rax),%rdi
   2646 	mov	16(%rax),%rsi
   2647 	mov	%rax,152($context)	# restore context->Rsp
   2648 	mov	%rsi,168($context)	# restore context->Rsi
   2649 	mov	%rdi,176($context)	# restore context->Rdi
   2650 
   2651 	jmp	.Lcommon_seh_exit
   2652 .size	key_se_handler,.-key_se_handler
   2653 
   2654 .type	cbc_se_handler,\@abi-omnipotent
   2655 .align	16
   2656 cbc_se_handler:
   2657 	push	%rsi
   2658 	push	%rdi
   2659 	push	%rbx
   2660 	push	%rbp
   2661 	push	%r12
   2662 	push	%r13
   2663 	push	%r14
   2664 	push	%r15
   2665 	pushfq
   2666 	sub	\$64,%rsp
   2667 
   2668 	mov	120($context),%rax	# pull context->Rax
   2669 	mov	248($context),%rbx	# pull context->Rip
   2670 
   2671 	lea	.Lcbc_prologue(%rip),%r10
   2672 	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
   2673 	jb	.Lin_cbc_prologue
   2674 
   2675 	lea	.Lcbc_fast_body(%rip),%r10
   2676 	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
   2677 	jb	.Lin_cbc_frame_setup
   2678 
   2679 	lea	.Lcbc_slow_prologue(%rip),%r10
   2680 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
   2681 	jb	.Lin_cbc_body
   2682 
   2683 	lea	.Lcbc_slow_body(%rip),%r10
   2684 	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
   2685 	jb	.Lin_cbc_frame_setup
   2686 
   2687 .Lin_cbc_body:
   2688 	mov	152($context),%rax	# pull context->Rsp
   2689 
   2690 	lea	.Lcbc_epilogue(%rip),%r10
   2691 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
   2692 	jae	.Lin_cbc_prologue
   2693 
   2694 	lea	8(%rax),%rax
   2695 
   2696 	lea	.Lcbc_popfq(%rip),%r10
   2697 	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
   2698 	jae	.Lin_cbc_prologue
   2699 
   2700 	mov	`16-8`(%rax),%rax	# biased $_rsp
   2701 	lea	56(%rax),%rax
   2702 
   2703 .Lin_cbc_frame_setup:
   2704 	mov	-16(%rax),%rbx
   2705 	mov	-24(%rax),%rbp
   2706 	mov	-32(%rax),%r12
   2707 	mov	-40(%rax),%r13
   2708 	mov	-48(%rax),%r14
   2709 	mov	-56(%rax),%r15
   2710 	mov	%rbx,144($context)	# restore context->Rbx
   2711 	mov	%rbp,160($context)	# restore context->Rbp
   2712 	mov	%r12,216($context)	# restore context->R12
   2713 	mov	%r13,224($context)	# restore context->R13
   2714 	mov	%r14,232($context)	# restore context->R14
   2715 	mov	%r15,240($context)	# restore context->R15
   2716 
   2717 .Lin_cbc_prologue:
   2718 	mov	8(%rax),%rdi
   2719 	mov	16(%rax),%rsi
   2720 	mov	%rax,152($context)	# restore context->Rsp
   2721 	mov	%rsi,168($context)	# restore context->Rsi
   2722 	mov	%rdi,176($context)	# restore context->Rdi
   2723 
   2724 .Lcommon_seh_exit:
   2725 
   2726 	mov	40($disp),%rdi		# disp->ContextRecord
   2727 	mov	$context,%rsi		# context
   2728 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
   2729 	.long	0xa548f3fc		# cld; rep movsq
   2730 
   2731 	mov	$disp,%rsi
   2732 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
   2733 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
   2734 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
   2735 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
   2736 	mov	40(%rsi),%r10		# disp->ContextRecord
   2737 	lea	56(%rsi),%r11		# &disp->HandlerData
   2738 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
   2739 	mov	%r10,32(%rsp)		# arg5
   2740 	mov	%r11,40(%rsp)		# arg6
   2741 	mov	%r12,48(%rsp)		# arg7
   2742 	mov	%rcx,56(%rsp)		# arg8, (NULL)
   2743 	call	*__imp_RtlVirtualUnwind(%rip)
   2744 
   2745 	mov	\$1,%eax		# ExceptionContinueSearch
   2746 	add	\$64,%rsp
   2747 	popfq
   2748 	pop	%r15
   2749 	pop	%r14
   2750 	pop	%r13
   2751 	pop	%r12
   2752 	pop	%rbp
   2753 	pop	%rbx
   2754 	pop	%rdi
   2755 	pop	%rsi
   2756 	ret
   2757 .size	cbc_se_handler,.-cbc_se_handler
   2758 
   2759 .section	.pdata
   2760 .align	4
   2761 	.rva	.LSEH_begin_AES_encrypt
   2762 	.rva	.LSEH_end_AES_encrypt
   2763 	.rva	.LSEH_info_AES_encrypt
   2764 
   2765 	.rva	.LSEH_begin_AES_decrypt
   2766 	.rva	.LSEH_end_AES_decrypt
   2767 	.rva	.LSEH_info_AES_decrypt
   2768 
   2769 	.rva	.LSEH_begin_AES_set_encrypt_key
   2770 	.rva	.LSEH_end_AES_set_encrypt_key
   2771 	.rva	.LSEH_info_AES_set_encrypt_key
   2772 
   2773 	.rva	.LSEH_begin_AES_set_decrypt_key
   2774 	.rva	.LSEH_end_AES_set_decrypt_key
   2775 	.rva	.LSEH_info_AES_set_decrypt_key
   2776 
   2777 	.rva	.LSEH_begin_AES_cbc_encrypt
   2778 	.rva	.LSEH_end_AES_cbc_encrypt
   2779 	.rva	.LSEH_info_AES_cbc_encrypt
   2780 
   2781 .section	.xdata
   2782 .align	8
   2783 .LSEH_info_AES_encrypt:
   2784 	.byte	9,0,0,0
   2785 	.rva	block_se_handler
   2786 	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
   2787 .LSEH_info_AES_decrypt:
   2788 	.byte	9,0,0,0
   2789 	.rva	block_se_handler
   2790 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
   2791 .LSEH_info_AES_set_encrypt_key:
   2792 	.byte	9,0,0,0
   2793 	.rva	key_se_handler
   2794 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
   2795 .LSEH_info_AES_set_decrypt_key:
   2796 	.byte	9,0,0,0
   2797 	.rva	key_se_handler
   2798 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
   2799 .LSEH_info_AES_cbc_encrypt:
   2800 	.byte	9,0,0,0
   2801 	.rva	cbc_se_handler
   2802 ___
   2803 }	# end of the Win64-only SEH section
   2804 
   # Expand every `expr` span in the generated text: the Perl expression
   # between the backticks is eval'ed and its value substituted in place
   # (used for constants such as `1232/8` in the assembly templates).
   $code =~ s/\`([^\`]*)\`/eval($1)/gem;

   # STDOUT was re-opened at the top of the file as a pipe into the
   # x86_64-xlate.pl translator, so this hands the text to the translator.
   print $code;

   # For a piped handle close() is the only place where a failed flush or a
   # translator that exited non-zero is reported.  Check it, so that a broken
   # translation stops the build instead of leaving a truncated or missing
   # output file behind a zero exit status.  When the child merely exited
   # non-zero, $! is 0 and the status is in $?.
   close STDOUT
       or die($! ? "error closing STDOUT: $!"
                 : "error closing STDOUT: translator exited with status $?");
   2810