# (non-code navigation header from a code-browser export; kept as a comment)
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # AES for s390x.
     11 
     12 # April 2007.
     13 #
     14 # Software performance improvement over gcc-generated code is ~70% and
     15 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
     16 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
     17 # *strictly* in-order execution and issued instruction [in this case
     18 # load value from memory is critical] has to complete before execution
     19 # flow proceeds. S-boxes are compressed to 2KB[+256B].
     20 #
     21 # As for hardware acceleration support. It's basically a "teaser," as
     22 # it can and should be improved in several ways. Most notably support
     23 # for CBC is not utilized, nor multiple blocks are ever processed.
     24 # Then software key schedule can be postponed till hardware support
     25 # detection... Performance improvement over assembler is reportedly
     26 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
     27 # support is implemented.
     28 
     29 # May 2007.
     30 #
     31 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
     32 # for 128-bit keys, if hardware support is detected.
     33 
# January 2009.
     35 #
     36 # Add support for hardware AES192/256 and reschedule instructions to
     37 # minimize/avoid Address Generation Interlock hazard and to favour
     38 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
     39 # almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
     42 # processed with 128-bit key.
     43 #
     44 # Unlike previous version hardware support detection takes place only
     45 # at the moment of key schedule setup, which is denoted in key->rounds.
     46 # This is done, because deferred key setup can't be made MT-safe, not
     47 # for keys longer than 128 bits.
     48 #
     49 # Add AES_cbc_encrypt, which gives incredible performance improvement,
     50 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
     51 # because software implementation was optimized.
     52 
     53 # May 2010.
     54 #
     55 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
     56 # performance improvement over "generic" counter mode routine relying
     57 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
     58 # to the fact that exact throughput value depends on current stack
     59 # frame alignment within 4KB page. In worst case you get ~75% of the
     60 # maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting a ravine on a plateau.
     62 
     63 # November 2010.
     64 #
     65 # Adapt for -m31 build. If kernel supports what's called "highgprs"
     66 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
     67 # instructions and achieve "64-bit" performance even in 31-bit legacy
     68 # application context. The feature is not specific to any particular
     69 # processor, as long as it's "z-CPU". Latter implies that the code
     70 # remains z/Architecture specific. On z990 it was measured to perform
     71 # 2x better than code generated by gcc 4.3.
     72 
     73 # December 2010.
     74 #
     75 # Add support for z196 "cipher message with counter" instruction.
     76 # Note however that it's disengaged, because it was measured to
     77 # perform ~12% worse than vanilla km-based code...
     78 
     79 # February 2011.
     80 #
     81 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
     82 # instructions, which deliver ~70% improvement at 8KB block size over
     83 # vanilla km-based code, 37% - at most like 512-bytes block size.
     84 
     85 $flavour = shift;
     86 
     87 if ($flavour =~ /3[12]/) {
     88 	$SIZE_T=4;
     89 	$g="";
     90 } else {
     91 	$SIZE_T=8;
     92 	$g="g";
     93 }
     94 
     95 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     96 open STDOUT,">$output";
     97 
     98 $softonly=0;	# allow hardware support
     99 
    100 $t0="%r0";	$mask="%r0";
    101 $t1="%r1";
    102 $t2="%r2";	$inp="%r2";
    103 $t3="%r3";	$out="%r3";	$bits="%r3";
    104 $key="%r4";
    105 $i1="%r5";
    106 $i2="%r6";
    107 $i3="%r7";
    108 $s0="%r8";
    109 $s1="%r9";
    110 $s2="%r10";
    111 $s3="%r11";
    112 $tbl="%r12";
    113 $rounds="%r13";
    114 $ra="%r14";
    115 $sp="%r15";
    116 
    117 $stdframe=16*$SIZE_T+4*8;
    118 
    119 sub _data_word()
    120 { my $i;
    121     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
    122 }
    123 
# Forward table AES_Te (Te0), 256 entries.  _data_word writes every
# word twice, so an index pre-shifted by 3 (see the `0xff<<3` mask in
# _s390x_AES_encrypt) addresses an 8-byte doubled entry; displacement
# bits 0-1 then pick the required byte rotation of that entry.  The
# table is 256-byte aligned so index arithmetic never crosses its base.
$code=<<___;
.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Te4[] - the plain S-box bytes - follows the doubled Te0 table (used
# by the last round and by the key schedule), then rcon[] round
# constants zero-padded so the whole AES_Te object stays 256-byte
# aligned.  The AES_encrypt entry point opens in the same chunk.
$code.=<<___;
# Te4[256]
.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
# rcon[]
.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
.align	256
.size	AES_Te,.-AES_Te

# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
# Hardware fast path for AES_encrypt, emitted only when hardware
# support is allowed.  key->rounds (240($key)) doubles as a flag: a
# value below 16 is a software round count (10/12/14) -> branch to
# .Lesoft; 16 or more is a KM function code stored by key setup, so a
# single 16-byte block is run through the "cipher message" (km)
# instruction, encoded as a raw .long for old assemblers.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
# Software AES_encrypt body plus the table-driven core.
# _s390x_AES_encrypt expects the four big-endian state words in
# $s0..$s3, the round-key pointer in $key, AES_Te in $tbl and the
# return address in $ra; it spills $ra at 15*SIZE_T($sp) and clobbers
# $t0..$t3, $i1..$i3 and $rounds.  $mask is `0xff<<3` because table
# entries are doubled 8-byte slots: a pre-shifted byte index plus a
# 0..3 displacement selects the rotated copy directly, with the final
# round switching to single-byte Te4 lookups (displacement 2 of the
# doubled words).
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
.align	16
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
___
    456 
# Inverse table AES_Td (Td0), 256 entries, stored doubled by
# _data_word using exactly the same pre-shifted-index layout as AES_Te
# above; 256-byte aligned.
$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Td4[] - the inverse S-box bytes - placed right after the doubled Td0
# entries (the decrypt core addresses it at displacement 2048 off
# $tbl), followed by the AES_decrypt entry point.
$code.=<<___;
# Td4[256]
.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size	AES_Td,.-AES_Td

# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
# Hardware fast path for AES_decrypt, mirroring the encrypt side:
# 240($key) below 16 means a software round count -> .Ldsoft,
# otherwise it holds the KM function code prepared by key setup
# (NOTE(review): the decipher modifier is presumably folded into that
# stored code by the set_decrypt_key path, which lies outside this
# chunk - confirm there).
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
# Software AES_decrypt body plus the table-driven decrypt core.
# _s390x_AES_decrypt has the same register contract as the encrypt
# core: state in $s0..$s3, round keys at $key, AES_Td in $tbl, return
# address spilled at 15*SIZE_T($sp); clobbers $t0..$t3, $i1..$i3,
# $rounds.  Main loop uses the doubled-Td0 pre-shifted indexing
# ($mask = `0xff<<3`); the final round switches $mask to plain 0xff
# and does byte lookups in Td4 at 2048($tbl).
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___
    778 
# private_AES_set_encrypt_key entry: argument validation.  NULL input
# or key pointers branch to .Lminus1 (defined past this chunk -
# presumably returns -1, matching the AES API; confirm there); only
# 128/192/256-bit lengths fall through to .Lproceed, anything else
# returns -2 in %r2.
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_encrypt_key
.type	private_AES_set_encrypt_key,\@function
.align	16
private_AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
# Hardware detection for key setup (emitted unless software-only):
# map bits 128/192/256 to KM function codes 18/19/20, test the
# message-security-assist facility bit (0x4000 in the high halfword of
# OPENSSL_s390xcap_P), then query the KM capability vector and check
# the bit for this cipher.  If supported there is no software key
# schedule at all: the raw key bytes are copied into the AES_KEY and
# the KM function code is stored in key->rounds (240($key)), which is
# what the <16 test in the AES_[en|de]crypt fast paths keys off.
# Otherwise fall through to the software schedule at .Lekey_internal.
$code.=<<___ if (!$softonly);
	# convert bits to km code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lekey_internal

	lghi	%r0,0		# query capability vector
	la	%r1,16($sp)
	.long	0xb92f0042	# kmc %r4,%r2

	llihh	%r1,0x8000
	srlg	%r1,%r1,0(%r5)
	ng	%r1,16($sp)
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km code
	lghi	%r2,0
	br	%r14
___
# Software key-schedule fallback (.Lekey_internal), reached when the km
# facility cannot handle the requested key size.  Expands the user key in
# place at $key using the byte-wide Te4 S-box located at AES_Te+2048 and the
# rcon table 256 bytes past it.  Dispatches on $bits to dedicated 128-, 192-
# and 256-bit loops; each loop leaves the round count in $t0 and returns 0 in
# %r2 after restoring the non-volatile registers saved on entry.  The same
# heredoc also emits the entry of private_AES_set_decrypt_key, which calls
# the shared encrypt-key scheduler (relying on it to save the registers and
# $key) and falls through to the inversion code that follows this statement.
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_decrypt_key
.type	private_AES_set_decrypt_key,\@function
.align	16
private_AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
# Hardware short-cut for the decrypt key schedule, emitted only when hardware
# support is compiled in.  At this point $t0 holds either a software round
# count (10/12/14, set by the .Lekey_internal paths above) or a km function
# code (>=16, saved to 240($key) by the encrypt-key routine): values below 16
# need the software key inversion at .Lgo, while a km function code merely
# gets its 0x80 "decrypt" modifier bit set and stored back.
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,0x80	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
# Software decrypt schedule, step 1: reverse the order of the round keys in
# place.  $i1 walks up from the start of the schedule and $i2 down from the
# end (rounds*16 bytes in), exchanging 16-byte round keys until the pointers
# meet ($rounds/2 iterations).
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
# Software decrypt schedule, step 2: apply InvMixColumns to every round key
# except the first and last ((rounds-1)*4 words, starting 16 bytes in).  The
# register aliases $i1/$i2/$i3 are reused here as the three bit-masks needed
# for the branchless GF(2^8) doubling (xtime): 0x80808080 selects the high
# bits, 0x1b1b1b1b is the reduction polynomial, 0xfefefefe masks the shifted
# result.  Each .Lmix iteration derives tp2/tp4/tp8 from tp1 and recombines
# them with the rotations annotated below.  Returns 0 in %r2 after restoring
# the registers saved by AES_set_encrypt_key.
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
   1175 
   1176 ########################################################################
   1177 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
   1178 #                     size_t length, const AES_KEY *key,
   1179 #                     unsigned char *ivec, const int enc)
   1180 {
   1181 my $inp="%r2";
   1182 my $out="%r4";	# length and out are swapped
   1183 my $len="%r3";
   1184 my $key="%r5";
   1185 my $ivp="%r6";
   1186 
   1187 $code.=<<___;
   1188 .globl	AES_cbc_encrypt
   1189 .type	AES_cbc_encrypt,\@function
   1190 .align	16
   1191 AES_cbc_encrypt:
   1192 	xgr	%r3,%r4		# flip %r3 and %r4, out and len
   1193 	xgr	%r4,%r3
   1194 	xgr	%r3,%r4
   1195 ___
   1196 $code.=<<___ if (!$softonly);
   1197 	lhi	%r0,16
   1198 	cl	%r0,240($key)
   1199 	jh	.Lcbc_software
   1200 
   1201 	lg	%r0,0($ivp)	# copy ivec
   1202 	lg	%r1,8($ivp)
   1203 	stmg	%r0,%r1,16($sp)
   1204 	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
   1205 	stmg	%r0,%r1,32($sp)
   1206 	lmg	%r0,%r1,16($key)
   1207 	stmg	%r0,%r1,48($sp)
   1208 	l	%r0,240($key)	# load kmc code
   1209 	lghi	$key,15		# res=len%16, len-=res;
   1210 	ngr	$key,$len
   1211 	sl${g}r	$len,$key
   1212 	la	%r1,16($sp)	# parameter block - ivec || key
   1213 	jz	.Lkmc_truncated
   1214 	.long	0xb92f0042	# kmc %r4,%r2
   1215 	brc	1,.-4		# pay attention to "partial completion"
   1216 	ltr	$key,$key
   1217 	jnz	.Lkmc_truncated
   1218 .Lkmc_done:
   1219 	lmg	%r0,%r1,16($sp)	# copy ivec to caller
   1220 	stg	%r0,0($ivp)
   1221 	stg	%r1,8($ivp)
   1222 	br	$ra
   1223 .align	16
   1224 .Lkmc_truncated:
   1225 	ahi	$key,-1		# it's the way it's encoded in mvc
   1226 	tmll	%r0,0x80
   1227 	jnz	.Lkmc_truncated_dec
   1228 	lghi	%r1,0
   1229 	stg	%r1,16*$SIZE_T($sp)
   1230 	stg	%r1,16*$SIZE_T+8($sp)
   1231 	bras	%r1,1f
   1232 	mvc	16*$SIZE_T(1,$sp),0($inp)
   1233 1:	ex	$key,0(%r1)
   1234 	la	%r1,16($sp)	# restore parameter block
   1235 	la	$inp,16*$SIZE_T($sp)
   1236 	lghi	$len,16
   1237 	.long	0xb92f0042	# kmc %r4,%r2
   1238 	j	.Lkmc_done
   1239 .align	16
   1240 .Lkmc_truncated_dec:
   1241 	st${g}	$out,4*$SIZE_T($sp)
   1242 	la	$out,16*$SIZE_T($sp)
   1243 	lghi	$len,16
   1244 	.long	0xb92f0042	# kmc %r4,%r2
   1245 	l${g}	$out,4*$SIZE_T($sp)
   1246 	bras	%r1,2f
   1247 	mvc	0(1,$out),16*$SIZE_T($sp)
   1248 2:	ex	$key,0(%r1)
   1249 	j	.Lkmc_done
   1250 .align	16
   1251 .Lcbc_software:
   1252 ___
   1253 $code.=<<___;
   1254 	stm${g}	$key,$ra,5*$SIZE_T($sp)
   1255 	lhi	%r0,0
   1256 	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
   1257 	je	.Lcbc_decrypt
   1258 
   1259 	larl	$tbl,AES_Te
   1260 
   1261 	llgf	$s0,0($ivp)
   1262 	llgf	$s1,4($ivp)
   1263 	llgf	$s2,8($ivp)
   1264 	llgf	$s3,12($ivp)
   1265 
   1266 	lghi	$t0,16
   1267 	sl${g}r	$len,$t0
   1268 	brc	4,.Lcbc_enc_tail	# if borrow
   1269 .Lcbc_enc_loop:
   1270 	stm${g}	$inp,$out,2*$SIZE_T($sp)
   1271 	x	$s0,0($inp)
   1272 	x	$s1,4($inp)
   1273 	x	$s2,8($inp)
   1274 	x	$s3,12($inp)
   1275 	lgr	%r4,$key
   1276 
   1277 	bras	$ra,_s390x_AES_encrypt
   1278 
   1279 	lm${g}	$inp,$key,2*$SIZE_T($sp)
   1280 	st	$s0,0($out)
   1281 	st	$s1,4($out)
   1282 	st	$s2,8($out)
   1283 	st	$s3,12($out)
   1284 
   1285 	la	$inp,16($inp)
   1286 	la	$out,16($out)
   1287 	lghi	$t0,16
   1288 	lt${g}r	$len,$len
   1289 	jz	.Lcbc_enc_done
   1290 	sl${g}r	$len,$t0
   1291 	brc	4,.Lcbc_enc_tail	# if borrow
   1292 	j	.Lcbc_enc_loop
   1293 .align	16
   1294 .Lcbc_enc_done:
   1295 	l${g}	$ivp,6*$SIZE_T($sp)
   1296 	st	$s0,0($ivp)
   1297 	st	$s1,4($ivp)	
   1298 	st	$s2,8($ivp)
   1299 	st	$s3,12($ivp)
   1300 
   1301 	lm${g}	%r7,$ra,7*$SIZE_T($sp)
   1302 	br	$ra
   1303 
   1304 .align	16
   1305 .Lcbc_enc_tail:
   1306 	aghi	$len,15
   1307 	lghi	$t0,0
   1308 	stg	$t0,16*$SIZE_T($sp)
   1309 	stg	$t0,16*$SIZE_T+8($sp)
   1310 	bras	$t1,3f
   1311 	mvc	16*$SIZE_T(1,$sp),0($inp)
   1312 3:	ex	$len,0($t1)
   1313 	lghi	$len,0
   1314 	la	$inp,16*$SIZE_T($sp)
   1315 	j	.Lcbc_enc_loop
   1316 
   1317 .align	16
   1318 .Lcbc_decrypt:
   1319 	larl	$tbl,AES_Td
   1320 
   1321 	lg	$t0,0($ivp)
   1322 	lg	$t1,8($ivp)
   1323 	stmg	$t0,$t1,16*$SIZE_T($sp)
   1324 
   1325 .Lcbc_dec_loop:
   1326 	stm${g}	$inp,$out,2*$SIZE_T($sp)
   1327 	llgf	$s0,0($inp)
   1328 	llgf	$s1,4($inp)
   1329 	llgf	$s2,8($inp)
   1330 	llgf	$s3,12($inp)
   1331 	lgr	%r4,$key
   1332 
   1333 	bras	$ra,_s390x_AES_decrypt
   1334 
   1335 	lm${g}	$inp,$key,2*$SIZE_T($sp)
   1336 	sllg	$s0,$s0,32
   1337 	sllg	$s2,$s2,32
   1338 	lr	$s0,$s1
   1339 	lr	$s2,$s3
   1340 
   1341 	lg	$t0,0($inp)
   1342 	lg	$t1,8($inp)
   1343 	xg	$s0,16*$SIZE_T($sp)
   1344 	xg	$s2,16*$SIZE_T+8($sp)
   1345 	lghi	$s1,16
   1346 	sl${g}r	$len,$s1
   1347 	brc	4,.Lcbc_dec_tail	# if borrow
   1348 	brc	2,.Lcbc_dec_done	# if zero
   1349 	stg	$s0,0($out)
   1350 	stg	$s2,8($out)
   1351 	stmg	$t0,$t1,16*$SIZE_T($sp)
   1352 
   1353 	la	$inp,16($inp)
   1354 	la	$out,16($out)
   1355 	j	.Lcbc_dec_loop
   1356 
   1357 .Lcbc_dec_done:
   1358 	stg	$s0,0($out)
   1359 	stg	$s2,8($out)
   1360 .Lcbc_dec_exit:
   1361 	lm${g}	%r6,$ra,6*$SIZE_T($sp)
   1362 	stmg	$t0,$t1,0($ivp)
   1363 
   1364 	br	$ra
   1365 
   1366 .align	16
   1367 .Lcbc_dec_tail:
   1368 	aghi	$len,15
   1369 	stg	$s0,16*$SIZE_T($sp)
   1370 	stg	$s2,16*$SIZE_T+8($sp)
   1371 	bras	$s1,4f
   1372 	mvc	0(1,$out),16*$SIZE_T($sp)
   1373 4:	ex	$len,0($s1)
   1374 	j	.Lcbc_dec_exit
   1375 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
   1376 ___
   1377 }
   1378 ########################################################################
   1379 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
   1380 #                     size_t blocks, const AES_KEY *key,
   1381 #                     const unsigned char *ivec)
   1382 {
   1383 my $inp="%r2";
   1384 my $out="%r4";	# blocks and out are swapped
   1385 my $len="%r3";
   1386 my $key="%r5";	my $iv0="%r5";
   1387 my $ivp="%r6";
   1388 my $fp ="%r7";
   1389 
   1390 $code.=<<___;
   1391 .globl	AES_ctr32_encrypt
   1392 .type	AES_ctr32_encrypt,\@function
   1393 .align	16
   1394 AES_ctr32_encrypt:
   1395 	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
   1396 	xgr	%r4,%r3
   1397 	xgr	%r3,%r4
   1398 	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
   1399 ___
   1400 $code.=<<___ if (!$softonly);
   1401 	l	%r0,240($key)
   1402 	lhi	%r1,16
   1403 	clr	%r0,%r1
   1404 	jl	.Lctr32_software
   1405 
   1406 	stm${g}	%r6,$s3,6*$SIZE_T($sp)
   1407 
   1408 	slgr	$out,$inp
   1409 	la	%r1,0($key)	# %r1 is permanent copy of $key
   1410 	lg	$iv0,0($ivp)	# load ivec
   1411 	lg	$ivp,8($ivp)
   1412 
   1413 	# prepare and allocate stack frame at the top of 4K page
   1414 	# with 1K reserved for eventual signal handling
   1415 	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
   1416 	lghi	$s1,-4096
   1417 	algr	$s0,$sp
   1418 	lgr	$fp,$sp
   1419 	ngr	$s0,$s1		# align at page boundary
   1420 	slgr	$fp,$s0		# total buffer size
   1421 	lgr	$s2,$sp
   1422 	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
   1423 	slgr	$fp,$s1		# deduct reservation to get usable buffer size
   1424 	# buffer size is at lest 256 and at most 3072+256-16
   1425 
   1426 	la	$sp,1024($s0)	# alloca
   1427 	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
   1428 	st${g}	$s2,0($sp)	# back-chain
   1429 	st${g}	$fp,$SIZE_T($sp)
   1430 
   1431 	slgr	$len,$fp
   1432 	brc	1,.Lctr32_hw_switch	# not zero, no borrow
   1433 	algr	$fp,$len	# input is shorter than allocated buffer
   1434 	lghi	$len,0
   1435 	st${g}	$fp,$SIZE_T($sp)
   1436 
   1437 .Lctr32_hw_switch:
   1438 ___
   1439 $code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
   1440 	larl	$s0,OPENSSL_s390xcap_P
   1441 	lg	$s0,8($s0)
   1442 	tmhh	$s0,0x0004	# check for message_security-assist-4
   1443 	jz	.Lctr32_km_loop
   1444 
   1445 	llgfr	$s0,%r0
   1446 	lgr	$s1,%r1
   1447 	lghi	%r0,0
   1448 	la	%r1,16($sp)
   1449 	.long	0xb92d2042	# kmctr %r4,%r2,%r2
   1450 
   1451 	llihh	%r0,0x8000	# check if kmctr supports the function code
   1452 	srlg	%r0,%r0,0($s0)
   1453 	ng	%r0,16($sp)
   1454 	lgr	%r0,$s0
   1455 	lgr	%r1,$s1
   1456 	jz	.Lctr32_km_loop
   1457 
   1458 ####### kmctr code
   1459 	algr	$out,$inp	# restore $out
   1460 	lgr	$s1,$len	# $s1 undertakes $len
   1461 	j	.Lctr32_kmctr_loop
   1462 .align	16
   1463 .Lctr32_kmctr_loop:
   1464 	la	$s2,16($sp)
   1465 	lgr	$s3,$fp
   1466 .Lctr32_kmctr_prepare:
   1467 	stg	$iv0,0($s2)
   1468 	stg	$ivp,8($s2)
   1469 	la	$s2,16($s2)
   1470 	ahi	$ivp,1		# 32-bit increment, preserves upper half
   1471 	brct	$s3,.Lctr32_kmctr_prepare
   1472 
   1473 	#la	$inp,0($inp)	# inp
   1474 	sllg	$len,$fp,4	# len
   1475 	#la	$out,0($out)	# out
   1476 	la	$s2,16($sp)	# iv
   1477 	.long	0xb92da042	# kmctr $out,$s2,$inp
   1478 	brc	1,.-4		# pay attention to "partial completion"
   1479 
   1480 	slgr	$s1,$fp
   1481 	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
   1482 	algr	$fp,$s1
   1483 	lghi	$s1,0
   1484 	brc	4+1,.Lctr32_kmctr_loop	# not zero
   1485 
   1486 	l${g}	$sp,0($sp)
   1487 	lm${g}	%r6,$s3,6*$SIZE_T($sp)
   1488 	br	$ra
   1489 .align	16
   1490 ___
   1491 $code.=<<___;
   1492 .Lctr32_km_loop:
   1493 	la	$s2,16($sp)
   1494 	lgr	$s3,$fp
   1495 .Lctr32_km_prepare:
   1496 	stg	$iv0,0($s2)
   1497 	stg	$ivp,8($s2)
   1498 	la	$s2,16($s2)
   1499 	ahi	$ivp,1		# 32-bit increment, preserves upper half
   1500 	brct	$s3,.Lctr32_km_prepare
   1501 
   1502 	la	$s0,16($sp)	# inp
   1503 	sllg	$s1,$fp,4	# len
   1504 	la	$s2,16($sp)	# out
   1505 	.long	0xb92e00a8	# km %r10,%r8
   1506 	brc	1,.-4		# pay attention to "partial completion"
   1507 
   1508 	la	$s2,16($sp)
   1509 	lgr	$s3,$fp
   1510 	slgr	$s2,$inp
   1511 .Lctr32_km_xor:
   1512 	lg	$s0,0($inp)
   1513 	lg	$s1,8($inp)
   1514 	xg	$s0,0($s2,$inp)
   1515 	xg	$s1,8($s2,$inp)
   1516 	stg	$s0,0($out,$inp)
   1517 	stg	$s1,8($out,$inp)
   1518 	la	$inp,16($inp)
   1519 	brct	$s3,.Lctr32_km_xor
   1520 
   1521 	slgr	$len,$fp
   1522 	brc	1,.Lctr32_km_loop	# not zero, no borrow
   1523 	algr	$fp,$len
   1524 	lghi	$len,0
   1525 	brc	4+1,.Lctr32_km_loop	# not zero
   1526 
   1527 	l${g}	$s0,0($sp)
   1528 	l${g}	$s1,$SIZE_T($sp)
   1529 	la	$s2,16($sp)
   1530 .Lctr32_km_zap:
   1531 	stg	$s0,0($s2)
   1532 	stg	$s0,8($s2)
   1533 	la	$s2,16($s2)
   1534 	brct	$s1,.Lctr32_km_zap
   1535 
   1536 	la	$sp,0($s0)
   1537 	lm${g}	%r6,$s3,6*$SIZE_T($sp)
   1538 	br	$ra
   1539 .align	16
   1540 .Lctr32_software:
   1541 ___
   1542 $code.=<<___;
   1543 	stm${g}	$key,$ra,5*$SIZE_T($sp)
   1544 	sl${g}r	$inp,$out
   1545 	larl	$tbl,AES_Te
   1546 	llgf	$t1,12($ivp)
   1547 
   1548 .Lctr32_loop:
   1549 	stm${g}	$inp,$out,2*$SIZE_T($sp)
   1550 	llgf	$s0,0($ivp)
   1551 	llgf	$s1,4($ivp)
   1552 	llgf	$s2,8($ivp)
   1553 	lgr	$s3,$t1
   1554 	st	$t1,16*$SIZE_T($sp)
   1555 	lgr	%r4,$key
   1556 
   1557 	bras	$ra,_s390x_AES_encrypt
   1558 
   1559 	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
   1560 	llgf	$t1,16*$SIZE_T($sp)
   1561 	x	$s0,0($inp,$out)
   1562 	x	$s1,4($inp,$out)
   1563 	x	$s2,8($inp,$out)
   1564 	x	$s3,12($inp,$out)
   1565 	stm	$s0,$s3,0($out)
   1566 
   1567 	la	$out,16($out)
   1568 	ahi	$t1,1		# 32-bit increment
   1569 	brct	$len,.Lctr32_loop
   1570 
   1571 	lm${g}	%r6,$ra,6*$SIZE_T($sp)
   1572 	br	$ra
   1573 .size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
   1574 ___
   1575 }
   1576 
   1577 ########################################################################
   1578 # void AES_xts_encrypt(const char *inp,char *out,size_t len,
   1579 #	const AES_KEY *key1, const AES_KEY *key2,
   1580 #	const unsigned char iv[16]);
   1581 #
   1582 {
# Register aliases for the XTS routines.  As with CBC/CTR, %r3/%r4 are
# swapped on entry so $out is %r4 and $len %r3.  $key1/$key2/$fp deliberately
# alias the same registers as $i1/$i2/$i3 used elsewhere.  $tweak is the
# stack offset (not a register) where the current tweak value lives.
my $inp="%r2";
my $out="%r4";	# len and out are swapped
my $len="%r3";
my $key1="%r5";	# $i1
my $key2="%r6";	# $i2
my $fp="%r7";	# $i3
my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
   1590 
# Prologue of _s390x_xts_km, the internal helper driving the km instruction
# for both XTS encrypt and decrypt; its body is emitted by the statements
# that follow.
$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
# Probe the km capability vector (function code 0 query) for a native
# km-xts function (original code + 32).  If present, switch to it: the
# parameter block is the key followed by the tweak, positioned so that the
# same stmg covers both the 128- and 256-bit key layouts without a branch.
# Falls through to .Lxts_km_vanilla when only plain km is available.
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	lghi	%r0,0			# query capability vector
	la	%r1,$tweak-16($sp)
	.long	0xb92e0042		# km %r4,%r2
	llihh	%r1,0x8000
	srlg	%r1,%r1,32($s1)		# check for 32+function code
	ng	%r1,$tweak-16($sp)
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra
	
.align	16
.Lxts_km_vanilla:
___
# Vanilla-km XTS driver: alloca a buffer near the top of the 4K page, fill
# it with a vector of successive tweaks (GF(2^128) doubling with the 0x87
# reduction), XOR input with the tweaks, run plain km over the result in
# place, then XOR the tweaks back in.  On return: $s0/$s1 hold the last
# tweak (little-endian), $len holds the residue length (len%16) and the
# condition code reflects it, so callers can "jz" past ciphertext stealing.
# The same three statements also emit the AES_xts_encrypt entry sequence.
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at lest 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	nill	$fp,0xfff0	# round to 16*n
	st${g}	$s2,0($sp)	# back-chain
	nill	$len,0xfff0	# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
# AES_xts_encrypt hardware path: generate the tweak by km-encrypting the iv
# under $key2, run the bulk through _s390x_xts_km, and when _s390x_xts_km
# reports a residue (nonzero $len) finish with standard XTS ciphertext
# stealing on the final partial block.  Wipes the on-stack tweak on exit.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
   1837 $code.=<<___;
   1838 	stm${g}	%r6,$ra,6*$SIZE_T($sp)
   1839 
   1840 	slgr	$out,$inp
   1841 
   1842 	l${g}	$s3,$stdframe($sp)	# ivp
   1843 	llgf	$s0,0($s3)		# load iv
   1844 	llgf	$s1,4($s3)
   1845 	llgf	$s2,8($s3)
   1846 	llgf	$s3,12($s3)
   1847 	stm${g}	%r2,%r5,2*$SIZE_T($sp)
   1848 	la	$key,0($key2)
   1849 	larl	$tbl,AES_Te
   1850 	bras	$ra,_s390x_AES_encrypt	# generate the tweak
   1851 	lm${g}	%r2,%r5,2*$SIZE_T($sp)
   1852 	stm	$s0,$s3,$tweak($sp)	# save the tweak
   1853 	j	.Lxts_enc_enter
   1854 
   1855 .align	16
   1856 .Lxts_enc_loop:
   1857 	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
   1858 	lrvg	$s3,$tweak+8($sp)
   1859 	lghi	%r1,0x87
   1860 	srag	%r0,$s3,63		# broadcast upper bit
   1861 	ngr	%r1,%r0			# rem
   1862 	algr	$s1,$s1
   1863 	alcgr	$s3,$s3
   1864 	xgr	$s1,%r1
   1865 	lrvgr	$s1,$s1			# flip byte order
   1866 	lrvgr	$s3,$s3
   1867 	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
   1868 	stg	$s1,$tweak+0($sp)	# save the tweak
   1869 	llgfr	$s1,$s1
   1870 	srlg	$s2,$s3,32
   1871 	stg	$s3,$tweak+8($sp)
   1872 	llgfr	$s3,$s3
   1873 	la	$inp,16($inp)		# $inp+=16
   1874 .Lxts_enc_enter:
   1875 	x	$s0,0($inp)		# ^=*($inp)
   1876 	x	$s1,4($inp)
   1877 	x	$s2,8($inp)
   1878 	x	$s3,12($inp)
   1879 	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
   1880 	la	$key,0($key1)
   1881 	bras	$ra,_s390x_AES_encrypt
   1882 	lm${g}	%r2,%r5,2*$SIZE_T($sp)
   1883 	x	$s0,$tweak+0($sp)	# ^=tweak
   1884 	x	$s1,$tweak+4($sp)
   1885 	x	$s2,$tweak+8($sp)
   1886 	x	$s3,$tweak+12($sp)
   1887 	st	$s0,0($out,$inp)
   1888 	st	$s1,4($out,$inp)
   1889 	st	$s2,8($out,$inp)
   1890 	st	$s3,12($out,$inp)
   1891 	brct${g}	$len,.Lxts_enc_loop
   1892 
   1893 	llgc	$len,`2*$SIZE_T-1`($sp)
   1894 	nill	$len,0x0f		# $len%16
   1895 	jz	.Lxts_enc_done
   1896 
   1897 	la	$i3,0($inp,$out)	# put aside real $out
   1898 .Lxts_enc_steal:
   1899 	llgc	%r0,16($inp)
   1900 	llgc	%r1,0($out,$inp)
   1901 	stc	%r0,0($out,$inp)
   1902 	stc	%r1,16($out,$inp)
   1903 	la	$inp,1($inp)
   1904 	brct	$len,.Lxts_enc_steal
   1905 	la	$out,0($i3)		# restore real $out
   1906 
   1907 	# generate last tweak...
   1908 	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
   1909 	lrvg	$s3,$tweak+8($sp)
   1910 	lghi	%r1,0x87
   1911 	srag	%r0,$s3,63		# broadcast upper bit
   1912 	ngr	%r1,%r0			# rem
   1913 	algr	$s1,$s1
   1914 	alcgr	$s3,$s3
   1915 	xgr	$s1,%r1
   1916 	lrvgr	$s1,$s1			# flip byte order
   1917 	lrvgr	$s3,$s3
   1918 	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
   1919 	stg	$s1,$tweak+0($sp)	# save the tweak
   1920 	llgfr	$s1,$s1
   1921 	srlg	$s2,$s3,32
   1922 	stg	$s3,$tweak+8($sp)
   1923 	llgfr	$s3,$s3
   1924 
   1925 	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
   1926 	x	$s1,4($out)
   1927 	x	$s2,8($out)
   1928 	x	$s3,12($out)
   1929 	st${g}	$out,4*$SIZE_T($sp)
   1930 	la	$key,0($key1)
   1931 	bras	$ra,_s390x_AES_encrypt
   1932 	l${g}	$out,4*$SIZE_T($sp)
   1933 	x	$s0,`$tweak+0`($sp)	# ^=tweak
   1934 	x	$s1,`$tweak+4`($sp)
   1935 	x	$s2,`$tweak+8`($sp)
   1936 	x	$s3,`$tweak+12`($sp)
   1937 	st	$s0,0($out)
   1938 	st	$s1,4($out)
   1939 	st	$s2,8($out)
   1940 	st	$s3,12($out)
   1941 
   1942 .Lxts_enc_done:
   1943 	stg	$sp,$tweak+0($sp)	# wipe tweak
   1944 	stg	$sp,$twesk+8($sp)
   1945 	lm${g}	%r6,$ra,6*$SIZE_T($sp)
   1946 	br	$ra
   1947 .size	AES_xts_encrypt,.-AES_xts_encrypt
   1948 ___
# void AES_xts_decrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Entry sequence: swap out/len, save $len, and abort early on len<16.
# The tmll/aghi dance reserves one full block ($len-=16) only when the
# length is an exact multiple of 16; otherwise the last two blocks are
# handled by the ciphertext-stealing code emitted further below.
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
# Hardware-accelerated decrypt path (emitted unless software-only build):
# probes the key length in key2, generates the tweak with the KM
# instruction, runs whole blocks through _s390x_xts_km, then handles the
# trailing partial block with a second tweak and ciphertext stealing.
# Instruction ordering here deliberately avoids AGI stalls — do not
# reorder when editing.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# Software decrypt path: generates the tweak by encrypting the IV with
# key2 via _s390x_AES_encrypt, then decrypts whole blocks in a loop
# (multiplying the tweak by x in GF(2^128) each iteration), and finally
# performs ciphertext stealing for a trailing partial block using a
# second tweak kept at $tweak-16 on the stack.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
   2222 .Lxts_dec_done:
   2223 	stg	$sp,$tweak+0($sp)	# wipe tweak
   2224 	stg	$sp,$twesk+8($sp)
   2225 	lm${g}	%r6,$ra,6*$SIZE_T($sp)
   2226 	br	$ra
   2227 .size	AES_xts_decrypt,.-AES_xts_decrypt
   2228 ___
   2229 }
   2230 $code.=<<___;
   2231 .string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
   2232 .comm	OPENSSL_s390xcap_P,16,8
   2233 ___
   2234 
   2235 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   2236 print $code;
   2237 close STDOUT;	# force flush
   2238