Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # AES for s390x.
     11 
     12 # April 2007.
     13 #
     14 # Software performance improvement over gcc-generated code is ~70% and
     15 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
     16 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
     17 # *strictly* in-order execution and issued instruction [in this case
     18 # load value from memory is critical] has to complete before execution
     19 # flow proceeds. S-boxes are compressed to 2KB[+256B].
     20 #
     21 # As for hardware acceleration support. It's basically a "teaser," as
     22 # it can and should be improved in several ways. Most notably support
     23 # for CBC is not utilized, nor multiple blocks are ever processed.
     24 # Then software key schedule can be postponed till hardware support
     25 # detection... Performance improvement over assembler is reportedly
     26 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
     27 # support is implemented.
     28 
     29 # May 2007.
     30 #
     31 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
     32 # for 128-bit keys, if hardware support is detected.
     33 
     34 # January 2009.
     35 #
     36 # Add support for hardware AES192/256 and reschedule instructions to
     37 # minimize/avoid Address Generation Interlock hazard and to favour
     38 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
     39 # almost 50% on z9. The gain is smaller on z10, because being dual-
     40 # issue z10 makes it impossible to eliminate the interlock condition:
     41 # critical path is not long enough. Yet it spends ~24 cycles per byte
     42 # processed with 128-bit key.
     43 #
     44 # Unlike previous version hardware support detection takes place only
     45 # at the moment of key schedule setup, which is denoted in key->rounds.
     46 # This is done, because deferred key setup can't be made MT-safe, not
     47 # for key lengths longer than 128 bits.
     48 #
     49 # Add AES_cbc_encrypt, which gives incredible performance improvement,
     50 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
     51 # because software implementation was optimized.
     52 
     53 $softonly=0;	# allow hardware support: 0 emits the km/kmc paths guarded by "if (!$softonly)"
     54 # Symbolic names for the general registers used by the generated assembly.
     55 $t0="%r0";	$mask="%r0";	# scratch; the same register holds the table-index mask in the round code
     56 $t1="%r1";	# scratch
     57 $t2="%r2";	$inp="%r2";	# scratch; on entry the first argument of the C prototypes (inp/in)
     58 $t3="%r3";	$out="%r3";	$bits="%r3";	# scratch; on entry the second argument (out, or bits for key setup)
     59 $key="%r4";	# third argument: AES_KEY pointer, advanced as round keys are consumed or produced
     60 $i1="%r5";	# i1..i3: table-index temporaries
     61 $i2="%r6";
     62 $i3="%r7";
     63 $s0="%r8";	# s0..s3: the four 32-bit state words (rk[0..3] during key setup)
     64 $s1="%r9";
     65 $s2="%r10";
     66 $s3="%r11";
     67 $tbl="%r12";	# base address of the lookup table in use
     68 $rounds="%r13";	# loop counter
     69 $ra="%r14";	# return address; the round code spills it to 152($sp) and borrows it as an index temporary
     70 $sp="%r15";	# stack pointer
     71 
     72 sub _data_word()	# Appends one ".long w,w" line to $code per argument, i.e. every 32-bit word is emitted twice.
     73 { my $i;	# NOTE: the empty prototype is not enforced, because the calls in this file use the &_data_word(...) form.
     74     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }	# stops at the first undefined argument
     75 }
     76 
     77 $code=<<___;	# start of the generated assembly: .text and the 256-byte-aligned AES_Te label
     78 .text
     79 
     80 .type	AES_Te,\@object
     81 .align	256
     82 AES_Te:
     83 ___
        # 256 table words, each emitted twice by _data_word (8 bytes per entry,
        # 2048 bytes in total).  The round code indexes this table with byte*8 and
        # loads at displacements 0, 1, 2 and 3 inside the doubled entry, which its
        # own comments label Te0, Te3, Te2 and Te1 respectively.
     84 &_data_word(
     85 	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
     86 	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
     87 	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
     88 	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
     89 	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
     90 	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
     91 	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
     92 	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
     93 	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
     94 	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
     95 	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
     96 	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
     97 	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
     98 	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
     99 	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
    100 	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
    101 	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
    102 	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
    103 	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
    104 	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
    105 	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
    106 	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
    107 	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
    108 	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
    109 	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
    110 	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
    111 	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
    112 	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
    113 	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
    114 	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
    115 	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
    116 	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
    117 	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
    118 	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
    119 	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
    120 	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
    121 	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
    122 	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
    123 	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
    124 	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
    125 	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
    126 	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
    127 	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
    128 	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
    129 	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
    130 	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
    131 	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
    132 	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
    133 	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
    134 	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
    135 	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
    136 	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
    137 	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
    138 	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
    139 	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
    140 	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
    141 	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
    142 	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
    143 	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
    144 	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
    145 	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
    146 	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
    147 	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
    148 	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
    149 $code.=<<___;	# 256-byte table at AES_Te+2048, then rcon[] at AES_Te+2048+256; the key setup addresses both via AES_Te+2048
    150 # Te4[256]
    151 .byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
    152 .byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
    153 .byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
    154 .byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
    155 .byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
    156 .byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
    157 .byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
    158 .byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
    159 .byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
    160 .byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
    161 .byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
    162 .byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
    163 .byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
    164 .byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
    165 .byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
    166 .byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
    167 .byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
    168 .byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
    169 .byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
    170 .byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
    171 .byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
    172 .byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
    173 .byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
    174 .byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
    175 .byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
    176 .byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
    177 .byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
    178 .byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
    179 .byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
    180 .byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
    181 .byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
    182 .byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
    183 # rcon[]
    184 .long	0x01000000, 0x02000000, 0x04000000, 0x08000000
    185 .long	0x10000000, 0x20000000, 0x40000000, 0x80000000
    186 .long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
    187 .align	256
    188 .size	AES_Te,.-AES_Te
    189 
    190 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
    191 # 		 const AES_KEY *key) {
    192 .globl	AES_encrypt
    193 .type	AES_encrypt,\@function
    194 AES_encrypt:
    195 ___
        # Hardware path (omitted when $softonly is set).  The word at 240($key) is
        # compared, unsigned, with 16: smaller values branch to the software code
        # at .Lesoft, larger ones are used as the km function code that
        # AES_set_encrypt_key stored there.  The km instruction is emitted as a raw
        # .long; "brc 1,.-4" re-issues it on condition code 3 (presumably partial
        # completion -- confirm against the z/Architecture Principles of Operation).
    196 $code.=<<___ if (!$softonly);
    197 	l	%r0,240($key)
    198 	lhi	%r1,16
    199 	clr	%r0,%r1
    200 	jl	.Lesoft
    201 
    202 	la	%r1,0($key)
    203 	#la	%r2,0($inp)
    204 	la	%r4,0($out)
    205 	lghi	%r3,16		# single block length
    206 	.long	0xb92e0042	# km %r4,%r2
    207 	brc	1,.-4		# can this happen?
    208 	br	%r14
    209 .align	64
    210 .Lesoft:
    211 ___
        # Software path.  %r3..%r14 are saved at 24($sp) because %r3 ($out) doubles
        # as a scratch register; $out is reloaded from 24($sp) after the call and
        # %r6..%r14 are restored from 48($sp).  The internal routine
        # _s390x_AES_encrypt takes the state in $s0..$s3, the key schedule in $key
        # and the table base in $tbl.  It spills the return address to 152($sp) so
        # that %r14 can serve as an index temporary, runs .Lenc_loop rounds-1 times
        # (one table round per pass, $key advanced by 16 bytes) and finishes with a
        # last round assembled from single bytes fetched at displacement 2 of the
        # doubled AES_Te entries.  $mask is 0xff<<3 because table indices are
        # pre-scaled by the 8-byte entry size.
    212 $code.=<<___;
    213 	stmg	%r3,$ra,24($sp)
    214 
    215 	llgf	$s0,0($inp)
    216 	llgf	$s1,4($inp)
    217 	llgf	$s2,8($inp)
    218 	llgf	$s3,12($inp)
    219 
    220 	larl	$tbl,AES_Te
    221 	bras	$ra,_s390x_AES_encrypt
    222 
    223 	lg	$out,24($sp)
    224 	st	$s0,0($out)
    225 	st	$s1,4($out)
    226 	st	$s2,8($out)
    227 	st	$s3,12($out)
    228 
    229 	lmg	%r6,$ra,48($sp)
    230 	br	$ra
    231 .size	AES_encrypt,.-AES_encrypt
    232 
    233 .type   _s390x_AES_encrypt,\@function
    234 .align	16
    235 _s390x_AES_encrypt:
    236 	stg	$ra,152($sp)
    237 	x	$s0,0($key)
    238 	x	$s1,4($key)
    239 	x	$s2,8($key)
    240 	x	$s3,12($key)
    241 	l	$rounds,240($key)
    242 	llill	$mask,`0xff<<3`
    243 	aghi	$rounds,-1
    244 	j	.Lenc_loop
    245 .align	16
    246 .Lenc_loop:
    247 	sllg	$t1,$s0,`0+3`
    248 	srlg	$t2,$s0,`8-3`
    249 	srlg	$t3,$s0,`16-3`
    250 	srl	$s0,`24-3`
    251 	nr	$s0,$mask
    252 	ngr	$t1,$mask
    253 	nr	$t2,$mask
    254 	nr	$t3,$mask
    255 
    256 	srlg	$i1,$s1,`16-3`	# i0
    257 	sllg	$i2,$s1,`0+3`
    258 	srlg	$i3,$s1,`8-3`
    259 	srl	$s1,`24-3`
    260 	nr	$i1,$mask
    261 	nr	$s1,$mask
    262 	ngr	$i2,$mask
    263 	nr	$i3,$mask
    264 
    265 	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
    266 	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
    267 	l	$t2,2($t2,$tbl) # Te2[s0>>8]
    268 	l	$t3,3($t3,$tbl)	# Te1[s0>>16]
    269 
    270 	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
    271 	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
    272 	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
    273 	x	$t3,2($i3,$tbl)	# Te2[s1>>8]
    274 
    275 	srlg	$i1,$s2,`8-3`	# i0
    276 	srlg	$i2,$s2,`16-3`	# i1
    277 	nr	$i1,$mask
    278 	nr	$i2,$mask
    279 	sllg	$i3,$s2,`0+3`
    280 	srl	$s2,`24-3`
    281 	nr	$s2,$mask
    282 	ngr	$i3,$mask
    283 
    284 	xr	$s1,$t1
    285 	srlg	$ra,$s3,`8-3`	# i1
    286 	sllg	$t1,$s3,`0+3`	# i0
    287 	nr	$ra,$mask
    288 	la	$key,16($key)
    289 	ngr	$t1,$mask
    290 
    291 	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
    292 	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
    293 	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
    294 	x	$t3,1($i3,$tbl)	# Te3[s2>>0]
    295 
    296 	srlg	$i3,$s3,`16-3`	# i2
    297 	xr	$s2,$t2
    298 	srl	$s3,`24-3`
    299 	nr	$i3,$mask
    300 	nr	$s3,$mask
    301 
    302 	x	$s0,0($key)
    303 	x	$s1,4($key)
    304 	x	$s2,8($key)
    305 	x	$t3,12($key)
    306 
    307 	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
    308 	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
    309 	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
    310 	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
    311 	xr	$s3,$t3
    312 
    313 	brct	$rounds,.Lenc_loop
    314 	.align	16
    315 
    316 	sllg	$t1,$s0,`0+3`
    317 	srlg	$t2,$s0,`8-3`
    318 	ngr	$t1,$mask
    319 	srlg	$t3,$s0,`16-3`
    320 	srl	$s0,`24-3`
    321 	nr	$s0,$mask
    322 	nr	$t2,$mask
    323 	nr	$t3,$mask
    324 
    325 	srlg	$i1,$s1,`16-3`	# i0
    326 	sllg	$i2,$s1,`0+3`
    327 	ngr	$i2,$mask
    328 	srlg	$i3,$s1,`8-3`
    329 	srl	$s1,`24-3`
    330 	nr	$i1,$mask
    331 	nr	$s1,$mask
    332 	nr	$i3,$mask
    333 
    334 	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
    335 	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
    336 	sll	$s0,24
    337 	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
    338 	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
    339 	sll	$t2,8
    340 	sll	$t3,16
    341 
    342 	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
    343 	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
    344 	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
    345 	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
    346 	sll	$i1,16
    347 	sll	$s1,24
    348 	sll	$i3,8
    349 	or	$s0,$i1
    350 	or	$s1,$t1
    351 	or	$t2,$i2
    352 	or	$t3,$i3
    353 	
    354 	srlg	$i1,$s2,`8-3`	# i0
    355 	srlg	$i2,$s2,`16-3`	# i1
    356 	nr	$i1,$mask
    357 	nr	$i2,$mask
    358 	sllg	$i3,$s2,`0+3`
    359 	srl	$s2,`24-3`
    360 	ngr	$i3,$mask
    361 	nr	$s2,$mask
    362 
    363 	sllg	$t1,$s3,`0+3`	# i0
    364 	srlg	$ra,$s3,`8-3`	# i1
    365 	ngr	$t1,$mask
    366 
    367 	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
    368 	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
    369 	sll	$i1,8
    370 	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
    371 	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
    372 	sll	$i2,16
    373 	nr	$ra,$mask
    374 	sll	$s2,24
    375 	or	$s0,$i1
    376 	or	$s1,$i2
    377 	or	$s2,$t2
    378 	or	$t3,$i3
    379 
    380 	srlg	$i3,$s3,`16-3`	# i2
    381 	srl	$s3,`24-3`
    382 	nr	$i3,$mask
    383 	nr	$s3,$mask
    384 
    385 	l	$t0,16($key)
    386 	l	$t2,20($key)
    387 
    388 	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
    389 	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
    390 	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
    391 	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
    392 	sll	$i2,8
    393 	sll	$i3,16
    394 	sll	$s3,24
    395 	or	$s0,$i1
    396 	or	$s1,$i2
    397 	or	$s2,$i3
    398 	or	$s3,$t3
    399 
    400 	lg	$ra,152($sp)
    401 	xr	$s0,$t0
    402 	xr	$s1,$t2
    403 	x	$s2,24($key)
    404 	x	$s3,28($key)
    405 
    406 	br	$ra	
    407 .size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
    408 ___
    409 
    410 $code.=<<___;	# decryption table: 256-byte-aligned AES_Td label
    411 .type	AES_Td,\@object
    412 .align	256
    413 AES_Td:
    414 ___
        # 256 table words, each emitted twice by _data_word (8 bytes per entry,
        # 2048 bytes in total).  The decrypt round code indexes with byte*8 and
        # loads at displacements 0, 3, 2 and 1 inside the doubled entry, which its
        # own comments label Td0, Td1, Td2 and Td3 respectively.
    415 &_data_word(
    416 	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
    417 	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
    418 	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
    419 	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
    420 	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
    421 	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
    422 	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
    423 	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
    424 	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
    425 	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
    426 	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
    427 	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
    428 	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
    429 	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
    430 	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
    431 	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
    432 	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
    433 	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
    434 	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
    435 	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
    436 	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
    437 	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
    438 	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
    439 	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
    440 	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
    441 	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
    442 	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
    443 	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
    444 	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
    445 	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
    446 	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
    447 	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
    448 	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
    449 	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
    450 	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
    451 	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
    452 	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
    453 	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
    454 	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
    455 	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
    456 	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
    457 	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
    458 	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
    459 	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
    460 	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
    461 	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
    462 	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
    463 	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
    464 	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
    465 	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
    466 	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
    467 	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
    468 	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
    469 	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
    470 	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
    471 	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
    472 	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
    473 	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
    474 	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
    475 	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
    476 	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
    477 	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
    478 	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
    479 	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
    480 $code.=<<___;	# 256-byte table at AES_Td+2048, read by the last decrypt round through displacement 2048
    481 # Td4[256]
    482 .byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
    483 .byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
    484 .byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
    485 .byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
    486 .byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
    487 .byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
    488 .byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
    489 .byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
    490 .byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
    491 .byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
    492 .byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
    493 .byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
    494 .byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
    495 .byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
    496 .byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
    497 .byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
    498 .byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
    499 .byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
    500 .byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
    501 .byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
    502 .byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
    503 .byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
    504 .byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
    505 .byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
    506 .byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
    507 .byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
    508 .byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
    509 .byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
    510 .byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
    511 .byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
    512 .byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
    513 .byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
    514 .size	AES_Td,.-AES_Td
    515 
    516 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
    517 # 		 const AES_KEY *key) {
    518 .globl	AES_decrypt
    519 .type	AES_decrypt,\@function
    520 AES_decrypt:
    521 ___
        # Hardware path (omitted when $softonly is set).  Same instruction sequence
        # as in AES_encrypt: the word at 240($key) is compared, unsigned, with 16;
        # smaller values branch to .Ldsoft, anything else is handed to km as the
        # function code exactly as it is stored in the key structure.
        # NOTE(review): whatever distinguishes decryption for km must therefore
        # already be encoded in that stored word -- the code that writes it for
        # decrypt keys is outside this excerpt; confirm there.
    522 $code.=<<___ if (!$softonly);
    523 	l	%r0,240($key)
    524 	lhi	%r1,16
    525 	clr	%r0,%r1
    526 	jl	.Ldsoft
    527 
    528 	la	%r1,0($key)
    529 	#la	%r2,0($inp)
    530 	la	%r4,0($out)
    531 	lghi	%r3,16		# single block length
    532 	.long	0xb92e0042	# km %r4,%r2
    533 	brc	1,.-4		# can this happen?
    534 	br	%r14
    535 .align	64
    536 .Ldsoft:
    537 ___
        # Software path.  %r3..%r14 are saved at 24($sp); $out is reloaded from
        # 24($sp) after the call and %r6..%r14 are restored from 48($sp).  The
        # internal routine _s390x_AES_decrypt takes the state in $s0..$s3, the key
        # schedule in $key and the table base (AES_Td) in $tbl.  It spills the
        # return address to 152($sp) so that %r14 can serve as an index temporary
        # and runs .Ldec_loop rounds-1 times with $mask = 0xff<<3 (indices
        # pre-scaled by the 8-byte entry size).  For the last round $mask is
        # reloaded with plain 0xff and single bytes are fetched from the byte table
        # at displacement 2048; the four loads at 2048+0/64/128/192 only touch
        # that table beforehand (their results are overwritten).
    538 $code.=<<___;
    539 	stmg	%r3,$ra,24($sp)
    540 
    541 	llgf	$s0,0($inp)
    542 	llgf	$s1,4($inp)
    543 	llgf	$s2,8($inp)
    544 	llgf	$s3,12($inp)
    545 
    546 	larl	$tbl,AES_Td
    547 	bras	$ra,_s390x_AES_decrypt
    548 
    549 	lg	$out,24($sp)
    550 	st	$s0,0($out)
    551 	st	$s1,4($out)
    552 	st	$s2,8($out)
    553 	st	$s3,12($out)
    554 
    555 	lmg	%r6,$ra,48($sp)
    556 	br	$ra
    557 .size	AES_decrypt,.-AES_decrypt
    558 
    559 .type   _s390x_AES_decrypt,\@function
    560 .align	16
    561 _s390x_AES_decrypt:
    562 	stg	$ra,152($sp)
    563 	x	$s0,0($key)
    564 	x	$s1,4($key)
    565 	x	$s2,8($key)
    566 	x	$s3,12($key)
    567 	l	$rounds,240($key)
    568 	llill	$mask,`0xff<<3`
    569 	aghi	$rounds,-1
    570 	j	.Ldec_loop
    571 .align	16
    572 .Ldec_loop:
    573 	srlg	$t1,$s0,`16-3`
    574 	srlg	$t2,$s0,`8-3`
    575 	sllg	$t3,$s0,`0+3`
    576 	srl	$s0,`24-3`
    577 	nr	$s0,$mask
    578 	nr	$t1,$mask
    579 	nr	$t2,$mask
    580 	ngr	$t3,$mask
    581 
    582 	sllg	$i1,$s1,`0+3`	# i0
    583 	srlg	$i2,$s1,`16-3`
    584 	srlg	$i3,$s1,`8-3`
    585 	srl	$s1,`24-3`
    586 	ngr	$i1,$mask
    587 	nr	$s1,$mask
    588 	nr	$i2,$mask
    589 	nr	$i3,$mask
    590 
    591 	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
    592 	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
    593 	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
    594 	l	$t3,1($t3,$tbl)	# Td3[s0>>0]
    595 
    596 	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
    597 	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
    598 	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
    599 	x	$t3,2($i3,$tbl)	# Td2[s1>>8]
    600 
    601 	srlg	$i1,$s2,`8-3`	# i0
    602 	sllg	$i2,$s2,`0+3`	# i1
    603 	srlg	$i3,$s2,`16-3`
    604 	srl	$s2,`24-3`
    605 	nr	$i1,$mask
    606 	ngr	$i2,$mask
    607 	nr	$s2,$mask
    608 	nr	$i3,$mask
    609 
    610 	xr	$s1,$t1
    611 	srlg	$ra,$s3,`8-3`	# i1
    612 	srlg	$t1,$s3,`16-3`	# i0
    613 	nr	$ra,$mask
    614 	la	$key,16($key)
    615 	nr	$t1,$mask
    616 
    617 	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
    618 	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
    619 	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
    620 	x	$t3,3($i3,$tbl)	# Td1[s2>>16]
    621 
    622 	sllg	$i3,$s3,`0+3`	# i2
    623 	srl	$s3,`24-3`
    624 	ngr	$i3,$mask
    625 	nr	$s3,$mask
    626 
    627 	xr	$s2,$t2
    628 	x	$s0,0($key)
    629 	x	$s1,4($key)
    630 	x	$s2,8($key)
    631 	x	$t3,12($key)
    632 
    633 	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
    634 	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
    635 	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
    636 	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
    637 	xr	$s3,$t3
    638 
    639 	brct	$rounds,.Ldec_loop
    640 	.align	16
    641 
    642 	l	$t1,`2048+0`($tbl)	# prefetch Td4
    643 	l	$t2,`2048+64`($tbl)
    644 	l	$t3,`2048+128`($tbl)
    645 	l	$i1,`2048+192`($tbl)
    646 	llill	$mask,0xff
    647 
    648 	srlg	$i3,$s0,24	# i0
    649 	srlg	$t1,$s0,16
    650 	srlg	$t2,$s0,8
    651 	nr	$s0,$mask	# i3
    652 	nr	$t1,$mask
    653 
    654 	srlg	$i1,$s1,24
    655 	nr	$t2,$mask
    656 	srlg	$i2,$s1,16
    657 	srlg	$ra,$s1,8
    658 	nr	$s1,$mask	# i0
    659 	nr	$i2,$mask
    660 	nr	$ra,$mask
    661 
    662 	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
    663 	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
    664 	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
    665 	sll	$t1,16
    666 	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
    667 	sllg	$s0,$i3,24
    668 	sll	$t2,8
    669 
    670 	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
    671 	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
    672 	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
    673 	sll	$i1,24
    674 	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
    675 	sll	$i2,16
    676 	sll	$i3,8
    677 	or	$s0,$s1
    678 	or	$t1,$i1
    679 	or	$t2,$i2
    680 	or	$t3,$i3
    681 
    682 	srlg	$i1,$s2,8	# i0
    683 	srlg	$i2,$s2,24
    684 	srlg	$i3,$s2,16
    685 	nr	$s2,$mask	# i1
    686 	nr	$i1,$mask
    687 	nr	$i3,$mask
    688 	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
    689 	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
    690 	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
    691 	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
    692 	sll	$i1,8
    693 	sll	$i2,24
    694 	or	$s0,$i1
    695 	sll	$i3,16
    696 	or	$t2,$i2
    697 	or	$t3,$i3
    698 
    699 	srlg	$i1,$s3,16	# i0
    700 	srlg	$i2,$s3,8	# i1
    701 	srlg	$i3,$s3,24
    702 	nr	$s3,$mask	# i2
    703 	nr	$i1,$mask
    704 	nr	$i2,$mask
    705 
    706 	lg	$ra,152($sp)
    707 	or	$s1,$t1
    708 	l	$t0,16($key)
    709 	l	$t1,20($key)
    710 
    711 	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
    712 	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
    713 	sll	$i1,16
    714 	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
    715 	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
    716 	sll	$i2,8
    717 	sll	$s3,24
    718 	or	$s0,$i1
    719 	or	$s1,$i2
    720 	or	$s2,$t2
    721 	or	$s3,$t3
    722 
    723 	xr	$s0,$t0
    724 	xr	$s1,$t1
    725 	x	$s2,24($key)
    726 	x	$s3,28($key)
    727 
    728 	br	$ra	
    729 .size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
    730 ___
    731 
    732 $code.=<<___;	# AES_set_encrypt_key entry and argument checks. NOTE: the prototype comment below says "void", but %r2 returns 0 (ok), -1 (NULL in/key) or -2 (bits not 128/192/256).
    733 # void AES_set_encrypt_key(const unsigned char *in, int bits,
    734 # 		 AES_KEY *key) {
    735 .globl	AES_set_encrypt_key
    736 .type	AES_set_encrypt_key,\@function
    737 .align	16
    738 AES_set_encrypt_key:
    739 	lghi	$t0,0
    740 	clgr	$inp,$t0
    741 	je	.Lminus1
    742 	clgr	$key,$t0
    743 	je	.Lminus1
    744 
    745 	lghi	$t0,128
    746 	clr	$bits,$t0
    747 	je	.Lproceed
    748 	lghi	$t0,192
    749 	clr	$bits,$t0
    750 	je	.Lproceed
    751 	lghi	$t0,256
    752 	clr	$bits,$t0
    753 	je	.Lproceed
    754 	lghi	%r2,-2
    755 	br	%r14
    756 
    757 .align	16
    758 .Lproceed:
    759 ___
        # Hardware key path (omitted when $softonly is set).  bits is mapped to a
        # function code: (bits-128)>>6 + 18 gives 18, 19 or 20.  If the word at
        # OPENSSL_s390xcap_P has the tested bit set, a query (%r0 = 0) is issued
        # with its result written to 16($sp), and the bit selected by the function
        # code is tested there.  When either test fails control goes to
        # .Lekey_internal.  Otherwise the raw key is copied unchanged (16, 24 or
        # 32 bytes; the "je 1f" still sees the condition code of the earlier "cr"),
        # bits is stored at 236($key) and the function code at 240($key), which is
        # the word AES_encrypt/AES_decrypt compare against 16.
    760 $code.=<<___ if (!$softonly);
    761 	# convert bits to km code, [128,192,256]->[18,19,20]
    762 	lhi	%r5,-128
    763 	lhi	%r0,18
    764 	ar	%r5,$bits
    765 	srl	%r5,6
    766 	ar	%r5,%r0
    767 
    768 	larl	%r1,OPENSSL_s390xcap_P
    769 	lg	%r0,0(%r1)
    770 	tmhl	%r0,0x4000	# check for message-security assist
    771 	jz	.Lekey_internal
    772 
    773 	lghi	%r0,0		# query capability vector
    774 	la	%r1,16($sp)
    775 	.long	0xb92f0042	# kmc %r4,%r2
    776 
    777 	llihh	%r1,0x8000
    778 	srlg	%r1,%r1,0(%r5)
    779 	ng	%r1,16($sp)
    780 	jz	.Lekey_internal
    781 
    782 	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
    783 	stmg	%r0,%r1,0($key)
    784 	lhi	%r0,192
    785 	cr	$bits,%r0
    786 	jl	1f
    787 	lg	%r1,16($inp)
    788 	stg	%r1,16($key)
    789 	je	1f
    790 	lg	%r1,24($inp)
    791 	stg	%r1,24($key)
    792 1:	st	$bits,236($key)	# save bits
    793 	st	%r5,240($key)	# save km code
    794 	lghi	%r2,0
    795 	br	%r14
    796 ___
        # Software key expansion.  %r6..%r13 are saved at 48($sp); $tbl points at
        # the byte table AES_Te+2048, with rcon[] 256 bytes further on.  The first
        # four key words are copied, then one of three loops runs depending on bits:
        #   128: rounds=10, 10 passes producing 4 words, $key advanced by 16 bytes
        #   192: rounds=12,  8 passes producing 6 words, $key advanced by 24 bytes
        #   256: rounds=14,  7 passes producing 8 words, $key advanced by 32 bytes
        # The round count is written to 240($key) before $key is advanced.  The
        # 192 and 256 loops return from the middle of their last pass, right after
        # the first four words of that pass have been stored.  .Lminus1 is the
        # NULL-pointer exit taken before any register has been saved.
    797 $code.=<<___;
    798 .align	16
    799 .Lekey_internal:
    800 	stmg	%r6,%r13,48($sp)	# all non-volatile regs
    801 
    802 	larl	$tbl,AES_Te+2048
    803 
    804 	llgf	$s0,0($inp)
    805 	llgf	$s1,4($inp)
    806 	llgf	$s2,8($inp)
    807 	llgf	$s3,12($inp)
    808 	st	$s0,0($key)
    809 	st	$s1,4($key)
    810 	st	$s2,8($key)
    811 	st	$s3,12($key)
    812 	lghi	$t0,128
    813 	cr	$bits,$t0
    814 	jne	.Lnot128
    815 
    816 	llill	$mask,0xff
    817 	lghi	$t3,0			# i=0
    818 	lghi	$rounds,10
    819 	st	$rounds,240($key)
    820 
    821 	llgfr	$t2,$s3			# temp=rk[3]
    822 	srlg	$i1,$s3,8
    823 	srlg	$i2,$s3,16
    824 	srlg	$i3,$s3,24
    825 	nr	$t2,$mask
    826 	nr	$i1,$mask
    827 	nr	$i2,$mask
    828 
    829 .align	16
    830 .L128_loop:
    831 	la	$t2,0($t2,$tbl)
    832 	la	$i1,0($i1,$tbl)
    833 	la	$i2,0($i2,$tbl)
    834 	la	$i3,0($i3,$tbl)
    835 	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
    836 	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
    837 	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
    838 	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
    839 	x	$t2,256($t3,$tbl)	# rcon[i]
    840 	xr	$s0,$t2			# rk[4]=rk[0]^...
    841 	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
    842 	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
    843 	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]
    844 
    845 	llgfr	$t2,$s3			# temp=rk[3]
    846 	srlg	$i1,$s3,8
    847 	srlg	$i2,$s3,16
    848 	nr	$t2,$mask
    849 	nr	$i1,$mask
    850 	srlg	$i3,$s3,24
    851 	nr	$i2,$mask
    852 
    853 	st	$s0,16($key)
    854 	st	$s1,20($key)
    855 	st	$s2,24($key)
    856 	st	$s3,28($key)
    857 	la	$key,16($key)		# key+=4
    858 	la	$t3,4($t3)		# i++
    859 	brct	$rounds,.L128_loop
    860 	lghi	%r2,0
    861 	lmg	%r6,%r13,48($sp)
    862 	br	$ra
    863 
    864 .align	16
    865 .Lnot128:
    866 	llgf	$t0,16($inp)
    867 	llgf	$t1,20($inp)
    868 	st	$t0,16($key)
    869 	st	$t1,20($key)
    870 	lghi	$t0,192
    871 	cr	$bits,$t0
    872 	jne	.Lnot192
    873 
    874 	llill	$mask,0xff
    875 	lghi	$t3,0			# i=0
    876 	lghi	$rounds,12
    877 	st	$rounds,240($key)
    878 	lghi	$rounds,8
    879 
    880 	srlg	$i1,$t1,8
    881 	srlg	$i2,$t1,16
    882 	srlg	$i3,$t1,24
    883 	nr	$t1,$mask
    884 	nr	$i1,$mask
    885 	nr	$i2,$mask
    886 
    887 .align	16
    888 .L192_loop:
    889 	la	$t1,0($t1,$tbl)
    890 	la	$i1,0($i1,$tbl)
    891 	la	$i2,0($i2,$tbl)
    892 	la	$i3,0($i3,$tbl)
    893 	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
    894 	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
    895 	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
    896 	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
    897 	x	$t1,256($t3,$tbl)	# rcon[i]
    898 	xr	$s0,$t1			# rk[6]=rk[0]^...
    899 	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
    900 	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
    901 	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]
    902 
    903 	st	$s0,24($key)
    904 	st	$s1,28($key)
    905 	st	$s2,32($key)
    906 	st	$s3,36($key)
    907 	brct	$rounds,.L192_continue
    908 	lghi	%r2,0
    909 	lmg	%r6,%r13,48($sp)
    910 	br	$ra
    911 
    912 .align	16
    913 .L192_continue:
    914 	lgr	$t1,$s3
    915 	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
    916 	st	$t1,40($key)
    917 	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
    918 	st	$t1,44($key)
    919 
    920 	srlg	$i1,$t1,8
    921 	srlg	$i2,$t1,16
    922 	srlg	$i3,$t1,24
    923 	nr	$t1,$mask
    924 	nr	$i1,$mask
    925 	nr	$i2,$mask
    926 
    927 	la	$key,24($key)		# key+=6
    928 	la	$t3,4($t3)		# i++
    929 	j	.L192_loop
    930 
    931 .align	16
    932 .Lnot192:
    933 	llgf	$t0,24($inp)
    934 	llgf	$t1,28($inp)
    935 	st	$t0,24($key)
    936 	st	$t1,28($key)
    937 	llill	$mask,0xff
    938 	lghi	$t3,0			# i=0
    939 	lghi	$rounds,14
    940 	st	$rounds,240($key)
    941 	lghi	$rounds,7
    942 
    943 	srlg	$i1,$t1,8
    944 	srlg	$i2,$t1,16
    945 	srlg	$i3,$t1,24
    946 	nr	$t1,$mask
    947 	nr	$i1,$mask
    948 	nr	$i2,$mask
    949 
    950 .align	16
    951 .L256_loop:
    952 	la	$t1,0($t1,$tbl)
    953 	la	$i1,0($i1,$tbl)
    954 	la	$i2,0($i2,$tbl)
    955 	la	$i3,0($i3,$tbl)
    956 	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
    957 	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
    958 	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
    959 	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
    960 	x	$t1,256($t3,$tbl)	# rcon[i]
    961 	xr	$s0,$t1			# rk[8]=rk[0]^...
    962 	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
    963 	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
    964 	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
    965 	st	$s0,32($key)
    966 	st	$s1,36($key)
    967 	st	$s2,40($key)
    968 	st	$s3,44($key)
    969 	brct	$rounds,.L256_continue
    970 	lghi	%r2,0
    971 	lmg	%r6,%r13,48($sp)
    972 	br	$ra
    973 
    974 .align	16
    975 .L256_continue:
    976 	lgr	$t1,$s3			# temp=rk[11]
    977 	srlg	$i1,$s3,8
    978 	srlg	$i2,$s3,16
    979 	srlg	$i3,$s3,24
    980 	nr	$t1,$mask
    981 	nr	$i1,$mask
    982 	nr	$i2,$mask
    983 	la	$t1,0($t1,$tbl)
    984 	la	$i1,0($i1,$tbl)
    985 	la	$i2,0($i2,$tbl)
    986 	la	$i3,0($i3,$tbl)
    987 	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
    988 	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
    989 	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
    990 	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
    991 	x	$t1,16($key)		# rk[12]=rk[4]^...
    992 	st	$t1,48($key)
    993 	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
    994 	st	$t1,52($key)
    995 	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
    996 	st	$t1,56($key)
    997 	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
    998 	st	$t1,60($key)
    999 
   1000 	srlg	$i1,$t1,8
   1001 	srlg	$i2,$t1,16
   1002 	srlg	$i3,$t1,24
   1003 	nr	$t1,$mask
   1004 	nr	$i1,$mask
   1005 	nr	$i2,$mask
   1006 
   1007 	la	$key,32($key)		# key+=8
   1008 	la	$t3,4($t3)		# i++
   1009 	j	.L256_loop
   1010 
   1011 .Lminus1:
   1012 	lghi	%r2,-1
   1013 	br	$ra
   1014 .size	AES_set_encrypt_key,.-AES_set_encrypt_key
   1015 
   1016 # void AES_set_decrypt_key(const unsigned char *in, int bits,
   1017 # 		 AES_KEY *key) {
   1018 .globl	AES_set_decrypt_key
   1019 .type	AES_set_decrypt_key,\@function
   1020 .align	16
   1021 AES_set_decrypt_key:
   1022 	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
   1023 	stg	$ra,112($sp)		# save non-volatile registers!
   1024 	bras	$ra,AES_set_encrypt_key
   1025 	lg	$key,32($sp)
   1026 	lg	$ra,112($sp)
   1027 	ltgr	%r2,%r2
   1028 	bnzr	$ra
   1029 ___
# The fragment below is emitted only when $softonly is false.  It reads the
# 32-bit word at offset 240 of the key structure (the software key schedule
# stores the round count there).  If that word is 16 or more, the code only
# sets bit 0x80 in it and returns, leaving the rest of the structure as
# AES_set_encrypt_key produced it; smaller values branch to .Lgo.
# NOTE(review): values >= 16 look like a hardware (kmc) function code rather
# than a round count - confirm against the code that writes them.
# .Ldkey_internal is a second entry point: it calls .Lekey_internal and then
# falls through into .Lgo, keeping its return address at 40($sp) instead of
# 112($sp).
   1030 $code.=<<___ if (!$softonly);
   1031 	l	$t0,240($key)
   1032 	lhi	$t1,16
   1033 	cr	$t0,$t1
   1034 	jl	.Lgo
   1035 	oill	$t0,0x80	# set "decrypt" bit
   1036 	st	$t0,240($key)
   1037 	br	$ra
   1038 
   1039 .align	16
   1040 .Ldkey_internal:
   1041 	stg	$key,32($sp)
   1042 	stg	$ra,40($sp)
   1043 	bras	$ra,.Lekey_internal
   1044 	lg	$key,32($sp)
   1045 	lg	$ra,40($sp)
   1046 ___
# .Lgo/.Linv: reverse the order of the round keys in place.  $i1 starts at
# the beginning of the schedule and $i2 at key+16*rounds; each of the
# rounds/2 iterations swaps what the two pointers address and then moves the
# pointers 16 bytes closer to each other.
   1047 $code.=<<___;
   1048 
   1049 .Lgo:	llgf	$rounds,240($key)
   1050 	la	$i1,0($key)
   1051 	sllg	$i2,$rounds,4
   1052 	la	$i2,0($i2,$key)
   1053 	srl	$rounds,1
   1054 	lghi	$t1,-16
   1055 
   1056 .align	16
   1057 .Linv:	lmg	$s0,$s1,0($i1)
   1058 	lmg	$s2,$s3,0($i2)
   1059 	stmg	$s0,$s1,0($i2)
   1060 	stmg	$s2,$s3,0($i1)
   1061 	la	$i1,16($i1)
   1062 	la	$i2,0($t1,$i2)
   1063 	brct	$rounds,.Linv
   1064 ___
# $i1/$i2/$i3 are not needed after the swap loop, so they are reused to hold
# the byte-replicated constants 0x80808080, 0x1b1b1b1b and 0xfefefefe that
# .Lmix needs.
#
# .Lmix rewrites (rounds-1)*4 consecutive 32-bit words, starting at offset 16
# of the schedule and advancing 4 bytes per iteration.  For each word tp1 it
# derives tp2, tp4 and tp8 by applying the same mask-and-shift step three
# times, then stores the XOR of the rotated combinations named in the
# per-instruction comments.
# NOTE(review): this looks like the usual packed GF(2^8) doubling used to
# apply InvMixColumns to the inner round keys - confirm.
   1065 $mask80=$i1;
   1066 $mask1b=$i2;
   1067 $maskfe=$i3;
   1068 $code.=<<___;
   1069 	llgf	$rounds,240($key)
   1070 	aghi	$rounds,-1
   1071 	sll	$rounds,2	# (rounds-1)*4
   1072 	llilh	$mask80,0x8080
   1073 	llilh	$mask1b,0x1b1b
   1074 	llilh	$maskfe,0xfefe
   1075 	oill	$mask80,0x8080
   1076 	oill	$mask1b,0x1b1b
   1077 	oill	$maskfe,0xfefe
   1078 
   1079 .align	16
   1080 .Lmix:	l	$s0,16($key)	# tp1
   1081 	lr	$s1,$s0
   1082 	ngr	$s1,$mask80
   1083 	srlg	$t1,$s1,7
   1084 	slr	$s1,$t1
   1085 	nr	$s1,$mask1b
   1086 	sllg	$t1,$s0,1
   1087 	nr	$t1,$maskfe
   1088 	xr	$s1,$t1		# tp2
   1089 
   1090 	lr	$s2,$s1
   1091 	ngr	$s2,$mask80
   1092 	srlg	$t1,$s2,7
   1093 	slr	$s2,$t1
   1094 	nr	$s2,$mask1b
   1095 	sllg	$t1,$s1,1
   1096 	nr	$t1,$maskfe
   1097 	xr	$s2,$t1		# tp4
   1098 
   1099 	lr	$s3,$s2
   1100 	ngr	$s3,$mask80
   1101 	srlg	$t1,$s3,7
   1102 	slr	$s3,$t1
   1103 	nr	$s3,$mask1b
   1104 	sllg	$t1,$s2,1
   1105 	nr	$t1,$maskfe
   1106 	xr	$s3,$t1		# tp8
   1107 
   1108 	xr	$s1,$s0		# tp2^tp1
   1109 	xr	$s2,$s0		# tp4^tp1
   1110 	rll	$s0,$s0,24	# = ROTATE(tp1,8)
   1111 	xr	$s2,$s3		# ^=tp8
   1112 	xr	$s0,$s1		# ^=tp2^tp1
   1113 	xr	$s1,$s3		# tp2^tp1^tp8
   1114 	xr	$s0,$s2		# ^=tp4^tp1^tp8
   1115 	rll	$s1,$s1,8
   1116 	rll	$s2,$s2,16
   1117 	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
   1118 	rll	$s3,$s3,24
   1119 	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
   1120 	xr	$s0,$s3		# ^= ROTATE(tp8,8)
   1121 
   1122 	st	$s0,16($key)
   1123 	la	$key,4($key)
   1124 	brct	$rounds,.Lmix
   1125 
   1126 	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
   1127 	lghi	%r2,0
   1128 	br	$ra
   1129 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
   1130 ___
   1131 
   1132 #void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
   1133 #                     size_t length, const AES_KEY *key,
   1134 #                     unsigned char *ivec, const int enc)
#
# Emits AES_cbc_encrypt.  The generated routine first exchanges %r3 and %r4,
# so that $len lives in %r3 and $out in %r4.  Unless $softonly is set, keys
# whose word at 240($key) is 16 or more are processed with the kmc
# instruction (emitted as a raw .long); everything else goes through the
# software loops, which call _s390x_AES_encrypt/_s390x_AES_decrypt once per
# 16-byte block.  A trailing partial block is staged in the 16-byte scratch
# area at 128($sp).
   1135 {
   1136 my $inp="%r2";
   1137 my $out="%r4";	# length and out are swapped
   1138 my $len="%r3";
   1139 my $key="%r5";
   1140 my $ivp="%r6";
   1141 
   1142 $code.=<<___;
   1143 .globl	AES_cbc_encrypt
   1144 .type	AES_cbc_encrypt,\@function
   1145 .align	16
   1146 AES_cbc_encrypt:
   1147 	xgr	%r3,%r4		# flip %r3 and %r4, out and len
   1148 	xgr	%r4,%r3
   1149 	xgr	%r3,%r4
   1150 ___
# Hardware path (omitted when $softonly is set).  A parameter block made of
# the ivec followed by the first 32 bytes of the key structure is built at
# 16($sp), and %r0 receives the word at 240($key).  $key is then reused to
# hold len%16 while $len is rounded down to a multiple of 16.  At .Lkmc_done
# the first 16 bytes of the parameter block are copied back to the caller's
# ivec.
# Leftover bytes: bit 0x80 of %r0 selects the decrypt variant.  When
# encrypting (.Lkmc_truncated) the remaining input is copied into a zeroed
# 16-byte stack buffer and that buffer is processed as one block.  When
# decrypting (.Lkmc_truncated_dec) one block is written to the stack buffer
# and only len%16 bytes of it are copied to $out.  Both copies use an
# executed mvc, which is why $key is decremented first.
   1151 $code.=<<___ if (!$softonly);
   1152 	lhi	%r0,16
   1153 	cl	%r0,240($key)
   1154 	jh	.Lcbc_software
   1155 
   1156 	lg	%r0,0($ivp)	# copy ivec
   1157 	lg	%r1,8($ivp)
   1158 	stmg	%r0,%r1,16($sp)
   1159 	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
   1160 	stmg	%r0,%r1,32($sp)
   1161 	lmg	%r0,%r1,16($key)
   1162 	stmg	%r0,%r1,48($sp)
   1163 	l	%r0,240($key)	# load kmc code
   1164 	lghi	$key,15		# res=len%16, len-=res;
   1165 	ngr	$key,$len
   1166 	slgr	$len,$key
   1167 	la	%r1,16($sp)	# parameter block - ivec || key
   1168 	jz	.Lkmc_truncated
   1169 	.long	0xb92f0042	# kmc %r4,%r2
   1170 	brc	1,.-4		# pay attention to "partial completion"
   1171 	ltr	$key,$key
   1172 	jnz	.Lkmc_truncated
   1173 .Lkmc_done:
   1174 	lmg	%r0,%r1,16($sp)	# copy ivec to caller
   1175 	stg	%r0,0($ivp)
   1176 	stg	%r1,8($ivp)
   1177 	br	$ra
   1178 .align	16
   1179 .Lkmc_truncated:
   1180 	ahi	$key,-1		# it's the way it's encoded in mvc
   1181 	tmll	%r0,0x80
   1182 	jnz	.Lkmc_truncated_dec
   1183 	lghi	%r1,0
   1184 	stg	%r1,128($sp)
   1185 	stg	%r1,136($sp)
   1186 	bras	%r1,1f
   1187 	mvc	128(1,$sp),0($inp)
   1188 1:	ex	$key,0(%r1)
   1189 	la	%r1,16($sp)	# restore parameter block
   1190 	la	$inp,128($sp)
   1191 	lghi	$len,16
   1192 	.long	0xb92f0042	# kmc %r4,%r2
   1193 	j	.Lkmc_done
   1194 .align	16
   1195 .Lkmc_truncated_dec:
   1196 	stg	$out,64($sp)
   1197 	la	$out,128($sp)
   1198 	lghi	$len,16
   1199 	.long	0xb92f0042	# kmc %r4,%r2
   1200 	lg	$out,64($sp)
   1201 	bras	%r1,2f
   1202 	mvc	0(1,$out),128($sp)
   1203 2:	ex	$key,0(%r1)
   1204 	j	.Lkmc_done
   1205 .align	16
   1206 .Lcbc_software:
   1207 ___
# Software path.  Registers $key through $ra are saved at 40($sp); the word
# at 164($sp) is compared with zero to select decryption.
# NOTE(review): 164($sp) is presumably the low half of the stack-passed enc
# argument - confirm against the ABI.
# Encryption keeps the chaining value in $s0-$s3 across iterations.
# Decryption keeps the previous ciphertext block at 128($sp) and carries the
# current one in $t0/$t1.  Both directions write the final chaining value
# back through $ivp before returning.  In both loops $len is reduced by 16
# per block and a borrow from that subtraction diverts to the tail handler
# for the partial block.
   1208 $code.=<<___;
   1209 	stmg	$key,$ra,40($sp)
   1210 	lhi	%r0,0
   1211 	cl	%r0,164($sp)
   1212 	je	.Lcbc_decrypt
   1213 
   1214 	larl	$tbl,AES_Te
   1215 
   1216 	llgf	$s0,0($ivp)
   1217 	llgf	$s1,4($ivp)
   1218 	llgf	$s2,8($ivp)
   1219 	llgf	$s3,12($ivp)
   1220 
   1221 	lghi	$t0,16
   1222 	slgr	$len,$t0
   1223 	brc	4,.Lcbc_enc_tail	# if borrow
   1224 .Lcbc_enc_loop:
   1225 	stmg	$inp,$out,16($sp)
   1226 	x	$s0,0($inp)
   1227 	x	$s1,4($inp)
   1228 	x	$s2,8($inp)
   1229 	x	$s3,12($inp)
   1230 	lgr	%r4,$key
   1231 
   1232 	bras	$ra,_s390x_AES_encrypt
   1233 
   1234 	lmg	$inp,$key,16($sp)
   1235 	st	$s0,0($out)
   1236 	st	$s1,4($out)
   1237 	st	$s2,8($out)
   1238 	st	$s3,12($out)
   1239 
   1240 	la	$inp,16($inp)
   1241 	la	$out,16($out)
   1242 	lghi	$t0,16
   1243 	ltgr	$len,$len
   1244 	jz	.Lcbc_enc_done
   1245 	slgr	$len,$t0
   1246 	brc	4,.Lcbc_enc_tail	# if borrow
   1247 	j	.Lcbc_enc_loop
   1248 .align	16
   1249 .Lcbc_enc_done:
   1250 	lg	$ivp,48($sp)
   1251 	st	$s0,0($ivp)
   1252 	st	$s1,4($ivp)	
   1253 	st	$s2,8($ivp)
   1254 	st	$s3,12($ivp)
   1255 
   1256 	lmg	%r7,$ra,56($sp)
   1257 	br	$ra
   1258 
   1259 .align	16
   1260 .Lcbc_enc_tail:
   1261 	aghi	$len,15
   1262 	lghi	$t0,0
   1263 	stg	$t0,128($sp)
   1264 	stg	$t0,136($sp)
   1265 	bras	$t1,3f
   1266 	mvc	128(1,$sp),0($inp)
   1267 3:	ex	$len,0($t1)
   1268 	lghi	$len,0
   1269 	la	$inp,128($sp)
   1270 	j	.Lcbc_enc_loop
   1271 
   1272 .align	16
   1273 .Lcbc_decrypt:
   1274 	larl	$tbl,AES_Td
   1275 
   1276 	lg	$t0,0($ivp)
   1277 	lg	$t1,8($ivp)
   1278 	stmg	$t0,$t1,128($sp)
   1279 
   1280 .Lcbc_dec_loop:
   1281 	stmg	$inp,$out,16($sp)
   1282 	llgf	$s0,0($inp)
   1283 	llgf	$s1,4($inp)
   1284 	llgf	$s2,8($inp)
   1285 	llgf	$s3,12($inp)
   1286 	lgr	%r4,$key
   1287 
   1288 	bras	$ra,_s390x_AES_decrypt
   1289 
   1290 	lmg	$inp,$key,16($sp)
   1291 	sllg	$s0,$s0,32
   1292 	sllg	$s2,$s2,32
   1293 	lr	$s0,$s1
   1294 	lr	$s2,$s3
   1295 
   1296 	lg	$t0,0($inp)
   1297 	lg	$t1,8($inp)
   1298 	xg	$s0,128($sp)
   1299 	xg	$s2,136($sp)
   1300 	lghi	$s1,16
   1301 	slgr	$len,$s1
   1302 	brc	4,.Lcbc_dec_tail	# if borrow
   1303 	brc	2,.Lcbc_dec_done	# if zero
   1304 	stg	$s0,0($out)
   1305 	stg	$s2,8($out)
   1306 	stmg	$t0,$t1,128($sp)
   1307 
   1308 	la	$inp,16($inp)
   1309 	la	$out,16($out)
   1310 	j	.Lcbc_dec_loop
   1311 
   1312 .Lcbc_dec_done:
   1313 	stg	$s0,0($out)
   1314 	stg	$s2,8($out)
   1315 .Lcbc_dec_exit:
   1316 	lmg	$ivp,$ra,48($sp)
   1317 	stmg	$t0,$t1,0($ivp)
   1318 
   1319 	br	$ra
   1320 
   1321 .align	16
   1322 .Lcbc_dec_tail:
   1323 	aghi	$len,15
   1324 	stg	$s0,128($sp)
   1325 	stg	$s2,136($sp)
   1326 	bras	$s1,4f
   1327 	mvc	0(1,$out),128($sp)
   1328 4:	ex	$len,0($s1)
   1329 	j	.Lcbc_dec_exit
   1330 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
   1331 .comm  OPENSSL_s390xcap_P,8,8
   1332 ___
   1333 }
   1334 $code.=<<___;
   1335 .string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
   1336 ___
# The heredoc above appends an identifying .string directive to the output.
# Final pass: every backquoted span in the accumulated $code is replaced by
# the result of eval-ing its contents as Perl, and the finished assembly text
# is then printed.
   1337 
   1338 $code =~ s/\`([^\`]*)\`/eval $1/gem;
   1339 print $code;
   1340