Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. Rights for redistribution and usage in source and binary
      6 # forms are granted according to the OpenSSL license.
      7 # ====================================================================
      8 #
      9 # Version 1.1
     10 #
     11 # The major reason for undertaking this effort was to mitigate the
     12 # hazard of cache-timing attacks. This is [currently and initially!]
     13 # addressed in two ways. 1. S-boxes are compressed from 5KB to
     14 # 2KB+256B each. 2. References to them are scheduled for L2 cache
     15 # latency, meaning that the tables don't have to reside in L1 cache.
     16 # Once again, this is an initial draft and one should expect more
     17 # countermeasures to be implemented...
     18 #
     19 # Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
     20 # round.
     21 #
     22 # Even though performance was not the primary goal [on the contrary,
     23 # extra shifts "induced" by the compressed S-box and a longer loop
     24 # epilogue "induced" by scheduling for L2 have a negative effect on
     25 # performance], the code turned out to run in ~23 cycles per byte
     26 # en-/decrypted with a 128-bit key. This is a pretty good result for
     27 # code with the mentioned qualities on an UltraSPARC core. Compared to
     28 # Sun C generated code my encrypt procedure runs just a few percent
     29 # faster, while the decrypt one is a whole 50% faster [yes, Sun C
     30 # failed to generate an optimal decrypt procedure]. Compared to GNU C
     31 # generated code both procedures are more than 60% faster:-)
     32 
     # Target selection.  The build is 32-bit unless some command-line
     # argument contains -m64 or -xarch=v9.
     #   $bias   - constant added to every %sp-relative address
     #   $frame  - frame size handed to the `save` instruction
     #   $locals - extra bytes reserved on top of $frame by the internal
     #             routines (room for the off-loaded return address)
     $bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
     ($bias, $frame) = ($bits == 64) ? (2047, 192) : (0, 112);
     $locals = 16;
     38 
     # Symbolic register names interpolated into the assembler text below.
     #
     # Sixteen accumulators, four per output word; each group starts with
     # a local register and continues with out/global registers.
     ($acc0,  $acc1,  $acc2,  $acc3)  = qw(%l0 %o0 %o1 %o2);
     ($acc4,  $acc5,  $acc6,  $acc7)  = qw(%l1 %o3 %o4 %o5);
     ($acc8,  $acc9,  $acc10, $acc11) = qw(%l2 %o7 %g1 %g2);
     ($acc12, $acc13, $acc14, $acc15) = qw(%l3 %g3 %g4 %g5);

     # Temporary state words (odd rounds) and primary state words.
     ($t0, $t1, $t2, $t3) = qw(%l4 %l5 %l6 %l7);
     ($s0, $s1, $s2, $s3) = qw(%i0 %i1 %i2 %i3);

     $tbl    = '%i4';	# table base address
     $key    = '%i5';	# key schedule pointer
     $rounds = '%i7';	# aliases with return address, which is off-loaded to stack
     71 
     # _data_word(@words)
     #
     # Appends one assembler line to the global $code for every argument:
     # a ".long" directive that carries the value twice, each formatted as
     # a zero-padded 8-digit hexadecimal number, i.e.
     #     "\t.long\t0xXXXXXXXX,0xXXXXXXXX\n"
     # Arguments are consumed left to right; processing stops at the first
     # undefined one.
     #
     # The sub is deliberately declared WITHOUT a prototype.  The former
     # empty prototype "()" claimed it took no arguments, which only worked
     # because callers use the &-form (which ignores prototypes); a plain
     # _data_word(...) call would have been rejected at compile time.
     sub _data_word
     {
         while (defined(my $word = shift)) {
             $code .= sprintf("\t.long\t0x%08x,0x%08x\n", $word, $word);
         }
     }
     76 
     # 64-bit builds only: emit .register directives marking %g2 and %g3
     # (used above as $acc11 and $acc13) as scratch registers.
     if ($bits==64) {
         $code .= ".register\t%g2,#scratch\n"
                . ".register\t%g3,#scratch\n";
     }
     # Open the text section and place the 256-byte aligned AES_Te label;
     # the table contents are appended right after it.
     $code .= join("\n",
         ".section\t\".text\",#alloc,#execinstr",
         "",
         ".align\t256",
         "AES_Te:",
     ) . "\n";
     # Word table emitted under the AES_Te label: 256 32-bit constants,
     # four per source line.  _data_word writes every constant twice
     # (".long x,x"), so each entry occupies 8 bytes and the table 2KB.
     # This matches the look-up code, which masks indices with 2040
     # (255*8), fetches entries with 64-bit ldx and right-shifts them by
     # 8/16/24 bits.
     # NOTE(review): the values look like the standard AES encryption
     # T-table (Te0) -- not verified entry by entry.
     87 &_data_word(
     88 	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
     89 	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
     90 	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
     91 	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
     92 	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
     93 	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
     94 	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
     95 	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
     96 	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
     97 	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
     98 	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
     99 	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
    100 	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
    101 	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
    102 	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
    103 	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
    104 	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
    105 	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
    106 	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
    107 	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
    108 	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
    109 	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
    110 	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
    111 	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
    112 	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
    113 	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
    114 	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
    115 	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
    116 	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
    117 	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
    118 	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
    119 	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
    120 	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
    121 	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
    122 	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
    123 	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
    124 	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
    125 	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
    126 	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
    127 	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
    128 	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
    129 	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
    130 	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
    131 	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
    132 	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
    133 	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
    134 	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
    135 	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
    136 	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
    137 	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
    138 	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
    139 	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
    140 	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
    141 	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
    142 	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
    143 	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
    144 	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
    145 	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
    146 	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
    147 	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
    148 	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
    149 	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
    150 	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
    151 	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
    # Emit the remainder of the AES_Te object and the encryption code.
    # This is an interpolating here-document: $frame, $locals, $bias and
    # the register names ($acc0..$acc15, $t0..$t3, $s0..$s3, $tbl, $key,
    # $rounds) are substituted into the assembler text.
    #
    # Contents, in order:
    #  - 256 single-byte entries placed directly after the 2KB of words
    #    emitted above; the code addresses them as $tbl+2048 and its
    #    comments call them "te4".
    #  - _sparcv9_AES_encrypt: internal routine.  After its own `save` it
    #    takes the four state words in $s0..$s3, the table address in $tbl
    #    and the key schedule in $key, and leaves the result in $s0..$s3.
    #    The round count is read from [$key+240], halved, and decremented
    #    once per pass through .Lenc_loop; .Lenc_last finishes with byte
    #    look-ups in the te4 area.  %i7 is stored to the stack on entry and
    #    reloaded before `ret` because $rounds aliases it.
    #  - AES_encrypt: global entry point.  Branches to .Lunaligned_enc when
    #    either of the first two pointer arguments is not 4-byte aligned
    #    (bytes are then loaded/stored individually, most significant
    #    first); otherwise it uses word loads/stores.  Both paths obtain
    #    the address of AES_Te PC-relatively (call .+8) and pass the third
    #    argument on as the key pointer.
    #
    # The instruction order is hand-scheduled; do not reorder lines.
    152 $code.=<<___;
    153 	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
    154 	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
    155 	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
    156 	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
    157 	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
    158 	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
    159 	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
    160 	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
    161 	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
    162 	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
    163 	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
    164 	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
    165 	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
    166 	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
    167 	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
    168 	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
    169 	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
    170 	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
    171 	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
    172 	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
    173 	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
    174 	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
    175 	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
    176 	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
    177 	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
    178 	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
    179 	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
    180 	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
    181 	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
    182 	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
    183 	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
    184 	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
    185 .type	AES_Te,#object
    186 .size	AES_Te,(.-AES_Te)
    187 
    188 .align	64
    189 .skip	16
    190 _sparcv9_AES_encrypt:
    191 	save	%sp,-$frame-$locals,%sp
    192 	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
    193 	ld	[$key+240],$rounds
    194 	ld	[$key+0],$t0
    195 	ld	[$key+4],$t1			!
    196 	ld	[$key+8],$t2
    197 	srl	$rounds,1,$rounds
    198 	xor	$t0,$s0,$s0
    199 	ld	[$key+12],$t3
    200 	srl	$s0,21,$acc0
    201 	xor	$t1,$s1,$s1
    202 	ld	[$key+16],$t0
    203 	srl	$s1,13,$acc1			!
    204 	xor	$t2,$s2,$s2
    205 	ld	[$key+20],$t1
    206 	xor	$t3,$s3,$s3
    207 	ld	[$key+24],$t2
    208 	and	$acc0,2040,$acc0
    209 	ld	[$key+28],$t3
    210 	nop
    211 .Lenc_loop:
    212 	srl	$s2,5,$acc2			!
    213 	and	$acc1,2040,$acc1
    214 	ldx	[$tbl+$acc0],$acc0
    215 	sll	$s3,3,$acc3
    216 	and	$acc2,2040,$acc2
    217 	ldx	[$tbl+$acc1],$acc1
    218 	srl	$s1,21,$acc4
    219 	and	$acc3,2040,$acc3
    220 	ldx	[$tbl+$acc2],$acc2		!
    221 	srl	$s2,13,$acc5
    222 	and	$acc4,2040,$acc4
    223 	ldx	[$tbl+$acc3],$acc3
    224 	srl	$s3,5,$acc6
    225 	and	$acc5,2040,$acc5
    226 	ldx	[$tbl+$acc4],$acc4
    227 	fmovs	%f0,%f0
    228 	sll	$s0,3,$acc7			!
    229 	and	$acc6,2040,$acc6
    230 	ldx	[$tbl+$acc5],$acc5
    231 	srl	$s2,21,$acc8
    232 	and	$acc7,2040,$acc7
    233 	ldx	[$tbl+$acc6],$acc6
    234 	srl	$s3,13,$acc9
    235 	and	$acc8,2040,$acc8
    236 	ldx	[$tbl+$acc7],$acc7		!
    237 	srl	$s0,5,$acc10
    238 	and	$acc9,2040,$acc9
    239 	ldx	[$tbl+$acc8],$acc8
    240 	sll	$s1,3,$acc11
    241 	and	$acc10,2040,$acc10
    242 	ldx	[$tbl+$acc9],$acc9
    243 	fmovs	%f0,%f0
    244 	srl	$s3,21,$acc12			!
    245 	and	$acc11,2040,$acc11
    246 	ldx	[$tbl+$acc10],$acc10
    247 	srl	$s0,13,$acc13
    248 	and	$acc12,2040,$acc12
    249 	ldx	[$tbl+$acc11],$acc11
    250 	srl	$s1,5,$acc14
    251 	and	$acc13,2040,$acc13
    252 	ldx	[$tbl+$acc12],$acc12		!
    253 	sll	$s2,3,$acc15
    254 	and	$acc14,2040,$acc14
    255 	ldx	[$tbl+$acc13],$acc13
    256 	and	$acc15,2040,$acc15
    257 	add	$key,32,$key
    258 	ldx	[$tbl+$acc14],$acc14
    259 	fmovs	%f0,%f0
    260 	subcc	$rounds,1,$rounds		!
    261 	ldx	[$tbl+$acc15],$acc15
    262 	bz,a,pn	%icc,.Lenc_last
    263 	add	$tbl,2048,$rounds
    264 
    265 		srlx	$acc1,8,$acc1
    266 		xor	$acc0,$t0,$t0
    267 	ld	[$key+0],$s0
    268 	fmovs	%f0,%f0
    269 		srlx	$acc2,16,$acc2		!
    270 		xor	$acc1,$t0,$t0
    271 	ld	[$key+4],$s1
    272 		srlx	$acc3,24,$acc3
    273 		xor	$acc2,$t0,$t0
    274 	ld	[$key+8],$s2
    275 		srlx	$acc5,8,$acc5
    276 		xor	$acc3,$t0,$t0
    277 	ld	[$key+12],$s3			!
    278 		srlx	$acc6,16,$acc6
    279 		xor	$acc4,$t1,$t1
    280 	fmovs	%f0,%f0
    281 		srlx	$acc7,24,$acc7
    282 		xor	$acc5,$t1,$t1
    283 		srlx	$acc9,8,$acc9
    284 		xor	$acc6,$t1,$t1
    285 		srlx	$acc10,16,$acc10	!
    286 		xor	$acc7,$t1,$t1
    287 		srlx	$acc11,24,$acc11
    288 		xor	$acc8,$t2,$t2
    289 		srlx	$acc13,8,$acc13
    290 		xor	$acc9,$t2,$t2
    291 		srlx	$acc14,16,$acc14
    292 		xor	$acc10,$t2,$t2
    293 		srlx	$acc15,24,$acc15	!
    294 		xor	$acc11,$t2,$t2
    295 		xor	$acc12,$acc14,$acc14
    296 		xor	$acc13,$t3,$t3
    297 	srl	$t0,21,$acc0
    298 		xor	$acc14,$t3,$t3
    299 	srl	$t1,13,$acc1
    300 		xor	$acc15,$t3,$t3
    301 
    302 	and	$acc0,2040,$acc0		!
    303 	srl	$t2,5,$acc2
    304 	and	$acc1,2040,$acc1
    305 	ldx	[$tbl+$acc0],$acc0
    306 	sll	$t3,3,$acc3
    307 	and	$acc2,2040,$acc2
    308 	ldx	[$tbl+$acc1],$acc1
    309 	fmovs	%f0,%f0
    310 	srl	$t1,21,$acc4			!
    311 	and	$acc3,2040,$acc3
    312 	ldx	[$tbl+$acc2],$acc2
    313 	srl	$t2,13,$acc5
    314 	and	$acc4,2040,$acc4
    315 	ldx	[$tbl+$acc3],$acc3
    316 	srl	$t3,5,$acc6
    317 	and	$acc5,2040,$acc5
    318 	ldx	[$tbl+$acc4],$acc4		!
    319 	sll	$t0,3,$acc7
    320 	and	$acc6,2040,$acc6
    321 	ldx	[$tbl+$acc5],$acc5
    322 	srl	$t2,21,$acc8
    323 	and	$acc7,2040,$acc7
    324 	ldx	[$tbl+$acc6],$acc6
    325 	fmovs	%f0,%f0
    326 	srl	$t3,13,$acc9			!
    327 	and	$acc8,2040,$acc8
    328 	ldx	[$tbl+$acc7],$acc7
    329 	srl	$t0,5,$acc10
    330 	and	$acc9,2040,$acc9
    331 	ldx	[$tbl+$acc8],$acc8
    332 	sll	$t1,3,$acc11
    333 	and	$acc10,2040,$acc10
    334 	ldx	[$tbl+$acc9],$acc9		!
    335 	srl	$t3,21,$acc12
    336 	and	$acc11,2040,$acc11
    337 	ldx	[$tbl+$acc10],$acc10
    338 	srl	$t0,13,$acc13
    339 	and	$acc12,2040,$acc12
    340 	ldx	[$tbl+$acc11],$acc11
    341 	fmovs	%f0,%f0
    342 	srl	$t1,5,$acc14			!
    343 	and	$acc13,2040,$acc13
    344 	ldx	[$tbl+$acc12],$acc12
    345 	sll	$t2,3,$acc15
    346 	and	$acc14,2040,$acc14
    347 	ldx	[$tbl+$acc13],$acc13
    348 		srlx	$acc1,8,$acc1
    349 	and	$acc15,2040,$acc15
    350 	ldx	[$tbl+$acc14],$acc14		!
    351 
    352 		srlx	$acc2,16,$acc2
    353 		xor	$acc0,$s0,$s0
    354 	ldx	[$tbl+$acc15],$acc15
    355 		srlx	$acc3,24,$acc3
    356 		xor	$acc1,$s0,$s0
    357 	ld	[$key+16],$t0
    358 	fmovs	%f0,%f0
    359 		srlx	$acc5,8,$acc5		!
    360 		xor	$acc2,$s0,$s0
    361 	ld	[$key+20],$t1
    362 		srlx	$acc6,16,$acc6
    363 		xor	$acc3,$s0,$s0
    364 	ld	[$key+24],$t2
    365 		srlx	$acc7,24,$acc7
    366 		xor	$acc4,$s1,$s1
    367 	ld	[$key+28],$t3			!
    368 		srlx	$acc9,8,$acc9
    369 		xor	$acc5,$s1,$s1
    370 	ldx	[$tbl+2048+0],%g0		! prefetch te4
    371 		srlx	$acc10,16,$acc10
    372 		xor	$acc6,$s1,$s1
    373 	ldx	[$tbl+2048+32],%g0		! prefetch te4
    374 		srlx	$acc11,24,$acc11
    375 		xor	$acc7,$s1,$s1
    376 	ldx	[$tbl+2048+64],%g0		! prefetch te4
    377 		srlx	$acc13,8,$acc13
    378 		xor	$acc8,$s2,$s2
    379 	ldx	[$tbl+2048+96],%g0		! prefetch te4
    380 		srlx	$acc14,16,$acc14	!
    381 		xor	$acc9,$s2,$s2
    382 	ldx	[$tbl+2048+128],%g0		! prefetch te4
    383 		srlx	$acc15,24,$acc15
    384 		xor	$acc10,$s2,$s2
    385 	ldx	[$tbl+2048+160],%g0		! prefetch te4
    386 	srl	$s0,21,$acc0
    387 		xor	$acc11,$s2,$s2
    388 	ldx	[$tbl+2048+192],%g0		! prefetch te4
    389 		xor	$acc12,$acc14,$acc14
    390 		xor	$acc13,$s3,$s3
    391 	ldx	[$tbl+2048+224],%g0		! prefetch te4
    392 	srl	$s1,13,$acc1			!
    393 		xor	$acc14,$s3,$s3
    394 		xor	$acc15,$s3,$s3
    395 	ba	.Lenc_loop
    396 	and	$acc0,2040,$acc0
    397 
    398 .align	32
    399 .Lenc_last:
    400 		srlx	$acc1,8,$acc1		!
    401 		xor	$acc0,$t0,$t0
    402 	ld	[$key+0],$s0
    403 		srlx	$acc2,16,$acc2
    404 		xor	$acc1,$t0,$t0
    405 	ld	[$key+4],$s1
    406 		srlx	$acc3,24,$acc3
    407 		xor	$acc2,$t0,$t0
    408 	ld	[$key+8],$s2			!
    409 		srlx	$acc5,8,$acc5
    410 		xor	$acc3,$t0,$t0
    411 	ld	[$key+12],$s3
    412 		srlx	$acc6,16,$acc6
    413 		xor	$acc4,$t1,$t1
    414 		srlx	$acc7,24,$acc7
    415 		xor	$acc5,$t1,$t1
    416 		srlx	$acc9,8,$acc9		!
    417 		xor	$acc6,$t1,$t1
    418 		srlx	$acc10,16,$acc10
    419 		xor	$acc7,$t1,$t1
    420 		srlx	$acc11,24,$acc11
    421 		xor	$acc8,$t2,$t2
    422 		srlx	$acc13,8,$acc13
    423 		xor	$acc9,$t2,$t2
    424 		srlx	$acc14,16,$acc14	!
    425 		xor	$acc10,$t2,$t2
    426 		srlx	$acc15,24,$acc15
    427 		xor	$acc11,$t2,$t2
    428 		xor	$acc12,$acc14,$acc14
    429 		xor	$acc13,$t3,$t3
    430 	srl	$t0,24,$acc0
    431 		xor	$acc14,$t3,$t3
    432 	srl	$t1,16,$acc1			!
    433 		xor	$acc15,$t3,$t3
    434 
    435 	srl	$t2,8,$acc2
    436 	and	$acc1,255,$acc1
    437 	ldub	[$rounds+$acc0],$acc0
    438 	srl	$t1,24,$acc4
    439 	and	$acc2,255,$acc2
    440 	ldub	[$rounds+$acc1],$acc1
    441 	srl	$t2,16,$acc5			!
    442 	and	$t3,255,$acc3
    443 	ldub	[$rounds+$acc2],$acc2
    444 	ldub	[$rounds+$acc3],$acc3
    445 	srl	$t3,8,$acc6
    446 	and	$acc5,255,$acc5
    447 	ldub	[$rounds+$acc4],$acc4
    448 	fmovs	%f0,%f0
    449 	srl	$t2,24,$acc8			!
    450 	and	$acc6,255,$acc6
    451 	ldub	[$rounds+$acc5],$acc5
    452 	srl	$t3,16,$acc9
    453 	and	$t0,255,$acc7
    454 	ldub	[$rounds+$acc6],$acc6
    455 	ldub	[$rounds+$acc7],$acc7
    456 	fmovs	%f0,%f0
    457 	srl	$t0,8,$acc10			!
    458 	and	$acc9,255,$acc9
    459 	ldub	[$rounds+$acc8],$acc8
    460 	srl	$t3,24,$acc12
    461 	and	$acc10,255,$acc10
    462 	ldub	[$rounds+$acc9],$acc9
    463 	srl	$t0,16,$acc13
    464 	and	$t1,255,$acc11
    465 	ldub	[$rounds+$acc10],$acc10		!
    466 	srl	$t1,8,$acc14
    467 	and	$acc13,255,$acc13
    468 	ldub	[$rounds+$acc11],$acc11
    469 	ldub	[$rounds+$acc12],$acc12
    470 	and	$acc14,255,$acc14
    471 	ldub	[$rounds+$acc13],$acc13
    472 	and	$t2,255,$acc15
    473 	ldub	[$rounds+$acc14],$acc14		!
    474 
    475 		sll	$acc0,24,$acc0
    476 		xor	$acc3,$s0,$s0
    477 	ldub	[$rounds+$acc15],$acc15
    478 		sll	$acc1,16,$acc1
    479 		xor	$acc0,$s0,$s0
    480 	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
    481 	fmovs	%f0,%f0
    482 		sll	$acc2,8,$acc2		!
    483 		xor	$acc1,$s0,$s0
    484 		sll	$acc4,24,$acc4
    485 		xor	$acc2,$s0,$s0
    486 		sll	$acc5,16,$acc5
    487 		xor	$acc7,$s1,$s1
    488 		sll	$acc6,8,$acc6
    489 		xor	$acc4,$s1,$s1
    490 		sll	$acc8,24,$acc8		!
    491 		xor	$acc5,$s1,$s1
    492 		sll	$acc9,16,$acc9
    493 		xor	$acc11,$s2,$s2
    494 		sll	$acc10,8,$acc10
    495 		xor	$acc6,$s1,$s1
    496 		sll	$acc12,24,$acc12
    497 		xor	$acc8,$s2,$s2
    498 		sll	$acc13,16,$acc13	!
    499 		xor	$acc9,$s2,$s2
    500 		sll	$acc14,8,$acc14
    501 		xor	$acc10,$s2,$s2
    502 		xor	$acc12,$acc14,$acc14
    503 		xor	$acc13,$s3,$s3
    504 		xor	$acc14,$s3,$s3
    505 		xor	$acc15,$s3,$s3
    506 
    507 	ret
    508 	restore
    509 .type	_sparcv9_AES_encrypt,#function
    510 .size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
    511 
    512 .align	32
    513 .globl	AES_encrypt
    514 AES_encrypt:
    515 	or	%o0,%o1,%g1
    516 	andcc	%g1,3,%g0
    517 	bnz,pn	%xcc,.Lunaligned_enc
    518 	save	%sp,-$frame,%sp
    519 
    520 	ld	[%i0+0],%o0
    521 	ld	[%i0+4],%o1
    522 	ld	[%i0+8],%o2
    523 	ld	[%i0+12],%o3
    524 
    525 1:	call	.+8
    526 	add	%o7,AES_Te-1b,%o4
    527 	call	_sparcv9_AES_encrypt
    528 	mov	%i2,%o5
    529 
    530 	st	%o0,[%i1+0]
    531 	st	%o1,[%i1+4]
    532 	st	%o2,[%i1+8]
    533 	st	%o3,[%i1+12]
    534 
    535 	ret
    536 	restore
    537 
    538 .align	32
    539 .Lunaligned_enc:
    540 	ldub	[%i0+0],%l0
    541 	ldub	[%i0+1],%l1
    542 	ldub	[%i0+2],%l2
    543 
    544 	sll	%l0,24,%l0
    545 	ldub	[%i0+3],%l3
    546 	sll	%l1,16,%l1
    547 	ldub	[%i0+4],%l4
    548 	sll	%l2,8,%l2
    549 	or	%l1,%l0,%l0
    550 	ldub	[%i0+5],%l5
    551 	sll	%l4,24,%l4
    552 	or	%l3,%l2,%l2
    553 	ldub	[%i0+6],%l6
    554 	sll	%l5,16,%l5
    555 	or	%l0,%l2,%o0
    556 	ldub	[%i0+7],%l7
    557 
    558 	sll	%l6,8,%l6
    559 	or	%l5,%l4,%l4
    560 	ldub	[%i0+8],%l0
    561 	or	%l7,%l6,%l6
    562 	ldub	[%i0+9],%l1
    563 	or	%l4,%l6,%o1
    564 	ldub	[%i0+10],%l2
    565 
    566 	sll	%l0,24,%l0
    567 	ldub	[%i0+11],%l3
    568 	sll	%l1,16,%l1
    569 	ldub	[%i0+12],%l4
    570 	sll	%l2,8,%l2
    571 	or	%l1,%l0,%l0
    572 	ldub	[%i0+13],%l5
    573 	sll	%l4,24,%l4
    574 	or	%l3,%l2,%l2
    575 	ldub	[%i0+14],%l6
    576 	sll	%l5,16,%l5
    577 	or	%l0,%l2,%o2
    578 	ldub	[%i0+15],%l7
    579 
    580 	sll	%l6,8,%l6
    581 	or	%l5,%l4,%l4
    582 	or	%l7,%l6,%l6
    583 	or	%l4,%l6,%o3
    584 
    585 1:	call	.+8
    586 	add	%o7,AES_Te-1b,%o4
    587 	call	_sparcv9_AES_encrypt
    588 	mov	%i2,%o5
    589 
    590 	srl	%o0,24,%l0
    591 	srl	%o0,16,%l1
    592 	stb	%l0,[%i1+0]
    593 	srl	%o0,8,%l2
    594 	stb	%l1,[%i1+1]
    595 	stb	%l2,[%i1+2]
    596 	srl	%o1,24,%l4
    597 	stb	%o0,[%i1+3]
    598 
    599 	srl	%o1,16,%l5
    600 	stb	%l4,[%i1+4]
    601 	srl	%o1,8,%l6
    602 	stb	%l5,[%i1+5]
    603 	stb	%l6,[%i1+6]
    604 	srl	%o2,24,%l0
    605 	stb	%o1,[%i1+7]
    606 
    607 	srl	%o2,16,%l1
    608 	stb	%l0,[%i1+8]
    609 	srl	%o2,8,%l2
    610 	stb	%l1,[%i1+9]
    611 	stb	%l2,[%i1+10]
    612 	srl	%o3,24,%l4
    613 	stb	%o2,[%i1+11]
    614 
    615 	srl	%o3,16,%l5
    616 	stb	%l4,[%i1+12]
    617 	srl	%o3,8,%l6
    618 	stb	%l5,[%i1+13]
    619 	stb	%l6,[%i1+14]
    620 	stb	%o3,[%i1+15]
    621 
    622 	ret
    623 	restore
    624 .type	AES_encrypt,#function
    625 .size	AES_encrypt,(.-AES_encrypt)
    626 
    627 ___
    628 
    # Start the decryption table: 256-byte alignment followed by the
    # AES_Td label; the table words are appended right after it.
    $code .= ".align\t256\n" . "AES_Td:\n";
    # Word table emitted under the AES_Td label: 256 32-bit constants,
    # four per source line.  _data_word writes every constant twice
    # (".long x,x"), so each entry occupies 8 bytes and the table 2KB.
    # This matches the look-up code in _sparcv9_AES_decrypt, which masks
    # indices with 2040 (255*8), fetches entries with 64-bit ldx and
    # right-shifts them by 8/16/24 bits.
    # NOTE(review): the values look like the standard AES decryption
    # T-table (Td0) -- not verified entry by entry.
    633 &_data_word(
    634 	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
    635 	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
    636 	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
    637 	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
    638 	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
    639 	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
    640 	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
    641 	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
    642 	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
    643 	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
    644 	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
    645 	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
    646 	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
    647 	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
    648 	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
    649 	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
    650 	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
    651 	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
    652 	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
    653 	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
    654 	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
    655 	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
    656 	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
    657 	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
    658 	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
    659 	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
    660 	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
    661 	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
    662 	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
    663 	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
    664 	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
    665 	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
    666 	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
    667 	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
    668 	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
    669 	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
    670 	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
    671 	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
    672 	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
    673 	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
    674 	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
    675 	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
    676 	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
    677 	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
    678 	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
    679 	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
    680 	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
    681 	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
    682 	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
    683 	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
    684 	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
    685 	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
    686 	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
    687 	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
    688 	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
    689 	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
    690 	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
    691 	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
    692 	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
    693 	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
    694 	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
    695 	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
    696 	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
    697 	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
    698 $code.=<<___;
    699 	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
    700 	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
    701 	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
    702 	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
    703 	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
    704 	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
    705 	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
    706 	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
    707 	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
    708 	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
    709 	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
    710 	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
    711 	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
    712 	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
    713 	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
    714 	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
    715 	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
    716 	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
    717 	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
    718 	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
    719 	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
    720 	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
    721 	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
    722 	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
    723 	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
    724 	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
    725 	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
    726 	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
    727 	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
    728 	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
    729 	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
    730 	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
    731 .type	AES_Td,#object
    732 .size	AES_Td,(.-AES_Td)
    733 
    734 .align	64
    735 .skip	16
    736 _sparcv9_AES_decrypt:
    737 	save	%sp,-$frame-$locals,%sp
    738 	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
    739 	ld	[$key+240],$rounds
    740 	ld	[$key+0],$t0
    741 	ld	[$key+4],$t1			!
    742 	ld	[$key+8],$t2
    743 	ld	[$key+12],$t3
    744 	srl	$rounds,1,$rounds
    745 	xor	$t0,$s0,$s0
    746 	ld	[$key+16],$t0
    747 	xor	$t1,$s1,$s1
    748 	ld	[$key+20],$t1
    749 	srl	$s0,21,$acc0			!
    750 	xor	$t2,$s2,$s2
    751 	ld	[$key+24],$t2
    752 	xor	$t3,$s3,$s3
    753 	and	$acc0,2040,$acc0
    754 	ld	[$key+28],$t3
    755 	srl	$s3,13,$acc1
    756 	nop
    757 .Ldec_loop:
    758 	srl	$s2,5,$acc2			!
    759 	and	$acc1,2040,$acc1
    760 	ldx	[$tbl+$acc0],$acc0
    761 	sll	$s1,3,$acc3
    762 	and	$acc2,2040,$acc2
    763 	ldx	[$tbl+$acc1],$acc1
    764 	srl	$s1,21,$acc4
    765 	and	$acc3,2040,$acc3
    766 	ldx	[$tbl+$acc2],$acc2		!
    767 	srl	$s0,13,$acc5
    768 	and	$acc4,2040,$acc4
    769 	ldx	[$tbl+$acc3],$acc3
    770 	srl	$s3,5,$acc6
    771 	and	$acc5,2040,$acc5
    772 	ldx	[$tbl+$acc4],$acc4
    773 	fmovs	%f0,%f0
    774 	sll	$s2,3,$acc7			!
    775 	and	$acc6,2040,$acc6
    776 	ldx	[$tbl+$acc5],$acc5
    777 	srl	$s2,21,$acc8
    778 	and	$acc7,2040,$acc7
    779 	ldx	[$tbl+$acc6],$acc6
    780 	srl	$s1,13,$acc9
    781 	and	$acc8,2040,$acc8
    782 	ldx	[$tbl+$acc7],$acc7		!
    783 	srl	$s0,5,$acc10
    784 	and	$acc9,2040,$acc9
    785 	ldx	[$tbl+$acc8],$acc8
    786 	sll	$s3,3,$acc11
    787 	and	$acc10,2040,$acc10
    788 	ldx	[$tbl+$acc9],$acc9
    789 	fmovs	%f0,%f0
    790 	srl	$s3,21,$acc12			!
    791 	and	$acc11,2040,$acc11
    792 	ldx	[$tbl+$acc10],$acc10
    793 	srl	$s2,13,$acc13
    794 	and	$acc12,2040,$acc12
    795 	ldx	[$tbl+$acc11],$acc11
    796 	srl	$s1,5,$acc14
    797 	and	$acc13,2040,$acc13
    798 	ldx	[$tbl+$acc12],$acc12		!
    799 	sll	$s0,3,$acc15
    800 	and	$acc14,2040,$acc14
    801 	ldx	[$tbl+$acc13],$acc13
    802 	and	$acc15,2040,$acc15
    803 	add	$key,32,$key
    804 	ldx	[$tbl+$acc14],$acc14
    805 	fmovs	%f0,%f0
    806 	subcc	$rounds,1,$rounds		!
    807 	ldx	[$tbl+$acc15],$acc15
    808 	bz,a,pn	%icc,.Ldec_last
    809 	add	$tbl,2048,$rounds
    810 
    811 		srlx	$acc1,8,$acc1
    812 		xor	$acc0,$t0,$t0
    813 	ld	[$key+0],$s0
    814 	fmovs	%f0,%f0
    815 		srlx	$acc2,16,$acc2		!
    816 		xor	$acc1,$t0,$t0
    817 	ld	[$key+4],$s1
    818 		srlx	$acc3,24,$acc3
    819 		xor	$acc2,$t0,$t0
    820 	ld	[$key+8],$s2
    821 		srlx	$acc5,8,$acc5
    822 		xor	$acc3,$t0,$t0
    823 	ld	[$key+12],$s3			!
    824 		srlx	$acc6,16,$acc6
    825 		xor	$acc4,$t1,$t1
    826 	fmovs	%f0,%f0
    827 		srlx	$acc7,24,$acc7
    828 		xor	$acc5,$t1,$t1
    829 		srlx	$acc9,8,$acc9
    830 		xor	$acc6,$t1,$t1
    831 		srlx	$acc10,16,$acc10	!
    832 		xor	$acc7,$t1,$t1
    833 		srlx	$acc11,24,$acc11
    834 		xor	$acc8,$t2,$t2
    835 		srlx	$acc13,8,$acc13
    836 		xor	$acc9,$t2,$t2
    837 		srlx	$acc14,16,$acc14
    838 		xor	$acc10,$t2,$t2
    839 		srlx	$acc15,24,$acc15	!
    840 		xor	$acc11,$t2,$t2
    841 		xor	$acc12,$acc14,$acc14
    842 		xor	$acc13,$t3,$t3
    843 	srl	$t0,21,$acc0
    844 		xor	$acc14,$t3,$t3
    845 		xor	$acc15,$t3,$t3
    846 	srl	$t3,13,$acc1
    847 
    848 	and	$acc0,2040,$acc0		!
    849 	srl	$t2,5,$acc2
    850 	and	$acc1,2040,$acc1
    851 	ldx	[$tbl+$acc0],$acc0
    852 	sll	$t1,3,$acc3
    853 	and	$acc2,2040,$acc2
    854 	ldx	[$tbl+$acc1],$acc1
    855 	fmovs	%f0,%f0
    856 	srl	$t1,21,$acc4			!
    857 	and	$acc3,2040,$acc3
    858 	ldx	[$tbl+$acc2],$acc2
    859 	srl	$t0,13,$acc5
    860 	and	$acc4,2040,$acc4
    861 	ldx	[$tbl+$acc3],$acc3
    862 	srl	$t3,5,$acc6
    863 	and	$acc5,2040,$acc5
    864 	ldx	[$tbl+$acc4],$acc4		!
    865 	sll	$t2,3,$acc7
    866 	and	$acc6,2040,$acc6
    867 	ldx	[$tbl+$acc5],$acc5
    868 	srl	$t2,21,$acc8
    869 	and	$acc7,2040,$acc7
    870 	ldx	[$tbl+$acc6],$acc6
    871 	fmovs	%f0,%f0
    872 	srl	$t1,13,$acc9			!
    873 	and	$acc8,2040,$acc8
    874 	ldx	[$tbl+$acc7],$acc7
    875 	srl	$t0,5,$acc10
    876 	and	$acc9,2040,$acc9
    877 	ldx	[$tbl+$acc8],$acc8
    878 	sll	$t3,3,$acc11
    879 	and	$acc10,2040,$acc10
    880 	ldx	[$tbl+$acc9],$acc9		!
    881 	srl	$t3,21,$acc12
    882 	and	$acc11,2040,$acc11
    883 	ldx	[$tbl+$acc10],$acc10
    884 	srl	$t2,13,$acc13
    885 	and	$acc12,2040,$acc12
    886 	ldx	[$tbl+$acc11],$acc11
    887 	fmovs	%f0,%f0
    888 	srl	$t1,5,$acc14			!
    889 	and	$acc13,2040,$acc13
    890 	ldx	[$tbl+$acc12],$acc12
    891 	sll	$t0,3,$acc15
    892 	and	$acc14,2040,$acc14
    893 	ldx	[$tbl+$acc13],$acc13
    894 		srlx	$acc1,8,$acc1
    895 	and	$acc15,2040,$acc15
    896 	ldx	[$tbl+$acc14],$acc14		!
    897 
    898 		srlx	$acc2,16,$acc2
    899 		xor	$acc0,$s0,$s0
    900 	ldx	[$tbl+$acc15],$acc15
    901 		srlx	$acc3,24,$acc3
    902 		xor	$acc1,$s0,$s0
    903 	ld	[$key+16],$t0
    904 	fmovs	%f0,%f0
    905 		srlx	$acc5,8,$acc5		!
    906 		xor	$acc2,$s0,$s0
    907 	ld	[$key+20],$t1
    908 		srlx	$acc6,16,$acc6
    909 		xor	$acc3,$s0,$s0
    910 	ld	[$key+24],$t2
    911 		srlx	$acc7,24,$acc7
    912 		xor	$acc4,$s1,$s1
    913 	ld	[$key+28],$t3			!
    914 		srlx	$acc9,8,$acc9
    915 		xor	$acc5,$s1,$s1
    916 	ldx	[$tbl+2048+0],%g0		! prefetch td4
    917 		srlx	$acc10,16,$acc10
    918 		xor	$acc6,$s1,$s1
    919 	ldx	[$tbl+2048+32],%g0		! prefetch td4
    920 		srlx	$acc11,24,$acc11
    921 		xor	$acc7,$s1,$s1
    922 	ldx	[$tbl+2048+64],%g0		! prefetch td4
    923 		srlx	$acc13,8,$acc13
    924 		xor	$acc8,$s2,$s2
    925 	ldx	[$tbl+2048+96],%g0		! prefetch td4
    926 		srlx	$acc14,16,$acc14	!
    927 		xor	$acc9,$s2,$s2
    928 	ldx	[$tbl+2048+128],%g0		! prefetch td4
    929 		srlx	$acc15,24,$acc15
    930 		xor	$acc10,$s2,$s2
    931 	ldx	[$tbl+2048+160],%g0		! prefetch td4
    932 	srl	$s0,21,$acc0
    933 		xor	$acc11,$s2,$s2
    934 	ldx	[$tbl+2048+192],%g0		! prefetch td4
    935 		xor	$acc12,$acc14,$acc14
    936 		xor	$acc13,$s3,$s3
    937 	ldx	[$tbl+2048+224],%g0		! prefetch td4
    938 	and	$acc0,2040,$acc0		!
    939 		xor	$acc14,$s3,$s3
    940 		xor	$acc15,$s3,$s3
    941 	ba	.Ldec_loop
    942 	srl	$s3,13,$acc1
    943 
    944 .align	32
    945 .Ldec_last:
    946 		srlx	$acc1,8,$acc1		!
    947 		xor	$acc0,$t0,$t0
    948 	ld	[$key+0],$s0
    949 		srlx	$acc2,16,$acc2
    950 		xor	$acc1,$t0,$t0
    951 	ld	[$key+4],$s1
    952 		srlx	$acc3,24,$acc3
    953 		xor	$acc2,$t0,$t0
    954 	ld	[$key+8],$s2			!
    955 		srlx	$acc5,8,$acc5
    956 		xor	$acc3,$t0,$t0
    957 	ld	[$key+12],$s3
    958 		srlx	$acc6,16,$acc6
    959 		xor	$acc4,$t1,$t1
    960 		srlx	$acc7,24,$acc7
    961 		xor	$acc5,$t1,$t1
    962 		srlx	$acc9,8,$acc9		!
    963 		xor	$acc6,$t1,$t1
    964 		srlx	$acc10,16,$acc10
    965 		xor	$acc7,$t1,$t1
    966 		srlx	$acc11,24,$acc11
    967 		xor	$acc8,$t2,$t2
    968 		srlx	$acc13,8,$acc13
    969 		xor	$acc9,$t2,$t2
    970 		srlx	$acc14,16,$acc14	!
    971 		xor	$acc10,$t2,$t2
    972 		srlx	$acc15,24,$acc15
    973 		xor	$acc11,$t2,$t2
    974 		xor	$acc12,$acc14,$acc14
    975 		xor	$acc13,$t3,$t3
    976 	srl	$t0,24,$acc0
    977 		xor	$acc14,$t3,$t3
    978 		xor	$acc15,$t3,$t3		!
    979 	srl	$t3,16,$acc1
    980 
    981 	srl	$t2,8,$acc2
    982 	and	$acc1,255,$acc1
    983 	ldub	[$rounds+$acc0],$acc0
    984 	srl	$t1,24,$acc4
    985 	and	$acc2,255,$acc2
    986 	ldub	[$rounds+$acc1],$acc1
    987 	srl	$t0,16,$acc5			!
    988 	and	$t1,255,$acc3
    989 	ldub	[$rounds+$acc2],$acc2
    990 	ldub	[$rounds+$acc3],$acc3
    991 	srl	$t3,8,$acc6
    992 	and	$acc5,255,$acc5
    993 	ldub	[$rounds+$acc4],$acc4
    994 	fmovs	%f0,%f0
    995 	srl	$t2,24,$acc8			!
    996 	and	$acc6,255,$acc6
    997 	ldub	[$rounds+$acc5],$acc5
    998 	srl	$t1,16,$acc9
    999 	and	$t2,255,$acc7
   1000 	ldub	[$rounds+$acc6],$acc6
   1001 	ldub	[$rounds+$acc7],$acc7
   1002 	fmovs	%f0,%f0
   1003 	srl	$t0,8,$acc10			!
   1004 	and	$acc9,255,$acc9
   1005 	ldub	[$rounds+$acc8],$acc8
   1006 	srl	$t3,24,$acc12
   1007 	and	$acc10,255,$acc10
   1008 	ldub	[$rounds+$acc9],$acc9
   1009 	srl	$t2,16,$acc13
   1010 	and	$t3,255,$acc11
   1011 	ldub	[$rounds+$acc10],$acc10		!
   1012 	srl	$t1,8,$acc14
   1013 	and	$acc13,255,$acc13
   1014 	ldub	[$rounds+$acc11],$acc11
   1015 	ldub	[$rounds+$acc12],$acc12
   1016 	and	$acc14,255,$acc14
   1017 	ldub	[$rounds+$acc13],$acc13
   1018 	and	$t0,255,$acc15
   1019 	ldub	[$rounds+$acc14],$acc14		!
   1020 
   1021 		sll	$acc0,24,$acc0
   1022 		xor	$acc3,$s0,$s0
   1023 	ldub	[$rounds+$acc15],$acc15
   1024 		sll	$acc1,16,$acc1
   1025 		xor	$acc0,$s0,$s0
   1026 	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
   1027 	fmovs	%f0,%f0
   1028 		sll	$acc2,8,$acc2		!
   1029 		xor	$acc1,$s0,$s0
   1030 		sll	$acc4,24,$acc4
   1031 		xor	$acc2,$s0,$s0
   1032 		sll	$acc5,16,$acc5
   1033 		xor	$acc7,$s1,$s1
   1034 		sll	$acc6,8,$acc6
   1035 		xor	$acc4,$s1,$s1
   1036 		sll	$acc8,24,$acc8		!
   1037 		xor	$acc5,$s1,$s1
   1038 		sll	$acc9,16,$acc9
   1039 		xor	$acc11,$s2,$s2
   1040 		sll	$acc10,8,$acc10
   1041 		xor	$acc6,$s1,$s1
   1042 		sll	$acc12,24,$acc12
   1043 		xor	$acc8,$s2,$s2
   1044 		sll	$acc13,16,$acc13	!
   1045 		xor	$acc9,$s2,$s2
   1046 		sll	$acc14,8,$acc14
   1047 		xor	$acc10,$s2,$s2
   1048 		xor	$acc12,$acc14,$acc14
   1049 		xor	$acc13,$s3,$s3
   1050 		xor	$acc14,$s3,$s3
   1051 		xor	$acc15,$s3,$s3
   1052 
   1053 	ret
   1054 	restore
   1055 .type	_sparcv9_AES_decrypt,#function
   1056 .size	_sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
   1057 
    1058 .align	32
    1059 .globl	AES_decrypt	! public wrapper around _sparcv9_AES_decrypt: reads one 16-byte block via the first argument, writes 16 bytes via the second
    1060 AES_decrypt:
    1061 	or	%o0,%o1,%g1	! merge both pointer arguments so a single test covers both
    1062 	andcc	%g1,3,%g0	! any of the low two bits set => at least one pointer is not 4-byte aligned
    1063 	bnz,pn	%xcc,.Lunaligned_dec	! predicted not taken: misaligned case goes to the byte-wise path
    1064 	save	%sp,-$frame,%sp	! delay slot, executed on both paths: new register window, so %o0-%o2 are seen as %i0-%i2 below
    1065 
    1066 	ld	[%i0+0],%o0	! aligned path: load the input block as four 32-bit words into %o0-%o3
    1067 	ld	[%i0+4],%o1
    1068 	ld	[%i0+8],%o2
    1069 	ld	[%i0+12],%o3
    1070 
    1071 1:	call	.+8	! not a real call: only records the address of label 1 in %o7
    1072 	add	%o7,AES_Td-1b,%o4	! delay slot: %o4 = address of AES_Td, formed relative to label 1 (position independent)
    1073 	call	_sparcv9_AES_decrypt	! block words in %o0-%o3, table address in %o4
    1074 	mov	%i2,%o5	! delay slot: forward our third argument in %o5 (presumably the key schedule; confirm against the callee's register assignments)
    1075 
    1076 	st	%o0,[%i1+0]	! store %o0-%o3, as left by the call, to the output block as four words
    1077 	st	%o1,[%i1+4]
    1078 	st	%o2,[%i1+8]
    1079 	st	%o3,[%i1+12]
    1080 
    1081 	ret
    1082 	restore	! delay slot: release the register window opened by save
    1083 
    1084 .align	32
    1085 .Lunaligned_dec:	! byte-wise path, used when input or output is not 4-byte aligned
    1086 	ldub	[%i0+0],%l0	! gather input bytes 0..3; loads for the next word are interleaved with the shifts/ors
    1087 	ldub	[%i0+1],%l1
    1088 	ldub	[%i0+2],%l2
    1089 
    1090 	sll	%l0,24,%l0
    1091 	ldub	[%i0+3],%l3
    1092 	sll	%l1,16,%l1
    1093 	ldub	[%i0+4],%l4
    1094 	sll	%l2,8,%l2
    1095 	or	%l1,%l0,%l0
    1096 	ldub	[%i0+5],%l5
    1097 	sll	%l4,24,%l4
    1098 	or	%l3,%l2,%l2
    1099 	ldub	[%i0+6],%l6
    1100 	sll	%l5,16,%l5
    1101 	or	%l0,%l2,%o0	! %o0 = b0<<24 | b1<<16 | b2<<8 | b3, i.e. bytes 0..3 as a big-endian word
    1102 	ldub	[%i0+7],%l7
    1103 
    1104 	sll	%l6,8,%l6
    1105 	or	%l5,%l4,%l4
    1106 	ldub	[%i0+8],%l0
    1107 	or	%l7,%l6,%l6
    1108 	ldub	[%i0+9],%l1
    1109 	or	%l4,%l6,%o1	! %o1 = bytes 4..7, same big-endian packing
    1110 	ldub	[%i0+10],%l2
    1111 
    1112 	sll	%l0,24,%l0
    1113 	ldub	[%i0+11],%l3
    1114 	sll	%l1,16,%l1
    1115 	ldub	[%i0+12],%l4
    1116 	sll	%l2,8,%l2
    1117 	or	%l1,%l0,%l0
    1118 	ldub	[%i0+13],%l5
    1119 	sll	%l4,24,%l4
    1120 	or	%l3,%l2,%l2
    1121 	ldub	[%i0+14],%l6
    1122 	sll	%l5,16,%l5
    1123 	or	%l0,%l2,%o2	! %o2 = bytes 8..11
    1124 	ldub	[%i0+15],%l7
    1125 
    1126 	sll	%l6,8,%l6
    1127 	or	%l5,%l4,%l4
    1128 	or	%l7,%l6,%l6
    1129 	or	%l4,%l6,%o3	! %o3 = bytes 12..15
    1130 
    1131 1:	call	.+8	! as on the aligned path: capture the address of label 1 in %o7
    1132 	add	%o7,AES_Td-1b,%o4	! delay slot: %o4 = address of AES_Td
    1133 	call	_sparcv9_AES_decrypt
    1134 	mov	%i2,%o5	! delay slot: forward our third argument in %o5
    1135 
    1136 	srl	%o0,24,%l0	! scatter each result word to the output byte by byte, most significant byte first
    1137 	srl	%o0,16,%l1
    1138 	stb	%l0,[%i1+0]
    1139 	srl	%o0,8,%l2
    1140 	stb	%l1,[%i1+1]
    1141 	stb	%l2,[%i1+2]
    1142 	srl	%o1,24,%l4
    1143 	stb	%o0,[%i1+3]	! stb writes only the low byte, so no shift is needed for the last byte of a word
    1144 
    1145 	srl	%o1,16,%l5
    1146 	stb	%l4,[%i1+4]
    1147 	srl	%o1,8,%l6
    1148 	stb	%l5,[%i1+5]
    1149 	stb	%l6,[%i1+6]
    1150 	srl	%o2,24,%l0
    1151 	stb	%o1,[%i1+7]
    1152 
    1153 	srl	%o2,16,%l1
    1154 	stb	%l0,[%i1+8]
    1155 	srl	%o2,8,%l2
    1156 	stb	%l1,[%i1+9]
    1157 	stb	%l2,[%i1+10]
    1158 	srl	%o3,24,%l4
    1159 	stb	%o2,[%i1+11]
    1160 
    1161 	srl	%o3,16,%l5
    1162 	stb	%l4,[%i1+12]
    1163 	srl	%o3,8,%l6
    1164 	stb	%l5,[%i1+13]
    1165 	stb	%l6,[%i1+14]
    1166 	stb	%o3,[%i1+15]
    1167 
    1168 	ret
    1169 	restore	! delay slot: release the register window opened by save
    1170 .type	AES_decrypt,#function
    1171 .size	AES_decrypt,(.-AES_decrypt)
   1172 ___
   1173 
    1174 # fmovs instructions substituting for FP nops were originally added
    1175 # to meet specific instruction alignment requirements to maximize ILP.
    1176 # As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
    1177 # an undesired effect, so just omit them and sacrifice a fraction of a
    1178 # percent in performance...
   1179 $code =~ s/fmovs.*$//gm;
   1180 
   1181 print $code;
   1182 close STDOUT;	# ensure flush
   1183