#!/usr/bin/env perl

# Specific mode implementations for SPARC Architecture 2011. There is
# a T4 dependency though, an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing the crypto
# instructions can handle the ASI in question as well. This means that
# we ought to keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it's the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# can notice that single-thread performance for parallelizable modes is
# ~1.5% worse for the largest block sizes [though a few percent better
# for shorter ones]. All this is based on suggestions from David
# Miller.

sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}
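
# A minimal usage sketch (hypothetical caller; the per-algorithm t4
# scripts that require this file are assumed to follow this pattern):
#
#	asm_init(@ARGV);	# selects the 64-bit ABI on -m64/-xarch=v9
#	# $::bias, $::frame and $::size_t_cc now match the chosen ABI,
#	# so every generated routine can open with:
#	#	save	%sp, -$::frame, %sp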

# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
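
# Hedged usage sketch: the generator above only appends one complete
# ${alg}${bits}_t4_cbc_encrypt routine to $::code, so a consumer script
# (hypothetical) is assumed to instantiate it once per key length:
#
#	alg_cbc_encrypt_implement("aes", $_) for (128, 192, 256);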

sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15,  $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16,  %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option to produce a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS-capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
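
# Worked example, derived from the encoding formula above: for
# "faligndata %f0,%f2,%f4" we have $rs1=0, $rs2=2, $rd=4 and
# $opf=0x048, so 0x81b00000|4<<25|0<<14|0x048<<5|2 == 0x89b00902, and
#
#	unvis("faligndata","%f0","%f2","%f4")
#
# returns ".word\t0x89b00902 !faligndata\t%f0,%f2,%f4".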

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}
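
# Hedged driver sketch (hypothetical consumer; the per-algorithm t4
# modules are assumed to follow this pattern after require-ing this
# file): populate $::code via the generators above, then let
# emit_assembler print it with the crypto/VIS mnemonics rewritten as
# .word directives for assemblers that lack them:
#
#	$::evp = 1;				# EVP-style ivec handling
#	asm_init(@ARGV);
#	alg_cbc_encrypt_implement("aes", 128);
#	emit_assembler();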

1;