Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 #
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 #
     10 # January 2015
     11 #
     12 # ChaCha20 for x86.
     13 #
     14 # Performance in cycles per byte out of large buffer.
     15 #
     16 #		1xIALU/gcc	4xSSSE3
     17 # Pentium	17.5/+80%
     18 # PIII		14.2/+60%
     19 # P4		18.6/+84%
     20 # Core2		9.56/+89%	4.83
     21 # Westmere	9.50/+45%	3.35
     22 # Sandy Bridge	10.5/+47%	3.20
     23 # Haswell	8.15/+50%	2.83
     24 # Skylake	7.53/+22%	2.75
     25 # Silvermont	17.4/+36%	8.35
     26 # Goldmont	13.4/+40%	4.36
     27 # Sledgehammer	10.2/+54%
     28 # Bulldozer	13.4/+50%	4.38(*)
     29 #
     30 # (*)	Bulldozer actually executes 4xXOP code path that delivers 3.55;
     31 #
     32 # Modified from upstream OpenSSL to remove the XOP code.
     33 
     34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of the script path, trailing separator included
     35 push(@INC,"${dir}","${dir}../../perlasm");
     36 require "x86asm.pl";
     37 
     38 $output=pop;	# last command-line argument names the output file
        # Everything printed to STDOUT from here on goes to $output.
        # NOTE(review): the result of this open is not checked.
     39 open STDOUT,">$output";
     40 
        # Passes the first remaining argument, plus a flag that is true when the
        # last remaining argument (after the pop above) is the string "386".
     41 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
     42 
        # $xmm gates the SSSE3 dispatch and the whole ChaCha20_ssse3 section below;
        # $ymm is not read by any code visible in this file.
     43 $xmm=$ymm=1;
     44 $gasver=999;  # enable everything
     45 
        # Integer register assignment used by QUARTERROUND.  b, c and d each have
        # an alternate register (the "_" names); QUARTERROUND swaps each pair at
        # the end of every call.
     46 $a="eax";
     47 ($b,$b_)=("ebx","ebp");
     48 ($c,$c_)=("ecx","esi");
     49 ($d,$d_)=("edx","edi");
     50 
     51 sub QUARTERROUND {
        # Emits one ChaCha quarter-round on integer registers, interleaved with
        # the stack spills of the previous quarter-round and the stack loads for
        # the next one.
        #
        # Arguments: ($ai,$bi,$ci,$di) are the state-word indices (0..15) of the
        # a, b, c, d operands; $i is the position of this call (0..7) inside the
        # eight-call loop body.  State word N lives at 4*N(%esp).
        #
        # Side effect: the global register names ($b,$b_), ($c,$c_), ($d,$d_)
        # are swapped on exit, so consecutive calls alternate registers.
     52 my ($ai,$bi,$ci,$di,$i)=@_;
        # "& ~3" keeps the row base (0, 4, 8 or 12) and "& 3" wraps the column, so
        # next/previous stay inside the same row of four words.
     53 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
     54 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
     55 
     56 	#       a   b   c   d
     57 	#
     58 	#       0   4   8  12 < even round
     59 	#       1   5   9  13
     60 	#       2   6  10  14
     61 	#       3   7  11  15
     62 	#       0   5  10  15 < odd round
     63 	#       1   6  11  12
     64 	#       2   7   8  13
     65 	#       3   4   9  14
     66 
        # At the boundaries between the even and odd rounds the simple +/-1 rule
        # above does not give the neighbouring quarter-round, so the indices are
        # re-derived with a different shift per operand:
        #   $i==0: previous becomes (3,4,9,14)   $i==3: next becomes (0,5,10,15)
        #   $i==4: previous becomes (3,7,11,15)  $i==7: next becomes (0,4,8,12)
     67 	if ($i==0) {
     68             my $j=4;
     69 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
     70 	} elsif ($i==3) {
     71             my $j=0;
     72 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
     73 	} elsif ($i==4) {
     74             my $j=4;
     75 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
     76 	} elsif ($i==7) {
     77             my $j=0;
     78 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
     79 	}
     80 
        # Lines indented by one extra space are the interleaved spills/loads for
        # the neighbouring quarter-rounds; the others are this quarter-round.
     81 	#&add	($a,$b);			# see elsewhere
     82 	&xor	($d,$a);
     83 	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
     84 	&rol	($d,16);
     85 	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
     86 	&add	($c,$d);
     87 	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
     88 	&xor	($b,$c);
     89 	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
     90 	&rol	($b,12);
     91 	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
     92 	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
     93 	&add	($a,$b);
     94 	&xor	($d,$a);
     95 	&mov	(&DWP(4*$ai,"esp"),$a);
     96 	&rol	($d,8);
     97 	&mov	($a,&DWP(4*$an,"esp"));
     98 	&add	($c,$d);
     99 	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
    100 	&mov	($d_,$d)			if ($di==$dn);
    101 	&xor	($b,$c);
        # The first "a += b" of the following quarter-round is issued early here;
        # on the last call ($i==7) the caller's loop head issues it instead.
    102 	 &add	($a,$b_)			if ($i<7);	# elsewhere
    103 	&rol	($b,7);
    104 
        # Hand the alternate registers to the next call.
    105 	($b,$b_)=($b_,$b);
    106 	($c,$c_)=($c_,$c);
    107 	($d,$d_)=($d_,$d);
    108 }
    109 
    110 &static_label("ssse3_shortcut");
    111 &static_label("ssse3_data");
    112 &static_label("pic_point");
    113 
        # ChaCha20_ctr32: integer-only code path plus run-time dispatch to SSSE3.
        # Stack parameters as used below: wparam(0) output, wparam(1) input,
        # wparam(2) length, wparam(3) key, wparam(4) counter and nonce.
    114 &function_begin("ChaCha20_ctr32");
    115 	&xor	("eax","eax");
    116 	&cmp	("eax",&wparam(2));		# len==0?
    117 	&je	(&label("no_data"));
    118 if ($xmm) {
        # call/pop leaves the run-time address of pic_point in eax.  The SSSE3 code
        # reached through ssse3_shortcut uses eax as the base for locating
        # ssse3_data.
    119 	&call	(&label("pic_point"));
    120 &set_label("pic_point");
    121 	&blindpop("eax");
    122 	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
    123 	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
    124 	&jz	(&label("x86"));
    125 	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
    126 	&jz	(&label("x86"));
    127 	&jmp	(&label("ssse3_shortcut"));
    128 &set_label("x86");
    129 }
    130 	&mov	("esi",&wparam(3));		# key
    131 	&mov	("edi",&wparam(4));		# counter and nonce
    132 
        # Frame layout used by the integer path (offsets from esp):
        #   4*N      (N=0..15)  working state word N
        #   64+4*N   (N=4..15)  saved key / counter / nonce word N
        #   128                 round-loop counter
        # NOTE(review): 33 is presumably a dword count (16+16+1) - confirm against
        # stack_push in x86asm.pl.
    133 	&stack_push(33);
    134 
    135 	&mov	("eax",&DWP(4*0,"esi"));	# copy key
    136 	&mov	("ebx",&DWP(4*1,"esi"));
    137 	&mov	("ecx",&DWP(4*2,"esi"));
    138 	&mov	("edx",&DWP(4*3,"esi"));
    139 	&mov	(&DWP(64+4*4,"esp"),"eax");
    140 	&mov	(&DWP(64+4*5,"esp"),"ebx");
    141 	&mov	(&DWP(64+4*6,"esp"),"ecx");
    142 	&mov	(&DWP(64+4*7,"esp"),"edx");
    143 	&mov	("eax",&DWP(4*4,"esi"));
    144 	&mov	("ebx",&DWP(4*5,"esi"));
    145 	&mov	("ecx",&DWP(4*6,"esi"));
    146 	&mov	("edx",&DWP(4*7,"esi"));
    147 	&mov	(&DWP(64+4*8,"esp"),"eax");
    148 	&mov	(&DWP(64+4*9,"esp"),"ebx");
    149 	&mov	(&DWP(64+4*10,"esp"),"ecx");
    150 	&mov	(&DWP(64+4*11,"esp"),"edx");
    151 	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
    152 	&mov	("ebx",&DWP(4*1,"edi"));
    153 	&mov	("ecx",&DWP(4*2,"edi"));
    154 	&mov	("edx",&DWP(4*3,"edi"));
        # The saved counter is stored one too low because the code at "entry"
        # increments it before every block, including the first.
    155 	&sub	("eax",1);
    156 	&mov	(&DWP(64+4*12,"esp"),"eax");
    157 	&mov	(&DWP(64+4*13,"esp"),"ebx");
    158 	&mov	(&DWP(64+4*14,"esp"),"ecx");
    159 	&mov	(&DWP(64+4*15,"esp"),"edx");
    160 	&jmp	(&label("entry"));
    161 
    162 &set_label("outer_loop",16);
        # Reached from the bottom of the full-block path with $b = advanced input
        # pointer, $a = advanced output pointer and $c = remaining length; they
        # are written back to the parameter slots because all three registers are
        # reused for the rounds.
    163 	&mov	(&wparam(1),$b);		# save input
    164 	&mov	(&wparam(0),$a);		# save output
    165 	&mov	(&wparam(2),$c);		# save len
    166 &set_label("entry");
        # Build the working state for one block.  Words 0, 4, 8, 9 and 12 stay in
        # registers ($a, $b_, $c, $c_, $d); the remaining words are written to
        # their 4*N(%esp) slots.  The four constants are the little-endian words
        # of the ASCII string "expand 32-byte k".
    167 	&mov	($a,0x61707865);
    168 	&mov	(&DWP(4*1,"esp"),0x3320646e);
    169 	&mov	(&DWP(4*2,"esp"),0x79622d32);
    170 	&mov	(&DWP(4*3,"esp"),0x6b206574);
    171 
    172 	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
    173 	&mov	($b_,&DWP(64+4*6,"esp"));
    174 	&mov	($c, &DWP(64+4*10,"esp"));
    175 	&mov	($c_,&DWP(64+4*11,"esp"));
    176 	&mov	($d, &DWP(64+4*13,"esp"));
    177 	&mov	($d_,&DWP(64+4*14,"esp"));
    178 	&mov	(&DWP(4*5,"esp"),$b);
    179 	&mov	(&DWP(4*6,"esp"),$b_);
    180 	&mov	(&DWP(4*10,"esp"),$c);
    181 	&mov	(&DWP(4*11,"esp"),$c_);
    182 	&mov	(&DWP(4*13,"esp"),$d);
    183 	&mov	(&DWP(4*14,"esp"),$d_);
    184 
    185 	&mov	($b, &DWP(64+4*7,"esp"));
    186 	&mov	($d_,&DWP(64+4*15,"esp"));
    187 	&mov	($d, &DWP(64+4*12,"esp"));
    188 	&mov	($b_,&DWP(64+4*4,"esp"));
    189 	&mov	($c, &DWP(64+4*8,"esp"));
    190 	&mov	($c_,&DWP(64+4*9,"esp"));
    191 	&add	($d,1);				# counter value
    192 	&mov	(&DWP(4*7,"esp"),$b);
    193 	&mov	(&DWP(4*15,"esp"),$d_);
    194 	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value
    195 
        # Ten iterations of the eight-quarter-round body below.
    196 	&mov	($b,10);			# loop counter
    197 	&jmp	(&label("loop"));
    198 
    199 &set_label("loop",16);
        # One iteration emits four column quarter-rounds followed by four diagonal
        # ones.  QUARTERROUND omits its own leading "a += b" (it is issued by the
        # preceding call), so the very first one is issued here.
    200 	&add	($a,$b_);			# elsewhere
        # $b doubles as the loop counter: park it at 128(%esp) while $b is used as
        # a state register.  The last QUARTERROUND call reloads it.
    201 	&mov	(&DWP(128,"esp"),$b);		# save loop counter
    202 	&mov	($b,$b_);
    203 	&QUARTERROUND(0, 4, 8, 12, 0);
    204 	&QUARTERROUND(1, 5, 9, 13, 1);
    205 	&QUARTERROUND(2, 6,10, 14, 2);
    206 	&QUARTERROUND(3, 7,11, 15, 3);
    207 	&QUARTERROUND(0, 5,10, 15, 4);
    208 	&QUARTERROUND(1, 6,11, 12, 5);
    209 	&QUARTERROUND(2, 7, 8, 13, 6);
    210 	&QUARTERROUND(3, 4, 9, 14, 7);
        # Eight calls mean eight swaps of each register pair, so $b, $c and $d name
        # the same registers here as before the first call.
    211 	&dec	($b);
    212 	&jnz	(&label("loop"));
    213 
    214 	&mov	($b,&wparam(2));		# load len
    215 
        # Words 0, 4, 8 and 9 are still in registers after the rounds; add the
        # initial state to them before deciding between the full-block path and
        # the byte-wise tail.
    216 	&add	($a,0x61707865);		# accumulate key material
    217 	&add	($b_,&DWP(64+4*4,"esp"));
    218 	&add	($c, &DWP(64+4*8,"esp"));
    219 	&add	($c_,&DWP(64+4*9,"esp"));
    220 
    221 	&cmp	($b,64);
    222 	&jb	(&label("tail"));
    223 
        # Full 64-byte block.  Words are processed in three groups, limited by the
        # registers that are free while $b holds the input pointer and $a the
        # output pointer:
        #   0, 4, 8, 9, 12, 14  (word 0 is parked at 4*0(%esp) until the end)
        #   1, 2, 3, 5, 6
        #   7, 10, 11, 13, 15
    224 	&mov	($b,&wparam(1));		# load input pointer
    225 	&add	($d, &DWP(64+4*12,"esp"));
    226 	&add	($d_,&DWP(64+4*14,"esp"));
    227 
    228 	&xor	($a, &DWP(4*0,$b));		# xor with input
    229 	&xor	($b_,&DWP(4*4,$b));
    230 	&mov	(&DWP(4*0,"esp"),$a);
    231 	&mov	($a,&wparam(0));		# load output pointer
    232 	&xor	($c, &DWP(4*8,$b));
    233 	&xor	($c_,&DWP(4*9,$b));
    234 	&xor	($d, &DWP(4*12,$b));
    235 	&xor	($d_,&DWP(4*14,$b));
    236 	&mov	(&DWP(4*4,$a),$b_);		# write output
    237 	&mov	(&DWP(4*8,$a),$c);
    238 	&mov	(&DWP(4*9,$a),$c_);
    239 	&mov	(&DWP(4*12,$a),$d);
    240 	&mov	(&DWP(4*14,$a),$d_);
    241 
    242 	&mov	($b_,&DWP(4*1,"esp"));
    243 	&mov	($c, &DWP(4*2,"esp"));
    244 	&mov	($c_,&DWP(4*3,"esp"));
    245 	&mov	($d, &DWP(4*5,"esp"));
    246 	&mov	($d_,&DWP(4*6,"esp"));
    247 	&add	($b_,0x3320646e);		# accumulate key material
    248 	&add	($c, 0x79622d32);
    249 	&add	($c_,0x6b206574);
    250 	&add	($d, &DWP(64+4*5,"esp"));
    251 	&add	($d_,&DWP(64+4*6,"esp"));
    252 	&xor	($b_,&DWP(4*1,$b));
    253 	&xor	($c, &DWP(4*2,$b));
    254 	&xor	($c_,&DWP(4*3,$b));
    255 	&xor	($d, &DWP(4*5,$b));
    256 	&xor	($d_,&DWP(4*6,$b));
    257 	&mov	(&DWP(4*1,$a),$b_);
    258 	&mov	(&DWP(4*2,$a),$c);
    259 	&mov	(&DWP(4*3,$a),$c_);
    260 	&mov	(&DWP(4*5,$a),$d);
    261 	&mov	(&DWP(4*6,$a),$d_);
    262 
    263 	&mov	($b_,&DWP(4*7,"esp"));
    264 	&mov	($c, &DWP(4*10,"esp"));
    265 	&mov	($c_,&DWP(4*11,"esp"));
    266 	&mov	($d, &DWP(4*13,"esp"));
    267 	&mov	($d_,&DWP(4*15,"esp"));
    268 	&add	($b_,&DWP(64+4*7,"esp"));
    269 	&add	($c, &DWP(64+4*10,"esp"));
    270 	&add	($c_,&DWP(64+4*11,"esp"));
    271 	&add	($d, &DWP(64+4*13,"esp"));
    272 	&add	($d_,&DWP(64+4*15,"esp"));
    273 	&xor	($b_,&DWP(4*7,$b));
    274 	&xor	($c, &DWP(4*10,$b));
    275 	&xor	($c_,&DWP(4*11,$b));
    276 	&xor	($d, &DWP(4*13,$b));
    277 	&xor	($d_,&DWP(4*15,$b));
        # Advance both pointers by 64 bytes and reduce the length; outer_loop
        # expects them in $b, $a and $c respectively.
    278 	&lea	($b,&DWP(4*16,$b));
    279 	&mov	(&DWP(4*7,$a),$b_);
    280 	&mov	($b_,&DWP(4*0,"esp"));
    281 	&mov	(&DWP(4*10,$a),$c);
    282 	&mov	($c,&wparam(2));		# len
    283 	&mov	(&DWP(4*11,$a),$c_);
    284 	&mov	(&DWP(4*13,$a),$d);
    285 	&mov	(&DWP(4*15,$a),$d_);
    286 	&mov	(&DWP(4*0,$a),$b_);
    287 	&lea	($a,&DWP(4*16,$a));
    288 	&sub	($c,64);
    289 	&jnz	(&label("outer_loop"));
    290 
    291 	&jmp	(&label("done"));
    292 
    293 &set_label("tail");
        # Fewer than 64 bytes remain ($b holds that length).  Finish the key
        # stream block in the 4*N(%esp) slots, then XOR it with the input one byte
        # at a time.
    294 	&add	($d, &DWP(64+4*12,"esp"));
    295 	&add	($d_,&DWP(64+4*14,"esp"));
    296 	&mov	(&DWP(4*0,"esp"),$a);
    297 	&mov	(&DWP(4*4,"esp"),$b_);
    298 	&mov	(&DWP(4*8,"esp"),$c);
    299 	&mov	(&DWP(4*9,"esp"),$c_);
    300 	&mov	(&DWP(4*12,"esp"),$d);
    301 	&mov	(&DWP(4*14,"esp"),$d_);
    302 
    303 	&mov	($b_,&DWP(4*1,"esp"));
    304 	&mov	($c, &DWP(4*2,"esp"));
    305 	&mov	($c_,&DWP(4*3,"esp"));
    306 	&mov	($d, &DWP(4*5,"esp"));
    307 	&mov	($d_,&DWP(4*6,"esp"));
    308 	&add	($b_,0x3320646e);		# accumulate key material
    309 	&add	($c, 0x79622d32);
    310 	&add	($c_,0x6b206574);
    311 	&add	($d, &DWP(64+4*5,"esp"));
    312 	&add	($d_,&DWP(64+4*6,"esp"));
    313 	&mov	(&DWP(4*1,"esp"),$b_);
    314 	&mov	(&DWP(4*2,"esp"),$c);
    315 	&mov	(&DWP(4*3,"esp"),$c_);
    316 	&mov	(&DWP(4*5,"esp"),$d);
    317 	&mov	(&DWP(4*6,"esp"),$d_);
    318 
    319 	&mov	($b_,&DWP(4*7,"esp"));
    320 	&mov	($c, &DWP(4*10,"esp"));
    321 	&mov	($c_,&DWP(4*11,"esp"));
    322 	&mov	($d, &DWP(4*13,"esp"));
    323 	&mov	($d_,&DWP(4*15,"esp"));
    324 	&add	($b_,&DWP(64+4*7,"esp"));
    325 	&add	($c, &DWP(64+4*10,"esp"));
    326 	&add	($c_,&DWP(64+4*11,"esp"));
    327 	&add	($d, &DWP(64+4*13,"esp"));
    328 	&add	($d_,&DWP(64+4*15,"esp"));
    329 	&mov	(&DWP(4*7,"esp"),$b_);
    330 	&mov	($b_,&wparam(1));		# load input
    331 	&mov	(&DWP(4*10,"esp"),$c);
    332 	&mov	($c,&wparam(0));		# load output
    333 	&mov	(&DWP(4*11,"esp"),$c_);
    334 	&xor	($c_,$c_);
    335 	&mov	(&DWP(4*13,"esp"),$d);
    336 	&mov	(&DWP(4*15,"esp"),$d_);
    337 
        # Byte loop: $c_ is the running byte index, $b_ the input pointer, $c the
        # output pointer and $b the remaining byte count.
    338 	&xor	("eax","eax");
    339 	&xor	("edx","edx");
    340 &set_label("tail_loop");
    341 	&movb	("al",&BP(0,$c_,$b_));
    342 	&movb	("dl",&BP(0,"esp",$c_));
    343 	&lea	($c_,&DWP(1,$c_));
    344 	&xor	("al","dl");
        # The index was already incremented by the lea, hence the -1 displacement.
    345 	&mov	(&BP(-1,$c,$c_),"al");
    346 	&dec	($b);
    347 	&jnz	(&label("tail_loop"));
    348 
    349 &set_label("done");
    350 	&stack_pop(33);
        # The zero-length early exit lands here, after the stack_pop, because it
        # branches away before stack_push is executed.
    351 &set_label("no_data");
    352 &function_end("ChaCha20_ctr32");
    353 
    354 if ($xmm) {
    355 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));	# xmm0..xmm7: a/b/c/d plus one alternate each
    356 my ($out,$inp,$len)=("edi","esi","ecx");
    357 
    358 sub QUARTERROUND_SSSE3 {
        # Emits one ChaCha quarter-round on xmm registers for the 4-block path,
        # interleaved with spills of the previous quarter-round and loads for the
        # next one.
        #
        # Arguments: ($ai,$bi,$ci,$di) are the state-word indices (0..15) of the
        # a, b, c, d operands; $i is the position of this call (0..7) inside the
        # eight-call loop body.  State word N lives at 16*N-128(%ebx).
        # eax must point at the constant table: rot16 mask at 0, rot8 mask at 16.
        #
        # Side effect: all four register pairs ($xa,$xa_) ... ($xd,$xd_) are
        # swapped on exit.
    359 my ($ai,$bi,$ci,$di,$i)=@_;
    360 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
    361 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
    362 
    363 	#       a   b   c   d
    364 	#
    365 	#       0   4   8  12 < even round
    366 	#       1   5   9  13
    367 	#       2   6  10  14
    368 	#       3   7  11  15
    369 	#       0   5  10  15 < odd round
    370 	#       1   6  11  12
    371 	#       2   7   8  13
    372 	#       3   4   9  14
    373 
        # Same boundary corrections as in QUARTERROUND:
        #   $i==0: previous becomes (3,4,9,14)   $i==3: next becomes (0,5,10,15)
        #   $i==4: previous becomes (3,7,11,15)  $i==7: next becomes (0,4,8,12)
    374 	if ($i==0) {
    375             my $j=4;
    376 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    377 	} elsif ($i==3) {
    378             my $j=0;
    379 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    380 	} elsif ($i==4) {
    381             my $j=4;
    382 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    383 	} elsif ($i==7) {
    384             my $j=0;
    385 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    386 	}
    387 
        # The 16- and 8-bit rotates are byte shuffles; the 12- and 7-bit rotates
        # are built from a shift-left / shift-right / or triple using the idle
        # "a" register as scratch.
    388 	#&paddd	($xa,$xb);			# see elsewhere
    389 	#&pxor	($xd,$xa);			# see elsewhere
    390 	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
    391 	&pshufb	($xd,&QWP(0,"eax"));		# rot16
    392 	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
    393 	&paddd	($xc,$xd);
    394 	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
    395 	&pxor	($xb,$xc);
    396 	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
    397 	&movdqa	($xa_,$xb);			# borrow as temporary
    398 	&pslld	($xb,12);
    399 	&psrld	($xa_,20);
    400 	&por	($xb,$xa_);
    401 	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
    402 	&paddd	($xa,$xb);
    403 	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
    404 	&pxor	($xd,$xa);
    405 	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
    406 	&pshufb	($xd,&QWP(16,"eax"));		# rot8
    407 	&paddd	($xc,$xd);
    408 	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
    409 	&movdqa	($xd_,$xd)			if ($di==$dn);
    410 	&pxor	($xb,$xc);
        # The first two operations of the following quarter-round are issued early
        # (here and a few lines below); on the last call ($i==7) the caller's loop
        # head issues them instead.
    411 	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
    412 	&movdqa	($xa,$xb);			# borrow as temporary
    413 	&pslld	($xb,7);
    414 	&psrld	($xa,25);
    415 	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
    416 	&por	($xb,$xa);
    417 
        # Hand the alternate registers to the next call.
    418 	($xa,$xa_)=($xa_,$xa);
    419 	($xb,$xb_)=($xb_,$xb);
    420 	($xc,$xc_)=($xc_,$xc);
    421 	($xd,$xd_)=($xd_,$xd);
    422 }
    423 
    424 &function_begin("ChaCha20_ssse3");
        # ssse3_shortcut is the jump target used by ChaCha20_ctr32 once its SSSE3
        # test passes.  The code below relies on eax holding the run-time address
        # of pic_point, which ChaCha20_ctr32 obtains with its call/pop sequence.
        # NOTE(review): a direct call to ChaCha20_ssse3 would reach the lea below
        # without eax having been set up in this function - confirm it is only
        # ever entered through the shortcut.
    425 &set_label("ssse3_shortcut");
    426 	&mov		($out,&wparam(0));
    427 	&mov		($inp,&wparam(1));
    428 	&mov		($len,&wparam(2));
    429 	&mov		("edx",&wparam(3));		# key
    430 	&mov		("ebx",&wparam(4));		# counter and nonce
    431 
        # Allocate the frame, align it to 64 bytes and keep the original esp at
        # 512(%esp) so the epilogue can restore it.
    432 	&mov		("ebp","esp");
    433 	&stack_push	(131);
    434 	&and		("esp",-64);
    435 	&mov		(&DWP(512,"esp"),"ebp");
    436 
        # Rebase eax from pic_point onto the constant table at ssse3_data.
    437 	&lea		("eax",&DWP(&label("ssse3_data")."-".
    438 				    &label("pic_point"),"eax"));
    439 	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
    440 
    441 if (defined($gasver) && $gasver>=2.17) {		# even though we encode
    442 							# pshufb manually, we
    443 							# handle only register
    444 							# operands, while this
    445 							# segment uses memory
    446 							# operand...
        # Four-blocks-at-a-time path: every xmm register holds the same state word
        # of four consecutive blocks, one block per 32-bit lane.  Inputs shorter
        # than 256 bytes go straight to the one-block code at "1x".
        #
        # Frame layout (offsets from esp):
        #   0..255     working state, addressed as 16*N-128(%ebx)
        #   256..511   initial state, addressed as 16*N-128(%ebp)
        #   512        caller's esp;  512+4 key pointer;  512+8 counter pointer
    447 	&cmp		($len,64*4);
    448 	&jb		(&label("1x"));
    449 
    450 	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
    451 	&mov		(&DWP(512+8,"esp"),"ebx");
    452 	&sub		($len,64*4);			# bias len
    453 	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
    454 
        # pshufd with 0x00/0x55/0xaa/0xff replicates dword 0/1/2/3 into all lanes.
    455 	&movdqu		("xmm7",&QWP(0,"edx"));		# key
    456 	&pshufd		("xmm0","xmm3",0x00);
    457 	&pshufd		("xmm1","xmm3",0x55);
    458 	&pshufd		("xmm2","xmm3",0xaa);
    459 	&pshufd		("xmm3","xmm3",0xff);
        # Lane counters become counter+{0,1,2,3}, then 4 is subtracted because
        # outer_loop adds 4 before every group, including the first.
    460 	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
    461 	&pshufd		("xmm4","xmm7",0x00);
    462 	&pshufd		("xmm5","xmm7",0x55);
    463 	 &psubd		("xmm0",&QWP(16*4,"eax"));
    464 	&pshufd		("xmm6","xmm7",0xaa);
    465 	&pshufd		("xmm7","xmm7",0xff);
    466 	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
    467 	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
    468 	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
    469 	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
    470 	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
    471 	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
    472 	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
    473 	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
    474 	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
    475 	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
    476 	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
    477 
    478 	&pshufd		("xmm0","xmm3",0x00);
    479 	&pshufd		("xmm1","xmm3",0x55);
    480 	&pshufd		("xmm2","xmm3",0xaa);
    481 	&pshufd		("xmm3","xmm3",0xff);
    482 	&pshufd		("xmm4","xmm7",0x00);
    483 	&pshufd		("xmm5","xmm7",0x55);
    484 	&pshufd		("xmm6","xmm7",0xaa);
    485 	&pshufd		("xmm7","xmm7",0xff);
    486 	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
    487 	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
    488 	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
    489 	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
    490 	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
    491 	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
    492 	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
    493 	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
    494 
        # Both data pointers are biased by +128; the loads and stores below use
        # matching -128 displacements.
    495 	&lea		($inp,&DWP(128,$inp));		# size optimization
    496 	&lea		($out,&DWP(128,$out));		# size optimization
    497 	&jmp		(&label("outer_loop"));
    498 
    499 &set_label("outer_loop",16);
        # Copy the initial state to the working area.  Words 0, 4, 8 and 9 are
        # skipped here (the commented-out lines) because they are loaded straight
        # into registers further down.
    500 	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
    501 	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
    502 	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
    503 	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
    504 	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
    505 	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
    506 	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
    507 	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
    508 	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
    509 	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
    510 	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
    511 	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
    512 	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
    513 	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
    514 	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
    515 	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
    516 	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
    517 	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
    518 	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
    519 	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
    520 	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
    521 	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
    522 	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
    523 	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
    524 	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
    525 	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
    526 	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
    527 	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
    528 	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
    529 	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
    530 	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
    531 	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
    532 	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
    533 	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
    534 
    535 	&movdqa		($xa, &QWP(16*0-128,"ebp"));
    536 	&movdqa		($xd, "xmm4");
    537 	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
    538 	&movdqa		($xc, &QWP(16*8-128,"ebp"));
    539 	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
    540 
    541 	&mov		("edx",10);			# loop counter
    542 	&nop		();
    543 
    544 &set_label("loop",16);
        # QUARTERROUND_SSSE3 omits its own leading "a += b; d ^= a" (they are
        # issued by the preceding call), so the very first pair is issued here.
    545 	&paddd		($xa,$xb_);			# elsewhere
    546 	&movdqa		($xb,$xb_);
    547 	&pxor		($xd,$xa);			# elsewhere
    548 	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
    549 	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
    550 	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
    551 	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
    552 	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
    553 	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
    554 	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
    555 	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
    556 	&dec		("edx");
    557 	&jnz		(&label("loop"));
    558 
        # Flush the words that are still only in registers to the working area.
    559 	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
    560 	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
    561 	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
    562 	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
    563 	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
    564 
    565     my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
    566 
    567 	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
    568 	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
    569 	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
    570 	&movdqa		($xa3,&QWP(16*3-128,"ebx"));
    571 
        # Four passes, one per group of four state words ($i is the byte offset of
        # the group in the state areas).  Each pass adds the initial state,
        # transposes the 4x4 dword matrix so that every register holds 16
        # consecutive key-stream bytes of one block, and XORs them with the input
        # at the same position in each of the four 64-byte blocks.
    572     for($i=0;$i<256;$i+=64) {
    573 	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
    574 	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
    575 	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
    576 	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
    577 
    578 	&movdqa		($xt2,$xa0);		# "de-interlace" data
    579 	&punpckldq	($xa0,$xa1);
    580 	&movdqa		($xt3,$xa2);
    581 	&punpckldq	($xa2,$xa3);
    582 	&punpckhdq	($xt2,$xa1);
    583 	&punpckhdq	($xt3,$xa3);
    584 	&movdqa		($xa1,$xa0);
    585 	&punpcklqdq	($xa0,$xa2);		# "a0"
    586 	&movdqa		($xa3,$xt2);
    587 	&punpcklqdq	($xt2,$xt3);		# "a2"
    588 	&punpckhqdq	($xa1,$xa2);		# "a1"
    589 	&punpckhqdq	($xa3,$xt3);		# "a3"
    590 
    591 	#($xa2,$xt2)=($xt2,$xa2);
    592 
        # "a2" ended up in $xt2, so $xa2 is free and carries the third input
        # vector instead.
    593 	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
    594 	&movdqu		($xt1,&QWP(64*1-128,$inp));
    595 	&movdqu		($xa2,&QWP(64*2-128,$inp));
    596 	&movdqu		($xt3,&QWP(64*3-128,$inp));
        # Step by 16 bytes after each of the first three passes and by 256-48
        # after the last, for a net advance of 256 bytes per outer iteration.
    597 	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
    598 	&pxor		($xt0,$xa0);
    599 	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
    600 	&pxor		($xt1,$xa1);
    601 	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
    602 	&pxor		($xt2,$xa2);
    603 	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
    604 	&pxor		($xt3,$xa3);
    605 	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
    606 	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
    607 	&movdqu		(&QWP(64*1-128,$out),$xt1);
    608 	&movdqu		(&QWP(64*2-128,$out),$xt2);
    609 	&movdqu		(&QWP(64*3-128,$out),$xt3);
    610 	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    611     }
        # $len is biased by -256, so "no borrow" means another full group of four
        # blocks is available.
    612 	&sub		($len,64*4);
    613 	&jnc		(&label("outer_loop"));
    614 
    615 	&add		($len,64*4);
    616 	&jz		(&label("done"));
    617 
        # Fewer than 256 bytes remain: undo the +128 pointer bias and fall through
        # to the one-block code, which expects edx = key pointer and xmm3 =
        # counter and nonce.
    618 	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
    619 	&lea		($inp,&DWP(-128,$inp));
    620 	&mov		("edx",&DWP(512+4,"esp"));
    621 	&lea		($out,&DWP(-128,$out));
    622 
        # New counter = lane-0 counter of the last group + 4; the nonce words are
        # taken from the caller's buffer with the counter dword masked off.
    623 	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
    624 	&movdqu		("xmm3",&QWP(0,"ebx"));
    625 	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
    626 	&pand		("xmm3",&QWP(16*7,"eax"));
    627 	&por		("xmm3","xmm2");		# counter value
    628 }
    629 {
        # One-block-at-a-time path: each xmm register holds one complete row of
        # the 4x4 state.  These lexical names shadow the global integer register
        # names $a..$d for the rest of this brace block.
    630 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
    631 
    632 sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
        # Emits the four quarter-rounds of one round in parallel, one per lane.
        # Takes no arguments; operates on the row registers $a..$d with $t as
        # scratch and the shuffle masks preloaded in $rot16 / $rot24.
    633 	&paddd		($a,$b);
    634 	&pxor		($d,$a);
    635 	&pshufb		($d,$rot16);
    636 
    637 	&paddd		($c,$d);
    638 	&pxor		($b,$c);
    639 	&movdqa		($t,$b);
    640 	&psrld		($b,20);
    641 	&pslld		($t,12);
    642 	&por		($b,$t);
    643 
    644 	&paddd		($a,$b);
    645 	&pxor		($d,$a);
    646 	&pshufb		($d,$rot24);
    647 
    648 	&paddd		($c,$d);
    649 	&pxor		($b,$c);
    650 	&movdqa		($t,$b);
    651 	&psrld		($b,25);
    652 	&pslld		($t,7);
    653 	&por		($b,$t);
    654 }
    655 
    656 &set_label("1x");
        # On entry: eax = constant table, edx = key pointer, xmm3 ($d) = counter
        # and nonce.  The initial state is kept at 0..63(%esp) for the final
        # addition and for stepping the counter.
    657 	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
    658 	&movdqu		($b,&QWP(0,"edx"));
    659 	&movdqu		($c,&QWP(16,"edx"));
    660 	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
    661 	&movdqa		($rot16,&QWP(0,"eax"));
    662 	&movdqa		($rot24,&QWP(16,"eax"));
        # NOTE(review): this dword store targets the same 16*3(%esp) slot that
        # the movdqa of $d writes a few lines below - it looks like a dead store;
        # confirm.
    663 	&mov		(&DWP(16*3,"esp"),"ebp");
    664 
    665 	&movdqa		(&QWP(16*0,"esp"),$a);
    666 	&movdqa		(&QWP(16*1,"esp"),$b);
    667 	&movdqa		(&QWP(16*2,"esp"),$c);
    668 	&movdqa		(&QWP(16*3,"esp"),$d);
    669 	&mov		("edx",10);
    670 	&jmp		(&label("loop1x"));
    671 
    672 &set_label("outer1x",16);
        # Next block: reload the initial state and add {1,0,0,0} to its last row,
        # i.e. increment the block counter.
    673 	&movdqa		($d,&QWP(16*5,"eax"));		# one
    674 	&movdqa		($a,&QWP(16*0,"esp"));
    675 	&movdqa		($b,&QWP(16*1,"esp"));
    676 	&movdqa		($c,&QWP(16*2,"esp"));
    677 	&paddd		($d,&QWP(16*3,"esp"));
    678 	&mov		("edx",10);
    679 	&movdqa		(&QWP(16*3,"esp"),$d);
    680 	&jmp		(&label("loop1x"));
    681 
    682 &set_label("loop1x",16);
        # Column round, then rotate rows b, c, d by one, two and three lanes so
        # that the diagonals line up as columns, second round, then rotate back.
    683 	&SSSE3ROUND();
    684 	&pshufd	($c,$c,0b01001110);
    685 	&pshufd	($b,$b,0b00111001);
    686 	&pshufd	($d,$d,0b10010011);
    687 	&nop	();
    688 
    689 	&SSSE3ROUND();
    690 	&pshufd	($c,$c,0b01001110);
    691 	&pshufd	($b,$b,0b10010011);
    692 	&pshufd	($d,$d,0b00111001);
    693 
    694 	&dec		("edx");
    695 	&jnz		(&label("loop1x"));
    696 
        # Add the initial state to obtain the key-stream block.
    697 	&paddd		($a,&QWP(16*0,"esp"));
    698 	&paddd		($b,&QWP(16*1,"esp"));
    699 	&paddd		($c,&QWP(16*2,"esp"));
    700 	&paddd		($d,&QWP(16*3,"esp"));
    701 
    702 	&cmp		($len,64);
    703 	&jb		(&label("tail"));
    704 
    705 	&movdqu		($t,&QWP(16*0,$inp));
    706 	&movdqu		($t1,&QWP(16*1,$inp));
    707 	&pxor		($a,$t);		# xor with input
    708 	&movdqu		($t,&QWP(16*2,$inp));
    709 	&pxor		($b,$t1);
    710 	&movdqu		($t1,&QWP(16*3,$inp));
    711 	&pxor		($c,$t);
    712 	&pxor		($d,$t1);
    713 	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
    714 
    715 	&movdqu		(&QWP(16*0,$out),$a);	# write output
    716 	&movdqu		(&QWP(16*1,$out),$b);
    717 	&movdqu		(&QWP(16*2,$out),$c);
    718 	&movdqu		(&QWP(16*3,$out),$d);
    719 	&lea		($out,&DWP(16*4,$out));	# out+=64
    720 
    721 	&sub		($len,64);
    722 	&jnz		(&label("outer1x"));
    723 
    724 	&jmp		(&label("done"));
    725 
    726 &set_label("tail");
        # Fewer than 64 bytes remain: spill the key-stream block over the saved
        # initial state (no longer needed) and XOR byte by byte.
    727 	&movdqa		(&QWP(16*0,"esp"),$a);
    728 	&movdqa		(&QWP(16*1,"esp"),$b);
    729 	&movdqa		(&QWP(16*2,"esp"),$c);
    730 	&movdqa		(&QWP(16*3,"esp"),$d);
    731 
    732 	&xor		("eax","eax");
    733 	&xor		("edx","edx");
    734 	&xor		("ebp","ebp");
    735 
        # ebp is the running byte index, $len the remaining byte count.
    736 &set_label("tail_loop");
    737 	&movb		("al",&BP(0,"esp","ebp"));
    738 	&movb		("dl",&BP(0,$inp,"ebp"));
    739 	&lea		("ebp",&DWP(1,"ebp"));
    740 	&xor		("al","dl");
        # The index was already incremented by the lea, hence the -1 displacement.
    741 	&movb		(&BP(-1,$out,"ebp"),"al");
    742 	&dec		($len);
    743 	&jnz		(&label("tail_loop"));
    744 }
    745 &set_label("done");
        # Restore the caller's esp that the prologue saved at 512(%esp).
    746 	&mov		("esp",&DWP(512,"esp"));
    747 &function_end("ChaCha20_ssse3");
    748 
        # Constant table addressed through eax; the offsets in the trailing
        # comments are relative to ssse3_data.
    749 &align	(64);
    750 &set_label("ssse3_data");
    751 &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);	# 16*0: pshufb mask, rot16
    752 &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);	# 16*1: pshufb mask, rot8 (loaded as $rot24)
    753 &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# 16*2: sigma
    754 &data_word(0,1,2,3);	# 16*3: per-lane counter offsets for the 4-block path
    755 &data_word(4,4,4,4);	# 16*4: counter step for the 4-block path
    756 &data_word(1,0,0,0);	# 16*5: counter step for the 1-block path
    757 &data_word(4,0,0,0);	# 16*6: counter fix-up when leaving the 4-block path
    758 &data_word(0,-1,-1,-1);	# 16*7: mask that clears the counter dword, keeps the nonce
    759 &align	(64);
    760 }
    761 &asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
    762 
    763 &asm_finish();
    764 
        # STDOUT was redirected to the output file at the top of the script.
        # Buffered write errors (full disk, I/O error) are only reported when the
        # handle is closed, so fail loudly instead of leaving a truncated
        # assembly file behind with a zero exit status.
    765 close STDOUT or die "error closing STDOUT: $!";
    766