Home | History | Annotate | Download | only in asm
      1 #! /usr/bin/env perl
      2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the OpenSSL license (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 #
     10 # ====================================================================
     11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     12 # project. The module is, however, dual licensed under OpenSSL and
     13 # CRYPTOGAMS licenses depending on where you obtain it. For further
     14 # details see http://www.openssl.org/~appro/cryptogams/.
     15 # ====================================================================
     16 #
     17 # January 2015
     18 #
     19 # ChaCha20 for x86.
     20 #
     21 # Performance in cycles per byte out of large buffer.
     22 #
     23 #		1xIALU/gcc	4xSSSE3
     24 # Pentium	17.5/+80%
     25 # PIII		14.2/+60%
     26 # P4		18.6/+84%
     27 # Core2		9.56/+89%	4.83
     28 # Westmere	9.50/+45%	3.35
     29 # Sandy Bridge	10.5/+47%	3.20
     30 # Haswell	8.15/+50%	2.83
     31 # Skylake	7.53/+22%	2.75
     32 # Silvermont	17.4/+36%	8.35
     33 # Goldmont	13.4/+40%	4.36
     34 # Sledgehammer	10.2/+54%
     35 # Bulldozer	13.4/+50%	4.38(*)
     36 #
     37 # (*)	Bulldozer actually executes 4xXOP code path that delivers 3.55;
     38 #
     39 # Modified from upstream OpenSSL to remove the XOP code.
     40 
     41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = script path up to and including the last / or \
     42 push(@INC,"${dir}","${dir}../../perlasm");
     43 require "x86asm.pl";
     44 
        # The last command-line argument names the output file.  Use the
        # three-argument form of open so the file name can never be parsed as
        # part of the mode, and stop with a diagnostic when the file cannot
        # be created instead of silently generating nothing.
     45 $output=pop;
     46 open STDOUT,">",$output or die "can't open $output: $!";
     47 
        # After the pop above, the first remaining argument is handed to
        # asm_init together with a flag telling whether the last remaining
        # argument is the literal string "386".
     48 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
     49 
     50 $xmm=$ymm=1;
     51 $gasver=999;  # enable everything
     52 
        # Register names used by the integer code path.  $b_, $c_ and $d_ are
        # the alternates that QUARTERROUND swaps with $b, $c and $d after
        # every call.
     53 $a="eax";
     54 ($b,$b_)=("ebx","ebp");
     55 ($c,$c_)=("ecx","esi");
     56 ($d,$d_)=("edx","edi");
     57 
     58 sub QUARTERROUND {
        # Emit the integer instructions for one ChaCha quarter-round.
        #
        #   $ai,$bi,$ci,$di  indices of the state words acting as a, b, c, d
        #   $i               sequence number of this call inside the loop
        #                    body (the callers pass 0..7)
        #
        # State words are kept on the stack at 4*index(esp).  The values being
        # worked on are in the registers named by the globals $a,$b,$c,$d;
        # $b_,$c_,$d_ name the alternate registers.  Instructions indented by
        # one extra space are interleaved spills of the previous call's words
        # and loads of the words needed by the next call.  The leading
        # "add a,b" of each quarter-round is emitted ahead of time (by the
        # previous call or by the loop prologue), hence "see elsewhere".
        # The sub ends by swapping each register with its alternate, so the
        # next call continues with the values that were just loaded.
     59 my ($ai,$bi,$ci,$di,$i)=@_;
     60 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
     61 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
     62 
     63 	#       a   b   c   d
     64 	#
     65 	#       0   4   8  12 < even round
     66 	#       1   5   9  13
     67 	#       2   6  10  14
     68 	#       3   7  11  15
     69 	#       0   5  10  15 < odd round
     70 	#       1   6  11  12
     71 	#       2   7   8  13
     72 	#       3   4   9  14
     73 
        	# The default next/previous indices step by one inside each group of
        	# four.  The first and last call of each round (i = 0, 3, 4, 7) sit
        	# next to a call that uses the other index pattern from the table
        	# above, so their previous/next indices get a per-word correction.
     74 	if ($i==0) {
     75             my $j=4;
     76 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
     77 	} elsif ($i==3) {
     78             my $j=0;
     79 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
     80 	} elsif ($i==4) {
     81             my $j=4;
     82 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
     83 	} elsif ($i==7) {
     84             my $j=0;
     85 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
     86 	}
     87 
     88 	#&add	($a,$b);			# see elsewhere
     89 	&xor	($d,$a);
     90 	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
     91 	&rol	($d,16);
     92 	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
     93 	&add	($c,$d);
     94 	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
     95 	&xor	($b,$c);
     96 	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
     97 	&rol	($b,12);
     98 	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
     99 	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
    100 	&add	($a,$b);
    101 	&xor	($d,$a);
    102 	&mov	(&DWP(4*$ai,"esp"),$a);
    103 	&rol	($d,8);
    104 	&mov	($a,&DWP(4*$an,"esp"));
    105 	&add	($c,$d);
    106 	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
    107 	&mov	($d_,$d)			if ($di==$dn);
    108 	&xor	($b,$c);
    109 	 &add	($a,$b_)			if ($i<7);	# elsewhere
    110 	&rol	($b,7);
    111 
        	# Exchange the register names with their alternates for the next call.
    112 	($b,$b_)=($b_,$b);
    113 	($c,$c_)=($c_,$c);
    114 	($d,$d_)=($d_,$d);
    115 }
    116 
    117 &static_label("ssse3_shortcut");
    118 &static_label("ssse3_data");
    119 &static_label("pic_point");
    120 
        # ChaCha20_ctr32: integer-only code path, plus the run-time dispatch
        # to the SSSE3 code.  Arguments as they are used below:
        #   wparam(0) output pointer, wparam(1) input pointer,
        #   wparam(2) length in bytes, wparam(3) key,
        #   wparam(4) counter and nonce.
        # Stack frame (33 dwords):
        #   4*i(esp)     working state word i
        #   64+4*i(esp)  saved key / counter / nonce word i (i = 4..15)
        #   128(esp)     round-loop counter
    121 &function_begin("ChaCha20_ctr32");
    122 	&xor	("eax","eax");
    123 	&cmp	("eax",&wparam(2));		# len==0?
    124 	&je	(&label("no_data"));
    125 if ($xmm) {
        	# Load the address of pic_point into eax, locate OPENSSL_ia32cap_P
        	# relative to it and jump to the SSSE3 code when both tested bits
        	# are set.  eax still holds the pic_point address at that jump;
        	# the SSSE3 code uses it to address its constant table.
    126 	&call	(&label("pic_point"));
    127 &set_label("pic_point");
    128 	&blindpop("eax");
    129 	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
    130 	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
    131 	&jz	(&label("x86"));
    132 	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
    133 	&jz	(&label("x86"));
    134 	&jmp	(&label("ssse3_shortcut"));
    135 &set_label("x86");
    136 }
    137 	&mov	("esi",&wparam(3));		# key
    138 	&mov	("edi",&wparam(4));		# counter and nonce
    139 
    140 	&stack_push(33);
    141 
    142 	&mov	("eax",&DWP(4*0,"esi"));	# copy key
    143 	&mov	("ebx",&DWP(4*1,"esi"));
    144 	&mov	("ecx",&DWP(4*2,"esi"));
    145 	&mov	("edx",&DWP(4*3,"esi"));
    146 	&mov	(&DWP(64+4*4,"esp"),"eax");
    147 	&mov	(&DWP(64+4*5,"esp"),"ebx");
    148 	&mov	(&DWP(64+4*6,"esp"),"ecx");
    149 	&mov	(&DWP(64+4*7,"esp"),"edx");
    150 	&mov	("eax",&DWP(4*4,"esi"));
    151 	&mov	("ebx",&DWP(4*5,"esi"));
    152 	&mov	("ecx",&DWP(4*6,"esi"));
    153 	&mov	("edx",&DWP(4*7,"esi"));
    154 	&mov	(&DWP(64+4*8,"esp"),"eax");
    155 	&mov	(&DWP(64+4*9,"esp"),"ebx");
    156 	&mov	(&DWP(64+4*10,"esp"),"ecx");
    157 	&mov	(&DWP(64+4*11,"esp"),"edx");
    158 	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
    159 	&mov	("ebx",&DWP(4*1,"edi"));
    160 	&mov	("ecx",&DWP(4*2,"edi"));
    161 	&mov	("edx",&DWP(4*3,"edi"));
        	# The counter is stored decremented by one because the code at
        	# "entry" adds one to it before every block, including the first.
    162 	&sub	("eax",1);
    163 	&mov	(&DWP(64+4*12,"esp"),"eax");
    164 	&mov	(&DWP(64+4*13,"esp"),"ebx");
    165 	&mov	(&DWP(64+4*14,"esp"),"ecx");
    166 	&mov	(&DWP(64+4*15,"esp"),"edx");
    167 	&jmp	(&label("entry"));
    168 
        # One iteration of outer_loop produces one 64-byte block.  On re-entry
        # $b, $a and $c hold the advanced input pointer, the advanced output
        # pointer and the remaining length; they are written back to the
        # argument slots so that the registers are free for the rounds.
    169 &set_label("outer_loop",16);
    170 	&mov	(&wparam(1),$b);		# save input
    171 	&mov	(&wparam(0),$a);		# save output
    172 	&mov	(&wparam(2),$c);		# save len
    173 &set_label("entry");
        	# Initialise the working state: word 0 stays in $a, words 1..3 are
        	# stored as immediates, the remaining words come from the saved copy.
    174 	&mov	($a,0x61707865);
    175 	&mov	(&DWP(4*1,"esp"),0x3320646e);
    176 	&mov	(&DWP(4*2,"esp"),0x79622d32);
    177 	&mov	(&DWP(4*3,"esp"),0x6b206574);
    178 
    179 	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
    180 	&mov	($b_,&DWP(64+4*6,"esp"));
    181 	&mov	($c, &DWP(64+4*10,"esp"));
    182 	&mov	($c_,&DWP(64+4*11,"esp"));
    183 	&mov	($d, &DWP(64+4*13,"esp"));
    184 	&mov	($d_,&DWP(64+4*14,"esp"));
    185 	&mov	(&DWP(4*5,"esp"),$b);
    186 	&mov	(&DWP(4*6,"esp"),$b_);
    187 	&mov	(&DWP(4*10,"esp"),$c);
    188 	&mov	(&DWP(4*11,"esp"),$c_);
    189 	&mov	(&DWP(4*13,"esp"),$d);
    190 	&mov	(&DWP(4*14,"esp"),$d_);
    191 
        	# Words 4, 8, 9 and 12 are left in $b_, $c, $c_ and $d: they are the
        	# first operands consumed by the round code.
    192 	&mov	($b, &DWP(64+4*7,"esp"));
    193 	&mov	($d_,&DWP(64+4*15,"esp"));
    194 	&mov	($d, &DWP(64+4*12,"esp"));
    195 	&mov	($b_,&DWP(64+4*4,"esp"));
    196 	&mov	($c, &DWP(64+4*8,"esp"));
    197 	&mov	($c_,&DWP(64+4*9,"esp"));
    198 	&add	($d,1);				# counter value
    199 	&mov	(&DWP(4*7,"esp"),$b);
    200 	&mov	(&DWP(4*15,"esp"),$d_);
    201 	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value
    202 
    203 	&mov	($b,10);			# loop counter
    204 	&jmp	(&label("loop"));
    205 
        # Each pass through "loop" emits eight quarter-rounds.  The eight calls
        # swap the register names an even number of times, so $b, $c, $d name
        # the same registers after the loop body as before it.  The last call
        # reloads the loop counter from 128(esp) into the register that $b
        # names afterwards.
    206 &set_label("loop",16);
    207 	&add	($a,$b_);			# elsewhere
    208 	&mov	(&DWP(128,"esp"),$b);		# save loop counter
    209 	&mov	($b,$b_);
    210 	&QUARTERROUND(0, 4, 8, 12, 0);
    211 	&QUARTERROUND(1, 5, 9, 13, 1);
    212 	&QUARTERROUND(2, 6,10, 14, 2);
    213 	&QUARTERROUND(3, 7,11, 15, 3);
    214 	&QUARTERROUND(0, 5,10, 15, 4);
    215 	&QUARTERROUND(1, 6,11, 12, 5);
    216 	&QUARTERROUND(2, 7, 8, 13, 6);
    217 	&QUARTERROUND(3, 4, 9, 14, 7);
    218 	&dec	($b);
    219 	&jnz	(&label("loop"));
    220 
    221 	&mov	($b,&wparam(2));		# load len
    222 
    223 	&add	($a,0x61707865);		# accumulate key material
    224 	&add	($b_,&DWP(64+4*4,"esp"));
    225 	&add	($c, &DWP(64+4*8,"esp"));
    226 	&add	($c_,&DWP(64+4*9,"esp"));
    227 
        	# Fewer than 64 bytes left: take the byte-wise tail path.
    228 	&cmp	($b,64);
    229 	&jb	(&label("tail"));
    230 
        	# Full block: add the saved input words to the state, xor with 64
        	# bytes of input and store to the output, a few words at a time.
    231 	&mov	($b,&wparam(1));		# load input pointer
    232 	&add	($d, &DWP(64+4*12,"esp"));
    233 	&add	($d_,&DWP(64+4*14,"esp"));
    234 
    235 	&xor	($a, &DWP(4*0,$b));		# xor with input
    236 	&xor	($b_,&DWP(4*4,$b));
        	# Word 0 is parked on the stack because $a is needed for the output
        	# pointer; it is written to the output at the end of the block.
    237 	&mov	(&DWP(4*0,"esp"),$a);
    238 	&mov	($a,&wparam(0));		# load output pointer
    239 	&xor	($c, &DWP(4*8,$b));
    240 	&xor	($c_,&DWP(4*9,$b));
    241 	&xor	($d, &DWP(4*12,$b));
    242 	&xor	($d_,&DWP(4*14,$b));
    243 	&mov	(&DWP(4*4,$a),$b_);		# write output
    244 	&mov	(&DWP(4*8,$a),$c);
    245 	&mov	(&DWP(4*9,$a),$c_);
    246 	&mov	(&DWP(4*12,$a),$d);
    247 	&mov	(&DWP(4*14,$a),$d_);
    248 
    249 	&mov	($b_,&DWP(4*1,"esp"));
    250 	&mov	($c, &DWP(4*2,"esp"));
    251 	&mov	($c_,&DWP(4*3,"esp"));
    252 	&mov	($d, &DWP(4*5,"esp"));
    253 	&mov	($d_,&DWP(4*6,"esp"));
    254 	&add	($b_,0x3320646e);		# accumulate key material
    255 	&add	($c, 0x79622d32);
    256 	&add	($c_,0x6b206574);
    257 	&add	($d, &DWP(64+4*5,"esp"));
    258 	&add	($d_,&DWP(64+4*6,"esp"));
    259 	&xor	($b_,&DWP(4*1,$b));
    260 	&xor	($c, &DWP(4*2,$b));
    261 	&xor	($c_,&DWP(4*3,$b));
    262 	&xor	($d, &DWP(4*5,$b));
    263 	&xor	($d_,&DWP(4*6,$b));
    264 	&mov	(&DWP(4*1,$a),$b_);
    265 	&mov	(&DWP(4*2,$a),$c);
    266 	&mov	(&DWP(4*3,$a),$c_);
    267 	&mov	(&DWP(4*5,$a),$d);
    268 	&mov	(&DWP(4*6,$a),$d_);
    269 
    270 	&mov	($b_,&DWP(4*7,"esp"));
    271 	&mov	($c, &DWP(4*10,"esp"));
    272 	&mov	($c_,&DWP(4*11,"esp"));
    273 	&mov	($d, &DWP(4*13,"esp"));
    274 	&mov	($d_,&DWP(4*15,"esp"));
    275 	&add	($b_,&DWP(64+4*7,"esp"));
    276 	&add	($c, &DWP(64+4*10,"esp"));
    277 	&add	($c_,&DWP(64+4*11,"esp"));
    278 	&add	($d, &DWP(64+4*13,"esp"));
    279 	&add	($d_,&DWP(64+4*15,"esp"));
    280 	&xor	($b_,&DWP(4*7,$b));
    281 	&xor	($c, &DWP(4*10,$b));
    282 	&xor	($c_,&DWP(4*11,$b));
    283 	&xor	($d, &DWP(4*13,$b));
    284 	&xor	($d_,&DWP(4*15,$b));
        	# Advance input and output by one block and loop while bytes remain.
    285 	&lea	($b,&DWP(4*16,$b));
    286 	&mov	(&DWP(4*7,$a),$b_);
    287 	&mov	($b_,&DWP(4*0,"esp"));
    288 	&mov	(&DWP(4*10,$a),$c);
    289 	&mov	($c,&wparam(2));		# len
    290 	&mov	(&DWP(4*11,$a),$c_);
    291 	&mov	(&DWP(4*13,$a),$d);
    292 	&mov	(&DWP(4*15,$a),$d_);
    293 	&mov	(&DWP(4*0,$a),$b_);
    294 	&lea	($a,&DWP(4*16,$a));
    295 	&sub	($c,64);
    296 	&jnz	(&label("outer_loop"));
    297 
    298 	&jmp	(&label("done"));
    299 
        # Tail: fewer than 64 bytes remain ($b holds the remaining length).
        # The complete key-stream block is assembled in the working-state
        # area at 0..63(esp) and then xored into the output byte by byte.
    300 &set_label("tail");
    301 	&add	($d, &DWP(64+4*12,"esp"));
    302 	&add	($d_,&DWP(64+4*14,"esp"));
    303 	&mov	(&DWP(4*0,"esp"),$a);
    304 	&mov	(&DWP(4*4,"esp"),$b_);
    305 	&mov	(&DWP(4*8,"esp"),$c);
    306 	&mov	(&DWP(4*9,"esp"),$c_);
    307 	&mov	(&DWP(4*12,"esp"),$d);
    308 	&mov	(&DWP(4*14,"esp"),$d_);
    309 
    310 	&mov	($b_,&DWP(4*1,"esp"));
    311 	&mov	($c, &DWP(4*2,"esp"));
    312 	&mov	($c_,&DWP(4*3,"esp"));
    313 	&mov	($d, &DWP(4*5,"esp"));
    314 	&mov	($d_,&DWP(4*6,"esp"));
    315 	&add	($b_,0x3320646e);		# accumulate key material
    316 	&add	($c, 0x79622d32);
    317 	&add	($c_,0x6b206574);
    318 	&add	($d, &DWP(64+4*5,"esp"));
    319 	&add	($d_,&DWP(64+4*6,"esp"));
    320 	&mov	(&DWP(4*1,"esp"),$b_);
    321 	&mov	(&DWP(4*2,"esp"),$c);
    322 	&mov	(&DWP(4*3,"esp"),$c_);
    323 	&mov	(&DWP(4*5,"esp"),$d);
    324 	&mov	(&DWP(4*6,"esp"),$d_);
    325 
    326 	&mov	($b_,&DWP(4*7,"esp"));
    327 	&mov	($c, &DWP(4*10,"esp"));
    328 	&mov	($c_,&DWP(4*11,"esp"));
    329 	&mov	($d, &DWP(4*13,"esp"));
    330 	&mov	($d_,&DWP(4*15,"esp"));
    331 	&add	($b_,&DWP(64+4*7,"esp"));
    332 	&add	($c, &DWP(64+4*10,"esp"));
    333 	&add	($c_,&DWP(64+4*11,"esp"));
    334 	&add	($d, &DWP(64+4*13,"esp"));
    335 	&add	($d_,&DWP(64+4*15,"esp"));
    336 	&mov	(&DWP(4*7,"esp"),$b_);
    337 	&mov	($b_,&wparam(1));		# load input
    338 	&mov	(&DWP(4*10,"esp"),$c);
    339 	&mov	($c,&wparam(0));		# load output
    340 	&mov	(&DWP(4*11,"esp"),$c_);
    341 	&xor	($c_,$c_);
    342 	&mov	(&DWP(4*13,"esp"),$d);
    343 	&mov	(&DWP(4*15,"esp"),$d_);
    344 
        	# Byte loop: $c_ is the running index, $b_ the input pointer, $c the
        	# output pointer and $b the number of bytes still to process.
    345 	&xor	("eax","eax");
    346 	&xor	("edx","edx");
    347 &set_label("tail_loop");
    348 	&movb	("al",&BP(0,$c_,$b_));
    349 	&movb	("dl",&BP(0,"esp",$c_));
    350 	&lea	($c_,&DWP(1,$c_));
    351 	&xor	("al","dl");
    352 	&mov	(&BP(-1,$c,$c_),"al");
    353 	&dec	($b);
    354 	&jnz	(&label("tail_loop"));
    355 
    356 &set_label("done");
    357 	&stack_pop(33);
    358 &set_label("no_data");
    359 &function_end("ChaCha20_ctr32");
    360 
    361 if ($xmm) {
    362 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));	# xmm0..xmm7: working registers and their alternates
    363 my ($out,$inp,$len)=("edi","esi","ecx");
    364 
    365 sub QUARTERROUND_SSSE3 {
        # Emit the SSE code for one quarter-round of the 4x code path; the
        # structure mirrors QUARTERROUND.
        #
        #   $ai,$bi,$ci,$di  indices of the state rows acting as a, b, c, d
        #   $i               sequence number of this call inside the loop
        #                    body (the callers pass 0..7)
        #
        # Each state row is a 16-byte vector stored at 16*index-128(ebx).
        # The rotations by 16 and 8 are done with pshufb, using the masks at
        # 0(eax) and 16(eax); the rotations by 12 and 7 are done with a
        # shift-left / shift-right / or sequence that borrows $xa_ or $xa as
        # the temporary.  The leading paddd/pxor of each quarter-round are
        # emitted ahead of time ("see elsewhere").  All four register pairs
        # are swapped with their alternates at the end.
    366 my ($ai,$bi,$ci,$di,$i)=@_;
    367 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
    368 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
    369 
    370 	#       a   b   c   d
    371 	#
    372 	#       0   4   8  12 < even round
    373 	#       1   5   9  13
    374 	#       2   6  10  14
    375 	#       3   7  11  15
    376 	#       0   5  10  15 < odd round
    377 	#       1   6  11  12
    378 	#       2   7   8  13
    379 	#       3   4   9  14
    380 
        	# Same per-word correction of the previous/next indices as in
        	# QUARTERROUND for the first and last call of each round.
    381 	if ($i==0) {
    382             my $j=4;
    383 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    384 	} elsif ($i==3) {
    385             my $j=0;
    386 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    387 	} elsif ($i==4) {
    388             my $j=4;
    389 	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    390 	} elsif ($i==7) {
    391             my $j=0;
    392 	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    393 	}
    394 
    395 	#&paddd	($xa,$xb);			# see elsewhere
    396 	#&pxor	($xd,$xa);			# see elsewhere
    397 	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
    398 	&pshufb	($xd,&QWP(0,"eax"));		# rot16
    399 	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
    400 	&paddd	($xc,$xd);
    401 	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
    402 	&pxor	($xb,$xc);
    403 	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
    404 	&movdqa	($xa_,$xb);			# borrow as temporary
    405 	&pslld	($xb,12);
    406 	&psrld	($xa_,20);
    407 	&por	($xb,$xa_);
    408 	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
    409 	&paddd	($xa,$xb);
    410 	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
    411 	&pxor	($xd,$xa);
    412 	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
    413 	&pshufb	($xd,&QWP(16,"eax"));		# rot8
    414 	&paddd	($xc,$xd);
    415 	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
    416 	&movdqa	($xd_,$xd)			if ($di==$dn);
    417 	&pxor	($xb,$xc);
    418 	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
    419 	&movdqa	($xa,$xb);			# borrow as temporary
    420 	&pslld	($xb,7);
    421 	&psrld	($xa,25);
    422 	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
    423 	&por	($xb,$xa);
    424 
        	# Exchange every register name with its alternate for the next call.
    425 	($xa,$xa_)=($xa_,$xa);
    426 	($xb,$xb_)=($xb_,$xb);
    427 	($xc,$xc_)=($xc_,$xc);
    428 	($xd,$xd_)=($xd_,$xd);
    429 }
    430 
    431 &function_begin("ChaCha20_ssse3");
        # ssse3_shortcut is the target of the jump in ChaCha20_ctr32.  The code
        # below adds (ssse3_data - pic_point) to eax, i.e. it relies on eax
        # holding the address of pic_point on arrival.
    432 &set_label("ssse3_shortcut");
    433 	&mov		($out,&wparam(0));
    434 	&mov		($inp,&wparam(1));
    435 	&mov		($len,&wparam(2));
    436 	&mov		("edx",&wparam(3));		# key
    437 	&mov		("ebx",&wparam(4));		# counter and nonce
    438 
        	# Allocate the frame, align esp down to a multiple of 64 and keep
        	# the original esp at 512(esp) for the epilogue.
    439 	&mov		("ebp","esp");
    440 	&stack_push	(131);
    441 	&and		("esp",-64);
    442 	&mov		(&DWP(512,"esp"),"ebp");
    443 
    444 	&lea		("eax",&DWP(&label("ssse3_data")."-".
    445 				    &label("pic_point"),"eax"));
    446 	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
    447 
    448 if (defined($gasver) && $gasver>=2.17) {		# even though we encode
    449 							# pshufb manually, we
    450 							# handle only register
    451 							# operands, while this
    452 							# segment uses memory
    453 							# operand...
        	# 4x code path: used while at least 256 bytes remain, four blocks
        	# per iteration.  Every state word is broadcast to all four lanes
        	# of a vector; the vectors are kept at 16*i-128(ebp) (the per-call
        	# input state) and copied to 16*i-128(ebx) (the working state).
    454 	&cmp		($len,64*4);
    455 	&jb		(&label("1x"));
    456 
    457 	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
    458 	&mov		(&DWP(512+8,"esp"),"ebx");
    459 	&sub		($len,64*4);			# bias len
    460 	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
    461 
    462 	&movdqu		("xmm7",&QWP(0,"edx"));		# key
    463 	&pshufd		("xmm0","xmm3",0x00);
    464 	&pshufd		("xmm1","xmm3",0x55);
    465 	&pshufd		("xmm2","xmm3",0xaa);
    466 	&pshufd		("xmm3","xmm3",0xff);
        	# The counter lanes become counter+{0,1,2,3}-4: the table entry at
        	# 16*3(eax) is (0,1,2,3) and the one at 16*4(eax) is (4,4,4,4).
        	# outer_loop adds (4,4,4,4) back before each group of four blocks.
    467 	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
    468 	&pshufd		("xmm4","xmm7",0x00);
    469 	&pshufd		("xmm5","xmm7",0x55);
    470 	 &psubd		("xmm0",&QWP(16*4,"eax"));
    471 	&pshufd		("xmm6","xmm7",0xaa);
    472 	&pshufd		("xmm7","xmm7",0xff);
    473 	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
    474 	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
    475 	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
    476 	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
    477 	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
    478 	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
    479 	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
    480 	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
    481 	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
    482 	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
    483 	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
    484 
    485 	&pshufd		("xmm0","xmm3",0x00);
    486 	&pshufd		("xmm1","xmm3",0x55);
    487 	&pshufd		("xmm2","xmm3",0xaa);
    488 	&pshufd		("xmm3","xmm3",0xff);
    489 	&pshufd		("xmm4","xmm7",0x00);
    490 	&pshufd		("xmm5","xmm7",0x55);
    491 	&pshufd		("xmm6","xmm7",0xaa);
    492 	&pshufd		("xmm7","xmm7",0xff);
    493 	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
    494 	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
    495 	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
    496 	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
    497 	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
    498 	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
    499 	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
    500 	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
    501 
        	# Both data pointers are biased by +128; the loads and stores in
        	# the loop below compensate with -128 displacements.
    502 	&lea		($inp,&DWP(128,$inp));		# size optimization
    503 	&lea		($out,&DWP(128,$out));		# size optimization
    504 	&jmp		(&label("outer_loop"));
    505 
        # Copy the input state to the working area.  The rows that the round
        # code expects in registers (0, 4, 8, 9 and 12) are not copied here
        # (commented-out lines); they are loaded into registers further down.
    506 &set_label("outer_loop",16);
    507 	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
    508 	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
    509 	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
    510 	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
    511 	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
    512 	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
    513 	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
    514 	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
    515 	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
    516 	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
    517 	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
    518 	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
    519 	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
    520 	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
    521 	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
    522 	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
    523 	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
    524 	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
    525 	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
    526 	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
    527 	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
    528 	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
    529 	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
    530 	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
    531 	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
    532 	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
    533 	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
    534 	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
    535 	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
    536 	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
    537 	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
    538 	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
    539 	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
    540 	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
    541 
    542 	&movdqa		($xa, &QWP(16*0-128,"ebp"));
    543 	&movdqa		($xd, "xmm4");
    544 	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
    545 	&movdqa		($xc, &QWP(16*8-128,"ebp"));
    546 	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
    547 
    548 	&mov		("edx",10);			# loop counter
    549 	&nop		();
    550 
        # Ten passes of eight quarter-rounds each.
    551 &set_label("loop",16);
    552 	&paddd		($xa,$xb_);			# elsewhere
    553 	&movdqa		($xb,$xb_);
    554 	&pxor		($xd,$xa);			# elsewhere
    555 	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
    556 	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
    557 	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
    558 	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
    559 	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
    560 	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
    561 	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
    562 	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
    563 	&dec		("edx");
    564 	&jnz		(&label("loop"));
    565 
        	# Flush the rows that are still held only in registers.
    566 	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
    567 	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
    568 	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
    569 	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
    570 	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
    571 
    572     my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
    573 
    574 	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
    575 	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
    576 	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
    577 	&movdqa		($xa3,&QWP(16*3-128,"ebx"));
    578 
        	# Unrolled at generation time, one pass per group of four state rows
        	# ($i is the byte offset of the group: 0, 64, 128, 192).  Each pass
        	# adds the input state, rearranges four rows into four 16-byte
        	# pieces (one per block, 64 bytes apart in the data stream), xors
        	# them with the input and stores the result.  The pointers advance
        	# by 16 after each of the first three passes and by 64*4-16*3 after
        	# the last one, 256 bytes in total.
    579     for($i=0;$i<256;$i+=64) {
    580 	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
    581 	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
    582 	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
    583 	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
    584 
    585 	&movdqa		($xt2,$xa0);		# "de-interlace" data
    586 	&punpckldq	($xa0,$xa1);
    587 	&movdqa		($xt3,$xa2);
    588 	&punpckldq	($xa2,$xa3);
    589 	&punpckhdq	($xt2,$xa1);
    590 	&punpckhdq	($xt3,$xa3);
    591 	&movdqa		($xa1,$xa0);
    592 	&punpcklqdq	($xa0,$xa2);		# "a0"
    593 	&movdqa		($xa3,$xt2);
    594 	&punpcklqdq	($xt2,$xt3);		# "a2"
    595 	&punpckhqdq	($xa1,$xa2);		# "a1"
    596 	&punpckhqdq	($xa3,$xt3);		# "a3"
    597 
    598 	#($xa2,$xt2)=($xt2,$xa2);
    599 
        	# Because the swap above is commented out, "a2" lives in $xt2 and
        	# the third input chunk is loaded into $xa2.
    600 	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
    601 	&movdqu		($xt1,&QWP(64*1-128,$inp));
    602 	&movdqu		($xa2,&QWP(64*2-128,$inp));
    603 	&movdqu		($xt3,&QWP(64*3-128,$inp));
    604 	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
    605 	&pxor		($xt0,$xa0);
    606 	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
    607 	&pxor		($xt1,$xa1);
    608 	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
    609 	&pxor		($xt2,$xa2);
    610 	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
    611 	&pxor		($xt3,$xa3);
    612 	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
    613 	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
    614 	&movdqu		(&QWP(64*1-128,$out),$xt1);
    615 	&movdqu		(&QWP(64*2-128,$out),$xt2);
    616 	&movdqu		(&QWP(64*3-128,$out),$xt3);
    617 	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    618     }
        	# $len is biased by -256, so "no borrow" means that at least another
        	# 256 bytes remain.  Otherwise undo the bias and finish if nothing
        	# is left.
    619 	&sub		($len,64*4);
    620 	&jnc		(&label("outer_loop"));
    621 
    622 	&add		($len,64*4);
    623 	&jz		(&label("done"));
    624 
        	# Hand over to the 1x code: undo the +128 pointer bias and rebuild
        	# the counter/nonce vector in xmm3 from the first saved counter lane
        	# plus four, combined with the caller's vector whose first dword is
        	# masked off.
    625 	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
    626 	&lea		($inp,&DWP(-128,$inp));
    627 	&mov		("edx",&DWP(512+4,"esp"));
    628 	&lea		($out,&DWP(-128,$out));
    629 
    630 	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
    631 	&movdqu		("xmm3",&QWP(0,"ebx"));
    632 	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
    633 	&pand		("xmm3",&QWP(16*7,"eax"));
    634 	&por		("xmm3","xmm2");		# counter value
    635 }
    636 {
        # Register roles for the 1x code path: $a..$d hold the four state rows
        # (xmm0..xmm3), $t and $t1 are temporaries (xmm4, xmm5), $rot16 and
        # $rot24 hold the two pshufb masks (xmm6, xmm7).
    637 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
    638 
    639 sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
        	# Emits one set of quarter-round operations applied to whole rows:
        	# add/xor/rotate with pshufb for the two byte-granular rotations
        	# ($rot16, $rot24) and shift-right + shift-left + or for the other
        	# two (20/12 and 25/7).  Takes no arguments; operates on the
        	# registers named by the enclosing lexicals.
    640 	&paddd		($a,$b);
    641 	&pxor		($d,$a);
    642 	&pshufb		($d,$rot16);
    643 
    644 	&paddd		($c,$d);
    645 	&pxor		($b,$c);
    646 	&movdqa		($t,$b);
    647 	&psrld		($b,20);
    648 	&pslld		($t,12);
    649 	&por		($b,$t);
    650 
    651 	&paddd		($a,$b);
    652 	&pxor		($d,$a);
    653 	&pshufb		($d,$rot24);
    654 
    655 	&paddd		($c,$d);
    656 	&pxor		($b,$c);
    657 	&movdqa		($t,$b);
    658 	&psrld		($b,25);
    659 	&pslld		($t,7);
    660 	&por		($b,$t);
    661 }
    662 
    663 &set_label("1x");
        	# 1x code path: one 64-byte block per iteration.  On entry $d (xmm3)
        	# already holds the counter/nonce vector, edx points at the key and
        	# eax at the constant table.  The input state is kept at
        	# 16*0..16*3(esp).
    664 	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
    665 	&movdqu		($b,&QWP(0,"edx"));
    666 	&movdqu		($c,&QWP(16,"edx"));
    667 	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
    668 	&movdqa		($rot16,&QWP(0,"eax"));
    669 	&movdqa		($rot24,&QWP(16,"eax"));
        	# NOTE(review): the dword written here is overwritten by the
        	# 16-byte store of $d to 16*3(esp) a few lines below.
    670 	&mov		(&DWP(16*3,"esp"),"ebp");
    671 
    672 	&movdqa		(&QWP(16*0,"esp"),$a);
    673 	&movdqa		(&QWP(16*1,"esp"),$b);
    674 	&movdqa		(&QWP(16*2,"esp"),$c);
    675 	&movdqa		(&QWP(16*3,"esp"),$d);
    676 	&mov		("edx",10);
    677 	&jmp		(&label("loop1x"));
    678 
        # Next block: reload the input state and add (1,0,0,0) to the saved
        # counter/nonce row.
    679 &set_label("outer1x",16);
    680 	&movdqa		($d,&QWP(16*5,"eax"));		# one
    681 	&movdqa		($a,&QWP(16*0,"esp"));
    682 	&movdqa		($b,&QWP(16*1,"esp"));
    683 	&movdqa		($c,&QWP(16*2,"esp"));
    684 	&paddd		($d,&QWP(16*3,"esp"));
    685 	&mov		("edx",10);
    686 	&movdqa		(&QWP(16*3,"esp"),$d);
    687 	&jmp		(&label("loop1x"));
    688 
        # Ten iterations of two rounds each.  The pshufd instructions after the
        # first round rotate the lanes of rows b, c and d; the ones after the
        # second round apply the inverse rotations.
    689 &set_label("loop1x",16);
    690 	&SSSE3ROUND();
    691 	&pshufd	($c,$c,0b01001110);
    692 	&pshufd	($b,$b,0b00111001);
    693 	&pshufd	($d,$d,0b10010011);
    694 	&nop	();
    695 
    696 	&SSSE3ROUND();
    697 	&pshufd	($c,$c,0b01001110);
    698 	&pshufd	($b,$b,0b10010011);
    699 	&pshufd	($d,$d,0b00111001);
    700 
    701 	&dec		("edx");
    702 	&jnz		(&label("loop1x"));
    703 
        	# Add the input state to obtain the key-stream block.
    704 	&paddd		($a,&QWP(16*0,"esp"));
    705 	&paddd		($b,&QWP(16*1,"esp"));
    706 	&paddd		($c,&QWP(16*2,"esp"));
    707 	&paddd		($d,&QWP(16*3,"esp"));
    708 
    709 	&cmp		($len,64);
    710 	&jb		(&label("tail"));
    711 
    712 	&movdqu		($t,&QWP(16*0,$inp));
    713 	&movdqu		($t1,&QWP(16*1,$inp));
    714 	&pxor		($a,$t);		# xor with input
    715 	&movdqu		($t,&QWP(16*2,$inp));
    716 	&pxor		($b,$t1);
    717 	&movdqu		($t1,&QWP(16*3,$inp));
    718 	&pxor		($c,$t);
    719 	&pxor		($d,$t1);
    720 	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
    721 
    722 	&movdqu		(&QWP(16*0,$out),$a);	# write output
    723 	&movdqu		(&QWP(16*1,$out),$b);
    724 	&movdqu		(&QWP(16*2,$out),$c);
    725 	&movdqu		(&QWP(16*3,$out),$d);
    726 	&lea		($out,&DWP(16*4,$out));	# out+=64
    727 
    728 	&sub		($len,64);
    729 	&jnz		(&label("outer1x"));
    730 
    731 	&jmp		(&label("done"));
    732 
        # Tail: fewer than 64 bytes remain.  The key-stream block is written
        # over the input state at 0..63(esp) and xored into the output one
        # byte at a time; ebp is the running byte index.
    733 &set_label("tail");
    734 	&movdqa		(&QWP(16*0,"esp"),$a);
    735 	&movdqa		(&QWP(16*1,"esp"),$b);
    736 	&movdqa		(&QWP(16*2,"esp"),$c);
    737 	&movdqa		(&QWP(16*3,"esp"),$d);
    738 
    739 	&xor		("eax","eax");
    740 	&xor		("edx","edx");
    741 	&xor		("ebp","ebp");
    742 
    743 &set_label("tail_loop");
    744 	&movb		("al",&BP(0,"esp","ebp"));
    745 	&movb		("dl",&BP(0,$inp,"ebp"));
    746 	&lea		("ebp",&DWP(1,"ebp"));
    747 	&xor		("al","dl");
    748 	&movb		(&BP(-1,$out,"ebp"),"al");
    749 	&dec		($len);
    750 	&jnz		(&label("tail_loop"));
    751 }
        # Common exit: restore the stack pointer that the prologue saved at
        # 512(esp).
    752 &set_label("done");
    753 	&mov		("esp",&DWP(512,"esp"));
    754 &function_end("ChaCha20_ssse3");
    755 
    756 &align	(64);
        # Constant table addressed through eax by the SSSE3 code; each entry is
        # 16 bytes and is referenced as 16*k(eax).
    757 &set_label("ssse3_data");
    758 &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);	# 16*0: pshufb mask, "rot16"
    759 &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);	# 16*1: pshufb mask, "rot8" (loaded as $rot24 in the 1x path)
    760 &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# 16*2: sigma
    761 &data_word(0,1,2,3);	# 16*3: per-lane counter offsets for the 4x path
    762 &data_word(4,4,4,4);	# 16*4: counter step of the 4x path
    763 &data_word(1,0,0,0);	# 16*5: "one", counter step of the 1x path
    764 &data_word(4,0,0,0);	# 16*6: "+four", used when switching from 4x to 1x
    765 &data_word(0,-1,-1,-1);	# 16*7: mask that clears the first dword of the counter/nonce vector
    766 &align	(64);
    767 }
    768 &asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
    769 
    770 &asm_finish();
    771 
        # Output is buffered, so write errors (for example a full disk) may only
        # be reported when the handle is closed.  Check the result so that a
        # truncated assembly file stops the build instead of passing silently.
    772 close STDOUT or die "error closing STDOUT: $!";
    773