#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#                 1xIALU/gcc      4xSSSE3
# Pentium         17.5/+80%
# PIII            14.2/+60%
# P4              18.6/+84%
# Core2           9.56/+89%       4.83
# Westmere        9.50/+45%       3.35
# Sandy Bridge    10.5/+47%       3.20
# Haswell         8.15/+50%       2.83
# Skylake         7.53/+22%       2.75
# Silvermont      17.4/+36%       8.35
# Goldmont        13.4/+40%       4.36
# Sledgehammer    10.2/+54%
# Bulldozer       13.4/+50%       4.38(*)
#
# (*)     Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.
40 41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 42 push(@INC,"${dir}","${dir}../../perlasm"); 43 require "x86asm.pl"; 44 45 $output=pop; 46 open STDOUT,">$output"; 47 48 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 49 50 $xmm=$ymm=1; 51 $gasver=999; # enable everything 52 53 $a="eax"; 54 ($b,$b_)=("ebx","ebp"); 55 ($c,$c_)=("ecx","esi"); 56 ($d,$d_)=("edx","edi"); 57 58 sub QUARTERROUND { 59 my ($ai,$bi,$ci,$di,$i)=@_; 60 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next 61 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous 62 63 # a b c d 64 # 65 # 0 4 8 12 < even round 66 # 1 5 9 13 67 # 2 6 10 14 68 # 3 7 11 15 69 # 0 5 10 15 < odd round 70 # 1 6 11 12 71 # 2 7 8 13 72 # 3 4 9 14 73 74 if ($i==0) { 75 my $j=4; 76 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); 77 } elsif ($i==3) { 78 my $j=0; 79 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); 80 } elsif ($i==4) { 81 my $j=4; 82 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); 83 } elsif ($i==7) { 84 my $j=0; 85 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); 86 } 87 88 #&add ($a,$b); # see elsewhere 89 &xor ($d,$a); 90 &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3); 91 &rol ($d,16); 92 &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0); 93 &add ($c,$d); 94 &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3); 95 &xor ($b,$c); 96 &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn); 97 &rol ($b,12); 98 &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7); 99 &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter 100 &add ($a,$b); 101 &xor ($d,$a); 102 &mov (&DWP(4*$ai,"esp"),$a); 103 &rol ($d,8); 104 &mov ($a,&DWP(4*$an,"esp")); 105 &add ($c,$d); 106 &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn); 107 &mov ($d_,$d) if ($di==$dn); 108 &xor ($b,$c); 109 &add ($a,$b_) if ($i<7); # elsewhere 110 &rol ($b,7); 111 112 ($b,$b_)=($b_,$b); 113 ($c,$c_)=($c_,$c); 114 ($d,$d_)=($d_,$d); 115 } 116 117 &static_label("ssse3_shortcut"); 118 
&static_label("ssse3_data"); 119 &static_label("pic_point"); 120 121 &function_begin("ChaCha20_ctr32"); 122 &xor ("eax","eax"); 123 &cmp ("eax",&wparam(2)); # len==0? 124 &je (&label("no_data")); 125 if ($xmm) { 126 &call (&label("pic_point")); 127 &set_label("pic_point"); 128 &blindpop("eax"); 129 &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); 130 &test (&DWP(0,"ebp"),1<<24); # test FXSR bit 131 &jz (&label("x86")); 132 &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit 133 &jz (&label("x86")); 134 &jmp (&label("ssse3_shortcut")); 135 &set_label("x86"); 136 } 137 &mov ("esi",&wparam(3)); # key 138 &mov ("edi",&wparam(4)); # counter and nonce 139 140 &stack_push(33); 141 142 &mov ("eax",&DWP(4*0,"esi")); # copy key 143 &mov ("ebx",&DWP(4*1,"esi")); 144 &mov ("ecx",&DWP(4*2,"esi")); 145 &mov ("edx",&DWP(4*3,"esi")); 146 &mov (&DWP(64+4*4,"esp"),"eax"); 147 &mov (&DWP(64+4*5,"esp"),"ebx"); 148 &mov (&DWP(64+4*6,"esp"),"ecx"); 149 &mov (&DWP(64+4*7,"esp"),"edx"); 150 &mov ("eax",&DWP(4*4,"esi")); 151 &mov ("ebx",&DWP(4*5,"esi")); 152 &mov ("ecx",&DWP(4*6,"esi")); 153 &mov ("edx",&DWP(4*7,"esi")); 154 &mov (&DWP(64+4*8,"esp"),"eax"); 155 &mov (&DWP(64+4*9,"esp"),"ebx"); 156 &mov (&DWP(64+4*10,"esp"),"ecx"); 157 &mov (&DWP(64+4*11,"esp"),"edx"); 158 &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce 159 &mov ("ebx",&DWP(4*1,"edi")); 160 &mov ("ecx",&DWP(4*2,"edi")); 161 &mov ("edx",&DWP(4*3,"edi")); 162 &sub ("eax",1); 163 &mov (&DWP(64+4*12,"esp"),"eax"); 164 &mov (&DWP(64+4*13,"esp"),"ebx"); 165 &mov (&DWP(64+4*14,"esp"),"ecx"); 166 &mov (&DWP(64+4*15,"esp"),"edx"); 167 &jmp (&label("entry")); 168 169 &set_label("outer_loop",16); 170 &mov (&wparam(1),$b); # save input 171 &mov (&wparam(0),$a); # save output 172 &mov (&wparam(2),$c); # save len 173 &set_label("entry"); 174 &mov ($a,0x61707865); 175 &mov (&DWP(4*1,"esp"),0x3320646e); 176 &mov (&DWP(4*2,"esp"),0x79622d32); 177 &mov (&DWP(4*3,"esp"),0x6b206574); 178 179 &mov ($b, &DWP(64+4*5,"esp")); # copy 
key material 180 &mov ($b_,&DWP(64+4*6,"esp")); 181 &mov ($c, &DWP(64+4*10,"esp")); 182 &mov ($c_,&DWP(64+4*11,"esp")); 183 &mov ($d, &DWP(64+4*13,"esp")); 184 &mov ($d_,&DWP(64+4*14,"esp")); 185 &mov (&DWP(4*5,"esp"),$b); 186 &mov (&DWP(4*6,"esp"),$b_); 187 &mov (&DWP(4*10,"esp"),$c); 188 &mov (&DWP(4*11,"esp"),$c_); 189 &mov (&DWP(4*13,"esp"),$d); 190 &mov (&DWP(4*14,"esp"),$d_); 191 192 &mov ($b, &DWP(64+4*7,"esp")); 193 &mov ($d_,&DWP(64+4*15,"esp")); 194 &mov ($d, &DWP(64+4*12,"esp")); 195 &mov ($b_,&DWP(64+4*4,"esp")); 196 &mov ($c, &DWP(64+4*8,"esp")); 197 &mov ($c_,&DWP(64+4*9,"esp")); 198 &add ($d,1); # counter value 199 &mov (&DWP(4*7,"esp"),$b); 200 &mov (&DWP(4*15,"esp"),$d_); 201 &mov (&DWP(64+4*12,"esp"),$d); # save counter value 202 203 &mov ($b,10); # loop counter 204 &jmp (&label("loop")); 205 206 &set_label("loop",16); 207 &add ($a,$b_); # elsewhere 208 &mov (&DWP(128,"esp"),$b); # save loop counter 209 &mov ($b,$b_); 210 &QUARTERROUND(0, 4, 8, 12, 0); 211 &QUARTERROUND(1, 5, 9, 13, 1); 212 &QUARTERROUND(2, 6,10, 14, 2); 213 &QUARTERROUND(3, 7,11, 15, 3); 214 &QUARTERROUND(0, 5,10, 15, 4); 215 &QUARTERROUND(1, 6,11, 12, 5); 216 &QUARTERROUND(2, 7, 8, 13, 6); 217 &QUARTERROUND(3, 4, 9, 14, 7); 218 &dec ($b); 219 &jnz (&label("loop")); 220 221 &mov ($b,&wparam(2)); # load len 222 223 &add ($a,0x61707865); # accumulate key material 224 &add ($b_,&DWP(64+4*4,"esp")); 225 &add ($c, &DWP(64+4*8,"esp")); 226 &add ($c_,&DWP(64+4*9,"esp")); 227 228 &cmp ($b,64); 229 &jb (&label("tail")); 230 231 &mov ($b,&wparam(1)); # load input pointer 232 &add ($d, &DWP(64+4*12,"esp")); 233 &add ($d_,&DWP(64+4*14,"esp")); 234 235 &xor ($a, &DWP(4*0,$b)); # xor with input 236 &xor ($b_,&DWP(4*4,$b)); 237 &mov (&DWP(4*0,"esp"),$a); 238 &mov ($a,&wparam(0)); # load output pointer 239 &xor ($c, &DWP(4*8,$b)); 240 &xor ($c_,&DWP(4*9,$b)); 241 &xor ($d, &DWP(4*12,$b)); 242 &xor ($d_,&DWP(4*14,$b)); 243 &mov (&DWP(4*4,$a),$b_); # write output 244 &mov (&DWP(4*8,$a),$c); 245 
&mov (&DWP(4*9,$a),$c_); 246 &mov (&DWP(4*12,$a),$d); 247 &mov (&DWP(4*14,$a),$d_); 248 249 &mov ($b_,&DWP(4*1,"esp")); 250 &mov ($c, &DWP(4*2,"esp")); 251 &mov ($c_,&DWP(4*3,"esp")); 252 &mov ($d, &DWP(4*5,"esp")); 253 &mov ($d_,&DWP(4*6,"esp")); 254 &add ($b_,0x3320646e); # accumulate key material 255 &add ($c, 0x79622d32); 256 &add ($c_,0x6b206574); 257 &add ($d, &DWP(64+4*5,"esp")); 258 &add ($d_,&DWP(64+4*6,"esp")); 259 &xor ($b_,&DWP(4*1,$b)); 260 &xor ($c, &DWP(4*2,$b)); 261 &xor ($c_,&DWP(4*3,$b)); 262 &xor ($d, &DWP(4*5,$b)); 263 &xor ($d_,&DWP(4*6,$b)); 264 &mov (&DWP(4*1,$a),$b_); 265 &mov (&DWP(4*2,$a),$c); 266 &mov (&DWP(4*3,$a),$c_); 267 &mov (&DWP(4*5,$a),$d); 268 &mov (&DWP(4*6,$a),$d_); 269 270 &mov ($b_,&DWP(4*7,"esp")); 271 &mov ($c, &DWP(4*10,"esp")); 272 &mov ($c_,&DWP(4*11,"esp")); 273 &mov ($d, &DWP(4*13,"esp")); 274 &mov ($d_,&DWP(4*15,"esp")); 275 &add ($b_,&DWP(64+4*7,"esp")); 276 &add ($c, &DWP(64+4*10,"esp")); 277 &add ($c_,&DWP(64+4*11,"esp")); 278 &add ($d, &DWP(64+4*13,"esp")); 279 &add ($d_,&DWP(64+4*15,"esp")); 280 &xor ($b_,&DWP(4*7,$b)); 281 &xor ($c, &DWP(4*10,$b)); 282 &xor ($c_,&DWP(4*11,$b)); 283 &xor ($d, &DWP(4*13,$b)); 284 &xor ($d_,&DWP(4*15,$b)); 285 &lea ($b,&DWP(4*16,$b)); 286 &mov (&DWP(4*7,$a),$b_); 287 &mov ($b_,&DWP(4*0,"esp")); 288 &mov (&DWP(4*10,$a),$c); 289 &mov ($c,&wparam(2)); # len 290 &mov (&DWP(4*11,$a),$c_); 291 &mov (&DWP(4*13,$a),$d); 292 &mov (&DWP(4*15,$a),$d_); 293 &mov (&DWP(4*0,$a),$b_); 294 &lea ($a,&DWP(4*16,$a)); 295 &sub ($c,64); 296 &jnz (&label("outer_loop")); 297 298 &jmp (&label("done")); 299 300 &set_label("tail"); 301 &add ($d, &DWP(64+4*12,"esp")); 302 &add ($d_,&DWP(64+4*14,"esp")); 303 &mov (&DWP(4*0,"esp"),$a); 304 &mov (&DWP(4*4,"esp"),$b_); 305 &mov (&DWP(4*8,"esp"),$c); 306 &mov (&DWP(4*9,"esp"),$c_); 307 &mov (&DWP(4*12,"esp"),$d); 308 &mov (&DWP(4*14,"esp"),$d_); 309 310 &mov ($b_,&DWP(4*1,"esp")); 311 &mov ($c, &DWP(4*2,"esp")); 312 &mov ($c_,&DWP(4*3,"esp")); 313 &mov ($d, 
&DWP(4*5,"esp")); 314 &mov ($d_,&DWP(4*6,"esp")); 315 &add ($b_,0x3320646e); # accumulate key material 316 &add ($c, 0x79622d32); 317 &add ($c_,0x6b206574); 318 &add ($d, &DWP(64+4*5,"esp")); 319 &add ($d_,&DWP(64+4*6,"esp")); 320 &mov (&DWP(4*1,"esp"),$b_); 321 &mov (&DWP(4*2,"esp"),$c); 322 &mov (&DWP(4*3,"esp"),$c_); 323 &mov (&DWP(4*5,"esp"),$d); 324 &mov (&DWP(4*6,"esp"),$d_); 325 326 &mov ($b_,&DWP(4*7,"esp")); 327 &mov ($c, &DWP(4*10,"esp")); 328 &mov ($c_,&DWP(4*11,"esp")); 329 &mov ($d, &DWP(4*13,"esp")); 330 &mov ($d_,&DWP(4*15,"esp")); 331 &add ($b_,&DWP(64+4*7,"esp")); 332 &add ($c, &DWP(64+4*10,"esp")); 333 &add ($c_,&DWP(64+4*11,"esp")); 334 &add ($d, &DWP(64+4*13,"esp")); 335 &add ($d_,&DWP(64+4*15,"esp")); 336 &mov (&DWP(4*7,"esp"),$b_); 337 &mov ($b_,&wparam(1)); # load input 338 &mov (&DWP(4*10,"esp"),$c); 339 &mov ($c,&wparam(0)); # load output 340 &mov (&DWP(4*11,"esp"),$c_); 341 &xor ($c_,$c_); 342 &mov (&DWP(4*13,"esp"),$d); 343 &mov (&DWP(4*15,"esp"),$d_); 344 345 &xor ("eax","eax"); 346 &xor ("edx","edx"); 347 &set_label("tail_loop"); 348 &movb ("al",&BP(0,$c_,$b_)); 349 &movb ("dl",&BP(0,"esp",$c_)); 350 &lea ($c_,&DWP(1,$c_)); 351 &xor ("al","dl"); 352 &mov (&BP(-1,$c,$c_),"al"); 353 &dec ($b); 354 &jnz (&label("tail_loop")); 355 356 &set_label("done"); 357 &stack_pop(33); 358 &set_label("no_data"); 359 &function_end("ChaCha20_ctr32"); 360 361 if ($xmm) { 362 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); 363 my ($out,$inp,$len)=("edi","esi","ecx"); 364 365 sub QUARTERROUND_SSSE3 { 366 my ($ai,$bi,$ci,$di,$i)=@_; 367 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next 368 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous 369 370 # a b c d 371 # 372 # 0 4 8 12 < even round 373 # 1 5 9 13 374 # 2 6 10 14 375 # 3 7 11 15 376 # 0 5 10 15 < odd round 377 # 1 6 11 12 378 # 2 7 8 13 379 # 3 4 9 14 380 381 if ($i==0) { 382 my $j=4; 383 
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); 384 } elsif ($i==3) { 385 my $j=0; 386 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); 387 } elsif ($i==4) { 388 my $j=4; 389 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); 390 } elsif ($i==7) { 391 my $j=0; 392 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); 393 } 394 395 #&paddd ($xa,$xb); # see elsewhere 396 #&pxor ($xd,$xa); # see elsewhere 397 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); 398 &pshufb ($xd,&QWP(0,"eax")); # rot16 399 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); 400 &paddd ($xc,$xd); 401 &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); 402 &pxor ($xb,$xc); 403 &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); 404 &movdqa ($xa_,$xb); # borrow as temporary 405 &pslld ($xb,12); 406 &psrld ($xa_,20); 407 &por ($xb,$xa_); 408 &movdqa($xa_,&QWP(16*$an-128,"ebx")); 409 &paddd ($xa,$xb); 410 &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); 411 &pxor ($xd,$xa); 412 &movdqa (&QWP(16*$ai-128,"ebx"),$xa); 413 &pshufb ($xd,&QWP(16,"eax")); # rot8 414 &paddd ($xc,$xd); 415 &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); 416 &movdqa ($xd_,$xd) if ($di==$dn); 417 &pxor ($xb,$xc); 418 &paddd ($xa_,$xb_) if ($i<7); # elsewhere 419 &movdqa ($xa,$xb); # borrow as temporary 420 &pslld ($xb,7); 421 &psrld ($xa,25); 422 &pxor ($xd_,$xa_) if ($i<7); # elsewhere 423 &por ($xb,$xa); 424 425 ($xa,$xa_)=($xa_,$xa); 426 ($xb,$xb_)=($xb_,$xb); 427 ($xc,$xc_)=($xc_,$xc); 428 ($xd,$xd_)=($xd_,$xd); 429 } 430 431 &function_begin("ChaCha20_ssse3"); 432 &set_label("ssse3_shortcut"); 433 &mov ($out,&wparam(0)); 434 &mov ($inp,&wparam(1)); 435 &mov ($len,&wparam(2)); 436 &mov ("edx",&wparam(3)); # key 437 &mov ("ebx",&wparam(4)); # counter and nonce 438 439 &mov ("ebp","esp"); 440 &stack_push (131); 441 &and ("esp",-64); 442 &mov (&DWP(512,"esp"),"ebp"); 443 444 &lea ("eax",&DWP(&label("ssse3_data")."-". 
445 &label("pic_point"),"eax")); 446 &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce 447 448 if (defined($gasver) && $gasver>=2.17) { # even though we encode 449 # pshufb manually, we 450 # handle only register 451 # operands, while this 452 # segment uses memory 453 # operand... 454 &cmp ($len,64*4); 455 &jb (&label("1x")); 456 457 &mov (&DWP(512+4,"esp"),"edx"); # offload pointers 458 &mov (&DWP(512+8,"esp"),"ebx"); 459 &sub ($len,64*4); # bias len 460 &lea ("ebp",&DWP(256+128,"esp")); # size optimization 461 462 &movdqu ("xmm7",&QWP(0,"edx")); # key 463 &pshufd ("xmm0","xmm3",0x00); 464 &pshufd ("xmm1","xmm3",0x55); 465 &pshufd ("xmm2","xmm3",0xaa); 466 &pshufd ("xmm3","xmm3",0xff); 467 &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters 468 &pshufd ("xmm4","xmm7",0x00); 469 &pshufd ("xmm5","xmm7",0x55); 470 &psubd ("xmm0",&QWP(16*4,"eax")); 471 &pshufd ("xmm6","xmm7",0xaa); 472 &pshufd ("xmm7","xmm7",0xff); 473 &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); 474 &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); 475 &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); 476 &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); 477 &movdqu ("xmm3",&QWP(16,"edx")); # key 478 &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); 479 &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); 480 &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); 481 &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); 482 &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma 483 &lea ("ebx",&DWP(128,"esp")); # size optimization 484 485 &pshufd ("xmm0","xmm3",0x00); 486 &pshufd ("xmm1","xmm3",0x55); 487 &pshufd ("xmm2","xmm3",0xaa); 488 &pshufd ("xmm3","xmm3",0xff); 489 &pshufd ("xmm4","xmm7",0x00); 490 &pshufd ("xmm5","xmm7",0x55); 491 &pshufd ("xmm6","xmm7",0xaa); 492 &pshufd ("xmm7","xmm7",0xff); 493 &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); 494 &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); 495 &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); 496 &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); 497 &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); 498 &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); 499 &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); 
500 &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); 501 502 &lea ($inp,&DWP(128,$inp)); # size optimization 503 &lea ($out,&DWP(128,$out)); # size optimization 504 &jmp (&label("outer_loop")); 505 506 &set_label("outer_loop",16); 507 #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material 508 &movdqa ("xmm1",&QWP(16*1-128,"ebp")); 509 &movdqa ("xmm2",&QWP(16*2-128,"ebp")); 510 &movdqa ("xmm3",&QWP(16*3-128,"ebp")); 511 #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); 512 &movdqa ("xmm5",&QWP(16*5-128,"ebp")); 513 &movdqa ("xmm6",&QWP(16*6-128,"ebp")); 514 &movdqa ("xmm7",&QWP(16*7-128,"ebp")); 515 #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); 516 &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); 517 &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); 518 &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); 519 #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); 520 &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); 521 &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); 522 &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); 523 #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); 524 #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); 525 &movdqa ("xmm2",&QWP(16*10-128,"ebp")); 526 &movdqa ("xmm3",&QWP(16*11-128,"ebp")); 527 &movdqa ("xmm4",&QWP(16*12-128,"ebp")); 528 &movdqa ("xmm5",&QWP(16*13-128,"ebp")); 529 &movdqa ("xmm6",&QWP(16*14-128,"ebp")); 530 &movdqa ("xmm7",&QWP(16*15-128,"ebp")); 531 &paddd ("xmm4",&QWP(16*4,"eax")); # counter value 532 #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); 533 #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); 534 &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); 535 &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); 536 &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); 537 &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); 538 &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); 539 &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); 540 &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value 541 542 &movdqa ($xa, &QWP(16*0-128,"ebp")); 543 &movdqa ($xd, "xmm4"); 544 &movdqa ($xb_,&QWP(16*4-128,"ebp")); 545 &movdqa ($xc, &QWP(16*8-128,"ebp")); 546 &movdqa ($xc_,&QWP(16*9-128,"ebp")); 547 548 &mov ("edx",10); # loop counter 549 &nop 
(); 550 551 &set_label("loop",16); 552 &paddd ($xa,$xb_); # elsewhere 553 &movdqa ($xb,$xb_); 554 &pxor ($xd,$xa); # elsewhere 555 &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); 556 &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); 557 &QUARTERROUND_SSSE3(2, 6,10, 14, 2); 558 &QUARTERROUND_SSSE3(3, 7,11, 15, 3); 559 &QUARTERROUND_SSSE3(0, 5,10, 15, 4); 560 &QUARTERROUND_SSSE3(1, 6,11, 12, 5); 561 &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); 562 &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); 563 &dec ("edx"); 564 &jnz (&label("loop")); 565 566 &movdqa (&QWP(16*4-128,"ebx"),$xb_); 567 &movdqa (&QWP(16*8-128,"ebx"),$xc); 568 &movdqa (&QWP(16*9-128,"ebx"),$xc_); 569 &movdqa (&QWP(16*12-128,"ebx"),$xd); 570 &movdqa (&QWP(16*14-128,"ebx"),$xd_); 571 572 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); 573 574 #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there 575 &movdqa ($xa1,&QWP(16*1-128,"ebx")); 576 &movdqa ($xa2,&QWP(16*2-128,"ebx")); 577 &movdqa ($xa3,&QWP(16*3-128,"ebx")); 578 579 for($i=0;$i<256;$i+=64) { 580 &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material 581 &paddd ($xa1,&QWP($i+16*1-128,"ebp")); 582 &paddd ($xa2,&QWP($i+16*2-128,"ebp")); 583 &paddd ($xa3,&QWP($i+16*3-128,"ebp")); 584 585 &movdqa ($xt2,$xa0); # "de-interlace" data 586 &punpckldq ($xa0,$xa1); 587 &movdqa ($xt3,$xa2); 588 &punpckldq ($xa2,$xa3); 589 &punpckhdq ($xt2,$xa1); 590 &punpckhdq ($xt3,$xa3); 591 &movdqa ($xa1,$xa0); 592 &punpcklqdq ($xa0,$xa2); # "a0" 593 &movdqa ($xa3,$xt2); 594 &punpcklqdq ($xt2,$xt3); # "a2" 595 &punpckhqdq ($xa1,$xa2); # "a1" 596 &punpckhqdq ($xa3,$xt3); # "a3" 597 598 #($xa2,$xt2)=($xt2,$xa2); 599 600 &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input 601 &movdqu ($xt1,&QWP(64*1-128,$inp)); 602 &movdqu ($xa2,&QWP(64*2-128,$inp)); 603 &movdqu ($xt3,&QWP(64*3-128,$inp)); 604 &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); 605 &pxor ($xt0,$xa0); 606 &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); 607 &pxor ($xt1,$xa1); 608 &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); 
609 &pxor ($xt2,$xa2); 610 &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); 611 &pxor ($xt3,$xa3); 612 &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); 613 &movdqu (&QWP(64*0-128,$out),$xt0); # store output 614 &movdqu (&QWP(64*1-128,$out),$xt1); 615 &movdqu (&QWP(64*2-128,$out),$xt2); 616 &movdqu (&QWP(64*3-128,$out),$xt3); 617 &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); 618 } 619 &sub ($len,64*4); 620 &jnc (&label("outer_loop")); 621 622 &add ($len,64*4); 623 &jz (&label("done")); 624 625 &mov ("ebx",&DWP(512+8,"esp")); # restore pointers 626 &lea ($inp,&DWP(-128,$inp)); 627 &mov ("edx",&DWP(512+4,"esp")); 628 &lea ($out,&DWP(-128,$out)); 629 630 &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value 631 &movdqu ("xmm3",&QWP(0,"ebx")); 632 &paddd ("xmm2",&QWP(16*6,"eax")); # +four 633 &pand ("xmm3",&QWP(16*7,"eax")); 634 &por ("xmm3","xmm2"); # counter value 635 } 636 { 637 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); 638 639 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round 640 &paddd ($a,$b); 641 &pxor ($d,$a); 642 &pshufb ($d,$rot16); 643 644 &paddd ($c,$d); 645 &pxor ($b,$c); 646 &movdqa ($t,$b); 647 &psrld ($b,20); 648 &pslld ($t,12); 649 &por ($b,$t); 650 651 &paddd ($a,$b); 652 &pxor ($d,$a); 653 &pshufb ($d,$rot24); 654 655 &paddd ($c,$d); 656 &pxor ($b,$c); 657 &movdqa ($t,$b); 658 &psrld ($b,25); 659 &pslld ($t,7); 660 &por ($b,$t); 661 } 662 663 &set_label("1x"); 664 &movdqa ($a,&QWP(16*2,"eax")); # sigma 665 &movdqu ($b,&QWP(0,"edx")); 666 &movdqu ($c,&QWP(16,"edx")); 667 #&movdqu ($d,&QWP(0,"ebx")); # already loaded 668 &movdqa ($rot16,&QWP(0,"eax")); 669 &movdqa ($rot24,&QWP(16,"eax")); 670 &mov (&DWP(16*3,"esp"),"ebp"); 671 672 &movdqa (&QWP(16*0,"esp"),$a); 673 &movdqa (&QWP(16*1,"esp"),$b); 674 &movdqa (&QWP(16*2,"esp"),$c); 675 &movdqa (&QWP(16*3,"esp"),$d); 676 &mov ("edx",10); 677 &jmp (&label("loop1x")); 678 679 &set_label("outer1x",16); 680 &movdqa ($d,&QWP(16*5,"eax")); # one 681 &movdqa 
($a,&QWP(16*0,"esp")); 682 &movdqa ($b,&QWP(16*1,"esp")); 683 &movdqa ($c,&QWP(16*2,"esp")); 684 &paddd ($d,&QWP(16*3,"esp")); 685 &mov ("edx",10); 686 &movdqa (&QWP(16*3,"esp"),$d); 687 &jmp (&label("loop1x")); 688 689 &set_label("loop1x",16); 690 &SSSE3ROUND(); 691 &pshufd ($c,$c,0b01001110); 692 &pshufd ($b,$b,0b00111001); 693 &pshufd ($d,$d,0b10010011); 694 &nop (); 695 696 &SSSE3ROUND(); 697 &pshufd ($c,$c,0b01001110); 698 &pshufd ($b,$b,0b10010011); 699 &pshufd ($d,$d,0b00111001); 700 701 &dec ("edx"); 702 &jnz (&label("loop1x")); 703 704 &paddd ($a,&QWP(16*0,"esp")); 705 &paddd ($b,&QWP(16*1,"esp")); 706 &paddd ($c,&QWP(16*2,"esp")); 707 &paddd ($d,&QWP(16*3,"esp")); 708 709 &cmp ($len,64); 710 &jb (&label("tail")); 711 712 &movdqu ($t,&QWP(16*0,$inp)); 713 &movdqu ($t1,&QWP(16*1,$inp)); 714 &pxor ($a,$t); # xor with input 715 &movdqu ($t,&QWP(16*2,$inp)); 716 &pxor ($b,$t1); 717 &movdqu ($t1,&QWP(16*3,$inp)); 718 &pxor ($c,$t); 719 &pxor ($d,$t1); 720 &lea ($inp,&DWP(16*4,$inp)); # inp+=64 721 722 &movdqu (&QWP(16*0,$out),$a); # write output 723 &movdqu (&QWP(16*1,$out),$b); 724 &movdqu (&QWP(16*2,$out),$c); 725 &movdqu (&QWP(16*3,$out),$d); 726 &lea ($out,&DWP(16*4,$out)); # inp+=64 727 728 &sub ($len,64); 729 &jnz (&label("outer1x")); 730 731 &jmp (&label("done")); 732 733 &set_label("tail"); 734 &movdqa (&QWP(16*0,"esp"),$a); 735 &movdqa (&QWP(16*1,"esp"),$b); 736 &movdqa (&QWP(16*2,"esp"),$c); 737 &movdqa (&QWP(16*3,"esp"),$d); 738 739 &xor ("eax","eax"); 740 &xor ("edx","edx"); 741 &xor ("ebp","ebp"); 742 743 &set_label("tail_loop"); 744 &movb ("al",&BP(0,"esp","ebp")); 745 &movb ("dl",&BP(0,$inp,"ebp")); 746 &lea ("ebp",&DWP(1,"ebp")); 747 &xor ("al","dl"); 748 &movb (&BP(-1,$out,"ebp"),"al"); 749 &dec ($len); 750 &jnz (&label("tail_loop")); 751 } 752 &set_label("done"); 753 &mov ("esp",&DWP(512,"esp")); 754 &function_end("ChaCha20_ssse3"); 755 756 &align (64); 757 &set_label("ssse3_data"); 758 &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 
0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); 759 &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); 760 &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); 761 &data_word(0,1,2,3); 762 &data_word(4,4,4,4); 763 &data_word(1,0,0,0); 764 &data_word(4,0,0,0); 765 &data_word(0,-1,-1,-1); 766 &align (64); 767 } 768 &asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>"); 769 770 &asm_finish(); 771 772 close STDOUT; 773