1 #!/usr/bin/env perl 2 3 # Copyright (c) 2015, CloudFlare Ltd. 4 # 5 # Permission to use, copy, modify, and/or distribute this software for any 6 # purpose with or without fee is hereby granted, provided that the above 7 # copyright notice and this permission notice appear in all copies. 8 # 9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 16 17 ############################################################################## 18 # # 19 # Author: Vlad Krasnov # 20 # # 21 ############################################################################## 22 23 $flavour = shift; 24 $output = shift; 25 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 26 27 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 28 29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 30 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 31 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 32 die "can't locate x86_64-xlate.pl"; 33 34 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 35 *STDOUT=*OUT; 36 37 $avx = 2; 38 39 $code.=<<___; 40 .text 41 .extern OPENSSL_ia32cap_P 42 43 chacha20_poly1305_constants: 44 45 .align 64 46 .chacha20_consts: 47 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 48 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 49 .rol8: 50 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 51 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 52 .rol16: 53 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 54 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 55 .avx2_init: 56 .long 0,0,0,0 57 .sse_inc: 58 .long 1,0,0,0 59 .avx2_inc: 60 .long 2,0,0,0,2,0,0,0 61 .clamp: 62 .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 63 .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF 64 .align 16 65 .and_masks: 66 .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 67 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 68 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 69 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 70 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 71 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 72 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 73 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 74 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 75 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 76 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 77 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 78 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 79 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 80 .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 81 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 82 ___ 83 84 my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8"); 85 my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); 86 my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); 87 my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); 88 my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 89 my $r_store="0*16(%rbp)"; 90 my $s_store="1*16(%rbp)"; 91 my $len_store="2*16(%rbp)"; 92 my $state1_store="3*16(%rbp)"; 93 my $state2_store="4*16(%rbp)"; 94 my $tmp_store="5*16(%rbp)"; 95 my $ctr0_store="6*16(%rbp)"; 96 my $ctr1_store="7*16(%rbp)"; 97 my $ctr2_store="8*16(%rbp)"; 98 my $ctr3_store="9*16(%rbp)"; 99 100 sub chacha_qr { 101 my ($a,$b,$c,$d,$t,$dir)=@_; 102 $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); 103 $code.="paddd $b, $a 104 pxor $a, $d 105 pshufb .rol16(%rip), $d 106 paddd $d, $c 107 pxor $c, $b 108 movdqa $b, $t 109 pslld \$12, $t 110 psrld \$20, $b 111 pxor $t, $b 112 paddd $b, $a 113 pxor $a, $d 114 pshufb .rol8(%rip), $d 115 paddd $d, $c 116 pxor $c, $b 117 movdqa $b, $t 118 pslld \$7, $t 119 psrld \$25, $b 120 pxor $t, $b\n"; 121 $code.="palignr \$4, $b, $b 122 palignr \$8, $c, $c 123 palignr \$12, $d, $d\n" if ($dir =~ /left/); 124 $code.="palignr \$12, $b, $b 125 palignr \$8, $c, $c 126 palignr \$4, $d, $d\n" if ($dir =~ /right/); 127 $code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); 128 } 129 130 sub poly_add { 131 my ($src)=@_; 132 $code.="add $src, $acc0 133 adc 8+$src, $acc1 134 adc \$1, $acc2\n"; 135 } 136 137 sub poly_stage1 { 138 $code.="mov 0+$r_store, %rax 139 mov %rax, $t2 140 mul $acc0 141 mov %rax, $t0 142 mov %rdx, $t1 143 mov 0+$r_store, %rax 144 mul $acc1 145 imulq $acc2, $t2 146 add %rax, $t1 147 adc %rdx, $t2\n"; 148 } 149 150 sub poly_stage2 { 151 $code.="mov 8+$r_store, %rax 152 mov %rax, $t3 153 mul $acc0 154 add %rax, $t1 155 adc \$0, %rdx 156 mov %rdx, $acc0 157 mov 8+$r_store, %rax 158 mul $acc1 159 add %rax, $t2 160 adc \$0, %rdx\n"; 161 } 162 163 sub poly_stage3 { 164 $code.="imulq $acc2, $t3 165 add $acc0, $t2 166 adc %rdx, $t3\n"; 167 } 168 169 sub poly_reduce_stage { 170 $code.="mov $t0, $acc0 171 mov $t1, $acc1 172 mov $t2, $acc2 173 and \$3, $acc2 174 mov $t2, $t0 175 and \$-4, $t0 176 mov $t3, $t1 177 shrd \$2, $t3, $t2 178 shr \$2, $t3 179 add $t0, $acc0 180 adc $t1, $acc1 181 adc \$0, $acc2 182 add $t2, $acc0 183 adc $t3, $acc1 184 adc \$0, $acc2\n"; 185 } 186 187 sub poly_mul { 188 &poly_stage1(); 189 &poly_stage2(); 190 &poly_stage3(); 191 &poly_reduce_stage(); 192 } 193 194 sub prep_state { 195 my ($n)=@_; 196 $code.="movdqa .chacha20_consts(%rip), $A0 197 movdqa $state1_store, $B0 198 movdqa $state2_store, $C0\n"; 199 $code.="movdqa $A0, $A1 200 movdqa $B0, $B1 201 movdqa $C0, $C1\n" if ($n ge 2); 202 $code.="movdqa $A0, $A2 203 movdqa $B0, $B2 204 movdqa $C0, $C2\n" if ($n ge 3); 205 $code.="movdqa $A0, $A3 206 movdqa $B0, $B3 207 movdqa $C0, $C3\n" if ($n ge 4); 208 $code.="movdqa $ctr0_store, $D0 209 paddd .sse_inc(%rip), $D0 210 movdqa $D0, $ctr0_store\n" if ($n eq 1); 211 $code.="movdqa $ctr0_store, $D1 212 paddd .sse_inc(%rip), $D1 213 movdqa $D1, $D0 214 paddd .sse_inc(%rip), $D0 215 movdqa $D0, $ctr0_store 216 movdqa $D1, $ctr1_store\n" if ($n eq 2); 217 $code.="movdqa $ctr0_store, $D2 218 paddd .sse_inc(%rip), $D2 219 movdqa $D2, $D1 220 paddd .sse_inc(%rip), $D1 221 movdqa $D1, $D0 222 paddd 
.sse_inc(%rip), $D0 223 movdqa $D0, $ctr0_store 224 movdqa $D1, $ctr1_store 225 movdqa $D2, $ctr2_store\n" if ($n eq 3); 226 $code.="movdqa $ctr0_store, $D3 227 paddd .sse_inc(%rip), $D3 228 movdqa $D3, $D2 229 paddd .sse_inc(%rip), $D2 230 movdqa $D2, $D1 231 paddd .sse_inc(%rip), $D1 232 movdqa $D1, $D0 233 paddd .sse_inc(%rip), $D0 234 movdqa $D0, $ctr0_store 235 movdqa $D1, $ctr1_store 236 movdqa $D2, $ctr2_store 237 movdqa $D3, $ctr3_store\n" if ($n eq 4); 238 } 239 240 sub finalize_state { 241 my ($n)=@_; 242 $code.="paddd .chacha20_consts(%rip), $A3 243 paddd $state1_store, $B3 244 paddd $state2_store, $C3 245 paddd $ctr3_store, $D3\n" if ($n eq 4); 246 $code.="paddd .chacha20_consts(%rip), $A2 247 paddd $state1_store, $B2 248 paddd $state2_store, $C2 249 paddd $ctr2_store, $D2\n" if ($n ge 3); 250 $code.="paddd .chacha20_consts(%rip), $A1 251 paddd $state1_store, $B1 252 paddd $state2_store, $C1 253 paddd $ctr1_store, $D1\n" if ($n ge 2); 254 $code.="paddd .chacha20_consts(%rip), $A0 255 paddd $state1_store, $B0 256 paddd $state2_store, $C0 257 paddd $ctr0_store, $D0\n"; 258 } 259 260 sub xor_stream { 261 my ($A, $B, $C, $D, $offset)=@_; 262 $code.="movdqu 0*16 + $offset($inp), $A3 263 movdqu 1*16 + $offset($inp), $B3 264 movdqu 2*16 + $offset($inp), $C3 265 movdqu 3*16 + $offset($inp), $D3 266 pxor $A3, $A 267 pxor $B3, $B 268 pxor $C3, $C 269 pxor $D, $D3 270 movdqu $A, 0*16 + $offset($oup) 271 movdqu $B, 1*16 + $offset($oup) 272 movdqu $C, 2*16 + $offset($oup) 273 movdqu $D3, 3*16 + $offset($oup)\n"; 274 } 275 276 sub xor_stream_using_temp { 277 my ($A, $B, $C, $D, $offset, $temp)=@_; 278 $code.="movdqa $temp, $tmp_store 279 movdqu 0*16 + $offset($inp), $temp 280 pxor $A, $temp 281 movdqu $temp, 0*16 + $offset($oup) 282 movdqu 1*16 + $offset($inp), $temp 283 pxor $B, $temp 284 movdqu $temp, 1*16 + $offset($oup) 285 movdqu 2*16 + $offset($inp), $temp 286 pxor $C, $temp 287 movdqu $temp, 2*16 + $offset($oup) 288 movdqu 3*16 + $offset($inp), $temp 289 pxor $D, $temp 290 movdqu $temp, 3*16 + $offset($oup)\n"; 291 } 292 293 sub gen_chacha_round { 294 my ($rot1, $rot2, $shift)=@_; 295 my $round=""; 296 $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); 297 $round.="movdqa $rot2, $C0 298 paddd $B3, $A3 299 paddd $B2, $A2 300 paddd $B1, $A1 301 paddd $B0, $A0 302 pxor $A3, $D3 303 pxor $A2, $D2 304 pxor $A1, $D1 305 pxor $A0, $D0 306 pshufb $C0, $D3 307 pshufb $C0, $D2 308 pshufb $C0, $D1 309 pshufb $C0, $D0 310 movdqa $tmp_store, $C0 311 paddd $D3, $C3 312 paddd $D2, $C2 313 paddd $D1, $C1 314 paddd $D0, $C0 315 pxor $C3, $B3 316 pxor $C2, $B2 317 pxor $C1, $B1 318 pxor $C0, $B0 319 movdqa $C0, $tmp_store 320 movdqa $B3, $C0 321 psrld \$$rot1, $C0 322 pslld \$32-$rot1, $B3 323 pxor $C0, $B3 324 movdqa $B2, $C0 325 psrld \$$rot1, $C0 326 pslld \$32-$rot1, $B2 327 pxor $C0, $B2 328 movdqa $B1, $C0 329 psrld \$$rot1, $C0 330 pslld \$32-$rot1, $B1 331 pxor $C0, $B1 332 movdqa $B0, $C0 333 psrld \$$rot1, $C0 334 pslld \$32-$rot1, $B0 335 pxor $C0, $B0\n"; 336 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 337 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 338 $round.="movdqa $tmp_store, $C0 339 palignr \$$s1, $B3, $B3 340 palignr \$$s2, $C3, $C3 341 palignr \$$s3, $D3, $D3 342 palignr \$$s1, $B2, $B2 343 palignr \$$s2, $C2, $C2 344 palignr \$$s3, $D2, $D2 345 palignr \$$s1, $B1, $B1 346 palignr \$$s2, $C1, $C1 347 palignr \$$s3, $D1, $D1 348 palignr \$$s1, $B0, $B0 349 palignr \$$s2, $C0, $C0 350 palignr \$$s3, $D0, $D0\n" 351 if (($shift =~ /left/) || ($shift =~ /right/)); 352 return 
$round; 353 }; 354 355 $chacha_body = &gen_chacha_round(20, ".rol16(%rip)") . 356 &gen_chacha_round(25, ".rol8(%rip)", "left") . 357 &gen_chacha_round(20, ".rol16(%rip)") . 358 &gen_chacha_round(25, ".rol8(%rip)", "right"); 359 360 my @loop_body = split /\n/, $chacha_body; 361 362 sub emit_body { 363 my ($n)=@_; 364 for (my $i=0; $i < $n; $i++) { 365 $code=$code.shift(@loop_body)."\n"; 366 }; 367 } 368 369 { 370 ################################################################################ 371 # void poly_hash_ad_internal(); 372 $code.=" 373 .type poly_hash_ad_internal,\@function,2 374 .align 64 375 poly_hash_ad_internal: 376 .cfi_startproc 377 xor $acc0, $acc0 378 xor $acc1, $acc1 379 xor $acc2, $acc2 380 cmp \$13, $itr2 381 jne hash_ad_loop 382 poly_fast_tls_ad: 383 # Special treatment for the TLS case of 13 bytes 384 mov ($adp), $acc0 385 mov 5($adp), $acc1 386 shr \$24, $acc1 387 mov \$1, $acc2\n"; 388 &poly_mul(); $code.=" 389 ret 390 hash_ad_loop: 391 # Hash in 16 byte chunk 392 cmp \$16, $itr2 393 jb hash_ad_tail\n"; 394 &poly_add("0($adp)"); 395 &poly_mul(); $code.=" 396 lea 1*16($adp), $adp 397 sub \$16, $itr2 398 jmp hash_ad_loop 399 hash_ad_tail: 400 cmp \$0, $itr2 401 je 1f 402 # Hash last < 16 byte tail 403 xor $t0, $t0 404 xor $t1, $t1 405 xor $t2, $t2 406 add $itr2, $adp 407 hash_ad_tail_loop: 408 shld \$8, $t0, $t1 409 shl \$8, $t0 410 movzxb -1($adp), $t2 411 xor $t2, $t0 412 dec $adp 413 dec $itr2 414 jne hash_ad_tail_loop 415 416 add $t0, $acc0 417 adc $t1, $acc1 418 adc \$1, $acc2\n"; 419 &poly_mul(); $code.=" 420 # Finished AD 421 1: 422 ret 423 .cfi_endproc 424 .size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; 425 } 426 427 { 428 ################################################################################ 429 # void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); 430 $code.=" 431 .globl chacha20_poly1305_open 432 .type chacha20_poly1305_open,\@function,2 433 .align 64 434 chacha20_poly1305_open: 435 .cfi_startproc 436 push %rbp 437 .cfi_adjust_cfa_offset 8 438 push %rbx 439 .cfi_adjust_cfa_offset 8 440 push %r12 441 .cfi_adjust_cfa_offset 8 442 push %r13 443 .cfi_adjust_cfa_offset 8 444 push %r14 445 .cfi_adjust_cfa_offset 8 446 push %r15 447 .cfi_adjust_cfa_offset 8 448 # We write the calculated authenticator back to keyp at the end, so save 449 # the pointer on the stack too. 
450 push $keyp 451 .cfi_adjust_cfa_offset 8 452 sub \$288 + 32, %rsp 453 .cfi_adjust_cfa_offset 288 + 32 454 .cfi_offset rbp, -16 455 .cfi_offset rbx, -24 456 .cfi_offset r12, -32 457 .cfi_offset r13, -40 458 .cfi_offset r14, -48 459 .cfi_offset r15, -56 460 lea 32(%rsp), %rbp 461 and \$-32, %rbp 462 mov %rdx, 8+$len_store 463 mov %r8, 0+$len_store 464 mov %rdx, $inl\n"; $code.=" 465 mov OPENSSL_ia32cap_P+8(%rip), %eax 466 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 467 xor \$`(1<<5) + (1<<8)`, %eax 468 jz chacha20_poly1305_open_avx2\n" if ($avx>1); 469 $code.=" 470 1: 471 cmp \$128, $inl 472 jbe open_sse_128 473 # For long buffers, prepare the poly key first 474 movdqa .chacha20_consts(%rip), $A0 475 movdqu 0*16($keyp), $B0 476 movdqu 1*16($keyp), $C0 477 movdqu 2*16($keyp), $D0 478 movdqa $D0, $T1 479 # Store on stack, to free keyp 480 movdqa $B0, $state1_store 481 movdqa $C0, $state2_store 482 movdqa $D0, $ctr0_store 483 mov \$10, $acc0 484 1: \n"; 485 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 486 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 487 dec $acc0 488 jne 1b 489 # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 490 paddd .chacha20_consts(%rip), $A0 491 paddd $state1_store, $B0 492 # Clamp and store the key 493 pand .clamp(%rip), $A0 494 movdqa $A0, $r_store 495 movdqa $B0, $s_store 496 # Hash 497 mov %r8, $itr2 498 call poly_hash_ad_internal 499 open_sse_main_loop: 500 cmp \$16*16, $inl 501 jb 2f 502 # Load state, increment counter blocks\n"; 503 &prep_state(4); $code.=" 504 # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we 505 # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 506 mov \$4, $itr1 507 mov $inp, $itr2 508 1: \n"; 509 &emit_body(20); 510 &poly_add("0($itr2)"); $code.=" 511 lea 2*8($itr2), $itr2\n"; 512 &emit_body(20); 513 &poly_stage1(); 514 &emit_body(20); 515 &poly_stage2(); 516 &emit_body(20); 517 &poly_stage3(); 518 &emit_body(20); 519 &poly_reduce_stage(); 520 foreach $l (@loop_body) {$code.=$l."\n";} 521 @loop_body = split /\n/, $chacha_body; $code.=" 522 dec $itr1 523 jge 1b\n"; 524 &poly_add("0($itr2)"); 525 &poly_mul(); $code.=" 526 lea 2*8($itr2), $itr2 527 cmp \$-6, $itr1 528 jg 1b\n"; 529 &finalize_state(4); 530 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 531 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 532 &xor_stream($A1, $B1, $C1, $D1, "8*16"); 533 &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" 534 lea 16*16($inp), $inp 535 lea 16*16($oup), $oup 536 sub \$16*16, $inl 537 jmp open_sse_main_loop 538 2: 539 # Handle the various tail sizes efficiently 540 test $inl, $inl 541 jz open_sse_finalize 542 cmp \$4*16, $inl 543 ja 3f\n"; 544 ############################################################################### 545 # At most 64 bytes are left 546 &prep_state(1); $code.=" 547 xor $itr2, $itr2 548 mov $inl, $itr1 549 cmp \$16, $itr1 550 jb 2f 551 1: \n"; 552 &poly_add("0($inp, $itr2)"); 553 &poly_mul(); $code.=" 554 sub \$16, $itr1 555 2: 556 add \$16, $itr2\n"; 557 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 558 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 559 cmp \$16, $itr1 560 jae 1b 561 cmp \$10*16, $itr2 562 jne 2b\n"; 563 &finalize_state(1); $code.=" 564 jmp open_sse_tail_64_dec_loop 565 3: 566 cmp \$8*16, $inl 567 ja 3f\n"; 568 ############################################################################### 569 # 65 - 128 bytes are left 570 &prep_state(2); $code.=" 571 mov $inl, $itr1 572 and \$-16, $itr1 573 xor $itr2, $itr2 574 1: \n"; 575 
&poly_add("0($inp, $itr2)"); 576 &poly_mul(); $code.=" 577 2: 578 add \$16, $itr2\n"; 579 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 580 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 581 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 582 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" 583 cmp $itr1, $itr2 584 jb 1b 585 cmp \$10*16, $itr2 586 jne 2b\n"; 587 &finalize_state(2); 588 &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" 589 sub \$4*16, $inl 590 lea 4*16($inp), $inp 591 lea 4*16($oup), $oup 592 jmp open_sse_tail_64_dec_loop 593 3: 594 cmp \$12*16, $inl 595 ja 3f\n"; 596 ############################################################################### 597 # 129 - 192 bytes are left 598 &prep_state(3); $code.=" 599 mov $inl, $itr1 600 mov \$10*16, $itr2 601 cmp \$10*16, $itr1 602 cmovg $itr2, $itr1 603 and \$-16, $itr1 604 xor $itr2, $itr2 605 1: \n"; 606 &poly_add("0($inp, $itr2)"); 607 &poly_mul(); $code.=" 608 2: 609 add \$16, $itr2\n"; 610 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 611 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 612 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 613 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 614 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 615 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 616 cmp $itr1, $itr2 617 jb 1b 618 cmp \$10*16, $itr2 619 jne 2b 620 cmp \$11*16, $inl 621 jb 1f\n"; 622 &poly_add("10*16($inp)"); 623 &poly_mul(); $code.=" 624 cmp \$12*16, $inl 625 jb 1f\n"; 626 &poly_add("11*16($inp)"); 627 &poly_mul(); $code.=" 628 1: \n"; 629 &finalize_state(3); 630 &xor_stream($A2, $B2, $C2, $D2, "0*16"); 631 &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" 632 sub \$8*16, $inl 633 lea 8*16($inp), $inp 634 lea 8*16($oup), $oup 635 jmp open_sse_tail_64_dec_loop 636 3: 637 ###############################################################################\n"; 638 # 193 - 255 bytes are left 639 &prep_state(4); $code.=" 640 xor $itr2, $itr2 641 1: \n"; 642 &poly_add("0($inp, $itr2)"); 643 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); 644 &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); 645 &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); 646 &poly_stage1(); 647 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); 648 &poly_stage2(); 649 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); 650 &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); 651 &poly_stage3(); 652 &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); 653 &poly_reduce_stage(); 654 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" 655 add \$16, $itr2 656 cmp \$10*16, $itr2 657 jb 1b 658 mov $inl, $itr1 659 and \$-16, $itr1 660 1: \n"; 661 &poly_add("0($inp, $itr2)"); 662 &poly_mul(); $code.=" 663 add \$16, $itr2 664 cmp $itr1, $itr2 665 jb 1b\n"; 666 &finalize_state(4); 667 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 668 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 669 &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" 670 movdqa $tmp_store, $D0 671 sub \$12*16, $inl 672 lea 12*16($inp), $inp 673 lea 12*16($oup), $oup 674 ############################################################################### 675 # Decrypt the remaining data, 16B at a time, using existing stream 676 open_sse_tail_64_dec_loop: 677 cmp \$16, $inl 678 jb 1f 679 sub \$16, $inl 680 movdqu ($inp), $T0 681 pxor $T0, $A0 682 movdqu $A0, ($oup) 683 lea 16($inp), $inp 684 lea 16($oup), $oup 685 movdqa $B0, $A0 686 movdqa $C0, $B0 687 movdqa $D0, $C0 688 jmp open_sse_tail_64_dec_loop 689 1: 690 movdqa $A0, $A1 691 692 # Decrypt up to 16 bytes at the end. 693 open_sse_tail_16: 694 test $inl, $inl 695 jz open_sse_finalize 696 697 # Read the final bytes into $T0. 
They need to be read in reverse order so 698 # that they end up in the correct order in $T0. 699 pxor $T0, $T0 700 lea -1($inp, $inl), $inp 701 movq $inl, $itr2 702 2: 703 pslldq \$1, $T0 704 pinsrb \$0, ($inp), $T0 705 sub \$1, $inp 706 sub \$1, $itr2 707 jnz 2b 708 709 3: 710 movq $T0, $t0 711 pextrq \$1, $T0, $t1 712 # The final bytes of keystream are in $A1. 713 pxor $A1, $T0 714 715 # Copy the plaintext bytes out. 716 2: 717 pextrb \$0, $T0, ($oup) 718 psrldq \$1, $T0 719 add \$1, $oup 720 sub \$1, $inl 721 jne 2b 722 723 add $t0, $acc0 724 adc $t1, $acc1 725 adc \$1, $acc2\n"; 726 &poly_mul(); $code.=" 727 728 open_sse_finalize:\n"; 729 &poly_add($len_store); 730 &poly_mul(); $code.=" 731 # Final reduce 732 mov $acc0, $t0 733 mov $acc1, $t1 734 mov $acc2, $t2 735 sub \$-5, $acc0 736 sbb \$-1, $acc1 737 sbb \$3, $acc2 738 cmovc $t0, $acc0 739 cmovc $t1, $acc1 740 cmovc $t2, $acc2 741 # Add in s part of the key 742 add 0+$s_store, $acc0 743 adc 8+$s_store, $acc1 744 745 add \$288 + 32, %rsp 746 .cfi_adjust_cfa_offset -(288 + 32) 747 pop $keyp 748 .cfi_adjust_cfa_offset -8 749 movq $acc0, ($keyp) 750 movq $acc1, 8($keyp) 751 752 pop %r15 753 .cfi_adjust_cfa_offset -8 754 pop %r14 755 .cfi_adjust_cfa_offset -8 756 pop %r13 757 .cfi_adjust_cfa_offset -8 758 pop %r12 759 .cfi_adjust_cfa_offset -8 760 pop %rbx 761 .cfi_adjust_cfa_offset -8 762 pop %rbp 763 .cfi_adjust_cfa_offset -8 764 ret 765 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32 766 ############################################################################### 767 open_sse_128: 768 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 769 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 770 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 771 movdqu 2*16($keyp), $D0 772 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 773 movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2 774 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 775 mov \$10, $acc0 776 1: \n"; 777 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 778 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 779 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 780 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 781 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 782 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 783 dec $acc0 784 jnz 1b 785 paddd .chacha20_consts(%rip), $A0 786 paddd .chacha20_consts(%rip), $A1 787 paddd .chacha20_consts(%rip), $A2 788 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 789 paddd $T2, $C1\npaddd $T2, $C2 790 paddd $T3, $D1 791 paddd .sse_inc(%rip), $T3 792 paddd $T3, $D2 793 # Clamp and store the key 794 pand .clamp(%rip), $A0 795 movdqa $A0, $r_store 796 movdqa $B0, $s_store 797 # Hash 798 mov %r8, $itr2 799 call poly_hash_ad_internal 800 1: 801 cmp \$16, $inl 802 jb open_sse_tail_16 803 sub \$16, $inl\n"; 804 # Load for hashing 805 &poly_add("0*8($inp)"); $code.=" 806 # Load for decryption 807 movdqu 0*16($inp), $T0 808 pxor $T0, $A1 809 movdqu $A1, 0*16($oup) 810 lea 1*16($inp), $inp 811 lea 1*16($oup), $oup\n"; 812 &poly_mul(); $code.=" 813 # Shift the stream left 814 movdqa $B1, $A1 815 movdqa $C1, $B1 816 movdqa $D1, $C1 817 movdqa $A2, $D1 818 movdqa $B2, $A2 819 movdqa $C2, $B2 820 movdqa $D2, $C2 821 jmp 1b 822 jmp open_sse_tail_16 823 .size chacha20_poly1305_open, .-chacha20_poly1305_open 824 .cfi_endproc 825 826 ################################################################################ 827 ################################################################################ 828 # void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, 
size_t len_ad, uint8_t *keyp); 829 .globl chacha20_poly1305_seal 830 .type chacha20_poly1305_seal,\@function,2 831 .align 64 832 chacha20_poly1305_seal: 833 .cfi_startproc 834 push %rbp 835 .cfi_adjust_cfa_offset 8 836 push %rbx 837 .cfi_adjust_cfa_offset 8 838 push %r12 839 .cfi_adjust_cfa_offset 8 840 push %r13 841 .cfi_adjust_cfa_offset 8 842 push %r14 843 .cfi_adjust_cfa_offset 8 844 push %r15 845 .cfi_adjust_cfa_offset 8 846 # We write the calculated authenticator back to keyp at the end, so save 847 # the pointer on the stack too. 848 push $keyp 849 .cfi_adjust_cfa_offset 8 850 sub \$288 + 32, %rsp 851 .cfi_adjust_cfa_offset 288 + 32 852 .cfi_offset rbp, -16 853 .cfi_offset rbx, -24 854 .cfi_offset r12, -32 855 .cfi_offset r13, -40 856 .cfi_offset r14, -48 857 .cfi_offset r15, -56 858 lea 32(%rsp), %rbp 859 and \$-32, %rbp 860 mov 56($keyp), $inl # extra_in_len 861 addq %rdx, $inl 862 mov $inl, 8+$len_store 863 mov %r8, 0+$len_store 864 mov %rdx, $inl\n"; $code.=" 865 mov OPENSSL_ia32cap_P+8(%rip), %eax 866 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 867 xor \$`(1<<5) + (1<<8)`, %eax 868 jz chacha20_poly1305_seal_avx2\n" if ($avx>1); 869 $code.=" 870 cmp \$128, $inl 871 jbe seal_sse_128 872 # For longer buffers, prepare the poly key + some stream 873 movdqa .chacha20_consts(%rip), $A0 874 movdqu 0*16($keyp), $B0 875 movdqu 1*16($keyp), $C0 876 movdqu 2*16($keyp), $D0 877 movdqa $A0, $A1 878 movdqa $A0, $A2 879 movdqa $A0, $A3 880 movdqa $B0, $B1 881 movdqa $B0, $B2 882 movdqa $B0, $B3 883 movdqa $C0, $C1 884 movdqa $C0, $C2 885 movdqa $C0, $C3 886 movdqa $D0, $D3 887 paddd .sse_inc(%rip), $D0 888 movdqa $D0, $D2 889 paddd .sse_inc(%rip), $D0 890 movdqa $D0, $D1 891 paddd .sse_inc(%rip), $D0 892 # Store on stack 893 movdqa $B0, $state1_store 894 movdqa $C0, $state2_store 895 movdqa $D0, $ctr0_store 896 movdqa $D1, $ctr1_store 897 movdqa $D2, $ctr2_store 898 movdqa $D3, $ctr3_store 899 mov \$10, $acc0 900 1: \n"; 901 foreach $l (@loop_body) {$code.=$l."\n";} 902 @loop_body = split /\n/, $chacha_body; $code.=" 903 dec $acc0 904 jnz 1b\n"; 905 &finalize_state(4); $code.=" 906 # Clamp and store the key 907 pand .clamp(%rip), $A3 908 movdqa $A3, $r_store 909 movdqa $B3, $s_store 910 # Hash 911 mov %r8, $itr2 912 call poly_hash_ad_internal\n"; 913 &xor_stream($A2,$B2,$C2,$D2,"0*16"); 914 &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" 915 cmp \$12*16, $inl 916 ja 1f 917 mov \$8*16, $itr1 918 sub \$8*16, $inl 919 lea 8*16($inp), $inp 920 jmp seal_sse_128_seal_hash 921 1: \n"; 922 &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" 923 mov \$12*16, $itr1 924 sub \$12*16, $inl 925 lea 12*16($inp), $inp 926 mov \$2, $itr1 927 mov \$8, $itr2 928 cmp \$4*16, $inl 929 jbe seal_sse_tail_64 930 cmp \$8*16, $inl 931 jbe seal_sse_tail_128 932 cmp \$12*16, $inl 933 jbe seal_sse_tail_192 934 935 1: \n"; 936 # The main loop 937 &prep_state(4); $code.=" 938 2: \n"; 939 &emit_body(20); 940 &poly_add("0($oup)"); 941 &emit_body(20); 942 &poly_stage1(); 943 &emit_body(20); 944 &poly_stage2(); 945 &emit_body(20); 946 &poly_stage3(); 947 &emit_body(20); 948 &poly_reduce_stage(); 949 foreach $l (@loop_body) {$code.=$l."\n";} 950 @loop_body = split /\n/, $chacha_body; $code.=" 951 lea 16($oup), $oup 952 dec $itr2 953 jge 2b\n"; 954 &poly_add("0*8($oup)"); 955 &poly_mul(); $code.=" 956 lea 16($oup), $oup 957 dec $itr1 958 jg 2b\n"; 959 960 &finalize_state(4);$code.=" 961 movdqa $D2, $tmp_store\n"; 962 &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" 963 movdqa $tmp_store, $D2\n"; 964 
&xor_stream($A2,$B2,$C2,$D2, 4*16); 965 &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" 966 cmp \$16*16, $inl 967 ja 3f 968 969 mov \$12*16, $itr1 970 sub \$12*16, $inl 971 lea 12*16($inp), $inp 972 jmp seal_sse_128_seal_hash 973 3: \n"; 974 &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" 975 lea 16*16($inp), $inp 976 sub \$16*16, $inl 977 mov \$6, $itr1 978 mov \$4, $itr2 979 cmp \$12*16, $inl 980 jg 1b 981 mov $inl, $itr1 982 test $inl, $inl 983 je seal_sse_128_seal_hash 984 mov \$6, $itr1 985 cmp \$4*16, $inl 986 jg 3f 987 ############################################################################### 988 seal_sse_tail_64:\n"; 989 &prep_state(1); $code.=" 990 1: \n"; 991 &poly_add("0($oup)"); 992 &poly_mul(); $code.=" 993 lea 16($oup), $oup 994 2: \n"; 995 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 996 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 997 &poly_add("0($oup)"); 998 &poly_mul(); $code.=" 999 lea 16($oup), $oup 1000 dec $itr1 1001 jg 1b 1002 dec $itr2 1003 jge 2b\n"; 1004 &finalize_state(1); $code.=" 1005 jmp seal_sse_128_seal 1006 3: 1007 cmp \$8*16, $inl 1008 jg 3f 1009 ############################################################################### 1010 seal_sse_tail_128:\n"; 1011 &prep_state(2); $code.=" 1012 1: \n"; 1013 &poly_add("0($oup)"); 1014 &poly_mul(); $code.=" 1015 lea 16($oup), $oup 1016 2: \n"; 1017 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1018 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1019 &poly_add("0($oup)"); 1020 &poly_mul(); 1021 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1022 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1023 lea 16($oup), $oup 1024 dec $itr1 1025 jg 1b 1026 dec $itr2 1027 jge 2b\n"; 1028 &finalize_state(2); 1029 &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" 1030 mov \$4*16, $itr1 1031 sub \$4*16, $inl 1032 lea 4*16($inp), $inp 1033 jmp seal_sse_128_seal_hash 1034 3: 1035 ############################################################################### 1036 seal_sse_tail_192:\n"; 1037 &prep_state(3); $code.=" 1038 1: \n"; 1039 &poly_add("0($oup)"); 1040 &poly_mul(); $code.=" 1041 lea 16($oup), $oup 1042 2: \n"; 1043 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1044 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1045 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1046 &poly_add("0($oup)"); 1047 &poly_mul(); 1048 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1049 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1050 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1051 lea 16($oup), $oup 1052 dec $itr1 1053 jg 1b 1054 dec $itr2 1055 jge 2b\n"; 1056 &finalize_state(3); 1057 &xor_stream($A2,$B2,$C2,$D2,0*16); 1058 &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" 1059 mov \$8*16, $itr1 1060 sub \$8*16, $inl 1061 lea 8*16($inp), $inp 1062 ############################################################################### 1063 seal_sse_128_seal_hash: 1064 cmp \$16, $itr1 1065 jb seal_sse_128_seal\n"; 1066 &poly_add("0($oup)"); 1067 &poly_mul(); $code.=" 1068 sub \$16, $itr1 1069 lea 16($oup), $oup 1070 jmp seal_sse_128_seal_hash 1071 1072 seal_sse_128_seal: 1073 cmp \$16, $inl 1074 jb seal_sse_tail_16 1075 sub \$16, $inl 1076 # Load for decryption 1077 movdqu 0*16($inp), $T0 1078 pxor $T0, $A0 1079 movdqu $A0, 0*16($oup) 1080 # Then hash 1081 add 0*8($oup), $acc0 1082 adc 1*8($oup), $acc1 1083 adc \$1, $acc2 1084 lea 1*16($inp), $inp 1085 lea 1*16($oup), $oup\n"; 1086 &poly_mul(); $code.=" 1087 # Shift the stream left 1088 movdqa $B0, $A0 1089 movdqa $C0, $B0 1090 movdqa $D0, $C0 1091 movdqa $A1, $D0 1092 movdqa $B1, $A1 1093 movdqa $C1, $B1 1094 movdqa $D1, $C1 1095 jmp seal_sse_128_seal 1096 1097 
seal_sse_tail_16: 1098 test $inl, $inl 1099 jz process_blocks_of_extra_in 1100 # We can only load the PT one byte at a time to avoid buffer overread 1101 mov $inl, $itr2 1102 mov $inl, $itr1 1103 lea -1($inp, $inl), $inp 1104 pxor $T3, $T3 1105 1: 1106 pslldq \$1, $T3 1107 pinsrb \$0, ($inp), $T3 1108 lea -1($inp), $inp 1109 dec $itr1 1110 jne 1b 1111 1112 # XOR the keystream with the plaintext. 1113 pxor $A0, $T3 1114 1115 # Write ciphertext out, byte-by-byte. 1116 movq $inl, $itr1 1117 movdqu $T3, $A0 1118 2: 1119 pextrb \$0, $A0, ($oup) 1120 psrldq \$1, $A0 1121 add \$1, $oup 1122 sub \$1, $itr1 1123 jnz 2b 1124 1125 # $T3 contains the final (partial, non-empty) block of ciphertext which 1126 # needs to be fed into the Poly1305 state. The right-most $inl bytes of it 1127 # are valid. We need to fill it with extra_in bytes until full, or until we 1128 # run out of bytes. 1129 # 1130 # $keyp points to the tag output, which is actually a struct with the 1131 # extra_in pointer and length at offset 48. 1132 movq 288+32(%rsp), $keyp 1133 movq 56($keyp), $t1 # extra_in_len 1134 movq 48($keyp), $t0 # extra_in 1135 test $t1, $t1 1136 jz process_partial_block # Common case: no bytes of extra_in 1137 1138 movq \$16, $t2 1139 subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. 1140 cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len 1141 # (note that AT&T syntax reverses the arguments) 1142 jge load_extra_in 1143 movq $t1, $t2 1144 1145 load_extra_in: 1146 # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load 1147 # into $T3. They are loaded in reverse order. 1148 leaq -1($t0, $t2), $inp 1149 # Update extra_in and extra_in_len to reflect the bytes that are about to 1150 # be read. 1151 addq $t2, $t0 1152 subq $t2, $t1 1153 movq $t0, 48($keyp) 1154 movq $t1, 56($keyp) 1155 1156 # Update $itr2, which is used to select the mask later on, to reflect the 1157 # extra bytes about to be added. 1158 addq $t2, $itr2 1159 1160 # Load $t2 bytes of extra_in into $T2. 1161 pxor $T2, $T2 1162 3: 1163 pslldq \$1, $T2 1164 pinsrb \$0, ($inp), $T2 1165 lea -1($inp), $inp 1166 sub \$1, $t2 1167 jnz 3b 1168 1169 # Shift $T2 up the length of the remainder from the main encryption. Sadly, 1170 # the shift for an XMM register has to be a constant, thus we loop to do 1171 # this. 1172 movq $inl, $t2 1173 1174 4: 1175 pslldq \$1, $T2 1176 sub \$1, $t2 1177 jnz 4b 1178 1179 # Mask $T3 (the remainder from the main encryption) so that superfluous 1180 # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are 1181 # disjoint and so we can merge them with an OR. 1182 lea .and_masks(%rip), $t2 1183 shl \$4, $inl 1184 pand -16($t2, $inl), $T3 1185 1186 # Merge $T2 into $T3, forming the remainder block. 1187 por $T2, $T3 1188 1189 # The block of ciphertext + extra_in is ready to be included in the 1190 # Poly1305 state. 1191 movq $T3, $t0 1192 pextrq \$1, $T3, $t1 1193 add $t0, $acc0 1194 adc $t1, $acc1 1195 adc \$1, $acc2\n"; 1196 &poly_mul(); $code.=" 1197 1198 process_blocks_of_extra_in: 1199 # There may be additional bytes of extra_in to process. 
1200 movq 288+32(%rsp), $keyp 1201 movq 48($keyp), $inp # extra_in 1202 movq 56($keyp), $itr2 # extra_in_len 1203 movq $itr2, $itr1 1204 shr \$4, $itr2 # number of blocks 1205 1206 5: 1207 jz process_extra_in_trailer\n"; 1208 &poly_add("0($inp)"); 1209 &poly_mul(); $code.=" 1210 leaq 16($inp), $inp 1211 subq \$1, $itr2 1212 jmp 5b 1213 1214 process_extra_in_trailer: 1215 andq \$15, $itr1 # remaining num bytes (<16) of extra_in 1216 movq $itr1, $inl 1217 jz do_length_block 1218 leaq -1($inp, $itr1), $inp 1219 1220 6: 1221 pslldq \$1, $T3 1222 pinsrb \$0, ($inp), $T3 1223 lea -1($inp), $inp 1224 sub \$1, $itr1 1225 jnz 6b 1226 1227 process_partial_block: 1228 # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0 1229 lea .and_masks(%rip), $t2 1230 shl \$4, $inl 1231 pand -16($t2, $inl), $T3 1232 movq $T3, $t0 1233 pextrq \$1, $T3, $t1 1234 add $t0, $acc0 1235 adc $t1, $acc1 1236 adc \$1, $acc2\n"; 1237 &poly_mul(); $code.=" 1238 1239 do_length_block:\n"; 1240 &poly_add($len_store); 1241 &poly_mul(); $code.=" 1242 # Final reduce 1243 mov $acc0, $t0 1244 mov $acc1, $t1 1245 mov $acc2, $t2 1246 sub \$-5, $acc0 1247 sbb \$-1, $acc1 1248 sbb \$3, $acc2 1249 cmovc $t0, $acc0 1250 cmovc $t1, $acc1 1251 cmovc $t2, $acc2 1252 # Add in s part of the key 1253 add 0+$s_store, $acc0 1254 adc 8+$s_store, $acc1 1255 1256 add \$288 + 32, %rsp 1257 .cfi_adjust_cfa_offset -(288 + 32) 1258 pop $keyp 1259 .cfi_adjust_cfa_offset -8 1260 mov $acc0, 0*8($keyp) 1261 mov $acc1, 1*8($keyp) 1262 1263 pop %r15 1264 .cfi_adjust_cfa_offset -8 1265 pop %r14 1266 .cfi_adjust_cfa_offset -8 1267 pop %r13 1268 .cfi_adjust_cfa_offset -8 1269 pop %r12 1270 .cfi_adjust_cfa_offset -8 1271 pop %rbx 1272 .cfi_adjust_cfa_offset -8 1273 pop %rbp 1274 .cfi_adjust_cfa_offset -8 1275 ret 1276 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32 1277 ################################################################################ 1278 seal_sse_128: 1279 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 1280 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 1281 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 1282 movdqu 2*16($keyp), $D2 1283 movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0 1284 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 1285 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 1286 mov \$10, $acc0 1287 1:\n"; 1288 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1289 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1290 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1291 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1292 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1293 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1294 dec $acc0 1295 jnz 1b 1296 paddd .chacha20_consts(%rip), $A0 1297 paddd .chacha20_consts(%rip), $A1 1298 paddd .chacha20_consts(%rip), $A2 1299 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 1300 paddd $T2, $C0\npaddd $T2, $C1 1301 paddd $T3, $D0 1302 paddd .sse_inc(%rip), $T3 1303 paddd $T3, $D1 1304 # Clamp and store the key 1305 pand .clamp(%rip), $A2 1306 movdqa $A2, $r_store 1307 movdqa $B2, $s_store 1308 # Hash 1309 mov %r8, $itr2 1310 call poly_hash_ad_internal 1311 jmp seal_sse_128_seal 1312 .size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n"; 1313 } 1314 1315 # There should have been a cfi_endproc at the end of that function, but the two 1316 # following blocks of code are jumped to without a stack frame and the CFI 1317 # context which they are used in happens to match the CFI context at the end of 1318 # the previous function. So the CFI table is just extended to the end of them. 
1319 1320 if ($avx>1) { 1321 1322 ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); 1323 my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); 1324 ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 1325 $state1_store="2*32(%rbp)"; 1326 $state2_store="3*32(%rbp)"; 1327 $tmp_store="4*32(%rbp)"; 1328 $ctr0_store="5*32(%rbp)"; 1329 $ctr1_store="6*32(%rbp)"; 1330 $ctr2_store="7*32(%rbp)"; 1331 $ctr3_store="8*32(%rbp)"; 1332 1333 sub chacha_qr_avx2 { 1334 my ($a,$b,$c,$d,$t,$dir)=@_; 1335 $code.=<<___ if ($dir =~ /store/); 1336 vmovdqa $t, $tmp_store 1337 ___ 1338 $code.=<<___; 1339 vpaddd $b, $a, $a 1340 vpxor $a, $d, $d 1341 vpshufb .rol16(%rip), $d, $d 1342 vpaddd $d, $c, $c 1343 vpxor $c, $b, $b 1344 vpsrld \$20, $b, $t 1345 vpslld \$12, $b, $b 1346 vpxor $t, $b, $b 1347 vpaddd $b, $a, $a 1348 vpxor $a, $d, $d 1349 vpshufb .rol8(%rip), $d, $d 1350 vpaddd $d, $c, $c 1351 vpxor $c, $b, $b 1352 vpslld \$7, $b, $t 1353 vpsrld \$25, $b, $b 1354 vpxor $t, $b, $b 1355 ___ 1356 $code.=<<___ if ($dir =~ /left/); 1357 vpalignr \$12, $d, $d, $d 1358 vpalignr \$8, $c, $c, $c 1359 vpalignr \$4, $b, $b, $b 1360 ___ 1361 $code.=<<___ if ($dir =~ /right/); 1362 vpalignr \$4, $d, $d, $d 1363 vpalignr \$8, $c, $c, $c 1364 vpalignr \$12, $b, $b, $b 1365 ___ 1366 $code.=<<___ if ($dir =~ /load/); 1367 vmovdqa $tmp_store, $t 1368 ___ 1369 } 1370 1371 sub prep_state_avx2 { 1372 my ($n)=@_; 1373 $code.=<<___; 1374 vmovdqa .chacha20_consts(%rip), $A0 1375 vmovdqa $state1_store, $B0 1376 vmovdqa $state2_store, $C0 1377 ___ 1378 $code.=<<___ if ($n ge 2); 1379 vmovdqa $A0, $A1 1380 vmovdqa $B0, $B1 1381 vmovdqa $C0, $C1 1382 ___ 1383 $code.=<<___ if ($n ge 3); 1384 vmovdqa $A0, $A2 1385 vmovdqa $B0, $B2 1386 vmovdqa $C0, $C2 1387 ___ 1388 $code.=<<___ if ($n ge 4); 1389 vmovdqa $A0, $A3 1390 vmovdqa $B0, $B3 1391 vmovdqa $C0, $C3 1392 ___ 1393 $code.=<<___ if ($n eq 1); 1394 vmovdqa .avx2_inc(%rip), $D0 1395 vpaddd $ctr0_store, $D0, $D0 1396 vmovdqa $D0, $ctr0_store 1397 ___ 1398 $code.=<<___ if ($n eq 2); 1399 vmovdqa .avx2_inc(%rip), $D0 1400 vpaddd $ctr0_store, $D0, $D1 1401 vpaddd $D1, $D0, $D0 1402 vmovdqa $D0, $ctr0_store 1403 vmovdqa $D1, $ctr1_store 1404 ___ 1405 $code.=<<___ if ($n eq 3); 1406 vmovdqa .avx2_inc(%rip), $D0 1407 vpaddd $ctr0_store, $D0, $D2 1408 vpaddd $D2, $D0, $D1 1409 vpaddd $D1, $D0, $D0 1410 vmovdqa $D0, $ctr0_store 1411 vmovdqa $D1, $ctr1_store 1412 vmovdqa $D2, $ctr2_store 1413 ___ 1414 $code.=<<___ if ($n eq 4); 1415 vmovdqa .avx2_inc(%rip), $D0 1416 vpaddd $ctr0_store, $D0, $D3 1417 vpaddd $D3, $D0, $D2 1418 vpaddd $D2, $D0, $D1 1419 vpaddd $D1, $D0, $D0 1420 vmovdqa $D3, $ctr3_store 1421 vmovdqa $D2, $ctr2_store 1422 vmovdqa $D1, $ctr1_store 1423 vmovdqa $D0, $ctr0_store 1424 ___ 1425 } 1426 1427 sub finalize_state_avx2 { 1428 my ($n)=@_; 1429 $code.=<<___ if ($n eq 4); 1430 vpaddd .chacha20_consts(%rip), $A3, $A3 1431 vpaddd $state1_store, $B3, $B3 1432 vpaddd $state2_store, $C3, $C3 1433 vpaddd $ctr3_store, $D3, $D3 1434 ___ 1435 $code.=<<___ if ($n ge 3); 1436 vpaddd .chacha20_consts(%rip), $A2, $A2 1437 vpaddd $state1_store, $B2, $B2 1438 vpaddd $state2_store, $C2, $C2 1439 vpaddd $ctr2_store, $D2, $D2 1440 ___ 1441 $code.=<<___ if ($n ge 2); 1442 vpaddd .chacha20_consts(%rip), $A1, $A1 1443 vpaddd $state1_store, $B1, $B1 1444 vpaddd $state2_store, $C1, $C1 1445 vpaddd $ctr1_store, $D1, $D1 1446 ___ 1447 $code.=<<___; 1448 vpaddd .chacha20_consts(%rip), $A0, $A0 1449 vpaddd $state1_store, $B0, $B0 
1450 vpaddd $state2_store, $C0, $C0 1451 vpaddd $ctr0_store, $D0, $D0 1452 ___ 1453 } 1454 1455 sub xor_stream_avx2 { 1456 my ($A, $B, $C, $D, $offset, $hlp)=@_; 1457 $code.=<<___; 1458 vperm2i128 \$0x02, $A, $B, $hlp 1459 vperm2i128 \$0x13, $A, $B, $B 1460 vperm2i128 \$0x02, $C, $D, $A 1461 vperm2i128 \$0x13, $C, $D, $C 1462 vpxor 0*32+$offset($inp), $hlp, $hlp 1463 vpxor 1*32+$offset($inp), $A, $A 1464 vpxor 2*32+$offset($inp), $B, $B 1465 vpxor 3*32+$offset($inp), $C, $C 1466 vmovdqu $hlp, 0*32+$offset($oup) 1467 vmovdqu $A, 1*32+$offset($oup) 1468 vmovdqu $B, 2*32+$offset($oup) 1469 vmovdqu $C, 3*32+$offset($oup) 1470 ___ 1471 } 1472 1473 sub finish_stream_avx2 { 1474 my ($A, $B, $C, $D, $hlp)=@_; 1475 $code.=<<___; 1476 vperm2i128 \$0x13, $A, $B, $hlp 1477 vperm2i128 \$0x02, $A, $B, $A 1478 vperm2i128 \$0x02, $C, $D, $B 1479 vperm2i128 \$0x13, $C, $D, $D 1480 vmovdqa $hlp, $C 1481 ___ 1482 } 1483 1484 sub poly_stage1_mulx { 1485 $code.=<<___; 1486 mov 0+$r_store, %rdx 1487 mov %rdx, $t2 1488 mulx $acc0, $t0, $t1 1489 mulx $acc1, %rax, %rdx 1490 imulq $acc2, $t2 1491 add %rax, $t1 1492 adc %rdx, $t2 1493 ___ 1494 } 1495 1496 sub poly_stage2_mulx { 1497 $code.=<<___; 1498 mov 8+$r_store, %rdx 1499 mulx $acc0, $acc0, %rax 1500 add $acc0, $t1 1501 mulx $acc1, $acc1, $t3 1502 adc $acc1, $t2 1503 adc \$0, $t3 1504 imulq $acc2, %rdx 1505 ___ 1506 } 1507 1508 sub poly_stage3_mulx { 1509 $code.=<<___; 1510 add %rax, $t2 1511 adc %rdx, $t3 1512 ___ 1513 } 1514 1515 sub poly_mul_mulx { 1516 &poly_stage1_mulx(); 1517 &poly_stage2_mulx(); 1518 &poly_stage3_mulx(); 1519 &poly_reduce_stage(); 1520 } 1521 1522 sub gen_chacha_round_avx2 { 1523 my ($rot1, $rot2, $shift)=@_; 1524 my $round=""; 1525 $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); 1526 $round=$round ."vmovdqa $rot2, $C0 1527 vpaddd $B3, $A3, $A3 1528 vpaddd $B2, $A2, $A2 1529 vpaddd $B1, $A1, $A1 1530 vpaddd $B0, $A0, $A0 1531 vpxor $A3, $D3, $D3 1532 vpxor $A2, $D2, $D2 1533 vpxor $A1, $D1, $D1 1534 vpxor $A0, $D0, $D0 1535 vpshufb $C0, $D3, $D3 1536 vpshufb $C0, $D2, $D2 1537 vpshufb $C0, $D1, $D1 1538 vpshufb $C0, $D0, $D0 1539 vmovdqa $tmp_store, $C0 1540 vpaddd $D3, $C3, $C3 1541 vpaddd $D2, $C2, $C2 1542 vpaddd $D1, $C1, $C1 1543 vpaddd $D0, $C0, $C0 1544 vpxor $C3, $B3, $B3 1545 vpxor $C2, $B2, $B2 1546 vpxor $C1, $B1, $B1 1547 vpxor $C0, $B0, $B0 1548 vmovdqa $C0, $tmp_store 1549 vpsrld \$$rot1, $B3, $C0 1550 vpslld \$32-$rot1, $B3, $B3 1551 vpxor $C0, $B3, $B3 1552 vpsrld \$$rot1, $B2, $C0 1553 vpslld \$32-$rot1, $B2, $B2 1554 vpxor $C0, $B2, $B2 1555 vpsrld \$$rot1, $B1, $C0 1556 vpslld \$32-$rot1, $B1, $B1 1557 vpxor $C0, $B1, $B1 1558 vpsrld \$$rot1, $B0, $C0 1559 vpslld \$32-$rot1, $B0, $B0 1560 vpxor $C0, $B0, $B0\n"; 1561 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 1562 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 1563 $round=$round ."vmovdqa $tmp_store, $C0 1564 vpalignr \$$s1, $B3, $B3, $B3 1565 vpalignr \$$s2, $C3, $C3, $C3 1566 vpalignr \$$s3, $D3, $D3, $D3 1567 vpalignr \$$s1, $B2, $B2, $B2 1568 vpalignr \$$s2, $C2, $C2, $C2 1569 vpalignr \$$s3, $D2, $D2, $D2 1570 vpalignr \$$s1, $B1, $B1, $B1 1571 vpalignr \$$s2, $C1, $C1, $C1 1572 vpalignr \$$s3, $D1, $D1, $D1 1573 vpalignr \$$s1, $B0, $B0, $B0 1574 vpalignr \$$s2, $C0, $C0, $C0 1575 vpalignr \$$s3, $D0, $D0, $D0\n" 1576 if (($shift =~ /left/) || ($shift =~ /right/)); 1577 return $round; 1578 }; 1579 1580 $chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1581 &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") . 
1582 &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1583 &gen_chacha_round_avx2(25, ".rol8(%rip)", "right"); 1584 1585 @loop_body = split /\n/, $chacha_body; 1586 1587 $code.=" 1588 ############################################################################### 1589 .type chacha20_poly1305_open_avx2,\@function,2 1590 .align 64 1591 chacha20_poly1305_open_avx2: 1592 vzeroupper 1593 vmovdqa .chacha20_consts(%rip), $A0 1594 vbroadcasti128 0*16($keyp), $B0 1595 vbroadcasti128 1*16($keyp), $C0 1596 vbroadcasti128 2*16($keyp), $D0 1597 vpaddd .avx2_init(%rip), $D0, $D0 1598 cmp \$6*32, $inl 1599 jbe open_avx2_192 1600 cmp \$10*32, $inl 1601 jbe open_avx2_320 1602 1603 vmovdqa $B0, $state1_store 1604 vmovdqa $C0, $state2_store 1605 vmovdqa $D0, $ctr0_store 1606 mov \$10, $acc0 1607 1: \n"; 1608 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1609 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1610 dec $acc0 1611 jne 1b 1612 vpaddd .chacha20_consts(%rip), $A0, $A0 1613 vpaddd $state1_store, $B0, $B0 1614 vpaddd $state2_store, $C0, $C0 1615 vpaddd $ctr0_store, $D0, $D0 1616 1617 vperm2i128 \$0x02, $A0, $B0, $T0 1618 # Clamp and store key 1619 vpand .clamp(%rip), $T0, $T0 1620 vmovdqa $T0, $r_store 1621 # Stream for the first 64 bytes 1622 vperm2i128 \$0x13, $A0, $B0, $A0 1623 vperm2i128 \$0x13, $C0, $D0, $B0 1624 # Hash AD + first 64 bytes 1625 mov %r8, $itr2 1626 call poly_hash_ad_internal 1627 xor $itr1, $itr1 1628 # Hash first 64 bytes 1629 1: \n"; 1630 &poly_add("0($inp, $itr1)"); 1631 &poly_mul(); $code.=" 1632 add \$16, $itr1 1633 cmp \$2*32, $itr1 1634 jne 1b 1635 # Decrypt first 64 bytes 1636 vpxor 0*32($inp), $A0, $A0 1637 vpxor 1*32($inp), $B0, $B0 1638 vmovdqu $A0, 0*32($oup) 1639 vmovdqu $B0, 1*32($oup) 1640 lea 2*32($inp), $inp 1641 lea 2*32($oup), $oup 1642 sub \$2*32, $inl 1643 1: 1644 # Hash and decrypt 512 bytes each iteration 1645 cmp \$16*32, $inl 1646 jb 3f\n"; 1647 &prep_state_avx2(4); $code.=" 1648 xor $itr1, $itr1 1649 2: \n"; 1650 &poly_add("0*8($inp, $itr1)"); 1651 &emit_body(10); 1652 &poly_stage1_mulx(); 1653 &emit_body(9); 1654 &poly_stage2_mulx(); 1655 &emit_body(12); 1656 &poly_stage3_mulx(); 1657 &emit_body(10); 1658 &poly_reduce_stage(); 1659 &emit_body(9); 1660 &poly_add("2*8($inp, $itr1)"); 1661 &emit_body(8); 1662 &poly_stage1_mulx(); 1663 &emit_body(18); 1664 &poly_stage2_mulx(); 1665 &emit_body(18); 1666 &poly_stage3_mulx(); 1667 &emit_body(9); 1668 &poly_reduce_stage(); 1669 &emit_body(8); 1670 &poly_add("4*8($inp, $itr1)"); $code.=" 1671 lea 6*8($itr1), $itr1\n"; 1672 &emit_body(18); 1673 &poly_stage1_mulx(); 1674 &emit_body(8); 1675 &poly_stage2_mulx(); 1676 &emit_body(8); 1677 &poly_stage3_mulx(); 1678 &emit_body(18); 1679 &poly_reduce_stage(); 1680 foreach $l (@loop_body) {$code.=$l."\n";} 1681 @loop_body = split /\n/, $chacha_body; $code.=" 1682 cmp \$10*6*8, $itr1 1683 jne 2b\n"; 1684 &finalize_state_avx2(4); $code.=" 1685 vmovdqa $A0, $tmp_store\n"; 1686 &poly_add("10*6*8($inp)"); 1687 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1688 vmovdqa $tmp_store, $A0\n"; 1689 &poly_mul(); 1690 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1691 &poly_add("10*6*8+2*8($inp)"); 1692 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1693 &poly_mul(); 1694 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 1695 lea 16*32($inp), $inp 1696 lea 16*32($oup), $oup 1697 sub \$16*32, $inl 1698 jmp 1b 1699 3: 1700 test $inl, $inl 1701 vzeroupper 1702 je open_sse_finalize 1703 3: 1704 cmp \$4*32, $inl 1705 ja 3f\n"; 1706 
############################################################################### 1707 # 1-128 bytes left 1708 &prep_state_avx2(1); $code.=" 1709 xor $itr2, $itr2 1710 mov $inl, $itr1 1711 and \$-16, $itr1 1712 test $itr1, $itr1 1713 je 2f 1714 1: \n"; 1715 &poly_add("0*8($inp, $itr2)"); 1716 &poly_mul(); $code.=" 1717 2: 1718 add \$16, $itr2\n"; 1719 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1720 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1721 cmp $itr1, $itr2 1722 jb 1b 1723 cmp \$160, $itr2 1724 jne 2b\n"; 1725 &finalize_state_avx2(1); 1726 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 1727 jmp open_avx2_tail_loop 1728 3: 1729 cmp \$8*32, $inl 1730 ja 3f\n"; 1731 ############################################################################### 1732 # 129-256 bytes left 1733 &prep_state_avx2(2); $code.=" 1734 mov $inl, $tmp_store 1735 mov $inl, $itr1 1736 sub \$4*32, $itr1 1737 shr \$4, $itr1 1738 mov \$10, $itr2 1739 cmp \$10, $itr1 1740 cmovg $itr2, $itr1 1741 mov $inp, $inl 1742 xor $itr2, $itr2 1743 1: \n"; 1744 &poly_add("0*8($inl)"); 1745 &poly_mul_mulx(); $code.=" 1746 lea 16($inl), $inl 1747 2: \n"; 1748 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1749 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" 1750 inc $itr2\n"; 1751 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1752 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1753 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1754 cmp $itr1, $itr2 1755 jb 1b 1756 cmp \$10, $itr2 1757 jne 2b 1758 mov $inl, $itr2 1759 sub $inp, $inl 1760 mov $inl, $itr1 1761 mov $tmp_store, $inl 1762 1: 1763 add \$16, $itr1 1764 cmp $inl, $itr1 1765 jg 1f\n"; 1766 &poly_add("0*8($itr2)"); 1767 &poly_mul_mulx(); $code.=" 1768 lea 16($itr2), $itr2 1769 jmp 1b 1770 1: \n"; 1771 &finalize_state_avx2(2); 1772 &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); 1773 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1774 lea 4*32($inp), $inp 1775 lea 4*32($oup), $oup 1776 sub \$4*32, $inl 1777 jmp open_avx2_tail_loop 1778 3: 1779 cmp \$12*32, $inl 1780 ja 3f\n"; 1781 ############################################################################### 1782 # 257-383 bytes left 1783 &prep_state_avx2(3); $code.=" 1784 mov $inl, $tmp_store 1785 mov $inl, $itr1 1786 sub \$8*32, $itr1 1787 shr \$4, $itr1 1788 add \$6, $itr1 1789 mov \$10, $itr2 1790 cmp \$10, $itr1 1791 cmovg $itr2, $itr1 1792 mov $inp, $inl 1793 xor $itr2, $itr2 1794 1: \n"; 1795 &poly_add("0*8($inl)"); 1796 &poly_mul_mulx(); $code.=" 1797 lea 16($inl), $inl 1798 2: \n"; 1799 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 1800 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1801 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1802 &poly_add("0*8($inl)"); 1803 &poly_mul(); $code.=" 1804 lea 16($inl), $inl 1805 inc $itr2\n"; 1806 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); 1807 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1808 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1809 cmp $itr1, $itr2 1810 jb 1b 1811 cmp \$10, $itr2 1812 jne 2b 1813 mov $inl, $itr2 1814 sub $inp, $inl 1815 mov $inl, $itr1 1816 mov $tmp_store, $inl 1817 1: 1818 add \$16, $itr1 1819 cmp $inl, $itr1 1820 jg 1f\n"; 1821 &poly_add("0*8($itr2)"); 1822 &poly_mul_mulx(); $code.=" 1823 lea 16($itr2), $itr2 1824 jmp 1b 1825 1: \n"; 1826 &finalize_state_avx2(3); 1827 &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); 1828 &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); 1829 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1830 lea 8*32($inp), $inp 1831 lea 8*32($oup), $oup 1832 sub \$8*32, $inl 1833 
jmp open_avx2_tail_loop 1834 3: \n"; 1835 ############################################################################### 1836 # 384-512 bytes left 1837 &prep_state_avx2(4); $code.=" 1838 xor $itr1, $itr1 1839 mov $inp, $itr2 1840 1: \n"; 1841 &poly_add("0*8($itr2)"); 1842 &poly_mul(); $code.=" 1843 lea 2*8($itr2), $itr2 1844 2: \n"; 1845 &emit_body(37); 1846 &poly_add("0*8($itr2)"); 1847 &poly_mul_mulx(); 1848 &emit_body(48); 1849 &poly_add("2*8($itr2)"); 1850 &poly_mul_mulx(); $code.=" 1851 lea 4*8($itr2), $itr2\n"; 1852 foreach $l (@loop_body) {$code.=$l."\n";} 1853 @loop_body = split /\n/, $chacha_body; $code.=" 1854 inc $itr1 1855 cmp \$4, $itr1 1856 jl 1b 1857 cmp \$10, $itr1 1858 jne 2b 1859 mov $inl, $itr1 1860 sub \$12*32, $itr1 1861 and \$-16, $itr1 1862 1: 1863 test $itr1, $itr1 1864 je 1f\n"; 1865 &poly_add("0*8($itr2)"); 1866 &poly_mul_mulx(); $code.=" 1867 lea 2*8($itr2), $itr2 1868 sub \$2*8, $itr1 1869 jmp 1b 1870 1: \n"; 1871 &finalize_state_avx2(4); $code.=" 1872 vmovdqa $A0, $tmp_store\n"; 1873 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1874 vmovdqa $tmp_store, $A0\n"; 1875 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1876 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1877 &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" 1878 lea 12*32($inp), $inp 1879 lea 12*32($oup), $oup 1880 sub \$12*32, $inl 1881 open_avx2_tail_loop: 1882 cmp \$32, $inl 1883 jb open_avx2_tail 1884 sub \$32, $inl 1885 vpxor ($inp), $A0, $A0 1886 vmovdqu $A0, ($oup) 1887 lea 1*32($inp), $inp 1888 lea 1*32($oup), $oup 1889 vmovdqa $B0, $A0 1890 vmovdqa $C0, $B0 1891 vmovdqa $D0, $C0 1892 jmp open_avx2_tail_loop 1893 open_avx2_tail: 1894 cmp \$16, $inl 1895 vmovdqa $A0x, $A1x 1896 jb 1f 1897 sub \$16, $inl 1898 #load for decryption 1899 vpxor ($inp), $A0x, $A1x 1900 vmovdqu $A1x, ($oup) 1901 lea 1*16($inp), $inp 1902 lea 1*16($oup), $oup 1903 vperm2i128 \$0x11, $A0, $A0, $A0 1904 vmovdqa $A0x, $A1x 1905 1: 1906 vzeroupper 1907 jmp open_sse_tail_16 1908 ############################################################################### 1909 open_avx2_192: 1910 vmovdqa $A0, $A1 1911 vmovdqa $A0, $A2 1912 vmovdqa $B0, $B1 1913 vmovdqa $B0, $B2 1914 vmovdqa $C0, $C1 1915 vmovdqa $C0, $C2 1916 vpaddd .avx2_inc(%rip), $D0, $D1 1917 vmovdqa $D0, $T2 1918 vmovdqa $D1, $T3 1919 mov \$10, $acc0 1920 1: \n"; 1921 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1922 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1923 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1924 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1925 dec $acc0 1926 jne 1b 1927 vpaddd $A2, $A0, $A0 1928 vpaddd $A2, $A1, $A1 1929 vpaddd $B2, $B0, $B0 1930 vpaddd $B2, $B1, $B1 1931 vpaddd $C2, $C0, $C0 1932 vpaddd $C2, $C1, $C1 1933 vpaddd $T2, $D0, $D0 1934 vpaddd $T3, $D1, $D1 1935 vperm2i128 \$0x02, $A0, $B0, $T0 1936 # Clamp and store the key 1937 vpand .clamp(%rip), $T0, $T0 1938 vmovdqa $T0, $r_store 1939 # Stream for up to 192 bytes 1940 vperm2i128 \$0x13, $A0, $B0, $A0 1941 vperm2i128 \$0x13, $C0, $D0, $B0 1942 vperm2i128 \$0x02, $A1, $B1, $C0 1943 vperm2i128 \$0x02, $C1, $D1, $D0 1944 vperm2i128 \$0x13, $A1, $B1, $A1 1945 vperm2i128 \$0x13, $C1, $D1, $B1 1946 open_avx2_short: 1947 mov %r8, $itr2 1948 call poly_hash_ad_internal 1949 open_avx2_hash_and_xor_loop: 1950 cmp \$32, $inl 1951 jb open_avx2_short_tail_32 1952 sub \$32, $inl\n"; 1953 # Load + hash 1954 &poly_add("0*8($inp)"); 1955 &poly_mul(); 1956 &poly_add("2*8($inp)"); 1957 &poly_mul(); $code.=" 1958 # Load + decrypt 1959 vpxor ($inp), $A0, $A0 1960 
vmovdqu $A0, ($oup) 1961 lea 1*32($inp), $inp 1962 lea 1*32($oup), $oup 1963 # Shift stream 1964 vmovdqa $B0, $A0 1965 vmovdqa $C0, $B0 1966 vmovdqa $D0, $C0 1967 vmovdqa $A1, $D0 1968 vmovdqa $B1, $A1 1969 vmovdqa $C1, $B1 1970 vmovdqa $D1, $C1 1971 vmovdqa $A2, $D1 1972 vmovdqa $B2, $A2 1973 jmp open_avx2_hash_and_xor_loop 1974 open_avx2_short_tail_32: 1975 cmp \$16, $inl 1976 vmovdqa $A0x, $A1x 1977 jb 1f 1978 sub \$16, $inl\n"; 1979 &poly_add("0*8($inp)"); 1980 &poly_mul(); $code.=" 1981 vpxor ($inp), $A0x, $A3x 1982 vmovdqu $A3x, ($oup) 1983 lea 1*16($inp), $inp 1984 lea 1*16($oup), $oup 1985 vextracti128 \$1, $A0, $A1x 1986 1: 1987 vzeroupper 1988 jmp open_sse_tail_16 1989 ############################################################################### 1990 open_avx2_320: 1991 vmovdqa $A0, $A1 1992 vmovdqa $A0, $A2 1993 vmovdqa $B0, $B1 1994 vmovdqa $B0, $B2 1995 vmovdqa $C0, $C1 1996 vmovdqa $C0, $C2 1997 vpaddd .avx2_inc(%rip), $D0, $D1 1998 vpaddd .avx2_inc(%rip), $D1, $D2 1999 vmovdqa $B0, $T1 2000 vmovdqa $C0, $T2 2001 vmovdqa $D0, $ctr0_store 2002 vmovdqa $D1, $ctr1_store 2003 vmovdqa $D2, $ctr2_store 2004 mov \$10, $acc0 2005 1: \n"; 2006 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2007 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2008 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2009 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2010 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2011 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2012 dec $acc0 2013 jne 1b 2014 vpaddd .chacha20_consts(%rip), $A0, $A0 2015 vpaddd .chacha20_consts(%rip), $A1, $A1 2016 vpaddd .chacha20_consts(%rip), $A2, $A2 2017 vpaddd $T1, $B0, $B0 2018 vpaddd $T1, $B1, $B1 2019 vpaddd $T1, $B2, $B2 2020 vpaddd $T2, $C0, $C0 2021 vpaddd $T2, $C1, $C1 2022 vpaddd $T2, $C2, $C2 2023 vpaddd $ctr0_store, $D0, $D0 2024 vpaddd $ctr1_store, $D1, $D1 2025 vpaddd $ctr2_store, $D2, $D2 2026 vperm2i128 \$0x02, $A0, $B0, $T0 2027 # Clamp and store the key 2028 vpand .clamp(%rip), $T0, $T0 2029 vmovdqa $T0, $r_store 2030 # Stream for up to 320 bytes 2031 vperm2i128 \$0x13, $A0, $B0, $A0 2032 vperm2i128 \$0x13, $C0, $D0, $B0 2033 vperm2i128 \$0x02, $A1, $B1, $C0 2034 vperm2i128 \$0x02, $C1, $D1, $D0 2035 vperm2i128 \$0x13, $A1, $B1, $A1 2036 vperm2i128 \$0x13, $C1, $D1, $B1 2037 vperm2i128 \$0x02, $A2, $B2, $C1 2038 vperm2i128 \$0x02, $C2, $D2, $D1 2039 vperm2i128 \$0x13, $A2, $B2, $A2 2040 vperm2i128 \$0x13, $C2, $D2, $B2 2041 jmp open_avx2_short 2042 .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 2043 ############################################################################### 2044 ############################################################################### 2045 .type chacha20_poly1305_seal_avx2,\@function,2 2046 .align 64 2047 chacha20_poly1305_seal_avx2: 2048 vzeroupper 2049 vmovdqa .chacha20_consts(%rip), $A0 2050 vbroadcasti128 0*16($keyp), $B0 2051 vbroadcasti128 1*16($keyp), $C0 2052 vbroadcasti128 2*16($keyp), $D0 2053 vpaddd .avx2_init(%rip), $D0, $D0 2054 cmp \$6*32, $inl 2055 jbe seal_avx2_192 2056 cmp \$10*32, $inl 2057 jbe seal_avx2_320 2058 vmovdqa $A0, $A1 2059 vmovdqa $A0, $A2 2060 vmovdqa $A0, $A3 2061 vmovdqa $B0, $B1 2062 vmovdqa $B0, $B2 2063 vmovdqa $B0, $B3 2064 vmovdqa $B0, $state1_store 2065 vmovdqa $C0, $C1 2066 vmovdqa $C0, $C2 2067 vmovdqa $C0, $C3 2068 vmovdqa $C0, $state2_store 2069 vmovdqa $D0, $D3 2070 vpaddd .avx2_inc(%rip), $D3, $D2 2071 vpaddd .avx2_inc(%rip), $D2, $D1 2072 vpaddd .avx2_inc(%rip), $D1, $D0 2073 vmovdqa $D0, $ctr0_store 2074 vmovdqa $D1, 
vmovdqa $D0, $D3
vpaddd .avx2_inc(%rip), $D3, $D2
vpaddd .avx2_inc(%rip), $D2, $D1
vpaddd .avx2_inc(%rip), $D1, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
vmovdqa $D3, $ctr3_store
mov \$10, $acc0
1: \n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $acc0
jnz 1b\n";
&finalize_state_avx2(4); $code.="
vperm2i128 \$0x13, $C3, $D3, $C3
vperm2i128 \$0x02, $A3, $B3, $D3
vperm2i128 \$0x13, $A3, $B3, $A3
vpand .clamp(%rip), $D3, $D3
vmovdqa $D3, $r_store
mov %r8, $itr2
call poly_hash_ad_internal
# Safely store 320 bytes (otherwise would handle with optimized call)
vpxor 0*32($inp), $A3, $A3
vpxor 1*32($inp), $C3, $C3
vmovdqu $A3, 0*32($oup)
vmovdqu $C3, 1*32($oup)\n";
&xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
&xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
lea 10*32($inp), $inp
sub \$10*32, $inl
mov \$10*32, $itr1
cmp \$4*32, $inl
jbe seal_avx2_hash
vpxor 0*32($inp), $A0, $A0
vpxor 1*32($inp), $B0, $B0
vpxor 2*32($inp), $C0, $C0
vpxor 3*32($inp), $D0, $D0
vmovdqu $A0, 10*32($oup)
vmovdqu $B0, 11*32($oup)
vmovdqu $C0, 12*32($oup)
vmovdqu $D0, 13*32($oup)
lea 4*32($inp), $inp
sub \$4*32, $inl
mov \$8, $itr1
mov \$2, $itr2
cmp \$4*32, $inl
jbe seal_avx2_tail_128
cmp \$8*32, $inl
jbe seal_avx2_tail_256
cmp \$12*32, $inl
jbe seal_avx2_tail_384
cmp \$16*32, $inl
jbe seal_avx2_tail_512\n";
# We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
&prep_state_avx2(4);
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body;
&emit_body(41);
@loop_body = split /\n/, $chacha_body; $code.="
sub \$16, $oup
mov \$9, $itr1
jmp 4f
1: \n";
&prep_state_avx2(4); $code.="
mov \$10, $itr1
2: \n";
&poly_add("0*8($oup)");
&emit_body(10);
&poly_stage1_mulx();
&emit_body(9);
&poly_stage2_mulx();
&emit_body(12);
&poly_stage3_mulx();
&emit_body(10);
&poly_reduce_stage(); $code.="
4: \n";
&emit_body(9);
&poly_add("2*8($oup)");
&emit_body(8);
&poly_stage1_mulx();
&emit_body(18);
&poly_stage2_mulx();
&emit_body(18);
&poly_stage3_mulx();
&emit_body(9);
&poly_reduce_stage();
&emit_body(8);
&poly_add("4*8($oup)"); $code.="
lea 6*8($oup), $oup\n";
&emit_body(18);
&poly_stage1_mulx();
&emit_body(8);
&poly_stage2_mulx();
&emit_body(8);
&poly_stage3_mulx();
&emit_body(18);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jne 2b\n";
&finalize_state_avx2(4); $code.="
lea 4*8($oup), $oup
vmovdqa $A0, $tmp_store\n";
&poly_add("-4*8($oup)");
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&poly_mul();
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&poly_add("-2*8($oup)");
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&poly_mul();
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
lea 16*32($inp), $inp
sub \$16*32, $inl
cmp \$16*32, $inl
jg 1b\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
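# At most 512 bytes of plaintext remain. Set the round/hash iteration counts
# and dispatch on the remaining length; every tail keeps folding ciphertext
# that has already been written into Poly1305 while finishing the last
# ChaCha20 blocks.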
mov \$10, $itr1
xor $itr2, $itr2
cmp \$4*32, $inl
ja 3f
###############################################################################
seal_avx2_tail_128:\n";
&prep_state_avx2(1); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(1);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
jmp seal_avx2_short_loop
3:
cmp \$8*32, $inl
ja 3f
###############################################################################
seal_avx2_tail_256:\n";
&prep_state_avx2(2); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(2);
&xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$4*32, $itr1
lea 4*32($inp), $inp
sub \$4*32, $inl
jmp seal_avx2_hash
3:
cmp \$12*32, $inl
ja seal_avx2_tail_512
###############################################################################
seal_avx2_tail_384:\n";
&prep_state_avx2(3); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul();
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(3);
&xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
&xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$8*32, $itr1
lea 8*32($inp), $inp
sub \$8*32, $inl
jmp seal_avx2_hash
###############################################################################
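# 384 < bytes left <= 512: four interleaved ChaCha20 states with the mulx-based
# Poly1305 stages. For reference, each block update below computes, in
# radix-2^64 limbs,
#   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
# and the staged form lets the scalar multiply overlap the vector rounds.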
seal_avx2_tail_512:\n";
&prep_state_avx2(4); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
2: \n";
&emit_body(20);
&poly_add("0*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
&emit_body(20);
&poly_add("2*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$12*32, $itr1
lea 12*32($inp), $inp
sub \$12*32, $inl
jmp seal_avx2_hash
###############################################################################
seal_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .avx2_inc(%rip), $D0, $D1
vpaddd .avx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
1: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne 1b
vpaddd .chacha20_consts(%rip), $A0, $A0
vpaddd .chacha20_consts(%rip), $A1, $A1
vpaddd .chacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .clamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp seal_avx2_short
###############################################################################
seal_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .avx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
1: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne 1b
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .clamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
seal_avx2_short:
mov %r8, $itr2
call poly_hash_ad_internal
xor $itr1, $itr1
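# seal_avx2_hash catches Poly1305 up on ciphertext that was written but not
# yet absorbed, one 16-byte block at a time, before the short encrypt loop
# below continues.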
seal_avx2_hash:
cmp \$16, $itr1
jb seal_avx2_short_loop\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
add \$16, $oup
jmp seal_avx2_hash
seal_avx2_short_loop:
cmp \$32, $inl
jb seal_avx2_short_tail
sub \$32, $inl
# Encrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
# Load + hash\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp seal_avx2_short_loop
seal_avx2_short_tail:
cmp \$16, $inl
jb 1f
sub \$16, $inl
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A0x
1:
vzeroupper
jmp seal_sse_tail_16
.cfi_endproc
";
}

if (!$win64) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
} else {
print <<___;
.globl dummy_chacha20_poly1305_asm
.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
dummy_chacha20_poly1305_asm:
ret
___
}

close STDOUT;