      1 #!/usr/bin/env perl
      2 
      3 # Copyright (c) 2015, CloudFlare Ltd.
      4 #
      5 # Permission to use, copy, modify, and/or distribute this software for any
      6 # purpose with or without fee is hereby granted, provided that the above
      7 # copyright notice and this permission notice appear in all copies.
      8 #
      9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     12 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
     14 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
     15 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     16 
     17 ##############################################################################
     18 #                                                                            #
     19 # Author:  Vlad Krasnov                                                      #
     20 #                                                                            #
     21 ##############################################################################
     22 
     23 $flavour = shift;
     24 $output  = shift;
     25 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     26 
     27 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     28 
     29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     30 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     31 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     32 die "can't locate x86_64-xlate.pl";
     33 
     34 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     35 *STDOUT=*OUT;
     36 
     37 $avx = 2;
     38 
     39 $code.=<<___;
     40 .text
     41 .extern OPENSSL_ia32cap_P
     42 
     43 chacha20_poly1305_constants:
     44 
     45 .align 64
     46 .chacha20_consts:
     47 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
     48 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
     49 .rol8:
     50 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
     51 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
     52 .rol16:
     53 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
     54 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
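# The next three vectors are laid out so that a 32-byte load starting at
# .avx2_init picks up {0,0,0,0, 1,0,0,0}: in the AVX2 path the two 128-bit
# lanes of a counter register therefore start at ctr+0 and ctr+1, and
# .avx2_inc then steps both lanes by 2 per pair of blocks.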
     55 .avx2_init:
     56 .long 0,0,0,0
     57 .sse_inc:
     58 .long 1,0,0,0
     59 .avx2_inc:
     60 .long 2,0,0,0,2,0,0,0
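# .clamp is 32 bytes: the low half clamps the Poly1305 r value as the spec
# requires, and the all-ones high half lets the 32-byte AVX2 vpand pass the
# s half of the key through unchanged in a single operation.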
     61 .clamp:
     62 .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
     63 .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
     64 .align 16
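# .and_masks: entry i (counting from 1) keeps the low i bytes of a 16-byte
# block. The tail-handling code scales a residual length by 16 and indexes
# this table to zero the unused bytes of a partial block before feeding it
# to Poly1305.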
     65 .and_masks:
     66 .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     67 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     68 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     69 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     70 .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     71 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     72 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     73 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     74 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     75 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
     76 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
     77 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
     78 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
     79 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
     80 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
     81 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     82 ___
     83 
     84 my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
     85 my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
     86 my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
     87 my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
     88 my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
     89 my $r_store="0*16(%rbp)";
     90 my $s_store="1*16(%rbp)";
     91 my $len_store="2*16(%rbp)";
     92 my $state1_store="3*16(%rbp)";
     93 my $state2_store="4*16(%rbp)";
     94 my $tmp_store="5*16(%rbp)";
     95 my $ctr0_store="6*16(%rbp)";
     96 my $ctr1_store="7*16(%rbp)";
     97 my $ctr2_store="8*16(%rbp)";
     98 my $ctr3_store="9*16(%rbp)";
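# All of the *_store names above are 16-byte scratch slots in the %rbp-anchored
# stack area set up by the open/seal prologues: r_store/s_store hold the two
# halves of the Poly1305 key, len_store holds the lengths hashed in as the
# final Poly1305 length block, state1/state2 hold the key rows of the ChaCha20
# state, ctr0..ctr3 hold per-block counter/nonce rows, and tmp_store is general
# spill space.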
     99 
    100 sub chacha_qr {
    101 my ($a,$b,$c,$d,$t,$dir)=@_;
    102 $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
    103 $code.="paddd $b, $a
    104         pxor $a, $d
    105         pshufb .rol16(%rip), $d
    106         paddd $d, $c
    107         pxor $c, $b
    108         movdqa $b, $t
    109         pslld \$12, $t
    110         psrld \$20, $b
    111         pxor $t, $b
    112         paddd $b, $a
    113         pxor $a, $d
    114         pshufb .rol8(%rip), $d
    115         paddd $d, $c
    116         pxor $c, $b
    117         movdqa $b, $t
    118         pslld \$7, $t
    119         psrld \$25, $b
    120         pxor $t, $b\n";
    121 $code.="palignr \$4, $b, $b
    122         palignr \$8, $c, $c
    123         palignr \$12, $d, $d\n" if ($dir =~ /left/);
    124 $code.="palignr \$12, $b, $b
    125         palignr \$8, $c, $c
    126         palignr \$4, $d, $d\n" if ($dir =~ /right/);
    127 $code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
    128 }
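# For reference, a plain scalar model of the quarter-round emitted above (a
# readers-only sketch with an illustrative name, assuming a 64-bit perl; it is
# never called and the generated code does not depend on it). The SSE code
# runs this on the four columns of a block at once, and the left/right palignr
# shuffles rotate the rows so the same instruction sequence also covers the
# diagonal round.
sub chacha_qr_reference_sketch {
    my ($a,$b,$c,$d)=@_;
    my $rotl = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff };
    $a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,16);
    $c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,12);
    $a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,8);
    $c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,7);
    return ($a,$b,$c,$d);
}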
    129 
    130 sub poly_add {
    131 my ($src)=@_;
    132 $code.="add $src, $acc0
    133         adc 8+$src, $acc1
    134         adc \$1, $acc2\n";
    135 }
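# poly_add absorbs one 16-byte block at $src into the accumulator held in
# $acc0/$acc1/$acc2: the two 64-bit halves are added in, and the trailing
# "adc \$1" sets the 2^128 pad bit that Poly1305 appends to every full block.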
    136 
    137 sub poly_stage1 {
    138 $code.="mov 0+$r_store, %rax
    139         mov %rax, $t2
    140         mul $acc0
    141         mov %rax, $t0
    142         mov %rdx, $t1
    143         mov 0+$r_store, %rax
    144         mul $acc1
    145         imulq $acc2, $t2
    146         add %rax, $t1
    147         adc %rdx, $t2\n";
    148 }
    149 
    150 sub poly_stage2 {
    151 $code.="mov 8+$r_store, %rax
    152         mov %rax, $t3
    153         mul $acc0
    154         add %rax, $t1
    155         adc \$0, %rdx
    156         mov %rdx, $acc0
    157         mov 8+$r_store, %rax
    158         mul $acc1
    159         add %rax, $t2
    160         adc \$0, %rdx\n";
    161 }
    162 
    163 sub poly_stage3 {
    164 $code.="imulq $acc2, $t3
    165         add $acc0, $t2
    166         adc %rdx, $t3\n";
    167 }
    168 
    169 sub poly_reduce_stage {
    170 $code.="mov $t0, $acc0
    171         mov $t1, $acc1
    172         mov $t2, $acc2
    173         and \$3, $acc2
    174         mov $t2, $t0
    175         and \$-4, $t0
    176         mov $t3, $t1
    177         shrd \$2, $t3, $t2
    178         shr \$2, $t3
    179         add $t0, $acc0
    180         adc $t1, $acc1
    181         adc \$0, $acc2
    182         add $t2, $acc0
    183         adc $t3, $acc1
    184         adc \$0, $acc2\n";
    185 }
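# The reduction above uses 2^130 = 5 (mod 2^130 - 5). With the 256-bit product
# in $t0..$t3, the low 130 bits (t0, t1 and the low two bits of t2) stay in the
# accumulator, and the high part c = (t2 >> 2) + t3*2^62 is folded back in as
# 4*c + c: the "and \$-4" (together with t3) produces 4*c, and the shrd/shr
# pair produces c. The result is only partially reduced; the one full
# conditional subtraction happens at the end, in the "Final reduce" blocks.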
    186 
    187 sub poly_mul {
    188     &poly_stage1();
    189     &poly_stage2();
    190     &poly_stage3();
    191     &poly_reduce_stage();
    192 }
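# A bignum model of one absorb-and-multiply step (again a readers-only sketch
# with illustrative names, never called). Note the real code keeps the
# accumulator only partially reduced; the full reduction modulo 2^130 - 5 is
# deferred to the end of open/seal.
use Math::BigInt;
sub poly1305_step_reference_sketch {
    my ($h, $m, $r) = @_;                               # Math::BigInt values
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);   # the prime 2^130 - 5
    $h = $h->copy->badd($m)                             # poly_add: the block...
             ->badd(Math::BigInt->new(2)->bpow(128));   # ...plus the pad bit
    return $h->bmul($r)->bmod($p);                      # poly_mul + reduction
}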
    193 
    194 sub prep_state {
    195 my ($n)=@_;
    196 $code.="movdqa .chacha20_consts(%rip), $A0
    197         movdqa $state1_store, $B0
    198         movdqa $state2_store, $C0\n";
    199 $code.="movdqa $A0, $A1
    200         movdqa $B0, $B1
    201         movdqa $C0, $C1\n" if ($n ge 2);
    202 $code.="movdqa $A0, $A2
    203         movdqa $B0, $B2
    204         movdqa $C0, $C2\n" if ($n ge 3);
    205 $code.="movdqa $A0, $A3
    206         movdqa $B0, $B3
    207         movdqa $C0, $C3\n" if ($n ge 4);
    208 $code.="movdqa $ctr0_store, $D0
    209         paddd .sse_inc(%rip), $D0
    210         movdqa $D0, $ctr0_store\n" if ($n eq 1);
    211 $code.="movdqa $ctr0_store, $D1
    212         paddd .sse_inc(%rip), $D1
    213         movdqa $D1, $D0
    214         paddd .sse_inc(%rip), $D0
    215         movdqa $D0, $ctr0_store
    216         movdqa $D1, $ctr1_store\n" if ($n eq 2);
    217 $code.="movdqa $ctr0_store, $D2
    218         paddd .sse_inc(%rip), $D2
    219         movdqa $D2, $D1
    220         paddd .sse_inc(%rip), $D1
    221         movdqa $D1, $D0
    222         paddd .sse_inc(%rip), $D0
    223         movdqa $D0, $ctr0_store
    224         movdqa $D1, $ctr1_store
    225         movdqa $D2, $ctr2_store\n" if ($n eq 3);
    226 $code.="movdqa $ctr0_store, $D3
    227         paddd .sse_inc(%rip), $D3
    228         movdqa $D3, $D2
    229         paddd .sse_inc(%rip), $D2
    230         movdqa $D2, $D1
    231         paddd .sse_inc(%rip), $D1
    232         movdqa $D1, $D0
    233         paddd .sse_inc(%rip), $D0
    234         movdqa $D0, $ctr0_store
    235         movdqa $D1, $ctr1_store
    236         movdqa $D2, $ctr2_store
    237         movdqa $D3, $ctr3_store\n" if ($n eq 4);
    238 }
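# prep_state(n) loads the fixed rows (constants and key) into n register sets
# and derives n fresh counter rows by repeatedly adding .sse_inc to the last
# saved counter. The highest counter is written back to $ctr0_store so the
# next call continues the sequence, and each block's own counter row is saved
# for the final addition done in finalize_state.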
    239 
    240 sub finalize_state {
    241 my ($n)=@_;
    242 $code.="paddd .chacha20_consts(%rip), $A3
    243         paddd $state1_store, $B3
    244         paddd $state2_store, $C3
    245         paddd $ctr3_store, $D3\n" if ($n eq 4);
    246 $code.="paddd .chacha20_consts(%rip), $A2
    247         paddd $state1_store, $B2
    248         paddd $state2_store, $C2
    249         paddd $ctr2_store, $D2\n" if ($n ge 3);
    250 $code.="paddd .chacha20_consts(%rip), $A1
    251         paddd $state1_store, $B1
    252         paddd $state2_store, $C1
    253         paddd $ctr1_store, $D1\n" if ($n ge 2);
    254 $code.="paddd .chacha20_consts(%rip), $A0
    255         paddd $state1_store, $B0
    256         paddd $state2_store, $C0
    257         paddd $ctr0_store, $D0\n";
    258 }
    259 
    260 sub xor_stream {
    261 my ($A, $B, $C, $D, $offset)=@_;
    262 $code.="movdqu 0*16 + $offset($inp), $A3
    263         movdqu 1*16 + $offset($inp), $B3
    264         movdqu 2*16 + $offset($inp), $C3
    265         movdqu 3*16 + $offset($inp), $D3
    266         pxor $A3, $A
    267         pxor $B3, $B
    268         pxor $C3, $C
    269         pxor $D, $D3
    270         movdqu $A, 0*16 + $offset($oup)
    271         movdqu $B, 1*16 + $offset($oup)
    272         movdqu $C, 2*16 + $offset($oup)
    273         movdqu $D3, 3*16 + $offset($oup)\n";
    274 }
    275 
    276 sub xor_stream_using_temp {
    277 my ($A, $B, $C, $D, $offset, $temp)=@_;
    278 $code.="movdqa $temp, $tmp_store
    279         movdqu 0*16 + $offset($inp), $temp
    280         pxor $A, $temp
    281         movdqu $temp, 0*16 + $offset($oup)
    282         movdqu 1*16 + $offset($inp), $temp
    283         pxor $B, $temp
    284         movdqu $temp, 1*16 + $offset($oup)
    285         movdqu 2*16 + $offset($inp), $temp
    286         pxor $C, $temp
    287         movdqu $temp, 2*16 + $offset($oup)
    288         movdqu 3*16 + $offset($inp), $temp
    289         pxor $D, $temp
    290         movdqu $temp, 3*16 + $offset($oup)\n";
    291 }
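# xor_stream_using_temp is used when all sixteen xmm registers are still live
# with keystream state: the chosen register is parked in $tmp_store so it can
# serve as the load/xor scratch register, and the caller either restores it or
# consumes the spilled value straight from $tmp_store afterwards.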
    292 
    293 sub gen_chacha_round {
    294 my ($rot1, $rot2, $shift)=@_;
    295 my $round="";
    296 $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
    297 $round.="movdqa $rot2, $C0
    298          paddd $B3, $A3
    299          paddd $B2, $A2
    300          paddd $B1, $A1
    301          paddd $B0, $A0
    302          pxor $A3, $D3
    303          pxor $A2, $D2
    304          pxor $A1, $D1
    305          pxor $A0, $D0
    306          pshufb $C0, $D3
    307          pshufb $C0, $D2
    308          pshufb $C0, $D1
    309          pshufb $C0, $D0
    310          movdqa $tmp_store, $C0
    311          paddd $D3, $C3
    312          paddd $D2, $C2
    313          paddd $D1, $C1
    314          paddd $D0, $C0
    315          pxor $C3, $B3
    316          pxor $C2, $B2
    317          pxor $C1, $B1
    318          pxor $C0, $B0
    319          movdqa $C0, $tmp_store
    320          movdqa $B3, $C0
    321          psrld \$$rot1, $C0
    322          pslld \$32-$rot1, $B3
    323          pxor $C0, $B3
    324          movdqa $B2, $C0
    325          psrld \$$rot1, $C0
    326          pslld \$32-$rot1, $B2
    327          pxor $C0, $B2
    328          movdqa $B1, $C0
    329          psrld \$$rot1, $C0
    330          pslld \$32-$rot1, $B1
    331          pxor $C0, $B1
    332          movdqa $B0, $C0
    333          psrld \$$rot1, $C0
    334          pslld \$32-$rot1, $B0
    335          pxor $C0, $B0\n";
    336 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
    337 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
    338 $round.="movdqa $tmp_store, $C0
    339          palignr \$$s1, $B3, $B3
    340          palignr \$$s2, $C3, $C3
    341          palignr \$$s3, $D3, $D3
    342          palignr \$$s1, $B2, $B2
    343          palignr \$$s2, $C2, $C2
    344          palignr \$$s3, $D2, $D2
    345          palignr \$$s1, $B1, $B1
    346          palignr \$$s2, $C1, $C1
    347          palignr \$$s3, $D1, $D1
    348          palignr \$$s1, $B0, $B0
    349          palignr \$$s2, $C0, $C0
    350          palignr \$$s3, $D0, $D0\n"
    351 if (($shift =~ /left/) || ($shift =~ /right/));
    352 return $round;
    353 };
    354 
    355 $chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
    356                &gen_chacha_round(25, ".rol8(%rip)", "left") .
    357                &gen_chacha_round(20, ".rol16(%rip)") .
    358                &gen_chacha_round(25, ".rol8(%rip)", "right");
    359 
    360 my @loop_body = split /\n/, $chacha_body;
    361 
    362 sub emit_body {
    363 my ($n)=@_;
    364     for (my $i=0; $i < $n; $i++) {
    365         $code=$code.shift(@loop_body)."\n";
    366     };
    367 }
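# $chacha_body strings four gen_chacha_round calls together into one ChaCha20
# double round over four blocks, and @loop_body splits it into individual
# instructions so that emit_body(n) can drip-feed n of them between the
# Poly1305 stages. Interleaving independent vector work with the hash's
# dependent multiply/carry chain is what hides the Poly1305 latency in the
# main loops.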
    368 
    369 {
    370 ################################################################################
    371 # void poly_hash_ad_internal();
    372 $code.="
    373 .type poly_hash_ad_internal,\@function,2
    374 .align 64
    375 poly_hash_ad_internal:
    376 .cfi_startproc
    377     xor $acc0, $acc0
    378     xor $acc1, $acc1
    379     xor $acc2, $acc2
    380     cmp \$13,  $itr2
    381     jne hash_ad_loop
    382 poly_fast_tls_ad:
    383     # Special treatment for the TLS case of 13 bytes
    384     mov ($adp), $acc0
    385     mov 5($adp), $acc1
    386     shr \$24, $acc1
    387     mov \$1, $acc2\n";
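    # The two overlapping loads just emitted assemble the 13 AD bytes as the
    # zero-padded 16-byte block the AEAD construction requires: $acc0 gets
    # bytes 0-7, the load at offset 5 followed by "shr \$24" leaves bytes 8-12
    # in $acc1 without reading past the buffer, and $acc2 = 1 supplies the
    # 2^128 pad bit.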
    388     &poly_mul(); $code.="
    389     ret
    390 hash_ad_loop:
    391         # Hash in 16 byte chunk
    392         cmp \$16, $itr2
    393         jb hash_ad_tail\n";
    394         &poly_add("0($adp)");
    395         &poly_mul(); $code.="
    396         lea 1*16($adp), $adp
    397         sub \$16, $itr2
    398     jmp hash_ad_loop
    399 hash_ad_tail:
    400     cmp \$0, $itr2
    401     je 1f
    402     # Hash last < 16 byte tail
    403     xor $t0, $t0
    404     xor $t1, $t1
    405     xor $t2, $t2
    406     add $itr2, $adp
    407 hash_ad_tail_loop:
    408         shld \$8, $t0, $t1
    409         shl \$8, $t0
    410         movzxb -1($adp), $t2
    411         xor $t2, $t0
    412         dec $adp
    413         dec $itr2
    414     jne hash_ad_tail_loop
    415 
    416     add $t0, $acc0
    417     adc $t1, $acc1
    418     adc \$1, $acc2\n";
    419     &poly_mul(); $code.="
    420     # Finished AD
    421 1:
    422     ret
    423 .cfi_endproc
    424 .size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
    425 }
    426 
    427 {
    428 ################################################################################
    429 # void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
    430 $code.="
    431 .globl chacha20_poly1305_open
    432 .type chacha20_poly1305_open,\@function,2
    433 .align 64
    434 chacha20_poly1305_open:
    435 .cfi_startproc
    436     push %rbp
    437 .cfi_adjust_cfa_offset 8
    438     push %rbx
    439 .cfi_adjust_cfa_offset 8
    440     push %r12
    441 .cfi_adjust_cfa_offset 8
    442     push %r13
    443 .cfi_adjust_cfa_offset 8
    444     push %r14
    445 .cfi_adjust_cfa_offset 8
    446     push %r15
    447 .cfi_adjust_cfa_offset 8
    448     # We write the calculated authenticator back to keyp at the end, so save
    449     # the pointer on the stack too.
    450     push $keyp
    451 .cfi_adjust_cfa_offset 8
    452     sub \$288 + 32, %rsp
    453 .cfi_adjust_cfa_offset 288 + 32
    454 .cfi_offset rbp, -16
    455 .cfi_offset rbx, -24
    456 .cfi_offset r12, -32
    457 .cfi_offset r13, -40
    458 .cfi_offset r14, -48
    459 .cfi_offset r15, -56
    460     lea 32(%rsp), %rbp
    461     and \$-32, %rbp
    462     mov %rdx, 8+$len_store
    463     mov %r8, 0+$len_store
    464     mov %rdx, $inl\n"; $code.="
    465     mov OPENSSL_ia32cap_P+8(%rip), %eax
    466     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
    467     xor \$`(1<<5) + (1<<8)`, %eax
    468     jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
    469 $code.="
    470 1:
    471     cmp \$128, $inl
    472     jbe open_sse_128
    473     # For long buffers, prepare the poly key first
    474     movdqa .chacha20_consts(%rip), $A0
    475     movdqu 0*16($keyp), $B0
    476     movdqu 1*16($keyp), $C0
    477     movdqu 2*16($keyp), $D0
    478     movdqa $D0, $T1
    479     # Store on stack, to free keyp
    480     movdqa $B0, $state1_store
    481     movdqa $C0, $state2_store
    482     movdqa $D0, $ctr0_store
    483     mov \$10, $acc0
    484 1:  \n";
    485         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    486         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
    487         dec $acc0
    488     jne 1b
    489     # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
    490     paddd .chacha20_consts(%rip), $A0
    491     paddd $state1_store, $B0
    492     # Clamp and store the key
    493     pand .clamp(%rip), $A0
    494     movdqa $A0, $r_store
    495     movdqa $B0, $s_store
    496     # Hash
    497     mov %r8, $itr2
    498     call poly_hash_ad_internal
    499 open_sse_main_loop:
    500         cmp \$16*16, $inl
    501         jb 2f
    502         # Load state, increment counter blocks\n";
    503         &prep_state(4); $code.="
    504         # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
    505         # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
    506         mov \$4, $itr1
    507         mov $inp, $itr2
    508 1:  \n";
    509             &emit_body(20);
    510             &poly_add("0($itr2)"); $code.="
    511             lea 2*8($itr2), $itr2\n";
    512             &emit_body(20);
    513             &poly_stage1();
    514             &emit_body(20);
    515             &poly_stage2();
    516             &emit_body(20);
    517             &poly_stage3();
    518             &emit_body(20);
    519             &poly_reduce_stage();
    520             foreach $l (@loop_body) {$code.=$l."\n";}
    521             @loop_body = split /\n/, $chacha_body; $code.="
    522             dec $itr1
    523         jge 1b\n";
    524             &poly_add("0($itr2)");
    525             &poly_mul(); $code.="
    526             lea 2*8($itr2), $itr2
    527             cmp \$-6, $itr1
    528         jg 1b\n";
    529         &finalize_state(4);
    530         &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
    531         &xor_stream($A2, $B2, $C2, $D2, "4*16");
    532         &xor_stream($A1, $B1, $C1, $D1, "8*16");
    533         &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
    534         lea 16*16($inp), $inp
    535         lea 16*16($oup), $oup
    536         sub \$16*16, $inl
    537     jmp open_sse_main_loop
    538 2:
    539     # Handle the various tail sizes efficiently
    540     test $inl, $inl
    541     jz open_sse_finalize
    542     cmp \$4*16, $inl
    543     ja 3f\n";
    544 ###############################################################################
    545     # At most 64 bytes are left
    546     &prep_state(1); $code.="
    547     xor $itr2, $itr2
    548     mov $inl, $itr1
    549     cmp \$16, $itr1
    550     jb 2f
    551 1:  \n";
    552         &poly_add("0($inp, $itr2)");
    553         &poly_mul(); $code.="
    554         sub \$16, $itr1
    555 2:
    556         add \$16, $itr2\n";
    557         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    558         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
    559         cmp \$16, $itr1
    560     jae 1b
    561         cmp \$10*16, $itr2
    562     jne 2b\n";
    563     &finalize_state(1); $code.="
    564     jmp open_sse_tail_64_dec_loop
    565 3:
    566     cmp \$8*16, $inl
    567     ja 3f\n";
    568 ###############################################################################
    569     # 65 - 128 bytes are left
    570     &prep_state(2); $code.="
    571     mov $inl, $itr1
    572     and \$-16, $itr1
    573     xor $itr2, $itr2
    574 1:  \n";
    575         &poly_add("0($inp, $itr2)");
    576         &poly_mul(); $code.="
    577 2:
    578         add \$16, $itr2\n";
    579         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    580         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
    581         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
    582         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
    583         cmp $itr1, $itr2
    584     jb 1b
    585         cmp \$10*16, $itr2
    586     jne 2b\n";
    587     &finalize_state(2);
    588     &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
    589     sub \$4*16, $inl
    590     lea 4*16($inp), $inp
    591     lea 4*16($oup), $oup
    592     jmp open_sse_tail_64_dec_loop
    593 3:
    594     cmp \$12*16, $inl
    595     ja 3f\n";
    596 ###############################################################################
    597     # 129 - 192 bytes are left
    598     &prep_state(3); $code.="
    599     mov $inl, $itr1
    600     mov \$10*16, $itr2
    601     cmp \$10*16, $itr1
    602     cmovg $itr2, $itr1
    603     and \$-16, $itr1
    604     xor $itr2, $itr2
    605 1:  \n";
    606         &poly_add("0($inp, $itr2)");
    607         &poly_mul(); $code.="
    608 2:
    609         add \$16, $itr2\n";
    610         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    611         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
    612         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
    613         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
    614         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
    615         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    616         cmp $itr1, $itr2
    617     jb 1b
    618         cmp \$10*16, $itr2
    619     jne 2b
    620     cmp \$11*16, $inl
    621     jb 1f\n";
    622     &poly_add("10*16($inp)");
    623     &poly_mul(); $code.="
    624     cmp \$12*16, $inl
    625     jb 1f\n";
    626     &poly_add("11*16($inp)");
    627     &poly_mul(); $code.="
    628 1:  \n";
    629     &finalize_state(3);
    630     &xor_stream($A2, $B2, $C2, $D2, "0*16");
    631     &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
    632     sub \$8*16, $inl
    633     lea 8*16($inp), $inp
    634     lea 8*16($oup), $oup
    635     jmp open_sse_tail_64_dec_loop
    636 3:
    637 ###############################################################################\n";
    638     # 193 - 255 bytes are left
    639     &prep_state(4); $code.="
    640     xor $itr2, $itr2
    641 1:  \n";
    642         &poly_add("0($inp, $itr2)");
    643         &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
    644         &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
    645         &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
    646         &poly_stage1();
    647         &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
    648         &poly_stage2();
    649         &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
    650         &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
    651         &poly_stage3();
    652         &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
    653         &poly_reduce_stage();
    654         &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
    655         add \$16, $itr2
    656         cmp \$10*16, $itr2
    657     jb 1b
    658     mov $inl, $itr1
    659     and \$-16, $itr1
    660 1:  \n";
    661         &poly_add("0($inp, $itr2)");
    662         &poly_mul(); $code.="
    663         add \$16, $itr2
    664         cmp $itr1, $itr2
    665     jb 1b\n";
    666     &finalize_state(4);
    667     &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
    668     &xor_stream($A2, $B2, $C2, $D2, "4*16");
    669     &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
    670     movdqa $tmp_store, $D0
    671     sub \$12*16, $inl
    672     lea 12*16($inp), $inp
    673     lea 12*16($oup), $oup
    674 ###############################################################################
    675     # Decrypt the remaining data, 16B at a time, using existing stream
    676 open_sse_tail_64_dec_loop:
    677     cmp \$16, $inl
    678     jb 1f
    679         sub \$16, $inl
    680         movdqu ($inp), $T0
    681         pxor $T0, $A0
    682         movdqu $A0, ($oup)
    683         lea 16($inp), $inp
    684         lea 16($oup), $oup
    685         movdqa $B0, $A0
    686         movdqa $C0, $B0
    687         movdqa $D0, $C0
    688     jmp open_sse_tail_64_dec_loop
    689 1:
    690     movdqa $A0, $A1
    691 
    692     # Decrypt up to 16 bytes at the end.
    693 open_sse_tail_16:
    694     test $inl, $inl
    695     jz open_sse_finalize
    696 
    697     # Read the final bytes into $T0. They need to be read in reverse order so
    698     # that they end up in the correct order in $T0.
    699     pxor $T0, $T0
    700     lea -1($inp, $inl), $inp
    701     movq $inl, $itr2
    702 2:
    703         pslldq \$1, $T0
    704         pinsrb \$0, ($inp), $T0
    705         sub \$1, $inp
    706         sub \$1, $itr2
    707         jnz 2b
    708 
    709 3:
    710     movq $T0, $t0
    711     pextrq \$1, $T0, $t1
    712     # The final bytes of keystream are in $A1.
    713     pxor $A1, $T0
    714 
    715     # Copy the plaintext bytes out.
    716 2:
    717         pextrb \$0, $T0, ($oup)
    718         psrldq \$1, $T0
    719         add \$1, $oup
    720         sub \$1, $inl
    721     jne 2b
    722 
    723     add $t0, $acc0
    724     adc $t1, $acc1
    725     adc \$1, $acc2\n";
    726     &poly_mul(); $code.="
    727 
    728 open_sse_finalize:\n";
    729     &poly_add($len_store);
    730     &poly_mul(); $code.="
    731     # Final reduce
    732     mov $acc0, $t0
    733     mov $acc1, $t1
    734     mov $acc2, $t2
    735     sub \$-5, $acc0
    736     sbb \$-1, $acc1
    737     sbb \$3, $acc2
    738     cmovc $t0, $acc0
    739     cmovc $t1, $acc1
    740     cmovc $t2, $acc2
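    # The sub/sbb chain above computes acc - (2^130 - 5) using the immediates
    # -5, -1 and 3. A borrow (carry set) means acc was already below the
    # prime, so the cmovc sequence keeps the original limbs; otherwise the
    # subtracted value is the canonical result.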
    741     # Add in s part of the key
    742     add 0+$s_store, $acc0
    743     adc 8+$s_store, $acc1
    744 
    745     add \$288 + 32, %rsp
    746 .cfi_adjust_cfa_offset -(288 + 32)
    747     pop $keyp
    748 .cfi_adjust_cfa_offset -8
    749     movq $acc0, ($keyp)
    750     movq $acc1, 8($keyp)
    751 
    752     pop %r15
    753 .cfi_adjust_cfa_offset -8
    754     pop %r14
    755 .cfi_adjust_cfa_offset -8
    756     pop %r13
    757 .cfi_adjust_cfa_offset -8
    758     pop %r12
    759 .cfi_adjust_cfa_offset -8
    760     pop %rbx
    761 .cfi_adjust_cfa_offset -8
    762     pop %rbp
    763 .cfi_adjust_cfa_offset -8
    764     ret
    765 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
    766 ###############################################################################
    767 open_sse_128:
    768     movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
    769     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
    770     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
    771     movdqu 2*16($keyp), $D0
    772     movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
    773     movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
    774     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
    775     mov \$10, $acc0
    776 1:  \n";
    777         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    778         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
    779         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
    780         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
    781         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
    782         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    783     dec $acc0
    784     jnz 1b
    785     paddd .chacha20_consts(%rip), $A0
    786     paddd .chacha20_consts(%rip), $A1
    787     paddd .chacha20_consts(%rip), $A2
    788     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
    789     paddd $T2, $C1\npaddd $T2, $C2
    790     paddd $T3, $D1
    791     paddd .sse_inc(%rip), $T3
    792     paddd $T3, $D2
    793     # Clamp and store the key
    794     pand .clamp(%rip), $A0
    795     movdqa $A0, $r_store
    796     movdqa $B0, $s_store
    797     # Hash
    798     mov %r8, $itr2
    799     call poly_hash_ad_internal
    800 1:
    801         cmp \$16, $inl
    802         jb open_sse_tail_16
    803         sub \$16, $inl\n";
    804         # Load for hashing
    805         &poly_add("0*8($inp)"); $code.="
    806         # Load for decryption
    807         movdqu 0*16($inp), $T0
    808         pxor $T0, $A1
    809         movdqu $A1, 0*16($oup)
    810         lea 1*16($inp), $inp
    811         lea 1*16($oup), $oup\n";
    812         &poly_mul(); $code.="
    813         # Shift the stream left
    814         movdqa $B1, $A1
    815         movdqa $C1, $B1
    816         movdqa $D1, $C1
    817         movdqa $A2, $D1
    818         movdqa $B2, $A2
    819         movdqa $C2, $B2
    820         movdqa $D2, $C2
    821     jmp 1b
    822     jmp open_sse_tail_16
    823 .size chacha20_poly1305_open, .-chacha20_poly1305_open
    824 .cfi_endproc
    825 
    826 ################################################################################
    827 ################################################################################
    828 # void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
    829 .globl  chacha20_poly1305_seal
    830 .type chacha20_poly1305_seal,\@function,2
    831 .align 64
    832 chacha20_poly1305_seal:
    833 .cfi_startproc
    834     push %rbp
    835 .cfi_adjust_cfa_offset 8
    836     push %rbx
    837 .cfi_adjust_cfa_offset 8
    838     push %r12
    839 .cfi_adjust_cfa_offset 8
    840     push %r13
    841 .cfi_adjust_cfa_offset 8
    842     push %r14
    843 .cfi_adjust_cfa_offset 8
    844     push %r15
    845 .cfi_adjust_cfa_offset 8
    846     # We write the calculated authenticator back to keyp at the end, so save
    847     # the pointer on the stack too.
    848     push $keyp
    849 .cfi_adjust_cfa_offset 8
    850     sub \$288 + 32, %rsp
    851 .cfi_adjust_cfa_offset 288 + 32
    852 .cfi_offset rbp, -16
    853 .cfi_offset rbx, -24
    854 .cfi_offset r12, -32
    855 .cfi_offset r13, -40
    856 .cfi_offset r14, -48
    857 .cfi_offset r15, -56
    858     lea 32(%rsp), %rbp
    859     and \$-32, %rbp
    860     mov 56($keyp), $inl  # extra_in_len
    861     addq %rdx, $inl
    862     mov $inl, 8+$len_store
    863     mov %r8, 0+$len_store
    864     mov %rdx, $inl\n"; $code.="
    865     mov OPENSSL_ia32cap_P+8(%rip), %eax
    866     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
    867     xor \$`(1<<5) + (1<<8)`, %eax
    868     jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
    869 $code.="
    870     cmp \$128, $inl
    871     jbe seal_sse_128
    872     # For longer buffers, prepare the poly key + some stream
    873     movdqa .chacha20_consts(%rip), $A0
    874     movdqu 0*16($keyp), $B0
    875     movdqu 1*16($keyp), $C0
    876     movdqu 2*16($keyp), $D0
    877     movdqa $A0, $A1
    878     movdqa $A0, $A2
    879     movdqa $A0, $A3
    880     movdqa $B0, $B1
    881     movdqa $B0, $B2
    882     movdqa $B0, $B3
    883     movdqa $C0, $C1
    884     movdqa $C0, $C2
    885     movdqa $C0, $C3
    886     movdqa $D0, $D3
    887     paddd .sse_inc(%rip), $D0
    888     movdqa $D0, $D2
    889     paddd .sse_inc(%rip), $D0
    890     movdqa $D0, $D1
    891     paddd .sse_inc(%rip), $D0
    892     # Store on stack
    893     movdqa $B0, $state1_store
    894     movdqa $C0, $state2_store
    895     movdqa $D0, $ctr0_store
    896     movdqa $D1, $ctr1_store
    897     movdqa $D2, $ctr2_store
    898     movdqa $D3, $ctr3_store
    899     mov \$10, $acc0
    900 1:  \n";
    901         foreach $l (@loop_body) {$code.=$l."\n";}
    902         @loop_body = split /\n/, $chacha_body; $code.="
    903         dec $acc0
    904     jnz 1b\n";
    905     &finalize_state(4); $code.="
    906     # Clamp and store the key
    907     pand .clamp(%rip), $A3
    908     movdqa $A3, $r_store
    909     movdqa $B3, $s_store
    910     # Hash
    911     mov %r8, $itr2
    912     call poly_hash_ad_internal\n";
    913     &xor_stream($A2,$B2,$C2,$D2,"0*16");
    914     &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
    915     cmp \$12*16, $inl
    916     ja 1f
    917     mov \$8*16, $itr1
    918     sub \$8*16, $inl
    919     lea 8*16($inp), $inp
    920     jmp seal_sse_128_seal_hash
    921 1:  \n";
    922     &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
    923     mov \$12*16, $itr1
    924     sub \$12*16, $inl
    925     lea 12*16($inp), $inp
    926     mov \$2, $itr1
    927     mov \$8, $itr2
    928     cmp \$4*16, $inl
    929     jbe seal_sse_tail_64
    930     cmp \$8*16, $inl
    931     jbe seal_sse_tail_128
    932     cmp \$12*16, $inl
    933     jbe seal_sse_tail_192
    934 
    935 1:  \n";
    936     # The main loop
    937         &prep_state(4); $code.="
    938 2:  \n";
    939             &emit_body(20);
    940             &poly_add("0($oup)");
    941             &emit_body(20);
    942             &poly_stage1();
    943             &emit_body(20);
    944             &poly_stage2();
    945             &emit_body(20);
    946             &poly_stage3();
    947             &emit_body(20);
    948             &poly_reduce_stage();
    949             foreach $l (@loop_body) {$code.=$l."\n";}
    950             @loop_body = split /\n/, $chacha_body; $code.="
    951             lea 16($oup), $oup
    952             dec $itr2
    953         jge 2b\n";
    954             &poly_add("0*8($oup)");
    955             &poly_mul(); $code.="
    956             lea 16($oup), $oup
    957             dec $itr1
    958         jg 2b\n";
    959 
    960         &finalize_state(4);$code.="
    961         movdqa $D2, $tmp_store\n";
    962         &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
    963         movdqa $tmp_store, $D2\n";
    964         &xor_stream($A2,$B2,$C2,$D2, 4*16);
    965         &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
    966         cmp \$16*16, $inl
    967         ja 3f
    968 
    969         mov \$12*16, $itr1
    970         sub \$12*16, $inl
    971         lea 12*16($inp), $inp
    972         jmp seal_sse_128_seal_hash
    973 3:  \n";
    974         &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
    975         lea 16*16($inp), $inp
    976         sub \$16*16, $inl
    977         mov \$6, $itr1
    978         mov \$4, $itr2
    979         cmp \$12*16, $inl
    980     jg 1b
    981     mov $inl, $itr1
    982     test $inl, $inl
    983     je seal_sse_128_seal_hash
    984     mov \$6, $itr1
    985     cmp \$4*16, $inl
    986     jg 3f
    987 ###############################################################################
    988 seal_sse_tail_64:\n";
    989     &prep_state(1); $code.="
    990 1:  \n";
    991         &poly_add("0($oup)");
    992         &poly_mul(); $code.="
    993         lea 16($oup), $oup
    994 2:  \n";
    995         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
    996         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
    997         &poly_add("0($oup)");
    998         &poly_mul(); $code.="
    999         lea 16($oup), $oup
   1000     dec $itr1
   1001     jg 1b
   1002     dec $itr2
   1003     jge 2b\n";
   1004     &finalize_state(1); $code.="
   1005     jmp seal_sse_128_seal
   1006 3:
   1007     cmp \$8*16, $inl
   1008     jg 3f
   1009 ###############################################################################
   1010 seal_sse_tail_128:\n";
   1011     &prep_state(2); $code.="
   1012 1:  \n";
   1013         &poly_add("0($oup)");
   1014         &poly_mul(); $code.="
   1015         lea 16($oup), $oup
   1016 2:  \n";
   1017         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
   1018         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
   1019         &poly_add("0($oup)");
   1020         &poly_mul();
   1021         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
   1022         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
   1023         lea 16($oup), $oup
   1024     dec $itr1
   1025     jg 1b
   1026     dec $itr2
   1027     jge 2b\n";
   1028     &finalize_state(2);
   1029     &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
   1030     mov \$4*16, $itr1
   1031     sub \$4*16, $inl
   1032     lea 4*16($inp), $inp
   1033     jmp seal_sse_128_seal_hash
   1034 3:
   1035 ###############################################################################
   1036 seal_sse_tail_192:\n";
   1037     &prep_state(3); $code.="
   1038 1:  \n";
   1039         &poly_add("0($oup)");
   1040         &poly_mul(); $code.="
   1041         lea 16($oup), $oup
   1042 2:  \n";
   1043         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
   1044         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
   1045         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
   1046         &poly_add("0($oup)");
   1047         &poly_mul();
   1048         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
   1049         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
   1050         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   1051         lea 16($oup), $oup
   1052     dec $itr1
   1053     jg 1b
   1054     dec $itr2
   1055     jge 2b\n";
   1056     &finalize_state(3);
   1057     &xor_stream($A2,$B2,$C2,$D2,0*16);
   1058     &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
   1059     mov \$8*16, $itr1
   1060     sub \$8*16, $inl
   1061     lea 8*16($inp), $inp
   1062 ###############################################################################
   1063 seal_sse_128_seal_hash:
   1064         cmp \$16, $itr1
   1065         jb seal_sse_128_seal\n";
   1066         &poly_add("0($oup)");
   1067         &poly_mul(); $code.="
   1068         sub \$16, $itr1
   1069         lea 16($oup), $oup
   1070     jmp seal_sse_128_seal_hash
   1071 
   1072 seal_sse_128_seal:
   1073         cmp \$16, $inl
   1074         jb seal_sse_tail_16
   1075         sub \$16, $inl
   1076         # Load for encryption
   1077         movdqu 0*16($inp), $T0
   1078         pxor $T0, $A0
   1079         movdqu $A0, 0*16($oup)
   1080         # Then hash
   1081         add 0*8($oup), $acc0
   1082         adc 1*8($oup), $acc1
   1083         adc \$1, $acc2
   1084         lea 1*16($inp), $inp
   1085         lea 1*16($oup), $oup\n";
   1086         &poly_mul(); $code.="
   1087         # Shift the stream left
   1088         movdqa $B0, $A0
   1089         movdqa $C0, $B0
   1090         movdqa $D0, $C0
   1091         movdqa $A1, $D0
   1092         movdqa $B1, $A1
   1093         movdqa $C1, $B1
   1094         movdqa $D1, $C1
   1095     jmp seal_sse_128_seal
   1096 
   1097 seal_sse_tail_16:
   1098     test $inl, $inl
   1099     jz process_blocks_of_extra_in
   1100     # We can only load the PT one byte at a time to avoid buffer overread
   1101     mov $inl, $itr2
   1102     mov $inl, $itr1
   1103     lea -1($inp, $inl), $inp
   1104     pxor $T3, $T3
   1105 1:
   1106         pslldq \$1, $T3
   1107         pinsrb \$0, ($inp), $T3
   1108         lea -1($inp), $inp
   1109         dec $itr1
   1110         jne 1b
   1111 
   1112     # XOR the keystream with the plaintext.
   1113     pxor $A0, $T3
   1114 
   1115     # Write ciphertext out, byte-by-byte.
   1116     movq $inl, $itr1
   1117     movdqu $T3, $A0
   1118 2:
   1119         pextrb \$0, $A0, ($oup)
   1120         psrldq \$1, $A0
   1121         add \$1, $oup
   1122         sub \$1, $itr1
   1123         jnz 2b
   1124 
   1125     # $T3 contains the final (partial, non-empty) block of ciphertext which
   1126     # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
   1127     # are valid. We need to fill it with extra_in bytes until full, or until we
   1128     # run out of bytes.
   1129     #
   1130     # $keyp points to the tag output, which is actually a struct with the
   1131     # extra_in pointer and length at offset 48.
   1132     movq 288+32(%rsp), $keyp
   1133     movq 56($keyp), $t1  # extra_in_len
   1134     movq 48($keyp), $t0  # extra_in
   1135     test $t1, $t1
   1136     jz process_partial_block  # Common case: no bytes of extra_in
   1137 
   1138     movq \$16, $t2
   1139     subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
   1140     cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
   1141                     # (note that AT&T syntax reverses the arguments)
   1142     jge load_extra_in
   1143     movq $t1, $t2
   1144 
   1145 load_extra_in:
   1146     # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
   1147     # into $T3. They are loaded in reverse order.
   1148     leaq -1($t0, $t2), $inp
   1149     # Update extra_in and extra_in_len to reflect the bytes that are about to
   1150     # be read.
   1151     addq $t2, $t0
   1152     subq $t2, $t1
   1153     movq $t0, 48($keyp)
   1154     movq $t1, 56($keyp)
   1155 
   1156     # Update $itr2, which is used to select the mask later on, to reflect the
   1157     # extra bytes about to be added.
   1158     addq $t2, $itr2
   1159 
   1160     # Load $t2 bytes of extra_in into $T2.
   1161     pxor $T2, $T2
   1162 3:
   1163         pslldq \$1, $T2
   1164         pinsrb \$0, ($inp), $T2
   1165         lea -1($inp), $inp
   1166         sub \$1, $t2
   1167         jnz 3b
   1168 
   1169     # Shift $T2 up the length of the remainder from the main encryption. Sadly,
   1170     # the shift for an XMM register has to be a constant, thus we loop to do
   1171     # this.
   1172     movq $inl, $t2
   1173 
   1174 4:
   1175         pslldq \$1, $T2
   1176         sub \$1, $t2
   1177         jnz 4b
   1178 
   1179     # Mask $T3 (the remainder from the main encryption) so that superfluous
   1180     # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
   1181     # disjoint and so we can merge them with an OR.
   1182     lea .and_masks(%rip), $t2
   1183     shl \$4, $inl
   1184     pand -16($t2, $inl), $T3
   1185 
   1186     # Merge $T2 into $T3, forming the remainder block.
   1187     por $T2, $T3
   1188 
   1189     # The block of ciphertext + extra_in is ready to be included in the
   1190     # Poly1305 state.
   1191     movq $T3, $t0
   1192     pextrq \$1, $T3, $t1
   1193     add $t0, $acc0
   1194     adc $t1, $acc1
   1195     adc \$1, $acc2\n";
   1196     &poly_mul(); $code.="
   1197 
   1198 process_blocks_of_extra_in:
   1199     # There may be additional bytes of extra_in to process.
   1200     movq 288+32(%rsp), $keyp
   1201     movq 48($keyp), $inp   # extra_in
   1202     movq 56($keyp), $itr2  # extra_in_len
   1203     movq $itr2, $itr1
   1204     shr \$4, $itr2         # number of blocks
   1205 
   1206 5:
   1207         jz process_extra_in_trailer\n";
   1208         &poly_add("0($inp)");
   1209         &poly_mul(); $code.="
   1210         leaq 16($inp), $inp
   1211         subq \$1, $itr2
   1212         jmp 5b
   1213 
   1214 process_extra_in_trailer:
   1215     andq \$15, $itr1       # remaining num bytes (<16) of extra_in
   1216     movq $itr1, $inl
   1217     jz do_length_block
   1218     leaq -1($inp, $itr1), $inp
   1219 
   1220 6:
   1221         pslldq \$1, $T3
   1222         pinsrb \$0, ($inp), $T3
   1223         lea -1($inp), $inp
   1224         sub \$1, $itr1
   1225         jnz 6b
   1226 
   1227 process_partial_block:
   1228     # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
   1229     lea .and_masks(%rip), $t2
   1230     shl \$4, $inl
   1231     pand -16($t2, $inl), $T3
   1232     movq $T3, $t0
   1233     pextrq \$1, $T3, $t1
   1234     add $t0, $acc0
   1235     adc $t1, $acc1
   1236     adc \$1, $acc2\n";
   1237     &poly_mul(); $code.="
   1238 
   1239 do_length_block:\n";
   1240     &poly_add($len_store);
   1241     &poly_mul(); $code.="
   1242     # Final reduce
   1243     mov $acc0, $t0
   1244     mov $acc1, $t1
   1245     mov $acc2, $t2
   1246     sub \$-5, $acc0
   1247     sbb \$-1, $acc1
   1248     sbb \$3, $acc2
   1249     cmovc $t0, $acc0
   1250     cmovc $t1, $acc1
   1251     cmovc $t2, $acc2
   1252     # Add in s part of the key
   1253     add 0+$s_store, $acc0
   1254     adc 8+$s_store, $acc1
   1255 
   1256     add \$288 + 32, %rsp
   1257 .cfi_adjust_cfa_offset -(288 + 32)
   1258     pop $keyp
   1259 .cfi_adjust_cfa_offset -8
   1260     mov $acc0, 0*8($keyp)
   1261     mov $acc1, 1*8($keyp)
   1262 
   1263     pop %r15
   1264 .cfi_adjust_cfa_offset -8
   1265     pop %r14
   1266 .cfi_adjust_cfa_offset -8
   1267     pop %r13
   1268 .cfi_adjust_cfa_offset -8
   1269     pop %r12
   1270 .cfi_adjust_cfa_offset -8
   1271     pop %rbx
   1272 .cfi_adjust_cfa_offset -8
   1273     pop %rbp
   1274 .cfi_adjust_cfa_offset -8
   1275     ret
   1276 .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
   1277 ################################################################################
   1278 seal_sse_128:
   1279     movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
   1280     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
   1281     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
   1282     movdqu 2*16($keyp), $D2
   1283     movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
   1284     movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
   1285     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
   1286     mov \$10, $acc0
   1287 1:\n";
   1288         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
   1289         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
   1290         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
   1291         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
   1292         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
   1293         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   1294         dec $acc0
   1295     jnz 1b
   1296     paddd .chacha20_consts(%rip), $A0
   1297     paddd .chacha20_consts(%rip), $A1
   1298     paddd .chacha20_consts(%rip), $A2
   1299     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
   1300     paddd $T2, $C0\npaddd $T2, $C1
   1301     paddd $T3, $D0
   1302     paddd .sse_inc(%rip), $T3
   1303     paddd $T3, $D1
   1304     # Clamp and store the key
   1305     pand .clamp(%rip), $A2
   1306     movdqa $A2, $r_store
   1307     movdqa $B2, $s_store
   1308     # Hash
   1309     mov %r8, $itr2
   1310     call poly_hash_ad_internal
   1311     jmp seal_sse_128_seal
   1312 .size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
   1313 }
   1314 
   1315 # There should have been a cfi_endproc at the end of that function, but the two
   1316 # following blocks of code are jumped to without a stack frame and the CFI
   1317 # context which they are used in happens to match the CFI context at the end of
   1318 # the previous function. So the CFI table is just extended to the end of them.
   1319 
   1320 if ($avx>1) {
   1321 
   1322 ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
   1323 my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
   1324 ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
   1325 $state1_store="2*32(%rbp)";
   1326 $state2_store="3*32(%rbp)";
   1327 $tmp_store="4*32(%rbp)";
   1328 $ctr0_store="5*32(%rbp)";
   1329 $ctr1_store="6*32(%rbp)";
   1330 $ctr2_store="7*32(%rbp)";
   1331 $ctr3_store="8*32(%rbp)";
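# From here on the same symbolic names refer to the AVX2 versions of
# everything: the A/B/C/D variables are now ymm registers holding two ChaCha20
# blocks each (one per 128-bit lane), and the scratch slots are re-pointed at
# 32-byte stack locations.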
   1332 
   1333 sub chacha_qr_avx2 {
   1334 my ($a,$b,$c,$d,$t,$dir)=@_;
   1335 $code.=<<___ if ($dir =~ /store/);
   1336     vmovdqa $t, $tmp_store
   1337 ___
   1338 $code.=<<___;
   1339     vpaddd $b, $a, $a
   1340     vpxor $a, $d, $d
   1341     vpshufb .rol16(%rip), $d, $d
   1342     vpaddd $d, $c, $c
   1343     vpxor $c, $b, $b
   1344     vpsrld \$20, $b, $t
   1345     vpslld \$12, $b, $b
   1346     vpxor $t, $b, $b
   1347     vpaddd $b, $a, $a
   1348     vpxor $a, $d, $d
   1349     vpshufb .rol8(%rip), $d, $d
   1350     vpaddd $d, $c, $c
   1351     vpxor $c, $b, $b
   1352     vpslld \$7, $b, $t
   1353     vpsrld \$25, $b, $b
   1354     vpxor $t, $b, $b
   1355 ___
   1356 $code.=<<___ if ($dir =~ /left/);
   1357     vpalignr \$12, $d, $d, $d
   1358     vpalignr \$8, $c, $c, $c
   1359     vpalignr \$4, $b, $b, $b
   1360 ___
   1361 $code.=<<___ if ($dir =~ /right/);
   1362     vpalignr \$4, $d, $d, $d
   1363     vpalignr \$8, $c, $c, $c
   1364     vpalignr \$12, $b, $b, $b
   1365 ___
   1366 $code.=<<___ if ($dir =~ /load/);
   1367     vmovdqa $tmp_store, $t
   1368 ___
   1369 }
   1370 
   1371 sub prep_state_avx2 {
   1372 my ($n)=@_;
   1373 $code.=<<___;
   1374     vmovdqa .chacha20_consts(%rip), $A0
   1375     vmovdqa $state1_store, $B0
   1376     vmovdqa $state2_store, $C0
   1377 ___
   1378 $code.=<<___ if ($n ge 2);
   1379     vmovdqa $A0, $A1
   1380     vmovdqa $B0, $B1
   1381     vmovdqa $C0, $C1
   1382 ___
   1383 $code.=<<___ if ($n ge 3);
   1384     vmovdqa $A0, $A2
   1385     vmovdqa $B0, $B2
   1386     vmovdqa $C0, $C2
   1387 ___
   1388 $code.=<<___ if ($n ge 4);
   1389     vmovdqa $A0, $A3
   1390     vmovdqa $B0, $B3
   1391     vmovdqa $C0, $C3
   1392 ___
   1393 $code.=<<___ if ($n eq 1);
   1394     vmovdqa .avx2_inc(%rip), $D0
   1395     vpaddd $ctr0_store, $D0, $D0
   1396     vmovdqa $D0, $ctr0_store
   1397 ___
   1398 $code.=<<___ if ($n eq 2);
   1399     vmovdqa .avx2_inc(%rip), $D0
   1400     vpaddd $ctr0_store, $D0, $D1
   1401     vpaddd $D1, $D0, $D0
   1402     vmovdqa $D0, $ctr0_store
   1403     vmovdqa $D1, $ctr1_store
   1404 ___
   1405 $code.=<<___ if ($n eq 3);
   1406     vmovdqa .avx2_inc(%rip), $D0
   1407     vpaddd $ctr0_store, $D0, $D2
   1408     vpaddd $D2, $D0, $D1
   1409     vpaddd $D1, $D0, $D0
   1410     vmovdqa $D0, $ctr0_store
   1411     vmovdqa $D1, $ctr1_store
   1412     vmovdqa $D2, $ctr2_store
   1413 ___
   1414 $code.=<<___ if ($n eq 4);
   1415     vmovdqa .avx2_inc(%rip), $D0
   1416     vpaddd $ctr0_store, $D0, $D3
   1417     vpaddd $D3, $D0, $D2
   1418     vpaddd $D2, $D0, $D1
   1419     vpaddd $D1, $D0, $D0
   1420     vmovdqa $D3, $ctr3_store
   1421     vmovdqa $D2, $ctr2_store
   1422     vmovdqa $D1, $ctr1_store
   1423     vmovdqa $D0, $ctr0_store
   1424 ___
   1425 }
   1426 
   1427 sub finalize_state_avx2 {
   1428 my ($n)=@_;
   1429 $code.=<<___ if ($n eq 4);
   1430     vpaddd .chacha20_consts(%rip), $A3, $A3
   1431     vpaddd $state1_store, $B3, $B3
   1432     vpaddd $state2_store, $C3, $C3
   1433     vpaddd $ctr3_store, $D3, $D3
   1434 ___
   1435 $code.=<<___ if ($n ge 3);
   1436     vpaddd .chacha20_consts(%rip), $A2, $A2
   1437     vpaddd $state1_store, $B2, $B2
   1438     vpaddd $state2_store, $C2, $C2
   1439     vpaddd $ctr2_store, $D2, $D2
   1440 ___
   1441 $code.=<<___ if ($n ge 2);
   1442     vpaddd .chacha20_consts(%rip), $A1, $A1
   1443     vpaddd $state1_store, $B1, $B1
   1444     vpaddd $state2_store, $C1, $C1
   1445     vpaddd $ctr1_store, $D1, $D1
   1446 ___
   1447 $code.=<<___;
   1448     vpaddd .chacha20_consts(%rip), $A0, $A0
   1449     vpaddd $state1_store, $B0, $B0
   1450     vpaddd $state2_store, $C0, $C0
   1451     vpaddd $ctr0_store, $D0, $D0
   1452 ___
   1453 }
   1454 
   1455 sub xor_stream_avx2 {
   1456 my ($A, $B, $C, $D, $offset, $hlp)=@_;
   1457 $code.=<<___;
   1458     vperm2i128 \$0x02, $A, $B, $hlp
   1459     vperm2i128 \$0x13, $A, $B, $B
   1460     vperm2i128 \$0x02, $C, $D, $A
   1461     vperm2i128 \$0x13, $C, $D, $C
   1462     vpxor 0*32+$offset($inp), $hlp, $hlp
   1463     vpxor 1*32+$offset($inp), $A, $A
   1464     vpxor 2*32+$offset($inp), $B, $B
   1465     vpxor 3*32+$offset($inp), $C, $C
   1466     vmovdqu $hlp, 0*32+$offset($oup)
   1467     vmovdqu $A, 1*32+$offset($oup)
   1468     vmovdqu $B, 2*32+$offset($oup)
   1469     vmovdqu $C, 3*32+$offset($oup)
   1470 ___
   1471 }
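# In the AVX2 layout each A/B/C/D register carries the matching row of two
# consecutive blocks, one per 128-bit lane, so xor_stream_avx2 first uses
# vperm2i128 to regroup the rows into two contiguous 64-byte blocks before
# xoring them against the input and storing the result.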
   1472 
   1473 sub finish_stream_avx2 {
   1474 my ($A, $B, $C, $D, $hlp)=@_;
   1475 $code.=<<___;
   1476     vperm2i128 \$0x13, $A, $B, $hlp
   1477     vperm2i128 \$0x02, $A, $B, $A
   1478     vperm2i128 \$0x02, $C, $D, $B
   1479     vperm2i128 \$0x13, $C, $D, $D
   1480     vmovdqa $hlp, $C
   1481 ___
   1482 }
   1483 
   1484 sub poly_stage1_mulx {
   1485 $code.=<<___;
   1486     mov 0+$r_store, %rdx
   1487     mov %rdx, $t2
   1488     mulx $acc0, $t0, $t1
   1489     mulx $acc1, %rax, %rdx
   1490     imulq $acc2, $t2
   1491     add %rax, $t1
   1492     adc %rdx, $t2
   1493 ___
   1494 }
   1495 
   1496 sub poly_stage2_mulx {
   1497 $code.=<<___;
   1498     mov 8+$r_store, %rdx
   1499     mulx $acc0, $acc0, %rax
   1500     add $acc0, $t1
   1501     mulx $acc1, $acc1, $t3
   1502     adc $acc1, $t2
   1503     adc \$0, $t3
   1504     imulq $acc2, %rdx
   1505 ___
   1506 }
   1507 
   1508 sub poly_stage3_mulx {
   1509 $code.=<<___;
   1510     add %rax, $t2
   1511     adc %rdx, $t3
   1512 ___
   1513 }
   1514 
   1515 sub poly_mul_mulx {
   1516     &poly_stage1_mulx();
   1517     &poly_stage2_mulx();
   1518     &poly_stage3_mulx();
   1519     &poly_reduce_stage();
   1520 }
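# poly_mul_mulx is the BMI2 variant of poly_mul used on the AVX2 path: mulx
# takes one multiplicand implicitly from %rdx and does not touch the flags, so
# the multiply stages interleave more freely with the surrounding add/adc
# chains. The reduction step is shared with the SSE path.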
   1521 
   1522 sub gen_chacha_round_avx2 {
   1523 my ($rot1, $rot2, $shift)=@_;
   1524 my $round="";
   1525 $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
   1526 $round=$round ."vmovdqa $rot2, $C0
   1527                 vpaddd $B3, $A3, $A3
   1528                 vpaddd $B2, $A2, $A2
   1529                 vpaddd $B1, $A1, $A1
   1530                 vpaddd $B0, $A0, $A0
   1531                 vpxor $A3, $D3, $D3
   1532                 vpxor $A2, $D2, $D2
   1533                 vpxor $A1, $D1, $D1
   1534                 vpxor $A0, $D0, $D0
   1535                 vpshufb $C0, $D3, $D3
   1536                 vpshufb $C0, $D2, $D2
   1537                 vpshufb $C0, $D1, $D1
   1538                 vpshufb $C0, $D0, $D0
   1539                 vmovdqa $tmp_store, $C0
   1540                 vpaddd $D3, $C3, $C3
   1541                 vpaddd $D2, $C2, $C2
   1542                 vpaddd $D1, $C1, $C1
   1543                 vpaddd $D0, $C0, $C0
   1544                 vpxor $C3, $B3, $B3
   1545                 vpxor $C2, $B2, $B2
   1546                 vpxor $C1, $B1, $B1
   1547                 vpxor $C0, $B0, $B0
   1548                 vmovdqa $C0, $tmp_store
   1549                 vpsrld \$$rot1, $B3, $C0
   1550                 vpslld \$32-$rot1, $B3, $B3
   1551                 vpxor $C0, $B3, $B3
   1552                 vpsrld \$$rot1, $B2, $C0
   1553                 vpslld \$32-$rot1, $B2, $B2
   1554                 vpxor $C0, $B2, $B2
   1555                 vpsrld \$$rot1, $B1, $C0
   1556                 vpslld \$32-$rot1, $B1, $B1
   1557                 vpxor $C0, $B1, $B1
   1558                 vpsrld \$$rot1, $B0, $C0
   1559                 vpslld \$32-$rot1, $B0, $B0
   1560                 vpxor $C0, $B0, $B0\n";
   1561 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
   1562 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
   1563 $round=$round ."vmovdqa $tmp_store, $C0
   1564                 vpalignr \$$s1, $B3, $B3, $B3
   1565                 vpalignr \$$s2, $C3, $C3, $C3
   1566                 vpalignr \$$s3, $D3, $D3, $D3
   1567                 vpalignr \$$s1, $B2, $B2, $B2
   1568                 vpalignr \$$s2, $C2, $C2, $C2
   1569                 vpalignr \$$s3, $D2, $D2, $D2
   1570                 vpalignr \$$s1, $B1, $B1, $B1
   1571                 vpalignr \$$s2, $C1, $C1, $C1
   1572                 vpalignr \$$s3, $D1, $D1, $D1
   1573                 vpalignr \$$s1, $B0, $B0, $B0
   1574                 vpalignr \$$s2, $C0, $C0, $C0
   1575                 vpalignr \$$s3, $D0, $D0, $D0\n"
   1576 if (($shift =~ /left/) || ($shift =~ /right/));
   1577 return $round;
   1578 };
   1579 
   1580 $chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
   1581                &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
   1582                &gen_chacha_round_avx2(20, ".rol16(%rip)") .
   1583                &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
   1584 
   1585 @loop_body = split /\n/, $chacha_body;
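# The four generated half-rounds above form one full ChaCha20 double round
# (column round, diagonalize, diagonal round, un-diagonalize) across the four
# 2-block AVX2 states; the 20/25 shift pairs and the .rol16/.rol8 pshufb tables
# give the 12/7 and 16/8 bit rotations. The text is kept as individual lines in
# @loop_body so that emit_body() (defined earlier in this file) can drip a few
# vector instructions between each scalar Poly1305 stage in the main loops.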
   1586 
   1587 $code.="
   1588 ###############################################################################
   1589 .type chacha20_poly1305_open_avx2,\@function,2
   1590 .align 64
   1591 chacha20_poly1305_open_avx2:
   1592     vzeroupper
   1593     vmovdqa .chacha20_consts(%rip), $A0
   1594     vbroadcasti128 0*16($keyp), $B0
   1595     vbroadcasti128 1*16($keyp), $C0
   1596     vbroadcasti128 2*16($keyp), $D0
   1597     vpaddd .avx2_init(%rip), $D0, $D0
   1598     cmp \$6*32, $inl
   1599     jbe open_avx2_192
   1600     cmp \$10*32, $inl
   1601     jbe open_avx2_320
   1602 
   1603     vmovdqa $B0, $state1_store
   1604     vmovdqa $C0, $state2_store
   1605     vmovdqa $D0, $ctr0_store
   1606     mov \$10, $acc0
   1607 1:  \n";
   1608         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   1609         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
   1610         dec $acc0
   1611     jne 1b
   1612     vpaddd .chacha20_consts(%rip), $A0, $A0
   1613     vpaddd $state1_store, $B0, $B0
   1614     vpaddd $state2_store, $C0, $C0
   1615     vpaddd $ctr0_store, $D0, $D0
   1616 
   1617     vperm2i128 \$0x02, $A0, $B0, $T0
   1618     # Clamp and store key
   1619     vpand .clamp(%rip), $T0, $T0
   1620     vmovdqa $T0, $r_store
   1621     # Stream for the first 64 bytes
   1622     vperm2i128 \$0x13, $A0, $B0, $A0
   1623     vperm2i128 \$0x13, $C0, $D0, $B0
   1624     # Hash AD + first 64 bytes
   1625     mov %r8, $itr2
   1626     call poly_hash_ad_internal
   1627     xor $itr1, $itr1
   1628     # Hash first 64 bytes
   1629 1:  \n";
   1630        &poly_add("0($inp, $itr1)");
   1631        &poly_mul(); $code.="
   1632        add \$16, $itr1
   1633        cmp \$2*32, $itr1
   1634     jne 1b
   1635     # Decrypt first 64 bytes
   1636     vpxor 0*32($inp), $A0, $A0
   1637     vpxor 1*32($inp), $B0, $B0
   1638     vmovdqu $A0, 0*32($oup)
   1639     vmovdqu $B0, 1*32($oup)
   1640     lea 2*32($inp), $inp
   1641     lea 2*32($oup), $oup
   1642     sub \$2*32, $inl
   1643 1:
   1644         # Hash and decrypt 512 bytes each iteration
   1645         cmp \$16*32, $inl
   1646         jb 3f\n";
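        # Each iteration generates the next 512 bytes of keystream (four
        # 2-block states, ten double rounds) while Poly1305 absorbs the 512
        # bytes of ciphertext already sitting at $inp; the emit_body() and
        # poly_stage* calls below interleave the two so the vector and scalar
        # pipelines overlap.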
   1647         &prep_state_avx2(4); $code.="
   1648         xor $itr1, $itr1
   1649 2:  \n";
   1650             &poly_add("0*8($inp, $itr1)");
   1651             &emit_body(10);
   1652             &poly_stage1_mulx();
   1653             &emit_body(9);
   1654             &poly_stage2_mulx();
   1655             &emit_body(12);
   1656             &poly_stage3_mulx();
   1657             &emit_body(10);
   1658             &poly_reduce_stage();
   1659             &emit_body(9);
   1660             &poly_add("2*8($inp, $itr1)");
   1661             &emit_body(8);
   1662             &poly_stage1_mulx();
   1663             &emit_body(18);
   1664             &poly_stage2_mulx();
   1665             &emit_body(18);
   1666             &poly_stage3_mulx();
   1667             &emit_body(9);
   1668             &poly_reduce_stage();
   1669             &emit_body(8);
   1670             &poly_add("4*8($inp, $itr1)"); $code.="
   1671             lea 6*8($itr1), $itr1\n";
   1672             &emit_body(18);
   1673             &poly_stage1_mulx();
   1674             &emit_body(8);
   1675             &poly_stage2_mulx();
   1676             &emit_body(8);
   1677             &poly_stage3_mulx();
   1678             &emit_body(18);
   1679             &poly_reduce_stage();
   1680             foreach $l (@loop_body) {$code.=$l."\n";}
   1681             @loop_body = split /\n/, $chacha_body; $code.="
   1682             cmp \$10*6*8, $itr1
   1683         jne 2b\n";
   1684         &finalize_state_avx2(4); $code.="
   1685         vmovdqa $A0, $tmp_store\n";
   1686         &poly_add("10*6*8($inp)");
   1687         &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
   1688         vmovdqa $tmp_store, $A0\n";
   1689         &poly_mul();
   1690         &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
   1691         &poly_add("10*6*8+2*8($inp)");
   1692         &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
   1693         &poly_mul();
   1694         &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
   1695         lea 16*32($inp), $inp
   1696         lea 16*32($oup), $oup
   1697         sub \$16*32, $inl
   1698     jmp 1b
   1699 3:
   1700     test $inl, $inl
   1701     vzeroupper
   1702     je open_sse_finalize
   1703 3:
   1704     cmp \$4*32, $inl
   1705     ja 3f\n";
   1706 ###############################################################################
   1707     # 1-128 bytes left
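    # Generate one more pair of blocks while absorbing the remaining full
    # 16-byte ciphertext blocks (at most eight of them), then keep running
    # quarter-rounds until all ten double rounds are done; sub-16-byte
    # leftovers are handled later by the shared tail code.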
   1708     &prep_state_avx2(1); $code.="
   1709     xor $itr2, $itr2
   1710     mov $inl, $itr1
   1711     and \$-16, $itr1
   1712     test $itr1, $itr1
   1713     je 2f
   1714 1:  \n";
   1715         &poly_add("0*8($inp, $itr2)");
   1716         &poly_mul(); $code.="
   1717 2:
   1718         add \$16, $itr2\n";
   1719         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   1720         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
   1721         cmp $itr1, $itr2
   1722     jb 1b
   1723         cmp \$160, $itr2
   1724     jne 2b\n";
   1725     &finalize_state_avx2(1);
   1726     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
   1727     jmp open_avx2_tail_loop
   1728 3:
   1729     cmp \$8*32, $inl
   1730     ja 3f\n";
   1731 ###############################################################################
   1732     # 129-256 bytes left
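    # Two fresh 2-block states. Absorb up to min((inl-128)/16, 10) ciphertext
    # blocks while running the ten double rounds, then the loop at 1: below
    # catches Poly1305 up on any blocks the rounds did not cover.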
   1733     &prep_state_avx2(2); $code.="
   1734     mov $inl, $tmp_store
   1735     mov $inl, $itr1
   1736     sub \$4*32, $itr1
   1737     shr \$4, $itr1
   1738     mov \$10, $itr2
   1739     cmp \$10, $itr1
   1740     cmovg $itr2, $itr1
   1741     mov $inp, $inl
   1742     xor $itr2, $itr2
   1743 1:  \n";
   1744         &poly_add("0*8($inl)");
   1745         &poly_mul_mulx(); $code.="
   1746         lea 16($inl), $inl
   1747 2:  \n";
   1748         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   1749         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
   1750         inc $itr2\n";
   1751         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   1752         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   1753         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   1754         cmp $itr1, $itr2
   1755     jb 1b
   1756         cmp \$10, $itr2
   1757     jne 2b
   1758     mov $inl, $itr2
   1759     sub $inp, $inl
   1760     mov $inl, $itr1
   1761     mov $tmp_store, $inl
   1762 1:
   1763         add \$16, $itr1
   1764         cmp $inl, $itr1
   1765         jg 1f\n";
   1766         &poly_add("0*8($itr2)");
   1767         &poly_mul_mulx(); $code.="
   1768         lea 16($itr2), $itr2
   1769     jmp 1b
   1770 1:  \n";
   1771     &finalize_state_avx2(2);
   1772     &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
   1773     &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
   1774     lea 4*32($inp), $inp
   1775     lea 4*32($oup), $oup
   1776     sub \$4*32, $inl
   1777     jmp open_avx2_tail_loop
   1778 3:
   1779     cmp \$12*32, $inl
   1780     ja 3f\n";
   1781 ###############################################################################
    1782     # 257-384 bytes left
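    # Same scheme with three fresh states; here every double round also absorbs
    # an extra ciphertext block (the poly_add in the middle of the 2: body), and
    # the catch-up loop below handles whatever is left.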
   1783     &prep_state_avx2(3); $code.="
   1784     mov $inl, $tmp_store
   1785     mov $inl, $itr1
   1786     sub \$8*32, $itr1
   1787     shr \$4, $itr1
   1788     add \$6, $itr1
   1789     mov \$10, $itr2
   1790     cmp \$10, $itr1
   1791     cmovg $itr2, $itr1
   1792     mov $inp, $inl
   1793     xor $itr2, $itr2
   1794 1:  \n";
   1795         &poly_add("0*8($inl)");
   1796         &poly_mul_mulx(); $code.="
   1797         lea 16($inl), $inl
   1798 2:  \n";
   1799         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
   1800         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   1801         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   1802         &poly_add("0*8($inl)");
   1803         &poly_mul(); $code.="
   1804         lea 16($inl), $inl
   1805         inc $itr2\n";
   1806         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
   1807         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   1808         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
   1809         cmp $itr1, $itr2
   1810     jb 1b
   1811         cmp \$10, $itr2
   1812     jne 2b
   1813     mov $inl, $itr2
   1814     sub $inp, $inl
   1815     mov $inl, $itr1
   1816     mov $tmp_store, $inl
   1817 1:
   1818         add \$16, $itr1
   1819         cmp $inl, $itr1
   1820         jg 1f\n";
   1821         &poly_add("0*8($itr2)");
   1822         &poly_mul_mulx(); $code.="
   1823         lea 16($itr2), $itr2
   1824     jmp 1b
   1825 1:  \n";
   1826     &finalize_state_avx2(3);
   1827     &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
   1828     &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
   1829     &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
   1830     lea 8*32($inp), $inp
   1831     lea 8*32($oup), $oup
   1832     sub \$8*32, $inl
   1833     jmp open_avx2_tail_loop
   1834 3:  \n";
   1835 ###############################################################################
    1836     # 385-511 bytes left
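    # Four fresh states for the largest tail. The first four double rounds
    # absorb three Poly1305 blocks each and the remaining six absorb two
    # (384 bytes total); the loop at 1: then hashes any ciphertext beyond that
    # before the keystream is applied.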
   1837     &prep_state_avx2(4); $code.="
   1838     xor $itr1, $itr1
   1839     mov $inp, $itr2
   1840 1:  \n";
   1841         &poly_add("0*8($itr2)");
   1842         &poly_mul(); $code.="
   1843         lea 2*8($itr2), $itr2
   1844 2:  \n";
   1845         &emit_body(37);
   1846         &poly_add("0*8($itr2)");
   1847         &poly_mul_mulx();
   1848         &emit_body(48);
   1849         &poly_add("2*8($itr2)");
   1850         &poly_mul_mulx(); $code.="
   1851         lea 4*8($itr2), $itr2\n";
   1852         foreach $l (@loop_body) {$code.=$l."\n";}
   1853         @loop_body = split /\n/, $chacha_body; $code.="
   1854         inc $itr1
   1855         cmp \$4, $itr1
   1856     jl  1b
   1857         cmp \$10, $itr1
   1858     jne 2b
   1859     mov $inl, $itr1
   1860     sub \$12*32, $itr1
   1861     and \$-16, $itr1
   1862 1:
   1863         test $itr1, $itr1
   1864         je 1f\n";
   1865         &poly_add("0*8($itr2)");
   1866         &poly_mul_mulx(); $code.="
   1867         lea 2*8($itr2), $itr2
   1868         sub \$2*8, $itr1
   1869     jmp 1b
   1870 1:  \n";
   1871     &finalize_state_avx2(4); $code.="
   1872     vmovdqa $A0, $tmp_store\n";
   1873     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
   1874     vmovdqa $tmp_store, $A0\n";
   1875     &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
   1876     &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
   1877     &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
   1878     lea 12*32($inp), $inp
   1879     lea 12*32($oup), $oup
   1880     sub \$12*32, $inl
   1881 open_avx2_tail_loop:
   1882     cmp \$32, $inl
   1883     jb open_avx2_tail
   1884         sub \$32, $inl
   1885         vpxor ($inp), $A0, $A0
   1886         vmovdqu $A0, ($oup)
   1887         lea 1*32($inp), $inp
   1888         lea 1*32($oup), $oup
   1889         vmovdqa $B0, $A0
   1890         vmovdqa $C0, $B0
   1891         vmovdqa $D0, $C0
   1892     jmp open_avx2_tail_loop
   1893 open_avx2_tail:
   1894     cmp \$16, $inl
   1895     vmovdqa $A0x, $A1x
   1896     jb 1f
   1897     sub \$16, $inl
    1898     # Load and decrypt 16 bytes
   1899     vpxor ($inp), $A0x, $A1x
   1900     vmovdqu $A1x, ($oup)
   1901     lea 1*16($inp), $inp
   1902     lea 1*16($oup), $oup
   1903     vperm2i128 \$0x11, $A0, $A0, $A0
   1904     vmovdqa $A0x, $A1x
   1905 1:
   1906     vzeroupper
   1907     jmp open_sse_tail_16
   1908 ###############################################################################
   1909 open_avx2_192:
   1910     vmovdqa $A0, $A1
   1911     vmovdqa $A0, $A2
   1912     vmovdqa $B0, $B1
   1913     vmovdqa $B0, $B2
   1914     vmovdqa $C0, $C1
   1915     vmovdqa $C0, $C2
   1916     vpaddd .avx2_inc(%rip), $D0, $D1
   1917     vmovdqa $D0, $T2
   1918     vmovdqa $D1, $T3
   1919     mov \$10, $acc0
   1920 1:  \n";
   1921         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   1922         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   1923         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   1924         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
   1925         dec $acc0
   1926     jne 1b
   1927     vpaddd $A2, $A0, $A0
   1928     vpaddd $A2, $A1, $A1
   1929     vpaddd $B2, $B0, $B0
   1930     vpaddd $B2, $B1, $B1
   1931     vpaddd $C2, $C0, $C0
   1932     vpaddd $C2, $C1, $C1
   1933     vpaddd $T2, $D0, $D0
   1934     vpaddd $T3, $D1, $D1
   1935     vperm2i128 \$0x02, $A0, $B0, $T0
   1936     # Clamp and store the key
   1937     vpand .clamp(%rip), $T0, $T0
   1938     vmovdqa $T0, $r_store
   1939     # Stream for up to 192 bytes
   1940     vperm2i128 \$0x13, $A0, $B0, $A0
   1941     vperm2i128 \$0x13, $C0, $D0, $B0
   1942     vperm2i128 \$0x02, $A1, $B1, $C0
   1943     vperm2i128 \$0x02, $C1, $D1, $D0
   1944     vperm2i128 \$0x13, $A1, $B1, $A1
   1945     vperm2i128 \$0x13, $C1, $D1, $B1
   1946 open_avx2_short:
   1947     mov %r8, $itr2
   1948     call poly_hash_ad_internal
   1949 open_avx2_hash_and_xor_loop:
   1950         cmp \$32, $inl
   1951         jb open_avx2_short_tail_32
   1952         sub \$32, $inl\n";
   1953         # Load + hash
   1954         &poly_add("0*8($inp)");
   1955         &poly_mul();
   1956         &poly_add("2*8($inp)");
   1957         &poly_mul(); $code.="
   1958         # Load + decrypt
   1959         vpxor ($inp), $A0, $A0
   1960         vmovdqu $A0, ($oup)
   1961         lea 1*32($inp), $inp
   1962         lea 1*32($oup), $oup
   1963         # Shift stream
   1964         vmovdqa $B0, $A0
   1965         vmovdqa $C0, $B0
   1966         vmovdqa $D0, $C0
   1967         vmovdqa $A1, $D0
   1968         vmovdqa $B1, $A1
   1969         vmovdqa $C1, $B1
   1970         vmovdqa $D1, $C1
   1971         vmovdqa $A2, $D1
   1972         vmovdqa $B2, $A2
   1973     jmp open_avx2_hash_and_xor_loop
   1974 open_avx2_short_tail_32:
   1975     cmp \$16, $inl
   1976     vmovdqa $A0x, $A1x
   1977     jb 1f
   1978     sub \$16, $inl\n";
   1979     &poly_add("0*8($inp)");
   1980     &poly_mul(); $code.="
   1981     vpxor ($inp), $A0x, $A3x
   1982     vmovdqu $A3x, ($oup)
   1983     lea 1*16($inp), $inp
   1984     lea 1*16($oup), $oup
   1985     vextracti128 \$1, $A0, $A1x
   1986 1:
   1987     vzeroupper
   1988     jmp open_sse_tail_16
   1989 ###############################################################################
   1990 open_avx2_320:
   1991     vmovdqa $A0, $A1
   1992     vmovdqa $A0, $A2
   1993     vmovdqa $B0, $B1
   1994     vmovdqa $B0, $B2
   1995     vmovdqa $C0, $C1
   1996     vmovdqa $C0, $C2
   1997     vpaddd .avx2_inc(%rip), $D0, $D1
   1998     vpaddd .avx2_inc(%rip), $D1, $D2
   1999     vmovdqa $B0, $T1
   2000     vmovdqa $C0, $T2
   2001     vmovdqa $D0, $ctr0_store
   2002     vmovdqa $D1, $ctr1_store
   2003     vmovdqa $D2, $ctr2_store
   2004     mov \$10, $acc0
   2005 1:  \n";
   2006         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2007         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   2008         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
   2009         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2010         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   2011         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   2012         dec $acc0
   2013     jne 1b
   2014     vpaddd .chacha20_consts(%rip), $A0, $A0
   2015     vpaddd .chacha20_consts(%rip), $A1, $A1
   2016     vpaddd .chacha20_consts(%rip), $A2, $A2
   2017     vpaddd $T1, $B0, $B0
   2018     vpaddd $T1, $B1, $B1
   2019     vpaddd $T1, $B2, $B2
   2020     vpaddd $T2, $C0, $C0
   2021     vpaddd $T2, $C1, $C1
   2022     vpaddd $T2, $C2, $C2
   2023     vpaddd $ctr0_store, $D0, $D0
   2024     vpaddd $ctr1_store, $D1, $D1
   2025     vpaddd $ctr2_store, $D2, $D2
   2026     vperm2i128 \$0x02, $A0, $B0, $T0
   2027     # Clamp and store the key
   2028     vpand .clamp(%rip), $T0, $T0
   2029     vmovdqa $T0, $r_store
   2030     # Stream for up to 320 bytes
   2031     vperm2i128 \$0x13, $A0, $B0, $A0
   2032     vperm2i128 \$0x13, $C0, $D0, $B0
   2033     vperm2i128 \$0x02, $A1, $B1, $C0
   2034     vperm2i128 \$0x02, $C1, $D1, $D0
   2035     vperm2i128 \$0x13, $A1, $B1, $A1
   2036     vperm2i128 \$0x13, $C1, $D1, $B1
   2037     vperm2i128 \$0x02, $A2, $B2, $C1
   2038     vperm2i128 \$0x02, $C2, $D2, $D1
   2039     vperm2i128 \$0x13, $A2, $B2, $A2
   2040     vperm2i128 \$0x13, $C2, $D2, $B2
   2041     jmp open_avx2_short
   2042 .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
   2043 ###############################################################################
   2044 ###############################################################################
   2045 .type chacha20_poly1305_seal_avx2,\@function,2
   2046 .align 64
   2047 chacha20_poly1305_seal_avx2:
   2048     vzeroupper
   2049     vmovdqa .chacha20_consts(%rip), $A0
   2050     vbroadcasti128 0*16($keyp), $B0
   2051     vbroadcasti128 1*16($keyp), $C0
   2052     vbroadcasti128 2*16($keyp), $D0
   2053     vpaddd .avx2_init(%rip), $D0, $D0
   2054     cmp \$6*32, $inl
   2055     jbe seal_avx2_192
   2056     cmp \$10*32, $inl
   2057     jbe seal_avx2_320
   2058     vmovdqa $A0, $A1
   2059     vmovdqa $A0, $A2
   2060     vmovdqa $A0, $A3
   2061     vmovdqa $B0, $B1
   2062     vmovdqa $B0, $B2
   2063     vmovdqa $B0, $B3
   2064     vmovdqa $B0, $state1_store
   2065     vmovdqa $C0, $C1
   2066     vmovdqa $C0, $C2
   2067     vmovdqa $C0, $C3
   2068     vmovdqa $C0, $state2_store
   2069     vmovdqa $D0, $D3
   2070     vpaddd .avx2_inc(%rip), $D3, $D2
   2071     vpaddd .avx2_inc(%rip), $D2, $D1
   2072     vpaddd .avx2_inc(%rip), $D1, $D0
   2073     vmovdqa $D0, $ctr0_store
   2074     vmovdqa $D1, $ctr1_store
   2075     vmovdqa $D2, $ctr2_store
   2076     vmovdqa $D3, $ctr3_store
   2077     mov \$10, $acc0
   2078 1:  \n";
   2079         foreach $l (@loop_body) {$code.=$l."\n";}
   2080         @loop_body = split /\n/, $chacha_body; $code.="
   2081         dec $acc0
   2082         jnz 1b\n";
   2083     &finalize_state_avx2(4); $code.="
   2084     vperm2i128 \$0x13, $C3, $D3, $C3
   2085     vperm2i128 \$0x02, $A3, $B3, $D3
   2086     vperm2i128 \$0x13, $A3, $B3, $A3
   2087     vpand .clamp(%rip), $D3, $D3
   2088     vmovdqa $D3, $r_store
   2089     mov %r8, $itr2
   2090     call poly_hash_ad_internal
    2091     # Nothing has been absorbed by Poly1305 yet, so XOR and store the first 320 bytes plainly rather than through the interleaved encrypt/hash path
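    # The first 32 bytes of keystream became the Poly1305 key (and the adjacent
    # 32 bytes of that block are discarded), so this first 512-byte batch yields
    # only 448 usable bytes: 320 are consumed here and 128 stay in registers.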
   2092     vpxor 0*32($inp), $A3, $A3
   2093     vpxor 1*32($inp), $C3, $C3
   2094     vmovdqu $A3, 0*32($oup)
   2095     vmovdqu $C3, 1*32($oup)\n";
   2096     &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
   2097     &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
   2098     &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
   2099     lea 10*32($inp), $inp
   2100     sub \$10*32, $inl
   2101     mov \$10*32, $itr1
   2102     cmp \$4*32, $inl
   2103     jbe seal_avx2_hash
   2104     vpxor 0*32($inp), $A0, $A0
   2105     vpxor 1*32($inp), $B0, $B0
   2106     vpxor 2*32($inp), $C0, $C0
   2107     vpxor 3*32($inp), $D0, $D0
   2108     vmovdqu $A0, 10*32($oup)
   2109     vmovdqu $B0, 11*32($oup)
   2110     vmovdqu $C0, 12*32($oup)
   2111     vmovdqu $D0, 13*32($oup)
   2112     lea 4*32($inp), $inp
   2113     sub \$4*32, $inl
   2114     mov \$8, $itr1
   2115     mov \$2, $itr2
   2116     cmp \$4*32, $inl
   2117     jbe seal_avx2_tail_128
   2118     cmp \$8*32, $inl
   2119     jbe seal_avx2_tail_256
   2120     cmp \$12*32, $inl
   2121     jbe seal_avx2_tail_384
   2122     cmp \$16*32, $inl
   2123     jbe seal_avx2_tail_512\n";
    2124     # Only 448 bytes of ciphertext have been produced so far, but the main loop hashes 512 bytes per iteration; perform part of an iteration before entering the main loop
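    # Entering the loop at label 4: with $itr1 = 9 skips one Poly1305 block on
    # the first pass, and backing $oup up by 16 bytes beforehand makes that
    # first pass absorb exactly the 448 bytes of ciphertext written so far;
    # later iterations hash 512 bytes while generating the next 512.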
   2125     &prep_state_avx2(4);
   2126     foreach $l (@loop_body) {$code.=$l."\n";}
   2127     @loop_body = split /\n/, $chacha_body;
   2128     &emit_body(41);
   2129     @loop_body = split /\n/, $chacha_body; $code.="
   2130     sub \$16, $oup
   2131     mov \$9, $itr1
   2132     jmp 4f
   2133 1:  \n";
   2134         &prep_state_avx2(4); $code.="
   2135         mov \$10, $itr1
   2136 2:  \n";
   2137             &poly_add("0*8($oup)");
   2138             &emit_body(10);
   2139             &poly_stage1_mulx();
   2140             &emit_body(9);
   2141             &poly_stage2_mulx();
   2142             &emit_body(12);
   2143             &poly_stage3_mulx();
   2144             &emit_body(10);
   2145             &poly_reduce_stage(); $code.="
   2146 4:  \n";
   2147             &emit_body(9);
   2148             &poly_add("2*8($oup)");
   2149             &emit_body(8);
   2150             &poly_stage1_mulx();
   2151             &emit_body(18);
   2152             &poly_stage2_mulx();
   2153             &emit_body(18);
   2154             &poly_stage3_mulx();
   2155             &emit_body(9);
   2156             &poly_reduce_stage();
   2157             &emit_body(8);
   2158             &poly_add("4*8($oup)"); $code.="
   2159             lea 6*8($oup), $oup\n";
   2160             &emit_body(18);
   2161             &poly_stage1_mulx();
   2162             &emit_body(8);
   2163             &poly_stage2_mulx();
   2164             &emit_body(8);
   2165             &poly_stage3_mulx();
   2166             &emit_body(18);
   2167             &poly_reduce_stage();
   2168             foreach $l (@loop_body) {$code.=$l."\n";}
   2169             @loop_body = split /\n/, $chacha_body; $code.="
   2170             dec $itr1
   2171         jne 2b\n";
   2172         &finalize_state_avx2(4); $code.="
   2173         lea 4*8($oup), $oup
   2174         vmovdqa $A0, $tmp_store\n";
   2175         &poly_add("-4*8($oup)");
   2176         &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
   2177         vmovdqa $tmp_store, $A0\n";
   2178         &poly_mul();
   2179         &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
   2180         &poly_add("-2*8($oup)");
   2181         &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
   2182         &poly_mul();
   2183         &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
   2184         lea 16*32($inp), $inp
   2185         sub \$16*32, $inl
   2186         cmp \$16*32, $inl
   2187     jg 1b\n";
   2188     &poly_add("0*8($oup)");
   2189     &poly_mul();
   2190     &poly_add("2*8($oup)");
   2191     &poly_mul(); $code.="
   2192     lea 4*8($oup), $oup
   2193     mov \$10, $itr1
   2194     xor $itr2, $itr2
   2195     cmp \$4*32, $inl
   2196     ja 3f
   2197 ###############################################################################
   2198 seal_avx2_tail_128:\n";
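    # In the seal tail routines below, encryption has run ahead of the hash:
    # on entry $itr1 counts double rounds that also absorb three lagging
    # ciphertext blocks and $itr2 counts rounds that absorb two, sized by the
    # callers so that the ten ChaCha20 double rounds bring Poly1305 exactly
    # level with the ciphertext already written at $oup.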
   2199     &prep_state_avx2(1); $code.="
   2200 1:  \n";
   2201         &poly_add("0($oup)");
   2202         &poly_mul(); $code.="
   2203         lea 2*8($oup), $oup
   2204 2:  \n";
   2205         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2206         &poly_add("0*8($oup)");
   2207         &poly_mul();
   2208         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2209         &poly_add("2*8($oup)");
   2210         &poly_mul(); $code.="
   2211         lea 4*8($oup), $oup
   2212         dec $itr1
   2213     jg 1b
   2214         dec $itr2
   2215     jge 2b\n";
   2216     &finalize_state_avx2(1);
   2217     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
   2218     jmp seal_avx2_short_loop
   2219 3:
   2220     cmp \$8*32, $inl
   2221     ja 3f
   2222 ###############################################################################
   2223 seal_avx2_tail_256:\n";
   2224     &prep_state_avx2(2); $code.="
   2225 1:  \n";
   2226         &poly_add("0($oup)");
   2227         &poly_mul(); $code.="
   2228         lea 2*8($oup), $oup
   2229 2:  \n";
   2230         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2231         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   2232         &poly_add("0*8($oup)");
   2233         &poly_mul();
   2234         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2235         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   2236         &poly_add("2*8($oup)");
   2237         &poly_mul(); $code.="
   2238         lea 4*8($oup), $oup
   2239         dec $itr1
   2240     jg 1b
   2241         dec $itr2
   2242     jge 2b\n";
   2243     &finalize_state_avx2(2);
   2244     &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
   2245     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
   2246     mov \$4*32, $itr1
   2247     lea 4*32($inp), $inp
   2248     sub \$4*32, $inl
   2249     jmp seal_avx2_hash
   2250 3:
   2251     cmp \$12*32, $inl
   2252     ja seal_avx2_tail_512
   2253 ###############################################################################
   2254 seal_avx2_tail_384:\n";
   2255     &prep_state_avx2(3); $code.="
   2256 1:  \n";
   2257         &poly_add("0($oup)");
   2258         &poly_mul(); $code.="
   2259         lea 2*8($oup), $oup
   2260 2:  \n";
   2261         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2262         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   2263         &poly_add("0*8($oup)");
   2264         &poly_mul();
   2265         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
   2266         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2267         &poly_add("2*8($oup)");
   2268         &poly_mul();
   2269         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   2270         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   2271         lea 4*8($oup), $oup
   2272         dec $itr1
   2273     jg 1b
   2274         dec $itr2
   2275     jge 2b\n";
   2276     &finalize_state_avx2(3);
   2277     &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
   2278     &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
   2279     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
   2280     mov \$8*32, $itr1
   2281     lea 8*32($inp), $inp
   2282     sub \$8*32, $inl
   2283     jmp seal_avx2_hash
   2284 ###############################################################################
   2285 seal_avx2_tail_512:\n";
   2286     &prep_state_avx2(4); $code.="
   2287 1:  \n";
   2288         &poly_add("0($oup)");
   2289         &poly_mul_mulx(); $code.="
   2290         lea 2*8($oup), $oup
   2291 2:  \n";
   2292         &emit_body(20);
   2293         &poly_add("0*8($oup)");
   2294         &emit_body(20);
   2295         &poly_stage1_mulx();
   2296         &emit_body(20);
   2297         &poly_stage2_mulx();
   2298         &emit_body(20);
   2299         &poly_stage3_mulx();
   2300         &emit_body(20);
   2301         &poly_reduce_stage();
   2302         &emit_body(20);
   2303         &poly_add("2*8($oup)");
   2304         &emit_body(20);
   2305         &poly_stage1_mulx();
   2306         &emit_body(20);
   2307         &poly_stage2_mulx();
   2308         &emit_body(20);
   2309         &poly_stage3_mulx();
   2310         &emit_body(20);
   2311         &poly_reduce_stage();
   2312         foreach $l (@loop_body) {$code.=$l."\n";}
   2313         @loop_body = split /\n/, $chacha_body; $code.="
   2314         lea 4*8($oup), $oup
   2315         dec $itr1
   2316     jg 1b
   2317         dec $itr2
   2318     jge 2b\n";
   2319     &finalize_state_avx2(4); $code.="
   2320     vmovdqa $A0, $tmp_store\n";
   2321     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
   2322     vmovdqa $tmp_store, $A0\n";
   2323     &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
   2324     &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
   2325     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
   2326     mov \$12*32, $itr1
   2327     lea 12*32($inp), $inp
   2328     sub \$12*32, $inl
   2329     jmp seal_avx2_hash
   2330 ################################################################################
   2331 seal_avx2_320:
   2332     vmovdqa $A0, $A1
   2333     vmovdqa $A0, $A2
   2334     vmovdqa $B0, $B1
   2335     vmovdqa $B0, $B2
   2336     vmovdqa $C0, $C1
   2337     vmovdqa $C0, $C2
   2338     vpaddd .avx2_inc(%rip), $D0, $D1
   2339     vpaddd .avx2_inc(%rip), $D1, $D2
   2340     vmovdqa $B0, $T1
   2341     vmovdqa $C0, $T2
   2342     vmovdqa $D0, $ctr0_store
   2343     vmovdqa $D1, $ctr1_store
   2344     vmovdqa $D2, $ctr2_store
   2345     mov \$10, $acc0
   2346 1:  \n";
   2347         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2348         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   2349         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
   2350         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2351         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
   2352         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
   2353         dec $acc0
   2354     jne 1b
   2355     vpaddd .chacha20_consts(%rip), $A0, $A0
   2356     vpaddd .chacha20_consts(%rip), $A1, $A1
   2357     vpaddd .chacha20_consts(%rip), $A2, $A2
   2358     vpaddd $T1, $B0, $B0
   2359     vpaddd $T1, $B1, $B1
   2360     vpaddd $T1, $B2, $B2
   2361     vpaddd $T2, $C0, $C0
   2362     vpaddd $T2, $C1, $C1
   2363     vpaddd $T2, $C2, $C2
   2364     vpaddd $ctr0_store, $D0, $D0
   2365     vpaddd $ctr1_store, $D1, $D1
   2366     vpaddd $ctr2_store, $D2, $D2
   2367     vperm2i128 \$0x02, $A0, $B0, $T0
   2368     # Clamp and store the key
   2369     vpand .clamp(%rip), $T0, $T0
   2370     vmovdqa $T0, $r_store
   2371     # Stream for up to 320 bytes
   2372     vperm2i128 \$0x13, $A0, $B0, $A0
   2373     vperm2i128 \$0x13, $C0, $D0, $B0
   2374     vperm2i128 \$0x02, $A1, $B1, $C0
   2375     vperm2i128 \$0x02, $C1, $D1, $D0
   2376     vperm2i128 \$0x13, $A1, $B1, $A1
   2377     vperm2i128 \$0x13, $C1, $D1, $B1
   2378     vperm2i128 \$0x02, $A2, $B2, $C1
   2379     vperm2i128 \$0x02, $C2, $D2, $D1
   2380     vperm2i128 \$0x13, $A2, $B2, $A2
   2381     vperm2i128 \$0x13, $C2, $D2, $B2
   2382     jmp seal_avx2_short
   2383 ################################################################################
   2384 seal_avx2_192:
   2385     vmovdqa $A0, $A1
   2386     vmovdqa $A0, $A2
   2387     vmovdqa $B0, $B1
   2388     vmovdqa $B0, $B2
   2389     vmovdqa $C0, $C1
   2390     vmovdqa $C0, $C2
   2391     vpaddd .avx2_inc(%rip), $D0, $D1
   2392     vmovdqa $D0, $T2
   2393     vmovdqa $D1, $T3
   2394     mov \$10, $acc0
   2395 1:  \n";
   2396         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
   2397         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
   2398         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
   2399         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
   2400         dec $acc0
   2401     jne 1b
   2402     vpaddd $A2, $A0, $A0
   2403     vpaddd $A2, $A1, $A1
   2404     vpaddd $B2, $B0, $B0
   2405     vpaddd $B2, $B1, $B1
   2406     vpaddd $C2, $C0, $C0
   2407     vpaddd $C2, $C1, $C1
   2408     vpaddd $T2, $D0, $D0
   2409     vpaddd $T3, $D1, $D1
   2410     vperm2i128 \$0x02, $A0, $B0, $T0
   2411     # Clamp and store the key
   2412     vpand .clamp(%rip), $T0, $T0
   2413     vmovdqa $T0, $r_store
   2414     # Stream for up to 192 bytes
   2415     vperm2i128 \$0x13, $A0, $B0, $A0
   2416     vperm2i128 \$0x13, $C0, $D0, $B0
   2417     vperm2i128 \$0x02, $A1, $B1, $C0
   2418     vperm2i128 \$0x02, $C1, $D1, $D0
   2419     vperm2i128 \$0x13, $A1, $B1, $A1
   2420     vperm2i128 \$0x13, $C1, $D1, $B1
   2421 seal_avx2_short:
   2422     mov %r8, $itr2
   2423     call poly_hash_ad_internal
   2424     xor $itr1, $itr1
   2425 seal_avx2_hash:
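        # $itr1 bytes of ciphertext at $oup have been written but not yet
        # absorbed by Poly1305; hash them 16 bytes at a time before emitting
        # any more output.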
   2426         cmp \$16, $itr1
   2427         jb seal_avx2_short_loop\n";
   2428         &poly_add("0($oup)");
   2429         &poly_mul(); $code.="
   2430         sub \$16, $itr1
   2431         add \$16, $oup
   2432     jmp seal_avx2_hash
   2433 seal_avx2_short_loop:
   2434         cmp \$32, $inl
   2435         jb seal_avx2_short_tail
   2436         sub \$32, $inl
   2437         # Encrypt
   2438         vpxor ($inp), $A0, $A0
   2439         vmovdqu $A0, ($oup)
   2440         lea 1*32($inp), $inp
   2441         # Load + hash\n";
   2442         &poly_add("0*8($oup)");
   2443         &poly_mul();
   2444         &poly_add("2*8($oup)");
   2445         &poly_mul(); $code.="
   2446         lea 1*32($oup), $oup
   2447         # Shift stream
   2448         vmovdqa $B0, $A0
   2449         vmovdqa $C0, $B0
   2450         vmovdqa $D0, $C0
   2451         vmovdqa $A1, $D0
   2452         vmovdqa $B1, $A1
   2453         vmovdqa $C1, $B1
   2454         vmovdqa $D1, $C1
   2455         vmovdqa $A2, $D1
   2456         vmovdqa $B2, $A2
   2457     jmp seal_avx2_short_loop
   2458 seal_avx2_short_tail:
   2459     cmp \$16, $inl
   2460     jb 1f
   2461     sub \$16, $inl
   2462     vpxor ($inp), $A0x, $A3x
   2463     vmovdqu $A3x, ($oup)
   2464     lea 1*16($inp), $inp\n";
   2465     &poly_add("0*8($oup)");
   2466     &poly_mul(); $code.="
   2467     lea 1*16($oup), $oup
   2468     vextracti128 \$1, $A0, $A0x
   2469 1:
   2470     vzeroupper
   2471     jmp seal_sse_tail_16
   2472 .cfi_endproc
   2473 ";
   2474 }
   2475 
   2476 if (!$win64) {
   2477   $code =~ s/\`([^\`]*)\`/eval $1/gem;
   2478   print $code;
   2479 } else {
   2480   print <<___;
   2481 .globl dummy_chacha20_poly1305_asm
   2482 .type dummy_chacha20_poly1305_asm,\@abi-omnipotent
   2483 dummy_chacha20_poly1305_asm:
   2484     ret
   2485 ___
   2486 }
   2487 
   2488 close STDOUT;
   2489