      1 #!/usr/bin/env perl
      2 
      3 # Copyright (c) 2017, Shay Gueron.
      4 # Copyright (c) 2017, Google Inc.
      5 #
      6 # Permission to use, copy, modify, and/or distribute this software for any
      7 # purpose with or without fee is hereby granted, provided that the above
      8 # copyright notice and this permission notice appear in all copies.
      9 #
     10 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     11 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     12 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     13 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     14 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
     15 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
     16 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
     17 
     18 use warnings FATAL => 'all';
     19 
     20 $flavour = shift;
     21 $output  = shift;
     22 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
     23 
     24 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     25 
     26 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     27 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
     28 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
     29 die "can't locate x86_64-xlate.pl";
     30 
     31 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
     32 *STDOUT=*OUT;
     33 
     34 $code.=<<___;
     35 .data
     36 
     37 .align 16
     38 one:
     39 .quad 1,0
     40 two:
     41 .quad 2,0
     42 three:
     43 .quad 3,0
     44 four:
     45 .quad 4,0
     46 five:
     47 .quad 5,0
     48 six:
     49 .quad 6,0
     50 seven:
     51 .quad 7,0
     52 eight:
     53 .quad 8,0
     54 
     55 OR_MASK:
     56 .long 0x00000000,0x00000000,0x00000000,0x80000000
     57 poly:
     58 .quad 0x1, 0xc200000000000000
     59 mask:
     60 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
     61 con1:
     62 .long 1,1,1,1
     63 con2:
     64 .long 0x1b,0x1b,0x1b,0x1b
     65 con3:
     66 .byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
     67 and_mask:
     68 .long 0,0xffffffff, 0xffffffff, 0xffffffff
     69 ___
     70 
     71 $code.=<<___;
     72 .text
     73 ___
     74 
     75 sub gfmul {
     76   #########################
     77   # a = T
     78   # b = TMP0 - remains unchanged
     79   # res = T
     80   # uses also TMP1,TMP2,TMP3,TMP4
     81   # __m128i GFMUL(__m128i A, __m128i B);
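  #
  # GFMUL multiplies two 128-bit field elements and reduces modulo the POLYVAL
  # polynomial x^128 + x^127 + x^126 + x^121 + 1 (the constant pair
  # 0x1, 0xc200000000000000 at poly(%rip)). The four vpclmulqdq instructions
  # form the 256-bit schoolbook product, and the two folding steps that follow
  # implicitly multiply by x^-128, so the routine returns A*B*x^-128 mod P,
  # i.e. the POLYVAL field multiplication of RFC 8452.
  #
  # A rough C-intrinsics sketch of the same computation (illustrative only and
  # not part of the generated code; gfmul_ref is a hypothetical name, and the
  # sketch assumes <immintrin.h> with PCLMUL support):
  #
  #   __m128i gfmul_ref(__m128i a, __m128i b) {
  #     const __m128i poly = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
  #     __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);
  #     __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);
  #     __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
  #                                 _mm_clmulepi64_si128(a, b, 0x01));
  #     lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
  #     hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
  #     for (int i = 0; i < 2; i++) {   /* two 64-bit folding steps */
  #       __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
  #       lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 0x4e), t);
  #     }
  #     return _mm_xor_si128(hi, lo);   /* A*B*x^-128 mod P */
  #   }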
     82 
     83   my $T = "%xmm0";
     84   my $TMP0 = "%xmm1";
     85   my $TMP1 = "%xmm2";
     86   my $TMP2 = "%xmm3";
     87   my $TMP3 = "%xmm4";
     88   my $TMP4 = "%xmm5";
     89 
     90   $code.=<<___;
     91 .type GFMUL,\@abi-omnipotent
     92 .align 16
     93 GFMUL:
     94 .cfi_startproc
     95     vpclmulqdq  \$0x00, $TMP0, $T, $TMP1
     96     vpclmulqdq  \$0x11, $TMP0, $T, $TMP4
     97     vpclmulqdq  \$0x10, $TMP0, $T, $TMP2
     98     vpclmulqdq  \$0x01, $TMP0, $T, $TMP3
     99     vpxor       $TMP3, $TMP2, $TMP2
    100     vpslldq     \$8, $TMP2, $TMP3
    101     vpsrldq     \$8, $TMP2, $TMP2
    102     vpxor       $TMP3, $TMP1, $TMP1
    103     vpxor       $TMP2, $TMP4, $TMP4
    104 
    105     vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
    106     vpshufd     \$78, $TMP1, $TMP3
    107     vpxor       $TMP3, $TMP2, $TMP1
    108 
    109     vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
    110     vpshufd     \$78, $TMP1, $TMP3
    111     vpxor       $TMP3, $TMP2, $TMP1
    112 
    113     vpxor       $TMP4, $TMP1, $T
    114     ret
    115 .cfi_endproc
    116 .size GFMUL, .-GFMUL
    117 ___
    118 }
    119 gfmul();
    120 
    121 sub aesgcmsiv_htable_init {
    122   # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
    123   # |out_htable|.
    124   # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
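  #
  # Entry i of the table holds H^(i+1), where exponentiation is with respect to
  # the POLYVAL multiplication implemented by GFMUL above. These powers let
  # aesgcmsiv_htable_polyval process eight blocks per iteration with a single
  # deferred reduction. A minimal C sketch of this routine (assuming the
  # gfmul_ref sketch above; not part of the generated code):
  #
  #   __m128i h = _mm_loadu_si128((const __m128i *)H), t = h;
  #   for (int i = 0; i < 8; i++) {
  #     _mm_storeu_si128((__m128i *)out_htable + i, t);  /* H^(i+1) */
  #     t = gfmul_ref(t, h);
  #   }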
    125 
    126   my $Htbl = "%rdi";
    127   my $H = "%rsi";
    128   my $T = "%xmm0";
    129   my $TMP0 = "%xmm1";
    130 
    131 $code.=<<___;
    132 .globl aesgcmsiv_htable_init
    133 .type aesgcmsiv_htable_init,\@function,2
    134 .align 16
    135 aesgcmsiv_htable_init:
    136 .cfi_startproc
    137     vmovdqa ($H), $T
    138     vmovdqa $T, $TMP0
    139     vmovdqa $T, ($Htbl)      # H
    140     call GFMUL
    141     vmovdqa $T, 16($Htbl)    # H^2
    142     call GFMUL
    143     vmovdqa $T, 32($Htbl)    # H^3
    144     call GFMUL
    145     vmovdqa $T, 48($Htbl)    # H^4
    146     call GFMUL
    147     vmovdqa $T, 64($Htbl)    # H^5
    148     call GFMUL
    149     vmovdqa $T, 80($Htbl)    # H^6
    150     call GFMUL
    151     vmovdqa $T, 96($Htbl)    # H^7
    152     call GFMUL
    153     vmovdqa $T, 112($Htbl)   # H^8
    154     ret
    155 .cfi_endproc
    156 .size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
    157 ___
    158 }
    159 aesgcmsiv_htable_init();
    160 
    161 sub aesgcmsiv_htable6_init {
    162   # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
    163   # |out_htable|.
    164   # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
    165   #
    166   my $Htbl = "%rdi";
    167   my $H = "%rsi";
    168   my $T = "%xmm0";
    169   my $TMP0 = "%xmm1";
    170 
    171   $code.=<<___;
    172 .globl aesgcmsiv_htable6_init
    173 .type aesgcmsiv_htable6_init,\@function,2
    174 .align 16
    175 aesgcmsiv_htable6_init:
    176 .cfi_startproc
    177     vmovdqa ($H), $T
    178     vmovdqa $T, $TMP0
    179     vmovdqa $T, ($Htbl)      # H
    180     call GFMUL
    181     vmovdqa $T, 16($Htbl)    # H^2
    182     call GFMUL
    183     vmovdqa $T, 32($Htbl)    # H^3
    184     call GFMUL
    185     vmovdqa $T, 48($Htbl)    # H^4
    186     call GFMUL
    187     vmovdqa $T, 64($Htbl)    # H^5
    188     call GFMUL
    189     vmovdqa $T, 80($Htbl)    # H^6
    190     ret
    191 .cfi_endproc
    192 .size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
    193 ___
    194 }
    195 aesgcmsiv_htable6_init();
    196 
    197 sub aesgcmsiv_htable_polyval {
    198   # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
    199   # parameter 1: %rdi     Htable  - pointer to Htable
    200   # parameter 2: %rsi     INp     - pointer to input
    201   # parameter 3: %rdx     LEN     - length of BUFFER in bytes
    202   # parameter 4: %rcx     T       - pointer to POLYVAL output
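  #
  # Strategy: the table from aesgcmsiv_htable_init holds H^1..H^8 (powers under
  # the POLYVAL multiplication), so each main-loop iteration folds eight message
  # blocks into the accumulator at once:
  #
  #   T <- ((T ^ X_1) * H^8) ^ (X_2 * H^7) ^ ... ^ (X_8 * H^1)
  #
  # where * denotes the POLYVAL field multiplication. The unreduced 256-bit
  # schoolbook products are accumulated in TMP0 (low halves), TMP1 (high halves)
  # and TMP2 (cross terms), and a single reduction per iteration is interleaved
  # with the multiplications to hide the vpclmulqdq latency.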
    203 
    204   my $DATA = "%xmm0";
    205   my $hlp0 = "%r11";
    206   my $Htbl = "%rdi";
    207   my $inp = "%rsi";
    208   my $len = "%rdx";
    209   my $TMP0 = "%xmm3";
    210   my $TMP1 = "%xmm4";
    211   my $TMP2 = "%xmm5";
    212   my $TMP3 = "%xmm6";
    213   my $TMP4 = "%xmm7";
    214   my $Tp = "%rcx";
    215   my $T = "%xmm1";
    216   my $Xhi = "%xmm9";
    217 
    218   my $SCHOOLBOOK_AAD = sub {
    219     my ($i)=@_;
    220     return <<___;
    221     vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
    222     vpxor $TMP3, $TMP2, $TMP2
    223     vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
    224     vpxor $TMP3, $TMP0, $TMP0
    225     vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
    226     vpxor $TMP3, $TMP1, $TMP1
    227     vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
    228     vpxor $TMP3, $TMP2, $TMP2
    229 ___
    230   };
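  # $SCHOOLBOOK_AAD->(i) multiplies the block in $DATA by the i-th table entry
  # (H^(i+1)) without reducing: the low halves accumulate in $TMP0, the high
  # halves in $TMP1 and the two cross products in $TMP2.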
    231 
    232   $code.=<<___;
    233 .globl aesgcmsiv_htable_polyval
    234 .type aesgcmsiv_htable_polyval,\@function,4
    235 .align 16
    236 aesgcmsiv_htable_polyval:
    237 .cfi_startproc
    238     test  $len, $len
    239     jnz   .Lhtable_polyval_start
    240     ret
    241 
    242 .Lhtable_polyval_start:
    243     vzeroall
    244 
    245     # We hash 8 blocks each iteration. If the total number of blocks is not a
    246     # multiple of 8, we first hash the leading n%8 blocks.
    247     movq $len, $hlp0
    248     andq \$127, $hlp0
    249 
    250     jz .Lhtable_polyval_no_prefix
    251 
    252     vpxor $Xhi, $Xhi, $Xhi
    253     vmovdqa ($Tp), $T
    254     sub $hlp0, $len
    255 
    256     sub \$16, $hlp0
    257 
    258     # hash first prefix block
    259     vmovdqu ($inp), $DATA
    260     vpxor $T, $DATA, $DATA
    261 
    262     vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
    263     vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
    264     vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
    265     vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    266     vpxor $TMP3, $TMP2, $TMP2
    267 
    268     lea 16($inp), $inp
    269     test $hlp0, $hlp0
    270     jnz .Lhtable_polyval_prefix_loop
    271     jmp .Lhtable_polyval_prefix_complete
    272 
    # hash remaining prefix blocks (up to 7 total prefix blocks)
    274 .align 64
    275 .Lhtable_polyval_prefix_loop:
    276     sub \$16, $hlp0
    277 
    278     vmovdqu ($inp), $DATA           # next data block
    279 
    280     vpclmulqdq  \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
    281     vpxor       $TMP3, $TMP0, $TMP0
    282     vpclmulqdq  \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
    283     vpxor       $TMP3, $TMP1, $TMP1
    284     vpclmulqdq  \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
    285     vpxor       $TMP3, $TMP2, $TMP2
    286     vpclmulqdq  \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    287     vpxor       $TMP3, $TMP2, $TMP2
    288 
    289     test $hlp0, $hlp0
    290 
    291     lea 16($inp), $inp
    292 
    293     jnz .Lhtable_polyval_prefix_loop
    294 
    295 .Lhtable_polyval_prefix_complete:
    296     vpsrldq \$8, $TMP2, $TMP3
    297     vpslldq \$8, $TMP2, $TMP2
    298 
    299     vpxor $TMP3, $TMP1, $Xhi
    300     vpxor $TMP2, $TMP0, $T
    301 
    302     jmp .Lhtable_polyval_main_loop
    303 
    304 .Lhtable_polyval_no_prefix:
    # At this point we know the number of blocks is a multiple of 8. However,
    # the reduction in the main loop includes a multiplication by x^(-128). In
    # order to counter this, the existing tag needs to be multiplied by x^128.
    # In practice, this just means that it is loaded into $Xhi, not $T.
    309     vpxor $T, $T, $T
    310     vmovdqa ($Tp), $Xhi
    311 
    312 .align 64
    313 .Lhtable_polyval_main_loop:
    314     sub \$0x80, $len
    315     jb .Lhtable_polyval_out
    316 
    317     vmovdqu 16*7($inp), $DATA      # Ii
    318 
    319     vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
    320     vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
    321     vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
    322     vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
    323     vpxor $TMP3, $TMP2, $TMP2
    324 
    325     #########################################################
    326     vmovdqu 16*6($inp), $DATA
    327     ${\$SCHOOLBOOK_AAD->(1)}
    328 
    329     #########################################################
    330     vmovdqu 16*5($inp), $DATA
    331 
    332     vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 1a
    333     vpalignr \$8, $T, $T, $T
    334 
    335     ${\$SCHOOLBOOK_AAD->(2)}
    336 
    337     vpxor $TMP4, $T, $T                              # reduction stage 1b
    338     #########################################################
    339     vmovdqu     16*4($inp), $DATA
    340 
    341     ${\$SCHOOLBOOK_AAD->(3)}
    342     #########################################################
    343     vmovdqu     16*3($inp), $DATA
    344 
    345     vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 2a
    346     vpalignr \$8, $T, $T, $T
    347 
    348     ${\$SCHOOLBOOK_AAD->(4)}
    349 
    350     vpxor $TMP4, $T, $T                              # reduction stage 2b
    351     #########################################################
    352     vmovdqu 16*2($inp), $DATA
    353 
    354     ${\$SCHOOLBOOK_AAD->(5)}
    355 
    356     vpxor $Xhi, $T, $T                               # reduction finalize
    357     #########################################################
    358     vmovdqu 16*1($inp), $DATA
    359 
    360     ${\$SCHOOLBOOK_AAD->(6)}
    361     #########################################################
    362     vmovdqu 16*0($inp), $DATA
    363     vpxor $T, $DATA, $DATA
    364 
    365     ${\$SCHOOLBOOK_AAD->(7)}
    366     #########################################################
    367     vpsrldq \$8, $TMP2, $TMP3
    368     vpslldq \$8, $TMP2, $TMP2
    369 
    370     vpxor $TMP3, $TMP1, $Xhi
    371     vpxor $TMP2, $TMP0, $T
    372 
    373     lea 16*8($inp), $inp
    374     jmp .Lhtable_polyval_main_loop
    375 
    376     #########################################################
    377 
    378 .Lhtable_polyval_out:
    379     vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
    380     vpalignr    \$8, $T, $T, $T
    381     vpxor       $TMP3, $T, $T
    382 
    383     vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
    384     vpalignr    \$8, $T, $T, $T
    385     vpxor       $TMP3, $T, $T
    386     vpxor       $Xhi, $T, $T
    387 
    388     vmovdqu $T, ($Tp)
    389     vzeroupper
    390     ret
    391 .cfi_endproc
    392 .size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
    393 ___
    394 }
    395 aesgcmsiv_htable_polyval();
    396 
    397 sub aesgcmsiv_polyval_horner {
  # void aesgcmsiv_polyval_horner(unsigned char T[16],    // output
  #                               const unsigned char* H, // H
  #                               unsigned char* BUF,     // input buffer
  #                               unsigned int blocks);   // number of 16-byte blocks
  #
# parameter 1: %rdi T - pointer to POLYVAL output
    404   # parameter 2: %rsi Hp - pointer to H (user key)
    405   # parameter 3: %rdx INp - pointer to input
    406   # parameter 4: %rcx L - total number of blocks in input BUFFER
    407   #
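# This is straightforward Horner evaluation: for each 16-byte block Xi,
# T = GFMUL(T ^ Xi, H). A minimal C sketch (assuming the gfmul_ref sketch
# above; not part of the generated code):
#
#   __m128i t = _mm_loadu_si128((const __m128i *)T);
#   __m128i h = _mm_loadu_si128((const __m128i *)H);
#   for (unsigned i = 0; i < blocks; i++) {
#     __m128i x = _mm_loadu_si128((const __m128i *)(BUF + 16 * i));
#     t = gfmul_ref(_mm_xor_si128(t, x), h);
#   }
#   _mm_storeu_si128((__m128i *)T, t);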
    408   my $T = "%rdi";
    409   my $Hp = "%rsi";
    410   my $INp = "%rdx";
    411   my $L = "%rcx";
    412   my $LOC = "%r10";
    413   my $LEN = "%eax";
    414   my $H = "%xmm1";
    415   my $RES = "%xmm0";
    416 
    417   $code.=<<___;
    418 .globl aesgcmsiv_polyval_horner
    419 .type aesgcmsiv_polyval_horner,\@function,4
    420 .align 16
    421 aesgcmsiv_polyval_horner:
    422 .cfi_startproc
    423     test $L, $L
    424     jnz .Lpolyval_horner_start
    425     ret
    426 
    427 .Lpolyval_horner_start:
    # Compute POLYVAL of the buffer with L GFMUL calls:
    # for each block Xi, RES = GFMUL(RES ^ Xi, H)
    430 
    431     xorq $LOC, $LOC
    432     shlq \$4, $L    # L contains number of bytes to process
    433 
    434     vmovdqa ($Hp), $H
    435     vmovdqa ($T), $RES
    436 
    437 .Lpolyval_horner_loop:
    438     vpxor ($INp,$LOC), $RES, $RES  # RES = RES + Xi
    439     call GFMUL  # RES = RES * H
    440 
    441     add \$16, $LOC
    442     cmp $LOC, $L
    443     jne .Lpolyval_horner_loop
    444 
    445     # calculation of T is complete. RES=T
    446     vmovdqa $RES, ($T)
    447     ret
    448 .cfi_endproc
    449 .size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
    450 ___
    451 }
    452 aesgcmsiv_polyval_horner();
    453 
    454 # void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
    455 # parameter 1: %rdi
    456 # parameter 2: %rsi
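#
# This is the standard AES-128 key expansion, vectorized with AES-NI: the mask
# constant broadcasts the rotated last word of the previous round key into all
# four dword lanes (RotWord); because every column is then equal, the ShiftRows
# step inside vaesenclast is a no-op, so vaesenclast yields SubWord(...) ^ rcon
# with the round constant kept in %xmm0 and doubled by vpslld $1 each round
# (1, 2, ..., 0x80, then 0x1b and 0x36 from con2 for the last two rounds). The
# three vpslldq/vpxor pairs compute the running XOR of the previous round key's
# words.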
    457 $code.=<<___;
    458 .globl aes128gcmsiv_aes_ks
    459 .type aes128gcmsiv_aes_ks,\@function,2
    460 .align 16
    461 aes128gcmsiv_aes_ks:
    462 .cfi_startproc
    463     vmovdqu (%rdi), %xmm1           # xmm1 = user key
    464     vmovdqa %xmm1, (%rsi)           # rsi points to output
    465 
    466     vmovdqa con1(%rip), %xmm0
    467     vmovdqa mask(%rip), %xmm15
    468 
    469     movq \$8, %rax
    470 
    471 .Lks128_loop:
    472     addq \$16, %rsi                 # rsi points for next key
    473     subq \$1, %rax
    474     vpshufb %xmm15, %xmm1, %xmm2    # xmm2 = shuffled user key
    475     vaesenclast %xmm0, %xmm2, %xmm2
    476     vpslld \$1, %xmm0, %xmm0
    477     vpslldq \$4, %xmm1, %xmm3
    478     vpxor %xmm3, %xmm1, %xmm1
    479     vpslldq \$4, %xmm3, %xmm3
    480     vpxor %xmm3, %xmm1, %xmm1
    481     vpslldq \$4, %xmm3, %xmm3
    482     vpxor %xmm3, %xmm1, %xmm1
    483     vpxor %xmm2, %xmm1, %xmm1
    484     vmovdqa %xmm1, (%rsi)
    485     jne .Lks128_loop
    486 
    487     vmovdqa con2(%rip), %xmm0
    488     vpshufb %xmm15, %xmm1, %xmm2
    489     vaesenclast %xmm0, %xmm2, %xmm2
    490     vpslld \$1, %xmm0, %xmm0
    491     vpslldq \$4, %xmm1, %xmm3
    492     vpxor %xmm3, %xmm1, %xmm1
    493     vpslldq \$4, %xmm3, %xmm3
    494     vpxor %xmm3, %xmm1, %xmm1
    495     vpslldq \$4, %xmm3, %xmm3
    496     vpxor %xmm3, %xmm1, %xmm1
    497     vpxor %xmm2, %xmm1, %xmm1
    498     vmovdqa %xmm1, 16(%rsi)
    499 
    500     vpshufb %xmm15, %xmm1, %xmm2
    501     vaesenclast %xmm0, %xmm2, %xmm2
    502     vpslldq \$4, %xmm1, %xmm3
    503     vpxor %xmm3, %xmm1, %xmm1
    504     vpslldq \$4, %xmm3, %xmm3
    505     vpxor %xmm3, %xmm1, %xmm1
    506     vpslldq \$4, %xmm3, %xmm3
    507     vpxor %xmm3, %xmm1, %xmm1
    508     vpxor %xmm2, %xmm1, %xmm1
    509     vmovdqa %xmm1, 32(%rsi)
    510     ret
    511 .cfi_endproc
    512 .size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
    513 ___
    514 
    515 # void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
    516 # parameter 1: %rdi
    517 # parameter 2: %rsi
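#
# AES-256 key expansion: each loop iteration derives two round keys. The first
# uses SubWord(RotWord(.)) ^ rcon via vpshufb/vaesenclast as in the 128-bit
# schedule above; the second uses SubWord only, implemented as vpshufd $0xff
# (broadcast the top word) followed by vaesenclast with a zero "round key" in
# %xmm14.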
    518 $code.=<<___;
    519 .globl aes256gcmsiv_aes_ks
    520 .type aes256gcmsiv_aes_ks,\@function,2
    521 .align 16
    522 aes256gcmsiv_aes_ks:
    523 .cfi_startproc
    524     vmovdqu (%rdi), %xmm1
    525     vmovdqu 16(%rdi), %xmm3
    526     vmovdqa %xmm1, (%rsi)
    527     vmovdqa %xmm3, 16(%rsi)
    528     vmovdqa con1(%rip), %xmm0
    529     vmovdqa mask(%rip), %xmm15
    530     vpxor %xmm14, %xmm14, %xmm14
    531     mov \$6, %rax
    532 
    533 .Lks256_loop:
    534     add \$32, %rsi
    535     subq \$1, %rax
    536     vpshufb %xmm15, %xmm3, %xmm2
    537     vaesenclast %xmm0, %xmm2, %xmm2
    538     vpslld \$1, %xmm0, %xmm0
    539     vpsllq \$32, %xmm1, %xmm4
    540     vpxor %xmm4, %xmm1, %xmm1
    541     vpshufb con3(%rip), %xmm1,  %xmm4
    542     vpxor %xmm4, %xmm1, %xmm1
    543     vpxor %xmm2, %xmm1, %xmm1
    544     vmovdqa %xmm1, (%rsi)
    545     vpshufd \$0xff, %xmm1, %xmm2
    546     vaesenclast %xmm14, %xmm2, %xmm2
    547     vpsllq \$32, %xmm3, %xmm4
    548     vpxor %xmm4, %xmm3, %xmm3
    549     vpshufb con3(%rip), %xmm3,  %xmm4
    550     vpxor %xmm4, %xmm3, %xmm3
    551     vpxor %xmm2, %xmm3, %xmm3
    552     vmovdqa %xmm3, 16(%rsi)
    553     jne .Lks256_loop
    554 
    555     vpshufb %xmm15, %xmm3, %xmm2
    556     vaesenclast %xmm0, %xmm2, %xmm2
    557     vpsllq \$32, %xmm1, %xmm4
    558     vpxor %xmm4, %xmm1, %xmm1
    559     vpshufb con3(%rip), %xmm1,  %xmm4
    560     vpxor %xmm4, %xmm1, %xmm1
    561     vpxor %xmm2, %xmm1, %xmm1
    562     vmovdqa %xmm1, 32(%rsi)
    563     ret
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
___
    566 
    567 sub aes128gcmsiv_aes_ks_enc_x1 {
    568   my $KS1_REGA = "%xmm1";
    569   my $KS1_REGB = "%xmm2";
    570   my $BLOCK1 = "%xmm4";
    571   my $AUXREG = "%xmm3";
    572 
    573   my $KS_BLOCK = sub {
    574     my ($reg, $reg2, $auxReg) = @_;
    575     return <<___;
    576     vpsllq \$32, $reg, $auxReg         #!!saving mov instruction to xmm3
    577     vpxor $auxReg, $reg, $reg
    578     vpshufb con3(%rip), $reg,  $auxReg
    579     vpxor $auxReg, $reg, $reg
    580     vpxor $reg2, $reg, $reg
    581 ___
    582   };
    583 
    584   my $round = sub {
    585     my ($i, $j) = @_;
    586     return <<___;
    587     vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
    588     vaesenclast %xmm0, %xmm2, %xmm2
    589     vpslld \$1, %xmm0, %xmm0
    590     ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
    591     vaesenc %xmm1, $BLOCK1, $BLOCK1
    592     vmovdqa %xmm1, ${\eval(16*$i)}($j)
    593 ___
    594   };
    595 
    596   my $roundlast = sub {
    597     my ($i, $j) = @_;
    598     return <<___;
    599     vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
    600     vaesenclast %xmm0, %xmm2, %xmm2
    601     ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
    602     vaesenclast %xmm1, $BLOCK1, $BLOCK1
    603     vmovdqa %xmm1, ${\eval(16*$i)}($j)
    604 ___
    605   };
    606 
# parameter 1: %rdi                         Pointer to PT
# parameter 2: %rsi                         Pointer to CT
# parameter 3: %rdx                         Pointer to keys
# parameter 4: %rcx                         Pointer to initial key
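#
# This routine fuses key expansion with the encryption of a single block: each
# round helper derives the next round key, stores it into the key schedule, and
# immediately uses it for one vaesenc/vaesenclast on the block loaded from PT.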
    611   $code.=<<___;
    612 .globl aes128gcmsiv_aes_ks_enc_x1
    613 .type aes128gcmsiv_aes_ks_enc_x1,\@function,4
    614 .align 16
    615 aes128gcmsiv_aes_ks_enc_x1:
    616 .cfi_startproc
    617     vmovdqa (%rcx), %xmm1                 # xmm1 = first 16 bytes of random key
    618     vmovdqa 0*16(%rdi), $BLOCK1
    619 
    620     vmovdqa %xmm1, (%rdx)                 # KEY[0] = first 16 bytes of random key
    621     vpxor %xmm1, $BLOCK1, $BLOCK1
    622 
    623     vmovdqa con1(%rip), %xmm0             # xmm0  = 1,1,1,1
    624     vmovdqa mask(%rip), %xmm15            # xmm15 = mask
    625 
    626     ${\$round->(1, "%rdx")}
    627     ${\$round->(2, "%rdx")}
    628     ${\$round->(3, "%rdx")}
    629     ${\$round->(4, "%rdx")}
    630     ${\$round->(5, "%rdx")}
    631     ${\$round->(6, "%rdx")}
    632     ${\$round->(7, "%rdx")}
    633     ${\$round->(8, "%rdx")}
    634 
    635     vmovdqa con2(%rip), %xmm0
    636 
    637     ${\$round->(9, "%rdx")}
    638     ${\$roundlast->(10, "%rdx")}
    639 
    640     vmovdqa $BLOCK1, 0*16(%rsi)
    641     ret
    642 .cfi_endproc
    643 .size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
    644 ___
    645 }
    646 aes128gcmsiv_aes_ks_enc_x1();
    647 
    648 sub aes128gcmsiv_kdf {
    649   my $BLOCK1 = "%xmm9";
    650   my $BLOCK2 = "%xmm10";
    651   my $BLOCK3 = "%xmm11";
    652   my $BLOCK4 = "%xmm12";
    653   my $BLOCK5 = "%xmm13";
    654   my $BLOCK6 = "%xmm14";
    655   my $ONE = "%xmm13";
    656   my $KSp = "%rdx";
    657   my $STATE_1 = "%xmm1";
    658 
    659   my $enc_roundx4 = sub {
    660     my ($i, $j) = @_;
    661     return <<___;
    662     vmovdqa ${\eval($i*16)}(%rdx), $j
    663     vaesenc $j, $BLOCK1, $BLOCK1
    664     vaesenc $j, $BLOCK2, $BLOCK2
    665     vaesenc $j, $BLOCK3, $BLOCK3
    666     vaesenc $j, $BLOCK4, $BLOCK4
    667 ___
    668   };
    669 
    670   my $enc_roundlastx4 = sub {
    671     my ($i, $j) = @_;
    672     return <<___;
    673     vmovdqa ${\eval($i*16)}(%rdx), $j
    674     vaesenclast $j, $BLOCK1, $BLOCK1
    675     vaesenclast $j, $BLOCK2, $BLOCK2
    676     vaesenclast $j, $BLOCK3, $BLOCK3
    677     vaesenclast $j, $BLOCK4, $BLOCK4
    678 ___
    679   };
    680 
    681 # void aes128gcmsiv_kdf(const uint8_t nonce[16],
    682 #                       uint8_t *out_key_material,
    683 #                       const uint8_t *key_schedule);
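#
# AES-GCM-SIV key derivation (RFC 8452): build little-endian counter blocks
# (counter || nonce) for counters 0..3 from the 16 bytes at |nonce| (vpshufd
# $0x90 shifts the nonce words up and and_mask clears the counter word), encrypt
# all four blocks under the key schedule, and write the 64 bytes of output; the
# caller assembles the record-authentication and record-encryption keys from
# halves of these blocks.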
    684   $code.=<<___;
    685 .globl aes128gcmsiv_kdf
    686 .type aes128gcmsiv_kdf,\@function,3
    687 .align 16
    688 aes128gcmsiv_kdf:
    689 .cfi_startproc
# parameter 1: %rdi                         Pointer to NONCE
# parameter 2: %rsi                         Pointer to output key material
# parameter 3: %rdx                         Pointer to keys
    693 
    694     vmovdqa (%rdx), %xmm1                  # xmm1 = first 16 bytes of random key
    695     vmovdqa 0*16(%rdi), $BLOCK1
    696     vmovdqa and_mask(%rip), $BLOCK4
    697     vmovdqa one(%rip), $ONE
    698     vpshufd \$0x90, $BLOCK1, $BLOCK1
    699     vpand $BLOCK4, $BLOCK1, $BLOCK1
    700     vpaddd $ONE, $BLOCK1, $BLOCK2
    701     vpaddd $ONE, $BLOCK2, $BLOCK3
    702     vpaddd $ONE, $BLOCK3, $BLOCK4
    703 
    704     vpxor %xmm1, $BLOCK1, $BLOCK1
    705     vpxor %xmm1, $BLOCK2, $BLOCK2
    706     vpxor %xmm1, $BLOCK3, $BLOCK3
    707     vpxor %xmm1, $BLOCK4, $BLOCK4
    708 
    709     ${\$enc_roundx4->(1, "%xmm1")}
    710     ${\$enc_roundx4->(2, "%xmm2")}
    711     ${\$enc_roundx4->(3, "%xmm1")}
    712     ${\$enc_roundx4->(4, "%xmm2")}
    713     ${\$enc_roundx4->(5, "%xmm1")}
    714     ${\$enc_roundx4->(6, "%xmm2")}
    715     ${\$enc_roundx4->(7, "%xmm1")}
    716     ${\$enc_roundx4->(8, "%xmm2")}
    717     ${\$enc_roundx4->(9, "%xmm1")}
    718     ${\$enc_roundlastx4->(10, "%xmm2")}
    719 
    720     vmovdqa $BLOCK1, 0*16(%rsi)
    721     vmovdqa $BLOCK2, 1*16(%rsi)
    722     vmovdqa $BLOCK3, 2*16(%rsi)
    723     vmovdqa $BLOCK4, 3*16(%rsi)
    724     ret
    725 .cfi_endproc
    726 .size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
    727 ___
    728 }
    729 aes128gcmsiv_kdf();
    730 
    731 sub aes128gcmsiv_enc_msg_x4 {
    732   my $CTR1 = "%xmm0";
    733   my $CTR2 = "%xmm1";
    734   my $CTR3 = "%xmm2";
    735   my $CTR4 = "%xmm3";
    736   my $ADDER = "%xmm4";
    737 
    738   my $STATE1 = "%xmm5";
    739   my $STATE2 = "%xmm6";
    740   my $STATE3 = "%xmm7";
    741   my $STATE4 = "%xmm8";
    742 
    743   my $TMP = "%xmm12";
    744   my $TMP2 = "%xmm13";
    745   my $TMP3 = "%xmm14";
    746   my $IV = "%xmm15";
    747 
    748   my $PT = "%rdi";
    749   my $CT = "%rsi";
    750   my $TAG = "%rdx";
    751   my $KS = "%rcx";
    752   my $LEN = "%r8";
    753 
    754   my $aes_round = sub {
    755     my ($i) = @_;
    756     return <<___;
    757     vmovdqu ${\eval($i*16)}($KS), $TMP
    758     vaesenc $TMP, $STATE1, $STATE1
    759     vaesenc $TMP, $STATE2, $STATE2
    760     vaesenc $TMP, $STATE3, $STATE3
    761     vaesenc $TMP, $STATE4, $STATE4
    762 ___
    763   };
    764 
    765   my $aes_lastround = sub {
    766     my ($i) = @_;
    767     return <<___;
    768     vmovdqu ${\eval($i*16)}($KS), $TMP
    769     vaesenclast $TMP, $STATE1, $STATE1
    770     vaesenclast $TMP, $STATE2, $STATE2
    771     vaesenclast $TMP, $STATE3, $STATE3
    772     vaesenclast $TMP, $STATE4, $STATE4
    773 ___
    774   };
    775 
    776 # void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
    777 #                              unsigned char* TAG, unsigned char* KS,
    778 #                              size_t byte_len);
    779 # parameter 1: %rdi     #PT
    780 # parameter 2: %rsi     #CT
    781 # parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
    782 # parameter 4: %rcx     #KS
    783 # parameter 5: %r8      #LEN MSG_length in bytes
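#
# Counter-mode encryption, four blocks at a time. The initial counter block is
# the tag with its most significant bit forced to 1 (OR_MASK), and only the low
# 32 bits are incremented (vpaddd with one/two/three/four). %r10 holds
# LEN mod 4 blocks, which are handled one at a time in the tail loop.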
    784   $code.=<<___;
    785 .globl aes128gcmsiv_enc_msg_x4
    786 .type aes128gcmsiv_enc_msg_x4,\@function,5
    787 .align 16
    788 aes128gcmsiv_enc_msg_x4:
    789 .cfi_startproc
    790     test $LEN, $LEN
    791     jnz .L128_enc_msg_x4_start
    792     ret
    793 
    794 .L128_enc_msg_x4_start:
    795     pushq %r12
    796 .cfi_push %r12
    797     pushq %r13
    798 .cfi_push %r13
    799 
    800     shrq \$4, $LEN      # LEN = num of blocks
    801     movq $LEN, %r10
    802     shlq \$62, %r10
    803     shrq \$62, %r10
    804 
    805     # make IV from TAG
    806     vmovdqa ($TAG), $IV
    807     vpor OR_MASK(%rip), $IV, $IV  #IV = [1]TAG[126...32][00..00]
    808 
    809     vmovdqu four(%rip), $ADDER     # Register to increment counters
    810     vmovdqa $IV, $CTR1             # CTR1 = TAG[1][127...32][00..00]
    811     vpaddd one(%rip), $IV, $CTR2   # CTR2 = TAG[1][127...32][00..01]
    812     vpaddd two(%rip), $IV, $CTR3   # CTR3 = TAG[1][127...32][00..02]
    813     vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
    814 
    815     shrq \$2, $LEN
    816     je .L128_enc_msg_x4_check_remainder
    817 
    818     subq \$64, $CT
    819     subq \$64, $PT
    820 
    821 .L128_enc_msg_x4_loop1:
    822     addq \$64, $CT
    823     addq \$64, $PT
    824 
    825     vmovdqa $CTR1, $STATE1
    826     vmovdqa $CTR2, $STATE2
    827     vmovdqa $CTR3, $STATE3
    828     vmovdqa $CTR4, $STATE4
    829 
    830     vpxor ($KS), $STATE1, $STATE1
    831     vpxor ($KS), $STATE2, $STATE2
    832     vpxor ($KS), $STATE3, $STATE3
    833     vpxor ($KS), $STATE4, $STATE4
    834 
    835     ${\$aes_round->(1)}
    836     vpaddd $ADDER, $CTR1, $CTR1
    837     ${\$aes_round->(2)}
    838     vpaddd $ADDER, $CTR2, $CTR2
    839     ${\$aes_round->(3)}
    840     vpaddd $ADDER, $CTR3, $CTR3
    841     ${\$aes_round->(4)}
    842     vpaddd $ADDER, $CTR4, $CTR4
    843 
    844     ${\$aes_round->(5)}
    845     ${\$aes_round->(6)}
    846     ${\$aes_round->(7)}
    847     ${\$aes_round->(8)}
    848     ${\$aes_round->(9)}
    849     ${\$aes_lastround->(10)}
    850 
    851     # XOR with Plaintext
    852     vpxor 0*16($PT), $STATE1, $STATE1
    853     vpxor 1*16($PT), $STATE2, $STATE2
    854     vpxor 2*16($PT), $STATE3, $STATE3
    855     vpxor 3*16($PT), $STATE4, $STATE4
    856 
    857     subq \$1, $LEN
    858 
    859     vmovdqu $STATE1, 0*16($CT)
    860     vmovdqu $STATE2, 1*16($CT)
    861     vmovdqu $STATE3, 2*16($CT)
    862     vmovdqu $STATE4, 3*16($CT)
    863 
    864     jne .L128_enc_msg_x4_loop1
    865 
    866     addq \$64,$CT
    867     addq \$64,$PT
    868 
    869 .L128_enc_msg_x4_check_remainder:
    870     cmpq \$0, %r10
    871     je .L128_enc_msg_x4_out
    872 
    873 .L128_enc_msg_x4_loop2:
    874     # enc each block separately
    # CTR1 holds the next counter value (even if the main loop was skipped)
    876     vmovdqa $CTR1, $STATE1
    877     vpaddd one(%rip), $CTR1, $CTR1  # inc counter
    878 
    879     vpxor ($KS), $STATE1, $STATE1
    880     vaesenc 16($KS), $STATE1, $STATE1
    881     vaesenc 32($KS), $STATE1, $STATE1
    882     vaesenc 48($KS), $STATE1, $STATE1
    883     vaesenc 64($KS), $STATE1, $STATE1
    884     vaesenc 80($KS), $STATE1, $STATE1
    885     vaesenc 96($KS), $STATE1, $STATE1
    886     vaesenc 112($KS), $STATE1, $STATE1
    887     vaesenc 128($KS), $STATE1, $STATE1
    888     vaesenc 144($KS), $STATE1, $STATE1
    889     vaesenclast 160($KS), $STATE1, $STATE1
    890 
    891     # XOR with plaintext
    892     vpxor ($PT), $STATE1, $STATE1
    893     vmovdqu $STATE1, ($CT)
    894 
    895     addq \$16, $PT
    896     addq \$16, $CT
    897 
    898     subq \$1, %r10
    899     jne .L128_enc_msg_x4_loop2
    900 
    901 .L128_enc_msg_x4_out:
    902     popq %r13
    903 .cfi_pop %r13
    904     popq %r12
    905 .cfi_pop %r12
    906     ret
    907 .cfi_endproc
    908 .size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
    909 ___
    910 }
    911 aes128gcmsiv_enc_msg_x4();
    912 
    913 sub aes128gcmsiv_enc_msg_x8 {
    914   my $STATE1 = "%xmm1";
    915   my $STATE2 = "%xmm2";
    916   my $STATE3 = "%xmm3";
    917   my $STATE4 = "%xmm4";
    918   my $STATE5 = "%xmm5";
    919   my $STATE6 = "%xmm6";
    920   my $STATE7 = "%xmm7";
    921   my $STATE8 = "%xmm8";
    922 
    923   my $CTR1 = "%xmm0";
    924   my $CTR2 = "%xmm9";
    925   my $CTR3 = "%xmm10";
    926   my $CTR4 = "%xmm11";
    927   my $CTR5 = "%xmm12";
    928   my $CTR6 = "%xmm13";
    929   my $CTR7 = "%xmm14";
    930   my $SCHED = "%xmm15";
    931 
    932   my $TMP1 = "%xmm1";
    933   my $TMP2 = "%xmm2";
    934 
    935   my $PT = "%rdi";
    936   my $CT = "%rsi";
    937   my $TAG = "%rdx";
    938   my $KS = "%rcx";
    939   my $LEN = "%r8";
    940 
    941   my $aes_round8 = sub {
    942     my ($i) = @_;
    943     return <<___;
    944     vmovdqu ${\eval($i*16)}($KS), $SCHED
    945     vaesenc $SCHED, $STATE1, $STATE1
    946     vaesenc $SCHED, $STATE2, $STATE2
    947     vaesenc $SCHED, $STATE3, $STATE3
    948     vaesenc $SCHED, $STATE4, $STATE4
    949     vaesenc $SCHED, $STATE5, $STATE5
    950     vaesenc $SCHED, $STATE6, $STATE6
    951     vaesenc $SCHED, $STATE7, $STATE7
    952     vaesenc $SCHED, $STATE8, $STATE8
    953 ___
    954   };
    955 
    956   my $aes_lastround8 = sub {
    957     my ($i) = @_;
    958     return <<___;
    959     vmovdqu ${\eval($i*16)}($KS), $SCHED
    960     vaesenclast $SCHED, $STATE1, $STATE1
    961     vaesenclast $SCHED, $STATE2, $STATE2
    962     vaesenclast $SCHED, $STATE3, $STATE3
    963     vaesenclast $SCHED, $STATE4, $STATE4
    964     vaesenclast $SCHED, $STATE5, $STATE5
    965     vaesenclast $SCHED, $STATE6, $STATE6
    966     vaesenclast $SCHED, $STATE7, $STATE7
    967     vaesenclast $SCHED, $STATE8, $STATE8
    968 ___
    969   };
    970 
# void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
#                              unsigned char* CT,
#                              unsigned char* TAG,
#                              unsigned char* KS,
#                              size_t byte_len);
    976 # parameter 1: %rdi     #PT
    977 # parameter 2: %rsi     #CT
    978 # parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
    979 # parameter 4: %rcx     #KS
    980 # parameter 5: %r8      #LEN MSG_length in bytes
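#
# Same as aes128gcmsiv_enc_msg_x4 but with eight-way interleaving. Seven
# counters live in registers and the eighth is kept in the 64-byte-aligned
# scratch area on the stack; %r10 holds LEN mod 8 blocks for the tail loop.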
    981   $code.=<<___;
    982 .globl aes128gcmsiv_enc_msg_x8
    983 .type aes128gcmsiv_enc_msg_x8,\@function,5
    984 .align 16
    985 aes128gcmsiv_enc_msg_x8:
    986 .cfi_startproc
    987     test $LEN, $LEN
    988     jnz .L128_enc_msg_x8_start
    989     ret
    990 
    991 .L128_enc_msg_x8_start:
    992     pushq %r12
    993 .cfi_push %r12
    994     pushq %r13
    995 .cfi_push %r13
    996     pushq %rbp
    997 .cfi_push %rbp
    998     movq %rsp, %rbp
    999 .cfi_def_cfa_register rbp
   1000 
    # Allocate 64-byte-aligned scratch space on the stack
   1002     subq \$128, %rsp
   1003     andq \$-64, %rsp
   1004 
   1005     shrq \$4, $LEN  # LEN = num of blocks
   1006     movq $LEN, %r10
   1007     shlq \$61, %r10
   1008     shrq \$61, %r10
   1009 
   1010     # make IV from TAG
   1011     vmovdqu ($TAG), $TMP1
   1012     vpor OR_MASK(%rip), $TMP1, $TMP1  # TMP1= IV = [1]TAG[126...32][00..00]
   1013 
   1014     # store counter8 in the stack
   1015     vpaddd seven(%rip), $TMP1, $CTR1
   1016     vmovdqu $CTR1, (%rsp)             # CTR8 = TAG[127...32][00..07]
   1017     vpaddd one(%rip), $TMP1, $CTR2    # CTR2 = TAG[127...32][00..01]
   1018     vpaddd two(%rip), $TMP1, $CTR3    # CTR3 = TAG[127...32][00..02]
   1019     vpaddd three(%rip), $TMP1, $CTR4  # CTR4 = TAG[127...32][00..03]
   1020     vpaddd four(%rip), $TMP1, $CTR5   # CTR5 = TAG[127...32][00..04]
   1021     vpaddd five(%rip), $TMP1, $CTR6   # CTR6 = TAG[127...32][00..05]
   1022     vpaddd six(%rip), $TMP1, $CTR7    # CTR7 = TAG[127...32][00..06]
   1023     vmovdqa $TMP1, $CTR1              # CTR1 = TAG[127...32][00..00]
   1024 
   1025     shrq \$3, $LEN
   1026     je .L128_enc_msg_x8_check_remainder
   1027 
   1028     subq \$128, $CT
   1029     subq \$128, $PT
   1030 
   1031 .L128_enc_msg_x8_loop1:
   1032     addq \$128, $CT
   1033     addq \$128, $PT
   1034 
   1035     vmovdqa $CTR1, $STATE1
   1036     vmovdqa $CTR2, $STATE2
   1037     vmovdqa $CTR3, $STATE3
   1038     vmovdqa $CTR4, $STATE4
   1039     vmovdqa $CTR5, $STATE5
   1040     vmovdqa $CTR6, $STATE6
   1041     vmovdqa $CTR7, $STATE7
   1042     # move from stack
   1043     vmovdqu (%rsp), $STATE8
   1044 
   1045     vpxor ($KS), $STATE1, $STATE1
   1046     vpxor ($KS), $STATE2, $STATE2
   1047     vpxor ($KS), $STATE3, $STATE3
   1048     vpxor ($KS), $STATE4, $STATE4
   1049     vpxor ($KS), $STATE5, $STATE5
   1050     vpxor ($KS), $STATE6, $STATE6
   1051     vpxor ($KS), $STATE7, $STATE7
   1052     vpxor ($KS), $STATE8, $STATE8
   1053 
   1054     ${\$aes_round8->(1)}
   1055     vmovdqu (%rsp), $CTR7  # deal with CTR8
   1056     vpaddd eight(%rip), $CTR7, $CTR7
   1057     vmovdqu $CTR7, (%rsp)
   1058     ${\$aes_round8->(2)}
   1059     vpsubd one(%rip), $CTR7, $CTR7
   1060     ${\$aes_round8->(3)}
   1061     vpaddd eight(%rip), $CTR1, $CTR1
   1062     ${\$aes_round8->(4)}
   1063     vpaddd eight(%rip), $CTR2, $CTR2
   1064     ${\$aes_round8->(5)}
   1065     vpaddd eight(%rip), $CTR3, $CTR3
   1066     ${\$aes_round8->(6)}
   1067     vpaddd eight(%rip), $CTR4, $CTR4
   1068     ${\$aes_round8->(7)}
   1069     vpaddd eight(%rip), $CTR5, $CTR5
   1070     ${\$aes_round8->(8)}
   1071     vpaddd eight(%rip), $CTR6, $CTR6
   1072     ${\$aes_round8->(9)}
   1073     ${\$aes_lastround8->(10)}
   1074 
   1075     # XOR with Plaintext
   1076     vpxor 0*16($PT), $STATE1, $STATE1
   1077     vpxor 1*16($PT), $STATE2, $STATE2
   1078     vpxor 2*16($PT), $STATE3, $STATE3
   1079     vpxor 3*16($PT), $STATE4, $STATE4
   1080     vpxor 4*16($PT), $STATE5, $STATE5
   1081     vpxor 5*16($PT), $STATE6, $STATE6
   1082     vpxor 6*16($PT), $STATE7, $STATE7
   1083     vpxor 7*16($PT), $STATE8, $STATE8
   1084 
   1085     dec $LEN
   1086 
   1087     vmovdqu $STATE1, 0*16($CT)
   1088     vmovdqu $STATE2, 1*16($CT)
   1089     vmovdqu $STATE3, 2*16($CT)
   1090     vmovdqu $STATE4, 3*16($CT)
   1091     vmovdqu $STATE5, 4*16($CT)
   1092     vmovdqu $STATE6, 5*16($CT)
   1093     vmovdqu $STATE7, 6*16($CT)
   1094     vmovdqu $STATE8, 7*16($CT)
   1095 
   1096     jne .L128_enc_msg_x8_loop1
   1097 
   1098     addq \$128, $CT
   1099     addq \$128, $PT
   1100 
   1101 .L128_enc_msg_x8_check_remainder:
   1102     cmpq \$0, %r10
   1103     je .L128_enc_msg_x8_out
   1104 
   1105 .L128_enc_msg_x8_loop2:
   1106     # enc each block separately
    # CTR1 holds the next counter value (even if the main loop was skipped)
   1108     vmovdqa $CTR1, $STATE1
   1109     vpaddd one(%rip), $CTR1, $CTR1  # inc counter
   1110 
   1111     vpxor ($KS), $STATE1, $STATE1
   1112     vaesenc 16($KS), $STATE1, $STATE1
   1113     vaesenc 32($KS), $STATE1, $STATE1
   1114     vaesenc 48($KS), $STATE1, $STATE1
   1115     vaesenc 64($KS), $STATE1, $STATE1
   1116     vaesenc 80($KS), $STATE1, $STATE1
   1117     vaesenc 96($KS), $STATE1, $STATE1
   1118     vaesenc 112($KS), $STATE1, $STATE1
   1119     vaesenc 128($KS), $STATE1, $STATE1
   1120     vaesenc 144($KS), $STATE1, $STATE1
   1121     vaesenclast 160($KS), $STATE1, $STATE1
   1122 
   1123     # XOR with Plaintext
   1124     vpxor ($PT), $STATE1, $STATE1
   1125 
   1126     vmovdqu $STATE1, ($CT)
   1127 
   1128     addq \$16, $PT
   1129     addq \$16, $CT
   1130 
   1131     decq %r10
   1132     jne .L128_enc_msg_x8_loop2
   1133 
   1134 .L128_enc_msg_x8_out:
   1135     movq %rbp, %rsp
   1136 .cfi_def_cfa_register %rsp
   1137     popq %rbp
   1138 .cfi_pop %rbp
   1139     popq %r13
   1140 .cfi_pop %r13
   1141     popq %r12
   1142 .cfi_pop %r12
   1143     ret
   1144 .cfi_endproc
   1145 .size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
   1146 ___
   1147 }
   1148 aes128gcmsiv_enc_msg_x8();
   1149 
   1150 sub aesgcmsiv_dec {
   1151   my ($aes256) = @_;
   1152 
   1153   my $T = "%xmm0";
   1154   my $TMP0 = "%xmm1";
   1155   my $TMP1 = "%xmm2";
   1156   my $TMP2 = "%xmm3";
   1157   my $TMP3 = "%xmm4";
   1158   my $TMP4 = "%xmm5";
   1159   my $TMP5 = "%xmm6";
   1160   my $CTR1 = "%xmm7";
   1161   my $CTR2 = "%xmm8";
   1162   my $CTR3 = "%xmm9";
   1163   my $CTR4 = "%xmm10";
   1164   my $CTR5 = "%xmm11";
   1165   my $CTR6 = "%xmm12";
   1166   my $CTR = "%xmm15";
   1167   my $CT = "%rdi";
   1168   my $PT = "%rsi";
   1169   my $POL = "%rdx";
   1170   my $Htbl = "%rcx";
   1171   my $KS = "%r8";
   1172   my $LEN = "%r9";
   1173   my $secureBuffer = "%rax";
   1174   my $HTABLE_ROUNDS = "%xmm13";
   1175 
   1176   my $labelPrefix = "128";
   1177   if ($aes256) {
   1178     $labelPrefix = "256";
   1179   }
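  # Decryption keeps a six-block pipeline in flight: counter blocks are derived
  # from the tag (loaded from just past the ciphertext at ($CT,$LEN)), and each
  # main-loop iteration decrypts six fresh blocks while accumulating POLYVAL of
  # the six plaintext blocks produced by the previous iteration (stashed in
  # $secureBuffer). The final AES round key is XORed with the ciphertext ahead
  # of vaesenclast, so the last round directly yields plaintext.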
   1180 
   1181   my $aes_round_dec = sub {
   1182     my ($i) = @_;
   1183     return <<___;
   1184     vmovdqu ${\eval($i*16)}($KS), $TMP3
   1185     vaesenc $TMP3, $CTR1, $CTR1
   1186     vaesenc $TMP3, $CTR2, $CTR2
   1187     vaesenc $TMP3, $CTR3, $CTR3
   1188     vaesenc $TMP3, $CTR4, $CTR4
   1189     vaesenc $TMP3, $CTR5, $CTR5
   1190     vaesenc $TMP3, $CTR6, $CTR6
   1191 ___
   1192   };
   1193 
   1194   my $aes_lastround_dec = sub {
   1195     my ($i) = @_;
   1196     return <<___;
   1197     vmovdqu ${\eval($i*16)}($KS), $TMP3
   1198     vaesenclast $TMP3, $CTR1, $CTR1
   1199     vaesenclast $TMP3, $CTR2, $CTR2
   1200     vaesenclast $TMP3, $CTR3, $CTR3
   1201     vaesenclast $TMP3, $CTR4, $CTR4
   1202     vaesenclast $TMP3, $CTR5, $CTR5
   1203     vaesenclast $TMP3, $CTR6, $CTR6
   1204 ___
   1205   };
   1206 
   1207   my $schoolbook = sub {
   1208     my ($i) = @_;
   1209     return <<___;
   1210     vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
   1211     vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
   1212 
   1213     vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
   1214     vpxor $TMP3, $TMP0, $TMP0
   1215     vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
   1216     vpxor $TMP3, $TMP1, $TMP1
   1217     vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
   1218     vpxor $TMP3, $TMP2, $TMP2
   1219     vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
   1220     vpxor $TMP3, $TMP0, $TMP0
   1221 ___
   1222   };
   1223 
   1224   if ($aes256) {
   1225     $code.=<<___;
   1226 .globl aes256gcmsiv_dec
   1227 .type aes256gcmsiv_dec,\@function,6
   1228 .align 16
   1229 aes256gcmsiv_dec:
   1230 ___
   1231   } else {
   1232     $code.=<<___;
   1233 .globl aes128gcmsiv_dec
   1234 .type aes128gcmsiv_dec,\@function,6
   1235 .align 16
   1236 aes128gcmsiv_dec:
   1237 ___
   1238   }
   1239 
   1240   $code.=<<___;
   1241 .cfi_startproc
   1242     test \$~15, $LEN
   1243     jnz .L${labelPrefix}_dec_start
   1244     ret
   1245 
   1246 .L${labelPrefix}_dec_start:
   1247     vzeroupper
   1248     vmovdqa ($POL), $T
   1249     movq $POL, $secureBuffer
   1250 
   1251     leaq 32($secureBuffer), $secureBuffer
   1252     leaq 32($Htbl), $Htbl
   1253 
   1254     # make CTRBLKs from given tag.
   1255     vmovdqu ($CT,$LEN), $CTR
   1256     vpor OR_MASK(%rip), $CTR, $CTR      # CTR = [1]TAG[126...32][00..00]
   1257     andq \$~15, $LEN
    # If there are fewer than 6 blocks, process them one at a time
   1259     # If less then 6 blocks, make singles
   1260     cmp \$96, $LEN
   1261     jb .L${labelPrefix}_dec_loop2
   1262 
   1263     # Decrypt the first six blocks
   1264     sub \$96, $LEN
   1265     vmovdqa $CTR, $CTR1
   1266     vpaddd one(%rip), $CTR1, $CTR2
   1267     vpaddd two(%rip), $CTR1, $CTR3
   1268     vpaddd one(%rip), $CTR3, $CTR4
   1269     vpaddd two(%rip), $CTR3, $CTR5
   1270     vpaddd one(%rip), $CTR5, $CTR6
   1271     vpaddd two(%rip), $CTR5, $CTR
   1272 
   1273     vpxor ($KS), $CTR1, $CTR1
   1274     vpxor ($KS), $CTR2, $CTR2
   1275     vpxor ($KS), $CTR3, $CTR3
   1276     vpxor ($KS), $CTR4, $CTR4
   1277     vpxor ($KS), $CTR5, $CTR5
   1278     vpxor ($KS), $CTR6, $CTR6
   1279 
   1280     ${\$aes_round_dec->(1)}
   1281     ${\$aes_round_dec->(2)}
   1282     ${\$aes_round_dec->(3)}
   1283     ${\$aes_round_dec->(4)}
   1284     ${\$aes_round_dec->(5)}
   1285     ${\$aes_round_dec->(6)}
   1286     ${\$aes_round_dec->(7)}
   1287     ${\$aes_round_dec->(8)}
   1288     ${\$aes_round_dec->(9)}
   1289 ___
   1290 
   1291 if ($aes256) {
   1292 $code.=<<___;
   1293     ${\$aes_round_dec->(10)}
   1294     ${\$aes_round_dec->(11)}
   1295     ${\$aes_round_dec->(12)}
   1296     ${\$aes_round_dec->(13)}
   1297     ${\$aes_lastround_dec->(14)}
   1298 ___
   1299 } else {
   1300 $code.=<<___;
   1301     ${\$aes_lastround_dec->(10)}
   1302 ___
   1303 }
   1304 
   1305 $code.=<<___;
   1306     # XOR with CT
   1307     vpxor 0*16($CT), $CTR1, $CTR1
   1308     vpxor 1*16($CT), $CTR2, $CTR2
   1309     vpxor 2*16($CT), $CTR3, $CTR3
   1310     vpxor 3*16($CT), $CTR4, $CTR4
   1311     vpxor 4*16($CT), $CTR5, $CTR5
   1312     vpxor 5*16($CT), $CTR6, $CTR6
   1313 
   1314     vmovdqu $CTR1, 0*16($PT)
   1315     vmovdqu $CTR2, 1*16($PT)
   1316     vmovdqu $CTR3, 2*16($PT)
   1317     vmovdqu $CTR4, 3*16($PT)
   1318     vmovdqu $CTR5, 4*16($PT)
   1319     vmovdqu $CTR6, 5*16($PT)
   1320 
   1321     addq \$96, $CT
   1322     addq \$96, $PT
   1323     jmp .L${labelPrefix}_dec_loop1
   1324 
   1325 # Decrypt 6 blocks each time while hashing previous 6 blocks
   1326 .align 64
   1327 .L${labelPrefix}_dec_loop1:
   1328     cmp \$96, $LEN
   1329     jb .L${labelPrefix}_dec_finish_96
   1330     sub \$96, $LEN
   1331 
   1332     vmovdqa $CTR6, $TMP5
   1333     vmovdqa $CTR5, 1*16-32($secureBuffer)
   1334     vmovdqa $CTR4, 2*16-32($secureBuffer)
   1335     vmovdqa $CTR3, 3*16-32($secureBuffer)
   1336     vmovdqa $CTR2, 4*16-32($secureBuffer)
   1337     vmovdqa $CTR1, 5*16-32($secureBuffer)
   1338 
   1339     vmovdqa $CTR, $CTR1
   1340     vpaddd one(%rip), $CTR1, $CTR2
   1341     vpaddd two(%rip), $CTR1, $CTR3
   1342     vpaddd one(%rip), $CTR3, $CTR4
   1343     vpaddd two(%rip), $CTR3, $CTR5
   1344     vpaddd one(%rip), $CTR5, $CTR6
   1345     vpaddd two(%rip), $CTR5, $CTR
   1346 
   1347     vmovdqa ($KS), $TMP3
   1348     vpxor $TMP3, $CTR1, $CTR1
   1349     vpxor $TMP3, $CTR2, $CTR2
   1350     vpxor $TMP3, $CTR3, $CTR3
   1351     vpxor $TMP3, $CTR4, $CTR4
   1352     vpxor $TMP3, $CTR5, $CTR5
   1353     vpxor $TMP3, $CTR6, $CTR6
   1354 
   1355     vmovdqu 0*16-32($Htbl), $TMP3
   1356     vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
   1357     vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
   1358     vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
   1359     vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
   1360     vpxor $TMP3, $TMP0, $TMP0
   1361 
   1362     ${\$aes_round_dec->(1)}
   1363     ${\$schoolbook->(1)}
   1364 
   1365     ${\$aes_round_dec->(2)}
   1366     ${\$schoolbook->(2)}
   1367 
   1368     ${\$aes_round_dec->(3)}
   1369     ${\$schoolbook->(3)}
   1370 
   1371     ${\$aes_round_dec->(4)}
   1372     ${\$schoolbook->(4)}
   1373 
   1374     ${\$aes_round_dec->(5)}
   1375     ${\$aes_round_dec->(6)}
   1376     ${\$aes_round_dec->(7)}
   1377 
   1378     vmovdqa 5*16-32($secureBuffer), $TMP5
   1379     vpxor $T, $TMP5, $TMP5
   1380     vmovdqu 5*16-32($Htbl), $TMP4
   1381 
   1382     vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
   1383     vpxor $TMP3, $TMP0, $TMP0
   1384     vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
   1385     vpxor $TMP3, $TMP1, $TMP1
   1386     vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
   1387     vpxor $TMP3, $TMP2, $TMP2
   1388     vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
   1389     vpxor $TMP3, $TMP0, $TMP0
   1390 
   1391     ${\$aes_round_dec->(8)}
   1392 
   1393     vpsrldq \$8, $TMP0, $TMP3
   1394     vpxor $TMP3, $TMP1, $TMP4
   1395     vpslldq \$8, $TMP0, $TMP3
   1396     vpxor $TMP3, $TMP2, $T
   1397 
   1398     vmovdqa poly(%rip), $TMP2
   1399 
   1400     ${\$aes_round_dec->(9)}
   1401 ___
   1402 
   1403 if ($aes256) {
   1404 $code.=<<___;
   1405     ${\$aes_round_dec->(10)}
   1406     ${\$aes_round_dec->(11)}
   1407     ${\$aes_round_dec->(12)}
   1408     ${\$aes_round_dec->(13)}
   1409     vmovdqu 14*16($KS), $TMP5
   1410 ___
   1411 } else {
   1412 $code.=<<___;
   1413     vmovdqu 10*16($KS), $TMP5
   1414 ___
   1415 }
   1416 
   1417 $code.=<<___;
   1418     vpalignr \$8, $T, $T, $TMP1
   1419     vpclmulqdq \$0x10, $TMP2, $T, $T
   1420     vpxor $T, $TMP1, $T
   1421 
   1422     vpxor 0*16($CT), $TMP5, $TMP3
   1423     vaesenclast $TMP3, $CTR1, $CTR1
   1424     vpxor 1*16($CT), $TMP5, $TMP3
   1425     vaesenclast $TMP3, $CTR2, $CTR2
   1426     vpxor 2*16($CT), $TMP5, $TMP3
   1427     vaesenclast $TMP3, $CTR3, $CTR3
   1428     vpxor 3*16($CT), $TMP5, $TMP3
   1429     vaesenclast $TMP3, $CTR4, $CTR4
   1430     vpxor 4*16($CT), $TMP5, $TMP3
   1431     vaesenclast $TMP3, $CTR5, $CTR5
   1432     vpxor 5*16($CT), $TMP5, $TMP3
   1433     vaesenclast $TMP3, $CTR6, $CTR6
   1434 
   1435     vpalignr \$8, $T, $T, $TMP1
   1436     vpclmulqdq \$0x10, $TMP2, $T, $T
   1437     vpxor $T, $TMP1, $T
   1438 
   1439     vmovdqu $CTR1, 0*16($PT)
   1440     vmovdqu $CTR2, 1*16($PT)
   1441     vmovdqu $CTR3, 2*16($PT)
   1442     vmovdqu $CTR4, 3*16($PT)
   1443     vmovdqu $CTR5, 4*16($PT)
   1444     vmovdqu $CTR6, 5*16($PT)
   1445 
   1446     vpxor $TMP4, $T, $T
   1447 
   1448     lea 96($CT), $CT
   1449     lea 96($PT), $PT
   1450     jmp .L${labelPrefix}_dec_loop1
   1451 
   1452 .L${labelPrefix}_dec_finish_96:
   1453     vmovdqa $CTR6, $TMP5
   1454     vmovdqa $CTR5, 1*16-32($secureBuffer)
   1455     vmovdqa $CTR4, 2*16-32($secureBuffer)
   1456     vmovdqa $CTR3, 3*16-32($secureBuffer)
   1457     vmovdqa $CTR2, 4*16-32($secureBuffer)
   1458     vmovdqa $CTR1, 5*16-32($secureBuffer)
   1459 
   1460     vmovdqu 0*16-32($Htbl), $TMP3
   1461     vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
   1462     vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
   1463     vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
   1464     vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
   1465     vpxor $TMP3, $TMP0, $TMP0
   1466 
   1467     ${\$schoolbook->(1)}
   1468     ${\$schoolbook->(2)}
   1469     ${\$schoolbook->(3)}
   1470     ${\$schoolbook->(4)}
   1471 
   1472     vmovdqu 5*16-32($secureBuffer), $TMP5
   1473     vpxor $T, $TMP5, $TMP5
   1474     vmovdqu 5*16-32($Htbl), $TMP4
   1475     vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
   1476     vpxor $TMP3, $TMP1, $TMP1
   1477     vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
   1478     vpxor $TMP3, $TMP2, $TMP2
   1479     vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
   1480     vpxor $TMP3, $TMP0, $TMP0
   1481     vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
   1482     vpxor $TMP3, $TMP0, $TMP0
   1483 
   1484     vpsrldq \$8, $TMP0, $TMP3
   1485     vpxor $TMP3, $TMP1, $TMP4
   1486     vpslldq \$8, $TMP0, $TMP3
   1487     vpxor $TMP3, $TMP2, $T
   1488 
   1489     vmovdqa poly(%rip), $TMP2
   1490 
   1491     vpalignr \$8, $T, $T, $TMP1
   1492     vpclmulqdq \$0x10, $TMP2, $T, $T
   1493     vpxor $T, $TMP1, $T
   1494 
   1495     vpalignr \$8, $T, $T, $TMP1
   1496     vpclmulqdq \$0x10, $TMP2, $T, $T
   1497     vpxor $T, $TMP1, $T
   1498 
   1499     vpxor $TMP4, $T, $T
   1500 
   1501 .L${labelPrefix}_dec_loop2:
    # Process any remaining whole blocks one at a time.

    # Exit when no whole blocks are left.
   1505     cmp \$16, $LEN
   1506     jb .L${labelPrefix}_dec_out
   1507     sub \$16, $LEN
   1508 
   1509     vmovdqa $CTR, $TMP1
   1510     vpaddd one(%rip), $CTR, $CTR
   1511 
   1512     vpxor 0*16($KS), $TMP1, $TMP1
   1513     vaesenc 1*16($KS), $TMP1, $TMP1
   1514     vaesenc 2*16($KS), $TMP1, $TMP1
   1515     vaesenc 3*16($KS), $TMP1, $TMP1
   1516     vaesenc 4*16($KS), $TMP1, $TMP1
   1517     vaesenc 5*16($KS), $TMP1, $TMP1
   1518     vaesenc 6*16($KS), $TMP1, $TMP1
   1519     vaesenc 7*16($KS), $TMP1, $TMP1
   1520     vaesenc 8*16($KS), $TMP1, $TMP1
   1521     vaesenc 9*16($KS), $TMP1, $TMP1
   1522 ___
   1523 if ($aes256) {
   1524 $code.=<<___;
   1525     vaesenc 10*16($KS), $TMP1, $TMP1
   1526     vaesenc 11*16($KS), $TMP1, $TMP1
   1527     vaesenc 12*16($KS), $TMP1, $TMP1
   1528     vaesenc 13*16($KS), $TMP1, $TMP1
   1529     vaesenclast 14*16($KS), $TMP1, $TMP1
   1530 ___
   1531 } else {
   1532 $code.=<<___;
   1533     vaesenclast 10*16($KS), $TMP1, $TMP1
   1534 ___
   1535 }
   1536 
   1537 $code.=<<___;
   1538     vpxor ($CT), $TMP1, $TMP1
   1539     vmovdqu $TMP1, ($PT)
   1540     addq \$16, $CT
   1541     addq \$16, $PT
   1542 
   1543     vpxor $TMP1, $T, $T
   1544     vmovdqa -32($Htbl), $TMP0
   1545     call GFMUL
   1546 
   1547     jmp .L${labelPrefix}_dec_loop2
   1548 
   1549 .L${labelPrefix}_dec_out:
   1550     vmovdqu $T, ($POL)
   1551     ret
   1552 .cfi_endproc
   1553 ___
   1554 
   1555   if ($aes256) {
   1556     $code.=<<___;
   1557 .size aes256gcmsiv_dec, .-aes256gcmsiv_dec
   1558 ___
   1559   } else {
   1560     $code.=<<___;
   1561 .size aes128gcmsiv_dec, .-aes128gcmsiv_dec
   1562 ___
   1563   }
   1564 }
   1565 
   1566 aesgcmsiv_dec(0);  # emit 128-bit version
   1567 
   1568 sub aes128gcmsiv_ecb_enc_block {
   1569   my $STATE_1 = "%xmm1";
   1570   my $KSp = "%rdx";
   1571 
   1572   # parameter 1: PT            %rdi    (pointer to 128 bit)
   1573   # parameter 2: CT            %rsi    (pointer to 128 bit)
   1574   # parameter 3: ks            %rdx    (pointer to ks)
   1575   $code.=<<___;
   1576 .globl aes128gcmsiv_ecb_enc_block
   1577 .type aes128gcmsiv_ecb_enc_block,\@function,3
   1578 .align 16
   1579 aes128gcmsiv_ecb_enc_block:
   1580 .cfi_startproc
   1581     vmovdqa (%rdi), $STATE_1
   1582 
   1583     vpxor       ($KSp), $STATE_1, $STATE_1
   1584     vaesenc 1*16($KSp), $STATE_1, $STATE_1
   1585     vaesenc 2*16($KSp), $STATE_1, $STATE_1
   1586     vaesenc 3*16($KSp), $STATE_1, $STATE_1
   1587     vaesenc 4*16($KSp), $STATE_1, $STATE_1
   1588     vaesenc 5*16($KSp), $STATE_1, $STATE_1
   1589     vaesenc 6*16($KSp), $STATE_1, $STATE_1
   1590     vaesenc 7*16($KSp), $STATE_1, $STATE_1
   1591     vaesenc 8*16($KSp), $STATE_1, $STATE_1
   1592     vaesenc 9*16($KSp), $STATE_1, $STATE_1
   1593     vaesenclast 10*16($KSp), $STATE_1, $STATE_1    # STATE_1 == IV
   1594 
   1595     vmovdqa $STATE_1, (%rsi)
   1596 
   1597     ret
   1598 .cfi_endproc
   1599 .size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
   1600 ___
   1601 }
   1602 aes128gcmsiv_ecb_enc_block();
   1603 
   1604 sub aes256gcmsiv_aes_ks_enc_x1 {
   1605   my $KS = "%rdx";
   1606   my $KEYp = "%rcx";
   1607   my $CON_MASK = "%xmm0";
   1608   my $MASK_256 = "%xmm15";
   1609   my $KEY_1 = "%xmm1";
   1610   my $KEY_2 = "%xmm3";
   1611   my $BLOCK1 = "%xmm8";
   1612   my $AUX_REG = "%xmm14";
   1613   my $PT = "%rdi";
   1614   my $CT = "%rsi";
   1615 
   1616   my $round_double = sub {
   1617     my ($i, $j) = @_;
   1618     return <<___;
   1619     vpshufb %xmm15, %xmm3, %xmm2
   1620     vaesenclast %xmm0, %xmm2, %xmm2
   1621     vpslld \$1, %xmm0, %xmm0
   1622     vpslldq \$4, %xmm1, %xmm4
   1623     vpxor %xmm4, %xmm1, %xmm1
   1624     vpslldq \$4, %xmm4, %xmm4
   1625     vpxor %xmm4, %xmm1, %xmm1
   1626     vpslldq \$4, %xmm4, %xmm4
   1627     vpxor %xmm4, %xmm1, %xmm1
   1628     vpxor %xmm2, %xmm1, %xmm1
   1629     vaesenc %xmm1, $BLOCK1, $BLOCK1
   1630     vmovdqu %xmm1, ${\eval(16*$i)}($KS)
   1631 
   1632     vpshufd \$0xff, %xmm1, %xmm2
   1633     vaesenclast %xmm14, %xmm2, %xmm2
   1634     vpslldq \$4, %xmm3, %xmm4
   1635     vpxor %xmm4, %xmm3, %xmm3
   1636     vpslldq \$4, %xmm4, %xmm4
   1637     vpxor %xmm4, %xmm3, %xmm3
   1638     vpslldq \$4, %xmm4, %xmm4
   1639     vpxor %xmm4, %xmm3, %xmm3
   1640     vpxor %xmm2, %xmm3, %xmm3
   1641     vaesenc %xmm3, $BLOCK1, $BLOCK1
   1642     vmovdqu %xmm3, ${\eval(16*$j)}($KS)
   1643 ___
   1644   };
   1645 
   1646   my $round_last = sub {
   1647     my ($i) = @_;
   1648     return <<___;
   1649     vpshufb %xmm15, %xmm3, %xmm2
   1650     vaesenclast %xmm0, %xmm2, %xmm2
   1651     vpslldq \$4, %xmm1, %xmm4
   1652     vpxor %xmm4, %xmm1, %xmm1
   1653     vpslldq \$4, %xmm4, %xmm4
   1654     vpxor %xmm4, %xmm1, %xmm1
   1655     vpslldq \$4, %xmm4, %xmm4
   1656     vpxor %xmm4, %xmm1, %xmm1
   1657     vpxor %xmm2, %xmm1, %xmm1
   1658     vaesenclast %xmm1, $BLOCK1, $BLOCK1
   1659     vmovdqu %xmm1, ${\eval(16*$i)}($KS)
   1660 ___
   1661   };
   1662 
   1663   # parameter 1: %rdi         Pointer to PT1
   1664   # parameter 2: %rsi         Pointer to CT1
   1665   # parameter 3: %rdx         Pointer to KS
   1666   # parameter 4: %rcx         Pointer to initial key
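  #
  # As with aes128gcmsiv_aes_ks_enc_x1, this fuses the (256-bit) key expansion
  # with the encryption of one block: every $round_double derives two round
  # keys, stores them, and applies each to $BLOCK1 with vaesenc.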
   1667   $code.=<<___;
   1668 .globl aes256gcmsiv_aes_ks_enc_x1
   1669 .type aes256gcmsiv_aes_ks_enc_x1,\@function,4
   1670 .align 16
   1671 aes256gcmsiv_aes_ks_enc_x1:
   1672 .cfi_startproc
   1673     vmovdqa con1(%rip), $CON_MASK    # CON_MASK  = 1,1,1,1
   1674     vmovdqa mask(%rip), $MASK_256    # MASK_256
   1675     vmovdqa ($PT), $BLOCK1
   1676     vmovdqa ($KEYp), $KEY_1          # KEY_1 || KEY_2 [0..7] = user key
   1677     vmovdqa 16($KEYp), $KEY_2
   1678     vpxor $KEY_1, $BLOCK1, $BLOCK1
   1679     vaesenc $KEY_2, $BLOCK1, $BLOCK1
   1680     vmovdqu $KEY_1, ($KS)            # First round key
   1681     vmovdqu $KEY_2, 16($KS)
   1682     vpxor $AUX_REG, $AUX_REG, $AUX_REG
   1683 
   1684     ${\$round_double->(2, 3)}
   1685     ${\$round_double->(4, 5)}
   1686     ${\$round_double->(6, 7)}
   1687     ${\$round_double->(8, 9)}
   1688     ${\$round_double->(10, 11)}
   1689     ${\$round_double->(12, 13)}
   1690     ${\$round_last->(14)}
   1691     vmovdqa $BLOCK1, ($CT)
   1692     ret
   1693 .cfi_endproc
   1694 .size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
   1695 ___
   1696 }
   1697 aes256gcmsiv_aes_ks_enc_x1();
   1698 
   1699 sub aes256gcmsiv_ecb_enc_block {
   1700   my $STATE_1 = "%xmm1";
   1701   my $PT = "%rdi";
   1702   my $CT = "%rsi";
   1703   my $KSp = "%rdx";
   1704 
   1705   # parameter 1: PT            %rdi    (pointer to 128 bit)
   1706   # parameter 2: CT            %rsi    (pointer to 128 bit)
   1707   # parameter 3: ks            %rdx    (pointer to ks)
   1708   $code.=<<___;
   1709 .globl aes256gcmsiv_ecb_enc_block
   1710 .type aes256gcmsiv_ecb_enc_block,\@function,3
   1711 .align 16
   1712 aes256gcmsiv_ecb_enc_block:
   1713 .cfi_startproc
   1714     vmovdqa (%rdi), $STATE_1
   1715     vpxor ($KSp), $STATE_1, $STATE_1
   1716     vaesenc 1*16($KSp), $STATE_1, $STATE_1
   1717     vaesenc 2*16($KSp), $STATE_1, $STATE_1
   1718     vaesenc 3*16($KSp), $STATE_1, $STATE_1
   1719     vaesenc 4*16($KSp), $STATE_1, $STATE_1
   1720     vaesenc 5*16($KSp), $STATE_1, $STATE_1
   1721     vaesenc 6*16($KSp), $STATE_1, $STATE_1
   1722     vaesenc 7*16($KSp), $STATE_1, $STATE_1
   1723     vaesenc 8*16($KSp), $STATE_1, $STATE_1
   1724     vaesenc 9*16($KSp), $STATE_1, $STATE_1
   1725     vaesenc 10*16($KSp), $STATE_1, $STATE_1
   1726     vaesenc 11*16($KSp), $STATE_1, $STATE_1
   1727     vaesenc 12*16($KSp), $STATE_1, $STATE_1
   1728     vaesenc 13*16($KSp), $STATE_1, $STATE_1
   1729     vaesenclast 14*16($KSp), $STATE_1, $STATE_1    # $STATE_1 == IV
   1730     vmovdqa $STATE_1, (%rsi)
   1731     ret
   1732 .cfi_endproc
   1733 .size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
   1734 ___
   1735 }
   1736 aes256gcmsiv_ecb_enc_block();
   1737 
   1738 sub aes256gcmsiv_enc_msg_x4 {
   1739   my $CTR1 = "%xmm0";
   1740   my $CTR2 = "%xmm1";
   1741   my $CTR3 = "%xmm2";
   1742   my $CTR4 = "%xmm3";
   1743   my $ADDER = "%xmm4";
   1744 
   1745   my $STATE1 = "%xmm5";
   1746   my $STATE2 = "%xmm6";
   1747   my $STATE3 = "%xmm7";
   1748   my $STATE4 = "%xmm8";
   1749 
   1750   my $TMP = "%xmm12";
   1751   my $TMP2 = "%xmm13";
   1752   my $TMP3 = "%xmm14";
   1753   my $IV = "%xmm15";
   1754 
   1755   my $PT = "%rdi";
   1756   my $CT = "%rsi";
   1757   my $TAG = "%rdx";
   1758   my $KS = "%rcx";
   1759   my $LEN = "%r8";
   1760 
   1761   my $aes_round = sub {
   1762     my ($i) = @_;
   1763     return <<___;
   1764     vmovdqu ${\eval($i*16)}($KS), $TMP
   1765     vaesenc $TMP, $STATE1, $STATE1
   1766     vaesenc $TMP, $STATE2, $STATE2
   1767     vaesenc $TMP, $STATE3, $STATE3
   1768     vaesenc $TMP, $STATE4, $STATE4
   1769 ___
   1770   };
   1771 
   1772   my $aes_lastround = sub {
   1773     my ($i) = @_;
   1774     return <<___;
   1775     vmovdqu ${\eval($i*16)}($KS), $TMP
   1776     vaesenclast $TMP, $STATE1, $STATE1
   1777     vaesenclast $TMP, $STATE2, $STATE2
   1778     vaesenclast $TMP, $STATE3, $STATE3
   1779     vaesenclast $TMP, $STATE4, $STATE4
   1780 ___
   1781   };
   1782 
   1783   # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
   1784   #                              unsigned char* TAG, unsigned char* KS,
   1785   #                              size_t byte_len);
   1786   # parameter 1: %rdi     #PT
   1787   # parameter 2: %rsi     #CT
   1788   # parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
   1789   # parameter 4: %rcx     #KS
   1790   # parameter 5: %r8      #LEN MSG_length in bytes
   1791   $code.=<<___;
   1792 .globl aes256gcmsiv_enc_msg_x4
   1793 .type aes256gcmsiv_enc_msg_x4,\@function,5
   1794 .align 16
   1795 aes256gcmsiv_enc_msg_x4:
   1796 .cfi_startproc
   1797     test $LEN, $LEN
   1798     jnz .L256_enc_msg_x4_start
   1799     ret
   1800 
   1801 .L256_enc_msg_x4_start:
   1802     movq $LEN, %r10
   1803     shrq \$4, $LEN                       # LEN = num of blocks
   1804     shlq \$60, %r10
   1805     jz .L256_enc_msg_x4_start2
   1806     addq \$1, $LEN
   1807 
   1808 .L256_enc_msg_x4_start2:
   1809     movq $LEN, %r10
   1810     shlq \$62, %r10
   1811     shrq \$62, %r10
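             # At this point $LEN = ceil(byte_len / 16), the total number of blocks,
             # and %r10 = $LEN mod 4, the leftover blocks that .L256_enc_msg_x4_loop2
             # encrypts one at a time.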
   1812 
   1813     # make IV from TAG
   1814     vmovdqa ($TAG), $IV
   1815     vpor OR_MASK(%rip), $IV, $IV        # IV = [1]TAG[126...32][00..00]
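             # Per AES-GCM-SIV, the initial counter block is the tag with its most
             # significant bit forced to 1; the counter itself is the first 32 bits
             # of the block, treated as little-endian, which is why vpaddd on the
             # low dword is used to step it.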
   1816 
   1817     vmovdqa four(%rip), $ADDER          # Register to increment counters
    1818     vmovdqa $IV, $CTR1                  # CTR1 = [1]TAG[126...32][00..00]
    1819     vpaddd one(%rip), $IV, $CTR2        # CTR2 = [1]TAG[126...32][00..01]
    1820     vpaddd two(%rip), $IV, $CTR3        # CTR3 = [1]TAG[126...32][00..02]
    1821     vpaddd three(%rip), $IV, $CTR4      # CTR4 = [1]TAG[126...32][00..03]
   1822 
   1823     shrq \$2, $LEN
   1824     je .L256_enc_msg_x4_check_remainder
   1825 
   1826     subq \$64, $CT
   1827     subq \$64, $PT
   1828 
   1829 .L256_enc_msg_x4_loop1:
   1830     addq \$64, $CT
   1831     addq \$64, $PT
   1832 
   1833     vmovdqa $CTR1, $STATE1
   1834     vmovdqa $CTR2, $STATE2
   1835     vmovdqa $CTR3, $STATE3
   1836     vmovdqa $CTR4, $STATE4
   1837 
   1838     vpxor ($KS), $STATE1, $STATE1
   1839     vpxor ($KS), $STATE2, $STATE2
   1840     vpxor ($KS), $STATE3, $STATE3
   1841     vpxor ($KS), $STATE4, $STATE4
   1842 
   1843     ${\$aes_round->(1)}
   1844     vpaddd $ADDER, $CTR1, $CTR1
   1845     ${\$aes_round->(2)}
   1846     vpaddd $ADDER, $CTR2, $CTR2
   1847     ${\$aes_round->(3)}
   1848     vpaddd $ADDER, $CTR3, $CTR3
   1849     ${\$aes_round->(4)}
   1850     vpaddd $ADDER, $CTR4, $CTR4
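             # The counter updates for the next iteration are interleaved with the
             # early AES rounds so they overlap with the vaesenc latency rather than
             # extending the dependency chain at the end of the loop.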
   1851 
   1852     ${\$aes_round->(5)}
   1853     ${\$aes_round->(6)}
   1854     ${\$aes_round->(7)}
   1855     ${\$aes_round->(8)}
   1856     ${\$aes_round->(9)}
   1857     ${\$aes_round->(10)}
   1858     ${\$aes_round->(11)}
   1859     ${\$aes_round->(12)}
   1860     ${\$aes_round->(13)}
   1861     ${\$aes_lastround->(14)}
   1862 
   1863     # XOR with Plaintext
   1864     vpxor 0*16($PT), $STATE1, $STATE1
   1865     vpxor 1*16($PT), $STATE2, $STATE2
   1866     vpxor 2*16($PT), $STATE3, $STATE3
   1867     vpxor 3*16($PT), $STATE4, $STATE4
   1868 
   1869     subq \$1, $LEN
   1870 
   1871     vmovdqu $STATE1, 0*16($CT)
   1872     vmovdqu $STATE2, 1*16($CT)
   1873     vmovdqu $STATE3, 2*16($CT)
   1874     vmovdqu $STATE4, 3*16($CT)
   1875 
   1876     jne .L256_enc_msg_x4_loop1
   1877 
   1878     addq \$64, $CT
   1879     addq \$64, $PT
   1880 
   1881 .L256_enc_msg_x4_check_remainder:
   1882     cmpq \$0, %r10
   1883     je .L256_enc_msg_x4_out
   1884 
   1885 .L256_enc_msg_x4_loop2:
   1886     # encrypt each block separately
   1887     # CTR1 is the highest counter (even if no LOOP done)
   1888 
   1889     vmovdqa $CTR1, $STATE1
   1890     vpaddd one(%rip), $CTR1, $CTR1      # inc counter
   1891     vpxor ($KS), $STATE1, $STATE1
   1892     vaesenc 16($KS), $STATE1, $STATE1
   1893     vaesenc 32($KS), $STATE1, $STATE1
   1894     vaesenc 48($KS), $STATE1, $STATE1
   1895     vaesenc 64($KS), $STATE1, $STATE1
   1896     vaesenc 80($KS), $STATE1, $STATE1
   1897     vaesenc 96($KS), $STATE1, $STATE1
   1898     vaesenc 112($KS), $STATE1, $STATE1
   1899     vaesenc 128($KS), $STATE1, $STATE1
   1900     vaesenc 144($KS), $STATE1, $STATE1
   1901     vaesenc 160($KS), $STATE1, $STATE1
   1902     vaesenc 176($KS), $STATE1, $STATE1
   1903     vaesenc 192($KS), $STATE1, $STATE1
   1904     vaesenc 208($KS), $STATE1, $STATE1
   1905     vaesenclast 224($KS), $STATE1, $STATE1
   1906 
   1907     # XOR with Plaintext
   1908     vpxor ($PT), $STATE1, $STATE1
   1909 
   1910     vmovdqu $STATE1, ($CT)
   1911 
   1912     addq \$16, $PT
   1913     addq \$16, $CT
   1914 
   1915     subq \$1, %r10
   1916     jne .L256_enc_msg_x4_loop2
   1917 
   1918 .L256_enc_msg_x4_out:
   1919     ret
   1920 .cfi_endproc
   1921 .size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
   1922 ___
   1923 }
   1924 aes256gcmsiv_enc_msg_x4();
   1925 
    1926 sub aes256gcmsiv_enc_msg_x8 {
   1927   my $STATE1 = "%xmm1";
   1928   my $STATE2 = "%xmm2";
   1929   my $STATE3 = "%xmm3";
   1930   my $STATE4 = "%xmm4";
   1931   my $STATE5 = "%xmm5";
   1932   my $STATE6 = "%xmm6";
   1933   my $STATE7 = "%xmm7";
   1934   my $STATE8 = "%xmm8";
   1935   my $CTR1 = "%xmm0";
   1936   my $CTR2 = "%xmm9";
   1937   my $CTR3 = "%xmm10";
   1938   my $CTR4 = "%xmm11";
   1939   my $CTR5 = "%xmm12";
   1940   my $CTR6 = "%xmm13";
   1941   my $CTR7 = "%xmm14";
   1942   my $TMP1 = "%xmm1";
   1943   my $TMP2 = "%xmm2";
   1944   my $KS = "%rcx";
   1945   my $LEN = "%r8";
   1946   my $PT = "%rdi";
   1947   my $CT = "%rsi";
   1948   my $TAG = "%rdx";
   1949   my $SCHED = "%xmm15";
   1950 
   1951   my $aes_round8 = sub {
   1952     my ($i) = @_;
   1953     return <<___;
   1954     vmovdqu ${\eval($i*16)}($KS), $SCHED
   1955     vaesenc $SCHED, $STATE1, $STATE1
   1956     vaesenc $SCHED, $STATE2, $STATE2
   1957     vaesenc $SCHED, $STATE3, $STATE3
   1958     vaesenc $SCHED, $STATE4, $STATE4
   1959     vaesenc $SCHED, $STATE5, $STATE5
   1960     vaesenc $SCHED, $STATE6, $STATE6
   1961     vaesenc $SCHED, $STATE7, $STATE7
   1962     vaesenc $SCHED, $STATE8, $STATE8
   1963 ___
   1964   };
   1965 
   1966   my $aes_lastround8 = sub {
   1967     my ($i) = @_;
   1968     return <<___;
   1969     vmovdqu ${\eval($i*16)}($KS), $SCHED
   1970     vaesenclast $SCHED, $STATE1, $STATE1
   1971     vaesenclast $SCHED, $STATE2, $STATE2
   1972     vaesenclast $SCHED, $STATE3, $STATE3
   1973     vaesenclast $SCHED, $STATE4, $STATE4
   1974     vaesenclast $SCHED, $STATE5, $STATE5
   1975     vaesenclast $SCHED, $STATE6, $STATE6
   1976     vaesenclast $SCHED, $STATE7, $STATE7
   1977     vaesenclast $SCHED, $STATE8, $STATE8
   1978 ___
   1979   };
   1980 
    1981   # void aes256gcmsiv_enc_msg_x8(unsigned char* PT,
    1982   #                              unsigned char* CT,
    1983   #                              unsigned char* TAG,
    1984   #                              unsigned char* KS,
    1985   #                              size_t byte_len);
   1986   # parameter 1: %rdi     #PT
   1987   # parameter 2: %rsi     #CT
   1988   # parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
   1989   # parameter 4: %rcx     #KS
   1990   # parameter 5: %r8      #LEN MSG_length in bytes
   1991   $code.=<<___;
   1992 .globl aes256gcmsiv_enc_msg_x8
   1993 .type aes256gcmsiv_enc_msg_x8,\@function,5
   1994 .align 16
   1995 aes256gcmsiv_enc_msg_x8:
   1996 .cfi_startproc
   1997     test $LEN, $LEN
   1998     jnz .L256_enc_msg_x8_start
   1999     ret
   2000 
   2001 .L256_enc_msg_x8_start:
    2002     # Compute a 64-byte-aligned scratch address below %rsp in %r11;
             # it is used to spill the eighth counter block (CTR8).
   2003     movq %rsp, %r11
   2004     subq \$16, %r11
   2005     andq \$-64, %r11
   2006 
   2007     movq $LEN, %r10
   2008     shrq \$4, $LEN                       # LEN = num of blocks
   2009     shlq \$60, %r10
   2010     jz .L256_enc_msg_x8_start2
   2011     addq \$1, $LEN
   2012 
   2013 .L256_enc_msg_x8_start2:
   2014     movq $LEN, %r10
   2015     shlq \$61, %r10
   2016     shrq \$61, %r10
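             # At this point $LEN = ceil(byte_len / 16) and %r10 = $LEN mod 8, the
             # leftover blocks that .L256_enc_msg_x8_loop2 encrypts one at a time.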
   2017 
   2018     # Make IV from TAG
   2019     vmovdqa ($TAG), $TMP1
   2020     vpor OR_MASK(%rip), $TMP1, $TMP1    # TMP1= IV = [1]TAG[126...32][00..00]
   2021 
   2022     # store counter8 on the stack
   2023     vpaddd seven(%rip), $TMP1, $CTR1
   2024     vmovdqa $CTR1, (%r11)                # CTR8 = TAG[127...32][00..07]
   2025     vpaddd one(%rip), $TMP1, $CTR2       # CTR2 = TAG[127...32][00..01]
   2026     vpaddd two(%rip), $TMP1, $CTR3       # CTR3 = TAG[127...32][00..02]
   2027     vpaddd three(%rip), $TMP1, $CTR4     # CTR4 = TAG[127...32][00..03]
   2028     vpaddd four(%rip), $TMP1, $CTR5      # CTR5 = TAG[127...32][00..04]
   2029     vpaddd five(%rip), $TMP1, $CTR6      # CTR6 = TAG[127...32][00..05]
   2030     vpaddd six(%rip), $TMP1, $CTR7       # CTR7 = TAG[127...32][00..06]
   2031     vmovdqa $TMP1, $CTR1                 # CTR1 = TAG[127...32][00..00]
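             # The eighth counter lives in the scratch slot at (%r11): the eight
             # state registers, seven counter registers and the round-key register
             # already account for all sixteen xmm registers.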
   2032 
   2033     shrq \$3, $LEN
   2034     jz .L256_enc_msg_x8_check_remainder
   2035 
   2036     subq \$128, $CT
   2037     subq \$128, $PT
   2038 
   2039 .L256_enc_msg_x8_loop1:
   2040     addq \$128, $CT
   2041     addq \$128, $PT
   2042 
   2043     vmovdqa $CTR1, $STATE1
   2044     vmovdqa $CTR2, $STATE2
   2045     vmovdqa $CTR3, $STATE3
   2046     vmovdqa $CTR4, $STATE4
   2047     vmovdqa $CTR5, $STATE5
   2048     vmovdqa $CTR6, $STATE6
   2049     vmovdqa $CTR7, $STATE7
    2050     # load the eighth counter (CTR8) from the stack scratch slot
   2051     vmovdqa (%r11), $STATE8
   2052 
   2053     vpxor ($KS), $STATE1, $STATE1
   2054     vpxor ($KS), $STATE2, $STATE2
   2055     vpxor ($KS), $STATE3, $STATE3
   2056     vpxor ($KS), $STATE4, $STATE4
   2057     vpxor ($KS), $STATE5, $STATE5
   2058     vpxor ($KS), $STATE6, $STATE6
   2059     vpxor ($KS), $STATE7, $STATE7
   2060     vpxor ($KS), $STATE8, $STATE8
   2061 
   2062     ${\$aes_round8->(1)}
   2063     vmovdqa (%r11), $CTR7                # deal with CTR8
   2064     vpaddd eight(%rip), $CTR7, $CTR7
   2065     vmovdqa $CTR7, (%r11)
   2066     ${\$aes_round8->(2)}
   2067     vpsubd one(%rip), $CTR7, $CTR7
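             # The old CTR8 was CTR7 + 1, so (CTR8 + 8) - 1 leaves the updated CTR7
             # in %xmm14; the spilled counter is advanced without an extra register.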
   2068     ${\$aes_round8->(3)}
   2069     vpaddd eight(%rip), $CTR1, $CTR1
   2070     ${\$aes_round8->(4)}
   2071     vpaddd eight(%rip), $CTR2, $CTR2
   2072     ${\$aes_round8->(5)}
   2073     vpaddd eight(%rip), $CTR3, $CTR3
   2074     ${\$aes_round8->(6)}
   2075     vpaddd eight(%rip), $CTR4, $CTR4
   2076     ${\$aes_round8->(7)}
   2077     vpaddd eight(%rip), $CTR5, $CTR5
   2078     ${\$aes_round8->(8)}
   2079     vpaddd eight(%rip), $CTR6, $CTR6
   2080     ${\$aes_round8->(9)}
   2081     ${\$aes_round8->(10)}
   2082     ${\$aes_round8->(11)}
   2083     ${\$aes_round8->(12)}
   2084     ${\$aes_round8->(13)}
   2085     ${\$aes_lastround8->(14)}
   2086 
   2087     # XOR with Plaintext
   2088     vpxor 0*16($PT), $STATE1, $STATE1
   2089     vpxor 1*16($PT), $STATE2, $STATE2
   2090     vpxor 2*16($PT), $STATE3, $STATE3
   2091     vpxor 3*16($PT), $STATE4, $STATE4
   2092     vpxor 4*16($PT), $STATE5, $STATE5
   2093     vpxor 5*16($PT), $STATE6, $STATE6
   2094     vpxor 6*16($PT), $STATE7, $STATE7
   2095     vpxor 7*16($PT), $STATE8, $STATE8
   2096 
   2097     subq \$1, $LEN
   2098 
   2099     vmovdqu $STATE1, 0*16($CT)
   2100     vmovdqu $STATE2, 1*16($CT)
   2101     vmovdqu $STATE3, 2*16($CT)
   2102     vmovdqu $STATE4, 3*16($CT)
   2103     vmovdqu $STATE5, 4*16($CT)
   2104     vmovdqu $STATE6, 5*16($CT)
   2105     vmovdqu $STATE7, 6*16($CT)
   2106     vmovdqu $STATE8, 7*16($CT)
   2107 
   2108     jne .L256_enc_msg_x8_loop1
   2109 
   2110     addq \$128, $CT
   2111     addq \$128, $PT
   2112 
   2113 .L256_enc_msg_x8_check_remainder:
    2114     cmpq \$0, %r10
    2115     je .L256_enc_msg_x8_out
   2116 
   2117 .L256_enc_msg_x8_loop2:
   2118     # encrypt each block separately
   2119     # CTR1 is the highest counter (even if no LOOP done)
   2120     vmovdqa $CTR1, $STATE1
   2121     vpaddd one(%rip), $CTR1, $CTR1
   2122 
   2123     vpxor ($KS), $STATE1, $STATE1
   2124     vaesenc 16($KS), $STATE1, $STATE1
   2125     vaesenc 32($KS), $STATE1, $STATE1
   2126     vaesenc 48($KS), $STATE1, $STATE1
   2127     vaesenc 64($KS), $STATE1, $STATE1
   2128     vaesenc 80($KS), $STATE1, $STATE1
   2129     vaesenc 96($KS), $STATE1, $STATE1
   2130     vaesenc 112($KS), $STATE1, $STATE1
   2131     vaesenc 128($KS), $STATE1, $STATE1
   2132     vaesenc 144($KS), $STATE1, $STATE1
   2133     vaesenc 160($KS), $STATE1, $STATE1
   2134     vaesenc 176($KS), $STATE1, $STATE1
   2135     vaesenc 192($KS), $STATE1, $STATE1
   2136     vaesenc 208($KS), $STATE1, $STATE1
   2137     vaesenclast 224($KS), $STATE1, $STATE1
   2138 
   2139     # XOR with Plaintext
   2140     vpxor ($PT), $STATE1, $STATE1
   2141 
   2142     vmovdqu $STATE1, ($CT)
   2143 
   2144     addq \$16, $PT
   2145     addq \$16, $CT
   2146     subq \$1, %r10
   2147     jnz .L256_enc_msg_x8_loop2
   2148 
   2149 .L256_enc_msg_x8_out:
   2150     ret
   2152 .cfi_endproc
   2153 .size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
   2154 ___
   2155 }
   2156 aes256gcmsiv_enc_msg_x8();
   2157 aesgcmsiv_dec(1);
   2158 
   2159 sub aes256gcmsiv_kdf {
   2160   my $ONE = "%xmm8";
   2161   my $BLOCK1 = "%xmm4";
   2162   my $BLOCK2 = "%xmm6";
   2163   my $BLOCK3 = "%xmm7";
   2164   my $BLOCK4 = "%xmm11";
   2165   my $BLOCK5 = "%xmm12";
   2166   my $BLOCK6 = "%xmm13";
   2167 
   2168   my $enc_roundx6 = sub {
   2169     my ($i, $j) = @_;
   2170     return <<___;
   2171     vmovdqa ${\eval($i*16)}(%rdx), $j
   2172     vaesenc $j, $BLOCK1, $BLOCK1
   2173     vaesenc $j, $BLOCK2, $BLOCK2
   2174     vaesenc $j, $BLOCK3, $BLOCK3
   2175     vaesenc $j, $BLOCK4, $BLOCK4
   2176     vaesenc $j, $BLOCK5, $BLOCK5
   2177     vaesenc $j, $BLOCK6, $BLOCK6
   2178 ___
   2179   };
   2180 
   2181   my $enc_roundlastx6 = sub {
   2182     my ($i, $j) = @_;
   2183     return <<___;
   2184     vmovdqa ${\eval($i*16)}(%rdx), $j
   2185     vaesenclast $j, $BLOCK1, $BLOCK1
   2186     vaesenclast $j, $BLOCK2, $BLOCK2
   2187     vaesenclast $j, $BLOCK3, $BLOCK3
   2188     vaesenclast $j, $BLOCK4, $BLOCK4
   2189     vaesenclast $j, $BLOCK5, $BLOCK5
   2190     vaesenclast $j, $BLOCK6, $BLOCK6
   2191 ___
   2192   };
   2193 
   2194   # void aes256gcmsiv_kdf(const uint8_t nonce[16],
   2195   #                       uint8_t *out_key_material,
   2196   #                       const uint8_t *key_schedule);
   2197   $code.=<<___;
   2198 .globl aes256gcmsiv_kdf
   2199 .type aes256gcmsiv_kdf,\@function,3
   2200 .align 16
   2201 aes256gcmsiv_kdf:
   2202 .cfi_startproc
    2203 # parameter 1: %rdi                         Pointer to NONCE
    2204 # parameter 2: %rsi                         Pointer to output key material (CT)
    2205 # parameter 3: %rdx                         Pointer to the AES-256 key schedule
   2206 
    2207     vmovdqa (%rdx), %xmm1                  # xmm1 = round key 0 (first 16 bytes of the key schedule)
   2208     vmovdqa 0*16(%rdi), $BLOCK1
   2209     vmovdqa and_mask(%rip), $BLOCK4
   2210     vmovdqa one(%rip), $ONE
   2211     vpshufd \$0x90, $BLOCK1, $BLOCK1
   2212     vpand $BLOCK4, $BLOCK1, $BLOCK1
   2213     vpaddd $ONE, $BLOCK1, $BLOCK2
   2214     vpaddd $ONE, $BLOCK2, $BLOCK3
   2215     vpaddd $ONE, $BLOCK3, $BLOCK4
   2216     vpaddd $ONE, $BLOCK4, $BLOCK5
   2217     vpaddd $ONE, $BLOCK5, $BLOCK6
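             # BLOCK1..BLOCK6 are the six KDF input blocks: a little-endian 32-bit
             # counter (0 through 5) in the first dword followed by the 96-bit
             # nonce, as required by the AES-GCM-SIV key derivation.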
   2218 
   2219     vpxor %xmm1, $BLOCK1, $BLOCK1
   2220     vpxor %xmm1, $BLOCK2, $BLOCK2
   2221     vpxor %xmm1, $BLOCK3, $BLOCK3
   2222     vpxor %xmm1, $BLOCK4, $BLOCK4
   2223     vpxor %xmm1, $BLOCK5, $BLOCK5
   2224     vpxor %xmm1, $BLOCK6, $BLOCK6
   2225 
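             # Round keys 1..14 are loaded alternately into %xmm1 and %xmm2 so that
             # each load can issue while the previous key is still in use by the six
             # vaesenc instructions.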
   2226     ${\$enc_roundx6->(1, "%xmm1")}
   2227     ${\$enc_roundx6->(2, "%xmm2")}
   2228     ${\$enc_roundx6->(3, "%xmm1")}
   2229     ${\$enc_roundx6->(4, "%xmm2")}
   2230     ${\$enc_roundx6->(5, "%xmm1")}
   2231     ${\$enc_roundx6->(6, "%xmm2")}
   2232     ${\$enc_roundx6->(7, "%xmm1")}
   2233     ${\$enc_roundx6->(8, "%xmm2")}
   2234     ${\$enc_roundx6->(9, "%xmm1")}
   2235     ${\$enc_roundx6->(10, "%xmm2")}
   2236     ${\$enc_roundx6->(11, "%xmm1")}
   2237     ${\$enc_roundx6->(12, "%xmm2")}
   2238     ${\$enc_roundx6->(13, "%xmm1")}
   2239     ${\$enc_roundlastx6->(14, "%xmm2")}
   2240 
   2241     vmovdqa $BLOCK1, 0*16(%rsi)
   2242     vmovdqa $BLOCK2, 1*16(%rsi)
   2243     vmovdqa $BLOCK3, 2*16(%rsi)
   2244     vmovdqa $BLOCK4, 3*16(%rsi)
   2245     vmovdqa $BLOCK5, 4*16(%rsi)
   2246     vmovdqa $BLOCK6, 5*16(%rsi)
   2247     ret
   2248 .cfi_endproc
   2249 .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
   2250 ___
   2251 }
   2252 aes256gcmsiv_kdf();
   2253 
   2254 print $code;
   2255 
   2256 close STDOUT;
   2257