#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and looks them up in the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        666666
#       555555
#      444444
#     333333
#    222222
#   111111
#
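# As an illustration of the windowed strategy above (not part of this file's
# generated code), here is a hedged C sketch using ordinary integers, so the
# table, shifts, and additions are visible without GF(2^128)'s carry-less
# arithmetic and reduction. The function name is made up for this example.
#
#     uint64_t windowed_mul(uint32_t h, uint32_t x) {
#       uint64_t table[16];
#       for (int i = 0; i < 16; i++) {
#         table[i] = (uint64_t)h * (uint64_t)i;  // all 4-bit multiples of h
#       }
#       uint64_t acc = 0;
#       for (int i = 28; i >= 0; i -= 4) {  // most significant window first
#         acc <<= 4;                        // shift the terms accumulated so far
#         acc += table[(x >> i) & 0xf];     // look up the next window of x
#       }
#       return acc;
#     }
#
# In GHASH the additions become XORs and the product must be reduced modulo
# GCM's polynomial, but the table-and-shift structure is the same.
#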
# This implementation changes the order. We treat the table as a 16x16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        224466
#       113355
#      224466
#     113355
#    224466
#   113355
#
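# The pshufb instruction behaves as sixteen parallel table lookups: each byte
# of the index operand selects a byte of a 16-byte table register, or zero if
# its high bit is set. A scalar C model, for reference only:
#
#     void pshufb_model(uint8_t out[16], const uint8_t table[16],
#                       const uint8_t idx[16]) {
#       for (int i = 0; i < 16; i++) {
#         out[i] = (idx[i] & 0x80) ? 0 : table[idx[i] & 0x0f];
#       }
#     }
#
# With a transposed table row as the table register and 4-bit windows of X as
# indices, a single pshufb performs all the lookups for the terms that share
# that row.
#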
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a
# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
# the bytes in XMM registers for a consistent bit and byte ordering, but this
# means the least significant bit is the most significant coefficient and vice
# versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move bits from the MSB toward the LSB and
# correspond to increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

$output = pop;
open STDOUT, ">$output";

&asm_init($ARGV[0]);

my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
&static_label("reverse_bytes");
&static_label("low4_mask");

my $call_counter = 0;
# process_rows emits assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
# low and high halves of the input. The result so far is passed in xmm2. xmm3
# must be zero. On output, $Htable is advanced to the next row and xmm2 is
# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
sub process_rows {
	my ($rows) = @_;
	$call_counter++;

	# Shifting whole XMM registers by bits is complex. psrldq shifts by
	# bytes, and psrlq shifts the two 64-bit halves separately. Each row
	# produces 8 bits of carry, and the reduction needs an additional 7-bit
	# shift. This must fit in 64 bits so reduction can use psrlq. This
	# allows up to 7 rows at a time.
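	# For the 5 + 5 + 6 split used by the callers below, the largest chunk
	# gives 6*8 + 7 = 55 <= 64 bits, and 5 + 5 + 6 = 16 covers every row.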
	die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);

	&mov("eax", $rows);
&set_label("loop_row_$call_counter");
	&movdqa("xmm4", &QWP(0, $Htable));
	&lea($Htable, &DWP(16, $Htable));

	# Right-shift xmm2 and xmm3, treated as one 256-bit value, by 8 bits
	# (one byte).
	&movdqa("xmm6", "xmm2");
	&palignr("xmm6", "xmm3", 1);
	&movdqa("xmm3", "xmm6");
	&psrldq("xmm2", 1);
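	# That is, with shift counts in bits and registers as 128-bit values:
	#   xmm3 = (xmm3 >> 8) | ((xmm2 & 0xff) << 120);  xmm2 >>= 8;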

	# Load the next table row and index the low and high bits of the input.
	# Note the low (respectively, high) half corresponds to more
	# (respectively, less) significant coefficients.
	&movdqa("xmm5", "xmm4");
	&pshufb("xmm4", "xmm0");
	&pshufb("xmm5", "xmm1");

	# Add the high half (xmm5) without shifting.
	&pxor("xmm2", "xmm5");

	# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
	# add into the carry register (xmm3).
	&movdqa("xmm5", "xmm4");
	&psllq("xmm5", 60);
	&movdqa("xmm6", "xmm5");
	&pslldq("xmm6", 8);
	&pxor("xmm3", "xmm6");

	# Next, add into xmm2.
	&psrldq("xmm5", 8);
	&pxor("xmm2", "xmm5");
	&psrlq("xmm4", 4);
	&pxor("xmm2", "xmm4");
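	# Net effect, writing t for the row value looked up into xmm4 above and
	# treating registers as 128-bit values:
	#   xmm2 ^= t >> 4;             // shift across the whole register
	#   xmm3 ^= (t & 0xf) << 124;   // the four bits that fall off the bottom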

	&sub("eax", 1);
	&jnz(&label("loop_row_$call_counter"));

	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
	# x^7, so we shift and XOR four times.
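	# Equivalently, treating xmm3 as a 128-bit value:
	#   xmm2 ^= xmm3 ^ (xmm3 >> 1) ^ (xmm3 >> 2) ^ (xmm3 >> 7);
	# All carry bits sit in the top 64 bits of xmm3 (see the check above), so
	# the per-half psrlq shifts below never lose bits across the halves.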
	&pxor("xmm2", "xmm3");	# x^0 = 1
	&psrlq("xmm3", 1);
	&pxor("xmm2", "xmm3");	# x^1 = x
	&psrlq("xmm3", 1);
	&pxor("xmm2", "xmm3");	# x^(1+1) = x^2
	&psrlq("xmm3", 5);
	&pxor("xmm2", "xmm3");	# x^(1+1+5) = x^7
	&pxor("xmm3", "xmm3");
}

# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
&function_begin("gcm_gmult_ssse3");
	&mov($Xi, &wparam(0));
	&mov($Htable, &wparam(1));

	&movdqu("xmm0", &QWP(0, $Xi));
	&call(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));

	# Reverse input bytes to deserialize.
	&pshufb("xmm0", "xmm7");

	# Split each byte into low (xmm0) and high (xmm1) halves.
	&movdqa("xmm1", "xmm2");
	&pandn("xmm1", "xmm0");
	&psrld("xmm1", 4);
	&pand("xmm0", "xmm2");
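	# A scalar model of the split, for each input byte b:
	#   xmm0's byte = b & 0x0f;   xmm1's byte = b >> 4;
	# The pandn/psrld pair works because the masked-off low nibbles leave
	# zeros for psrld to shift across the byte boundaries within each dword.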

	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
	# that, due to bit reversal, xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	&pxor("xmm2", "xmm2");
	&pxor("xmm3", "xmm3");

	# We must reduce at least once every 7 rows, so divide into three
	# chunks.
	&process_rows(5);
	&process_rows(5);
	&process_rows(6);

	# Store the result. Reverse bytes to serialize.
	&pshufb("xmm2", "xmm7");
	&movdqu(&QWP(0, $Xi), "xmm2");

	# Zero any registers which contain secrets.
	&pxor("xmm0", "xmm0");
	&pxor("xmm1", "xmm1");
	&pxor("xmm2", "xmm2");
	&pxor("xmm3", "xmm3");
	&pxor("xmm4", "xmm4");
	&pxor("xmm5", "xmm5");
	&pxor("xmm6", "xmm6");
&function_end("gcm_gmult_ssse3");

# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
#                      size_t len);
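#
# Conceptually, each 16-byte block performs the usual GHASH update
# Xi = (Xi ^ block) * H in GF(2^128). A hedged C-style sketch of the loop this
# function implements (the helper names are illustrative only, and |Htable|
# must already have been built from the hash key H):
#
#     for (size_t i = 0; i + 16 <= len; i += 16) {
#       xor_into_xi(Xi, in + i);      // fold in the next block
#       gf128_mul_by_h(Xi, Htable);   // multiply by H via the table, as above
#     }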
&function_begin("gcm_ghash_ssse3");
	&mov($Xi, &wparam(0));
	&mov($Htable, &wparam(1));
	&mov($in, &wparam(2));
	&mov($len, &wparam(3));

	&movdqu("xmm0", &QWP(0, $Xi));
	&call(&label("pic_point"));
&set_label("pic_point");
	&blindpop("ebx");
	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));

	# This function only processes whole blocks.
	&and($len, -16);

	# Reverse input bytes to deserialize. We maintain the running
	# total in xmm0.
	&pshufb("xmm0", "xmm7");

	# Iterate over each block. On entry to each iteration, xmm3 is zero.
	&pxor("xmm3", "xmm3");
&set_label("loop_ghash");
	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));

	# Incorporate the next block of input.
	&movdqu("xmm1", &QWP(0, $in));
	&pshufb("xmm1", "xmm7");	# Reverse bytes.
	&pxor("xmm0", "xmm1");

	# Split each byte into low (xmm0) and high (xmm1) halves.
	&movdqa("xmm1", "xmm2");
	&pandn("xmm1", "xmm0");
	&psrld("xmm1", 4);
	&pand("xmm0", "xmm2");

	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
	# that, due to bit reversal, xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	&pxor("xmm2", "xmm2");
	# xmm3 is already zero at this point.

	# We must reduce at least once every 7 rows, so divide into three
	# chunks.
	&process_rows(5);
	&process_rows(5);
	&process_rows(6);

	&movdqa("xmm0", "xmm2");

	# Rewind $Htable for the next iteration.
	&lea($Htable, &DWP(-256, $Htable));

	# Advance input and continue.
	&lea($in, &DWP(16, $in));
	&sub($len, 16);
	&jnz(&label("loop_ghash"));

	# Reverse bytes and store the result.
	&pshufb("xmm0", "xmm7");
	&movdqu(&QWP(0, $Xi), "xmm0");

	# Zero any registers which contain secrets.
	&pxor("xmm0", "xmm0");
	&pxor("xmm1", "xmm1");
	&pxor("xmm2", "xmm2");
	&pxor("xmm3", "xmm3");
	&pxor("xmm4", "xmm4");
	&pxor("xmm5", "xmm5");
	&pxor("xmm6", "xmm6");
&function_end("gcm_ghash_ssse3");

# reverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
&set_label("reverse_bytes", 16);
&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
# low4_mask is an XMM mask which selects the low four bits of each byte.
&set_label("low4_mask", 16);
&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);

&asm_finish();

close STDOUT;