// Go standard library hash/crc32: AMD64 assembly (crc32_amd64.s).
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "textflag.h"
      6 
      7 // castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
      8 //
      9 // func castagnoliSSE42(crc uint32, p []byte) uint32
	//
	// Strategy: consume 1/2/4 bytes until SI is 8-byte aligned, process
	// the bulk of the input 8 bytes at a time with CRC32Q, then mop up
	// the <8-byte tail with 4/2/1-byte steps selected by the bits of the
	// remaining length.
     10 TEXT castagnoliSSE42(SB),NOSPLIT,$0
     11 	MOVL crc+0(FP), AX  // CRC value
     12 	MOVQ p+8(FP), SI  // data pointer
     13 	MOVQ p_len+16(FP), CX  // len(p)
     14 
     15 	// If there are fewer than 8 bytes to process, skip alignment.
     16 	CMPQ CX, $8
     17 	JL less_than_8
     18 
     19 	MOVQ SI, BX
     20 	ANDQ $7, BX  // BX = SI mod 8 (misalignment)
     21 	JZ aligned
     22 
     23 	// Process the first few bytes to 8-byte align the input.
     24 
     25 	// BX = 8 - BX. We need to process this many bytes to align.
	// (For BX in 1..7, (BX-1) XOR 7 == 8-BX.)
     26 	SUBQ $1, BX
     27 	XORQ $7, BX
     28 
	// Bits 0..2 of BX now select the 1-, 2- and 4-byte alignment steps.
     29 	BTQ $0, BX
     30 	JNC align_2
     31 
     32 	CRC32B (SI), AX
     33 	DECQ CX
     34 	INCQ SI
     35 
     36 align_2:
     37 	BTQ $1, BX
     38 	JNC align_4
     39 
     40 	// CRC32W (SI), AX
	// Hand-assembled encoding of the instruction named above.
     41 	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
     42 
     43 	SUBQ $2, CX
     44 	ADDQ $2, SI
     45 
     46 align_4:
     47 	BTQ $2, BX
     48 	JNC aligned
     49 
     50 	// CRC32L (SI), AX
	// Hand-assembled encoding of the instruction named above.
     51 	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
     52 
     53 	SUBQ $4, CX
     54 	ADDQ $4, SI
     55 
     56 aligned:
     57 	// The input is now 8-byte aligned and we can process 8-byte chunks.
     58 	CMPQ CX, $8
     59 	JL less_than_8
     60 
     61 	CRC32Q (SI), AX
     62 	ADDQ $8, SI
     63 	SUBQ $8, CX
     64 	JMP aligned
     65 
     66 less_than_8:
     67 	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX < 8 here, so its low three bits enumerate the remaining bytes.
     68 	BTQ $2, CX
     69 	JNC less_than_4
     70 
     71 	// CRC32L (SI), AX
     72 	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
     73 	ADDQ $4, SI
     74 
     75 less_than_4:
     76 	BTQ $1, CX
     77 	JNC less_than_2
     78 
     79 	// CRC32W (SI), AX
     80 	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
     81 	ADDQ $2, SI
     82 
     83 less_than_2:
     84 	BTQ $0, CX
     85 	JNC done
     86 
     87 	CRC32B (SI), AX
     88 
     89 done:
     90 	MOVL AX, ret+32(FP)
     91 	RET
     92 
     93 // castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
     94 // bytes from each buffer.
     95 //
     96 // func castagnoliSSE42Triple(
     97 //     crc1, crc2, crc3 uint32,
     98 //     a, b, c []byte,
     99 //     rounds uint32,
    100 // ) (retA uint32, retB uint32, retC uint32)
	//
	// The three CRC chains are independent, so their CRC32Q instructions
	// are interleaved, letting the CPU overlap the work on the three
	// accumulators (AX, CX, DX).
	// NOTE(review): the loop is do-while (DECQ/JNZ), so it always runs at
	// least once — callers must pass rounds >= 1 and buffers of at least
	// 24*rounds bytes; confirm against the Go-side callers.
    101 TEXT castagnoliSSE42Triple(SB),NOSPLIT,$0
    102 	MOVL crcA+0(FP), AX
    103 	MOVL crcB+4(FP), CX
    104 	MOVL crcC+8(FP), DX
    105 
    106 	MOVQ a+16(FP), R8   // data pointer
    107 	MOVQ b+40(FP), R9   // data pointer
    108 	MOVQ c+64(FP), R10  // data pointer
    109 
    110 	MOVL rounds+88(FP), R11
    111 
    112 loop:
	// Three 8-byte CRC steps per buffer per round = 24 bytes each.
    113 	CRC32Q (R8), AX
    114 	CRC32Q (R9), CX
    115 	CRC32Q (R10), DX
    116 
    117 	CRC32Q 8(R8), AX
    118 	CRC32Q 8(R9), CX
    119 	CRC32Q 8(R10), DX
    120 
    121 	CRC32Q 16(R8), AX
    122 	CRC32Q 16(R9), CX
    123 	CRC32Q 16(R10), DX
    124 
    125 	ADDQ $24, R8
    126 	ADDQ $24, R9
    127 	ADDQ $24, R10
    128 
    129 	DECQ R11
    130 	JNZ loop
    131 
    132 	MOVL AX, retA+96(FP)
    133 	MOVL CX, retB+100(FP)
    134 	MOVL DX, retC+104(FP)
    135 	RET
    136 
    137 // func haveSSE42() bool
	//
	// Reports whether the CPU supports SSE4.2 (required for the CRC32
	// instruction): CPUID leaf 1 returns feature flags in CX; SSE4.2 is
	// bit 20.
    138 TEXT haveSSE42(SB),NOSPLIT,$0
    139 	XORQ AX, AX
    140 	INCL AX  // AX = 1: select CPUID leaf 1 (feature information)
    141 	CPUID  // clobbers AX, BX, CX, DX
    142 	SHRQ $20, CX
    143 	ANDQ $1, CX  // isolate the SSE4.2 feature bit
    144 	MOVB CX, ret+0(FP)
    145 	RET
    146 
    147 // func haveCLMUL() bool
	//
	// Reports whether the CPU supports the PCLMULQDQ (carry-less
	// multiply) instruction: CPUID leaf 1, CX bit 1.
    148 TEXT haveCLMUL(SB),NOSPLIT,$0
    149 	XORQ AX, AX
    150 	INCL AX  // AX = 1: select CPUID leaf 1 (feature information)
    151 	CPUID  // clobbers AX, BX, CX, DX
    152 	SHRQ $1, CX
    153 	ANDQ $1, CX  // isolate the PCLMULQDQ feature bit
    154 	MOVB CX, ret+0(FP)
    155 	RET
    156 
    157 // func haveSSE41() bool
	//
	// Reports whether the CPU supports SSE4.1 (ieeeCLMUL needs it for
	// PEXTRD): CPUID leaf 1, CX bit 19.
    158 TEXT haveSSE41(SB),NOSPLIT,$0
    159 	XORQ AX, AX
    160 	INCL AX  // AX = 1: select CPUID leaf 1 (feature information)
    161 	CPUID  // clobbers AX, BX, CX, DX
    162 	SHRQ $19, CX
    163 	ANDQ $1, CX  // isolate the SSE4.1 feature bit
    164 	MOVB CX, ret+0(FP)
    165 	RET
    166 
    167 // CRC32 polynomial data
    168 //
    169 // These constants are lifted from the
    170 // Linux kernel, since they avoid the costly
    171 // PSHUFB 16 byte reversal proposed in the
    172 // original Intel paper.
	//
	// Used only by ieeeCLMUL below:
	//   r2r1   - folding multipliers for the 64-byte main loop
	//   r4r3   - folding multipliers for reducing to one 16-byte lane
	//   rupoly - Barrett-reduction constants for the final 64->32 bit step
	//   r5     - multiplier for folding 64 bits down to 32 bits
    173 DATA r2r1<>+0(SB)/8, $0x154442bd4
    174 DATA r2r1<>+8(SB)/8, $0x1c6e41596
    175 DATA r4r3<>+0(SB)/8, $0x1751997d0
    176 DATA r4r3<>+8(SB)/8, $0x0ccaa009e
    177 DATA rupoly<>+0(SB)/8, $0x1db710641
    178 DATA rupoly<>+8(SB)/8, $0x1f7011641
    179 DATA r5<>+0(SB)/8, $0x163cd6124
    180 
    181 GLOBL r2r1<>(SB),RODATA,$16
    182 GLOBL r4r3<>(SB),RODATA,$16
    183 GLOBL rupoly<>(SB),RODATA,$16
    184 GLOBL r5<>(SB),RODATA,$8
    185 
    186 // Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
    187 // len(p) must be at least 64, and must be a multiple of 16.
    188 
    189 // func ieeeCLMUL(crc uint32, p []byte) uint32
    190 TEXT ieeeCLMUL(SB),NOSPLIT,$0
    191 	MOVL   crc+0(FP), X0             // Initial CRC value
    192 	MOVQ   p+8(FP), SI  	         // data pointer
    193 	MOVQ   p_len+16(FP), CX          // len(p)
    194 
    195 	MOVOU  (SI), X1
    196 	MOVOU  16(SI), X2
    197 	MOVOU  32(SI), X3
    198 	MOVOU  48(SI), X4
    199 	PXOR   X0, X1
    200 	ADDQ   $64, SI                  // buf+=64
    201 	SUBQ   $64, CX                  // len-=64
    202 	CMPQ   CX, $64                  // Less than 64 bytes left
    203 	JB     remain64
    204 
    205 	MOVOA  r2r1<>+0(SB), X0
    206 loopback64:
    207 	MOVOA  X1, X5
    208 	MOVOA  X2, X6
    209 	MOVOA  X3, X7
    210 	MOVOA  X4, X8
    211 
    212 	PCLMULQDQ $0, X0, X1
    213 	PCLMULQDQ $0, X0, X2
    214 	PCLMULQDQ $0, X0, X3
    215 	PCLMULQDQ $0, X0, X4
    216 
    217 	/* Load next early */
    218 	MOVOU    (SI), X11
    219 	MOVOU    16(SI), X12
    220 	MOVOU    32(SI), X13
    221 	MOVOU    48(SI), X14
    222 
    223 	PCLMULQDQ $0x11, X0, X5
    224 	PCLMULQDQ $0x11, X0, X6
    225 	PCLMULQDQ $0x11, X0, X7
    226 	PCLMULQDQ $0x11, X0, X8
    227 
    228 	PXOR     X5, X1
    229 	PXOR     X6, X2
    230 	PXOR     X7, X3
    231 	PXOR     X8, X4
    232 
    233 	PXOR     X11, X1
    234 	PXOR     X12, X2
    235 	PXOR     X13, X3
    236 	PXOR     X14, X4
    237 
    238 	ADDQ    $0x40, DI
    239 	ADDQ    $64, SI      // buf+=64
    240 	SUBQ    $64, CX      // len-=64
    241 	CMPQ    CX, $64      // Less than 64 bytes left?
    242 	JGE     loopback64
    243 
    244 	/* Fold result into a single register (X1) */
    245 remain64:
    246 	MOVOA       r4r3<>+0(SB), X0
    247 
    248 	MOVOA       X1, X5
    249 	PCLMULQDQ   $0, X0, X1
    250 	PCLMULQDQ   $0x11, X0, X5
    251 	PXOR        X5, X1
    252 	PXOR        X2, X1
    253 
    254 	MOVOA       X1, X5
    255 	PCLMULQDQ   $0, X0, X1
    256 	PCLMULQDQ   $0x11, X0, X5
    257 	PXOR        X5, X1
    258 	PXOR        X3, X1
    259 
    260 	MOVOA       X1, X5
    261 	PCLMULQDQ   $0, X0, X1
    262 	PCLMULQDQ   $0x11, X0, X5
    263 	PXOR        X5, X1
    264 	PXOR        X4, X1
    265 
    266 	/* If there is less than 16 bytes left we are done */
    267 	CMPQ        CX, $16
    268 	JB          finish
    269 
    270 	/* Encode 16 bytes */
    271 remain16:
    272 	MOVOU       (SI), X10
    273 	MOVOA       X1, X5
    274 	PCLMULQDQ   $0, X0, X1
    275 	PCLMULQDQ   $0x11, X0, X5
    276 	PXOR        X5, X1
    277 	PXOR        X10, X1
    278 	SUBQ        $16, CX
    279 	ADDQ        $16, SI
    280 	CMPQ        CX, $16
    281 	JGE         remain16
    282 
    283 finish:
    284 	/* Fold final result into 32 bits and return it */
    285 	PCMPEQB     X3, X3
    286 	PCLMULQDQ   $1, X1, X0
    287 	PSRLDQ      $8, X1
    288 	PXOR        X0, X1
    289 
    290 	MOVOA       X1, X2
    291 	MOVQ        r5<>+0(SB), X0
    292 
    293 	/* Creates 32 bit mask. Note that we don't care about upper half. */
    294 	PSRLQ       $32, X3
    295 
    296 	PSRLDQ      $4, X2
    297 	PAND        X3, X1
    298 	PCLMULQDQ   $0, X0, X1
    299 	PXOR        X2, X1
    300 
    301 	MOVOA       rupoly<>+0(SB), X0
    302 
    303 	MOVOA       X1, X2
    304 	PAND        X3, X1
    305 	PCLMULQDQ   $0x10, X0, X1
    306 	PAND        X3, X1
    307 	PCLMULQDQ   $0, X0, X1
    308 	PXOR        X2, X1
    309 
    310 	PEXTRD	$1, X1, AX
    311 	MOVL        AX, ret+32(FP)
    312 
    313 	RET
    314