Home | History | Annotate | Download | only in crc32
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "textflag.h"
      6 
      7 // Vector register range containing CRC-32 constants
      8 
      9 #define CONST_PERM_LE2BE        V9
     10 #define CONST_R2R1              V10
     11 #define CONST_R4R3              V11
     12 #define CONST_R5                V12
     13 #define CONST_RU_POLY           V13
     14 #define CONST_CRC_POLY          V14
     15 
     16 
     17 // The CRC-32 constant block contains reduction constants to fold and
     18 // process particular chunks of the input data stream in parallel.
     19 //
     20 // Note that the constant definitions below are extended in order to compute
     21 // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
     22 // The rightmost doubleword can be 0 to prevent contribution to the result or
     23 // can be multiplied by 1 to perform an XOR without the need for a separate
     24 // VECTOR EXCLUSIVE OR instruction.
     25 //
     26 // The polynomials used are bit-reflected:
     27 //
     28 //            IEEE: P'(x) = 0x0edb88320
     29 //      Castagnoli: P'(x) = 0x082f63b78
     30 
     31 
     32 // IEEE polynomial constants
     33 DATA    crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
     34 DATA    crclecons+8(SB)/8,  $0x0706050403020100
     35 DATA    crclecons+16(SB)/8, $0x00000001c6e41596       // R2
     36 DATA    crclecons+24(SB)/8, $0x0000000154442bd4       // R1
     37 DATA    crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
     38 DATA    crclecons+40(SB)/8, $0x00000001751997d0       // R3
     39 DATA    crclecons+48(SB)/8, $0x0000000000000000
     40 DATA    crclecons+56(SB)/8, $0x0000000163cd6124       // R5
     41 DATA    crclecons+64(SB)/8, $0x0000000000000000
     42 DATA    crclecons+72(SB)/8, $0x00000001F7011641       // u'
     43 DATA    crclecons+80(SB)/8, $0x0000000000000000
     44 DATA    crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
     45 
     46 GLOBL    crclecons(SB),RODATA, $144
     47 
     48 // Castagonli Polynomial constants
     49 DATA    crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
     50 DATA    crcclecons+8(SB)/8,  $0x0706050403020100
     51 DATA    crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
     52 DATA    crcclecons+24(SB)/8, $0x00000000740eef02      // R1
     53 DATA    crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
     54 DATA    crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
     55 DATA    crcclecons+48(SB)/8, $0x0000000000000000
     56 DATA    crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
     57 DATA    crcclecons+64(SB)/8, $0x0000000000000000
     58 DATA    crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
     59 DATA    crcclecons+80(SB)/8, $0x0000000000000000
     60 DATA    crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
     61 
     62 GLOBL   crcclecons(SB),RODATA, $144
     63 
     64 // func hasVectorFacility() bool
     65 TEXT hasVectorFacility(SB),NOSPLIT,$24-1
     66 	MOVD    $x-24(SP), R1
     67 	XC      $24, 0(R1), 0(R1) // clear the storage
     68 	MOVD    $2, R0            // R0 is the number of double words stored -1
     69 	WORD    $0xB2B01000       // STFLE 0(R1)
     70 	XOR     R0, R0            // reset the value of R0
     71 	MOVBZ   z-8(SP), R1
     72 	AND     $0x40, R1
     73 	BEQ     novector
     74 vectorinstalled:
     75 	// check if the vector instruction has been enabled
     76 	VLEIB   $0, $0xF, V16
     77 	VLGVB   $0, V16, R1
     78 	CMPBNE  R1, $0xF, novector
     79 	MOVB    $1, ret+0(FP) // have vx
     80 	RET
     81 novector:
     82 	MOVB    $0, ret+0(FP) // no vx
     83 	RET
     84 
     85 
     86 // The CRC-32 function(s) use these calling conventions:
     87 //
     88 // Parameters:
     89 //
     90 //      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
     91 //      R3:    Input buffer pointer, performance might be improved if the
     92 //             buffer is on a doubleword boundary.
     93 //      R4:    Length of the buffer, must be 64 bytes or greater.
     94 //
     95 // Register usage:
     96 //
     97 //      R5:     CRC-32 constant pool base pointer.
     98 //      V0:     Initial CRC value and intermediate constants and results.
     99 //      V1..V4: Data for CRC computation.
    100 //      V5..V8: Next data chunks that are fetched from the input buffer.
    101 //
    102 //      V9..V14: CRC-32 constants.
    103 
    104 // func vectorizedIEEE(crc uint32, p []byte) uint32
    105 TEXT vectorizedIEEE(SB),NOSPLIT,$0
    106 	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
    107 	MOVD    p+8(FP), R3       // data pointer
    108 	MOVD    p_len+16(FP), R4  // len(p)
    109 
    110 	MOVD    $crclecons(SB), R5
    111 	BR      vectorizedBody<>(SB)
    112 
    113 // func vectorizedCastagnoli(crc uint32, p []byte) uint32
    114 TEXT vectorizedCastagnoli(SB),NOSPLIT,$0
    115 	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
    116 	MOVD    p+8(FP), R3       // data pointer
    117 	MOVD    p_len+16(FP), R4  // len(p)
    118 
    119 	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
    120 	MOVD    $crcclecons(SB), R5
    121 	BR      vectorizedBody<>(SB)
    122 
    123 TEXT vectorizedBody<>(SB),NOSPLIT,$0
    124 	XOR     $0xffffffff, R2 // NOTW R2
    125 	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
    126 
    127 	// Load the initial CRC value into the rightmost word of V0
    128 	VZERO   V0
    129 	VLVGF   $3, R2, V0
    130 
    131 	// Crash if the input size is less than 64-bytes.
    132 	CMP     R4, $64
    133 	BLT     crash
    134 
    135 	// Load a 64-byte data chunk and XOR with CRC
    136 	VLM     0(R3), V1, V4    // 64-bytes into V1..V4
    137 
    138 	// Reflect the data if the CRC operation is in the bit-reflected domain
    139 	VPERM   V1, V1, CONST_PERM_LE2BE, V1
    140 	VPERM   V2, V2, CONST_PERM_LE2BE, V2
    141 	VPERM   V3, V3, CONST_PERM_LE2BE, V3
    142 	VPERM   V4, V4, CONST_PERM_LE2BE, V4
    143 
    144 	VX      V0, V1, V1     // V1 ^= CRC
    145 	ADD     $64, R3        // BUF = BUF + 64
    146 	ADD     $(-64), R4
    147 
    148 	// Check remaining buffer size and jump to proper folding method
    149 	CMP     R4, $64
    150 	BLT     less_than_64bytes
    151 
    152 fold_64bytes_loop:
    153 	// Load the next 64-byte data chunk into V5 to V8
    154 	VLM     0(R3), V5, V8
    155 	VPERM   V5, V5, CONST_PERM_LE2BE, V5
    156 	VPERM   V6, V6, CONST_PERM_LE2BE, V6
    157 	VPERM   V7, V7, CONST_PERM_LE2BE, V7
    158 	VPERM   V8, V8, CONST_PERM_LE2BE, V8
    159 
    160 
    161 	// Perform a GF(2) multiplication of the doublewords in V1 with
    162 	// the reduction constants in V0.  The intermediate result is
    163 	// then folded (accumulated) with the next data chunk in V5 and
    164 	// stored in V1.  Repeat this step for the register contents
    165 	// in V2, V3, and V4 respectively.
    166 
    167 	VGFMAG  CONST_R2R1, V1, V5, V1
    168 	VGFMAG  CONST_R2R1, V2, V6, V2
    169 	VGFMAG  CONST_R2R1, V3, V7, V3
    170 	VGFMAG  CONST_R2R1, V4, V8 ,V4
    171 
    172 	// Adjust buffer pointer and length for next loop
    173 	ADD     $64, R3                  // BUF = BUF + 64
    174 	ADD     $(-64), R4               // LEN = LEN - 64
    175 
    176 	CMP     R4, $64
    177 	BGE     fold_64bytes_loop
    178 
    179 less_than_64bytes:
    180 	// Fold V1 to V4 into a single 128-bit value in V1
    181 	VGFMAG  CONST_R4R3, V1, V2, V1
    182 	VGFMAG  CONST_R4R3, V1, V3, V1
    183 	VGFMAG  CONST_R4R3, V1, V4, V1
    184 
    185 	// Check whether to continue with 64-bit folding
    186 	CMP R4, $16
    187 	BLT final_fold
    188 
    189 fold_16bytes_loop:
    190 	VL      0(R3), V2               // Load next data chunk
    191 	VPERM   V2, V2, CONST_PERM_LE2BE, V2
    192 
    193 	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
    194 
    195 	// Adjust buffer pointer and size for folding next data chunk
    196 	ADD     $16, R3
    197 	ADD     $-16, R4
    198 
    199 	// Process remaining data chunks
    200 	CMP     R4 ,$16
    201 	BGE     fold_16bytes_loop
    202 
    203 final_fold:
    204 	VLEIB   $7, $0x40, V9
    205 	VSRLB   V9, CONST_R4R3, V0
    206 	VLEIG   $0, $1, V0
    207 
    208 	VGFMG   V0, V1, V1
    209 
    210 	VLEIB   $7, $0x20, V9         // Shift by words
    211 	VSRLB   V9, V1, V2            // Store remaining bits in V2
    212 	VUPLLF  V1, V1                // Split rightmost doubleword
    213 	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
    214 
    215 
    216 	// The input values to the Barret reduction are the degree-63 polynomial
    217 	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
    218 	// constant u.  The Barret reduction result is the CRC value of R(x) mod
    219 	// P(x).
    220 	//
    221 	// The Barret reduction algorithm is defined as:
    222 	//
    223 	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
    224 	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
    225 	//    3. C(x)  = R(x) XOR T2(x) mod x^32
    226 	//
    227 	// Note: To compensate the division by x^32, use the vector unpack
    228 	// instruction to move the leftmost word into the leftmost doubleword
    229 	// of the vector register.  The rightmost doubleword is multiplied
    230 	// with zero to not contribute to the intermedate results.
    231 
    232 
    233 	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
    234 	VUPLLF  V1, V2
    235 	VGFMG   CONST_RU_POLY, V2, V2
    236 
    237 
    238 	// Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
    239 	// V2 and XOR the intermediate result, T2(x),  with the value in V1.
    240 	// The final result is in the rightmost word of V2.
    241 
    242 	VUPLLF  V2 , V2
    243 	VGFMAG  CONST_CRC_POLY, V2, V1, V2
    244 
    245 done:
    246 	VLGVF   $2, V2, R2
    247 	XOR     $0xffffffff, R2 // NOTW R2
    248 	MOVWZ   R2, ret + 32(FP)
    249 	RET
    250 
    251 crash:
    252 	MOVD    $0, (R0) // input size is less than 64-bytes
    253