Home | History | Annotate | Download | only in sha256
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Based on CRYPTOGAMS code with the following comment:
      6 // # ====================================================================
      7 // # Written by Andy Polyakov <appro (at) openssl.org> for the OpenSSL
      8 // # project. The module is, however, dual licensed under OpenSSL and
      9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
     10 // # details see http://www.openssl.org/~appro/cryptogams/.
     11 // # ====================================================================
     12 
     13 #include "textflag.h"
     14 
     15 // SHA256 block routine. See sha256block.go for Go equivalent.
     16 //
     17 // The algorithm is detailed in FIPS 180-4:
     18 //
     19 //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
     20 //
     21 // Wt = Mt; for 0 <= t <= 15
      22 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
     23 //
     24 // a = H0
     25 // b = H1
     26 // c = H2
     27 // d = H3
     28 // e = H4
     29 // f = H5
     30 // g = H6
     31 // h = H7
     32 //
     33 // for t = 0 to 63 {
     34 //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
     35 //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
     36 //    h = g
     37 //    g = f
     38 //    f = e
     39 //    e = d + T1
     40 //    d = c
     41 //    c = b
     42 //    b = a
     43 //    a = T1 + T2
     44 // }
     45 //
     46 // H0 = a + H0
     47 // H1 = b + H1
     48 // H2 = c + H2
     49 // H3 = d + H3
     50 // H4 = e + H4
     51 // H5 = f + H5
     52 // H6 = g + H6
     53 // H7 = h + H7
     54 
      55 #define CTX	R3	// pointer to the digest state (the dig argument)
      56 #define INP	R4	// current read position in the input; starts at p_base
      57 #define END	R5	// INP + LEN: address just past the last whole 64-byte block
      58 #define TBL	R6	// base address of the kcon table
      59 #define IDX	R7	// byte offset into kcon of the next entry to load
      60 #define CNT	R8	// set from R0 in block; no other use is visible in this file
      61 #define LEN	R9	// p_len rounded down to a multiple of 64
      62 #define OFFLOAD	R11	// copy of R1; base address where a-h are saved for each block
      63 #define TEMP	R12	// scratch; carries the loop count into CTR
      64 
      65 #define HEX00	R0	// used as a zero index (assumes R0 holds 0 — confirm against the Go ppc64 register conventions)
      66 #define HEX10	R10	// HEX10-HEX70 are loaded with 0x10-0x70 in block and
      67 #define HEX20	R25	// used as index registers for indexed vector loads/stores
      68 #define HEX30	R26
      69 #define HEX40	R27
      70 #define HEX50	R28
      71 #define HEX60	R29
      72 #define HEX70	R31
      73 
      74 // V0-V7 hold the working variables a-h (A-H)
      75 // V8-V23 are used for the message schedule (sixteen words, one register each)
      76 #define KI	V24	// round constant loaded ahead of use from kcon
      77 #define FUNC	V25	// scratch for Ch(e,f,g), then Maj(a,b,c)
      78 #define S0	V26	// BIGSIGMA0(a)
      79 #define S1	V27	// BIGSIGMA1(e)
      80 #define s0	V28	// message-schedule sigma0 term
      81 #define s1	V29	// message-schedule sigma1 term
      82 #define LEMASK	V31	// Permutation control register for little endian
     83 
      84 // 4 copies of each Kt, to fill all 4 words of a vector register
      85 DATA  kcon+0x000(SB)/8, $0x428a2f98428a2f98	// K[0]; entry t lives at offset 16*t
      86 DATA  kcon+0x008(SB)/8, $0x428a2f98428a2f98
      87 DATA  kcon+0x010(SB)/8, $0x7137449171374491
      88 DATA  kcon+0x018(SB)/8, $0x7137449171374491
      89 DATA  kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
      90 DATA  kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
      91 DATA  kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
      92 DATA  kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
      93 DATA  kcon+0x040(SB)/8, $0x3956c25b3956c25b
      94 DATA  kcon+0x048(SB)/8, $0x3956c25b3956c25b
      95 DATA  kcon+0x050(SB)/8, $0x59f111f159f111f1
      96 DATA  kcon+0x058(SB)/8, $0x59f111f159f111f1
      97 DATA  kcon+0x060(SB)/8, $0x923f82a4923f82a4
      98 DATA  kcon+0x068(SB)/8, $0x923f82a4923f82a4
      99 DATA  kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
     100 DATA  kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
     101 DATA  kcon+0x080(SB)/8, $0xd807aa98d807aa98
     102 DATA  kcon+0x088(SB)/8, $0xd807aa98d807aa98
     103 DATA  kcon+0x090(SB)/8, $0x12835b0112835b01
     104 DATA  kcon+0x098(SB)/8, $0x12835b0112835b01
     105 DATA  kcon+0x0A0(SB)/8, $0x243185be243185be
     106 DATA  kcon+0x0A8(SB)/8, $0x243185be243185be
     107 DATA  kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
     108 DATA  kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
     109 DATA  kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
     110 DATA  kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
     111 DATA  kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
     112 DATA  kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
     113 DATA  kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
     114 DATA  kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
     115 DATA  kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
     116 DATA  kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
     117 DATA  kcon+0x100(SB)/8, $0xe49b69c1e49b69c1	// K[16]
     118 DATA  kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
     119 DATA  kcon+0x110(SB)/8, $0xefbe4786efbe4786
     120 DATA  kcon+0x118(SB)/8, $0xefbe4786efbe4786
     121 DATA  kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
     122 DATA  kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
     123 DATA  kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
     124 DATA  kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
     125 DATA  kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
     126 DATA  kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
     127 DATA  kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
     128 DATA  kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
     129 DATA  kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
     130 DATA  kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
     131 DATA  kcon+0x170(SB)/8, $0x76f988da76f988da
     132 DATA  kcon+0x178(SB)/8, $0x76f988da76f988da
     133 DATA  kcon+0x180(SB)/8, $0x983e5152983e5152
     134 DATA  kcon+0x188(SB)/8, $0x983e5152983e5152
     135 DATA  kcon+0x190(SB)/8, $0xa831c66da831c66d
     136 DATA  kcon+0x198(SB)/8, $0xa831c66da831c66d
     137 DATA  kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
     138 DATA  kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
     139 DATA  kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
     140 DATA  kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
     141 DATA  kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
     142 DATA  kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
     143 DATA  kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
     144 DATA  kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
     145 DATA  kcon+0x1E0(SB)/8, $0x06ca635106ca6351
     146 DATA  kcon+0x1E8(SB)/8, $0x06ca635106ca6351
     147 DATA  kcon+0x1F0(SB)/8, $0x1429296714292967
     148 DATA  kcon+0x1F8(SB)/8, $0x1429296714292967
     149 DATA  kcon+0x200(SB)/8, $0x27b70a8527b70a85	// K[32]
     150 DATA  kcon+0x208(SB)/8, $0x27b70a8527b70a85
     151 DATA  kcon+0x210(SB)/8, $0x2e1b21382e1b2138
     152 DATA  kcon+0x218(SB)/8, $0x2e1b21382e1b2138
     153 DATA  kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
     154 DATA  kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
     155 DATA  kcon+0x230(SB)/8, $0x53380d1353380d13
     156 DATA  kcon+0x238(SB)/8, $0x53380d1353380d13
     157 DATA  kcon+0x240(SB)/8, $0x650a7354650a7354
     158 DATA  kcon+0x248(SB)/8, $0x650a7354650a7354
     159 DATA  kcon+0x250(SB)/8, $0x766a0abb766a0abb
     160 DATA  kcon+0x258(SB)/8, $0x766a0abb766a0abb
     161 DATA  kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
     162 DATA  kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
     163 DATA  kcon+0x270(SB)/8, $0x92722c8592722c85
     164 DATA  kcon+0x278(SB)/8, $0x92722c8592722c85
     165 DATA  kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
     166 DATA  kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
     167 DATA  kcon+0x290(SB)/8, $0xa81a664ba81a664b
     168 DATA  kcon+0x298(SB)/8, $0xa81a664ba81a664b
     169 DATA  kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
     170 DATA  kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
     171 DATA  kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
     172 DATA  kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
     173 DATA  kcon+0x2C0(SB)/8, $0xd192e819d192e819
     174 DATA  kcon+0x2C8(SB)/8, $0xd192e819d192e819
     175 DATA  kcon+0x2D0(SB)/8, $0xd6990624d6990624
     176 DATA  kcon+0x2D8(SB)/8, $0xd6990624d6990624
     177 DATA  kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
     178 DATA  kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
     179 DATA  kcon+0x2F0(SB)/8, $0x106aa070106aa070
     180 DATA  kcon+0x2F8(SB)/8, $0x106aa070106aa070
     181 DATA  kcon+0x300(SB)/8, $0x19a4c11619a4c116	// K[48]
     182 DATA  kcon+0x308(SB)/8, $0x19a4c11619a4c116
     183 DATA  kcon+0x310(SB)/8, $0x1e376c081e376c08
     184 DATA  kcon+0x318(SB)/8, $0x1e376c081e376c08
     185 DATA  kcon+0x320(SB)/8, $0x2748774c2748774c
     186 DATA  kcon+0x328(SB)/8, $0x2748774c2748774c
     187 DATA  kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
     188 DATA  kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
     189 DATA  kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
     190 DATA  kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
     191 DATA  kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
     192 DATA  kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
     193 DATA  kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
     194 DATA  kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
     195 DATA  kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
     196 DATA  kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
     197 DATA  kcon+0x380(SB)/8, $0x748f82ee748f82ee
     198 DATA  kcon+0x388(SB)/8, $0x748f82ee748f82ee
     199 DATA  kcon+0x390(SB)/8, $0x78a5636f78a5636f
     200 DATA  kcon+0x398(SB)/8, $0x78a5636f78a5636f
     201 DATA  kcon+0x3A0(SB)/8, $0x84c8781484c87814
     202 DATA  kcon+0x3A8(SB)/8, $0x84c8781484c87814
     203 DATA  kcon+0x3B0(SB)/8, $0x8cc702088cc70208
     204 DATA  kcon+0x3B8(SB)/8, $0x8cc702088cc70208
     205 DATA  kcon+0x3C0(SB)/8, $0x90befffa90befffa
     206 DATA  kcon+0x3C8(SB)/8, $0x90befffa90befffa
     207 DATA  kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
     208 DATA  kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
     209 DATA  kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
     210 DATA  kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
     211 DATA  kcon+0x3F0(SB)/8, $0xc67178f2c67178f2	// K[63]
     212 DATA  kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
     213 DATA  kcon+0x400(SB)/8, $0x0000000000000000	// all-zero entry: the last round's look-ahead add of KI reads this slot
     214 DATA  kcon+0x408(SB)/8, $0x0000000000000000
     215 DATA  kcon+0x410(SB)/8, $0x1011121310111213	// permutation control vectors (three of them; used by block after the last round to repack a-d and e-h)
     216 DATA  kcon+0x418(SB)/8, $0x1011121300010203
     217 DATA  kcon+0x420(SB)/8, $0x1011121310111213
     218 DATA  kcon+0x428(SB)/8, $0x0405060700010203
     219 DATA  kcon+0x430(SB)/8, $0x1011121308090a0b
     220 DATA  kcon+0x438(SB)/8, $0x0405060700010203
     221 GLOBL kcon(SB), RODATA, $1088	// 1088 = 0x440 bytes: 64 constants + zero entry + 3 permute vectors, 16 bytes each
    222 
     223 #define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \ // one SHA-256 round using message word xi; no schedule update. On entry h already includes this round's K; on exit d holds the new e and h holds the new a
     224 	VSEL		g, f, e, FUNC; \ // FUNC = Ch(e,f,g): take f where e is 1, g where e is 0
     225 	VSHASIGMAW	$15, e, $1, S1; \ // S1 = BIGSIGMA1(e)
     226 	VADDUWM		xi, h, h; \ // h += Wt
     227 	VSHASIGMAW	$0, a, $1, S0; \ // S0 = BIGSIGMA0(a)
     228 	VADDUWM		FUNC, h, h; \ // h += Ch(e,f,g)
     229 	VXOR		b, a, FUNC; \ // FUNC = a ^ b, selector for Maj
     230 	VADDUWM		S1, h, h; \ // h += BIGSIGMA1(e); h is now T1
     231 	VSEL		b, c, FUNC, FUNC; \ // FUNC = Maj(a,b,c): c where a and b differ, else b
     232 	VADDUWM		KI, g, g; \ // pre-add the next round's K into g, which is the next round's h
     233 	VADDUWM		h, d, d; \ // d += T1 (becomes the new e)
     234 	VADDUWM		FUNC, S0, S0; \ // S0 = BIGSIGMA0(a) + Maj(a,b,c) = T2
     235 	LVX		(TBL)(IDX), KI; \ // fetch the K needed by the following round's pre-add
     236 	ADD		$16, IDX; \ // advance to the next 16-byte kcon entry
     237 	VADDUWM		S0, h, h	// h = T1 + T2 (becomes the new a)
    238 
     239 #define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \ // one SHA-256 round using xi, interleaved with a schedule update: xj += sigma0(xj_1) + xj_9 + sigma1(xj_14)
     240 	VSHASIGMAW	$0, xj_1, $0, s0; \ // s0 = schedule sigma0 of xj_1 (the Wt-15 term)
     241 	VSEL		g, f, e, FUNC; \ // FUNC = Ch(e,f,g)
     242 	VSHASIGMAW	$15, e, $1, S1; \ // S1 = BIGSIGMA1(e)
     243 	VADDUWM		xi, h, h; \ // h += Wt (h already includes this round's K)
     244 	VSHASIGMAW	$0, a, $1, S0; \ // S0 = BIGSIGMA0(a)
     245 	VSHASIGMAW	$15, xj_14, $0, s1; \ // s1 = schedule sigma1 of xj_14 (the Wt-2 term)
     246 	VADDUWM		FUNC, h, h; \ // h += Ch(e,f,g)
     247 	VXOR		b, a, FUNC; \ // FUNC = a ^ b, selector for Maj
     248 	VADDUWM		xj_9, xj, xj; \ // xj += xj_9 (the Wt-7 term; xj itself is Wt-16)
     249 	VADDUWM		S1, h, h; \ // h += BIGSIGMA1(e); h is now T1
     250 	VSEL		b, c, FUNC, FUNC; \ // FUNC = Maj(a,b,c)
     251 	VADDUWM		KI, g, g; \ // pre-add the next round's K into g, which is the next round's h
     252 	VADDUWM		h, d, d; \ // d += T1 (becomes the new e)
     253 	VADDUWM		FUNC, S0, S0; \ // S0 = T2
     254 	VADDUWM		s0, xj, xj; \ // xj += sigma0 term
     255 	LVX		(TBL)(IDX), KI; \ // fetch the K needed by the following round's pre-add
     256 	ADD		$16, IDX; \ // advance to the next 16-byte kcon entry
     257 	VADDUWM		S0, h, h; \ // h = T1 + T2 (becomes the new a)
     258 	VADDUWM		s1, xj, xj	// xj += sigma1 term; xj now holds the schedule word 16 positions later
    259 
     260 // func block(dig *digest, p []byte)
     261 TEXT block(SB),0,$128-32	// 128-byte frame, 32 bytes of arguments. NOTE(review): Go assembly symbols are normally written with a leading middle dot (·block, ·kcon); it appears to be missing here, possibly lost in extraction — confirm
     262 	MOVD	dig+0(FP), CTX	// CTX = dig
     263 	MOVD	p_base+8(FP), INP	// INP = address of p[0]
     264 	MOVD	p_len+16(FP), LEN	// LEN = len(p)
     265 
     266 	SRD	$6, LEN	// shift right then left by 6:
     267 	SLD	$6, LEN	// LEN rounded down to a multiple of 64
     268 
     269 	ADD	INP, LEN, END	// END = INP + LEN
     270 
     271 	CMP	INP, END
     272 	BEQ	end	// no whole block to process
     273 
     274 	MOVD	$kcon(SB), TBL	// TBL = address of the constant table
     275 	MOVD	R1, OFFLOAD	// a-h are saved at OFFLOAD+0x00..0x7F. NOTE(review): confirm this range lies inside this function's own frame and meets the alignment STVX/LVX expect
     276 
     277 	MOVD	R0, CNT	// CNT is not read again in the visible code
     278 	MOVWZ	$0x10, HEX10	// constant index registers 0x10..0x70
     279 	MOVWZ	$0x20, HEX20
     280 	MOVWZ	$0x30, HEX30
     281 	MOVWZ	$0x40, HEX40
     282 	MOVWZ	$0x50, HEX50
     283 	MOVWZ	$0x60, HEX60
     284 	MOVWZ	$0x70, HEX70
     285 
     286 	MOVWZ	$8, IDX	// build LEMASK: LVSL pattern for offset 8 ...
     287 	LVSL	(IDX)(R0), LEMASK
     288 	VSPLTISB	$0x0F, KI	// ... with every byte XORed with 0x0F (KI is only scratch here)
     289 	VXOR	KI, LEMASK, LEMASK
     290 
     291 	LXVW4X	(CTX)(HEX00), VS32	// v0 = vs32
     292 	LXVW4X	(CTX)(HEX10), VS36	// v4 = vs36
     293 
     294 	// unpack the input values into vector registers: V1-V3 are V0 rotated by 1-3 words, V5-V7 likewise from V4
     295 	VSLDOI	$4, V0, V0, V1
     296 	VSLDOI	$8, V0, V0, V2
     297 	VSLDOI	$12, V0, V0, V3
     298 	VSLDOI	$4, V4, V4, V5
     299 	VSLDOI	$8, V4, V4, V6
     300 	VSLDOI	$12, V4, V4, V7
     301 
     302 loop:	// one iteration per 64-byte block
     303 	LVX	(TBL)(HEX00), KI	// KI = first kcon entry (K[0])
     304 	MOVWZ	$16, IDX	// IDX = offset of K[1]
     305 
     306 	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
     307 	ADD	$16, INP
     308 
     309 	STVX	V0, (OFFLOAD+HEX00)	// save a-h; they are added back after the 64 rounds
     310 	STVX	V1, (OFFLOAD+HEX10)
     311 	STVX	V2, (OFFLOAD+HEX20)
     312 	STVX	V3, (OFFLOAD+HEX30)
     313 	STVX	V4, (OFFLOAD+HEX40)
     314 	STVX	V5, (OFFLOAD+HEX50)
     315 	STVX	V6, (OFFLOAD+HEX60)
     316 	STVX	V7, (OFFLOAD+HEX70)
     317 
     318 	VADDUWM	KI, V7, V7	// h+K[i]
     319 	LVX	(TBL)(IDX), KI	// KI = K[1], consumed by the pre-add inside round 0
     320 	ADD	$16, IDX
     321 
     322 	VPERM	V8, V8, LEMASK, V8	// reorder the bytes of the first 16 input bytes using LEMASK
     323 	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)	// round 0; a-h rotate through the argument order on each call
     324 	VSLDOI	$4, V8, V8, V9	// each following schedule register is the previous one rotated by one word
     325 	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)	// round 1
     326 	VSLDOI	$4, V9, V9, V10
     327 	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)	// round 2
     328 	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
     329 	ADD	$16, INP, INP
     330 	VSLDOI	$4, V10, V10, V11
     331 	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)	// round 3
     332 	VPERM	V12, V12, LEMASK, V12
     333 	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)	// round 4
     334 	VSLDOI	$4, V12, V12, V13
     335 	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)	// round 5
     336 	VSLDOI	$4, V13, V13, V14
     337 	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)	// round 6
     338 	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
     339 	ADD	$16, INP, INP
     340 	VSLDOI	$4, V14, V14, V15
     341 	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)	// round 7
     342 	VPERM	V16, V16, LEMASK, V16
     343 	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)	// round 8
     344 	VSLDOI	$4, V16, V16, V17
     345 	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)	// round 9
     346 	VSLDOI	$4, V17, V17, V18
     347 	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)	// round 10
     348 	VSLDOI	$4, V18, V18, V19
     349 	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
     350 	ADD	$16, INP, INP	// INP has now advanced 64 bytes, to the next block
     351 	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)	// round 11
     352 	VPERM	V20, V20, LEMASK, V20
     353 	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)	// round 12
     354 	VSLDOI	$4, V20, V20, V21
     355 	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)	// round 13
     356 	VSLDOI	$4, V21, V21, V22
     357 	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)	// round 14
     358 	VSLDOI	$4, V22, V22, V23
     359 	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)	// round 15; also turns V8 into the schedule word for round 16
     360 
     361 	MOVWZ	$3, TEMP	// CTR = 3: three passes of 16 rounds cover rounds 16-63
     362 	MOVWZ	TEMP, CTR
     363 
     364 L16_xx:	// 16 rounds per pass; each round consumes one schedule word and refreshes the next
     365 	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
     366 	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
     367 	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
     368 	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
     369 	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
     370 	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
     371 	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
     372 	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
     373 	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
     374 	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
     375 	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
     376 	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
     377 	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
     378 	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
     379 	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
     380 	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
     381 
     382 	BC	0x10, 0, L16_xx		// bdnz: decrement CTR and loop while it is non-zero
     383 
     384 	LVX	(OFFLOAD)(HEX00), V10	// reload the saved a-h (V10-V17 are free again) ...
     385 
     386 	LVX	(OFFLOAD)(HEX10), V11
     387 	VADDUWM	V10, V0, V0	// ... and add them to the round output: H[i] += working variable
     388 	LVX	(OFFLOAD)(HEX20), V12
     389 	VADDUWM	V11, V1, V1
     390 	LVX	(OFFLOAD)(HEX30), V13
     391 	VADDUWM	V12, V2, V2
     392 	LVX	(OFFLOAD)(HEX40), V14
     393 	VADDUWM	V13, V3, V3
     394 	LVX	(OFFLOAD)(HEX50), V15
     395 	VADDUWM	V14, V4, V4
     396 	LVX	(OFFLOAD)(HEX60), V16
     397 	VADDUWM	V15, V5, V5
     398 	LVX	(OFFLOAD)(HEX70), V17
     399 	VADDUWM	V16, V6, V6
     400 	VADDUWM	V17, V7, V7
     401 
     402 	CMPU	INP, END	// unsigned compare
     403 	BLT	loop	// more whole blocks remain
     404 
     405 	LVX	(TBL)(IDX), V8	// IDX is 0x420 here: second permute vector. KI already holds the first (0x410), loaded by the last round
     406 	ADD	$16, IDX
     407 	VPERM	V0, V1, KI, V0	// merge V0-V3 back into V0 and V4-V7 back into V4, one source register per step
     408 	LVX	(TBL)(IDX), V9	// third permute vector (0x430)
     409 	VPERM	V4, V5, KI, V4
     410 	VPERM	V0, V2, V8, V0
     411 	VPERM	V4, V6, V8, V4
     412 	VPERM	V0, V3, V9, V0
     413 	VPERM	V4, V7, V9, V4
     414 	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
     415 	STXVD2X	VS36, (CTX+HEX10)	// v4 = vs36
     416 
     417 end:
     418 	RET
    419 
    420