Home | History | Annotate | Download | only in sha512
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Based on CRYPTOGAMS code with the following comment:
      6 // # ====================================================================
      7 // # Written by Andy Polyakov <appro (at) openssl.org> for the OpenSSL
      8 // # project. The module is, however, dual licensed under OpenSSL and
      9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
     10 // # details see http://www.openssl.org/~appro/cryptogams/.
     11 // # ====================================================================
     12 
     13 #include "textflag.h"
     14 
     15 // SHA512 block routine. See sha512block.go for Go equivalent.
     16 //
     17 // The algorithm is detailed in FIPS 180-4:
     18 //
     19 //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
     20 //
     21 // Wt = Mt; for 0 <= t <= 15
     22 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
     23 //
     24 // a = H0
     25 // b = H1
     26 // c = H2
     27 // d = H3
     28 // e = H4
     29 // f = H5
     30 // g = H6
     31 // h = H7
     32 //
     33 // for t = 0 to 79 {
     34 //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
     35 //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
     36 //    h = g
     37 //    g = f
     38 //    f = e
     39 //    e = d + T1
     40 //    d = c
     41 //    c = b
     42 //    b = a
     43 //    a = T1 + T2
     44 // }
     45 //
     46 // H0 = a + H0
     47 // H1 = b + H1
     48 // H2 = c + H2
     49 // H3 = d + H3
     50 // H4 = e + H4
     51 // H5 = f + H5
     52 // H6 = g + H6
     53 // H7 = h + H7
     54 
     55 #define CTX	R3
     56 #define INP	R4
     57 #define END	R5
     58 #define TBL	R6
     59 #define IDX	R7
     60 #define CNT	R8
     61 #define LEN	R9
     62 #define OFFLOAD	R11
     63 #define TEMP	R12
     64 
     65 #define HEX00	R0
     66 #define HEX10	R10
     67 #define HEX20	R25
     68 #define HEX30	R26
     69 #define HEX40	R27
     70 #define HEX50	R28
     71 #define HEX60	R29
     72 #define HEX70	R31
     73 
     74 // V0-V7 are A-H
     75 // V8-V23 are used for the message schedule
     76 #define KI	V24
     77 #define FUNC	V25
     78 #define S0	V26
     79 #define S1	V27
     80 #define s0	V28
     81 #define s1	V29
     82 #define LEMASK	V31	// Permutation control register for little endian
     83 
     84 // 2 copies of each Kt, to fill both doublewords of a vector register
     85 DATA  kcon+0x000(SB)/8, $0x428a2f98d728ae22
     86 DATA  kcon+0x008(SB)/8, $0x428a2f98d728ae22
     87 DATA  kcon+0x010(SB)/8, $0x7137449123ef65cd
     88 DATA  kcon+0x018(SB)/8, $0x7137449123ef65cd
     89 DATA  kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
     90 DATA  kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
     91 DATA  kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
     92 DATA  kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
     93 DATA  kcon+0x040(SB)/8, $0x3956c25bf348b538
     94 DATA  kcon+0x048(SB)/8, $0x3956c25bf348b538
     95 DATA  kcon+0x050(SB)/8, $0x59f111f1b605d019
     96 DATA  kcon+0x058(SB)/8, $0x59f111f1b605d019
     97 DATA  kcon+0x060(SB)/8, $0x923f82a4af194f9b
     98 DATA  kcon+0x068(SB)/8, $0x923f82a4af194f9b
     99 DATA  kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
    100 DATA  kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
    101 DATA  kcon+0x080(SB)/8, $0xd807aa98a3030242
    102 DATA  kcon+0x088(SB)/8, $0xd807aa98a3030242
    103 DATA  kcon+0x090(SB)/8, $0x12835b0145706fbe
    104 DATA  kcon+0x098(SB)/8, $0x12835b0145706fbe
    105 DATA  kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
    106 DATA  kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
    107 DATA  kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
    108 DATA  kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
    109 DATA  kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
    110 DATA  kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
    111 DATA  kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
    112 DATA  kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
    113 DATA  kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
    114 DATA  kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
    115 DATA  kcon+0x0F0(SB)/8, $0xc19bf174cf692694
    116 DATA  kcon+0x0F8(SB)/8, $0xc19bf174cf692694
    117 DATA  kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
    118 DATA  kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
    119 DATA  kcon+0x110(SB)/8, $0xefbe4786384f25e3
    120 DATA  kcon+0x118(SB)/8, $0xefbe4786384f25e3
    121 DATA  kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
    122 DATA  kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
    123 DATA  kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
    124 DATA  kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
    125 DATA  kcon+0x140(SB)/8, $0x2de92c6f592b0275
    126 DATA  kcon+0x148(SB)/8, $0x2de92c6f592b0275
    127 DATA  kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
    128 DATA  kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
    129 DATA  kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
    130 DATA  kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
    131 DATA  kcon+0x170(SB)/8, $0x76f988da831153b5
    132 DATA  kcon+0x178(SB)/8, $0x76f988da831153b5
    133 DATA  kcon+0x180(SB)/8, $0x983e5152ee66dfab
    134 DATA  kcon+0x188(SB)/8, $0x983e5152ee66dfab
    135 DATA  kcon+0x190(SB)/8, $0xa831c66d2db43210
    136 DATA  kcon+0x198(SB)/8, $0xa831c66d2db43210
    137 DATA  kcon+0x1A0(SB)/8, $0xb00327c898fb213f
    138 DATA  kcon+0x1A8(SB)/8, $0xb00327c898fb213f
    139 DATA  kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
    140 DATA  kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
    141 DATA  kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
    142 DATA  kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
    143 DATA  kcon+0x1D0(SB)/8, $0xd5a79147930aa725
    144 DATA  kcon+0x1D8(SB)/8, $0xd5a79147930aa725
    145 DATA  kcon+0x1E0(SB)/8, $0x06ca6351e003826f
    146 DATA  kcon+0x1E8(SB)/8, $0x06ca6351e003826f
    147 DATA  kcon+0x1F0(SB)/8, $0x142929670a0e6e70
    148 DATA  kcon+0x1F8(SB)/8, $0x142929670a0e6e70
    149 DATA  kcon+0x200(SB)/8, $0x27b70a8546d22ffc
    150 DATA  kcon+0x208(SB)/8, $0x27b70a8546d22ffc
    151 DATA  kcon+0x210(SB)/8, $0x2e1b21385c26c926
    152 DATA  kcon+0x218(SB)/8, $0x2e1b21385c26c926
    153 DATA  kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
    154 DATA  kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
    155 DATA  kcon+0x230(SB)/8, $0x53380d139d95b3df
    156 DATA  kcon+0x238(SB)/8, $0x53380d139d95b3df
    157 DATA  kcon+0x240(SB)/8, $0x650a73548baf63de
    158 DATA  kcon+0x248(SB)/8, $0x650a73548baf63de
    159 DATA  kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
    160 DATA  kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
    161 DATA  kcon+0x260(SB)/8, $0x81c2c92e47edaee6
    162 DATA  kcon+0x268(SB)/8, $0x81c2c92e47edaee6
    163 DATA  kcon+0x270(SB)/8, $0x92722c851482353b
    164 DATA  kcon+0x278(SB)/8, $0x92722c851482353b
    165 DATA  kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
    166 DATA  kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
    167 DATA  kcon+0x290(SB)/8, $0xa81a664bbc423001
    168 DATA  kcon+0x298(SB)/8, $0xa81a664bbc423001
    169 DATA  kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
    170 DATA  kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
    171 DATA  kcon+0x2B0(SB)/8, $0xc76c51a30654be30
    172 DATA  kcon+0x2B8(SB)/8, $0xc76c51a30654be30
    173 DATA  kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
    174 DATA  kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
    175 DATA  kcon+0x2D0(SB)/8, $0xd69906245565a910
    176 DATA  kcon+0x2D8(SB)/8, $0xd69906245565a910
    177 DATA  kcon+0x2E0(SB)/8, $0xf40e35855771202a
    178 DATA  kcon+0x2E8(SB)/8, $0xf40e35855771202a
    179 DATA  kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
    180 DATA  kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
    181 DATA  kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
    182 DATA  kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
    183 DATA  kcon+0x310(SB)/8, $0x1e376c085141ab53
    184 DATA  kcon+0x318(SB)/8, $0x1e376c085141ab53
    185 DATA  kcon+0x320(SB)/8, $0x2748774cdf8eeb99
    186 DATA  kcon+0x328(SB)/8, $0x2748774cdf8eeb99
    187 DATA  kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
    188 DATA  kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
    189 DATA  kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
    190 DATA  kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
    191 DATA  kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
    192 DATA  kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
    193 DATA  kcon+0x360(SB)/8, $0x5b9cca4f7763e373
    194 DATA  kcon+0x368(SB)/8, $0x5b9cca4f7763e373
    195 DATA  kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
    196 DATA  kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
    197 DATA  kcon+0x380(SB)/8, $0x748f82ee5defb2fc
    198 DATA  kcon+0x388(SB)/8, $0x748f82ee5defb2fc
    199 DATA  kcon+0x390(SB)/8, $0x78a5636f43172f60
    200 DATA  kcon+0x398(SB)/8, $0x78a5636f43172f60
    201 DATA  kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
    202 DATA  kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
    203 DATA  kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
    204 DATA  kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
    205 DATA  kcon+0x3C0(SB)/8, $0x90befffa23631e28
    206 DATA  kcon+0x3C8(SB)/8, $0x90befffa23631e28
    207 DATA  kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
    208 DATA  kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
    209 DATA  kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
    210 DATA  kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
    211 DATA  kcon+0x3F0(SB)/8, $0xc67178f2e372532b
    212 DATA  kcon+0x3F8(SB)/8, $0xc67178f2e372532b
    213 DATA  kcon+0x400(SB)/8, $0xca273eceea26619c
    214 DATA  kcon+0x408(SB)/8, $0xca273eceea26619c
    215 DATA  kcon+0x410(SB)/8, $0xd186b8c721c0c207
    216 DATA  kcon+0x418(SB)/8, $0xd186b8c721c0c207
    217 DATA  kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
    218 DATA  kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
    219 DATA  kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
    220 DATA  kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
    221 DATA  kcon+0x440(SB)/8, $0x06f067aa72176fba
    222 DATA  kcon+0x448(SB)/8, $0x06f067aa72176fba
    223 DATA  kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
    224 DATA  kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
    225 DATA  kcon+0x460(SB)/8, $0x113f9804bef90dae
    226 DATA  kcon+0x468(SB)/8, $0x113f9804bef90dae
    227 DATA  kcon+0x470(SB)/8, $0x1b710b35131c471b
    228 DATA  kcon+0x478(SB)/8, $0x1b710b35131c471b
    229 DATA  kcon+0x480(SB)/8, $0x28db77f523047d84
    230 DATA  kcon+0x488(SB)/8, $0x28db77f523047d84
    231 DATA  kcon+0x490(SB)/8, $0x32caab7b40c72493
    232 DATA  kcon+0x498(SB)/8, $0x32caab7b40c72493
    233 DATA  kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
    234 DATA  kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
    235 DATA  kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
    236 DATA  kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
    237 DATA  kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
    238 DATA  kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
    239 DATA  kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
    240 DATA  kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
    241 DATA  kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
    242 DATA  kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
    243 DATA  kcon+0x4F0(SB)/8, $0x6c44198c4a475817
    244 DATA  kcon+0x4F8(SB)/8, $0x6c44198c4a475817
    245 DATA  kcon+0x500(SB)/8, $0x0000000000000000
    246 DATA  kcon+0x508(SB)/8, $0x0000000000000000
    247 DATA  kcon+0x510(SB)/8, $0x1011121314151617
    248 DATA  kcon+0x518(SB)/8, $0x0001020304050607
    249 GLOBL kcon(SB), RODATA, $1312
    250 
    251 #define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
    252 	VSEL		g, f, e, FUNC; \
    253 	VSHASIGMAD	$15, e, $1, S1; \
    254 	VADDUDM		xi, h, h; \
    255 	VSHASIGMAD	$0, a, $1, S0; \
    256 	VADDUDM		FUNC, h, h; \
    257 	VXOR		b, a, FUNC; \
    258 	VADDUDM		S1, h, h; \
    259 	VSEL		b, c, FUNC, FUNC; \
    260 	VADDUDM		KI, g, g; \
    261 	VADDUDM		h, d, d; \
    262 	VADDUDM		FUNC, S0, S0; \
    263 	LVX		(TBL)(IDX), KI; \
    264 	ADD		$16, IDX; \
    265 	VADDUDM		S0, h, h
    266 
    267 #define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
    268 	VSHASIGMAD	$0, xj_1, $0, s0; \
    269 	VSEL		g, f, e, FUNC; \
    270 	VSHASIGMAD	$15, e, $1, S1; \
    271 	VADDUDM		xi, h, h; \
    272 	VSHASIGMAD	$0, a, $1, S0; \
    273 	VSHASIGMAD	$15, xj_14, $0, s1; \
    274 	VADDUDM		FUNC, h, h; \
    275 	VXOR		b, a, FUNC; \
    276 	VADDUDM		xj_9, xj, xj; \
    277 	VADDUDM		S1, h, h; \
    278 	VSEL		b, c, FUNC, FUNC; \
    279 	VADDUDM		KI, g, g; \
    280 	VADDUDM		h, d, d; \
    281 	VADDUDM		FUNC, S0, S0; \
    282 	VADDUDM		s0, xj, xj; \
    283 	LVX		(TBL)(IDX), KI; \
    284 	ADD		$16, IDX; \
    285 	VADDUDM		S0, h, h; \
    286 	VADDUDM		s1, xj, xj
    287 
    288 // func block(dig *digest, p []byte)
    289 TEXT block(SB),0,$128-32
    290 	MOVD	dig+0(FP), CTX
    291 	MOVD	p_base+8(FP), INP
    292 	MOVD	p_len+16(FP), LEN
    293 
    294 	SRD	$6, LEN
    295 	SLD	$6, LEN
    296 
    297 	ADD	INP, LEN, END
    298 
    299 	CMP	INP, END
    300 	BEQ	end
    301 
    302 	MOVD	$kcon(SB), TBL
    303 	MOVD	R1, OFFLOAD
    304 
    305 	MOVD	R0, CNT
    306 	MOVWZ	$0x10, HEX10
    307 	MOVWZ	$0x20, HEX20
    308 	MOVWZ	$0x30, HEX30
    309 	MOVWZ	$0x40, HEX40
    310 	MOVWZ	$0x50, HEX50
    311 	MOVWZ	$0x60, HEX60
    312 	MOVWZ	$0x70, HEX70
    313 
    314 	MOVWZ	$8, IDX
    315 	LVSL	(IDX)(R0), LEMASK
    316 	VSPLTISB	$0x0F, KI
    317 	VXOR	KI, LEMASK, LEMASK
    318 
    319 	LXVD2X	(CTX)(HEX00), VS32	// v0 = vs32
    320 	LXVD2X	(CTX)(HEX10), VS34	// v2 = vs34
    321 	LXVD2X	(CTX)(HEX20), VS36	// v4 = vs36
    322 	// unpack the input values into vector registers
    323 	VSLDOI	$8, V0, V0, V1
    324 	LXVD2X	(CTX)(HEX30), VS38	// v6 = vs38
    325 	VSLDOI	$8, V2, V2, V3
    326 	VSLDOI	$8, V4, V4, V5
    327 	VSLDOI	$8, V6, V6, V7
    328 
    329 loop:
    330 	LVX	(TBL)(HEX00), KI
    331 	MOVWZ	$16, IDX
    332 
    333 	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
    334 	ADD	$16, INP
    335 
    336 	STVX	V0, (OFFLOAD+HEX00)
    337 	STVX	V1, (OFFLOAD+HEX10)
    338 	STVX	V2, (OFFLOAD+HEX20)
    339 	STVX	V3, (OFFLOAD+HEX30)
    340 	STVX	V4, (OFFLOAD+HEX40)
    341 	STVX	V5, (OFFLOAD+HEX50)
    342 	STVX	V6, (OFFLOAD+HEX60)
    343 	STVX	V7, (OFFLOAD+HEX70)
    344 
    345 	VADDUDM	KI, V7, V7	// h+K[i]
    346 	LVX	(TBL)(IDX), KI
    347 	ADD	$16, IDX
    348 
    349 	VPERM	V8, V8, LEMASK, V8
    350 	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
    351 	LXVD2X	(INP)(R0), VS42	// load v10 (=vs42) in advance
    352 	ADD	$16, INP, INP
    353 	VSLDOI	$8, V8, V8, V9
    354 	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
    355 	VPERM	V10, V10, LEMASK, V10
    356 	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
    357 	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
    358 	ADD	$16, INP, INP
    359 	VSLDOI	$8, V10, V10, V11
    360 	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
    361 	VPERM	V12, V12, LEMASK, V12
    362 	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
    363 	LXVD2X	(INP)(R0), VS46	// load v14 (=vs46) in advance
    364 	ADD	$16, INP, INP
    365 	VSLDOI	$8, V12, V12, V13
    366 	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
    367 	VPERM	V14, V14, LEMASK, V14
    368 	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
    369 	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
    370 	ADD	$16, INP, INP
    371 	VSLDOI	$8, V14, V14, V15
    372 	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
    373 	VPERM	V16, V16, LEMASK, V16
    374 	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
    375 	LXVD2X	(INP)(R0), VS50	// load v18 (=vs50) in advance
    376 	ADD	$16, INP, INP
    377 	VSLDOI	$8, V16, V16, V17
    378 	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
    379 	VPERM	V18, V18, LEMASK, V18
    380 	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
    381 	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
    382 	ADD	$16, INP, INP
    383 	VSLDOI	$8, V18, V18, V19
    384 	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
    385 	VPERM	V20, V20, LEMASK, V20
    386 	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
    387 	LXVD2X	(INP)(R0), VS54	// load v22 (=vs54) in advance
    388 	ADD	$16, INP, INP
    389 	VSLDOI	$8, V20, V20, V21
    390 	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
    391 	VPERM	V22, V22, LEMASK, V22
    392 	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
    393 	VSLDOI	$8, V22, V22, V23
    394 	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
    395 
    396 	MOVWZ	$4, TEMP
    397 	MOVWZ	TEMP, CTR
    398 
    399 L16_xx:
    400 	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
    401 	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
    402 	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
    403 	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
    404 	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
    405 	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
    406 	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
    407 	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
    408 	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
    409 	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
    410 	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
    411 	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
    412 	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
    413 	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
    414 	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
    415 	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
    416 
    417 	BC	0x10, 0, L16_xx		// bdnz
    418 
    419 	LVX	(OFFLOAD)(HEX00), V10
    420 
    421 	LVX	(OFFLOAD)(HEX10), V11
    422 	VADDUDM	V10, V0, V0
    423 	LVX	(OFFLOAD)(HEX20), V12
    424 	VADDUDM	V11, V1, V1
    425 	LVX	(OFFLOAD)(HEX30), V13
    426 	VADDUDM	V12, V2, V2
    427 	LVX	(OFFLOAD)(HEX40), V14
    428 	VADDUDM	V13, V3, V3
    429 	LVX	(OFFLOAD)(HEX50), V15
    430 	VADDUDM	V14, V4, V4
    431 	LVX	(OFFLOAD)(HEX60), V16
    432 	VADDUDM	V15, V5, V5
    433 	LVX	(OFFLOAD)(HEX70), V17
    434 	VADDUDM	V16, V6, V6
    435 	VADDUDM	V17, V7, V7
    436 
    437 	CMPU	INP, END
    438 	BLT	loop
    439 
    440 	VPERM	V0, V1, KI, V0
    441 	VPERM	V2, V3, KI, V2
    442 	VPERM	V4, V5, KI, V4
    443 	VPERM	V6, V7, KI, V6
    444 	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
    445 	STXVD2X	VS34, (CTX+HEX10)	// v2 = vs34
    446 	STXVD2X	VS36, (CTX+HEX20)	// v4 = vs36
    447 	STXVD2X	VS38, (CTX+HEX30)	// v6 = vs38
    448 
    449 end:
    450 	RET
    451 
    452