Home | History | Annotate | Download | only in big
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build !math_big_pure_go,s390x
      6 
      7 #include "textflag.h"
      8 
      9 // This file provides fast assembly versions for the elementary
     10 // arithmetic operations on vectors implemented in arith.go.
     11 
     12 TEXT hasVectorFacility(SB),NOSPLIT,$24-1
        	// Writes 1 to the one-byte result when vector instructions can be used
        	// and 0 otherwise. Two checks are made:
        	//   1. STFLE stores the facility list into the 24-byte frame and mask
        	//      0x40 of the byte at offset 16 (list bit 129) is tested.
        	//   2. A vector register is written and read back as a probe.
        	// NOTE(review): how the probe behaves when the facility is installed but
        	// not enabled is not visible from this file — confirm before relying on it.
     13         MOVD    $x-24(SP), R1     // R1 = start of the 24-byte STFLE buffer
     14         XC      $24, 0(R1), 0(R1) // clear the storage
     15         MOVD    $2, R0            // R0 is the number of double words stored -1
     16         WORD    $0xB2B01000       // STFLE 0(R1)
     17         XOR     R0, R0            // reset the value of R0
     18         MOVBZ   z-8(SP), R1       // byte at offset 16 of the buffer (list bits 128-135)
     19         AND     $0x40, R1         // isolate list bit 129
     20         BEQ     novector          // bit clear: no vector facility
     21 vectorinstalled:
     22         // check if the vector instruction has been enabled
     23         VLEIB   $0, $0xF, V16     // put the immediate 0xF into a byte element of V16
     24         VLGVB   $0, V16, R1       // read a byte element of V16 back into R1
     25         CMPBNE  R1, $0xF, novector
     26         MOVB    $1, ret+0(FP) // have vx
     27         RET
     28 novector:
     29         MOVB    $0, ret+0(FP) // no vx
     30         RET
     31 
     32 TEXT mulWW(SB),NOSPLIT,$0
        	// Multiplies x by y and stores the two result words z1 and z0.
        	// NOTE(review): R10 and R11 are never written explicitly in this function.
        	// The stores below presumably rely on the assembler expanding MULHDU
        	// through R10/R11, leaving the high word of the product in R10 and the
        	// low word in R11 — confirm against the s390x assembler back end.
     33 	MOVD	x+0(FP), R3
     34 	MOVD	y+8(FP), R4
     35 	MULHDU	R3, R4
     36 	MOVD	R10, z1+16(FP)	// z1 taken from R10 (see note above)
     37 	MOVD	R11, z0+24(FP)	// z0 taken from R11 (see note above)
     38 	RET
     39 
     40 // func divWW(x1, x0, y Word) (q, r Word)
        //
        // Loads x1 into R10, x0 into R11 and y into R5, executes one hand-encoded
        // DLGR on R10 and R5, then stores R11 as q and R10 as r.
        // No check for y == 0 is made in this function.
     41 TEXT divWW(SB),NOSPLIT,$0
     42 	MOVD	x1+0(FP), R10
     43 	MOVD	x0+8(FP), R11
     44 	MOVD	y+16(FP), R5
     45 	WORD	$0xb98700a5 // dlgr r10,r5
     46 	MOVD	R11, q+24(FP)	// quotient is taken from R11
     47 	MOVD	R10, r+32(FP)	// remainder is taken from R10
     48 	RET
     49 
     50 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
     51 // func addVV(z, x, y []Word) (c Word)
     52 //
     53 // addVV branches through the addvectorfacility slot. The slot initially
     54 // holds addVV_check, which selects an implementation on the first call.
     55 TEXT addVV(SB),NOSPLIT,$0
     56 	MOVD	addvectorfacility+0x00(SB), R1
     57 	BR	(R1)
     58 
     59 // addVV_check records the implementation matching hasVX in the slot and
     60 // then branches to it, so later addVV calls go there directly.
     61 TEXT addVV_check(SB),NOSPLIT, $0
     62 	MOVD	$addvectorfacility+0x00(SB), R1	// R1 = &slot, used on both paths
     63 	MOVB	hasVX(SB), R2
     64 	CMPBNE	R2, $1, novx			// hasVX != 1: take the scalar version
     65 	MOVD	$addVV_vec(SB), R2
     66 	MOVD	R2, 0(R1)			// slot = addVV_vec
     67 	BR	addVV_vec(SB)
     68 novx:
     69 	MOVD	$addVV_novec(SB), R2
     70 	MOVD	R2, 0(R1)			// slot = addVV_novec
     71 	BR	addVV_novec(SB)
     72 
     73 GLOBL addvectorfacility+0x00(SB), NOPTR, $8
     74 DATA addvectorfacility+0x00(SB)/8, $addVV_check(SB)
     75 
     76 TEXT addVV_vec(SB),NOSPLIT,$0
        	// Vector version of addVV: z[i] = x[i] + y[i] + carry for the z_len words,
        	// returning the final carry in c.
        	//
        	// Three stages, each taking what the previous one left over:
        	//   UU1: 16 words per iteration with 128-bit vector add-with-carry
        	//   U1:   4 words per iteration with ADDC/ADDE
        	//   L1:   1 word per iteration
        	//
        	// Register use:
        	//   R3         = n, words remaining, kept biased by the pending SUB so
        	//                that its sign decides the next branch
        	//   R8, R9, R2 = base of x, y, z
        	//   R10        = i, byte offset of the next word
        	//   R4         = carry between scalar iterations, kept as 0 or -1
        	//   R5, R6, R7 = running x, y, z pointers in UU1; data registers later
        	//   R0         = 0
     77 	MOVD	z_len+8(FP), R3
     78 	MOVD	x+24(FP), R8
     79 	MOVD	y+48(FP), R9
     80 	MOVD	z+0(FP), R2
     81 
     82 	MOVD	$0, R4		// c = 0
     83 	MOVD	$0, R0		// make sure it's zero
     84 	MOVD	$0, R10		// i = 0
     85 
     86 
     87 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loops
     88 	SUB	$4, R3		// n -= 4
     89 	BLT	v1		// fewer than 4 words: single-word loop only
     90 	SUB     $12, R3                 // n -= 12 (n is now len - 16)
     91         BLT     A1                      // if n < 0 goto A1
     92 
     93 	MOVD	R8, R5		// R5 = running x pointer
     94 	MOVD	R9, R6		// R6 = running y pointer
     95 	MOVD	R2, R7		// R7 = running z pointer
     96 	// n >= 0
     97 	// regular loop body unrolled 16x
     98 	VZERO	V0			// c = 0
     99 UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
    100 	ADD	$64, R5
    101 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
    102 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
    103 
    104 
    105 	VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12
    106 	ADD	$64, R6
    107 	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
    108 	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
    109 
        	// Each pair below produces the carry-out (VACCCQ) and the sum (VACQ) of one
        	// 128-bit step. The carry chain runs V0 -> V25 -> V26 -> ... -> V31 -> V0.
    110 	VACCCQ	V1, V9, V0, V25
    111 	VACQ	V1, V9, V0, V17
    112 	VACCCQ	V2, V10, V25, V26
    113 	VACQ	V2, V10, V25, V18
    114 
    115 
    116 	VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6
    117 	VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14
    118 	ADD	$32, R5
    119 	ADD	$32, R6
    120 
    121 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
    122 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
    123 	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
    124 	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
    125 
    126 	VACCCQ	V3, V11, V26, V27
    127 	VACQ	V3, V11, V26, V19
    128 	VACCCQ	V4, V12, V27, V28
    129 	VACQ	V4, V12, V27, V20
    130 
    131 	VLM	0(R5), V7, V8		// last 32 bytes of x into V7..V8
    132 	VLM	0(R6), V15, V16  	// last 32 bytes of y into V15..V16
    133 	ADD	$32, R5
    134 	ADD	$32, R6
    135 
    136 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
    137 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
    138 	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
    139 	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
    140 
    141 	VACCCQ	V5, V13, V28, V29
    142 	VACQ	V5, V13, V28, V21
    143 	VACCCQ	V6, V14, V29, V30
    144 	VACQ	V6, V14, V29, V22
    145 
    146 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
    147 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
    148 	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
    149 	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
    150 
    151 	VACCCQ	V7, V15, V30, V31
    152 	VACQ	V7, V15, V30, V23
    153 	VACCCQ	V8, V16, V31, V0	//V0 has carry-over
    154 	VACQ	V8, V16, V31, V24
    155 
    156 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
    157 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
    158 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
    159 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
    160 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
    161 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
    162 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
    163 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
    164 	VSTM	V17, V24, 0(R7)  	// 128-bytes into z
    165 	ADD	$128, R7
    166 	ADD	$128, R10	// i += 16
    167 	SUB	$16,  R3	// n -= 16
    168 	BGE	UU1		// if n >= 0 goto UU1
    169 	VLGVG	$1, V0, R4	// put cf into R4
    170 	NEG	R4, R4		// save cf as 0 or -1, the form the scalar loops use
    171 
    172 A1:	ADD	$12, R3		// n += 12 (n is now remaining - 4)
    173 
    174 
    175 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the 4x loop
    176 	BLT	v1		// if n < 0 goto v1
    177 
    178 U1:	// n >= 0
    179 	// regular loop body unrolled 4x
    180 	MOVD	0(R8)(R10*1), R5
    181 	MOVD	8(R8)(R10*1), R6
    182 	MOVD	16(R8)(R10*1), R7
    183 	MOVD	24(R8)(R10*1), R1
    184 	ADDC	R4, R4		// restore CF: R4 is 0 or -1, and -1 + -1 carries out
    185 	MOVD	0(R9)(R10*1), R11
    186 	ADDE	R11, R5
    187 	MOVD	8(R9)(R10*1), R11
    188 	ADDE	R11, R6
    189 	MOVD	16(R9)(R10*1), R11
    190 	ADDE	R11, R7
    191 	MOVD	24(R9)(R10*1), R11
    192 	ADDE	R11, R1
    193 	MOVD	R0, R4
    194 	ADDE	R4, R4		// save CF: R4 = 0 + 0 + carry
    195 	NEG	R4, R4		// R4 = 0 or -1
    196 	MOVD	R5, 0(R2)(R10*1)
    197 	MOVD	R6, 8(R2)(R10*1)
    198 	MOVD	R7, 16(R2)(R10*1)
    199 	MOVD	R1, 24(R2)(R10*1)
    200 
    201 
    202 	ADD	$32, R10	// i += 4
    203 	SUB	$4,  R3		// n -= 4
    204 	BGE	U1		// if n >= 0 goto U1
    205 
    206 v1:	ADD	$4, R3		// n += 4
    207 	BLE	E1		// if n <= 0 goto E1
    208 
    209 L1:	// n > 0
    210 	ADDC	R4, R4		// restore CF
    211 	MOVD	0(R8)(R10*1), R5
    212 	MOVD	0(R9)(R10*1), R11
    213 	ADDE	R11, R5
    214 	MOVD	R5, 0(R2)(R10*1)
    215 	MOVD	R0, R4
    216 	ADDE	R4, R4		// save CF
    217 	NEG 	R4, R4
    218 
    219 	ADD	$8, R10		// i++
    220 	SUB	$1, R3		// n--
    221 	BGT	L1		// if n > 0 goto L1
    222 
    223 E1:	NEG	R4, R4		// 0 or -1 back to 0 or 1
    224 	MOVD	R4, c+72(FP)	// return c
    225 	RET
    226 
    227 TEXT addVV_novec(SB),NOSPLIT,$0
        	// Scalar version of addVV: z[i] = x[i] + y[i] + carry for the z_len words,
        	// returning the final carry in c. A 4x unrolled loop (U1n) is followed by
        	// a single-word loop (L1n) for the remainder.
        	//
        	// R3 = n (biased by the pending SUB), R8/R9/R2 = x/y/z, R10 = byte offset,
        	// R4 = carry kept as 0 or -1 between iterations, R0 = 0.
    228 novec:	// label is not referenced anywhere in this function
    229 	MOVD	z_len+8(FP), R3
    230 	MOVD	x+24(FP), R8
    231 	MOVD	y+48(FP), R9
    232 	MOVD	z+0(FP), R2
    233 
    234 	MOVD	$0, R4		// c = 0
    235 	MOVD	$0, R0		// make sure it's zero
    236 	MOVD	$0, R10		// i = 0
    237 
    238 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loop
    239 	SUB	$4, R3		// n -= 4
    240 	BLT	v1n		// if n < 0 goto v1n
    241 U1n:	// n >= 0
    242 	// regular loop body unrolled 4x
    243 	MOVD	0(R8)(R10*1), R5
    244 	MOVD	8(R8)(R10*1), R6
    245 	MOVD	16(R8)(R10*1), R7
    246 	MOVD	24(R8)(R10*1), R1
    247 	ADDC	R4, R4		// restore CF: R4 is 0 or -1, and -1 + -1 carries out
    248 	MOVD	0(R9)(R10*1), R11
    249 	ADDE	R11, R5
    250 	MOVD	8(R9)(R10*1), R11
    251 	ADDE	R11, R6
    252 	MOVD	16(R9)(R10*1), R11
    253 	ADDE	R11, R7
    254 	MOVD	24(R9)(R10*1), R11
    255 	ADDE	R11, R1
    256 	MOVD	R0, R4
    257 	ADDE	R4, R4		// save CF: R4 = 0 + 0 + carry
    258 	NEG	R4, R4		// R4 = 0 or -1
    259 	MOVD	R5, 0(R2)(R10*1)
    260 	MOVD	R6, 8(R2)(R10*1)
    261 	MOVD	R7, 16(R2)(R10*1)
    262 	MOVD	R1, 24(R2)(R10*1)
    263 
    264 
    265 	ADD	$32, R10	// i += 4
    266 	SUB	$4,  R3		// n -= 4
    267 	BGE	U1n		// if n >= 0 goto U1n
    268 
    269 v1n:	ADD	$4, R3		// n += 4
    270 	BLE	E1n		// if n <= 0 goto E1n
    271 
    272 L1n:	// n > 0
    273 	ADDC	R4, R4		// restore CF
    274 	MOVD	0(R8)(R10*1), R5
    275 	MOVD	0(R9)(R10*1), R11
    276 	ADDE	R11, R5
    277 	MOVD	R5, 0(R2)(R10*1)
    278 	MOVD	R0, R4
    279 	ADDE	R4, R4		// save CF
    280 	NEG 	R4, R4
    281 
    282 	ADD	$8, R10		// i++
    283 	SUB	$1, R3		// n--
    284 	BGT L1n			// if n > 0 goto L1n
    285 
    286 E1n:	NEG	R4, R4		// 0 or -1 back to 0 or 1
    287 	MOVD	R4, c+72(FP)	// return c
    288 	RET
    289 
    290 
    291 TEXT subVV(SB),NOSPLIT,$0
    292 	// Branch through the slot; it holds subVV_check until the first call.
    293 	MOVD	subvectorfacility+0x00(SB), R1
    294 	BR	(R1)
    295 
    296 // subVV_check records the implementation matching hasVX in the slot and
    297 // then branches to it, so later subVV calls go there directly.
    298 TEXT subVV_check(SB),NOSPLIT,$0
    299 	MOVD	$subvectorfacility+0x00(SB), R1	// R1 = &slot, used on both paths
    300 	MOVB	hasVX(SB), R2
    301 	CMPBNE	R2, $1, novx			// hasVX != 1: take the scalar version
    302 	MOVD	$subVV_vec(SB), R2
    303 	MOVD	R2, 0(R1)			// slot = subVV_vec
    304 	BR	subVV_vec(SB)
    305 novx:
    306 	MOVD	$subVV_novec(SB), R2
    307 	MOVD	R2, 0(R1)			// slot = subVV_novec
    308 	BR	subVV_novec(SB)
    309 
    310 GLOBL subvectorfacility+0x00(SB), NOPTR, $8
    311 DATA subvectorfacility+0x00(SB)/8, $subVV_check(SB)
    312 
    313 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    314 // func subVV(z, x, y []Word) (c Word)
    315 // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
        //
        // Vector version of subVV: z[i] = x[i] - y[i] - borrow for the z_len words,
        // returning the final borrow in c. Stages: UU1 (16 words per iteration,
        // vector), U1 (4 words per iteration), L1 (1 word per iteration).
        //
        // R3 = n (biased by the pending SUB), R8/R9/R2 = x/y/z, R10 = byte offset,
        // R4 = borrow kept as 0 or -1 between scalar iterations, R0 = 0,
        // R5/R6/R7 = running x/y/z pointers in UU1 and data registers afterwards.
    316 TEXT subVV_vec(SB),NOSPLIT,$0
    317 	MOVD	z_len+8(FP), R3
    318 	MOVD	x+24(FP), R8
    319 	MOVD	y+48(FP), R9
    320 	MOVD	z+0(FP), R2
    321 	MOVD	$0, R4		// c = 0
    322 	MOVD	$0, R0		// make sure it's zero
    323 	MOVD	$0, R10		// i = 0
    324 
    325 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loops
    326 	SUB	$4, R3		// n -= 4
    327 	BLT	v1		// if n < 0 goto v1
    328 	SUB     $12, R3         // n -= 12 (n is now len - 16)
    329         BLT     A1              // if n < 0 goto A1
    330 
    331 	MOVD	R8, R5		// R5 = running x pointer
    332 	MOVD	R9, R6		// R6 = running y pointer
    333 	MOVD	R2, R7		// R7 = running z pointer
    334 
    335 	// n >= 0
    336 	// regular loop body unrolled 16x
    337 	VZERO	V0		// cf = 0
    338 	MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow)
    339 	VLVGG	$1, R4, V0	//put carry into V0
    340 
    341 UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
    342 	ADD	$64, R5
    343 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
    344 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
    345 
    346 
    347 	VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12
    348 	ADD	$64, R6
    349 	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
    350 	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
    351 
        	// Each pair below produces the borrow indication (VSBCBIQ) and the
        	// difference (VSBIQ) of one 128-bit step. The chain runs
        	// V0 -> V25 -> V26 -> ... -> V31 -> V0.
    352 	VSBCBIQ	V1, V9, V0, V25
    353 	VSBIQ	V1, V9, V0, V17
    354 	VSBCBIQ	V2, V10, V25, V26
    355 	VSBIQ	V2, V10, V25, V18
    356 
    357 
    358 	VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6
    359 	VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14
    360 	ADD	$32, R5
    361 	ADD	$32, R6
    362 
    363 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
    364 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
    365 	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
    366 	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
    367 
    368 	VSBCBIQ	V3, V11, V26, V27
    369 	VSBIQ	V3, V11, V26, V19
    370 	VSBCBIQ	V4, V12, V27, V28
    371 	VSBIQ	V4, V12, V27, V20
    372 
    373 	VLM	0(R5), V7, V8		// last 32 bytes of x into V7..V8
    374 	VLM	0(R6), V15, V16  	// last 32 bytes of y into V15..V16
    375 	ADD	$32, R5
    376 	ADD	$32, R6
    377 
    378 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
    379 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
    380 	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
    381 	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
    382 
    383 	VSBCBIQ	V5, V13, V28, V29
    384 	VSBIQ	V5, V13, V28, V21
    385 	VSBCBIQ	V6, V14, V29, V30
    386 	VSBIQ	V6, V14, V29, V22
    387 
    388 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
    389 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
    390 	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
    391 	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
    392 
    393 	VSBCBIQ	V7, V15, V30, V31
    394 	VSBIQ	V7, V15, V30, V23
    395 	VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over
    396 	VSBIQ	V8, V16, V31, V24
    397 
    398 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
    399 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
    400 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
    401 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
    402 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
    403 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
    404 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
    405 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
    406 	VSTM	V17, V24, 0(R7)   // 128-bytes into z
    407 	ADD	$128, R7
    408 	ADD	$128, R10	// i += 16
    409 	SUB	$16,  R3	// n -= 16
    410 	BGE	UU1		// if n >= 0 goto UU1
    411 	VLGVG	$1, V0, R4	// put cf into R4
    412 	SUB	$1, R4		// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
    413 
    414 A1:	ADD	$12, R3		// n += 12 (n is now remaining - 4)
    415 	BLT	v1		// if n < 0 goto v1
    416 
    417 U1:	// n >= 0
    418 	// regular loop body unrolled 4x
    419 	MOVD	0(R8)(R10*1), R5
    420 	MOVD	8(R8)(R10*1), R6
    421 	MOVD	16(R8)(R10*1), R7
    422 	MOVD	24(R8)(R10*1), R1
    423 	MOVD	R0, R11
    424 	SUBC	R4, R11		// restore CF: 0 - R4 borrows exactly when R4 is -1
    425 	MOVD	0(R9)(R10*1), R11
    426 	SUBE	R11, R5
    427 	MOVD	8(R9)(R10*1), R11
    428 	SUBE	R11, R6
    429 	MOVD	16(R9)(R10*1), R11
    430 	SUBE	R11, R7
    431 	MOVD	24(R9)(R10*1), R11
    432 	SUBE	R11, R1
    433 	MOVD	R0, R4
    434 	SUBE	R4, R4		// save CF: R4 = 0 - 0 - borrow, i.e. 0 or -1
    435 	MOVD	R5, 0(R2)(R10*1)
    436 	MOVD	R6, 8(R2)(R10*1)
    437 	MOVD	R7, 16(R2)(R10*1)
    438 	MOVD	R1, 24(R2)(R10*1)
    439 
    440 	ADD	$32, R10	// i += 4
    441 	SUB	$4,  R3		// n -= 4
    442 	BGE	U1		// if n >= 0 goto U1
    443 
    444 v1:	ADD	$4, R3		// n += 4
    445 	BLE	E1		// if n <= 0 goto E1
    446 
    447 L1:	// n > 0
    448 	MOVD	R0, R11
    449 	SUBC	R4, R11		// restore CF
    450 	MOVD	0(R8)(R10*1), R5
    451 	MOVD	0(R9)(R10*1), R11
    452 	SUBE	R11, R5
    453 	MOVD	R5, 0(R2)(R10*1)
    454 	MOVD	R0, R4
    455 	SUBE	R4, R4		// save CF
    456 
    457 	ADD	$8, R10		// i++
    458 	SUB	$1, R3		// n--
    459 	BGT	L1		// if n > 0 goto L1
    460 
    461 E1:	NEG	R4, R4		// 0 or -1 back to 0 or 1
    462 	MOVD	R4, c+72(FP)	// return c
    463 	RET
    464 
    465 
    466 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    467 // func subVV(z, x, y []Word) (c Word)
    468 // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
        //
        // Scalar version of subVV: z[i] = x[i] - y[i] - borrow for the z_len words,
        // returning the final borrow in c. A 4x unrolled loop (U1) is followed by
        // a single-word loop (L1) for the remainder.
        //
        // R3 = n (biased by the pending SUB), R8/R9/R2 = x/y/z, R10 = byte offset,
        // R4 = borrow kept as 0 or -1 between iterations, R0 = 0.
    469 TEXT subVV_novec(SB),NOSPLIT,$0
    470 	MOVD z_len+8(FP), R3
    471 	MOVD x+24(FP), R8
    472 	MOVD y+48(FP), R9
    473 	MOVD z+0(FP), R2
    474 
    475 	MOVD $0, R4		// c = 0
    476 	MOVD $0, R0		// make sure it's zero
    477 	MOVD $0, R10		// i = 0
    478 
    479 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loop
    480 	SUB  $4, R3		// n -= 4
    481 	BLT v1			// if n < 0 goto v1
    482 
    483 U1:	// n >= 0
    484 	// regular loop body unrolled 4x
    485 	MOVD 0(R8)(R10*1), R5
    486 	MOVD 8(R8)(R10*1), R6
    487 	MOVD 16(R8)(R10*1), R7
    488 	MOVD 24(R8)(R10*1), R1
    489 	MOVD R0, R11
    490 	SUBC R4, R11		// restore CF: 0 - R4 borrows exactly when R4 is -1
    491 	MOVD 0(R9)(R10*1), R11
    492 	SUBE R11, R5
    493 	MOVD 8(R9)(R10*1), R11
    494 	SUBE R11, R6
    495 	MOVD 16(R9)(R10*1), R11
    496 	SUBE R11, R7
    497 	MOVD 24(R9)(R10*1), R11
    498 	SUBE R11, R1
    499 	MOVD R0, R4
    500 	SUBE R4, R4		// save CF: R4 = 0 - 0 - borrow, i.e. 0 or -1
    501 	MOVD R5, 0(R2)(R10*1)
    502 	MOVD R6, 8(R2)(R10*1)
    503 	MOVD R7, 16(R2)(R10*1)
    504 	MOVD R1, 24(R2)(R10*1)
    505 
    506 
    507 	ADD  $32, R10		// i += 4
    508 	SUB  $4,  R3		// n -= 4
    509 	BGE  U1			// if n >= 0 goto U1
    510 
    511 v1:	ADD  $4, R3		// n += 4
    512 	BLE E1			// if n <= 0 goto E1
    513 
    514 L1:	// n > 0
    515 	MOVD R0, R11
    516 	SUBC R4, R11		// restore CF
    517 	MOVD 0(R8)(R10*1), R5
    518 	MOVD 0(R9)(R10*1), R11
    519 	SUBE R11, R5
    520 	MOVD R5, 0(R2)(R10*1)
    521 	MOVD R0, R4
    522 	SUBE R4, R4		// save CF
    523 
    524 	ADD  $8, R10		// i++
    525 	SUB  $1, R3		// n--
    526 	BGT L1			// if n > 0 goto L1
    527 
    528 E1:	NEG  R4, R4		// 0 or -1 back to 0 or 1
    529 	MOVD R4, c+72(FP)	// return c
    530 	RET
    531 
    532 TEXT addVW(SB),NOSPLIT,$0
    533 	// Branch through the slot; it holds addVW_check until the first call.
    534 	MOVD	addwvectorfacility+0x00(SB), R1
    535 	BR	(R1)
    536 
    537 // addVW_check records the implementation matching hasVX in the slot and
    538 // then branches to it, so later addVW calls go there directly.
    539 TEXT addVW_check(SB),NOSPLIT,$0
    540 	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = &slot, used on both paths
    541 	MOVB	hasVX(SB), R2
    542 	CMPBNE	R2, $1, novx			// hasVX != 1: take the scalar version
    543 	MOVD	$addVW_vec(SB), R2
    544 	MOVD	R2, 0(R1)			// slot = addVW_vec
    545 	BR	addVW_vec(SB)
    546 novx:
    547 	MOVD	$addVW_novec(SB), R2
    548 	MOVD	R2, 0(R1)			// slot = addVW_novec
    549 	BR	addVW_novec(SB)
    550 
    551 GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
    552 DATA addwvectorfacility+0x00(SB)/8, $addVW_check(SB)
    553 
    554 
    555 // func addVW_vec(z, x []Word, y Word) (c Word)
        //
        // Vector version of addVW: adds the single word y to the z_len-word vector x,
        // stores the result in z and returns the final carry in c. Stages: UU1
        // (16 words per iteration, vector), U4 (4 words per iteration), L4 (1 word
        // per iteration).
        //
        // R3 = n (biased by the pending SUB), R8/R2 = x/z, R10 = byte offset,
        // R4 = y on entry and the running carry (0 or 1) afterwards, R0 = 0,
        // R5/R7 = running x/z pointers in UU1 and data registers afterwards.
    556 TEXT addVW_vec(SB),NOSPLIT,$0
    557 	MOVD	z_len+8(FP), R3
    558 	MOVD	x+24(FP), R8
    559 	MOVD	y+48(FP), R4	// c = y
    560 	MOVD	z+0(FP), R2
    561 
    562 	MOVD	$0, R0		// make sure it's zero
    563 	MOVD	$0, R10		// i = 0
    564 	MOVD	R8, R5		// R5 = running x pointer
    565 	MOVD	R2, R7		// R7 = running z pointer
    566 
    567 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loops
    568 	SUB	$4, R3			// n -= 4
    569 	BLT	v10			// if n < 0 goto v10
    570 	SUB	$12, R3			// n -= 12 (n is now len - 16)
    571 	BLT	A10			// if n < 0 goto A10
    572 
    573 	// n >= 0
    574 	// regular loop body unrolled 16x
    575 
    576 	VZERO	V0			// prepare V0 to be final carry register
    577 	VZERO	V9			// to ensure upper half is zero
    578 	VLVGG	$1, R4, V9		// doubleword element 1 of V9 = y
    579 UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
    580 	ADD	$64, R5
    581 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
    582 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
    583 
    584 
        	// Only the first 128-bit step adds y. V9 is zeroed right after it and stays
        	// zero, so every later step (and every later iteration) only adds the carry.
        	// The carry chain runs V0 -> V25 -> V26 -> ... -> V31 -> V0.
    585 	VACCCQ	V1, V9, V0, V25
    586 	VACQ	V1, V9, V0, V17
    587 	VZERO	V9
    588 	VACCCQ	V2, V9, V25, V26
    589 	VACQ	V2, V9, V25, V18
    590 
    591 
    592 	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
    593 	ADD	$32, R5
    594 
    595 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
    596 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
    597 
    598 	VACCCQ	V3, V9, V26, V27
    599 	VACQ	V3, V9, V26, V19
    600 	VACCCQ	V4, V9, V27, V28
    601 	VACQ	V4, V9, V27, V20
    602 
    603 	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
    604 	ADD	$32, R5
    605 
    606 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
    607 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
    608 
    609 	VACCCQ	V5, V9, V28, V29
    610 	VACQ	V5, V9, V28, V21
    611 	VACCCQ	V6, V9, V29, V30
    612 	VACQ	V6, V9, V29, V22
    613 
    614 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
    615 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
    616 
    617 	VACCCQ	V7, V9, V30, V31
    618 	VACQ	V7, V9, V30, V23
    619 	VACCCQ	V8, V9, V31, V0	//V0 has carry-over
    620 	VACQ	V8, V9, V31, V24
    621 
    622 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
    623 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
    624 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
    625 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
    626 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
    627 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
    628 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
    629 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
    630 	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
    631 	ADD	$128, R7
    632 	ADD	$128, R10		// i += 16
    633 	SUB	$16,  R3		// n -= 16
    634 	BGE	UU1		// if n >= 0 goto UU1
    635 	VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10
    636 
    637 A10:	ADD	$12, R3		// n += 12 (n is now remaining - 4)
    638 
    639 
    640 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the 4x loop
    641 
    642 	BLT	v10		// if n < 0 goto v10
    643 
    644 
    645 U4:	// n >= 0
    646 	// regular loop body unrolled 4x
    647 	MOVD 0(R8)(R10*1), R5
    648 	MOVD 8(R8)(R10*1), R6
    649 	MOVD 16(R8)(R10*1), R7
    650 	MOVD 24(R8)(R10*1), R1
    651 	ADDC R4, R5		// add y (first pass) or the incoming carry
    652 	ADDE R0, R6		// R0 is 0: propagate the carry only
    653 	ADDE R0, R7
    654 	ADDE R0, R1
    655 	ADDE R0, R0		// R0 = 0 + 0 + carry
    656 	MOVD R0, R4		// save CF
    657 	SUB  R0, R0		// R0 back to 0
    658 	MOVD R5, 0(R2)(R10*1)
    659 	MOVD R6, 8(R2)(R10*1)
    660 	MOVD R7, 16(R2)(R10*1)
    661 	MOVD R1, 24(R2)(R10*1)
    662 
    663 	ADD $32, R10		// i += 4 -> i +=32
    664 	SUB $4, R3		// n -= 4
    665 	BGE U4			// if n >= 0 goto U4
    666 
    667 v10:	ADD $4, R3		// n += 4
    668 	BLE E10			// if n <= 0 goto E10
    669 
    670 
    671 L4:	// n > 0
    672 	MOVD	0(R8)(R10*1), R5
    673 	ADDC	R4, R5
    674 	ADDE	R0, R0		// R0 = carry
    675 	MOVD	R0, R4		// save CF
    676 	SUB 	R0, R0		// R0 back to 0
    677 	MOVD	R5, 0(R2)(R10*1)
    678 
    679 	ADD	$8, R10		// i++
    680 	SUB	$1, R3		// n--
    681 	BGT	L4		// if n > 0 goto L4
    682 
    683 E10:	MOVD	R4, c+56(FP)	// return c
    684 
    685 	RET
    686 
    687 
    688 TEXT addVW_novec(SB),NOSPLIT,$0
    689 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
        	// Scalar version of addVW: adds the single word y to the z_len-word vector
        	// x, stores the result in z and returns the final carry in c. A 4x unrolled
        	// loop (U4) is followed by a single-word loop (L4) for the remainder.
        	//
        	// R3 = n (biased by the pending SUB), R8/R2 = x/z, R10 = byte offset,
        	// R4 = y on entry and the running carry (0 or 1) afterwards, R0 = 0.
    690 	MOVD z_len+8(FP), R3
    691 	MOVD x+24(FP), R8
    692 	MOVD y+48(FP), R4	// c = y
    693 	MOVD z+0(FP), R2
    694 	MOVD $0, R0		// make sure it's 0
    695 	MOVD $0, R10		// i = 0
    696 
    697 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loop
    698 	SUB $4, R3		// n -= 4
    699 	BLT v4			// if n < 0 goto v4
    700 
    701 U4:	// n >= 0
    702 	// regular loop body unrolled 4x
    703 	MOVD 0(R8)(R10*1), R5
    704 	MOVD 8(R8)(R10*1), R6
    705 	MOVD 16(R8)(R10*1), R7
    706 	MOVD 24(R8)(R10*1), R1
    707 	ADDC R4, R5		// add y (first pass) or the incoming carry
    708 	ADDE R0, R6		// R0 is 0: propagate the carry only
    709 	ADDE R0, R7
    710 	ADDE R0, R1
    711 	ADDE R0, R0		// R0 = 0 + 0 + carry
    712 	MOVD R0, R4		// save CF
    713 	SUB  R0, R0		// R0 back to 0
    714 	MOVD R5, 0(R2)(R10*1)
    715 	MOVD R6, 8(R2)(R10*1)
    716 	MOVD R7, 16(R2)(R10*1)
    717 	MOVD R1, 24(R2)(R10*1)
    718 
    719 	ADD $32, R10		// i += 4 -> i +=32
    720 	SUB $4, R3		// n -= 4
    721 	BGE U4			// if n >= 0 goto U4
    722 
    723 v4:	ADD $4, R3		// n += 4
    724 	BLE E4			// if n <= 0 goto E4
    725 
    726 L4:	// n > 0
    727 	MOVD 0(R8)(R10*1), R5
    728 	ADDC R4, R5
    729 	ADDE R0, R0		// R0 = carry
    730 	MOVD R0, R4		// save CF
    731 	SUB  R0, R0		// R0 back to 0
    732 	MOVD R5, 0(R2)(R10*1)
    733 
    734 	ADD  $8, R10		// i++
    735 	SUB  $1, R3		// n--
    736 	BGT L4			// if n > 0 goto L4
    737 
    738 E4:	MOVD R4, c+56(FP)	// return c
    739 
    740 	RET
    741 
    742 TEXT subVW(SB),NOSPLIT,$0
    743 	// Branch through the slot; it holds subVW_check until the first call.
    744 	MOVD	subwvectorfacility+0x00(SB), R1
    745 	BR	(R1)
    746 
    747 // subVW_check records the implementation matching hasVX in the slot and
    748 // then branches to it, so later subVW calls go there directly.
    749 TEXT subVW_check(SB),NOSPLIT,$0
    750 	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = &slot, used on both paths
    751 	MOVB	hasVX(SB), R2
    752 	CMPBNE	R2, $1, novx			// hasVX != 1: take the scalar version
    753 	MOVD	$subVW_vec(SB), R2
    754 	MOVD	R2, 0(R1)			// slot = subVW_vec
    755 	BR	subVW_vec(SB)
    756 novx:
    757 	MOVD	$subVW_novec(SB), R2
    758 	MOVD	R2, 0(R1)			// slot = subVW_novec
    759 	BR	subVW_novec(SB)
    760 
    761 GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
    762 DATA subwvectorfacility+0x00(SB)/8, $subVW_check(SB)
    763 
    764 // func subVW(z, x []Word, y Word) (c Word)
        //
        // Vector version of subVW: subtracts the single word y from the z_len-word
        // vector x, stores the result in z and returns the final borrow in c.
        // Stages: UU1 (16 words per iteration, vector), U4 (4 words per iteration),
        // L4 (1 word per iteration).
        //
        // R3 = n (biased by the pending SUB), R8/R2 = x/z, R10 = byte offset,
        // R4 = y on entry and the running borrow (0 or 1) afterwards, R0 = 0,
        // R5/R7 = running x/z pointers in UU1 and data registers afterwards.
    765 TEXT subVW_vec(SB),NOSPLIT,$0
    766 	MOVD	z_len+8(FP), R3
    767 	MOVD	x+24(FP), R8
    768 	MOVD	y+48(FP), R4	// c = y
    769 	MOVD	z+0(FP), R2
    770 
    771 	MOVD	$0, R0		// make sure it's zero
    772 	MOVD	$0, R10		// i = 0
    773 	MOVD	R8, R5		// R5 = running x pointer
    774 	MOVD	R2, R7		// R7 = running z pointer
    775 
    776 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loops
    777 	SUB	$4, R3			// n -= 4
    778 	BLT	v11			// if n < 0 goto v11
    779 	SUB	$12, R3			// n -= 12 (n is now len - 16)
    780 	BLT	A11			// if n < 0 goto A11
    781 
    782 	VZERO	V0
    783 	MOVD	$1, R6			// prepare V0 to be final carry register
    784 	VLVGG	$1, R6, V0		// borrow is initially "no borrow"
    785 	VZERO	V9			// to ensure upper half is zero
    786 	VLVGG	$1, R4, V9		// doubleword element 1 of V9 = y
    787 
    788 	// n >= 0
    789 	// regular loop body unrolled 16x
    790 
    791 
    792 UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
    793 	ADD	$64, R5
    794 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
    795 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
    796 
    797 
        	// Only the first 128-bit step subtracts y. V9 is zeroed right after it and
        	// stays zero, so every later step only applies the borrow. The chain runs
        	// V0 -> V25 -> V26 -> ... -> V31 -> V0.
    798 	VSBCBIQ	V1, V9, V0, V25
    799 	VSBIQ	V1, V9, V0, V17
    800 	VZERO	V9
    801 	VSBCBIQ	V2, V9, V25, V26
    802 	VSBIQ	V2, V9, V25, V18
    803 
    804 	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
    805 	ADD	$32, R5
    806 
    807 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
    808 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
    809 
    810 
    811 	VSBCBIQ	V3, V9, V26, V27
    812 	VSBIQ	V3, V9, V26, V19
    813 	VSBCBIQ	V4, V9, V27, V28
    814 	VSBIQ	V4, V9, V27, V20
    815 
    816 	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
    817 	ADD	$32, R5
    818 
    819 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
    820 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
    821 
    822 	VSBCBIQ	V5, V9, V28, V29
    823 	VSBIQ	V5, V9, V28, V21
    824 	VSBCBIQ	V6, V9, V29, V30
    825 	VSBIQ	V6, V9, V29, V22
    826 
    827 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
    828 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
    829 
    830 	VSBCBIQ	V7, V9, V30, V31
    831 	VSBIQ	V7, V9, V30, V23
    832 	VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over
    833 	VSBIQ	V8, V9, V31, V24
    834 
    835 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
    836 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
    837 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
    838 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
    839 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
    840 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
    841 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
    842 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
    843 	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
    844 	ADD	$128, R7
    845 	ADD	$128, R10		// i += 16
    846 	SUB	$16,  R3		// n -= 16
    847 	BGE	UU1			// if n >= 0 goto UU1
    848 	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
    849 	SUB	$1, R4			// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
    850 	NEG	R4, R4			// R4 = borrow as 0 or 1
    851 A11:	ADD	$12, R3			// n += 12 (n is now remaining - 4)
    852 
    853 	BLT	v11			// if n < 0 goto v11
    854 
    855 	// n >= 0
    856 	// regular loop body unrolled 4x
    857 
    858 U4:	// n >= 0
    859 	// regular loop body unrolled 4x
    860 	MOVD 0(R8)(R10*1), R5
    861 	MOVD 8(R8)(R10*1), R6
    862 	MOVD 16(R8)(R10*1), R7
    863 	MOVD 24(R8)(R10*1), R1
    864 	SUBC R4, R5 //SLGR  -> SUBC; subtract y (first pass) or the incoming borrow
    865 	SUBE R0, R6 //SLBGR -> SUBE; R0 is 0: propagate the borrow only
    866 	SUBE R0, R7
    867 	SUBE R0, R1
    868 	SUBE R4, R4		// save CF: R4 = R4 - R4 - borrow, i.e. 0 or -1
    869 	NEG  R4, R4		// R4 = borrow as 0 or 1
    870 	MOVD R5, 0(R2)(R10*1)
    871 	MOVD R6, 8(R2)(R10*1)
    872 	MOVD R7, 16(R2)(R10*1)
    873 	MOVD R1, 24(R2)(R10*1)
    874 
    875 	ADD $32, R10		// i += 4 -> i +=32
    876 	SUB $4, R3		// n -= 4
    877 	BGE U4			// if n >= 0 goto U4
    878 
    879 v11:	ADD $4, R3		// n += 4
    880 	BLE E11			// if n <= 0 goto E11
    881 
    882 L4:	// n > 0
    883 
    884 	MOVD	0(R8)(R10*1), R5
    885 	SUBC	R4, R5
    886 	SUBE	R4, R4		// save CF
    887 	NEG	R4, R4
    888 	MOVD	R5, 0(R2)(R10*1)
    889 
    890 	ADD	$8, R10		// i++
    891 	SUB	$1, R3		// n--
    892 	BGT	L4		// if n > 0 goto L4
    893 
    894 E11:	MOVD	R4, c+56(FP)	// return c
    895 
    896 	RET
    897 
    898 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
    899 // func subVW(z, x []Word, y Word) (c Word)
    900 // (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
        //
        // Scalar version of subVW: subtracts the single word y from the z_len-word
        // vector x, stores the result in z and returns the final borrow in c.
        // A 4x unrolled loop (U4) is followed by a single-word loop (L4).
        //
        // R3 = n (biased by the pending SUB), R8/R2 = x/z, R10 = byte offset,
        // R4 = y on entry and the running borrow (0 or 1) afterwards, R0 = 0.
    901 TEXT subVW_novec(SB),NOSPLIT,$0
    902 	MOVD z_len+8(FP), R3
    903 	MOVD x+24(FP), R8
    904 	MOVD y+48(FP), R4	// c = y
    905 	MOVD z+0(FP), R2
    906 	MOVD $0, R0		// make sure it's 0
    907 	MOVD $0, R10		// i = 0
    908 
    909 	// (hint carried over from another port as s/JL/JMP/) change the BLT below to BR to disable the unrolled loop
    910 	SUB $4, R3		// n -= 4
    911 	BLT v4			// if n < 0 goto v4
    912 
    913 U4:	// n >= 0
    914 	// regular loop body unrolled 4x
    915 	MOVD 0(R8)(R10*1), R5
    916 	MOVD 8(R8)(R10*1), R6
    917 	MOVD 16(R8)(R10*1), R7
    918 	MOVD 24(R8)(R10*1), R1
    919 	SUBC R4, R5 //SLGR  -> SUBC; subtract y (first pass) or the incoming borrow
    920 	SUBE R0, R6 //SLBGR -> SUBE; R0 is 0: propagate the borrow only
    921 	SUBE R0, R7
    922 	SUBE R0, R1
    923 	SUBE R4, R4		// save CF: R4 = R4 - R4 - borrow, i.e. 0 or -1
    924 	NEG  R4, R4		// R4 = borrow as 0 or 1
    925 	MOVD R5, 0(R2)(R10*1)
    926 	MOVD R6, 8(R2)(R10*1)
    927 	MOVD R7, 16(R2)(R10*1)
    928 	MOVD R1, 24(R2)(R10*1)
    929 
    930 	ADD $32, R10		// i += 4 -> i +=32
    931 	SUB $4, R3		// n -= 4
    932 	BGE U4			// if n >= 0 goto U4
    933 
    934 v4:	ADD $4, R3		// n += 4
    935 	BLE E4			// if n <= 0 goto E4
    936 
    937 L4:	// n > 0
    938 	MOVD 0(R8)(R10*1), R5
    939 	SUBC R4, R5
    940 	SUBE R4, R4		// save CF
    941 	NEG  R4, R4
    942 	MOVD R5, 0(R2)(R10*1)
    943 
    944 	ADD  $8, R10		// i++
    945 	SUB  $1, R3		// n--
    946 	BGT L4			// if n > 0 goto L4
    947 
    948 E4:	MOVD R4, c+56(FP)	// return c
    949 
    950 	RET
    951 
    952 // func shlVU(z, x []Word, s uint) (c Word)
    953 //
    954 // Shifts the n = len(z) words of x left by s bits into z and returns in c
    955 // the bits shifted out of x[n-1]. n == 0 returns c = 0 without touching z.
    956 // s == 0 (Z80) and s == 64 (Z864) are handled by plain word-copy loops.
    957 //
    958 // Every path walks from the highest word down to word 0, so z may overlap
    959 // x at the same or a higher address. The s == 0 path used to copy upwards
    960 // with a one-word look-ahead, which corrupted the result when z started
    961 // two or more words above x; it now copies downwards like the other paths.
    962 //
    963 // R0 = 0, R2 = z, R8 = x, R4 = s, R7 = 64-s, R5 = byte offset of word i,
    964 // R10 = w1 (the lower neighbour word), R3/R6 = scratch.
    965 TEXT shlVU(SB),NOSPLIT,$0
    966 	MOVD	z_len+8(FP), R5
    967 	MOVD	$0, R0
    968 	SUB	$1, R5             // n--
    969 	BLT	X8b                // n < 0        (n <= 0)
    970 
    971 	// n > 0
    972 	MOVD	s+48(FP), R4
    973 	CMPBEQ	R0, R4, Z80	   // s == 0: plain copy
    974 	MOVD	$64, R6
    975 	CMPBEQ	R6, R4, Z864	   // s == 64: copy shifted up by one whole word
    976 	MOVD	z+0(FP), R2
    977 	MOVD	x+24(FP), R8
    978 	SLD	$3, R5             // R5 = (n-1)*8
    979 	SUB	R4, R6, R7         // R7 = 64 - s
    980 	MOVD	(R8)(R5*1), R10    // w1 = x[n-1]
    981 	SRD	R7, R10, R3
    982 	MOVD	R3, c+56(FP)       // c = x[n-1] >> (64-s)
    983 	BR	E8
    984 
    985 	// i > 0
    986 L8:	MOVD	R10, R3             // w = w1
    987 	MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1]
    988 
    989 	SLD	R4,  R3             // w<<s
    990 	SRD	R7, R10, R6         // w1>>(64-s)
    991 	OR 	R6, R3
    992 	MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>(64-s)
    993 	SUB	$8, R5              // i--
    994 
    995 E8:	CMPBGT	R5, R0, L8	    // while i > 0
    996 
    997 	SLD	R4, R10             // w1<<s
    998 	MOVD	R10, (R2)           // z[0] = w1<<s
    999 	RET
   1000 
   1001 X8b:	MOVD	R0, c+56(FP)        // n == 0: c = 0
   1002 	RET
   1003 
   1004 	// s == 0: z[i] = x[i] for i = n-1 .. 0, c = 0
   1005 Z80:	MOVD	z+0(FP), R2
   1006 	MOVD	x+24(FP), R8
   1007 	SLD	$3, R5             // R5 = (n-1)*8
   1008 	MOVD	R0, c+56(FP)       // nothing is shifted out
   1009 	BR	E8Z
   1010 
   1011 L8Z:	MOVD	(R8)(R5*1), R3
   1012 	MOVD	R3, (R2)(R5*1)     // z[i] = x[i]
   1013 	SUB	$8, R5             // i--
   1014 
   1015 E8Z:	CMPBGT	R5, R0, L8Z        // while i > 0
   1016 
   1017 	MOVD	(R8), R3
   1018 	MOVD	R3, (R2)           // z[0] = x[0]
   1019 	RET
   1020 
   1021 	// s == 64: z[i] = x[i-1] for i = n-1 .. 1, z[0] = 0, c = x[n-1]
   1022 Z864:	MOVD	z+0(FP), R2
   1023 	MOVD	x+24(FP), R8
   1024 	SLD	$3, R5             // R5 = (n-1)*8
   1025 	MOVD	(R8)(R5*1), R3     // x[n-1]
   1026 	MOVD	R3, c+56(FP)       // c = x[n-1]
   1027 	BR	E864
   1028 
   1029 L864:	MOVD	-8(R8)(R5*1), R3
   1030 	MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1]
   1031 	SUB	$8, R5             // i--
   1032 
   1033 E864:	CMPBGT	R5, R0, L864       // while i > 0
   1034 
   1035 	MOVD	R0, (R2)           // z[0] = 0
   1036 	RET
   1037 
   1038 
   1039 // Register use: R4 = s (CX), R8 = &x[0], R2 = &z[0] (r10), R5 = (n-1)*8 (r11), R3 = w (DX), R10 = w1 (AX), R1 = byte offset i*8 (BX), R7 = 64-s, R6 = temp, R0 = 0
   1040 // func shrVU(z, x []Word, s uint) (c Word)
   1041 TEXT shrVU(SB),NOSPLIT,$0	// z = x>>s over len(z) words, from the first word up; c = bits shifted out of x[0]
   1042 	MOVD	z_len+8(FP), R5
   1043 	MOVD	$0, R0
   1044 	SUB	$1, R5             // R5 = n-1
   1045 	BLT	X9b                // n-1 < 0 (empty z): return c = 0
   1046 
   1047 	// n > 0
   1048 	MOVD	s+48(FP), R4
   1049 	CMPBEQ	R0, R4, ZB0	// s == 0: copy x to z unchanged
   1050 	MOVD	$64, R6
   1051 	CMPBEQ 	R6, R4, ZB64	// s == 64: move every word down one position
   1052 	MOVD	z+0(FP), R2
   1053 	MOVD	x+24(FP), R8
   1054 	SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
   1055 	SUB	R4, R6, R7	// R7 = 64-s
   1056 	MOVD	(R8), R10	// w1 = x[0]
   1057 	SLD	R7, R10, R3	// c = w1<<(64-s)
   1058 	MOVD	R3, c+56(FP)
   1059 
   1060 	MOVD	$0, R1		// byte offset i*8 = 0
   1061 	BR 	E9
   1062 
   1063 	// i < n-1
   1064 L9:	MOVD	R10, R3		// w = w1
   1065 	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
   1066 
   1067 	SRD	R4,  R3		// w>>s | w1<<(64-s)
   1068 	SLD	R7, R10, R6
   1069 	OR	R6, R3
   1070 	MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<(64-s)
   1071 	ADD	$8, R1		// i++
   1072 
   1073 E9:	CMPBLT	R1, R5, L9	// loop while i < n-1
   1074 
   1075 	// i >= n-1
   1076 X9a:	SRD	R4, R10		// w1>>s
   1077 	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s
   1078 	RET
   1079 
   1080 X9b:	MOVD	R0, c+56(FP)	// empty input: c = 0
   1081 	RET
   1082 
   1083 ZB0:	MOVD	z+0(FP), R2	// s == 0: z[i] = x[i], c = 0
   1084 	MOVD	x+24(FP), R8
   1085 	SLD	$3, R5		// R5 = (n-1)*8
   1086 
   1087 	MOVD	(R8), R10	// w1 = x[0]
   1088 	MOVD	$0, R3		// nothing is shifted out when s == 0
   1089 	MOVD	R3, c+56(FP)	// c = 0
   1090 
   1091 	MOVD	$0, R1		// byte offset i*8 = 0
   1092 	BR	E9Z
   1093 
   1094 	// i < n-1
   1095 L9Z:	MOVD	R10, R3		// w = w1
   1096 	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
   1097 
   1098 	MOVD	R3, (R2)(R1*1)	// z[i] = w
   1099 	ADD	$8, R1		// i++
   1100 
   1101 E9Z:	CMPBLT	R1, R5, L9Z	// loop while i < n-1
   1102 
   1103 	// i >= n-1
   1104 	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1
   1105 	RET
   1106 
   1107 ZB64:	MOVD	z+0(FP), R2	// s == 64: z[i] = x[i+1], z[n-1] = 0, c = x[0]
   1108 	MOVD	x+24(FP), R8
   1109 	SLD	$3, R5		// R5 = (n-1)*8
   1110 	MOVD	(R8), R3	// w1 = x[0]
   1111 	MOVD	R3, c+56(FP)	// c = x[0]
   1112 
   1113 	MOVD	$0, R1		// byte offset i*8 = 0
   1114 	BR	E964
   1115 
   1116 	// i < n-1
   1117 L964:	MOVD	8(R8)(R1*1), R3	// w1 = x[i+1]
   1118 
   1119 	MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1]
   1120 	ADD	$8, R1		// i++
   1121 
   1122 E964:	CMPBLT	R1, R5, L964	// loop while i < n-1
   1123 
   1124 	// i >= n-1
   1125 	MOVD	$0, R10            // the top word receives no bits
   1126 	MOVD	R10, (R2)(R5*1)    // z[n-1] = 0
   1127 	RET
   1128 
   1129 // Register use: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R1 = byte offset i*8, R7 = i, R6 = x[i] then product high word, R11 = product low word, R0 = 0
   1130 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
   1131 TEXT mulAddVWW(SB),NOSPLIT,$0	// for each i < len(z): (c, z[i]) = x[i]*y + c, starting with c = r
   1132 	MOVD	z+0(FP), R2
   1133 	MOVD	x+24(FP), R8
   1134 	MOVD	y+48(FP), R9
   1135 	MOVD	r+56(FP), R4	// c = r
   1136 	MOVD	z_len+8(FP), R5	// n = len(z)
   1137 	MOVD	$0, R1		// i*8 = 0 (byte offset used for addressing)
   1138 	MOVD	$0, R7		// i = 0 (word counter compared against n)
   1139 	MOVD	$0, R0		// make sure it's zero
   1140 	BR	E5
   1141 
   1142 L5:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
   1143 	MULHDU	R9, R6		// x[i]*y: high word read from R6, low word from R11 below
   1144 	ADDC	R4, R11 	//add to low order bits
   1145 	ADDE	R0, R6		// carry into the high word
   1146 	MOVD	R11, (R2)(R1*1)	// z[i] = low word
   1147 	MOVD	R6, R4		// c = high word
   1148 	ADD	$8, R1		// i*8 + 8
   1149 	ADD	$1, R7		// i++
   1150 
   1151 E5:	CMPBLT	R7, R5, L5	// i < n
   1152 
   1153 	MOVD	R4, c+64(FP)	// return c
   1154 	RET
   1155 
   1156 // func addMulVVW(z, x []Word, y Word) (c Word)
   1157 // For each i < len(z): (c, z[i]) = z[i] + x[i]*y + c, starting with c = 0. Register use: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R12 = n rounded down to even, R1 = byte offset i*8, R7 = i, R6 = product high word, R11 = product low word, R10 = z[i], R0 = 0
   1158 TEXT addMulVVW(SB),NOSPLIT,$0
   1159 	MOVD	z+0(FP), R2
   1160 	MOVD	x+24(FP), R8
   1161 	MOVD	y+48(FP), R9
   1162 	MOVD	z_len+8(FP), R5	// n = len(z)
   1163 
   1164 	MOVD	$0, R1		// i*8 = 0
   1165 	MOVD	$0, R7		// i = 0
   1166 	MOVD	$0, R0		// make sure it's zero
   1167 	MOVD	$0, R4		// c = 0
   1168 
   1169 	MOVD	R5, R12
   1170 	AND	$-2, R12	// R12 = n with the low bit cleared: bound for the 2x unrolled loop
   1171 	CMPBGE	R5, $2, A6	// n >= 2: run the unrolled loop first
   1172 	BR	E6
   1173 
   1174 A6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
   1175 	MULHDU	R9, R6		// x[i]*y: high word read from R6, low word from R11 below
   1176 	MOVD	(R2)(R1*1), R10	// R10 = z[i]; NOTE(review): loaded after MULHDU, presumably because MULHDU overwrites R10 - confirm
   1177 	ADDC	R10, R11	//add to low order bits
   1178 	ADDE	R0, R6		// carry into the high word
   1179 	ADDC	R4, R11		// add the incoming c to the low word
   1180 	ADDE	R0, R6		// carry into the high word
   1181 	MOVD	R6, R4		// c = high word
   1182 	MOVD	R11, (R2)(R1*1)	// z[i] = low word
   1183 
   1184 	MOVD	(8)(R8)(R1*1), R6	// R6 = x[i+1]
   1185 	MULHDU	R9, R6
   1186 	MOVD	(8)(R2)(R1*1), R10	// R10 = z[i+1]
   1187 	ADDC	R10, R11	//add to low order bits
   1188 	ADDE	R0, R6
   1189 	ADDC	R4, R11
   1190 	ADDE	R0, R6
   1191 	MOVD	R6, R4		// c = high word
   1192 	MOVD	R11, (8)(R2)(R1*1)	// z[i+1] = low word
   1193 
   1194 	ADD	$16, R1		// i*8 + 16
   1195 	ADD	$2, R7		// i += 2
   1196 
   1197 	CMPBLT	R7, R12, A6	// loop while i < R12
   1198 	BR	E6
   1199 
   1200 L6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
   1201 	MULHDU	R9, R6
   1202 	MOVD	(R2)(R1*1), R10	// R10 = z[i]
   1203 	ADDC	R10, R11	//add to low order bits
   1204 	ADDE	R0, R6
   1205 	ADDC	R4, R11
   1206 	ADDE	R0, R6
   1207 	MOVD	R6, R4		// c = high word
   1208 	MOVD	R11, (R2)(R1*1)	// z[i] = low word
   1209 
   1210 	ADD	$8, R1		// i*8 + 8
   1211 	ADD	$1, R7		// i++
   1212 
   1213 E6:	CMPBLT	R7, R5, L6	// i < n
   1214 
   1215 	MOVD	R4, c+56(FP)	// return c
   1216 	RET
   1217 
   1218 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
   1219 // Walks x from the last word down: each step divides (r, x[i]) by y, stores the quotient in z[i] and keeps the remainder in r (initially xn). Register use: R2 = &z[0], R8 = &x[0], R9 = y, R10 = r, R11 = x[i] then quotient, R7 = i, R1 = byte offset i*8, R0 = 0
   1220 TEXT divWVW(SB),NOSPLIT,$0
   1221 	MOVD	z+0(FP), R2
   1222 	MOVD	xn+24(FP), R10	// r = xn
   1223 	MOVD	x+32(FP), R8
   1224 	MOVD	y+56(FP), R9
   1225 	MOVD	z_len+8(FP), R7	// i = len(z)
   1226 	SLD	$3, R7, R1		// R1 = i*8
   1227 	MOVD	$0, R0		// make sure it's zero
   1228 	BR	E7
   1229 
   1230 L7:	MOVD	(R8)(R1*1), R11	// R11 = x[i]
   1231 	WORD	$0xB98700A9	//DLGR R10,R9: divide R10:R11 by R9; quotient is read from R11, remainder from R10
   1232 	MOVD	R11, (R2)(R1*1)	// z[i] = quotient
   1233 
   1234 E7:	SUB	$1, R7		// i--
   1235 	SUB	$8, R1		// byte offset -= 8; this SUB is the one the BGE below tests
   1236 	BGE	L7		// i >= 0
   1237 
   1238 	MOVD	R10, r+64(FP)	// return the final remainder
   1239 	RET
   1240