Home | History | Annotate | Download | only in big
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build !math_big_pure_go
      6 
      7 #include "textflag.h"
      8 
      9 // This file provides fast assembly versions for the elementary
     10 // arithmetic operations on vectors implemented in arith.go.
     11 
     12 // func mulWW(x, y Word) (z1, z0 Word)
        // mulWW returns the full 128-bit unsigned product x*y:
        // z1 receives the high 64 bits, z0 the low 64 bits.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·mulWW); this listing shows none - confirm upstream.
     13 TEXT mulWW(SB),NOSPLIT,$0
     14 	MOVQ x+0(FP), AX
     15 	MULQ y+8(FP)		// DX:AX = AX * y
     16 	MOVQ DX, z1+16(FP)	// z1 = high word of the product
     17 	MOVQ AX, z0+24(FP)	// z0 = low word of the product
     18 	RET
     19 
     20 
     21 // func divWW(x1, x0, y Word) (q, r Word)
        // divWW divides the 128-bit value x1:x0 (x1 = high word) by y and
        // returns the quotient q and the remainder r.
        // Nothing here validates the operands: DIVQ raises a hardware divide
        // error when y == 0 or when the quotient does not fit in 64 bits
        // (i.e. x1 >= y).
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·divWW); this listing shows none - confirm upstream.
     22 TEXT divWW(SB),NOSPLIT,$0
     23 	MOVQ x1+0(FP), DX	// DX = high word of the dividend
     24 	MOVQ x0+8(FP), AX	// AX = low word of the dividend
     25 	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
     26 	MOVQ AX, q+24(FP)
     27 	MOVQ DX, r+32(FP)
     28 	RET
     29 
     30 // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
     31 // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
     32 // This is faster than using rotate instructions.
     33 
     34 // func addVV(z, x, y []Word) (c Word)
        // addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and
        // returns the final carry (0 or 1) in c.
        // Only len(z) is loaded; x and y are indexed up to len(z) without any
        // check of their own lengths.
        // Register roles: DI = n (words remaining), SI = i (word index),
        // R8/R9/R10 = base of x/y/z, CX = carry parked as 0 or -1 between steps
        // because the index/counter updates overwrite CF.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·addVV); this listing shows none - confirm upstream.
     35 TEXT addVV(SB),NOSPLIT,$0
     36 	MOVQ z_len+8(FP), DI
     37 	MOVQ x+24(FP), R8
     38 	MOVQ y+48(FP), R9
     39 	MOVQ z+0(FP), R10
     40 
     41 	MOVQ $0, CX		// c = 0
     42 	MOVQ $0, SI		// i = 0
     43 
     44 	// s/JL/JMP/ below to disable the unrolled loop
     45 	SUBQ $4, DI		// n -= 4
     46 	JL V1			// if n < 0 goto V1 (fewer than 4 words in total)
     47 
     48 U1:	// n >= 0
     49 	// regular loop body unrolled 4x
     50 	ADDQ CX, CX		// restore CF
        	// The MOVQ loads and stores below leave CF untouched, so the carry
        	// ripples through the four ADCQs.
     51 	MOVQ 0(R8)(SI*8), R11
     52 	MOVQ 8(R8)(SI*8), R12
     53 	MOVQ 16(R8)(SI*8), R13
     54 	MOVQ 24(R8)(SI*8), R14
     55 	ADCQ 0(R9)(SI*8), R11
     56 	ADCQ 8(R9)(SI*8), R12
     57 	ADCQ 16(R9)(SI*8), R13
     58 	ADCQ 24(R9)(SI*8), R14
     59 	MOVQ R11, 0(R10)(SI*8)
     60 	MOVQ R12, 8(R10)(SI*8)
     61 	MOVQ R13, 16(R10)(SI*8)
     62 	MOVQ R14, 24(R10)(SI*8)
     63 	SBBQ CX, CX		// save CF
     64 
     65 	ADDQ $4, SI		// i += 4
     66 	SUBQ $4, DI		// n -= 4
     67 	JGE U1			// if n >= 0 goto U1
     68 
     69 V1:	ADDQ $4, DI		// n += 4 (undo the bias; n = leftover words, 0..3)
     70 	JLE E1			// if n <= 0 goto E1
     71 
     72 L1:	// n > 0
        	// one word per iteration for the leftover words
     73 	ADDQ CX, CX		// restore CF
     74 	MOVQ 0(R8)(SI*8), R11
     75 	ADCQ 0(R9)(SI*8), R11
     76 	MOVQ R11, 0(R10)(SI*8)
     77 	SBBQ CX, CX		// save CF
     78 
     79 	ADDQ $1, SI		// i++
     80 	SUBQ $1, DI		// n--
     81 	JG L1			// if n > 0 goto L1
     82 
     83 E1:	NEGQ CX			// convert the saved carry from 0/-1 to 0/1
     84 	MOVQ CX, c+72(FP)	// return c
     85 	RET
     86 
     87 
     88 // func subVV(z, x, y []Word) (c Word)
     89 // (same as addVV except for SBBQ instead of ADCQ and label names)
        // subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and
        // returns the final borrow (0 or 1) in c.
        // Only len(z) is loaded; x and y are indexed up to len(z) without any
        // check of their own lengths.
        // Register roles: DI = n (words remaining), SI = i (word index),
        // R8/R9/R10 = base of x/y/z, CX = borrow parked as 0 or -1 between steps
        // because the index/counter updates overwrite CF.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·subVV); this listing shows none - confirm upstream.
     90 TEXT subVV(SB),NOSPLIT,$0
     91 	MOVQ z_len+8(FP), DI
     92 	MOVQ x+24(FP), R8
     93 	MOVQ y+48(FP), R9
     94 	MOVQ z+0(FP), R10
     95 
     96 	MOVQ $0, CX		// c = 0
     97 	MOVQ $0, SI		// i = 0
     98 
     99 	// s/JL/JMP/ below to disable the unrolled loop
    100 	SUBQ $4, DI		// n -= 4
    101 	JL V2			// if n < 0 goto V2 (fewer than 4 words in total)
    102 
    103 U2:	// n >= 0
    104 	// regular loop body unrolled 4x
    105 	ADDQ CX, CX		// restore CF
        	// The MOVQ loads and stores below leave CF untouched, so the borrow
        	// ripples through the four SBBQs.
    106 	MOVQ 0(R8)(SI*8), R11
    107 	MOVQ 8(R8)(SI*8), R12
    108 	MOVQ 16(R8)(SI*8), R13
    109 	MOVQ 24(R8)(SI*8), R14
    110 	SBBQ 0(R9)(SI*8), R11
    111 	SBBQ 8(R9)(SI*8), R12
    112 	SBBQ 16(R9)(SI*8), R13
    113 	SBBQ 24(R9)(SI*8), R14
    114 	MOVQ R11, 0(R10)(SI*8)
    115 	MOVQ R12, 8(R10)(SI*8)
    116 	MOVQ R13, 16(R10)(SI*8)
    117 	MOVQ R14, 24(R10)(SI*8)
    118 	SBBQ CX, CX		// save CF
    119 
    120 	ADDQ $4, SI		// i += 4
    121 	SUBQ $4, DI		// n -= 4
    122 	JGE U2			// if n >= 0 goto U2
    123 
    124 V2:	ADDQ $4, DI		// n += 4 (undo the bias; n = leftover words, 0..3)
    125 	JLE E2			// if n <= 0 goto E2
    126 
    127 L2:	// n > 0
        	// one word per iteration for the leftover words
    128 	ADDQ CX, CX		// restore CF
    129 	MOVQ 0(R8)(SI*8), R11
    130 	SBBQ 0(R9)(SI*8), R11
    131 	MOVQ R11, 0(R10)(SI*8)
    132 	SBBQ CX, CX		// save CF
    133 
    134 	ADDQ $1, SI		// i++
    135 	SUBQ $1, DI		// n--
    136 	JG L2			// if n > 0 goto L2
    137 
    138 E2:	NEGQ CX			// convert the saved borrow from 0/-1 to 0/1
    139 	MOVQ CX, c+72(FP)	// return c
    140 	RET
    141 
    142 
    143 // func addVW(z, x []Word, y Word) (c Word)
        // addVW adds the single word y to the vector x, storing the result in z:
        // z[i] = x[i] + c, where c starts as y and afterwards is the carry (0 or 1)
        // out of the previous word. The final c is returned.
        // Only len(z) is loaded; x is indexed up to len(z) without a check of
        // its own length. When len(z) == 0 no word is processed and c == y.
        // Register roles: DI = n (words remaining), SI = i (word index),
        // R8/R10 = base of x/z, CX = c (a plain value here, not a 0/-1 mask).
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·addVW); this listing shows none - confirm upstream.
    144 TEXT addVW(SB),NOSPLIT,$0
    145 	MOVQ z_len+8(FP), DI
    146 	MOVQ x+24(FP), R8
    147 	MOVQ y+48(FP), CX	// c = y
    148 	MOVQ z+0(FP), R10
    149 
    150 	MOVQ $0, SI		// i = 0
    151 
    152 	// s/JL/JMP/ below to disable the unrolled loop
    153 	SUBQ $4, DI		// n -= 4
    154 	JL V3			// if n < 0 goto V3 (fewer than 4 words in total)
    155 
    156 U3:	// n >= 0
    157 	// regular loop body unrolled 4x
    158 	MOVQ 0(R8)(SI*8), R11
    159 	MOVQ 8(R8)(SI*8), R12
    160 	MOVQ 16(R8)(SI*8), R13
    161 	MOVQ 24(R8)(SI*8), R14
    162 	ADDQ CX, R11		// x[i] + c
    163 	ADCQ $0, R12		// propagate the carry through the next three words
    164 	ADCQ $0, R13
    165 	ADCQ $0, R14
    166 	SBBQ CX, CX		// save CF
    167 	NEGQ CX			// c = carry out as 0 or 1
    168 	MOVQ R11, 0(R10)(SI*8)
    169 	MOVQ R12, 8(R10)(SI*8)
    170 	MOVQ R13, 16(R10)(SI*8)
    171 	MOVQ R14, 24(R10)(SI*8)
    172 
    173 	ADDQ $4, SI		// i += 4
    174 	SUBQ $4, DI		// n -= 4
    175 	JGE U3			// if n >= 0 goto U3
    176 
    177 V3:	ADDQ $4, DI		// n += 4 (undo the bias; n = leftover words, 0..3)
    178 	JLE E3			// if n <= 0 goto E3
    179 
    180 L3:	// n > 0
    181 	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
    182 	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
    183 	SBBQ CX, CX		// save CF
    184 	NEGQ CX			// c = carry out as 0 or 1
    185 
    186 	ADDQ $1, SI		// i++
    187 	SUBQ $1, DI		// n--
    188 	JG L3			// if n > 0 goto L3
    189 
    190 E3:	MOVQ CX, c+56(FP)	// return c
    191 	RET
    192 
    193 
    194 // func subVW(z, x []Word, y Word) (c Word)
    195 // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        // subVW subtracts the single word y from the vector x, storing the result
        // in z: z[i] = x[i] - c, where c starts as y and afterwards is the borrow
        // (0 or 1) out of the previous word. The final c is returned.
        // Only len(z) is loaded; x is indexed up to len(z) without a check of
        // its own length. When len(z) == 0 no word is processed and c == y.
        // Register roles: DI = n (words remaining), SI = i (word index),
        // R8/R10 = base of x/z, CX = c (a plain value here, not a 0/-1 mask).
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·subVW); this listing shows none - confirm upstream.
    196 TEXT subVW(SB),NOSPLIT,$0
    197 	MOVQ z_len+8(FP), DI
    198 	MOVQ x+24(FP), R8
    199 	MOVQ y+48(FP), CX	// c = y
    200 	MOVQ z+0(FP), R10
    201 
    202 	MOVQ $0, SI		// i = 0
    203 
    204 	// s/JL/JMP/ below to disable the unrolled loop
    205 	SUBQ $4, DI		// n -= 4
    206 	JL V4			// if n < 0 goto V4 (fewer than 4 words in total)
    207 
    208 U4:	// n >= 0
    209 	// regular loop body unrolled 4x
    210 	MOVQ 0(R8)(SI*8), R11
    211 	MOVQ 8(R8)(SI*8), R12
    212 	MOVQ 16(R8)(SI*8), R13
    213 	MOVQ 24(R8)(SI*8), R14
    214 	SUBQ CX, R11		// x[i] - c
    215 	SBBQ $0, R12		// propagate the borrow through the next three words
    216 	SBBQ $0, R13
    217 	SBBQ $0, R14
    218 	SBBQ CX, CX		// save CF
    219 	NEGQ CX			// c = borrow out as 0 or 1
    220 	MOVQ R11, 0(R10)(SI*8)
    221 	MOVQ R12, 8(R10)(SI*8)
    222 	MOVQ R13, 16(R10)(SI*8)
    223 	MOVQ R14, 24(R10)(SI*8)
    224 
    225 	ADDQ $4, SI		// i += 4
    226 	SUBQ $4, DI		// n -= 4
    227 	JGE U4			// if n >= 0 goto U4
    228 
    229 V4:	ADDQ $4, DI		// n += 4 (undo the bias; n = leftover words, 0..3)
    230 	JLE E4			// if n <= 0 goto E4
    231 
    232 L4:	// n > 0
    233 	MOVQ 0(R8)(SI*8), R11
    234 	SUBQ CX, R11		// x[i] - c
    235 	MOVQ R11, 0(R10)(SI*8)
    236 	SBBQ CX, CX		// save CF
    237 	NEGQ CX			// c = borrow out as 0 or 1
    238 
    239 	ADDQ $1, SI		// i++
    240 	SUBQ $1, DI		// n--
    241 	JG L4			// if n > 0 goto L4
    242 
    243 E4:	MOVQ CX, c+56(FP)	// return c
    244 	RET
    245 
    246 
    247 // func shlVU(z, x []Word, s uint) (c Word)
        // shlVU shifts the vector x left by s bits into z, walking from the most
        // significant word down to word 0:
        //   c    = x[n-1] >> (64-s)                 (bits shifted out of the top)
        //   z[i] = x[i]<<s | x[i-1] >> (64-s)       for i = n-1 .. 1
        //   z[0] = x[0]<<s
        // with n = len(z). Returns c = 0 when n == 0. x's own length is not read.
        // Register roles: BX = i, R8/R10 = base of x/z, CX = s, AX = w1, DX = w.
        // NOTE(review): the shift count is taken from CX as-is; callers presumably
        // keep s < 64 - confirm.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·shlVU); this listing shows none - confirm upstream.
    248 TEXT shlVU(SB),NOSPLIT,$0
    249 	MOVQ z_len+8(FP), BX	// i = len(z)
    250 	SUBQ $1, BX		// i--
    251 	JL X8b			// i < 0 (n <= 0): nothing to shift
    252 
    253 	// n > 0
    254 	MOVQ z+0(FP), R10
    255 	MOVQ x+24(FP), R8
    256 	MOVQ s+48(FP), CX
    257 	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
    258 	MOVQ $0, DX
    259 	SHLQ CX, DX:AX		// DX = w1>>(64-s): double-word shift, AX is unchanged
    260 	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
    261 
    262 	CMPQ BX, $0
    263 	JLE X8a			// i <= 0
    264 
    265 	// i > 0
    266 L8:	MOVQ AX, DX		// w = w1
    267 	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
    268 	SHLQ CX, DX:AX		// w<<s | w1>>(64-s)
    269 	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>(64-s)
    270 	SUBQ $1, BX		// i--
    271 	JG L8			// i > 0
    272 
    273 	// i <= 0
    274 X8a:	SHLQ CX, AX		// w1<<s
    275 	MOVQ AX, (R10)		// z[0] = w1<<s
    276 	RET
    277 
    278 X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
    279 	RET
    280 
    281 
    282 // func shrVU(z, x []Word, s uint) (c Word)
        // shrVU shifts the vector x right by s bits into z, walking from word 0 up
        // to the most significant word:
        //   c      = x[0] << (64-s)                 (bits shifted out of the bottom)
        //   z[i]   = x[i]>>s | x[i+1] << (64-s)     for i = 0 .. n-2
        //   z[n-1] = x[n-1]>>s
        // with n = len(z). Returns c = 0 when n == 0. x's own length is not read.
        // Register roles: R11 = n-1, BX = i, R8/R10 = base of x/z, CX = s,
        // AX = w1, DX = w.
        // NOTE(review): the shift count is taken from CX as-is; callers presumably
        // keep s < 64 - confirm.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·shrVU); this listing shows none - confirm upstream.
    283 TEXT shrVU(SB),NOSPLIT,$0
    284 	MOVQ z_len+8(FP), R11
    285 	SUBQ $1, R11		// n--
    286 	JL X9b			// n < 0 (original n <= 0): nothing to shift
    287 
    288 	// n > 0
    289 	MOVQ z+0(FP), R10
    290 	MOVQ x+24(FP), R8
    291 	MOVQ s+48(FP), CX
    292 	MOVQ (R8), AX		// w1 = x[0]
    293 	MOVQ $0, DX
    294 	SHRQ CX, DX:AX		// DX = w1<<(64-s): double-word shift, AX is unchanged
    295 	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
    296 
    297 	MOVQ $0, BX		// i = 0
    298 	JMP E9
    299 
    300 	// i < n-1
    301 L9:	MOVQ AX, DX		// w = w1
    302 	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
    303 	SHRQ CX, DX:AX		// w>>s | w1<<(64-s)
    304 	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<(64-s)
    305 	ADDQ $1, BX		// i++
    306 
    307 E9:	CMPQ BX, R11
    308 	JL L9			// i < n-1
    309 
    310 	// i >= n-1
    311 X9a:	SHRQ CX, AX		// w1>>s
    312 	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
    313 	RET
    314 
    315 X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
    316 	RET
    317 
    318 
    319 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // mulAddVWW computes z = x*y + r word by word: for i in [0, len(z)),
        // the 128-bit value x[i]*y + c is formed, its low word is stored in z[i]
        // and its high word becomes the next c. c starts as r and the final c is
        // returned (so c == r when len(z) == 0).
        // Only len(z) is loaded; x is indexed up to len(z) without a check of
        // its own length.
        // Register roles: R8/R10 = base of x/z, R9 = y, CX = c, R11 = n,
        // BX = i, DX:AX = product (DX doubles as scratch for the loop test).
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·mulAddVWW); this listing shows none - confirm upstream.
    320 TEXT mulAddVWW(SB),NOSPLIT,$0
    321 	MOVQ z+0(FP), R10
    322 	MOVQ x+24(FP), R8
    323 	MOVQ y+48(FP), R9
    324 	MOVQ r+56(FP), CX	// c = r
    325 	MOVQ z_len+8(FP), R11
    326 	MOVQ $0, BX		// i = 0
    327 	
    328 	CMPQ R11, $4
    329 	JL E5			// fewer than 4 words: skip the unrolled loop
    330 	
    331 U5:	// i+4 <= n
    332 	// regular loop body unrolled 4x
    333 	MOVQ (0*8)(R8)(BX*8), AX
    334 	MULQ R9			// DX:AX = x[i] * y
    335 	ADDQ CX, AX		// add the incoming carry word to the low half
    336 	ADCQ $0, DX		// and propagate into the high half
    337 	MOVQ AX, (0*8)(R10)(BX*8)
    338 	MOVQ DX, CX		// c = high half
    339 	MOVQ (1*8)(R8)(BX*8), AX
    340 	MULQ R9
    341 	ADDQ CX, AX
    342 	ADCQ $0, DX
    343 	MOVQ AX, (1*8)(R10)(BX*8)
    344 	MOVQ DX, CX
    345 	MOVQ (2*8)(R8)(BX*8), AX
    346 	MULQ R9
    347 	ADDQ CX, AX
    348 	ADCQ $0, DX
    349 	MOVQ AX, (2*8)(R10)(BX*8)
    350 	MOVQ DX, CX
    351 	MOVQ (3*8)(R8)(BX*8), AX
    352 	MULQ R9
    353 	ADDQ CX, AX
    354 	ADCQ $0, DX
    355 	MOVQ AX, (3*8)(R10)(BX*8)
    356 	MOVQ DX, CX
    357 	ADDQ $4, BX		// i += 4
    358 	
    359 	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was copied to CX)
    360 	CMPQ DX, R11
    361 	JLE U5			// if i+4 <= n goto U5
    362 	JMP E5
    363 
        	// one word per iteration for the leftover words
    364 L5:	MOVQ (R8)(BX*8), AX
    365 	MULQ R9
    366 	ADDQ CX, AX
    367 	ADCQ $0, DX
    368 	MOVQ AX, (R10)(BX*8)
    369 	MOVQ DX, CX
    370 	ADDQ $1, BX		// i++
    371 
    372 E5:	CMPQ BX, R11		// i < n
    373 	JL L5
    374 
    375 	MOVQ CX, c+64(FP)	// return c
    376 	RET
    377 
    378 
    379 // func addMulVVW(z, x []Word, y Word) (c Word)
        // addMulVVW computes z += x*y word by word: for i in [0, len(z)), the
        // 128-bit value x[i]*y + z[i] + c is formed, its low word is stored back
        // in z[i] and its high word becomes the next c. c starts at 0 and the
        // final c is returned.
        // Only len(z) is loaded; x is indexed up to len(z) without a check of
        // its own length.
        // Register roles: R8/R10 = base of x/z, R9 = y, CX = c, R11 = n,
        // R12 = n rounded down to an even count, BX = i, DX:AX = product.
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·addMulVVW); this listing shows none - confirm upstream.
    380 TEXT addMulVVW(SB),NOSPLIT,$0
    381 	MOVQ z+0(FP), R10
    382 	MOVQ x+24(FP), R8
    383 	MOVQ y+48(FP), R9
    384 	MOVQ z_len+8(FP), R11
    385 	MOVQ $0, BX		// i = 0
    386 	MOVQ $0, CX		// c = 0
    387 	MOVQ R11, R12
    388 	ANDQ $-2, R12		// R12 = n with the lowest bit cleared
    389 	CMPQ R11, $2
    390 	JAE A6			// at least 2 words: enter the 2x unrolled loop
    391 	JMP E6
    392 
        	// loop body unrolled 2x; runs while i < R12
    393 A6:
    394 	MOVQ (R8)(BX*8), AX
    395 	MULQ R9			// DX:AX = x[i] * y
    396 	ADDQ (R10)(BX*8), AX	// + z[i]
    397 	ADCQ $0, DX
    398 	ADDQ CX, AX		// + c
    399 	ADCQ $0, DX
    400 	MOVQ DX, CX		// c = high half
    401 	MOVQ AX, (R10)(BX*8)	// z[i] = low half
    402 
    403 	MOVQ (8)(R8)(BX*8), AX
    404 	MULQ R9			// DX:AX = x[i+1] * y
    405 	ADDQ (8)(R10)(BX*8), AX	// + z[i+1]
    406 	ADCQ $0, DX
    407 	ADDQ CX, AX		// + c
    408 	ADCQ $0, DX
    409 	MOVQ DX, CX		// c = high half
    410 	MOVQ AX, (8)(R10)(BX*8)	// z[i+1] = low half
    411 
    412 	ADDQ $2, BX		// i += 2
    413 	CMPQ BX, R12
    414 	JL A6
    415 	JMP E6
    416 
        	// one word per iteration; handles the leftover word when n is odd
    417 L6:	MOVQ (R8)(BX*8), AX
    418 	MULQ R9
    419 	ADDQ CX, AX		// + c
    420 	ADCQ $0, DX
    421 	ADDQ AX, (R10)(BX*8)	// z[i] += low half (read-modify-write in memory)
    422 	ADCQ $0, DX
    423 	MOVQ DX, CX		// c = high half
    424 	ADDQ $1, BX		// i++
    425 
    426 E6:	CMPQ BX, R11		// i < n
    427 	JL L6
    428 
    429 	MOVQ CX, c+56(FP)	// return c
    430 	RET
    431 
    432 
    433 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        // divWVW divides the multi-word value formed by xn (most significant)
        // followed by x[len(z)-1] .. x[0] by the single word y. Walking from the
        // top word down, each step divides r:x[i] by y, stores the quotient in
        // z[i] and keeps the remainder in DX as the high word of the next step.
        // r starts as xn; the final remainder is returned.
        // Only len(z) is loaded; x is indexed up to len(z)-1 without a check of
        // its own length. Nothing here validates the operands: DIVQ raises a
        // hardware divide error when y == 0 or when a quotient does not fit in
        // 64 bits (which requires r >= y, e.g. xn >= y on the first step).
        // NOTE(review): Go assembly normally spells package-level symbols with a
        // leading middle dot (·divWVW); this listing shows none - confirm upstream.
    434 TEXT divWVW(SB),NOSPLIT,$0
    435 	MOVQ z+0(FP), R10
    436 	MOVQ xn+24(FP), DX	// r = xn
    437 	MOVQ x+32(FP), R8
    438 	MOVQ y+56(FP), R9
    439 	MOVQ z_len+8(FP), BX	// i = len(z)
    440 	JMP E7
    441 
    442 L7:	MOVQ (R8)(BX*8), AX
    443 	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
    444 	MOVQ AX, (R10)(BX*8)	// z[i] = quotient word
    445 
    446 E7:	SUBQ $1, BX		// i--
    447 	JGE L7			// i >= 0
    448 
    449 	MOVQ DX, r+64(FP)	// return r
    450 	RET
    451