Home | History | Annotate | Download | only in big
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build !math_big_pure_go
      6 
      7 #include "textflag.h"
      8 
      9 // This file provides fast assembly versions for the elementary
     10 // arithmetic operations on vectors implemented in arith.go.
     11 
     12 // func mulWW(x, y Word) (z1, z0 Word)
     13 TEXT mulWW(SB),NOSPLIT,$0
     14 	MOVQ x+0(FP), AX
     15 	MULQ y+8(FP)
     16 	MOVQ DX, z1+16(FP)
     17 	MOVQ AX, z0+24(FP)
     18 	RET
     19 
     20 
     21 // func divWW(x1, x0, y Word) (q, r Word)
     22 TEXT divWW(SB),NOSPLIT,$0
     23 	MOVQ x1+0(FP), DX
     24 	MOVQ x0+8(FP), AX
     25 	DIVQ y+16(FP)
     26 	MOVQ AX, q+24(FP)
     27 	MOVQ DX, r+32(FP)
     28 	RET
     29 
// Between loop iterations the carry flag (CF) is kept in a register.
// It is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
//
// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx, which clears the carry bit!
     35 
     36 // func addVV(z, x, y []Word) (c Word)
     37 TEXT addVV(SB),NOSPLIT,$0
     38 	MOVQ z_len+8(FP), DI
     39 	MOVQ x+24(FP), R8
     40 	MOVQ y+48(FP), R9
     41 	MOVQ z+0(FP), R10
     42 
     43 	MOVQ $0, CX		// c = 0
     44 	MOVQ $0, SI		// i = 0
     45 
     46 	// s/JL/JMP/ below to disable the unrolled loop
     47 	SUBQ $4, DI		// n -= 4
     48 	JL V1			// if n < 0 goto V1
     49 
     50 U1:	// n >= 0
     51 	// regular loop body unrolled 4x
     52 	ADDQ CX, CX		// restore CF
     53 	MOVQ 0(R8)(SI*8), R11
     54 	MOVQ 8(R8)(SI*8), R12
     55 	MOVQ 16(R8)(SI*8), R13
     56 	MOVQ 24(R8)(SI*8), R14
     57 	ADCQ 0(R9)(SI*8), R11
     58 	ADCQ 8(R9)(SI*8), R12
     59 	ADCQ 16(R9)(SI*8), R13
     60 	ADCQ 24(R9)(SI*8), R14
     61 	MOVQ R11, 0(R10)(SI*8)
     62 	MOVQ R12, 8(R10)(SI*8)
     63 	MOVQ R13, 16(R10)(SI*8)
     64 	MOVQ R14, 24(R10)(SI*8)
     65 	SBBQ CX, CX		// save CF
     66 
     67 	ADDQ $4, SI		// i += 4
     68 	SUBQ $4, DI		// n -= 4
     69 	JGE U1			// if n >= 0 goto U1
     70 
     71 V1:	ADDQ $4, DI		// n += 4
     72 	JLE E1			// if n <= 0 goto E1
     73 
     74 L1:	// n > 0
     75 	ADDQ CX, CX		// restore CF
     76 	MOVQ 0(R8)(SI*8), R11
     77 	ADCQ 0(R9)(SI*8), R11
     78 	MOVQ R11, 0(R10)(SI*8)
     79 	SBBQ CX, CX		// save CF
     80 
     81 	ADDQ $1, SI		// i++
     82 	SUBQ $1, DI		// n--
     83 	JG L1			// if n > 0 goto L1
     84 
     85 E1:	NEGQ CX
     86 	MOVQ CX, c+72(FP)	// return c
     87 	RET
     88 
     89 
     90 // func subVV(z, x, y []Word) (c Word)
     91 // (same as addVV except for SBBQ instead of ADCQ and label names)
     92 TEXT subVV(SB),NOSPLIT,$0
     93 	MOVQ z_len+8(FP), DI
     94 	MOVQ x+24(FP), R8
     95 	MOVQ y+48(FP), R9
     96 	MOVQ z+0(FP), R10
     97 
     98 	MOVQ $0, CX		// c = 0
     99 	MOVQ $0, SI		// i = 0
    100 
    101 	// s/JL/JMP/ below to disable the unrolled loop
    102 	SUBQ $4, DI		// n -= 4
    103 	JL V2			// if n < 0 goto V2
    104 
    105 U2:	// n >= 0
    106 	// regular loop body unrolled 4x
    107 	ADDQ CX, CX		// restore CF
    108 	MOVQ 0(R8)(SI*8), R11
    109 	MOVQ 8(R8)(SI*8), R12
    110 	MOVQ 16(R8)(SI*8), R13
    111 	MOVQ 24(R8)(SI*8), R14
    112 	SBBQ 0(R9)(SI*8), R11
    113 	SBBQ 8(R9)(SI*8), R12
    114 	SBBQ 16(R9)(SI*8), R13
    115 	SBBQ 24(R9)(SI*8), R14
    116 	MOVQ R11, 0(R10)(SI*8)
    117 	MOVQ R12, 8(R10)(SI*8)
    118 	MOVQ R13, 16(R10)(SI*8)
    119 	MOVQ R14, 24(R10)(SI*8)
    120 	SBBQ CX, CX		// save CF
    121 
    122 	ADDQ $4, SI		// i += 4
    123 	SUBQ $4, DI		// n -= 4
    124 	JGE U2			// if n >= 0 goto U2
    125 
    126 V2:	ADDQ $4, DI		// n += 4
    127 	JLE E2			// if n <= 0 goto E2
    128 
    129 L2:	// n > 0
    130 	ADDQ CX, CX		// restore CF
    131 	MOVQ 0(R8)(SI*8), R11
    132 	SBBQ 0(R9)(SI*8), R11
    133 	MOVQ R11, 0(R10)(SI*8)
    134 	SBBQ CX, CX		// save CF
    135 
    136 	ADDQ $1, SI		// i++
    137 	SUBQ $1, DI		// n--
    138 	JG L2			// if n > 0 goto L2
    139 
    140 E2:	NEGQ CX
    141 	MOVQ CX, c+72(FP)	// return c
    142 	RET
    143 
    144 
    145 // func addVW(z, x []Word, y Word) (c Word)
    146 TEXT addVW(SB),NOSPLIT,$0
    147 	MOVQ z_len+8(FP), DI
    148 	MOVQ x+24(FP), R8
    149 	MOVQ y+48(FP), CX	// c = y
    150 	MOVQ z+0(FP), R10
    151 
    152 	MOVQ $0, SI		// i = 0
    153 
    154 	// s/JL/JMP/ below to disable the unrolled loop
    155 	SUBQ $4, DI		// n -= 4
    156 	JL V3			// if n < 4 goto V3
    157 
    158 U3:	// n >= 0
    159 	// regular loop body unrolled 4x
    160 	MOVQ 0(R8)(SI*8), R11
    161 	MOVQ 8(R8)(SI*8), R12
    162 	MOVQ 16(R8)(SI*8), R13
    163 	MOVQ 24(R8)(SI*8), R14
    164 	ADDQ CX, R11
    165 	ADCQ $0, R12
    166 	ADCQ $0, R13
    167 	ADCQ $0, R14
    168 	SBBQ CX, CX		// save CF
    169 	NEGQ CX
    170 	MOVQ R11, 0(R10)(SI*8)
    171 	MOVQ R12, 8(R10)(SI*8)
    172 	MOVQ R13, 16(R10)(SI*8)
    173 	MOVQ R14, 24(R10)(SI*8)
    174 
    175 	ADDQ $4, SI		// i += 4
    176 	SUBQ $4, DI		// n -= 4
    177 	JGE U3			// if n >= 0 goto U3
    178 
    179 V3:	ADDQ $4, DI		// n += 4
    180 	JLE E3			// if n <= 0 goto E3
    181 
    182 L3:	// n > 0
    183 	ADDQ 0(R8)(SI*8), CX
    184 	MOVQ CX, 0(R10)(SI*8)
    185 	SBBQ CX, CX		// save CF
    186 	NEGQ CX
    187 
    188 	ADDQ $1, SI		// i++
    189 	SUBQ $1, DI		// n--
    190 	JG L3			// if n > 0 goto L3
    191 
    192 E3:	MOVQ CX, c+56(FP)	// return c
    193 	RET
    194 
    195 
    196 // func subVW(z, x []Word, y Word) (c Word)
    197 // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
    198 TEXT subVW(SB),NOSPLIT,$0
    199 	MOVQ z_len+8(FP), DI
    200 	MOVQ x+24(FP), R8
    201 	MOVQ y+48(FP), CX	// c = y
    202 	MOVQ z+0(FP), R10
    203 
    204 	MOVQ $0, SI		// i = 0
    205 
    206 	// s/JL/JMP/ below to disable the unrolled loop
    207 	SUBQ $4, DI		// n -= 4
    208 	JL V4			// if n < 4 goto V4
    209 
    210 U4:	// n >= 0
    211 	// regular loop body unrolled 4x
    212 	MOVQ 0(R8)(SI*8), R11
    213 	MOVQ 8(R8)(SI*8), R12
    214 	MOVQ 16(R8)(SI*8), R13
    215 	MOVQ 24(R8)(SI*8), R14
    216 	SUBQ CX, R11
    217 	SBBQ $0, R12
    218 	SBBQ $0, R13
    219 	SBBQ $0, R14
    220 	SBBQ CX, CX		// save CF
    221 	NEGQ CX
    222 	MOVQ R11, 0(R10)(SI*8)
    223 	MOVQ R12, 8(R10)(SI*8)
    224 	MOVQ R13, 16(R10)(SI*8)
    225 	MOVQ R14, 24(R10)(SI*8)
    226 
    227 	ADDQ $4, SI		// i += 4
    228 	SUBQ $4, DI		// n -= 4
    229 	JGE U4			// if n >= 0 goto U4
    230 
    231 V4:	ADDQ $4, DI		// n += 4
    232 	JLE E4			// if n <= 0 goto E4
    233 
    234 L4:	// n > 0
    235 	MOVQ 0(R8)(SI*8), R11
    236 	SUBQ CX, R11
    237 	MOVQ R11, 0(R10)(SI*8)
    238 	SBBQ CX, CX		// save CF
    239 	NEGQ CX
    240 
    241 	ADDQ $1, SI		// i++
    242 	SUBQ $1, DI		// n--
    243 	JG L4			// if n > 0 goto L4
    244 
    245 E4:	MOVQ CX, c+56(FP)	// return c
    246 	RET
    247 
    248 
    249 // func shlVU(z, x []Word, s uint) (c Word)
    250 TEXT shlVU(SB),NOSPLIT,$0
    251 	MOVQ z_len+8(FP), BX	// i = z
    252 	SUBQ $1, BX		// i--
    253 	JL X8b			// i < 0	(n <= 0)
    254 
    255 	// n > 0
    256 	MOVQ z+0(FP), R10
    257 	MOVQ x+24(FP), R8
    258 	MOVQ s+48(FP), CX
    259 	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
    260 	MOVQ $0, DX
    261 	SHLQ CX, DX:AX		// w1>>
    262 	MOVQ DX, c+56(FP)
    263 
    264 	CMPQ BX, $0
    265 	JLE X8a			// i <= 0
    266 
    267 	// i > 0
    268 L8:	MOVQ AX, DX		// w = w1
    269 	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
    270 	SHLQ CX, DX:AX		// w<<s | w1>>
    271 	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>
    272 	SUBQ $1, BX		// i--
    273 	JG L8			// i > 0
    274 
    275 	// i <= 0
    276 X8a:	SHLQ CX, AX		// w1<<s
    277 	MOVQ AX, (R10)		// z[0] = w1<<s
    278 	RET
    279 
    280 X8b:	MOVQ $0, c+56(FP)
    281 	RET
    282 
    283 
    284 // func shrVU(z, x []Word, s uint) (c Word)
    285 TEXT shrVU(SB),NOSPLIT,$0
    286 	MOVQ z_len+8(FP), R11
    287 	SUBQ $1, R11		// n--
    288 	JL X9b			// n < 0	(n <= 0)
    289 
    290 	// n > 0
    291 	MOVQ z+0(FP), R10
    292 	MOVQ x+24(FP), R8
    293 	MOVQ s+48(FP), CX
    294 	MOVQ (R8), AX		// w1 = x[0]
    295 	MOVQ $0, DX
    296 	SHRQ CX, DX:AX		// w1<<
    297 	MOVQ DX, c+56(FP)
    298 
    299 	MOVQ $0, BX		// i = 0
    300 	JMP E9
    301 
    302 	// i < n-1
    303 L9:	MOVQ AX, DX		// w = w1
    304 	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
    305 	SHRQ CX, DX:AX		// w>>s | w1<<
    306 	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<
    307 	ADDQ $1, BX		// i++
    308 
    309 E9:	CMPQ BX, R11
    310 	JL L9			// i < n-1
    311 
    312 	// i >= n-1
    313 X9a:	SHRQ CX, AX		// w1>>s
    314 	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
    315 	RET
    316 
    317 X9b:	MOVQ $0, c+56(FP)
    318 	RET
    319 
    320 
    321 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
    322 TEXT mulAddVWW(SB),NOSPLIT,$0
    323 	MOVQ z+0(FP), R10
    324 	MOVQ x+24(FP), R8
    325 	MOVQ y+48(FP), R9
    326 	MOVQ r+56(FP), CX	// c = r
    327 	MOVQ z_len+8(FP), R11
    328 	MOVQ $0, BX		// i = 0
    329 
    330 	CMPQ R11, $4
    331 	JL E5
    332 
    333 U5:	// i+4 <= n
    334 	// regular loop body unrolled 4x
    335 	MOVQ (0*8)(R8)(BX*8), AX
    336 	MULQ R9
    337 	ADDQ CX, AX
    338 	ADCQ $0, DX
    339 	MOVQ AX, (0*8)(R10)(BX*8)
    340 	MOVQ DX, CX
    341 	MOVQ (1*8)(R8)(BX*8), AX
    342 	MULQ R9
    343 	ADDQ CX, AX
    344 	ADCQ $0, DX
    345 	MOVQ AX, (1*8)(R10)(BX*8)
    346 	MOVQ DX, CX
    347 	MOVQ (2*8)(R8)(BX*8), AX
    348 	MULQ R9
    349 	ADDQ CX, AX
    350 	ADCQ $0, DX
    351 	MOVQ AX, (2*8)(R10)(BX*8)
    352 	MOVQ DX, CX
    353 	MOVQ (3*8)(R8)(BX*8), AX
    354 	MULQ R9
    355 	ADDQ CX, AX
    356 	ADCQ $0, DX
    357 	MOVQ AX, (3*8)(R10)(BX*8)
    358 	MOVQ DX, CX
    359 	ADDQ $4, BX		// i += 4
    360 
    361 	LEAQ 4(BX), DX
    362 	CMPQ DX, R11
    363 	JLE U5
    364 	JMP E5
    365 
    366 L5:	MOVQ (R8)(BX*8), AX
    367 	MULQ R9
    368 	ADDQ CX, AX
    369 	ADCQ $0, DX
    370 	MOVQ AX, (R10)(BX*8)
    371 	MOVQ DX, CX
    372 	ADDQ $1, BX		// i++
    373 
    374 E5:	CMPQ BX, R11		// i < n
    375 	JL L5
    376 
    377 	MOVQ CX, c+64(FP)
    378 	RET
    379 
    380 
    381 // func addMulVVW(z, x []Word, y Word) (c Word)
    382 TEXT addMulVVW(SB),NOSPLIT,$0
    383 	MOVQ z+0(FP), R10
    384 	MOVQ x+24(FP), R8
    385 	MOVQ y+48(FP), R9
    386 	MOVQ z_len+8(FP), R11
    387 	MOVQ $0, BX		// i = 0
    388 	MOVQ $0, CX		// c = 0
    389 	MOVQ R11, R12
    390 	ANDQ $-2, R12
    391 	CMPQ R11, $2
    392 	JAE A6
    393 	JMP E6
    394 
    395 A6:
    396 	MOVQ (R8)(BX*8), AX
    397 	MULQ R9
    398 	ADDQ (R10)(BX*8), AX
    399 	ADCQ $0, DX
    400 	ADDQ CX, AX
    401 	ADCQ $0, DX
    402 	MOVQ DX, CX
    403 	MOVQ AX, (R10)(BX*8)
    404 
    405 	MOVQ (8)(R8)(BX*8), AX
    406 	MULQ R9
    407 	ADDQ (8)(R10)(BX*8), AX
    408 	ADCQ $0, DX
    409 	ADDQ CX, AX
    410 	ADCQ $0, DX
    411 	MOVQ DX, CX
    412 	MOVQ AX, (8)(R10)(BX*8)
    413 
    414 	ADDQ $2, BX
    415 	CMPQ BX, R12
    416 	JL A6
    417 	JMP E6
    418 
    419 L6:	MOVQ (R8)(BX*8), AX
    420 	MULQ R9
    421 	ADDQ CX, AX
    422 	ADCQ $0, DX
    423 	ADDQ AX, (R10)(BX*8)
    424 	ADCQ $0, DX
    425 	MOVQ DX, CX
    426 	ADDQ $1, BX		// i++
    427 
    428 E6:	CMPQ BX, R11		// i < n
    429 	JL L6
    430 
    431 	MOVQ CX, c+56(FP)
    432 	RET
    433 
    434 
    435 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
    436 TEXT divWVW(SB),NOSPLIT,$0
    437 	MOVQ z+0(FP), R10
    438 	MOVQ xn+24(FP), DX	// r = xn
    439 	MOVQ x+32(FP), R8
    440 	MOVQ y+56(FP), R9
    441 	MOVQ z_len+8(FP), BX	// i = z
    442 	JMP E7
    443 
    444 L7:	MOVQ (R8)(BX*8), AX
    445 	DIVQ R9
    446 	MOVQ AX, (R10)(BX*8)
    447 
    448 E7:	SUBQ $1, BX		// i--
    449 	JGE L7			// i >= 0
    450 
    451 	MOVQ DX, r+64(FP)
    452 	RET
    453 
    454 // func bitLen(x Word) (n int)
    455 TEXT bitLen(SB),NOSPLIT,$0
    456 	BSRQ x+0(FP), AX
    457 	JZ Z1
    458 	ADDQ $1, AX
    459 	MOVQ AX, n+8(FP)
    460 	RET
    461 
    462 Z1:	MOVQ $0, n+8(FP)
    463 	RET
    464