// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Every TEXT symbol carries the leading middle dot (·) so that it is
// defined in the current package and binds to the Go prototype quoted
// in the comment above it.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW computes the double-word product x*y and returns its high word
// in z1 and its low word in z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = AX * y
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW divides the double-word value x1:x0 (x1 is the high word) by y
// and returns the quotient in q and the remainder in r.
// DIVQ raises a hardware divide error if y == 0 or if the quotient does
// not fit in one word (x1 >= y); nothing here guards against that, so
// the caller has to.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x1+0(FP), DX
	MOVQ x0+8(FP), AX
	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
//
// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
35 36 // func addVV(z, x, y []Word) (c Word) 37 TEXT addVV(SB),NOSPLIT,$0 38 MOVQ z_len+8(FP), DI 39 MOVQ x+24(FP), R8 40 MOVQ y+48(FP), R9 41 MOVQ z+0(FP), R10 42 43 MOVQ $0, CX // c = 0 44 MOVQ $0, SI // i = 0 45 46 // s/JL/JMP/ below to disable the unrolled loop 47 SUBQ $4, DI // n -= 4 48 JL V1 // if n < 0 goto V1 49 50 U1: // n >= 0 51 // regular loop body unrolled 4x 52 ADDQ CX, CX // restore CF 53 MOVQ 0(R8)(SI*8), R11 54 MOVQ 8(R8)(SI*8), R12 55 MOVQ 16(R8)(SI*8), R13 56 MOVQ 24(R8)(SI*8), R14 57 ADCQ 0(R9)(SI*8), R11 58 ADCQ 8(R9)(SI*8), R12 59 ADCQ 16(R9)(SI*8), R13 60 ADCQ 24(R9)(SI*8), R14 61 MOVQ R11, 0(R10)(SI*8) 62 MOVQ R12, 8(R10)(SI*8) 63 MOVQ R13, 16(R10)(SI*8) 64 MOVQ R14, 24(R10)(SI*8) 65 SBBQ CX, CX // save CF 66 67 ADDQ $4, SI // i += 4 68 SUBQ $4, DI // n -= 4 69 JGE U1 // if n >= 0 goto U1 70 71 V1: ADDQ $4, DI // n += 4 72 JLE E1 // if n <= 0 goto E1 73 74 L1: // n > 0 75 ADDQ CX, CX // restore CF 76 MOVQ 0(R8)(SI*8), R11 77 ADCQ 0(R9)(SI*8), R11 78 MOVQ R11, 0(R10)(SI*8) 79 SBBQ CX, CX // save CF 80 81 ADDQ $1, SI // i++ 82 SUBQ $1, DI // n-- 83 JG L1 // if n > 0 goto L1 84 85 E1: NEGQ CX 86 MOVQ CX, c+72(FP) // return c 87 RET 88 89 90 // func subVV(z, x, y []Word) (c Word) 91 // (same as addVV except for SBBQ instead of ADCQ and label names) 92 TEXT subVV(SB),NOSPLIT,$0 93 MOVQ z_len+8(FP), DI 94 MOVQ x+24(FP), R8 95 MOVQ y+48(FP), R9 96 MOVQ z+0(FP), R10 97 98 MOVQ $0, CX // c = 0 99 MOVQ $0, SI // i = 0 100 101 // s/JL/JMP/ below to disable the unrolled loop 102 SUBQ $4, DI // n -= 4 103 JL V2 // if n < 0 goto V2 104 105 U2: // n >= 0 106 // regular loop body unrolled 4x 107 ADDQ CX, CX // restore CF 108 MOVQ 0(R8)(SI*8), R11 109 MOVQ 8(R8)(SI*8), R12 110 MOVQ 16(R8)(SI*8), R13 111 MOVQ 24(R8)(SI*8), R14 112 SBBQ 0(R9)(SI*8), R11 113 SBBQ 8(R9)(SI*8), R12 114 SBBQ 16(R9)(SI*8), R13 115 SBBQ 24(R9)(SI*8), R14 116 MOVQ R11, 0(R10)(SI*8) 117 MOVQ R12, 8(R10)(SI*8) 118 MOVQ R13, 16(R10)(SI*8) 119 MOVQ R14, 24(R10)(SI*8) 120 SBBQ CX, CX // 
save CF 121 122 ADDQ $4, SI // i += 4 123 SUBQ $4, DI // n -= 4 124 JGE U2 // if n >= 0 goto U2 125 126 V2: ADDQ $4, DI // n += 4 127 JLE E2 // if n <= 0 goto E2 128 129 L2: // n > 0 130 ADDQ CX, CX // restore CF 131 MOVQ 0(R8)(SI*8), R11 132 SBBQ 0(R9)(SI*8), R11 133 MOVQ R11, 0(R10)(SI*8) 134 SBBQ CX, CX // save CF 135 136 ADDQ $1, SI // i++ 137 SUBQ $1, DI // n-- 138 JG L2 // if n > 0 goto L2 139 140 E2: NEGQ CX 141 MOVQ CX, c+72(FP) // return c 142 RET 143 144 145 // func addVW(z, x []Word, y Word) (c Word) 146 TEXT addVW(SB),NOSPLIT,$0 147 MOVQ z_len+8(FP), DI 148 MOVQ x+24(FP), R8 149 MOVQ y+48(FP), CX // c = y 150 MOVQ z+0(FP), R10 151 152 MOVQ $0, SI // i = 0 153 154 // s/JL/JMP/ below to disable the unrolled loop 155 SUBQ $4, DI // n -= 4 156 JL V3 // if n < 4 goto V3 157 158 U3: // n >= 0 159 // regular loop body unrolled 4x 160 MOVQ 0(R8)(SI*8), R11 161 MOVQ 8(R8)(SI*8), R12 162 MOVQ 16(R8)(SI*8), R13 163 MOVQ 24(R8)(SI*8), R14 164 ADDQ CX, R11 165 ADCQ $0, R12 166 ADCQ $0, R13 167 ADCQ $0, R14 168 SBBQ CX, CX // save CF 169 NEGQ CX 170 MOVQ R11, 0(R10)(SI*8) 171 MOVQ R12, 8(R10)(SI*8) 172 MOVQ R13, 16(R10)(SI*8) 173 MOVQ R14, 24(R10)(SI*8) 174 175 ADDQ $4, SI // i += 4 176 SUBQ $4, DI // n -= 4 177 JGE U3 // if n >= 0 goto U3 178 179 V3: ADDQ $4, DI // n += 4 180 JLE E3 // if n <= 0 goto E3 181 182 L3: // n > 0 183 ADDQ 0(R8)(SI*8), CX 184 MOVQ CX, 0(R10)(SI*8) 185 SBBQ CX, CX // save CF 186 NEGQ CX 187 188 ADDQ $1, SI // i++ 189 SUBQ $1, DI // n-- 190 JG L3 // if n > 0 goto L3 191 192 E3: MOVQ CX, c+56(FP) // return c 193 RET 194 195 196 // func subVW(z, x []Word, y Word) (c Word) 197 // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) 198 TEXT subVW(SB),NOSPLIT,$0 199 MOVQ z_len+8(FP), DI 200 MOVQ x+24(FP), R8 201 MOVQ y+48(FP), CX // c = y 202 MOVQ z+0(FP), R10 203 204 MOVQ $0, SI // i = 0 205 206 // s/JL/JMP/ below to disable the unrolled loop 207 SUBQ $4, DI // n -= 4 208 JL V4 // if n < 4 goto V4 209 210 U4: // n >= 0 211 
// regular loop body unrolled 4x 212 MOVQ 0(R8)(SI*8), R11 213 MOVQ 8(R8)(SI*8), R12 214 MOVQ 16(R8)(SI*8), R13 215 MOVQ 24(R8)(SI*8), R14 216 SUBQ CX, R11 217 SBBQ $0, R12 218 SBBQ $0, R13 219 SBBQ $0, R14 220 SBBQ CX, CX // save CF 221 NEGQ CX 222 MOVQ R11, 0(R10)(SI*8) 223 MOVQ R12, 8(R10)(SI*8) 224 MOVQ R13, 16(R10)(SI*8) 225 MOVQ R14, 24(R10)(SI*8) 226 227 ADDQ $4, SI // i += 4 228 SUBQ $4, DI // n -= 4 229 JGE U4 // if n >= 0 goto U4 230 231 V4: ADDQ $4, DI // n += 4 232 JLE E4 // if n <= 0 goto E4 233 234 L4: // n > 0 235 MOVQ 0(R8)(SI*8), R11 236 SUBQ CX, R11 237 MOVQ R11, 0(R10)(SI*8) 238 SBBQ CX, CX // save CF 239 NEGQ CX 240 241 ADDQ $1, SI // i++ 242 SUBQ $1, DI // n-- 243 JG L4 // if n > 0 goto L4 244 245 E4: MOVQ CX, c+56(FP) // return c 246 RET 247 248 249 // func shlVU(z, x []Word, s uint) (c Word) 250 TEXT shlVU(SB),NOSPLIT,$0 251 MOVQ z_len+8(FP), BX // i = z 252 SUBQ $1, BX // i-- 253 JL X8b // i < 0 (n <= 0) 254 255 // n > 0 256 MOVQ z+0(FP), R10 257 MOVQ x+24(FP), R8 258 MOVQ s+48(FP), CX 259 MOVQ (R8)(BX*8), AX // w1 = x[n-1] 260 MOVQ $0, DX 261 SHLQ CX, DX:AX // w1>> 262 MOVQ DX, c+56(FP) 263 264 CMPQ BX, $0 265 JLE X8a // i <= 0 266 267 // i > 0 268 L8: MOVQ AX, DX // w = w1 269 MOVQ -8(R8)(BX*8), AX // w1 = x[i-1] 270 SHLQ CX, DX:AX // w<<s | w1>> 271 MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>> 272 SUBQ $1, BX // i-- 273 JG L8 // i > 0 274 275 // i <= 0 276 X8a: SHLQ CX, AX // w1<<s 277 MOVQ AX, (R10) // z[0] = w1<<s 278 RET 279 280 X8b: MOVQ $0, c+56(FP) 281 RET 282 283 284 // func shrVU(z, x []Word, s uint) (c Word) 285 TEXT shrVU(SB),NOSPLIT,$0 286 MOVQ z_len+8(FP), R11 287 SUBQ $1, R11 // n-- 288 JL X9b // n < 0 (n <= 0) 289 290 // n > 0 291 MOVQ z+0(FP), R10 292 MOVQ x+24(FP), R8 293 MOVQ s+48(FP), CX 294 MOVQ (R8), AX // w1 = x[0] 295 MOVQ $0, DX 296 SHRQ CX, DX:AX // w1<< 297 MOVQ DX, c+56(FP) 298 299 MOVQ $0, BX // i = 0 300 JMP E9 301 302 // i < n-1 303 L9: MOVQ AX, DX // w = w1 304 MOVQ 8(R8)(BX*8), AX // w1 = x[i+1] 305 SHRQ CX, 
DX:AX // w>>s | w1<< 306 MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<< 307 ADDQ $1, BX // i++ 308 309 E9: CMPQ BX, R11 310 JL L9 // i < n-1 311 312 // i >= n-1 313 X9a: SHRQ CX, AX // w1>>s 314 MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s 315 RET 316 317 X9b: MOVQ $0, c+56(FP) 318 RET 319 320 321 // func mulAddVWW(z, x []Word, y, r Word) (c Word) 322 TEXT mulAddVWW(SB),NOSPLIT,$0 323 MOVQ z+0(FP), R10 324 MOVQ x+24(FP), R8 325 MOVQ y+48(FP), R9 326 MOVQ r+56(FP), CX // c = r 327 MOVQ z_len+8(FP), R11 328 MOVQ $0, BX // i = 0 329 JMP E5 330 331 L5: MOVQ (R8)(BX*8), AX 332 MULQ R9 333 ADDQ CX, AX 334 ADCQ $0, DX 335 MOVQ AX, (R10)(BX*8) 336 MOVQ DX, CX 337 ADDQ $1, BX // i++ 338 339 E5: CMPQ BX, R11 // i < n 340 JL L5 341 342 MOVQ CX, c+64(FP) 343 RET 344 345 346 // func addMulVVW(z, x []Word, y Word) (c Word) 347 TEXT addMulVVW(SB),NOSPLIT,$0 348 MOVQ z+0(FP), R10 349 MOVQ x+24(FP), R8 350 MOVQ y+48(FP), R9 351 MOVQ z_len+8(FP), R11 352 MOVQ $0, BX // i = 0 353 MOVQ $0, CX // c = 0 354 MOVQ R11, R12 355 ANDQ $-2, R12 356 CMPQ R11, $2 357 JAE A6 358 JMP E6 359 360 A6: 361 MOVQ (R8)(BX*8), AX 362 MULQ R9 363 ADDQ (R10)(BX*8), AX 364 ADCQ $0, DX 365 ADDQ CX, AX 366 ADCQ $0, DX 367 MOVQ DX, CX 368 MOVQ AX, (R10)(BX*8) 369 370 MOVQ (8)(R8)(BX*8), AX 371 MULQ R9 372 ADDQ (8)(R10)(BX*8), AX 373 ADCQ $0, DX 374 ADDQ CX, AX 375 ADCQ $0, DX 376 MOVQ DX, CX 377 MOVQ AX, (8)(R10)(BX*8) 378 379 ADDQ $2, BX 380 CMPQ BX, R12 381 JL A6 382 JMP E6 383 384 L6: MOVQ (R8)(BX*8), AX 385 MULQ R9 386 ADDQ CX, AX 387 ADCQ $0, DX 388 ADDQ AX, (R10)(BX*8) 389 ADCQ $0, DX 390 MOVQ DX, CX 391 ADDQ $1, BX // i++ 392 393 E6: CMPQ BX, R11 // i < n 394 JL L6 395 396 MOVQ CX, c+56(FP) 397 RET 398 399 400 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) 401 TEXT divWVW(SB),NOSPLIT,$0 402 MOVQ z+0(FP), R10 403 MOVQ xn+24(FP), DX // r = xn 404 MOVQ x+32(FP), R8 405 MOVQ y+56(FP), R9 406 MOVQ z_len+8(FP), BX // i = z 407 JMP E7 408 409 L7: MOVQ (R8)(BX*8), AX 410 DIVQ R9 411 MOVQ AX, (R10)(BX*8) 
412 413 E7: SUBQ $1, BX // i-- 414 JGE L7 // i >= 0 415 416 MOVQ DX, r+64(FP) 417 RET 418 419 // func bitLen(x Word) (n int) 420 TEXT bitLen(SB),NOSPLIT,$0 421 BSRQ x+0(FP), AX 422 JZ Z1 423 ADDQ $1, AX 424 MOVQ AX, n+8(FP) 425 RET 426 427 Z1: MOVQ $0, n+8(FP) 428 RET 429