// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Every symbol below carries the leading middle dot (·) that places it in
// the current package; without it the assembler emits an unqualified global
// that does not match the Go declaration named in each header comment.
// Routines that consist of a single BR simply tail-branch to the
// correspondingly named *_g routine (presumably the portable Go version in
// arith.go - confirm against that file).

// func mulWW(x, y Word) (z1, z0 Word)
// Full-width unsigned multiply: z1 receives the high word of x*y,
// z0 the low word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6    // high 64 bits of x*y
	MULLD  R4, R5, R7    // low 64 bits of x*y
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// addVV has no assembly implementation here; defer to addVV_g.
TEXT ·addVV(SB), NOSPLIT, $0
	BR ·addVV_g(SB)

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
// The loop runs z_len times; c is the borrow out of the last word (0 or 1).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R10

	MOVD $0, R4  // c = 0
	MOVD $0, R5  // i = 0
	MOVD $1, R29 // work around lack of ADDI
	MOVD $8, R28 // work around lack of scaled addressing

	// Initialise CA to the "no borrow" state. The epilogue returns CA
	// inverted (ADDZE then XOR with 1), so "no borrow" is CA = 1, which is
	// what subtracting a register from itself with SUBC produces.
	SUBC R0, R0
	JMP  sublend

// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
// (Nothing in the loop other than SUBE updates CA.)
subloop:
	MULLD R5, R28, R6   // byte offset = i * 8
	MOVD  (R8)(R6), R11 // x[i]
	MOVD  (R9)(R6), R12 // y[i]

	SUBE R12, R11, R15  // x[i] - y[i] - borrow, borrow carried in CA
	MOVD R15, (R10)(R6) // z[i]

	ADD R29, R5 // i++

sublend:
	CMP R5, R7
	BLT subloop

	ADDZE R4      // R4 = 0 + CA
	XOR   R29, R4 // c = CA ^ 1: convert the inverted carry into a borrow
	MOVD  R4, c+72(FP)
	RET

// addVW has no assembly implementation here; defer to addVW_g.
TEXT ·addVW(SB), NOSPLIT, $0
	BR ·addVW_g(SB)

// subVW has no assembly implementation here; defer to subVW_g.
TEXT ·subVW(SB), NOSPLIT, $0
	BR ·subVW_g(SB)

// shlVU has no assembly implementation here; defer to shlVU_g.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// shrVU has no assembly implementation here; defer to shrVU_g.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i in [0, z_len): the double-word value x[i]*y + c is formed,
// its low word is stored to z[i] and its high word becomes the next c.
// c starts as r; the final c is returned.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4     // c = r
	MOVD z_len+8(FP), R11
	MOVD $0, R3           // i = 0
	MOVD $8, R18          // word size, used to scale the index
	MOVD $1, R19          // loop increment

	JMP e5

l5:
	MULLD  R18, R3, R5   // byte offset = i * 8
	MOVD   (R8)(R5), R20 // x[i]
	MULLD  R9, R20, R6   // low word of x[i]*y
	MULHDU R9, R20, R7   // high word of x[i]*y
	ADDC   R4, R6        // low += c, carry out in CA
	ADDZE  R7            // high += CA
	MOVD   R6, (R10)(R5) // z[i] = low
	MOVD   R7, R4        // c = high
	ADD    R19, R3       // i++

e5:
	CMP R3, R11
	BLT l5

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i in [0, z_len): z[i] = low word of (x[i]*y + z[i] + c), and the
// high word (including carries) becomes the next c. c starts at 0; the
// final c is returned. The main loop handles two words per iteration and a
// scalar loop handles the remaining word when z_len is odd (or all words
// when z_len < 2).
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R22

	MOVD $0, R5   // i = 0
	MOVD $0, R4   // c = 0
	MOVD $8, R28
	MOVD $-2, R23
	AND  R22, R23 // mask the last bit of z.len
	MOVD $2, R24
	CMP  R23, R24
	BGE  unrolled // at least one full pair to process
	JMP  end

unrolled:
	MOVD  $8, R19         // no (RA)(RB*8) on power
	MULLD R5, R19         // R19 = byte offset of element i
	MOVD  (R10)(R19), R11 // R11 = z[i]
	MOVD  (R8)(R19), R16  // R16 = x[i]
	ADD   R28, R19, R25   // R25 = byte offset of element i+1
	MOVD  (R10)(R25), R17 // R17 = z[i+1]
	MOVD  (R8)(R25), R18  // R18 = x[i+1]

	MULLD  R9, R16, R12    // low word of x[i]*y
	MULHDU R9, R16, R14    // high word of x[i]*y
	MULLD  R9, R18, R6     // low word of x[i+1]*y
	MULHDU R9, R18, R7     // high word of x[i+1]*y
	ADDC   R4, R12
	ADDZE  R14
	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
	ADDZE  R14             // carry = high order bits + add carry
	MOVD   R12, (R10)(R19)
	ADDC   R14, R6         // same sequence for element i+1, fed by the carry above
	ADDZE  R7
	ADDC   R17, R6
	ADDZE  R7
	MOVD   R6, (R10)(R25)
	MOVD   R7, R4          // c = carry out of element i+1

	ADD R24, R5 // i += 2
	CMP R5, R23
	BLT unrolled
	JMP end

loop:
	MOVD   $8, R19
	MULLD  R5, R19         // R19 = byte offset of element i
	MOVD   (R10)(R19), R11 // z[i]
	MOVD   (R8)(R19), R16  // x[i]
	MULLD  R9, R16, R12    // low word of x[i]*y
	MULHDU R9, R16, R14    // high word of x[i]*y
	ADDC   R4, R12
	ADDZE  R14
	ADDC   R11, R12
	ADDZE  R14
	MOVD   R12, (R10)(R19)
	MOVD   R14, R4         // c = carry out

	MOVD $1, R15
	ADD  R15, R5 // i++

end:
	CMP R5, R22
	BLT loop

	MOVD R4, c+56(FP)
	RET

// divWVW has no assembly implementation here; defer to divWVW_g.
TEXT ·divWVW(SB), NOSPLIT, $0
	BR ·divWVW_g(SB)

// func bitLen(x Word) int
// Returns 64 minus the number of leading zero bits in x.
TEXT ·bitLen(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	CNTLZD R4, R4      // R4 = count of leading zeros in x
	MOVD   $64, R5
	SUB    R4, R5      // R5 = 64 - leading zeros
	MOVD   R5, n+8(FP)
	RET