// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
//
//         Copyright 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtimememmove(void*, void*, uintptr)
//
// Copies n bytes from the source pointer to the destination pointer. The
// two regions are allowed to overlap.
//
// NOTE(review): the symbol is spelled "runtimememmove" here with no package
// separator. Go runtime assembly normally writes a middle dot between the
// package and the function name; it may have been lost when this text was
// extracted. Confirm against the build before relying on the name.
//
// Arguments, as read from the frame below:
//	to+0(FP)	destination pointer, held in DI
//	from+8(FP)	source pointer, held in SI
//	n+16(FP)	byte count, held in BX
//
// Strategy, chosen from the count remaining in BX each time "tail" is reached:
//	0..256		straight-line move_* code that loads every byte into
//			registers before storing any, so overlap is harmless
//	257..2048	forward loop of 256-byte SSE blocks (move_256through2048)
//	above 2048	forward REP MOVSQ
// If the count is above 256, the source is at or below the destination, and
// the regions overlap, a backward REP MOVSQ is used instead, whatever the size.
// Every bulk phase finishes by jumping back to "tail" with the leftover count.
//
// Registers written: AX, BX, CX, SI, DI, X0-X15 and the flags. The direction
// flag is set only for the duration of the backward copy and cleared again
// before leaving that path.
TEXT runtimememmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code.  The REP MOVSQ instruction is really fast
	// for large sizes.  The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// This ladder is also the re-entry point after each bulk phase, so BX
	// here is "bytes still to copy", not necessarily the original n.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JBE	move_5through8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

/*
 * More than 256 bytes remain.
 * check and set for backwards: if the source is at or below the
 * destination (unsigned compare), a forward copy might overwrite source
 * bytes that have not been read yet, so go and test for overlap.
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	// Up to 2048 bytes: use the 256-byte SSE block loop.
	CMPQ	BX, $2048
	JLS	move_256through2048

	// Larger: copy BX/8 quadwords with REP MOVSQ and leave the 0..7
	// leftover bytes in BX for the dispatch at tail.
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap: CX = source end. If the source ends at or before the
 * destination starts, the regions are disjoint and forward is safe.
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward

/*
 * whole thing backwards has
 * adjusted addresses: point SI and DI one past the end of each region and
 * set the direction flag so that REP MOVSQ walks downward.
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy: CX = quadword count, BX = 0..7 leftover bytes.
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	// Step back 8 so SI/DI address the last quadword of each region.
	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	// Clear the direction flag again, undo the 8-byte bias, then step back
	// over the BX leftover bytes so SI/DI address the start of the part
	// that has not been copied yet. tail finishes those bytes.
	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

// Each move_* case below copies the first and the last chunk of the region.
// When the count is smaller than two chunks the two accesses overlap, which
// is harmless because both loads complete before either store.
move_1or2:
	// First byte and last byte (the same byte when BX == 1).
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	// Nothing left to copy.
	RET
move_3or4:
	// First two bytes and last two bytes.
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through8:
	// First four bytes and last four bytes.
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_9through16:
	// First eight bytes and last eight bytes.
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	// First 16 bytes and last 16 bytes, as unaligned SSE loads/stores.
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	// First 32 bytes (X0-X1) and last 32 bytes (X2-X3).
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	// First 64 bytes (X0-X3) and last 64 bytes (X4-X7).
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	// First 128 bytes (X0-X7) and last 128 bytes (X8-X15).
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	// Forward block loop, entered from "forward" with more than 256 and at
	// most 2048 bytes left. Each pass moves one 256-byte block through
	// X0-X15 and charges it against BX up front.
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	// The pointers are advanced with LEAQ, which leaves the flags from
	// CMPQ intact for the JGE that follows. Loop while a full block
	// remains; otherwise let tail copy the last 0..255 bytes.
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail