// Inferno's libkern/memmove-386.s
// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
//
//         Copyright 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// runtimememmove(to, from, n) copies n bytes from `from` to `to`.
// The source and destination may overlap: the small-size paths read every
// source byte into registers before writing anything, and the bulk path
// picks a forward or backward copy depending on how the ranges overlap.
//
// Arguments (12 bytes of arguments, no local frame, NOSPLIT):
//	to+0(FP)	destination address
//	from+4(FP)	source address
//	n+8(FP)		number of bytes to copy
//
// Register use:
//	DI		destination pointer
//	SI		source pointer
//	BX		bytes still to copy
//	CX		dword count for REP MOVSL; scratch elsewhere
//	AX, DX, BP	scratch in the small-size paths
//	X0-X7		scratch in the 17..128 byte paths
//
// The forward bulk copy never clears the direction flag itself, so it
// relies on the flag being clear on entry (presumably guaranteed by the
// calling convention; confirm).
//
// NOTE(review): the symbols runtimememmove and runtimecpuid_edx carry no
// package separator. Go assembly normally spells package-qualified symbols
// with a middle dot (runtime·memmove); confirm the separator was not lost
// when this file was transferred.
TEXT runtimememmove(SB), NOSPLIT, $0-12
	MOVL	to+0(FP), DI
	MOVL	from+4(FP), SI
	MOVL	n+8(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code.  The REP MOVSL instruction is really fast
	// for large sizes.  The cutover is approximately 1K.  We implement up to
	// 128 because that is the maximum SSE register load (loading all data
	// into registers lets us ignore copy direction).
	//
	// tail is also re-entered from the bulk loops below with BX = n & 3 to
	// copy the 0..3 bytes that REP MOVSL did not cover.
tail:
	TESTL	BX, BX
	JEQ	move_0
	CMPL	BX, $2
	JBE	move_1or2
	CMPL	BX, $4
	JBE	move_3or4
	CMPL	BX, $8
	JBE	move_5through8
	CMPL	BX, $16
	JBE	move_9through16
	// Sizes above 16 use the X registers, so they are only taken when the
	// SSE2 feature bit is set; otherwise fall back to the REP MOVSL path.
	TESTL	$0x4000000, runtimecpuid_edx(SB) // check for sse2
	JEQ	nosse2
	CMPL	BX, $32
	JBE	move_17through32
	CMPL	BX, $64
	JBE	move_33through64
	CMPL	BX, $128
	JBE	move_65through128
	// TODO: use branch table and BSR to make this just a single dispatch

	// Reached for n > 128, or for n > 16 when SSE2 is not available.
nosse2:
/*
 * check and set for backwards
 *
 * from <= to (unsigned): the destination may start inside the source
 * range, so go and check for overlap. Otherwise a forward copy is safe.
 */
	CMPL	SI, DI
	JLS	back

/*
 * forward copy loop
 *
 * CX = n / 4 dwords for REP MOVSL, BX = n % 4 bytes left for tail.
 * REP MOVSL advances SI and DI past the copied dwords.
 */
forward:
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX

	REP;	MOVSL
	JMP	tail
/*
 * check overlap
 *
 * CX = from + n (one past the end of the source). If that is <= to
 * the ranges do not overlap and the forward loop can be used.
 */
back:
	MOVL	SI, CX
	ADDL	BX, CX
	CMPL	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 *
 * Point SI and DI one past the end of their ranges and set the
 * direction flag so that REP MOVSL walks downwards.
 */

	ADDL	BX, DI
	ADDL	BX, SI
	STD

/*
 * copy
 *
 * Same dword/remainder split as the forward loop. SI and DI are moved
 * back by 4 so they address the last dword of each range.
 */
	MOVL	BX, CX
	SHRL	$2, CX
	ANDL	$3, BX

	SUBL	$4, DI
	SUBL	$4, SI
	REP;	MOVSL

	// Restore the direction flag, then undo the -4 bias and step back over
	// the BX uncopied leading bytes so SI and DI point at the start of the
	// ranges again; tail copies those remaining 0..3 bytes.
	CLD
	ADDL	$4, DI
	ADDL	$4, SI
	SUBL	BX, DI
	SUBL	BX, SI
	JMP	tail

	// Each move_* path below loads a chunk from the start of the source and
	// a chunk ending at its last byte (offset -k from SI+BX); the two may
	// overlap each other. All loads are done before any store, so the copy
	// is correct even when source and destination overlap.

	// n is 1 or 2: first byte and last byte.
move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
	// n is 0: nothing to copy.
move_0:
	RET
	// n is 3 or 4: first and last 16-bit word.
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
	// n is 5..8: first and last dword.
move_5through8:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
	// n is 9..16: first two and last two dwords.
move_9through16:
	MOVL	(SI), AX
	MOVL	4(SI), CX
	MOVL	-8(SI)(BX*1), DX
	MOVL	-4(SI)(BX*1), BP
	MOVL	AX, (DI)
	MOVL	CX, 4(DI)
	MOVL	DX, -8(DI)(BX*1)
	MOVL	BP, -4(DI)(BX*1)
	RET
	// n is 17..32: first and last 16 bytes, unaligned loads and stores.
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
	// n is 33..64: first 32 and last 32 bytes.
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
	// n is 65..128: first 64 and last 64 bytes.
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET