/*
 * Copyright (c) 2009
 * MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/

/************************************************************************
 *
 * memcpy.S
 * Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 * Include files
 ************************************************************************/

#include "machine/asm.h"


/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 *
 * Endian-neutral names for the partial-word instructions:
 *   LWHI/SWHI r,0(p)  transfer the bytes from p up to the end of the
 *                     aligned word that contains p;
 *   LWLO/SWLO r,3(p)  transfer the bytes from the start of the aligned
 *                     word that contains p+3 up to p+3.
 * The pair "LWHI r,0(p); LWLO r,3(p)" therefore loads one unaligned word.
 */
#if !defined(__MIPSEB__) && !defined(__MIPSEL__)
# error "memcpy.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

#if defined(__MIPSEB__)
# define LWHI lwl /* high part is left in big-endian */
# define SWHI swl /* high part is left in big-endian */
# define LWLO lwr /* low part is right in big-endian */
# define SWLO swr /* low part is right in big-endian */
#endif

#if defined(__MIPSEL__)
# define LWHI lwr /* high part is right in little-endian */
# define SWHI swr /* high part is right in little-endian */
# define LWLO lwl /* low part is left in little-endian */
# define SWLO swl /* low part is left in little-endian */
#endif

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:    a0 = dst, a1 = src, a2 = n (byte count)
 * Out:   v0 = dst as passed in (set on the local copy paths; the overlap
 *        path tail-calls memmove and leaves the return value to it)
 * Uses:  AT, v1, a0-a3, t0-t9; the stack is not touched
 *
 * The whole body is assembled under ".set noreorder": the instruction that
 * follows every branch or jump is its delay slot and executes whether or
 * not the branch is taken. Delay-slot instructions are indented by one
 * extra space.
 */
LEAF(memcpy,0)

        .set    noreorder
        .set    noat
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android like Skia
 * rely on such usage. We call memmove to handle such cases.
 *
 * The buffers are treated as overlapping when |dst - src| < n.
 */
        subu    t0,a0,a1                # t0 = dst - src
        sra     AT,t0,31                # AT = 0 or -1, the sign of t0
        xor     t1,t0,AT
        subu    t0,t1,AT                # t0 = |dst - src|
        sltu    AT,t0,a2                # AT = 1 when the distance is below n
        beq     AT,zero,.Lmemcpy        # disjoint buffers: copy right here
         nop                            # keep the "la" macro out of the delay slot
        la      t9,memmove
        jr      t9                      # tail call: a0-a2 and ra are unchanged
         nop
.Lmemcpy:
        slti    AT,a2,8
        bne     AT,zero,.Llast8         # fewer than 8 bytes: byte loop only
         move   v0,a0                   # memcpy returns the dst pointer

        /* Test if the src and dst are word-aligned, or can be made word-aligned */
        xor     t8,a1,a0
        andi    t8,t8,0x3               # t8 is a0/a1 word-displacement

        bne     t8,zero,.Lunaligned
         negu   a3,a0

        andi    a3,a3,0x3               # we need to copy a3 bytes to make a0/a1 aligned
        beq     a3,zero,.Lchk16w        # when a3=0 then the dst (a0) is word-aligned
         subu   a2,a2,a3                # now a2 is the remaining bytes count

        LWHI    t8,0(a1)                # copy the a3 leading bytes with one
        addu    a1,a1,a3                # partial load and one partial store
        SWHI    t8,0(a0)
        addu    a0,a0,a3

        /* Now the dst/src are mutually word-aligned with word-aligned addresses */
.Lchk16w:
        andi    t8,a2,0x3f              # any whole 64-byte chunks?
                                        # t8 is the byte count after 64-byte chunks

        beq     a2,t8,.Lchk8w           # if a2==t8, no 64-byte chunks
                                        # There will be at most 1 32-byte chunk after it
         subu   a3,a2,t8                # subtract from a2 the remainder
                                        # Here a3 counts bytes in 16w chunks
        addu    a3,a0,a3                # Now a3 is the final dst after 64-byte chunks

        addu    t0,a0,a2                # t0 is the "past the end" address

        /*
         * When in the loop we exercise "pref 30,x(a0)", the a0+x should not be
         * past the "t0-32" address.
         * This means: for x=128 the last "safe" a0 address is "t0-160".
         * Alternatively, for x=64 the last "safe" a0 address is "t0-96".
         * In the current version we will use "pref 30,128(a0)", so "t0-160"
         * is the limit.
         * NOTE(review): this arithmetic presumes 32-byte cache lines; confirm
         * for the target core.
         */
        subu    t9,t0,160               # t9 is the "last safe pref 30,128(a0)" address

        pref    0,0(a1)                 # bring the first line of src, addr 0
        pref    0,32(a1)                # bring the second line of src, addr 32
        pref    0,64(a1)                # bring the third line of src, addr 64
        pref    30,32(a0)               # safe, as we have at least 64 bytes ahead
        /* In case the a0 > t9 do not use "pref 30" at all */
        sgtu    v1,a0,t9
        bgtz    v1,.Lloop16w            # skip "pref 30,64(a0)" for too short arrays
         nop
        /* otherwise, start with using pref30 */
        pref    30,64(a0)
.Lloop16w:                              # one pass copies 64 bytes; v1 != 0 disables pref 30
        pref    0,96(a1)
        lw      t0,0(a1)
        bgtz    v1,.Lskip_pref30_96     # skip "pref 30,96(a0)"
         lw     t1,4(a1)
        pref    30,96(a0)               # continue setting up the dest, addr 96
.Lskip_pref30_96:
        lw      t2,8(a1)
        lw      t3,12(a1)
        lw      t4,16(a1)
        lw      t5,20(a1)
        lw      t6,24(a1)
        lw      t7,28(a1)
        pref    0,128(a1)               # bring the next lines of src, addr 128

        sw      t0,0(a0)
        sw      t1,4(a0)
        sw      t2,8(a0)
        sw      t3,12(a0)
        sw      t4,16(a0)
        sw      t5,20(a0)
        sw      t6,24(a0)
        sw      t7,28(a0)

        lw      t0,32(a1)
        bgtz    v1,.Lskip_pref30_128    # skip "pref 30,128(a0)"
         lw     t1,36(a1)
        pref    30,128(a0)              # continue setting up the dest, addr 128
.Lskip_pref30_128:
        lw      t2,40(a1)
        lw      t3,44(a1)
        lw      t4,48(a1)
        lw      t5,52(a1)
        lw      t6,56(a1)
        lw      t7,60(a1)
        pref    0,160(a1)               # bring the next lines of src, addr 160

        sw      t0,32(a0)
        sw      t1,36(a0)
        sw      t2,40(a0)
        sw      t3,44(a0)
        sw      t4,48(a0)
        sw      t5,52(a0)
        sw      t6,56(a0)
        sw      t7,60(a0)

        addiu   a0,a0,64                # adding 64 to dest
        sgtu    v1,a0,t9                # re-evaluate the pref 30 guard for the next pass
        bne     a0,a3,.Lloop16w
         addiu  a1,a1,64                # adding 64 to src
        move    a2,t8                   # a2 = bytes left after the 64-byte chunks

        /* Here we have src and dest word-aligned but less than 64-bytes to go */

.Lchk8w:
        pref    0,0(a1)
        andi    t8,a2,0x1f              # is there a 32-byte chunk?
                                        # the t8 is the remainder count past 32-bytes
        beq     a2,t8,.Lchk1w           # when a2=t8, no 32-byte chunk
         nop

        lw      t0,0(a1)
        lw      t1,4(a1)
        lw      t2,8(a1)
        lw      t3,12(a1)
        lw      t4,16(a1)
        lw      t5,20(a1)
        lw      t6,24(a1)
        lw      t7,28(a1)
        addiu   a1,a1,32

        sw      t0,0(a0)
        sw      t1,4(a0)
        sw      t2,8(a0)
        sw      t3,12(a0)
        sw      t4,16(a0)
        sw      t5,20(a0)
        sw      t6,24(a0)
        sw      t7,28(a0)
        addiu   a0,a0,32

.Lchk1w:
        andi    a2,t8,0x3               # now a2 is the remainder past 1w chunks
        beq     a2,t8,.Llast8           # fewer than 4 bytes left: no whole word
         subu   a3,t8,a2                # a3 is count of bytes in 1w chunks
        addu    a3,a0,a3                # now a3 is the dst address past the 1w chunks

        /* copying in words (4-byte chunks) */
.LwordCopy_loop:
        lw      t3,0(a1)                # the first t3 may be equal t0 ... optimize?
        addiu   a1,a1,4
        addiu   a0,a0,4
        bne     a0,a3,.LwordCopy_loop
         sw     t3,-4(a0)               # a0 was already advanced, hence -4

        /* For the last (<8) bytes */
.Llast8:
        blez    a2,.Lleave              # nothing left to copy
         addu   a3,a0,a2                # a3 is the last dst address
.Llast8loop:
        lb      v1,0(a1)
        addiu   a1,a1,1
        addiu   a0,a0,1
        bne     a0,a3,.Llast8loop
         sb     v1,-1(a0)               # a0 was already advanced, hence -1

.Lleave:
        jr      ra
         nop

/*
 * UNALIGNED case
 *
 * src and dst have different offsets within a word. dst is brought to a
 * word boundary and written with sw; src is read with LWHI/LWLO pairs.
 */

.Lunaligned:
        /* got here with a3="negu a0" */
        andi    a3,a3,0x3               # test if the a0 is word aligned
        beqz    a3,.Lua_chk16w
         subu   a2,a2,a3                # bytes left after initial a3 bytes

        LWHI    v1,0(a1)
        LWLO    v1,3(a1)
        addu    a1,a1,a3                # a3 may be here 1, 2 or 3
        SWHI    v1,0(a0)
        addu    a0,a0,a3                # below the dst will be word aligned (NOTE1)

.Lua_chk16w:
        andi    t8,a2,0x3f              # any whole 64-byte chunks?
                                        # t8 is the byte count after 64-byte chunks
        beq     a2,t8,.Lua_chk8w        # if a2==t8, no 64-byte chunks
                                        # There will be at most 1 32-byte chunk after it
         subu   a3,a2,t8                # subtract from a2 the remainder
                                        # Here a3 counts bytes in 16w chunks
        addu    a3,a0,a3                # Now a3 is the final dst after 64-byte chunks

        addu    t0,a0,a2                # t0 is the "past the end" address

        subu    t9,t0,160               # t9 is the "last safe pref 30,128(a0)" address

        pref    0,0(a1)                 # bring the first line of src, addr 0
        pref    0,32(a1)                # bring the second line of src, addr 32
        pref    0,64(a1)                # bring the third line of src, addr 64
        pref    30,32(a0)               # safe, as we have at least 64 bytes ahead
        /* In case the a0 > t9 do not use "pref 30" at all */
        sgtu    v1,a0,t9
        bgtz    v1,.Lua_loop16w         # skip "pref 30,64(a0)" for too short arrays
         nop
        /* otherwise, start with using pref30 */
        pref    30,64(a0)
.Lua_loop16w:                           # one pass copies 64 bytes; v1 != 0 disables pref 30
        pref    0,96(a1)
        LWHI    t0,0(a1)
        LWLO    t0,3(a1)
        LWHI    t1,4(a1)
        bgtz    v1,.Lua_skip_pref30_96  # skip "pref 30,96(a0)"
         LWLO   t1,7(a1)
        pref    30,96(a0)               # continue setting up the dest, addr 96
.Lua_skip_pref30_96:
        LWHI    t2,8(a1)
        LWLO    t2,11(a1)
        LWHI    t3,12(a1)
        LWLO    t3,15(a1)
        LWHI    t4,16(a1)
        LWLO    t4,19(a1)
        LWHI    t5,20(a1)
        LWLO    t5,23(a1)
        LWHI    t6,24(a1)
        LWLO    t6,27(a1)
        LWHI    t7,28(a1)
        LWLO    t7,31(a1)
        pref    0,128(a1)               # bring the next lines of src, addr 128

        sw      t0,0(a0)
        sw      t1,4(a0)
        sw      t2,8(a0)
        sw      t3,12(a0)
        sw      t4,16(a0)
        sw      t5,20(a0)
        sw      t6,24(a0)
        sw      t7,28(a0)

        LWHI    t0,32(a1)
        LWLO    t0,35(a1)
        LWHI    t1,36(a1)
        bgtz    v1,.Lua_skip_pref30_128 # skip "pref 30,128(a0)"
         LWLO   t1,39(a1)
        pref    30,128(a0)              # continue setting up the dest, addr 128
.Lua_skip_pref30_128:
        LWHI    t2,40(a1)
        LWLO    t2,43(a1)
        LWHI    t3,44(a1)
        LWLO    t3,47(a1)
        LWHI    t4,48(a1)
        LWLO    t4,51(a1)
        LWHI    t5,52(a1)
        LWLO    t5,55(a1)
        LWHI    t6,56(a1)
        LWLO    t6,59(a1)
        LWHI    t7,60(a1)
        LWLO    t7,63(a1)
        pref    0,160(a1)               # bring the next lines of src, addr 160

        sw      t0,32(a0)
        sw      t1,36(a0)
        sw      t2,40(a0)
        sw      t3,44(a0)
        sw      t4,48(a0)
        sw      t5,52(a0)
        sw      t6,56(a0)
        sw      t7,60(a0)

        addiu   a0,a0,64                # adding 64 to dest
        sgtu    v1,a0,t9                # re-evaluate the pref 30 guard for the next pass
        bne     a0,a3,.Lua_loop16w
         addiu  a1,a1,64                # adding 64 to src
        move    a2,t8                   # a2 = bytes left after the 64-byte chunks

        /* Here dst is word-aligned (src is not) and less than 64 bytes remain */

.Lua_chk8w:
        pref    0,0(a1)
        andi    t8,a2,0x1f              # is there a 32-byte chunk?
                                        # the t8 is the remainder count
        beq     a2,t8,.Lua_chk1w        # when a2=t8, no 32-byte chunk
         nop

        LWHI    t0,0(a1)
        LWLO    t0,3(a1)
        LWHI    t1,4(a1)
        LWLO    t1,7(a1)
        LWHI    t2,8(a1)
        LWLO    t2,11(a1)
        LWHI    t3,12(a1)
        LWLO    t3,15(a1)
        LWHI    t4,16(a1)
        LWLO    t4,19(a1)
        LWHI    t5,20(a1)
        LWLO    t5,23(a1)
        LWHI    t6,24(a1)
        LWLO    t6,27(a1)
        LWHI    t7,28(a1)
        LWLO    t7,31(a1)
        addiu   a1,a1,32

        sw      t0,0(a0)
        sw      t1,4(a0)
        sw      t2,8(a0)
        sw      t3,12(a0)
        sw      t4,16(a0)
        sw      t5,20(a0)
        sw      t6,24(a0)
        sw      t7,28(a0)
        addiu   a0,a0,32

.Lua_chk1w:
        andi    a2,t8,0x3               # now a2 is the remainder past 1w chunks
        beq     a2,t8,.Lua_smallCopy    # fewer than 4 bytes left: no whole word
         subu   a3,t8,a2                # a3 is count of bytes in 1w chunks
        addu    a3,a0,a3                # now a3 is the dst address past the 1w chunks

        /* copying in words (4-byte chunks) */
.Lua_wordCopy_loop:
        LWHI    v1,0(a1)
        LWLO    v1,3(a1)
        addiu   a1,a1,4
        addiu   a0,a0,4                 # note: dst=a0 is word aligned here, see NOTE1
        bne     a0,a3,.Lua_wordCopy_loop
         sw     v1,-4(a0)               # a0 was already advanced, hence -4

        /* Now less than 4 bytes (value in a2) left to copy */
.Lua_smallCopy:
        beqz    a2,.Lleave
         addu   a3,a0,a2                # a3 is the last dst address
.Lua_smallCopy_loop:
        lb      v1,0(a1)
        addiu   a1,a1,1
        addiu   a0,a0,1
        bne     a0,a3,.Lua_smallCopy_loop
         sb     v1,-1(a0)               # a0 was already advanced, hence -1

        jr      ra
         nop

        .set    at
        .set    reorder

END(memcpy)


/************************************************************************
 * Implementation : Static functions
 ************************************************************************/