      1 /*
      2  * Copyright (c) 2012-2015
      3  *      MIPS Technologies, Inc., California.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
     14  *    contributors may be used to endorse or promote products derived from
     15  *    this software without specific prior written permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 #ifdef __ANDROID__
     31 # include <private/bionic_asm.h>
     32 # define USE_MEMMOVE_FOR_OVERLAP
     33 # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
     34 # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
     35 #elif _LIBC
     36 # include <sysdep.h>
     37 # include <regdef.h>
     38 # include <sys/asm.h>
     39 # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
     40 # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
     41 #elif _COMPILING_NEWLIB
     42 # include "machine/asm.h"
     43 # include "machine/regdef.h"
     44 # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
     45 # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
     46 #else
     47 # include <regdef.h>
     48 # include <sys/asm.h>
     49 #endif
     50 
     51 /* Check to see if the MIPS architecture we are compiling for supports
     52  * prefetching.
     53  */
     54 
     55 #if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
     56 # ifndef DISABLE_PREFETCH
     57 #  define USE_PREFETCH
     58 # endif
     59 #endif
     60 
     61 #if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
     62 # ifndef DISABLE_DOUBLE
     63 #  define USE_DOUBLE
     64 # endif
     65 #endif
     66 
     67 
     68 #if __mips_isa_rev > 5
     69 # if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
     70 #  undef PREFETCH_STORE_HINT
     71 #  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
     72 # endif
     73 # define R6_CODE
     74 #endif
     75 
     76 /* Some asm.h files do not have the L macro definition.  */
     77 #ifndef L
     78 # if _MIPS_SIM == _ABIO32
     79 #  define L(label) $L ## label
     80 # else
     81 #  define L(label) .L ## label
     82 # endif
     83 #endif
     84 
     85 /* Some asm.h files do not have the PTR_ADDIU macro definition.  */
     86 #ifndef PTR_ADDIU
     87 # if _MIPS_SIM == _ABIO32
     88 #  define PTR_ADDIU	addiu
     89 # else
     90 #  define PTR_ADDIU	daddiu
     91 # endif
     92 #endif
     93 
     94 /* Some asm.h files do not have the PTR_SRA macro definition.  */
     95 #ifndef PTR_SRA
     96 # if  _MIPS_SIM == _ABIO32
     97 #  define PTR_SRA	sra
     98 # else
     99 #  define PTR_SRA	dsra
    100 # endif
    101 #endif
    102 
    103 /* New R6 instructions that may not be in asm.h.  */
    104 #ifndef PTR_LSA
    105 # if _MIPS_SIM == _ABIO32
    106 #  define PTR_LSA	lsa
    107 # else
    108 #  define PTR_LSA	dlsa
    109 # endif
    110 #endif
    111 
    112 /*
    113  * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
     114  * prefetches appears to offer a slight performance advantage.
    115  *
    116  * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
    117  * or PREFETCH_STORE_STREAMED offers a large performance advantage
    118  * but PREPAREFORSTORE has some special restrictions to consider.
    119  *
     120  * Prefetch with the 'prepare for store' hint does not copy a memory
     121  * location into the cache; it just allocates a cache line and zeros
     122  * it out.  This means that if you do not write to the entire cache
     123  * line before it is written back to memory, the unwritten parts of
     124  * the line are zeroed out and that data is lost.
     125  *
     126  * Also, if you use this memcpy to copy overlapping buffers, it may
     127  * not behave correctly when using the 'prepare for store' hint.  If you
     128  * use the 'prepare for store' prefetch on a memory area that is in the
     129  * memcpy source (as well as the memcpy destination), then some data will
     130  * be zeroed out before you have a chance to read it, and that data will
     131  * be lost.
    132  *
    133  * If you are going to use this memcpy routine with the 'prepare for store'
    134  * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
    135  * the problem of running memcpy on overlapping buffers.
    136  *
    137  * There are ifdef'ed sections of this memcpy to make sure that it does not
    138  * do prefetches on cache lines that are not going to be completely written.
    139  * This code is only needed and only used when PREFETCH_STORE_HINT is set to
    140  * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
    141  * 32 bytes and if the cache line is larger it will not work correctly.
    142  */
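/*
 * For example (assuming the 32-byte cache lines described above): if the
 * tail of a copy stores only the first 16 bytes of a destination line that
 * was prefetched with PREPAREFORSTORE, the other 16 bytes of that line are
 * written back as zeros, clobbering whatever the caller had stored there.
 * The ifdef'ed guard code mentioned above exists so that every line
 * prefetched this way is completely overwritten before it is evicted.
 */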
    143 
    144 #ifdef USE_PREFETCH
    145 # define PREFETCH_HINT_LOAD		0
    146 # define PREFETCH_HINT_STORE		1
    147 # define PREFETCH_HINT_LOAD_STREAMED	4
    148 # define PREFETCH_HINT_STORE_STREAMED	5
    149 # define PREFETCH_HINT_LOAD_RETAINED	6
    150 # define PREFETCH_HINT_STORE_RETAINED	7
    151 # define PREFETCH_HINT_WRITEBACK_INVAL	25
    152 # define PREFETCH_HINT_PREPAREFORSTORE	30
    153 
    154 /*
     155  * If we have not picked out what hints to use at this point, use the
    156  * standard load and store prefetch hints.
    157  */
    158 # ifndef PREFETCH_STORE_HINT
    159 #  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
    160 # endif
    161 # ifndef PREFETCH_LOAD_HINT
    162 #  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
    163 # endif
    164 
    165 /*
    166  * We double everything when USE_DOUBLE is true so we do 2 prefetches to
    167  * get 64 bytes in that case.  The assumption is that each individual
    168  * prefetch brings in 32 bytes.
    169  */
    170 
    171 # ifdef USE_DOUBLE
    172 #  define PREFETCH_CHUNK 64
    173 #  define PREFETCH_FOR_LOAD(chunk, reg) \
    174  pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
    175  pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
    176 #  define PREFETCH_FOR_STORE(chunk, reg) \
    177  pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
    178  pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
    179 # else
    180 #  define PREFETCH_CHUNK 32
    181 #  define PREFETCH_FOR_LOAD(chunk, reg) \
    182  pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
    183 #  define PREFETCH_FOR_STORE(chunk, reg) \
    184  pref PREFETCH_STORE_HINT, (chunk)*32(reg)
    185 # endif
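/*
 * For illustration, with USE_DOUBLE defined PREFETCH_FOR_STORE(1, a0)
 * expands to
 *
 *	pref PREFETCH_STORE_HINT, 64(a0); pref PREFETCH_STORE_HINT, 96(a0)
 *
 * while without USE_DOUBLE PREFETCH_FOR_LOAD(2, a1) expands to
 *
 *	pref PREFETCH_LOAD_HINT, 64(a1)
 *
 * i.e. the chunk argument selects which PREFETCH_CHUNK-sized block,
 * counted from the base register, gets prefetched.
 */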
     186 /* MAX_PREFETCH_SIZE is the maximum size of a prefetch; it must not be less
    187  * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
    188  * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
    189  * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
    190  * used then MAX_PREFETCH_SIZE does not matter.  */
    191 # define MAX_PREFETCH_SIZE 128
    192 /* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
    193  * than 5 on a STORE prefetch and that a single prefetch can never be larger
    194  * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
    195  * we actually do two prefetches in that case, one 32 bytes after the other.  */
    196 # ifdef USE_DOUBLE
    197 #  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
    198 # else
    199 #  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
    200 # endif
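/*
 * Working the numbers through: with USE_DOUBLE, PREFETCH_CHUNK is 64 and
 * PREFETCH_LIMIT is (5 * 64) + 32 + 128 = 480 bytes; without USE_DOUBLE,
 * PREFETCH_CHUNK is 32 and PREFETCH_LIMIT is (5 * 32) + 128 = 288 bytes.
 * The copy loops below only issue a PREPAREFORSTORE prefetch while a0 is
 * still at least PREFETCH_LIMIT bytes before the end of the destination.
 */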
    201 # if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    202     && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
    203 /* We cannot handle this because the initial prefetches may fetch bytes that
    204  * are before the buffer being copied.  We start copies with an offset
     205  * of 4 so as to avoid this situation when using PREPAREFORSTORE.  */
    206 #error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
    207 # endif
    208 #else /* USE_PREFETCH not defined */
    209 # define PREFETCH_FOR_LOAD(offset, reg)
    210 # define PREFETCH_FOR_STORE(offset, reg)
    211 #endif
    212 
    213 /* Allow the routine to be named something else if desired.  */
    214 #ifndef MEMCPY_NAME
    215 # define MEMCPY_NAME memcpy
    216 #endif
    217 
    218 /* We use these 32/64 bit registers as temporaries to do the copying.  */
    219 #define REG0 t0
    220 #define REG1 t1
    221 #define REG2 t2
    222 #define REG3 t3
    223 #if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
    224 # define REG4 t4
    225 # define REG5 t5
    226 # define REG6 t6
    227 # define REG7 t7
    228 #else
    229 # define REG4 ta0
    230 # define REG5 ta1
    231 # define REG6 ta2
    232 # define REG7 ta3
    233 #endif
    234 
    235 /* We load/store 64 bits at a time when USE_DOUBLE is true.
    236  * The C_ prefix stands for CHUNK and is used to avoid macro name
    237  * conflicts with system header files.  */
    238 
    239 #ifdef USE_DOUBLE
    240 # define C_ST	sd
    241 # define C_LD	ld
    242 # if __MIPSEB
    243 #  define C_LDHI	ldl	/* high part is left in big-endian	*/
    244 #  define C_STHI	sdl	/* high part is left in big-endian	*/
    245 #  define C_LDLO	ldr	/* low part is right in big-endian	*/
    246 #  define C_STLO	sdr	/* low part is right in big-endian	*/
    247 # else
    248 #  define C_LDHI	ldr	/* high part is right in little-endian	*/
    249 #  define C_STHI	sdr	/* high part is right in little-endian	*/
    250 #  define C_LDLO	ldl	/* low part is left in little-endian	*/
    251 #  define C_STLO	sdl	/* low part is left in little-endian	*/
    252 # endif
    253 # define C_ALIGN	dalign	/* r6 align instruction			*/
    254 #else
    255 # define C_ST	sw
    256 # define C_LD	lw
    257 # if __MIPSEB
    258 #  define C_LDHI	lwl	/* high part is left in big-endian	*/
    259 #  define C_STHI	swl	/* high part is left in big-endian	*/
    260 #  define C_LDLO	lwr	/* low part is right in big-endian	*/
    261 #  define C_STLO	swr	/* low part is right in big-endian	*/
    262 # else
    263 #  define C_LDHI	lwr	/* high part is right in little-endian	*/
    264 #  define C_STHI	swr	/* high part is right in little-endian	*/
    265 #  define C_LDLO	lwl	/* low part is left in little-endian	*/
    266 #  define C_STLO	swl	/* low part is left in little-endian	*/
    267 # endif
    268 # define C_ALIGN	align	/* r6 align instruction			*/
    269 #endif
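/*
 * The C_LDHI/C_LDLO pair implements the classic MIPS unaligned-load idiom:
 * issuing both instructions against the same destination register, with
 * offsets that together span one (d)word starting at an unaligned address,
 * leaves that register holding the unaligned source bytes.  The endian
 * check above selects which of lwl/lwr (or ldl/ldr) supplies the high and
 * the low part.  The unaligned copy code below uses the pair exactly this
 * way, e.g. C_LDHI v1,UNIT(0)(a1) followed by C_LDLO v1,UNITM1(1)(a1).
 */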
    270 
    271 /* Bookkeeping values for 32 vs. 64 bit mode.  */
    272 #ifdef USE_DOUBLE
    273 # define NSIZE 8
    274 # define NSIZEMASK 0x3f
    275 # define NSIZEDMASK 0x7f
    276 #else
    277 # define NSIZE 4
    278 # define NSIZEMASK 0x1f
    279 # define NSIZEDMASK 0x3f
    280 #endif
    281 #define UNIT(unit) ((unit)*NSIZE)
    282 #define UNITM1(unit) (((unit)*NSIZE)-1)
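/*
 * For example, with USE_DOUBLE (NSIZE == 8) UNIT(7) is 56 and UNITM1(8)
 * is 63, so a C_LDHI x,UNIT(7)(a1) / C_LDLO x,UNITM1(8)(a1) pair covers
 * source bytes 56..63; with word copies (NSIZE == 4) the same pair covers
 * bytes 28..31.
 */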
    283 
    284 #ifdef __ANDROID__
    285 LEAF(MEMCPY_NAME, 0)
    286 #else
    287 LEAF(MEMCPY_NAME)
    288 #endif
    289 	.set	nomips16
    290 	.set	noreorder
    291 /*
    292  * Below we handle the case where memcpy is called with overlapping src and dst.
    293  * Although memcpy is not required to handle this case, some parts of Android
    294  * like Skia rely on such usage. We call memmove to handle such cases.
    295  */
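/*
 * A rough C model of the overlap test below (an illustrative sketch only,
 * not part of the build), with dst/src/n living in a0/a1/a2:
 *
 *	ptrdiff_t d = (char *) dst - (const char *) src;
 *	if (d < 0)
 *	  d = -d;                        // |dst - src|
 *	if ((size_t) d < n)              // the buffers overlap within n bytes
 *	  return memmove (dst, src, n);  // tail-call memmove
 *	// otherwise fall through to L(memcpy)
 */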
    296 #ifdef USE_MEMMOVE_FOR_OVERLAP
    297 	PTR_SUBU t0,a0,a1
    298 	PTR_SRA	t2,t0,31
    299 	xor	t1,t0,t2
    300 	PTR_SUBU t0,t1,t2
    301 	sltu	t2,t0,a2
    302 	beq	t2,zero,L(memcpy)
    303 	nop
    304 #if defined(__LP64__)
    305 	daddiu	sp,sp,-8
    306 	SETUP_GP64(0,MEMCPY_NAME)
    307 	LA	t9,memmove
    308 	RESTORE_GP64
    309 	jr	t9
    310 	daddiu	sp,sp,8
    311 #else
    312 	LA	t9,memmove
    313 	jr	t9
    314 	nop
    315 #endif
    316 L(memcpy):
    317 #endif
    318 /*
    319  * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
    320  * size, copy dst pointer to v0 for the return value.
    321  */
    322 	slti	t2,a2,(2 * NSIZE)
    323 	bne	t2,zero,L(lastb)
    324 #if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
    325 	move	v0,zero
    326 #else
    327 	move	v0,a0
    328 #endif
    329 
    330 #ifndef R6_CODE
    331 
    332 /*
     333  * If src and dst have different alignments, go to L(unaligned); if they
     334  * have the same alignment (but are not actually aligned), do a partial
    335  * load/store to make them aligned.  If they are both already aligned
    336  * we can start copying at L(aligned).
    337  */
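/*
 * For example, with word copies (NSIZE == 4): if a0 ends in ...5 and a1
 * ends in ...1, then (a0 ^ a1) & 3 == 0, so both share the same alignment;
 * a3 = (-a0) & 3 = 3 bytes are then moved by the partial load/store below,
 * after which both pointers are word aligned.
 */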
    338 	xor	t8,a1,a0
    339 	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
    340 	bne	t8,zero,L(unaligned)
    341 	PTR_SUBU a3, zero, a0
    342 
    343 	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
    344 	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
     345 	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
    346 
    347 	C_LDHI	t8,0(a1)
    348 	PTR_ADDU a1,a1,a3
    349 	C_STHI	t8,0(a0)
    350 	PTR_ADDU a0,a0,a3
    351 
    352 #else /* R6_CODE */
    353 
    354 /*
    355  * Align the destination and hope that the source gets aligned too.  If it
     356  * doesn't, we jump to L(r6_unaligned*) to do unaligned copies using the r6
    357  * align instruction.
    358  */
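/*
 * The dispatch below indexes L(atable) by t8 = a0 & 7 (each bc slot is a
 * 4-byte instruction, hence the PTR_LSA shift by 2).  Slot 0 means a0 is
 * already 8-byte aligned and branches straight to L(lb0); slot k (k != 0)
 * lands on L(lb<8-k>), which copies exactly 8-k bytes one at a time before
 * the pointers and the count are adjusted and control falls into L(lb0).
 */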
    359 	andi	t8,a0,7
    360 	lapc	t9,L(atable)
    361 	PTR_LSA	t9,t8,t9,2
    362 	jrc	t9
    363 L(atable):
    364 	bc	L(lb0)
    365 	bc	L(lb7)
    366 	bc	L(lb6)
    367 	bc	L(lb5)
    368 	bc	L(lb4)
    369 	bc	L(lb3)
    370 	bc	L(lb2)
    371 	bc	L(lb1)
    372 L(lb7):
    373 	lb	a3, 6(a1)
    374 	sb	a3, 6(a0)
    375 L(lb6):
    376 	lb	a3, 5(a1)
    377 	sb	a3, 5(a0)
    378 L(lb5):
    379 	lb	a3, 4(a1)
    380 	sb	a3, 4(a0)
    381 L(lb4):
    382 	lb	a3, 3(a1)
    383 	sb	a3, 3(a0)
    384 L(lb3):
    385 	lb	a3, 2(a1)
    386 	sb	a3, 2(a0)
    387 L(lb2):
    388 	lb	a3, 1(a1)
    389 	sb	a3, 1(a0)
    390 L(lb1):
    391 	lb	a3, 0(a1)
    392 	sb	a3, 0(a0)
    393 
    394 	li	t9,8
    395 	subu	t8,t9,t8
    396 	PTR_SUBU a2,a2,t8
    397 	PTR_ADDU a0,a0,t8
    398 	PTR_ADDU a1,a1,t8
    399 L(lb0):
    400 
    401 	andi	t8,a1,(NSIZE-1)
    402 	lapc	t9,L(jtable)
    403 	PTR_LSA	t9,t8,t9,2
    404 	jrc	t9
    405 L(jtable):
    406 	bc	L(aligned)
    407 	bc	L(r6_unaligned1)
    408 	bc	L(r6_unaligned2)
    409 	bc	L(r6_unaligned3)
    410 # ifdef USE_DOUBLE
    411 	bc	L(r6_unaligned4)
    412 	bc	L(r6_unaligned5)
    413 	bc	L(r6_unaligned6)
    414 	bc	L(r6_unaligned7)
    415 # endif
    416 #endif /* R6_CODE */
    417 
    418 L(aligned):
    419 
    420 /*
     421  * Now dst/src are both aligned to (word or double word) boundaries.
    422  * Set a2 to count how many bytes we have to copy after all the 64/128 byte
    423  * chunks are copied and a3 to the dst pointer after all the 64/128 byte
    424  * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
    425  * equals a3.
    426  */
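/*
 * Worked example (USE_DOUBLE, so NSIZEDMASK == 0x7f): for a2 == 300,
 * t8 = 300 & 0x7f = 44 and a3 = a0 + 256, so L(loop16w) below runs twice,
 * moving 2 * 128 bytes; the remaining 44 bytes are handled from L(chkw)
 * onwards once a2 has been set to t8.
 */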
    427 
    428 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
    429 	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
     430 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
    431 	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
    432 
     433 /* When in the loop we may prefetch with the 'prepare to store' hint;
     434  * in this case a0+x must not be past the "t0-32" address.  This
     435  * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
     436  * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
    437  * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
    438  */
    439 #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
    440 	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
    441 	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
    442 #endif
    443 	PREFETCH_FOR_LOAD  (0, a1)
    444 	PREFETCH_FOR_LOAD  (1, a1)
    445 	PREFETCH_FOR_LOAD  (2, a1)
    446 	PREFETCH_FOR_LOAD  (3, a1)
    447 #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
    448 	PREFETCH_FOR_STORE (1, a0)
    449 	PREFETCH_FOR_STORE (2, a0)
    450 	PREFETCH_FOR_STORE (3, a0)
    451 #endif
    452 #if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
    453 # if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
    454 	sltu    v1,t9,a0
    455 	bgtz    v1,L(skip_set)
    456 	nop
    457 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
    458 L(skip_set):
    459 # else
    460 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
    461 # endif
    462 #endif
    463 #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    464     && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
    465 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
    466 # ifdef USE_DOUBLE
    467 	PTR_ADDIU v0,v0,32
    468 # endif
    469 #endif
    470 L(loop16w):
    471 	C_LD	t0,UNIT(0)(a1)
    472 #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
    473 	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
    474 	bgtz	v1,L(skip_pref)
    475 #endif
    476 	C_LD	t1,UNIT(1)(a1)
    477 #ifndef R6_CODE
    478 	PREFETCH_FOR_STORE (4, a0)
    479 	PREFETCH_FOR_STORE (5, a0)
    480 #else
    481 	PREFETCH_FOR_STORE (2, a0)
    482 #endif
    483 #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
    484 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
    485 # ifdef USE_DOUBLE
    486 	PTR_ADDIU v0,v0,32
    487 # endif
    488 #endif
    489 L(skip_pref):
    490 	C_LD	REG2,UNIT(2)(a1)
    491 	C_LD	REG3,UNIT(3)(a1)
    492 	C_LD	REG4,UNIT(4)(a1)
    493 	C_LD	REG5,UNIT(5)(a1)
    494 	C_LD	REG6,UNIT(6)(a1)
    495 	C_LD	REG7,UNIT(7)(a1)
    496 #ifndef R6_CODE
    497 	PREFETCH_FOR_LOAD (4, a1)
    498 #else
    499 	PREFETCH_FOR_LOAD (3, a1)
    500 #endif
    501 	C_ST	t0,UNIT(0)(a0)
    502 	C_ST	t1,UNIT(1)(a0)
    503 	C_ST	REG2,UNIT(2)(a0)
    504 	C_ST	REG3,UNIT(3)(a0)
    505 	C_ST	REG4,UNIT(4)(a0)
    506 	C_ST	REG5,UNIT(5)(a0)
    507 	C_ST	REG6,UNIT(6)(a0)
    508 	C_ST	REG7,UNIT(7)(a0)
    509 
    510 	C_LD	t0,UNIT(8)(a1)
    511 	C_LD	t1,UNIT(9)(a1)
    512 	C_LD	REG2,UNIT(10)(a1)
    513 	C_LD	REG3,UNIT(11)(a1)
    514 	C_LD	REG4,UNIT(12)(a1)
    515 	C_LD	REG5,UNIT(13)(a1)
    516 	C_LD	REG6,UNIT(14)(a1)
    517 	C_LD	REG7,UNIT(15)(a1)
    518 #ifndef R6_CODE
    519 	PREFETCH_FOR_LOAD (5, a1)
    520 #endif
    521 	C_ST	t0,UNIT(8)(a0)
    522 	C_ST	t1,UNIT(9)(a0)
    523 	C_ST	REG2,UNIT(10)(a0)
    524 	C_ST	REG3,UNIT(11)(a0)
    525 	C_ST	REG4,UNIT(12)(a0)
    526 	C_ST	REG5,UNIT(13)(a0)
    527 	C_ST	REG6,UNIT(14)(a0)
    528 	C_ST	REG7,UNIT(15)(a0)
    529 	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
    530 	bne	a0,a3,L(loop16w)
    531 	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
    532 	move	a2,t8
    533 
     534 /* Here we have src and dest word-aligned but less than 64 bytes or
     535  * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
    536  * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
    537  * the copy.
    538  */
    539 
    540 L(chkw):
    541 	PREFETCH_FOR_LOAD (0, a1)
     542 	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk?  */
     543 				/* t8 is the remainder count past 32-bytes */
    544 	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
    545 	nop
    546 	C_LD	t0,UNIT(0)(a1)
    547 	C_LD	t1,UNIT(1)(a1)
    548 	C_LD	REG2,UNIT(2)(a1)
    549 	C_LD	REG3,UNIT(3)(a1)
    550 	C_LD	REG4,UNIT(4)(a1)
    551 	C_LD	REG5,UNIT(5)(a1)
    552 	C_LD	REG6,UNIT(6)(a1)
    553 	C_LD	REG7,UNIT(7)(a1)
    554 	PTR_ADDIU a1,a1,UNIT(8)
    555 	C_ST	t0,UNIT(0)(a0)
    556 	C_ST	t1,UNIT(1)(a0)
    557 	C_ST	REG2,UNIT(2)(a0)
    558 	C_ST	REG3,UNIT(3)(a0)
    559 	C_ST	REG4,UNIT(4)(a0)
    560 	C_ST	REG5,UNIT(5)(a0)
    561 	C_ST	REG6,UNIT(6)(a0)
    562 	C_ST	REG7,UNIT(7)(a0)
    563 	PTR_ADDIU a0,a0,UNIT(8)
    564 
    565 /*
    566  * Here we have less than 32(64) bytes to copy.  Set up for a loop to
    567  * copy one word (or double word) at a time.  Set a2 to count how many
    568  * bytes we have to copy after all the word (or double word) chunks are
    569  * copied and a3 to the dst pointer after all the (d)word chunks have
    570  * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
    571  */
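/*
 * For example, with word copies (NSIZE == 4) and t8 == 27 bytes left:
 * a2 = 27 & 3 = 3 and a3 = a0 + 24, so L(wordCopy_loop) runs six times and
 * the final 3 bytes are copied one at a time at L(lastb).
 */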
    572 L(chk1w):
     573 	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
    574 	beq	a2,t8,L(lastb)
    575 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
    576 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
    577 
    578 /* copying in words (4-byte or 8-byte chunks) */
    579 L(wordCopy_loop):
    580 	C_LD	REG3,UNIT(0)(a1)
    581 	PTR_ADDIU a0,a0,UNIT(1)
    582 	PTR_ADDIU a1,a1,UNIT(1)
    583 	bne	a0,a3,L(wordCopy_loop)
    584 	C_ST	REG3,UNIT(-1)(a0)
    585 
    586 /* Copy the last 8 (or 16) bytes */
    587 L(lastb):
    588 	blez	a2,L(leave)
    589 	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
    590 L(lastbloop):
    591 	lb	v1,0(a1)
    592 	PTR_ADDIU a0,a0,1
    593 	PTR_ADDIU a1,a1,1
    594 	bne	a0,a3,L(lastbloop)
    595 	sb	v1,-1(a0)
    596 L(leave):
    597 	j	ra
    598 	nop
    599 
    600 #ifndef R6_CODE
    601 /*
    602  * UNALIGNED case, got here with a3 = "negu a0"
    603  * This code is nearly identical to the aligned code above
    604  * but only the destination (not the source) gets aligned
    605  * so we need to do partial loads of the source followed
    606  * by normal stores to the destination (once we have aligned
    607  * the destination).
    608  */
    609 
    610 L(unaligned):
    611 	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
    612 	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
     613 	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
    614 
    615 	C_LDHI	v1,UNIT(0)(a1)
    616 	C_LDLO	v1,UNITM1(1)(a1)
    617 	PTR_ADDU a1,a1,a3
    618 	C_STHI	v1,UNIT(0)(a0)
    619 	PTR_ADDU a0,a0,a3
    620 
    621 /*
     622  * Now the destination (but not the source) is aligned.
    623  * Set a2 to count how many bytes we have to copy after all the 64/128 byte
    624  * chunks are copied and a3 to the dst pointer after all the 64/128 byte
    625  * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
    626  * equals a3.
    627  */
    628 
    629 L(ua_chk16w):
    630 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
    631 	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
     632 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
    633 	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
    634 
    635 # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
    636 	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
    637 	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
    638 # endif
    639 	PREFETCH_FOR_LOAD  (0, a1)
    640 	PREFETCH_FOR_LOAD  (1, a1)
    641 	PREFETCH_FOR_LOAD  (2, a1)
    642 # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
    643 	PREFETCH_FOR_STORE (1, a0)
    644 	PREFETCH_FOR_STORE (2, a0)
    645 	PREFETCH_FOR_STORE (3, a0)
    646 # endif
    647 # if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
    648 #  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
    649 	sltu    v1,t9,a0
    650 	bgtz    v1,L(ua_skip_set)
    651 	nop
    652 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
    653 L(ua_skip_set):
    654 #  else
    655 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
    656 #  endif
    657 # endif
    658 L(ua_loop16w):
    659 	PREFETCH_FOR_LOAD  (3, a1)
    660 	C_LDHI	t0,UNIT(0)(a1)
    661 	C_LDHI	t1,UNIT(1)(a1)
    662 	C_LDHI	REG2,UNIT(2)(a1)
    663 # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
    664 	sltu	v1,t9,a0
    665 	bgtz	v1,L(ua_skip_pref)
    666 # endif
    667 	C_LDHI	REG3,UNIT(3)(a1)
    668 	PREFETCH_FOR_STORE (4, a0)
    669 	PREFETCH_FOR_STORE (5, a0)
    670 L(ua_skip_pref):
    671 	C_LDHI	REG4,UNIT(4)(a1)
    672 	C_LDHI	REG5,UNIT(5)(a1)
    673 	C_LDHI	REG6,UNIT(6)(a1)
    674 	C_LDHI	REG7,UNIT(7)(a1)
    675 	C_LDLO	t0,UNITM1(1)(a1)
    676 	C_LDLO	t1,UNITM1(2)(a1)
    677 	C_LDLO	REG2,UNITM1(3)(a1)
    678 	C_LDLO	REG3,UNITM1(4)(a1)
    679 	C_LDLO	REG4,UNITM1(5)(a1)
    680 	C_LDLO	REG5,UNITM1(6)(a1)
    681 	C_LDLO	REG6,UNITM1(7)(a1)
    682 	C_LDLO	REG7,UNITM1(8)(a1)
    683         PREFETCH_FOR_LOAD (4, a1)
    684 	C_ST	t0,UNIT(0)(a0)
    685 	C_ST	t1,UNIT(1)(a0)
    686 	C_ST	REG2,UNIT(2)(a0)
    687 	C_ST	REG3,UNIT(3)(a0)
    688 	C_ST	REG4,UNIT(4)(a0)
    689 	C_ST	REG5,UNIT(5)(a0)
    690 	C_ST	REG6,UNIT(6)(a0)
    691 	C_ST	REG7,UNIT(7)(a0)
    692 	C_LDHI	t0,UNIT(8)(a1)
    693 	C_LDHI	t1,UNIT(9)(a1)
    694 	C_LDHI	REG2,UNIT(10)(a1)
    695 	C_LDHI	REG3,UNIT(11)(a1)
    696 	C_LDHI	REG4,UNIT(12)(a1)
    697 	C_LDHI	REG5,UNIT(13)(a1)
    698 	C_LDHI	REG6,UNIT(14)(a1)
    699 	C_LDHI	REG7,UNIT(15)(a1)
    700 	C_LDLO	t0,UNITM1(9)(a1)
    701 	C_LDLO	t1,UNITM1(10)(a1)
    702 	C_LDLO	REG2,UNITM1(11)(a1)
    703 	C_LDLO	REG3,UNITM1(12)(a1)
    704 	C_LDLO	REG4,UNITM1(13)(a1)
    705 	C_LDLO	REG5,UNITM1(14)(a1)
    706 	C_LDLO	REG6,UNITM1(15)(a1)
    707 	C_LDLO	REG7,UNITM1(16)(a1)
    708         PREFETCH_FOR_LOAD (5, a1)
    709 	C_ST	t0,UNIT(8)(a0)
    710 	C_ST	t1,UNIT(9)(a0)
    711 	C_ST	REG2,UNIT(10)(a0)
    712 	C_ST	REG3,UNIT(11)(a0)
    713 	C_ST	REG4,UNIT(12)(a0)
    714 	C_ST	REG5,UNIT(13)(a0)
    715 	C_ST	REG6,UNIT(14)(a0)
    716 	C_ST	REG7,UNIT(15)(a0)
    717 	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
    718 	bne	a0,a3,L(ua_loop16w)
    719 	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
    720 	move	a2,t8
    721 
     722 /* Here we have the dst word-aligned (the src is still unaligned) and less
     723  * than 64 bytes or 128 bytes to go.  Check for a 32(64) byte chunk and
     724  * copy it if there is one.  Otherwise jump down to L(ua_chk1w) to handle
     725  * the tail end of the copy.  */
    726 
    727 L(ua_chkw):
    728 	PREFETCH_FOR_LOAD (0, a1)
     729 	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk?  */
     730 				  /* t8 is the remainder count past 32-bytes */
    731 	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
    732 	nop
    733 	C_LDHI	t0,UNIT(0)(a1)
    734 	C_LDHI	t1,UNIT(1)(a1)
    735 	C_LDHI	REG2,UNIT(2)(a1)
    736 	C_LDHI	REG3,UNIT(3)(a1)
    737 	C_LDHI	REG4,UNIT(4)(a1)
    738 	C_LDHI	REG5,UNIT(5)(a1)
    739 	C_LDHI	REG6,UNIT(6)(a1)
    740 	C_LDHI	REG7,UNIT(7)(a1)
    741 	C_LDLO	t0,UNITM1(1)(a1)
    742 	C_LDLO	t1,UNITM1(2)(a1)
    743 	C_LDLO	REG2,UNITM1(3)(a1)
    744 	C_LDLO	REG3,UNITM1(4)(a1)
    745 	C_LDLO	REG4,UNITM1(5)(a1)
    746 	C_LDLO	REG5,UNITM1(6)(a1)
    747 	C_LDLO	REG6,UNITM1(7)(a1)
    748 	C_LDLO	REG7,UNITM1(8)(a1)
    749 	PTR_ADDIU a1,a1,UNIT(8)
    750 	C_ST	t0,UNIT(0)(a0)
    751 	C_ST	t1,UNIT(1)(a0)
    752 	C_ST	REG2,UNIT(2)(a0)
    753 	C_ST	REG3,UNIT(3)(a0)
    754 	C_ST	REG4,UNIT(4)(a0)
    755 	C_ST	REG5,UNIT(5)(a0)
    756 	C_ST	REG6,UNIT(6)(a0)
    757 	C_ST	REG7,UNIT(7)(a0)
    758 	PTR_ADDIU a0,a0,UNIT(8)
    759 /*
    760  * Here we have less than 32(64) bytes to copy.  Set up for a loop to
    761  * copy one word (or double word) at a time.
    762  */
    763 L(ua_chk1w):
     764 	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
    765 	beq	a2,t8,L(ua_smallCopy)
    766 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
    767 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
    768 
    769 /* copying in words (4-byte or 8-byte chunks) */
    770 L(ua_wordCopy_loop):
    771 	C_LDHI	v1,UNIT(0)(a1)
    772 	C_LDLO	v1,UNITM1(1)(a1)
    773 	PTR_ADDIU a0,a0,UNIT(1)
    774 	PTR_ADDIU a1,a1,UNIT(1)
    775 	bne	a0,a3,L(ua_wordCopy_loop)
    776 	C_ST	v1,UNIT(-1)(a0)
    777 
    778 /* Copy the last 8 (or 16) bytes */
    779 L(ua_smallCopy):
    780 	beqz	a2,L(leave)
    781 	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
    782 L(ua_smallCopy_loop):
    783 	lb	v1,0(a1)
    784 	PTR_ADDIU a0,a0,1
    785 	PTR_ADDIU a1,a1,1
    786 	bne	a0,a3,L(ua_smallCopy_loop)
    787 	sb	v1,-1(a0)
    788 
    789 	j	ra
    790 	nop
    791 
    792 #else /* R6_CODE */
    793 
    794 # if __MIPSEB
    795 #  define SWAP_REGS(X,Y) X, Y
    796 #  define ALIGN_OFFSET(N) (N)
    797 # else
    798 #  define SWAP_REGS(X,Y) Y, X
    799 #  define ALIGN_OFFSET(N) (NSIZE-N)
    800 # endif
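/*
 * Sketch of what R6_UNALIGNED_WORD_COPY does for a given source BYTEOFFSET:
 * REG2 holds a1 rounded down to a (d)word boundary, and every loop
 * iteration loads the next aligned (d)word, splices it with the previously
 * loaded one via C_ALIGN, and performs one aligned store to a0 until a0
 * reaches REG6; the a2 tail bytes that remain are then finished at
 * L(lastb).
 */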
    801 # define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
     802 	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
    803 	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word	   */ \
    804 	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in   */ \
    805 				/* (d)word chunks.			   */ \
    806 	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte   */ \
    807 				/* after word loop is finished.		   */ \
    808 	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.	   */ \
    809 	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.	   */ \
    810 	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop.   */ \
    811 	C_LD	t0, UNIT(0)(REG2);  /* Load first part of source.	   */ \
    812 L(r6_ua_wordcopy##BYTEOFFSET):						      \
    813 	C_LD	t1, UNIT(1)(REG2);  /* Load second part of source.	   */ \
    814 	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);	      \
    815 	PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.	   */ \
    816 	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
    817 	move	t0, t1;		/* Move second part of source to first.	   */ \
    818 	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);			      \
    819 	C_ST	REG3, UNIT(-1)(a0);					      \
    820 	j	L(lastb);						      \
    821 	nop
    822 
     823 	/* We are generating R6 code; the destination is (d)word aligned and
     824 	   the source is not.  t8 is 1 .. NSIZE-1 depending on the alignment
     825 	   of the source.  */
    826 
    827 L(r6_unaligned1):
    828 	R6_UNALIGNED_WORD_COPY(1)
    829 L(r6_unaligned2):
    830 	R6_UNALIGNED_WORD_COPY(2)
    831 L(r6_unaligned3):
    832 	R6_UNALIGNED_WORD_COPY(3)
    833 # ifdef USE_DOUBLE
    834 L(r6_unaligned4):
    835 	R6_UNALIGNED_WORD_COPY(4)
    836 L(r6_unaligned5):
    837 	R6_UNALIGNED_WORD_COPY(5)
    838 L(r6_unaligned6):
    839 	R6_UNALIGNED_WORD_COPY(6)
    840 L(r6_unaligned7):
    841 	R6_UNALIGNED_WORD_COPY(7)
    842 # endif
    843 #endif /* R6_CODE */
    844 
    845 	.set	at
    846 	.set	reorder
    847 END(MEMCPY_NAME)
    848 #ifndef __ANDROID__
    849 # ifdef _LIBC
    850 libc_hidden_builtin_def (MEMCPY_NAME)
    851 # endif
    852 #endif
    853