Home | History | Annotate | Download | only in pixman
      1 /*
      2  * Copyright (c) 2012
      3  *      MIPS Technologies, Inc., California.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
     14  *    contributors may be used to endorse or promote products derived from
     15  *    this software without specific prior written permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 #include "pixman-mips-dspr2-asm.h"
     31 
     32 /*
     33  * This routine could be optimized for MIPS64. The current code only
     34  * uses MIPS32 instructions.
     35  */
     36 
     37 #ifdef EB	/* big-endian build: the high part of a word is on the left */
     38 #  define LWHI	lwl		/* high part is left in big-endian */
     39 #  define SWHI	swl		/* high part is left in big-endian */
     40 #  define LWLO	lwr		/* low part is right in big-endian */
     41 #  define SWLO	swr		/* low part is right in big-endian */
     42 #else	/* little-endian build: the high part of a word is on the right */
     43 #  define LWHI	lwr		/* high part is right in little-endian */
     44 #  define SWHI	swr		/* high part is right in little-endian */
     45 #  define LWLO	lwl		/* low part is left in little-endian */
     46 #  define SWLO	swl		/* low part is left in little-endian */
     47 #endif	/* EB */
     48 
     49 LEAF_MIPS32R2(pixman_mips_fast_memcpy)
     50 /*
         * Copies a2 bytes from the source at a1 to the destination at a0 and
         * returns the original destination pointer in v0.
         *
         * Register use, as seen in the code below:
         *   a0    - current dst pointer (advanced as bytes are stored)
         *   a1    - current src pointer (advanced as bytes are loaded)
         *   a2    - bytes still to copy
         *   a3    - head byte count, later the dst address at which a loop stops
         *   t0-t7 - data words in flight (t0 also briefly holds the dst end)
         *   t8    - alignment test scratch, then the remainder count
         *   t9    - last dst address for which "pref 30, 128(a0)" is still issued
         *   v1    - "skip pref 30" flag inside the 64-byte loops, data temp elsewhere
         *   AT    - scratch for the initial size test
         *
         * Paths:
         *   count < 8: byte loop at $last8.
         *   src and dst equally aligned within a word: align dst with one
         *     partial-word copy, then 64-byte chunks, at most one 32-byte chunk,
         *     single words, and finally bytes.
         *   otherwise ($unaligned): same structure, but every source word is
         *     assembled with an LWHI/LWLO pair while the dst stores stay aligned.
         *
         * NOTE(review): the instruction following each branch is written as its
         * delay slot (see "move v0, a0" after the first bne), so this presumably
         * relies on LEAF_MIPS32R2 selecting ".set noreorder" -- confirm in
         * pixman-mips-dspr2-asm.h.
         * NOTE(review): the size tests are signed (slti / blez), so a count with
         * the top bit set goes to $last8 and leaves without copying anything.
         */
     51 	slti	AT, a2, 8	/* signed test: is the count below 8? */
     52 	bne	AT, zero, $last8	/* yes: plain byte loop */
     53 	move	v0, a0	/* delay slot: memcpy returns the dst pointer */
     54 
     55 /* Test if the src and dst are word-aligned, or can be made word-aligned */
     56 	xor	t8, a1, a0
     57 	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
     58 
     59 	bne	t8, zero, $unaligned
     60 	negu	a3, a0	/* delay slot: a3 = -dst, masked to 0..3 on both paths */
     61 
     62 	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
     63 	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
     64 	subu	a2, a2, a3	/* delay slot: now a2 is the remaining bytes count */
     65 /* Copy the a3 head bytes with one partial-word load/store pair */
     66 	LWHI	t8, 0(a1)
     67 	addu	a1, a1, a3
     68 	SWHI	t8, 0(a0)
     69 	addu	a0, a0, a3
     70 
     71 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
     72 $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
     73 				/* t8 is the byte count after 64-byte chunks */
     74 
     75 	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
     76 				/* There will be at most 1 32-byte chunk after it */
     77 	subu	a3, a2, t8	/* delay slot: subtract from a2 the remainder */
     78                                 /* Here a3 counts bytes in 16w chunks */
     79 	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
     80 
     81 	addu	t0, a0, a2	/* t0 is the "past the end" address */
     82 
     83 /*
     84  * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
     85  * the "t0-32" address
     86  * This means: for x=128 the last "safe" a0 address is "t0-160"
     87  * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
     88  * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
         * NOTE(review): the 32-byte margin looks like a cache-line-size assumption;
         * confirm it holds for every target this file is built for.
     89  */
     90 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
     91 
     92 	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
     93 	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
     94 	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
     95 	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
     96 /* In case the a0 > t9 don't use "pref 30" at all */
     97 	sgtu	v1, a0, t9	/* v1 = 1 when a0 is already past the limit */
     98 	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
     99 	nop
    100 /* otherwise, start with using pref30 */
    101 	pref	30, 64(a0)
    102 $loop16w:	/* one iteration copies 64 bytes as two groups of eight words */
    103 	pref	0, 96(a1)
    104 	lw	t0, 0(a1)
    105 	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
    106 	lw	t1, 4(a1)	/* delay slot: executed on both paths */
    107 	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
    108 $skip_pref30_96:
    109 	lw	t2, 8(a1)
    110 	lw	t3, 12(a1)
    111 	lw	t4, 16(a1)
    112 	lw	t5, 20(a1)
    113 	lw	t6, 24(a1)
    114 	lw	t7, 28(a1)
    115         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
    116 
    117 	sw	t0, 0(a0)
    118 	sw	t1, 4(a0)
    119 	sw	t2, 8(a0)
    120 	sw	t3, 12(a0)
    121 	sw	t4, 16(a0)
    122 	sw	t5, 20(a0)
    123 	sw	t6, 24(a0)
    124 	sw	t7, 28(a0)
    125 
    126 	lw	t0, 32(a1)
    127 	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
    128 	lw	t1, 36(a1)	/* delay slot: executed on both paths */
    129 	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
    130 $skip_pref30_128:
    131 	lw	t2, 40(a1)
    132 	lw	t3, 44(a1)
    133 	lw	t4, 48(a1)
    134 	lw	t5, 52(a1)
    135 	lw	t6, 56(a1)
    136 	lw	t7, 60(a1)
    137         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
    138 
    139 	sw	t0, 32(a0)
    140 	sw	t1, 36(a0)
    141 	sw	t2, 40(a0)
    142 	sw	t3, 44(a0)
    143 	sw	t4, 48(a0)
    144 	sw	t5, 52(a0)
    145 	sw	t6, 56(a0)
    146 	sw	t7, 60(a0)
    147 
    148 	addiu	a0, a0, 64	/* adding 64 to dest */
    149 	sgtu	v1, a0, t9	/* refresh the "skip pref 30" flag for the next pass */
    150 	bne	a0, a3, $loop16w
    151 	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
    152 	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
    153 
    154 /* Here we have src and dest word-aligned but less than 64-bytes to go */
    155 
    156 $chk8w:
    157 	pref 0, 0x0(a1)
    158 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
    159 				/* the t8 is the remainder count past 32-bytes */
    160 	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
    161 	 nop
    162 
    163 	lw	t0, 0(a1)
    164 	lw	t1, 4(a1)
    165 	lw	t2, 8(a1)
    166 	lw	t3, 12(a1)
    167 	lw	t4, 16(a1)
    168 	lw	t5, 20(a1)
    169 	lw	t6, 24(a1)
    170 	lw	t7, 28(a1)
    171 	addiu	a1, a1, 32
    172 
    173 	sw	t0, 0(a0)
    174 	sw	t1, 4(a0)
    175 	sw	t2, 8(a0)
    176 	sw	t3, 12(a0)
    177 	sw	t4, 16(a0)
    178 	sw	t5, 20(a0)
    179 	sw	t6, 24(a0)
    180 	sw	t7, 28(a0)
    181 	addiu	a0, a0, 32
    182 
    183 $chk1w:
    184 	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
    185 	beq	a2, t8, $last8	/* no whole words left */
    186 	subu	a3, t8, a2	/* delay slot: a3 is count of bytes in 1w chunks */
    187 	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
    188 
    189 /* copying in words (4-byte chunks) */
    190 $wordCopy_loop:
    191 	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
    192 	addiu	a1, a1, 4
    193 	addiu	a0, a0, 4
    194 	bne	a0, a3, $wordCopy_loop
    195 	sw	t3, -4(a0)	/* delay slot: store at the dst word just stepped past */
    196 
    197 /* For the last (<8) bytes */
    198 $last8:
    199 	blez	a2, leave	/* nothing left to copy */
    200 	addu	a3, a0, a2	/* delay slot: a3 is the dst end address (one past the last byte) */
    201 $last8loop:
    202 	lb	v1, 0(a1)
    203 	addiu	a1, a1, 1
    204 	addiu	a0, a0, 1
    205 	bne	a0, a3, $last8loop
    206 	sb	v1, -1(a0)	/* delay slot: store at the dst byte just stepped past */
    207 
    208 leave:	j	ra
    209 	nop
    210 
    211 /*
    212  * UNALIGNED case
         * src and dst differ in their low two address bits, so only dst can be
         * word-aligned; src words are assembled with LWHI/LWLO pairs.
    213  */
    214 
    215 $unaligned:
    216 	/* got here with a3="negu a0" */
    217 	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
    218 	beqz	a3, $ua_chk16w
    219 	subu	a2, a2, a3	/* delay slot: bytes left after initial a3 bytes */
    220 
    221 	LWHI	v1, 0(a1)
    222 	LWLO	v1, 3(a1)
    223 	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
    224 	SWHI	v1, 0(a0)
    225 	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
    226 
    227 $ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
    228 				/* t8 is the byte count after 64-byte chunks */
    229 	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
    230 				/* There will be at most 1 32-byte chunk after it */
    231 	subu	a3, a2, t8	/* delay slot: subtract from a2 the remainder */
    232                                 /* Here a3 counts bytes in 16w chunks */
    233 	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
    234 
    235 	addu	t0, a0, a2	/* t0 is the "past the end" address */
    236 
    237 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
    238 
    239 	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
    240 	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
    241 	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
    242 	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
    243 /* In case the a0 > t9 don't use "pref 30" at all */
    244 	sgtu	v1, a0, t9	/* v1 = 1 when a0 is already past the limit */
    245 	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
    246 	nop
    247 /* otherwise,  start with using pref30 */
    248 	pref	30, 64(a0)
    249 $ua_loop16w:	/* one iteration copies 64 bytes as two groups of eight words */
    250 	pref	0, 96(a1)
    251 	LWHI	t0, 0(a1)
    252 	LWLO	t0, 3(a1)
    253 	LWHI	t1, 4(a1)
    254 	bgtz	v1, $ua_skip_pref30_96	/* skip "pref 30, 96(a0)" */
    255 	LWLO	t1, 7(a1)	/* delay slot: executed on both paths */
    256 	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
    257 $ua_skip_pref30_96:
    258 	LWHI	t2, 8(a1)
    259 	LWLO	t2, 11(a1)
    260 	LWHI	t3, 12(a1)
    261 	LWLO	t3, 15(a1)
    262 	LWHI	t4, 16(a1)
    263 	LWLO	t4, 19(a1)
    264 	LWHI	t5, 20(a1)
    265 	LWLO	t5, 23(a1)
    266 	LWHI	t6, 24(a1)
    267 	LWLO	t6, 27(a1)
    268 	LWHI	t7, 28(a1)
    269 	LWLO	t7, 31(a1)
    270         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
    271 
    272 	sw	t0, 0(a0)
    273 	sw	t1, 4(a0)
    274 	sw	t2, 8(a0)
    275 	sw	t3, 12(a0)
    276 	sw	t4, 16(a0)
    277 	sw	t5, 20(a0)
    278 	sw	t6, 24(a0)
    279 	sw	t7, 28(a0)
    280 
    281 	LWHI	t0, 32(a1)
    282 	LWLO	t0, 35(a1)
    283 	LWHI	t1, 36(a1)
    284 	bgtz	v1, $ua_skip_pref30_128	/* skip "pref 30, 128(a0)" */
    285 	LWLO	t1, 39(a1)	/* delay slot: executed on both paths */
    286 	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
    287 $ua_skip_pref30_128:
    288 	LWHI	t2, 40(a1)
    289 	LWLO	t2, 43(a1)
    290 	LWHI	t3, 44(a1)
    291 	LWLO	t3, 47(a1)
    292 	LWHI	t4, 48(a1)
    293 	LWLO	t4, 51(a1)
    294 	LWHI	t5, 52(a1)
    295 	LWLO	t5, 55(a1)
    296 	LWHI	t6, 56(a1)
    297 	LWLO	t6, 59(a1)
    298 	LWHI	t7, 60(a1)
    299 	LWLO	t7, 63(a1)
    300         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
    301 
    302 	sw	t0, 32(a0)
    303 	sw	t1, 36(a0)
    304 	sw	t2, 40(a0)
    305 	sw	t3, 44(a0)
    306 	sw	t4, 48(a0)
    307 	sw	t5, 52(a0)
    308 	sw	t6, 56(a0)
    309 	sw	t7, 60(a0)
    310 
    311 	addiu	a0, a0, 64	/* adding 64 to dest */
    312 	sgtu	v1, a0, t9	/* refresh the "skip pref 30" flag for the next pass */
    313 	bne	a0, a3, $ua_loop16w
    314 	addiu	a1, a1, 64	/* delay slot: adding 64 to src */
    315 	move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
    316 
    317 /* Here dest is word-aligned (src is not) and less than 64 bytes remain */
    318 
    319 $ua_chk8w:
    320 	pref 0, 0x0(a1)
    321 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
    322 				/* the t8 is the remainder count */
    323 	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
    324 /* no nop here: the LWHI below fills the delay slot; t0 is not read on the taken path */
    325 	LWHI	t0, 0(a1)
    326 	LWLO	t0, 3(a1)
    327 	LWHI	t1, 4(a1)
    328 	LWLO	t1, 7(a1)
    329 	LWHI	t2, 8(a1)
    330 	LWLO	t2, 11(a1)
    331 	LWHI	t3, 12(a1)
    332 	LWLO	t3, 15(a1)
    333 	LWHI	t4, 16(a1)
    334 	LWLO	t4, 19(a1)
    335 	LWHI	t5, 20(a1)
    336 	LWLO	t5, 23(a1)
    337 	LWHI	t6, 24(a1)
    338 	LWLO	t6, 27(a1)
    339 	LWHI	t7, 28(a1)
    340 	LWLO	t7, 31(a1)
    341 	addiu	a1, a1, 32
    342 
    343 	sw	t0, 0(a0)
    344 	sw	t1, 4(a0)
    345 	sw	t2, 8(a0)
    346 	sw	t3, 12(a0)
    347 	sw	t4, 16(a0)
    348 	sw	t5, 20(a0)
    349 	sw	t6, 24(a0)
    350 	sw	t7, 28(a0)
    351 	addiu	a0, a0, 32
    352 
    353 $ua_chk1w:
    354 	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
    355 	beq	a2, t8, $ua_smallCopy	/* no whole words left */
    356 	subu	a3, t8, a2	/* delay slot: a3 is count of bytes in 1w chunks */
    357 	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
    358 
    359 /* copying in words (4-byte chunks) */
    360 $ua_wordCopy_loop:
    361 	LWHI	v1, 0(a1)
    362 	LWLO	v1, 3(a1)
    363 	addiu	a1, a1, 4
    364 	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
    365 	bne	a0, a3, $ua_wordCopy_loop
    366 	sw	v1, -4(a0)	/* delay slot: store at the dst word just stepped past */
    367 
    368 /* Now less than 4 bytes (value in a2) left to copy */
    369 $ua_smallCopy:
    370 	beqz	a2, leave	/* nothing left to copy */
    371 	addu	a3, a0, a2	/* delay slot: a3 is the dst end address (one past the last byte) */
    372 $ua_smallCopy_loop:
    373 	lb	v1, 0(a1)
    374 	addiu	a1, a1, 1
    375 	addiu	a0, a0, 1
    376 	bne	a0, a3, $ua_smallCopy_loop
    377 	sb	v1, -1(a0)	/* delay slot: store at the dst byte just stepped past */
    378 
    379 	j	ra
    380 	nop
    381 
    382 END(pixman_mips_fast_memcpy)
    383