/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#define FUNC(x) 		\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
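/* The reason, as far as the control flow below shows: the forward copy
 * enters MOVE_BIGALIGNCHUNK at "82f + 4" with the first ldd of
 * MOVE_BIGCHUNK sitting in the branch delay slot, so that shared first
 * instruction must do the same work for both variants.
 */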
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];

/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	st	%t0, [%dst - (offset) - 0x20]; \
	st	%t1, [%dst - (offset) - 0x1c]; \
	st	%t2, [%dst - (offset) - 0x18]; \
	st	%t3, [%dst - (offset) - 0x14]; \
	st	%t4, [%dst - (offset) - 0x10]; \
	st	%t5, [%dst - (offset) - 0x0c]; \
	st	%t6, [%dst - (offset) - 0x08]; \
	st	%t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	std	%t0, [%dst - (offset) - 0x20]; \
	std	%t2, [%dst - (offset) - 0x18]; \
	std	%t4, [%dst - (offset) - 0x10]; \
	std	%t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src + (offset) + 0x00], %t0; \
	ldub	[%src + (offset) + 0x01], %t1; \
	stb	%t0, [%dst + (offset) + 0x00]; \
	stb	%t1, [%dst + (offset) + 0x01];

#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t5; \
	srl	%t1, shir, %t6; \
	sll	%t0, shil, %t0; \
	or	%t5, %prev, %t5; \
	sll	%t1, shil, %prev; \
	or	%t6, %t0, %t0; \
	srl	%t2, shir, %t1; \
	srl	%t3, shir, %t6; \
	sll	%t2, shil, %t2; \
	or	%t1, %prev, %t1; \
	std	%t4, [%dst + (offset) + (offset2) - 0x04]; \
	std	%t0, [%dst + (offset) + (offset2) + 0x04]; \
	sll	%t3, shil, %prev; \
	or	%t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t4;	\
	srl	%t1, shir, %t5;	\
	sll	%t0, shil, %t6;	\
	or	%t4, %prev, %t0; \
	sll	%t1, shil, %prev; \
	or	%t5, %t6, %t1; \
	srl	%t2, shir, %t4;	\
	srl	%t3, shir, %t5;	\
	sll	%t2, shil, %t6; \
	or	%t4, %prev, %t2; \
	sll	%t3, shil, %prev; \
	or	%t5, %t6, %t3; \
	std	%t0, [%dst + (offset) + (offset2) + 0x00]; \
	std	%t2, [%dst + (offset) + (offset2) + 0x08];

	.text
	.align	4

0:
	retl
	 nop		! Only bcopy returns here and it returns void...

#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
#endif
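/* memmove: if dst <= src, or if the regions do not overlap at all
 * (src + len <= dst), fall into the forward memcpy path; otherwise
 * copy backwards, a byte at a time.  %g7 preserves dst for the
 * return value throughout.
 */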
FUNC(memmove)
	cmp		%o0, %o1
	mov		%o0, %g7
	bleu		9f
	 sub		%o0, %o1, %o4

	add		%o1, %o2, %o3
	cmp		%o3, %o0
	bleu		0f
	 andcc		%o4, 3, %o5

	add		%o1, %o2, %o1
	add		%o0, %o2, %o0
	sub		%o1, 1, %o1
	sub		%o0, 1, %o0

1:	/* reverse_bytes */

	ldub		[%o1], %o4
	subcc		%o2, 1, %o2
	stb		%o4, [%o0]
	sub		%o1, 1, %o1
	bne		1b
	 sub		%o0, 1, %o0

	retl
	 mov		%g7, %o0

/* NOTE: This code is executed only in the cases where
 *	 %src (= %o1) & 3 != 0.  We need to align %src to 4,
 *	 so for the possible values of (%src & 3):
 *	 1 we need to do ldub,lduh
 *	 2 lduh
 *	 3 just ldub
 *	 so even if it looks weird, the branches
 *	 are correct here. -jj
 */
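/* Roughly equivalent C, as a sketch only (big-endian; u16 access is
 * legal once %src is 2-byte aligned):
 *
 *	switch ((unsigned long)src & 3) {
 *	case 3:	*dst++ = *src++; len -= 1; break;	// just ldub
 *	case 1:	*dst++ = *src++; len -= 1;		// ldub, then...
 *	case 2:	*(u16 *)dst = *(u16 *)src;		// ...lduh
 *		dst += 2; src += 2; len -= 2; break;
 *	}
 */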
78:	/* dword_align */

	andcc		%o1, 1, %g0
	be		4f
	 andcc		%o1, 2, %g0

	ldub		[%o1], %g2
	add		%o1, 1, %o1
	stb		%g2, [%o0]
	sub		%o2, 1, %o2
	bne		3f
	 add		%o0, 1, %o0
4:
	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	sub		%o2, 2, %o2
	b		3f
	 add		%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */

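/* If (dst - src) & 3 is nonzero, src and dst can never be word aligned
 * at the same time, so the shifting non_aligned code at 86f is used.
 */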
	sub		%o0, %o1, %o4
	mov		%o0, %g7
9:
	andcc		%o4, 3, %o5
0:
	bne		86f
	 cmp		%o2, 15

	bleu		90f
	 andcc		%o1, 3, %g0

	bne		78b
3:
	 andcc		%o1, 4, %g0

	be		2f
	 mov		%o2, %g1

	ld		[%o1], %o4
	sub		%g1, 4, %g1
	st		%o4, [%o0]
	add		%o1, 4, %o1
	add		%o0, 4, %o0
2:
	andcc		%g1, 0xffffff80, %g0
	be		3f
	 andcc		%o0, 4, %g0

	be		82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub		%g1, 128, %g1
	add		%o1, 128, %o1
	cmp		%g1, 128
	bge		5b
	 add		%o0, 128, %o0
3:
	andcc		%g1, 0x70, %g4
	be		80f
	 andcc		%g1, 8, %g0

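/* Computed jump into the MOVE_LASTCHUNK table at 79: each expansion is
 * 6 insns (24 bytes) and copies 16 bytes, so the code offset back from
 * 80f is %g4 * 3/2 (computed into %o4 below).  %o1/%o0 are advanced by
 * %g4 first because the table copies with negative offsets.
 */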
	sethi		%hi(80f), %o5
	srl		%g4, 1, %o4
	add		%g4, %o4, %o4
	add		%o1, %g4, %o1
	sub		%o5, %o4, %o5
	jmpl		%o5 + %lo(80f), %g0
	 add		%o0, %g4, %o0

79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be		81f
	 andcc		%g1, 4, %g0

	ldd		[%o1], %g2
	add		%o0, 8, %o0
	st		%g2, [%o0 - 0x08]
	add		%o1, 8, %o1
	st		%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be		1f
	 andcc		%g1, 2, %g0

	ld		[%o1], %g2
	add		%o1, 4, %o1
	st		%g2, [%o0]
	add		%o0, 4, %o0
1:
	be		1f
	 andcc		%g1, 1, %g0

	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	add		%o0, 2, %o0
1:
	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc		%g1, 128, %g1
	add		%o1, 128, %o1
	cmp		%g1, 128
	bge		82b
	 add		%o0, 128, %o0

	andcc		%g1, 0x70, %g4
	be		84f
	 andcc		%g1, 8, %g0

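/* Same trick as at 80f, but each MOVE_LASTALIGNCHUNK is 4 insns
 * (16 bytes) per 16 bytes copied, so the offset back from 84f is
 * simply %g4.
 */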
	sethi		%hi(84f), %o5
	add		%o1, %g4, %o1
	sub		%o5, %g4, %o5
	jmpl		%o5 + %lo(84f), %g0
	 add		%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be		85f
	 andcc		%g1, 4, %g0

	ldd		[%o1], %g2
	add		%o0, 8, %o0
	std		%g2, [%o0 - 0x08]
	add		%o1, 8, %o1
85:	/* amemcpy_last7 */
	be		1f
	 andcc		%g1, 2, %g0

	ld		[%o1], %g2
	add		%o1, 4, %o1
	st		%g2, [%o0]
	add		%o0, 4, %o0
1:
	be		1f
	 andcc		%g1, 1, %g0

	lduh		[%o1], %g2
	add		%o1, 2, %o1
	sth		%g2, [%o0]
	add		%o0, 2, %o0
1:
	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

86:	/* non_aligned */
	cmp		%o2, 6
	bleu		88f
	 nop

	save		%sp, -96, %sp
	andcc		%i0, 3, %g0
	be		61f
	 andcc		%i0, 1, %g0
	be		60f
	 andcc		%i0, 2, %g0

	ldub		[%i1], %g5
	add		%i1, 1, %i1
	stb		%g5, [%i0]
	sub		%i2, 1, %i2
	bne		61f
	 add		%i0, 1, %i0
60:
	ldub		[%i1], %g3
	add		%i1, 2, %i1
	stb		%g3, [%i0]
	sub		%i2, 2, %i2
	ldub		[%i1 - 1], %g3
	add		%i0, 2, %i0
	stb		%g3, [%i0 - 1]
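/* Destination is now word aligned; misaligned source words are merged
 * with shifts.  On this big-endian CPU each output word is
 * (w << shl) | (w_next >> shr), where shl = 8 * (%src & 3) (%g4 below)
 * and shr = 32 - shl (%l0).  A sketch in C:
 *
 *	while (words--) {
 *		*out++ = (w0 << shl) | (w1 >> shr);
 *		w0 = w1;
 *		w1 = *in++;
 *	}
 *
 * The branches below just pick the right entry point into the
 * four-word unrolled loop based on (len & 0xc).
 */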
61:
	and		%i1, 3, %g2
	and		%i2, 0xc, %g3
	and		%i1, -4, %i1
	cmp		%g3, 4
	sll		%g2, 3, %g4
	mov		32, %g2
	be		4f
	 sub		%g2, %g4, %l0

	blu		3f
	 cmp		%g3, 0x8

	be		2f
	 srl		%i2, 2, %g3

	ld		[%i1], %i3
	add		%i0, -8, %i0
	ld		[%i1 + 4], %i4
	b		8f
	 add		%g3, 1, %g3
2:
	ld		[%i1], %i4
	add		%i0, -12, %i0
	ld		[%i1 + 4], %i5
	add		%g3, 2, %g3
	b		9f
	 add		%i1, -4, %i1
3:
	ld		[%i1], %g1
	add		%i0, -4, %i0
	ld		[%i1 + 4], %i3
	srl		%i2, 2, %g3
	b		7f
	 add		%i1, 4, %i1
4:
	ld		[%i1], %i5
	cmp		%i2, 7
	ld		[%i1 + 4], %g1
	srl		%i2, 2, %g3
	bleu		10f
	 add		%i1, 8, %i1

	ld		[%i1], %i3
	add		%g3, -1, %g3
5:
	sll		%i5, %g4, %g2
	srl		%g1, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0]
7:
	ld		[%i1 + 4], %i4
	sll		%g1, %g4, %g2
	srl		%i3, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0 + 4]
8:
	ld		[%i1 + 8], %i5
	sll		%i3, %g4, %g2
	srl		%i4, %l0, %g5
	or		%g2, %g5, %g2
	st		%g2, [%i0 + 8]
9:
	ld		[%i1 + 12], %g1
	sll		%i4, %g4, %g2
	srl		%i5, %l0, %g5
	addcc		%g3, -4, %g3
	or		%g2, %g5, %g2
	add		%i1, 16, %i1
	st		%g2, [%i0 + 12]
	add		%i0, 16, %i0
	bne,a		5b
	 ld		[%i1], %i3
10:
	sll		%i5, %g4, %g2
	srl		%g1, %l0, %g5
	srl		%l0, 3, %g3
	or		%g2, %g5, %g2
	sub		%i1, %g3, %i1
	andcc		%i2, 2, %g0
	st		%g2, [%i0]
	be		1f
	 andcc		%i2, 1, %g0

	ldub		[%i1], %g2
	add		%i1, 2, %i1
	stb		%g2, [%i0 + 4]
	add		%i0, 2, %i0
	ldub		[%i1 - 1], %g2
	stb		%g2, [%i0 + 3]
1:
	be		1f
	 nop
	ldub		[%i1], %g2
	stb		%g2, [%i0 + 4]
1:
	ret
	 restore	%g7, %g0, %o0

88:	/* short_end */

	and		%o2, 0xe, %o3
20:
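/* Computed jump into the MOVE_SHORTCHUNK table: each expansion is
 * 4 insns (16 bytes) and copies 2 bytes, so the offset back from 89f
 * is %o3 << 3.
 */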
	sethi		%hi(89f), %o5
	sll		%o3, 3, %o4
	add		%o0, %o3, %o0
	sub		%o5, %o4, %o5
	add		%o1, %o3, %o1
	jmpl		%o5 + %lo(89f), %g0
	 andcc		%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be		1f
	 nop

	ldub		[%o1], %g2
	stb		%g2, [%o0]
1:
	retl
	 mov		%g7, %o0

90:	/* short_aligned_end */
	bne		88b
	 andcc		%o2, 8, %g0

	be		1f
	 andcc		%o2, 4, %g0

	ld		[%o1 + 0x00], %g2
	ld		[%o1 + 0x04], %g3
	add		%o1, 8, %o1
	st		%g2, [%o0 + 0x00]
	st		%g3, [%o0 + 0x04]
	add		%o0, 8, %o0
1:
	b		81b
	 mov		%o2, %g1