Home | History | Annotate | Download | only in lib
      1 /*
      2  * "memcpy" implementation of SuperH
      3  *
      4  * Copyright (C) 1999  Niibe Yutaka
      5  * Copyright (c) 2002  STMicroelectronics Ltd
      6  *   Modified from memcpy.S and micro-optimised for SH4
      7  *   Stuart Menefy (stuart.menefy (at) st.com)
      8  *
      9  */
     10 #include <linux/linkage.h>
     11 
     12 /*
     13  * void *memcpy(void *dst, const void *src, size_t n);
     14  *
     15  * It is assumed that there is no overlap between src and dst.
     16  * If there is an overlap, then the results are undefined.
     17  */
     18 
     19 	!
     20 	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
     21 	!
     22 
     23 	! Size is 16 or greater, and may have trailing bytes
     24 
     25 	.balign	32
     26 .Lcase1:
    /*
     * Bulk copy for the case where the low two bits of (src - dst) are 01.
     * The dispatch code in memcpy branches here with r0 = long-word aligned
     * end of the part of dst still to be written, r4 = start of dst and
     * r5 = src - dst, so that r0+r5 is the matching source position.  The
     * copy runs backwards: every store pre-decrements r0.
     *
     * r5 is first biased by -1, which makes @(r0,r5) the long word that
     * holds the last source byte still to be copied (loaded into r7), and
     * then by a further -4 so that each iteration loads the next lower long
     * word into r1.  r2 = r4 + 7 is the loop bound: the loop repeats while
     * r0 > r2 before the store.
     */
     27 	! Read a long word and write a long word at once
     28 	! At the start of each iteration, r7 contains last long load
     29 	add	#-1,r5		!  79 EX
     30 	mov	r4,r2		!   5 MT (0 cycles latency)
     31 
     32 	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
     33 	add	#-4,r5		!  50 EX
     34 
     35 	add	#7,r2		!  79 EX
     36 	!
     37 #ifdef CONFIG_CPU_LITTLE_ENDIAN
     38 	! 6 cycles, 4 bytes per iteration
    /* Value stored: (r7 << 24) | (r1 >> 8); r7 then takes over r1. */
     39 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
     40 	mov	r7, r3		!   5 MT (latency=0)	! RQPO
     41 
     42 	cmp/hi	r2,r0		!  57 MT
     43 	shll16	r3		! 103 EX
     44 
     45 	mov	r1,r6		!   5 MT (latency=0)
     46 	shll8	r3		! 102 EX		! Oxxx
     47 
     48 	shlr8	r6		! 106 EX		! xNML
     49 	mov	r1, r7		!   5 MT (latency=0)
     50 
     51 	or	r6,r3		!  82 EX		! ONML
     52 	bt/s	3b		! 109 BR
     53 
     54 	 mov.l	r3,@-r0		!  30 LS
     55 #else
    /* Value stored: (r7 >> 24) | (r1 << 8); r7 then takes over r1. */
     56 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
     57 	mov	r7,r3		!   5 MT (latency=0)	! OPQR
     58 
     59 	cmp/hi	r2,r0		!  57 MT
     60 	shlr16	r3		! 107 EX
     61 
     62 	shlr8	r3		! 106 EX		! xxxO
     63 	mov	r1,r6		!   5 MT (latency=0)
     64 
     65 	shll8	r6		! 102 EX		! LMNx
     66 	mov	r1,r7		!   5 MT (latency=0)
     67 
     68 	or	r6,r3		!  82 EX		! LMNO
     69 	bt/s	3b		! 109 BR
     70 
     71 	 mov.l	r3,@-r0		!  30 LS
     72 #endif
     73 	! Finally, copy a byte at once, if necessary
    /*
     * r5 + 4 leaves r5 = (src - dst) - 1, so @(r0,r5) is the byte just below
     * the current source position.  r2 - 6 = r4 + 1 bounds the byte loop,
     * which is skipped altogether when r0 has already reached r4.
     */
     74 
     75 	add	#4,r5		!  50 EX
     76 	cmp/eq	r4,r0		!  54 MT
     77 
     78 	add	#-6,r2		!  50 EX
     79 	bt	9f		! 109 BR
     80 
     81 8:	cmp/hi	r2,r0		!  57 MT
     82 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
     83 
     84 	bt/s	8b		! 109 BR
     85 
     86 	 mov.b	r1,@-r0		!  29 LS
     87 
     88 9:	rts
     89 	 nop
     90 
     91 
     92 	!
     93 	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
     94 	!
     95 
     96 	! Size is 16 or greater, and may have trailing bytes
     97 
     98 	.balign	32
     99 .Lcase3:
    /*
     * Bulk copy for the case where the low two bits of (src - dst) are 11.
     * The dispatch code in memcpy branches here with r0 = long-word aligned
     * end of the part of dst still to be written, r4 = start of dst and
     * r5 = src - dst.  The copy runs backwards: every store pre-decrements
     * r0.
     *
     * r5 is first biased by -3, which makes @(r0,r5) the long word that
     * holds the last source byte still to be copied (loaded into r7), and
     * then by a further -4 for the loads inside the loop.  r2 = r4 + 7 is
     * the loop bound: the loop repeats while r0 > r2 before the store.
     */
    100 	! Read a long word and write a long word at once
    101 	! At the start of each iteration, r7 contains last long load
    102 	add	#-3,r5		! 79 EX
    103 	mov	r4,r2		!  5 MT (0 cycles latency)
    104 
    105 	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
    106 	add	#-4,r5		! 50 EX
    107 
    108 	add	#7,r2		!  79 EX
    109 	!
    110 #ifdef CONFIG_CPU_LITTLE_ENDIAN
    111 	! 6 cycles, 4 bytes per iteration
    /* Value stored: (r7 << 8) | (r1 >> 24); r7 then takes over r1. */
    112 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
    113 	mov	r7, r3		!   5 MT (latency=0)	! RQPO
    114 
    115 	cmp/hi	r2,r0		!  57 MT
    116 	shll8	r3		! 102 EX		! QPOx
    117 
    118 	mov	r1,r6		!   5 MT (latency=0)
    119 	shlr16	r6		! 107 EX
    120 
    121 	shlr8	r6		! 106 EX		! xxxN
    122 	mov	r1, r7		!   5 MT (latency=0)
    123 
    124 	or	r6,r3		!  82 EX		! QPON
    125 	bt/s	3b		! 109 BR
    126 
    127 	 mov.l	r3,@-r0		!  30 LS
    128 #else
    /*
     * Value stored: (old r7 >> 8) | (new r7 << 24).  Unlike the other loops
     * this one loads straight into r7 after the old value was copied to r3.
     */
    129 3:	mov	r7,r3		! OPQR
    130 	shlr8	r3		! xOPQ
    131 	mov.l	@(r0,r5),r7	! KLMN
    132 	mov	r7,r6
    133 	shll16	r6
    134 	shll8	r6		! Nxxx
    135 	or	r6,r3		! NOPQ
    136 	cmp/hi	r2,r0
    137 	bt/s	3b
    138 	 mov.l	r3,@-r0
    139 #endif
    140 
    141 	! Finally, copy a byte at once, if necessary
    /*
     * r5 + 6 leaves r5 = (src - dst) - 1, so @(r0,r5) is the byte just below
     * the current source position.  r2 - 6 = r4 + 1 bounds the byte loop,
     * which is skipped altogether when r0 has already reached r4.
     */
    142 
    143 	add	#6,r5		!  50 EX
    144 	cmp/eq	r4,r0		!  54 MT
    145 
    146 	add	#-6,r2		!  50 EX
    147 	bt	9f		! 109 BR
    148 
    149 8:	cmp/hi	r2,r0		!  57 MT
    150 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
    151 
    152 	bt/s	8b		! 109 BR
    153 
    154 	 mov.b	r1,@-r0		!  29 LS
    155 
    156 9:	rts
    157 	 nop
    158 
    159 ENTRY(memcpy)
    /*
     * Register use on entry, as relied on by the code below: r4 = dst,
     * r5 = src, r6 = n.  All copying is done backwards from the end of the
     * buffers, using the invariant established here:
     *   r0 = dst + n, pre-decremented by every store
     *   r5 = src - dst, so that @(r0,r5) addresses the matching source data
     * r4 is never modified and marks the point at which the copy is done.
     */
    160 
    161 	! Calculate the invariants which will be used in the remainder
    162 	! of the code:
    163 	!
    164 	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
    165 	!	         [ ...  ]                 [ ...  ]
    166 	!	           :                        :
    167 	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
    168 	!
    169 	!
    170 
    171 	! Short circuit the common case of src, dst and len being 32 bit aligned
    172 	! and test for zero length move
    /*
     * r0 = n | dst | src.  "tst r6,r6" sets T when n == 0 and the first bt/s
     * branches to 99f on that.  Its delay slot "tst #3,r0" then sets T when
     * dst, src and n are all multiples of four.  The mov, add and mov that
     * follow leave T untouched, so the second bt/s acts on that alignment
     * result.  Its delay slot (r5 = src - dst) is executed whether or not
     * the branch to .Lcase00 is taken.  r1 = 16 is the "small copy"
     * threshold used by the code that follows when the branch is not taken.
     */
    173 
    174 	mov	r6, r0		!   5 MT (0 cycle latency)
    175 	or	r4, r0		!  82 EX
    176 
    177 	or	r5, r0		!  82 EX
    178 	tst	r6, r6		!  86 MT
    179 
    180 	bt/s	99f		! 111 BR		(zero len)
    181 	 tst	#3, r0		!  87 MT
    182 
    183 	mov	r4, r0		!   5 MT (0 cycle latency)
    184 	add	r6, r0		!  49 EX
    185 
    186 	mov	#16, r1		!   6 EX
    187 	bt/s	.Lcase00	! 111 BR		(aligned)
    188 
    189 	 sub	r4, r5		!  75 EX
    190 
    191 	! Arguments are not nicely long word aligned or zero len.
    192 	! Check for small copies, and if so do a simple byte at a time copy.
    193 	!
    194 	! Deciding on an exact value of 'small' is not easy, as the point at which
    195 	! using the optimised routines become worthwhile varies (these are the
    196 	! cycle counts for different sizes using byte-at-a-time vs. optimised):
    197 	!	size	byte-at-time	long	word	byte
    198 	!	16	42		39-40	46-50	50-55
    199 	!	24	58		43-44	54-58	62-67
    200 	!	36	82		49-50	66-70	80-85
    201 	! However the penalty for getting it 'wrong' is much higher for long word
    202 	! aligned data (and this is more common), so use a value of 16.
    203 
    /*
     * Arguments are not all long-word aligned: copy fewer than 16 bytes one
     * byte at a time.  "cmp/gt r6,r1" sets T when 16 > n; otherwise bf/s
     * goes to 6f.  r5 is biased by -1 so that @(r0,r5) is the source byte
     * matching the next "mov.b ...,@-r0" store; that bias is still in place
     * at 6f.  r3 ends up as r5 - 1 and addresses the byte below that one.
     *
     * "shlr r6" halves the count and leaves its low bit in T.  For an even
     * n the two-bytes-per-iteration loop is entered at 4f.  For an odd n
     * one byte is stored first (in the delay slot of the bt/s) and the code
     * returns at 98f if that was the only one (r6 now zero).  "dt r6"
     * counts the remaining pairs down.
     */
    204 	cmp/gt	r6,r1		!  56 MT
    205 
    206 	add	#-1,r5		!  50 EX
    207 	bf/s	6f		! 108 BR		(not small)
    208 
    209 	 mov	r5, r3		!   5 MT (latency=0)
    210 	shlr	r6		! 104 EX
    211 
    212 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
    213 	bf/s	4f		! 111 BR
    214 
    215 	 add	#-1,r3		!  50 EX
    216 	tst	r6, r6		!  86 MT
    217 
    218 	bt/s	98f		! 110 BR
    219 	 mov.b	r1,@-r0		!  29 LS
    220 
    221 	! 4 cycles, 2 bytes per iteration
    222 3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
    223 
    224 4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
    225 	dt	r6		!  67 EX
    226 
    227 	mov.b	r1,@-r0		!  29 LS
    228 	bf/s	3b		! 111 BR
    229 
    230 	 mov.b	r2,@-r0		!  29 LS
    231 98:
    232 	rts
    233 	 nop
    234 
    /* Zero-length exit taken from the entry test: return with r0 = r4. */
    235 99:	rts
    236 	 mov	r4, r0
    237 
    238 	! Size is not small, so it's worthwhile looking for optimisations.
    239 	! First align destination to a long word boundary.
    240 	!
    241 	! r5 = normal value -1
    242 
    /*
     * Copy single bytes until r0 (the end of the part of dst still to be
     * written) is long-word aligned.  r3 = r0 & 3 is the number of bytes to
     * move and r6 is reduced by one for each of them.  The loop is skipped
     * when r0 is already aligned.
     */
    243 6:	tst	#3, r0		!  87 MT
    244         mov	#3, r3		!   6 EX
    245 
    246 	bt/s	2f		! 111 BR
    247 	 and	r0,r3		!  78 EX
    248 
    249 	! 3 cycles, 1 byte per iteration
    250 1:	dt	r3		!  67 EX
    251 	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
    252 
    253 	add	#-1, r6		!  79 EX
    254 	bf/s	1b		! 109 BR
    255 
    256 	 mov.b	r1,@-r0		!  28 LS
    257 
    258 2:	add	#1, r5		!  79 EX
    259 
    260 	! Now select the appropriate bulk transfer code based on relative
    261 	! alignment of src and dst.
    /*
     * r5 is back to src - dst here.  "tst #imm,r0" only operates on r0, so
     * r0 is parked in r3 while the low bits of r5 are tested, and it is put
     * back in the delay slot of each dispatch branch.  Dispatch:
     *   bit 0 of r5 set:   bit 1 clear -> .Lcase1, bit 1 set -> .Lcase3
     *   bit 0 of r5 clear: bit 1 clear -> .Lcase0 (r6 < 64) or .Lcase0b
     *                      bit 1 set   -> .Lcase2 (r6 < 64) or .Lcase2b
     * The size test is the signed "cmp/ge r7,r6" with r7 = 64.
     */
    262 
    263 	mov	r0, r3		!   5 MT (latency=0)
    264 
    265 	mov	r5, r0		!   5 MT (latency=0)
    266 	tst	#1, r0		!  87 MT
    267 
    268 	bf/s	1f		! 111 BR
    269 	 mov	#64, r7		!   6 EX
    270 
    271 	! bit 0 clear
    272 
    273 	cmp/ge	r7, r6		!  55 MT
    274 
    275 	bt/s	2f		! 111 BR
    276 	 tst	#2, r0		!  87 MT
    277 
    278 	! small
    279 	bt/s	.Lcase0
    280 	 mov	r3, r0
    281 
    282 	bra	.Lcase2
    283 	 nop
    284 
    285 	! big
    286 2:	bt/s	.Lcase0b
    287 	 mov	r3, r0
    288 
    289 	bra	.Lcase2b
    290 	 nop
    291 
    292 	! bit 0 set
    293 1:	tst	#2, r0		! 87 MT
    294 
    295 	bt/s	.Lcase1
    296 	 mov	r3, r0
    297 
    298 	bra	.Lcase3
    299 	 nop
    300 
    301 
    302 	!
    303 	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
    304 	!
    305 
    306 	! src, dst and size are all long word aligned
    307 	! size is non-zero
    308 
    309 	.balign	32
    310 .Lcase00:
    /*
     * Reached from the entry test with dst, src and n all multiples of four
     * and n != 0; r0 = dst + n and r5 = src - dst.  "cmp/gt r6,r1" sets T
     * when 64 > n.  When n >= 64 the plain bf goes to .Lcase00b with r5
     * already reduced by 4; the shlr2 after it is not a delay slot and is
     * skipped on that path.
     *
     * Otherwise r6 becomes the long-word count (n >> 2); "shlr r6" halves it
     * again and leaves the odd/even bit in T.  @(r0,r5) is the long word
     * just below the source position and @(r0,r3), with r3 = r5 - 4, the
     * one below that.  An odd count stores one long word first and returns
     * at 5f if it was the only one.  "dt r6" counts the pairs down.
     */
    311 	mov	#64, r1		!   6 EX
    312 	mov	r5, r3		!   5 MT (latency=0)
    313 
    314 	cmp/gt	r6, r1		!  56 MT
    315 	add	#-4, r5		!  50 EX
    316 
    317 	bf	.Lcase00b	! 108 BR		(big loop)
    318 	shlr2	r6		! 105 EX
    319 
    320 	shlr	r6		! 104 EX
    321 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    322 
    323 	bf/s	4f		! 111 BR
    324 	 add	#-8, r3		!  50 EX
    325 
    326 	tst	r6, r6		!  86 MT
    327 	bt/s	5f		! 110 BR
    328 
    329 	 mov.l	r1,@-r0		!  30 LS
    330 
    331 	! 4 cycles, 2 long words per iteration
    332 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    333 
    334 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
    335 	dt	r6		!  67 EX
    336 
    337 	mov.l	r1, @-r0	!  30 LS
    338 	bf/s	3b		! 109 BR
    339 
    340 	 mov.l	r2, @-r0	!  30 LS
    341 
    342 5:	rts
    343 	 nop
    344 
    345 
    346 	! Size is 16 or greater and less than 64, but may have trailing bytes
    347 
    348 	.balign	32
    349 .Lcase0:
    /*
     * Entered from the dispatch code with r0 long-word aligned, the low two
     * bits of r5 (= src - dst) clear and r6 (bytes left) below 64.
     * r5 is biased by -4 and r3 = r5 - 4, so @(r0,r5) and @(r0,r3) are the
     * two long words just below the current source position.  If bit 2 of
     * r6 is set, a single long word is stored before the loop is entered.
     * r7 = r4 + 11 is the loop bound: the loop repeats while r0 > r7 before
     * its two stores.
     */
    350 	add	#-4, r5		!  50 EX
    351 	mov	r4, r7		!   5 MT (latency=0)
    352 
    353 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    354 	mov	#4, r2		!   6 EX
    355 
    356 	add	#11, r7		!  50 EX
    357 	tst	r2, r6		!  86 MT
    358 
    359 	mov	r5, r3		!   5 MT (latency=0)
    360 	bt/s	4f		! 111 BR
    361 
    362 	 add	#-4, r3		!  50 EX
    363 	mov.l	r1,@-r0		!  30 LS
    364 
    365 	! 4 cycles, 2 long words per iteration
    366 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    367 
    368 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
    369 	cmp/hi	r7, r0
    370 
    371 	mov.l	r1, @-r0	!  30 LS
    372 	bt/s	3b		! 109 BR
    373 
    374 	 mov.l	r2, @-r0	!  30 LS
    375 
    376 	! Copy the final 0-3 bytes
    /*
     * r5 + 3 leaves r5 = (src - dst) - 1 for byte access.  r7 - 10 = r4 + 1
     * bounds the byte loop, which is skipped when r0 == r4.
     */
    377 
    378 	add	#3,r5		!  50 EX
    379 
    380 	cmp/eq	r0, r4		!  54 MT
    381 	add	#-10, r7	!  50 EX
    382 
    383 	bt	9f		! 110 BR
    384 
    385 	! 3 cycles, 1 byte per iteration
    386 1:	mov.b	@(r0,r5),r1	!  19 LS
    387 	cmp/hi	r7,r0		!  57 MT
    388 
    389 	bt/s	1b		! 111 BR
    390 	 mov.b	r1,@-r0		!  28 LS
    391 
    392 9:	rts
    393 	 nop
    394 
    395 	! Size is at least 64 bytes, so will be going round the big loop at least once.
    396 	!
    397 	!   r2 = rounded up r4
    398 	!   r3 = rounded down r0
    399 
    400 	.balign	32
    401 .Lcase0b:
    /*
     * .Lcase0b is entered from the dispatch code and applies the -4 bias to
     * r5 that .Lcase00 has already applied before it branches to .Lcase00b.
     */
    402 	add	#-4, r5		!  50 EX
    403 
    404 .Lcase00b:
    /*
     * r3 = r0 rounded down to a multiple of 32, r2 = r4 rounded up to a
     * multiple of 32.  If r0 is already on a 32-byte boundary the long-word
     * alignment loop below is skipped (branch to 1f).
     */
    405 	mov	r0, r3		!   5 MT (latency=0)
    406 	mov	#(~0x1f), r1	!   6 EX
    407 
    408 	and	r1, r3		!  78 EX
    409 	mov	r4, r2		!   5 MT (latency=0)
    410 
    411 	cmp/eq	r3, r0		!  54 MT
    412 	add	#0x1f, r2	!  50 EX
    413 
    414 	bt/s	1f		! 110 BR
    415 	 and	r1, r2		!  78 EX
    416 
    417 	! copy initial words until cache line aligned
    /*
     * r6 = r5 - 4 addresses the second long word of each pair.  With bit 2
     * of r0 clear the pair loop is entered at 4f, with r3 moved up by 8 so
     * that "cmp/eq r3,r0", done before the two stores, detects the last
     * pair.  With bit 2 set one long word is stored first; "tst #0x18,r0"
     * shows whether that store (in the delay slot) already reaches the
     * 32-byte boundary.
     */
    418 
    419 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    420 	tst	#4, r0		!  87 MT
    421 
    422 	mov	r5, r6		!   5 MT (latency=0)
    423 	add	#-4, r6		!  50 EX
    424 
    425 	bt/s	4f		! 111 BR
    426 	 add	#8, r3		!  50 EX
    427 
    428 	tst	#0x18, r0	!  87 MT
    429 
    430 	bt/s	1f		! 109 BR
    431 	 mov.l	r1,@-r0		!  30 LS
    432 
    433 	! 4 cycles, 2 long words per iteration
    434 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
    435 
    436 4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
    437 	cmp/eq	r3, r0		!  54 MT
    438 
    439 	mov.l	r1, @-r0	!  30 LS
    440 	bf/s	3b		! 109 BR
    441 
    442 	 mov.l	r7, @-r0	!  30 LS
    443 
    444 	! Copy the cache line aligned blocks
    445 	!
    446 	! In use: r0, r2, r4, r5
    447 	! Scratch: r1, r3, r6, r7
    448 	!
    449 	! We could do this with the four scratch registers, but if src
    450 	! and dest hit the same cache line, this will thrash, so make
    451 	! use of additional registers.
    452 	!
    453 	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
    454 	!   r5:	 src (was r0+r5)
    455 	!   r1:	 dest (was r0)
    456 	! this can be reversed at the end, so we don't need to save any extra
    457 	! state.
    458 	!
    /*
     * r8-r11 are pushed on the stack (r15) for use as extra data registers.
     * r5 becomes an absolute pointer 0x20 below the source position (r0 + r5
     * with its -4 bias, less 0x1c) and r1 takes over as destination pointer.
     */
    459 1:	mov.l	r8, @-r15	!  30 LS
    460 	add	r0, r5		!  49 EX
    461 
    462 	mov.l	r9, @-r15	!  30 LS
    463 	mov	r0, r1		!   5 MT (latency=0)
    464 
    465 	mov.l	r10, @-r15	!  30 LS
    466 	add	#-0x1c, r5	!  50 EX
    467 
    468 	mov.l	r11, @-r15	!  30 LS
    469 
    470 	! 16 cycles, 32 bytes per iteration
    /*
     * Each iteration loads eight long words from @(0x00,r5) .. @(0x1c,r5),
     * moves r1 down by 0x20 and stores them at @r1 .. @(0x1c,r1); r5 is then
     * moved down by 0x20 as well.  The loop ends when r1 == r2.
     */
    471 2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
    472 	add	#-0x20, r1	! 50 EX
    473 	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
    474 	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
    475 	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
    476 	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
    477 	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
    478 	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
    479 	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
    480 	movca.l	r0,@r1		! 40 LS (latency=3-7)
    481 	mov.l	r3,@(0x04,r1)	! 33 LS
    482 	mov.l	r6,@(0x08,r1)	! 33 LS
    483 	mov.l	r7,@(0x0c,r1)	! 33 LS
    484 
    485 	mov.l	r8,@(0x10,r1)	! 33 LS
    486 	add	#-0x20, r5	! 50 EX
    487 
    488 	mov.l	r9,@(0x14,r1)	! 33 LS
    489 	cmp/eq	r2,r1		! 54 MT
    490 
    491 	mov.l	r10,@(0x18,r1)	!  33 LS
    492 	bf/s	2b		! 109 BR
    493 
    494 	 mov.l	r11,@(0x1c,r1)	!  33 LS
    495 
    /*
     * Undo the pointer change: r0 = r1 and r5 = r5 - r1 (relative again).
     * If r0 == r4 the copy is complete and the rts below returns.  The
     * instruction at label 1 is both the delay slot of that rts and the
     * target of the bf/s, so r8 is restored on either path.
     */
    496 	mov	r1, r0		!   5 MT (latency=0)
    497 
    498 	mov.l	@r15+, r11	!  15 LS
    499 	sub	r1, r5		!  75 EX
    500 
    501 	mov.l	@r15+, r10	!  15 LS
    502 	cmp/eq	r4, r0		!  54 MT
    503 
    504 	bf/s	1f		! 109 BR
    505 	 mov.l	 @r15+, r9	!  15 LS
    506 
    507 	rts
    508 1:	 mov.l	@r15+, r8	!  15 LS
    /*
     * Copy what is left after the 32-byte block loop: r1 = r0 - r4 bytes,
     * non-zero on this path.  r5 gets its -4 bias back (add #0x1c) and
     * r7 = r4 + 11 bounds the long-word pair loop.
     *
     *   fewer than 4 bytes: go straight to the byte loop at 5
     *   bit 2 of r1 clear:  enter the pair loop at 4
     *   bit 2 of r1 set:    store one long word, then enter the pair loop
     *                       only if at least eight more bytes remain
     *                       (r0 > r4 + 11 before the store); otherwise
     *                       finish at 5
     */
    509 	sub	r4, r1		!  75 EX		(len remaining)
    510 
    511 	! number of trailing bytes is non-zero
    512 	!
    513 	! invariants restored (r5 already decremented by 4)
    514 	! also r1=num bytes remaining
    515 
    516 	mov	#4, r2		!   6 EX
    517 	mov	r4, r7		!   5 MT (latency=0)
    518 
    519 	add	#0x1c, r5	!  50 EX		(back to -4)
    520 	cmp/hs	r2, r1		!  58 MT
    521 
    522 	bf/s	5f		! 108 BR
    523 	 add	 #11, r7	!  50 EX
    524 
    525 	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
    526 	tst	r2, r1		!  86 MT
    527 
    528 	mov	r5, r3		!   5 MT (latency=0)
    529 	bt/s	4f		! 111 BR
    530 
    531 	 add	#-4, r3		!  50 EX
    /*
     * T = (r0 > r4 + 11): at least 12 bytes are left, i.e. at least 8 after
     * the single long word stored in the delay slot below, so the pair loop
     * can take over.  Repeating "cmp/hs r2,r1" here would always set T
     * (r1 >= 4 is already known) and would send every remaining long word
     * through the byte loop at 5.
     */
    532 	cmp/hi	r7, r0		!  57 MT
    533 
    534 	bf/s	5f		! 108 BR
    535 	 mov.l	r6,@-r0		!  30 LS
    536 
    537 	! 4 cycles, 2 long words per iteration
    538 3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
    539 
    540 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
    541 	cmp/hi	r7, r0
    542 
    543 	mov.l	r6, @-r0	!  30 LS
    544 	bt/s	3b		! 109 BR
    545 
    546 	 mov.l	r2, @-r0	!  30 LS
    547 
    548 	! Copy the final 0-3 bytes
    /*
     * r7 - 10 = r4 + 1 bounds the byte loop, which is skipped when r0 == r4.
     * r5 + 3 leaves r5 = (src - dst) - 1 for byte access.
     */
    549 
    550 5:	cmp/eq	r0, r4		!  54 MT
    551 	add	#-10, r7	!  50 EX
    552 
    553 	bt	9f		! 110 BR
    554 	add	#3,r5		!  50 EX
    555 
    556 	! 3 cycles, 1 byte per iteration
    557 1:	mov.b	@(r0,r5),r1	!  19 LS
    558 	cmp/hi	r7,r0		!  57 MT
    559 
    560 	bt/s	1b		! 111 BR
    561 	 mov.b	r1,@-r0		!  28 LS
    562 
    563 9:	rts
    564 	 nop
    565 
    566 	!
    567 	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
    568 	!
    569 
    570 	.balign	32
    571 .Lcase2:
    572 	! Size is 16 or greater and less than 64, but may have trailing bytes
    573 
    /*
     * Entered from the dispatch code with r0 long-word aligned, bit 0 of r5
     * (= src - dst) clear and bit 1 set.  Data is moved 16 bits at a time:
     * r5 is biased by -2 and r6 = r5 - 2, so @(r0,r5) and @(r0,r6) are the
     * two 16-bit words just below the current source position.
     * r2 = r4 + 7 is the loop bound: the loop repeats while r0 > r2 before
     * its two stores.  The remainder is handled at label 10 in the
     * .Lcase2b code, which expects r5 with this -2 bias.
     */
    574 2:	mov	r5, r6		!   5 MT (latency=0)
    575 	add	#-2,r5		!  50 EX
    576 
    577 	mov	r4,r2		!   5 MT (latency=0)
    578 	add	#-4,r6		!  50 EX
    579 
    580 	add	#7,r2		!  50 EX
    581 3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
    582 
    583 	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
    584 	cmp/hi	r2,r0		!  57 MT
    585 
    586 	mov.w	r1,@-r0		!  29 LS
    587 	bt/s	3b		! 111 BR
    588 
    589 	 mov.w	r3,@-r0		!  29 LS
    590 
    591 	bra	10f
    592 	 nop
    593 
    594 
    595 	.balign	32
    596 .Lcase2b:
    /*
     * Entered from the dispatch code with r0 long-word aligned, bit 0 of r5
     * (= src - dst) clear, bit 1 set and r6 >= 64, so the source is only
     * 16-bit aligned relative to dst.  r5 carries a -2 bias from the
     * "add #-2, r5" below until the final byte copy.
     */
    597 	! Size is at least 64 bytes, so will be going round the big loop at least once.
    598 	!
    599 	!   r2 = rounded up r4
    600 	!   r3 = rounded down r0
    601 
    602 	mov	r0, r3		!   5 MT (latency=0)
    603 	mov	#(~0x1f), r1	!   6 EX
    604 
    605 	and	r1, r3		!  78 EX
    606 	mov	r4, r2		!   5 MT (latency=0)
    607 
    608 	cmp/eq	r3, r0		!  54 MT
    609 	add	#0x1f, r2	!  50 EX
    610 
    611 	add	#-2, r5		!  50 EX
    612 	bt/s	1f		! 110 BR
    613 	 and	r1, r2		!  78 EX
    614 
    615 	! Copy a short word one at a time until we are cache line aligned
    616 	!   Normal values: r0, r2, r3, r4
    617 	!   Unused: r1, r6, r7
    618 	!   Mod: r5 (=r5-2)
    619 	!
    /*
     * r3 + 2 lets "cmp/eq r3,r0", done before the store in the delay slot,
     * detect the last 16-bit word above the 32-byte boundary.
     */
    620 	add	#2, r3		!  50 EX
    621 
    622 2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
    623 	cmp/eq	r3,r0		!  54 MT
    624 
    625 	bf/s	2b		! 111 BR
    626 
    627 	 mov.w	r1,@-r0		!  29 LS
    628 
    629 	! Copy the cache line aligned blocks
    630 	!
    631 	! In use: r0, r2, r4, r5 (=r5-2)
    632 	! Scratch: r1, r3, r6, r7
    633 	!
    634 	! We could do this with the four scratch registers, but if src
    635 	! and dest hit the same cache line, this will thrash, so make
    636 	! use of additional registers.
    637 	!
    638 	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
    639 	!   r5:	 src (was r0+r5)
    640 	!   r1:	 dest (was r0)
    641 	! this can be reversed at the end, so we don't need to save any extra
    642 	! state.
    643 	!
    /*
     * r8-r12 are pushed on the stack (r15).  r5 becomes an absolute pointer
     * 0x20 below the source position (r0 + r5 with its -2 bias, less 0x1e)
     * and r1 the destination pointer.
     */
    644 1:	mov.l	r8, @-r15	!  30 LS
    645 	add	r0, r5		!  49 EX
    646 
    647 	mov.l	r9, @-r15	!  30 LS
    648 	mov	r0, r1		!   5 MT (latency=0)
    649 
    650 	mov.l	r10, @-r15	!  30 LS
    651 	add	#-0x1e, r5	!  50 EX
    652 
    653 	mov.l	r11, @-r15	!  30 LS
    654 
    655 	mov.l	r12, @-r15	!  30 LS
    656 
    657 	! 17 cycles, 32 bytes per iteration
    658 #ifdef CONFIG_CPU_LITTLE_ENDIAN
    /*
     * Little endian: read upwards from r5 with post-increment (one 16-bit
     * word, seven long words, one 16-bit word), combine neighbouring
     * registers with xtrct and store eight long words at @r1 .. @(0x1c,r1)
     * after r1 has been moved down by 0x20.  The loads advance r5 by 0x20,
     * so "add #-0x40" moves it down by 0x20 overall.  The loop ends when
     * r1 == r2.
     */
    659 2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
    660 	add	#-0x20, r1	!  50 EX
    661 
    662 	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
    663 
    664 	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
    665 	shll16	r0		! 103 EX			JI..
    666 
    667 	mov.l	@r5+, r7	!  15 LS (latency=2)
    668 	xtrct	r3, r0		!  48 EX			LKJI
    669 
    670 	mov.l	@r5+, r8	!  15 LS (latency=2)
    671 	xtrct	r6, r3		!  48 EX			PONM
    672 
    673 	mov.l	@r5+, r9	!  15 LS (latency=2)
    674 	xtrct	r7, r6		!  48 EX
    675 
    676 	mov.l	@r5+, r10	!  15 LS (latency=2)
    677 	xtrct	r8, r7		!  48 EX
    678 
    679 	mov.l	@r5+, r11	!  15 LS (latency=2)
    680 	xtrct	r9, r8		!  48 EX
    681 
    682 	mov.w	@r5+, r12	!  15 LS (latency=2)
    683 	xtrct	r10, r9		!  48 EX
    684 
    685 	movca.l	r0,@r1		!  40 LS (latency=3-7)
    686 	xtrct	r11, r10	!  48 EX
    687 
    688 	mov.l	r3, @(0x04,r1)	!  33 LS
    689 	xtrct	r12, r11	!  48 EX
    690 
    691 	mov.l	r6, @(0x08,r1)	!  33 LS
    692 
    693 	mov.l	r7, @(0x0c,r1)	!  33 LS
    694 
    695 	mov.l	r8, @(0x10,r1)	!  33 LS
    696 	add	#-0x40, r5	!  50 EX
    697 
    698 	mov.l	r9, @(0x14,r1)	!  33 LS
    699 	cmp/eq	r2,r1		!  54 MT
    700 
    701 	mov.l	r10, @(0x18,r1)	!  33 LS
    702 	bf/s	2b		! 109 BR
    703 
    704 	 mov.l	r11, @(0x1c,r1)	!  33 LS
    705 #else
    /*
     * Big endian: the source is read with displacement addressing from the
     * highest address down (one 16-bit word, then eight long words) and
     * neighbouring registers are combined with xtrct.  r1 is moved down in
     * two steps (-4 before the movca.l, -0x1c after it) and r5 likewise
     * (-2 and -0x1e), i.e. by 0x20 each per iteration.  The loop ends when
     * r1 == r2.
     */
    706 2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
    707 	add	#-2, r5		!  50 EX
    708 
    709 	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
    710 	add	#-4, r1		!  50 EX
    711 
    712 	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
    713 	shll16	r0		! 103 EX
    714 
    715 	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
    716 	xtrct	r3, r0		!  48 EX
    717 
    718 	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
    719 	xtrct	r6, r3		!  48 EX
    720 
    721 	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
    722 	xtrct	r7, r6		!  48 EX
    723 
    724 	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
    725 	xtrct	r8, r7		!  48 EX
    726 
    727 	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
    728 	xtrct	r9, r8		!  48 EX
    729 
    730 	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
    731     	xtrct	r10, r9		!  48 EX
    732 
    733 	movca.l	r0,@r1		!  40 LS (latency=3-7)
    734 	add	#-0x1c, r1	!  50 EX
    735 
    736 	mov.l	r3, @(0x18,r1)	!  33 LS
    737 	xtrct	r11, r10	!  48 EX
    738 
    739 	mov.l	r6, @(0x14,r1)	!  33 LS
    740 	xtrct	r12, r11	!  48 EX
    741 
    742 	mov.l	r7, @(0x10,r1)	!  33 LS
    743 
    744 	mov.l	r8, @(0x0c,r1)	!  33 LS
    745 	add	#-0x1e, r5	!  50 EX
    746 
    747 	mov.l	r9, @(0x08,r1)	!  33 LS
    748 	cmp/eq	r2,r1		!  54 MT
    749 
    750 	mov.l	r10, @(0x04,r1)	!  33 LS
    751 	bf/s	2b		! 109 BR
    752 
    753 	 mov.l	r11, @(0x00,r1)	!  33 LS
    754 #endif
    755 
    /*
     * Restore r12-r9, make r0 the destination pointer again and r5 relative
     * (r5 - r1).  If r0 == r4 everything has been copied and the rts below
     * returns.  The instruction at label 1 is both the delay slot of that
     * rts and the target of the bf/s, so r8 is restored on either path.
     * "add #0x1e" then gives r5 its -2 bias back.
     */
    756 	mov.l	@r15+, r12
    757 	mov	r1, r0		!   5 MT (latency=0)
    758 
    759 	mov.l	@r15+, r11	!  15 LS
    760 	sub	r1, r5		!  75 EX
    761 
    762 	mov.l	@r15+, r10	!  15 LS
    763 	cmp/eq	r4, r0		!  54 MT
    764 
    765 	bf/s	1f		! 109 BR
    766 	 mov.l	 @r15+, r9	!  15 LS
    767 
    768 	rts
    769 1:	 mov.l	@r15+, r8	!  15 LS
    770 
    771 	add	#0x1e, r5	!  50 EX
    772 
    773 	! Finish off a short word at a time
    774 	! r5 must be invariant - 2
    /*
     * Also entered from .Lcase2.  r2 = r4 + 1: if r0 is not above it, fewer
     * than two bytes are left and the word loop is skipped.  Inside the loop
     * r2 = r4 + 3, so it repeats while at least two bytes remain after the
     * store.
     */
    775 10:	mov	r4,r2		!   5 MT (latency=0)
    776 	add	#1,r2		!  50 EX
    777 
    778 	cmp/hi	r2, r0		!  57 MT
    779 	bf/s	1f		! 109 BR
    780 
    781 	 add	#2, r2		!  50 EX
    782 
    783 3:	mov.w	@(r0,r5),r1	!  20 LS
    784 	cmp/hi	r2,r0		!  57 MT
    785 
    786 	bt/s	3b		! 109 BR
    787 
    788 	 mov.w	r1,@-r0		!  29 LS
    789 1:
    790 
    791 	!
    792 	! Finally, copy the last byte if necessary
    /*
     * "9b" is the nearest preceding label 9, the rts/nop pair that ends the
     * .Lcase0b code above.  The delay slot moves r5 to (src - dst) - 1 for
     * the single byte load; the byte is stored in the delay slot of the rts.
     */
    793 	cmp/eq	r4,r0		!  54 MT
    794 	bt/s	9b
    795 	 add	#1,r5
    796 	mov.b	@(r0,r5),r1
    797 	rts
    798 	 mov.b	r1,@-r0
    799 
    800