Home | History | Annotate | Download | only in bionic
      1 /***************************************************************************
      2  Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
      3 
      4  Redistribution and use in source and binary forms, with or without
      5  modification, are permitted provided that the following conditions are met:
      6      * Redistributions of source code must retain the above copyright
      7        notice, this list of conditions and the following disclaimer.
      8      * Redistributions in binary form must reproduce the above copyright
      9        notice, this list of conditions and the following disclaimer in the
     10        documentation and/or other materials provided with the distribution.
     11      * Neither the name of The Linux Foundation nor the names of its contributors may
     12        be used to endorse or promote products derived from this software
     13        without specific prior written permission.
     14 
     15  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
     19  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25  POSSIBILITY OF SUCH DAMAGE.
     26   ***************************************************************************/
     27 
     28 /* Assumes NEON instructions are available and a cache line size of 64 bytes. */
     29 
     30 #define PLDOFFS	(10)	/* Look-ahead distance of the prefetch pointer, counted in PLDSIZE-byte blocks. */
     31 #define PLDTHRESH (PLDOFFS)	/* Copies of at most this many 64-byte blocks use the loop that does no prefetching. */
     32 #define BBTHRESH (4096/64)	/* Copies of at most this many 64-byte blocks (4096 bytes) go straight to .L_neon_prime_pump with the fixed look-ahead. */
     33 #define PLDSIZE (64)	/* Bytes per prefetch step; the copy loops also move 64 bytes per iteration. */
     34 
        /*
         * .L_neon_prime_pump hands PLDOFFS blocks to the final copy loop, and that
         * loop copies one block before it tests its counter, so PLDOFFS must be at
         * least 1.
         */
     35 #if (PLDOFFS < 1)
     36 #error Routine does not support offsets less than 1
     37 #endif
     38 
        /*
         * The prefetching paths are only entered with more than PLDTHRESH blocks,
         * and .L_neon_prime_pump subtracts PLDOFFS from that count before running a
         * loop that needs at least one iteration.
         */
     39 #if (PLDTHRESH < PLDOFFS)
     40 #error PLD threshold must be greater than or equal to the PLD offset
     41 #endif
     42 
        /* Code goes in the text section; tell the assembler NEON instructions are allowed. */
     43 	.text
     44 	.fpu    neon
     45 
     46 .L_memcpy_base:
        /*
         * NEON copy body.
         *
         * On entry: r0 = destination pointer, r1 = source pointer, r2 = byte count.
         * Both pointers are advanced as data is copied.
         *
         * Every path leaves through .L_neon_exit, which pops r0 and pc from the
         * stack. Presumably the code that includes this fragment pushed {r0, lr}
         * before reaching this label - confirm against the includer. lr itself is
         * overwritten below and used as a scratch register.
         *
         * Scratch registers: r3, r12, lr, q0-q3, q8-q11. r9 and r10 are pushed
         * before use and popped again at .L_neon_pop_before_nopld.
         *
         * Outline: sizes below 64 bytes jump directly into the tail code; larger
         * sizes copy whole 64-byte blocks in one of several loops (with or without
         * source look-ahead) and then fall into the same tail code for the
         * remaining 0..63 bytes.
         */
        /*
         * NOTE(review): the blt tests below are signed comparisons, so a count with
         * bit 31 set is treated as smaller than 4. Presumably such counts are never
         * passed in - confirm.
         */
     47 	cmp	r2, #4
     48 	blt	.L_neon_lt4	/* fewer than 4 bytes */
     49 	cmp	r2, #16
     50 	blt	.L_neon_lt16	/* 4..15 bytes */
     51 	cmp	r2, #32
     52 	blt	.L_neon_16	/* 16..31 bytes; this cmp leaves N set, so the bpl at .L_neon_16 falls through */
     53 	cmp	r2, #64
     54 	blt	.L_neon_copy_32_a	/* 32..63 bytes */
     55 
     56 	mov	r12, r2, lsr #6	/* r12 = number of whole 64-byte blocks (at least 1 here) */
     57 	cmp	r12, #PLDTHRESH
     58 	ble	.L_neon_copy_64_loop_nopld	/* few blocks: copy without any prefetching */
     59 
     60 	push	{r9, r10}	/* two more working registers for the prefetching paths */
     61 	.cfi_adjust_cfa_offset 8
     62 	.cfi_rel_offset r9, 0
     63 	.cfi_rel_offset r10, 4
     64 
     65 	cmp	r12, #BBTHRESH
     66 	ble	.L_neon_prime_pump	/* at most BBTHRESH blocks: use the fixed look-ahead distance */
     67 
        /*
         * Compute a look-ahead distance in lr:
         *   lr = (((r0 + 0x400) - (r1 + PLDOFFS*PLDSIZE)) & 0x7FF) + PLDOFFS*PLDSIZE
         * so lr lies between PLDOFFS*PLDSIZE and PLDOFFS*PLDSIZE + 2047 bytes and
         * depends on the relative position of destination and source.
         * NOTE(review): the file does not state the rationale for this formula.
         */
     68 	add	lr, r0, #0x400
     69 	add	r9, r1, #(PLDOFFS*PLDSIZE)
     70 	sub	lr, lr, r9
     71 	lsl	lr, lr, #21	/* shifting left then right by 21 keeps only the low 11 bits */
     72 	lsr	lr, lr, #21
     73 	add	lr, lr, #(PLDOFFS*PLDSIZE)
     74 	cmp	r12, lr, lsr #6
     75 	ble	.L_neon_prime_pump	/* copy is not longer than the look-ahead: use the fixed distance instead */
     76 
     77 	itt	gt	/* always true after the ble above fell through */
     78 	movgt	r9, #(PLDOFFS)
     79 	rsbsgt	r9, r9, lr, lsr #6	/* r9 = (lr >> 6) - PLDOFFS, flags updated */
     80 	ble	.L_neon_prime_pump	/* look-ahead is not beyond PLDOFFS blocks */
     81 
     82 	add	r10, r1, lr
     83 	bic	r10, #0x3F	/* r10 = look-ahead pointer, rounded down to a 64-byte boundary */
     84 
     85 	sub	r12, r12, lr, lsr #6	/* hold back lr >> 6 blocks for the final non-prefetching loop */
     86 
        /*
         * Split the remaining blocks: r9 = iterations of the double-prefetch loop,
         * r12 = iterations of the single-prefetch loop that follows it.
         */
     87 	cmp	r9, r12
     88 	itee	le
     89 	suble	r12, r12, r9	/* r9 <= r12: r9 blocks here, the rest in the next loop */
     90 	movgt	r9, r12	/* r9 > r12: everything goes through the double-prefetch loop */
     91 	movgt	r12, #0
     92 
     93 	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]	/* prime the block just before the first one the loop prefetches */
     94 .L_neon_copy_64_loop_outer_doublepld:
     95 	pld	[r1, #((PLDOFFS)*PLDSIZE)]	/* near prefetch, PLDOFFS blocks ahead of the read pointer */
     96 	vld1.32	{q0, q1}, [r1]!
     97 	vld1.32	{q2, q3}, [r1]!
     98 	ldr	r3, [r10]	/* far look-ahead: touch the line at r10; the loaded value is never used */
     99 	subs	r9, r9, #1
    100 	vst1.32	{q0, q1}, [r0]!
    101 	vst1.32	{q2, q3}, [r0]!
    102 	add	r10, #64
    103 	bne	.L_neon_copy_64_loop_outer_doublepld	/* tests the flags set by the subs above */
    104 	cmp	r12, #0
    105 	beq	.L_neon_pop_before_nopld	/* nothing left for the single-prefetch loops */
    106 
    107 	cmp	r12, #(512*1024/64)
    108 	blt	.L_neon_copy_64_loop_outer	/* fewer than 512 KiB worth of blocks: use the ldr-touch loop */
    109 
        /*
         * Same copy loop, but the look-ahead uses pld instead of a load.
         * NOTE(review): the label suggests this targets copies served from DRAM - confirm.
         */
    110 .L_neon_copy_64_loop_ddr:
    111 	vld1.32	{q0, q1}, [r1]!
    112 	vld1.32	{q2, q3}, [r1]!
    113 	pld	[r10]
    114 	subs	r12, r12, #1
    115 	vst1.32	{q0, q1}, [r0]!
    116 	vst1.32	{q2, q3}, [r0]!
    117 	add	r10, #64
    118 	bne	.L_neon_copy_64_loop_ddr
    119 	b	.L_neon_pop_before_nopld
    120 
        /* Fixed look-ahead setup: the look-ahead pointer runs PLDOFFS blocks ahead of the source. */
    121 .L_neon_prime_pump:
    122 	mov	lr, #(PLDOFFS*PLDSIZE)	/* lr >> 6 == PLDOFFS blocks are left for the final loop */
    123 	add	r10, r1, #(PLDOFFS*PLDSIZE)
    124 	bic	r10, #0x3F	/* 64-byte aligned look-ahead pointer */
    125 	sub	r12, r12, #PLDOFFS	/* r12 > PLDTHRESH >= PLDOFFS here, so at least one iteration remains */
    126 	ldr	r3, [r10, #(-1*PLDSIZE)]	/* touch the block just below the look-ahead pointer */
    127 
        /* Copy r12 blocks, touching one look-ahead line per block with a plain load. */
    128 .L_neon_copy_64_loop_outer:
    129 	vld1.32	{q0, q1}, [r1]!
    130 	vld1.32	{q2, q3}, [r1]!
    131 	ldr	r3, [r10]	/* loaded value is never used */
    132 	subs	r12, r12, #1
    133 	vst1.32	{q0, q1}, [r0]!
    134 	vst1.32	{q2, q3}, [r0]!
    135 	add	r10, #64
    136 	bne	.L_neon_copy_64_loop_outer
    137 
    138 .L_neon_pop_before_nopld:
    139 	mov	r12, lr, lsr #6	/* blocks that were held back for the final loop */
    140 	pop	{r9, r10}
    141 	.cfi_adjust_cfa_offset -8
    142 	.cfi_restore r9
    143 	.cfi_restore r10
    144 
        /*
         * Plain 64-byte block copy, r12 iterations. A block is copied before the
         * counter is tested, so r12 must be at least 1 on entry.
         */
    145 .L_neon_copy_64_loop_nopld:
    146 	vld1.32	{q8, q9}, [r1]!
    147 	vld1.32	{q10, q11}, [r1]!
    148 	subs	r12, r12, #1
    149 	vst1.32	{q8, q9}, [r0]!
    150 	vst1.32	{q10, q11}, [r0]!
    151 	bne	.L_neon_copy_64_loop_nopld
    152 	ands	r2, r2, #0x3f	/* r2 = leftover byte count, 0..63 */
    153 	beq	.L_neon_exit
    154 
        /* Tail: the low bits of r2 are moved into the flags to select 32/16/8/4/2/1-byte copies. */
    155 .L_neon_copy_32_a:
    156 	movs	r3, r2, lsl #27	/* C = bit 5 of r2 (32 bytes), N = bit 4 (16 bytes) */
    157 	bcc	.L_neon_16
    158 	vld1.32	{q0,q1}, [r1]!	/* copy 32 bytes */
    159 	vst1.32	{q0,q1}, [r0]!
    160 
    161 .L_neon_16:
    162 	bpl	.L_neon_lt16	/* N clear: no 16-byte piece */
    163 	vld1.32	{q8}, [r1]!	/* copy 16 bytes */
    164 	vst1.32	{q8}, [r0]!
    165 	ands	r2, r2, #0x0f	/* r2 = leftover byte count, 0..15 */
    166 	beq	.L_neon_exit
    167 
    168 .L_neon_lt16:
    169 	movs	r3, r2, lsl #29	/* C = bit 3 of r2 (8 bytes), N = bit 2 (4 bytes) */
    170 	bcc	1f
    171 	vld1.8	{d0}, [r1]!	/* copy 8 bytes */
    172 	vst1.8	{d0}, [r0]!
    173 1:
        /*
         * ge tests N == V, and the shifted movs above does not write V.
         * NOTE(review): this skips the 4-byte copy when bit 2 is clear only if V
         * is clear at this point - confirm on every path that reaches here.
         */
    174 	bge	.L_neon_lt4
    175 	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	/* copy 4 bytes, one into lane 0 of each of d0-d3 */
    176 	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!
    177 
    178 .L_neon_lt4:
    179 	movs	r2, r2, lsl #31	/* C = bit 1 of r2 (2 bytes), N = bit 0 (1 byte) */
    180 	itt	cs
    181 	ldrhcs	r3, [r1], #2	/* copy a halfword */
    182 	strhcs	r3, [r0], #2
    183 	itt	mi
    184 	ldrbmi	r3, [r1]	/* copy the final byte */
    185 	strbmi	r3, [r0]
    186 
    187 .L_neon_exit:
    188 	pop	{r0, pc}	/* reload r0 from the stack and return by popping pc */
    189