Home | History | Annotate | Download | only in memset_mips
      1 /*
      2  * Copyright (c) 2009
      3  *      MIPS Technologies, Inc., California.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
     14  *    contributors may be used to endorse or promote products derived from
     15  *    this software without specific prior written permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 /************************************************************************
     31  *
     32  *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
     33  *  Version: "043009"
     34  *
     35  ************************************************************************/
     36 
     37 
     38 /************************************************************************
     39  *  Include files
     40  ************************************************************************/
     41 
     42 #include "machine/asm.h"
     43 
     44 /*
     45  * This routine could be optimized for MIPS64. The current code only
     46  * uses MIPS32 instructions.
     47  */
     48 
     49 #if defined(__MIPSEB__)
        /*
         * SWHI names the partial-word store that memset_cmips uses for the
         * unaligned head of the buffer: swl on big-endian targets, swr on
         * little-endian targets.
         * NOTE(review): when neither __MIPSEB__ nor __MIPSEL__ is defined, SWHI
         * stays undefined and the SWHI line in memset_cmips will not assemble.
         */
     50 #  define SWHI	swl		/* high part is left in big-endian	*/
     51 #endif
     52 
     53 #if defined(__MIPSEL__)
     54 #  define SWHI	swr		/* high part is right in little-endian	*/
     55 #endif
     56 
     57 #if !(defined(XGPROF) || defined(XPROF))
        /*
         * When neither XGPROF nor XPROF is defined (presumably the non-profiling
         * build - confirm), SETUP_GP is redefined as an empty macro.  SETUP_GP
         * is not referenced directly by the routine in this file.
         */
     58 #undef SETUP_GP
     59 #define SETUP_GP
     60 #endif
     61 
     62 LEAF(memset_cmips,0)
        /*
         * memset_cmips: store the low byte of a1 into a2 consecutive bytes
         * starting at a0, and return the original a0 in v0.
         *
         * Inputs:   a0 = destination address
         *           a1 = fill value (only the low byte ends up in memory)
         *           a2 = byte count
         * Output:   v0 = original a0
         * Modifies: AT, v1, a0-a3, t0, t4-t8
         *
         * Stages:
         *   1. Counts for which "slti AT,a2,4" is true go straight to the
         *      byte loop at .Llast4.
         *   2. A non-zero fill value is replicated into all four bytes of a1.
         *   3. One partial-word store (SWHI) covers the bytes up to the next
         *      word boundary, and a0 is advanced to that boundary.
         *   4. 64-byte chunks (with or without "pref 30"), then at most one
         *      32-byte chunk, then single words are written with sw.
         *   5. The remaining bytes (fewer than 4) are written with sb.
         *
         * The body is assembled under ".set noreorder": the instruction
         * written after each branch or jump is its delay slot and executes
         * whether or not the branch is taken.  Several delay slots below do
         * real work, so statement order must not be changed casually.
         * ".set noat" lets the code use AT explicitly as a scratch register.
         */
     63 
     64 	.set	noreorder
     65 	.set	noat
     66 
     67 	addu	t0,a0,a2		# t0 is the "past the end" address
        	# NOTE(review): slti is a signed compare, so a count with bit 31 set
        	# is also treated as "less than 4" and handled by the byte loop at
        	# .Llast4 - confirm this is acceptable.
     68 	slti	AT,a2,4			# is a2 less than 4?
     69 	bne	AT,zero,.Llast4		# if yes, go to last4
     70 	move	v0,a0			# delay slot: memset returns the dst pointer
     71 
     72 	beq	a1,zero,.Lset0		# a1 == 0: nothing to replicate, skip the smear
     73 	subu	v1,zero,a0		# delay slot: v1 = -a0, masked to 2 bits at .Lset0
     74 
     75 	# smear byte into 32 bit word
     76 #if (__mips==32) && (__mips_isa_rev>=2)
     77 	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
     78 	ins     a1, a1, 16, 16      # Replicate fill byte into word.
     79 #else
     80 	and	a1,0xff			# keep only the fill byte
     81 	sll	AT,a1,8
     82 	or	a1,AT			# a1 = fill byte in both low bytes
     83 	sll	AT,a1,16
     84 	or	a1,AT			# a1 = fill byte in all four bytes
     85 #endif
     86 
     87 .Lset0:	andi	v1,v1,0x3		# word-unaligned address?
     88 	beq	v1,zero,.Laligned	# v1 is the unalignment count
     89 	subu	a2,a2,v1		# delay slot: a2 = bytes left after the head (v1 may be 0)
     90 	SWHI	a1,0(a0)		# partial-word store: fills from a0 up to the word boundary
     91 	addu	a0,a0,v1		# a0 is now word-aligned
     92 
     93 # Here we have the "word-aligned" a0 (until the "last4")
     94 .Laligned:
     95 	andi	t8,a2,0x3f	# any 64-byte chunks?
     96 				# t8 is the byte count past 64-byte chunks
     97 	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
     98 				# There will be at most 1 32-byte chunk then
     99 	subu	a3,a2,t8	# delay slot: subtract from a2 the remainder
    100 				# Here a3 counts bytes in 16w chunks
    101 	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks
    102 
    103 # Find out, if there are any 64-byte chunks after which will be still at least
    104 # 96 bytes left. The value "96" is calculated as needed buffer for
    105 # "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
    106 # incrementing "a0" by 64.
    107 # For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
    108 #
        # NOTE(review): pref hint 30 is named PrepareForStore in the MIPS32
        # architecture manual; the safety margin above presumably exists because
        # that hint may discard the previous contents of the line - confirm.
    109 	sltiu	v1,a2,160	# v1 = 1 when fewer than 160 bytes remain
    110 	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
    111 	subu	t7,a2,96	# delay slot: subtract "pref 30 unsafe" region
    112 		# below we have at least 1 64-byte chunk which is "pref 30 safe"
    113 	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
    114 	subu	t5,t7,t6	# subtract from t7 the remainder
    115 				# Here t5 counts bytes in 16w "safe" chunks
    116 	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks
    117 
    118 # Do not use "pref 30,0(a0)" for a0 in a "middle" of a cache line
    119 #	pref	30,0(a0)
    120 # Here we are in the region, where it is safe to use "pref 30,64(a0)"
        # Each iteration advances a0 by 64 first and then stores at negative
        # offsets, so the 16 words written are the 64 bytes just stepped over.
    121 .Lloop16w:
    122 	addiu	a0,a0,64
    123 	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
    124 	sw	a1,-64(a0)
    125 	sw	a1,-60(a0)
    126 	sw	a1,-56(a0)
    127 	sw	a1,-52(a0)
    128 	sw	a1,-48(a0)
    129 	sw	a1,-44(a0)
    130 	sw	a1,-40(a0)
    131 	sw	a1,-36(a0)
    132 	nop
    133 	nop			# the extra nop instructions help to balance
    134 	nop			# cycles needed for "store" + "fill" + "evict"
    135 	nop			# For 64byte store there are needed 8 fill
    136 	nop			# and 8 evict cycles, i.e. at least 32 instr.
    137 	nop
    138 	nop
    139 	pref	30,0(a0)	# continue setting up the dest, addr 64-0
    140 	sw	a1,-32(a0)
    141 	sw	a1,-28(a0)
    142 	sw	a1,-24(a0)
    143 	sw	a1,-20(a0)
    144 	sw	a1,-16(a0)
    145 	sw	a1,-12(a0)
    146 	sw	a1,-8(a0)
    147 	sw	a1,-4(a0)
    148 	nop
    149 	nop
    150 	nop
    151 	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
    152 	nop			# gives better results for "fast" memory
    153 	nop
    154 	bne	a0,t4,.Lloop16w	# repeat until the end of the "pref 30 safe" chunks
    155 	nop			# delay slot (counts as one of the padding nops)
    156 
    157 	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
    158 	nop			# this "delayed slot" is useless ...
    159 
        # NOTE(review): fewer than 160 bytes remain on every path into this loop
        # (a2 < 160 on the direct branch, 96 + t6 after .Lloop16w), which bounds
        # it at 2 iterations rather than the 3 mentioned below - confirm.
    160 .Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
    161 	addiu	a0,a0,64
    162 	sw	a1,-64(a0)
    163 	sw	a1,-60(a0)
    164 	sw	a1,-56(a0)
    165 	sw	a1,-52(a0)
    166 	sw	a1,-48(a0)
    167 	sw	a1,-44(a0)
    168 	sw	a1,-40(a0)
    169 	sw	a1,-36(a0)
    170 	sw	a1,-32(a0)
    171 	sw	a1,-28(a0)
    172 	sw	a1,-24(a0)
    173 	sw	a1,-20(a0)
    174 	sw	a1,-16(a0)
    175 	sw	a1,-12(a0)
    176 	sw	a1,-8(a0)
    177 	bne	a0,a3,.Lloop16w_nopref30	# repeat until a0 reaches the end of the 64-byte chunks
    178 	sw	a1,-4(a0)	# delay slot: 16th word of the chunk
    179 
    180 .Lchk8w:		# t8 here is the byte count past 64-byte chunks
    181 
    182 	andi	t7,t8,0x1f	# is there a 32-byte chunk?
    183 				# the t7 is the remainder count past 32-bytes
    184 	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
    185 	move	a2,t7		# delay slot: a2 = bytes left after the optional 32-byte chunk
    186 
    187 	sw	a1,0(a0)
    188 	sw	a1,4(a0)
    189 	sw	a1,8(a0)
    190 	sw	a1,12(a0)
    191 	sw	a1,16(a0)
    192 	sw	a1,20(a0)
    193 	sw	a1,24(a0)
    194 	sw	a1,28(a0)
    195 	addiu	a0,a0,32
    196 
    197 .Lchk1w:
    198 	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
    199 	beq	a2,t8,.Llast4	# when a2==t8, no whole words are left
    200 	subu	a3,a2,t8	# delay slot: a3 is the count of bytes in 1w chunks
    201 	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks
    202 
    203 # storing in words (4-byte chunks)
    204 .LwordCopy_loop:
    205 	addiu	a0,a0,4
    206 	bne	a0,a3,.LwordCopy_loop
    207 	sw	a1,-4(a0)	# delay slot: store the word just stepped over
    208 
        # Byte tail.  The addiu at .Llast4l is also the delay slot of the beq at
        # .Llast4, so it executes even when that branch is taken; a0 is not read
        # again after .Llast4e, so the extra increment is harmless.
    209 .Llast4:beq	a0,t0,.Llast4e
    210 .Llast4l:addiu	a0,a0,1
    211 	bne	a0,t0,.Llast4l	# repeat until a0 reaches the "past the end" address
    212 	sb	a1,-1(a0)	# delay slot: store the byte just stepped over
    213 
    214 .Llast4e:
    215 	j	ra		# return; v0 still holds the original a0
    216 	nop			# delay slot
    217 
    218 	.set	at		# allow the assembler to use AT again
    219 	.set	reorder		# allow the assembler to reorder instructions again
    220 
    221 END(memset_cmips)
    222 
    223 
    224 /************************************************************************
    225  *  Implementation : Static functions
    226  ************************************************************************/
    227 
    228