1 /* 2 * Copyright (c) 2009 3 * MIPS Technologies, Inc., California. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
*/

/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * SWHI is the partial-word store used once, at an unaligned start address,
 * to fill the bytes from that address up to the next word boundary.
 * It is only defined when __MIPSEB__ or __MIPSEL__ is defined.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian	*/
#endif

#if defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian	*/
#endif

/*
 * When neither XGPROF nor XPROF is defined, SETUP_GP is redefined as empty.
 * SETUP_GP is not referenced directly in this file; presumably the macros
 * from machine/asm.h expand it -- TODO confirm.
 */
#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * memset_cmips - fill a2 bytes starting at a0 with the low byte of a1.
 *
 * Presumably called with the C memset() argument order (dst, value, count)
 * -- confirm against the callers.
 *
 * In:    a0 = destination address
 *        a1 = fill value; only its low 8 bits end up in memory
 *        a2 = byte count
 * Out:   v0 = the original destination address
 * Uses:  AT, v1, a0-a3, t0, t4-t8 (a0, a1 and a2 are modified).
 *        No stack accesses appear in the body.
 *
 * Outline:
 *   1. Counts below 4 go straight to the byte loop at .Llast4.
 *   2. A non-zero fill byte is replicated into all four bytes of a1.
 *   3. One SWHI store fills the 0-3 bytes needed to word-align a0.
 *   4. 64-byte chunks are stored, first by a loop that issues "pref 30"
 *      (only while at least 96 bytes remain after the chunk), then by a
 *      loop without prefetch.
 *   5. At most one 32-byte chunk, then whole words, then trailing bytes.
 *
 * The body is assembled under ".set noreorder": the instruction following
 * each branch or jump is its delay slot and executes whether or not the
 * branch is taken.  Under ".set noat" the code uses AT explicitly as a
 * scratch register.
 *
 * NOTE(review): the "pref 30" offsets (-32 and 0) and the 96-byte reserve
 * look like they assume a 32-byte cache line -- confirm for the target core.
 */
LEAF(memset_cmips,0)

	.set	noreorder
	.set	noat

	/*
	 * NOTE(review): slti is a signed compare, so a count with bit 31 set
	 * also takes the byte loop -- confirm that this is acceptable.
	 */
	addu	t0,a0,a2		# t0 is the "past the end" address
	slti	AT,a2,4			# is a2 less than 4?
	bne	AT,zero,.Llast4		# if yes, go to last4
	move	v0,a0			# (delay slot) memset returns the dst pointer

	beq	a1,zero,.Lset0		# a zero fill value needs no smearing
	subu	v1,zero,a0		# (delay slot) v1 = -a0, masked to 0..3 at .Lset0

	# smear byte into 32 bit word
#if (__mips==32) && (__mips_isa_rev>=2)
	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
	ins     a1, a1, 16, 16      # Replicate fill byte into word.
#else
	and	a1,0xff			# keep only the fill byte
	sll	AT,a1,8
	or	a1,AT			# a1 = byte replicated into the low half-word
	sll	AT,a1,16
	or	a1,AT			# a1 = byte replicated into the whole word
#endif

	/*
	 * v1 = (-a0) & 3 is the number of bytes from a0 up to the next word
	 * boundary.  When it is non-zero, a single SWHI store fills exactly
	 * those bytes and a0 is advanced to the boundary.
	 */
.Lset0:	andi	v1,v1,0x3		# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	subu	a2,a2,v1		# (delay slot) a2 = bytes left once aligned
	SWHI	a1,0(a0)
	addu	a0,a0,v1

# Here we have the "word-aligned" a0 (until the "last4")
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	subu	a3,a2,t8	# (delay slot) subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	/*
	 * Find out whether there are any 64-byte chunks after which at least
	 * 96 bytes will still be left.  The value "96" is the buffer needed
	 * for the "pref 30,64(a0)" prefetch, which is issued as
	 * "pref 30,0(a0)" after incrementing "a0" by 64.
	 * For "a2" below 160 there is no such "pref 30 safe" 64-byte chunk.
	 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	subu	t7,a2,96	# (delay slot) subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6	# subtract from t7 the remainder
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

	/*
	 * Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line:
	 *	pref	30,0(a0)
	 * Here we are in the region where it is safe to use "pref 30,64(a0)".
	 *
	 * Each iteration advances a0 by 64 first, so every store below uses a
	 * negative offset from the already-incremented pointer.  The loop
	 * ends when a0 reaches t4.
	 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	nop			# (delay slot) the 14th nop of the iteration

	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	nop			# this "delayed slot" is useless ...

	/*
	 * Same 64-byte store pattern without prefetch; runs until a0 reaches
	 * a3, the end of the last whole 64-byte chunk.  The sixteenth store
	 * sits in the delay slot of the loop branch.
	 */
.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	sw	a1,-4(a0)	# (delay slot) last word of the chunk

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# the t7 is the remainder count past 32-bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	move	a2,t7		# (delay slot) a2 = bytes left below 32

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4	# when a2==t8, no whole word is left
	subu	a3,a2,t8	# (delay slot) a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# storing in words (4-byte chunks)
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	sw	a1,-4(a0)	# (delay slot) store the word just stepped over

	/*
	 * Byte tail: store single bytes until a0 reaches t0, the
	 * past-the-end address computed at entry.  The addiu at .Llast4l is
	 * also the delay slot of the beq at .Llast4; the extra increment is
	 * harmless when that branch is taken because a0 is not read again
	 * before the return.
	 */
.Llast4:beq	a0,t0,.Llast4e	# nothing left to store?
.Llast4l:addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	sb	a1,-1(a0)	# (delay slot) store the byte just stepped over

.Llast4e:
	j	ra		# return; v0 still holds the original dst
	nop

	.set	at
	.set	reorder

END(memset_cmips)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/