/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then store the
   remaining words, finally the remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
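
/* For orientation, a rough C sketch of the same phase structure.  This
   is an illustrative reference only (ref_memset and its locals are
   invented names); it does not model the unrolling, predication, or
   store-ahead prefetching of the assembly below.

	#include <stddef.h>
	#include <stdint.h>

	void *ref_memset(void *dest, int value, size_t count)
	{
		unsigned char *d = dest;
		uint64_t v = (uint8_t)value;

		v |= v << 8;  v |= v << 16;  v |= v << 32;	// 8 identical bytes, like mux1 @brcst

		while (count > 0 && ((uintptr_t)d & 15) != 0) {	// head: set bytes up to 16B alignment
			*d++ = (uint8_t)value;  count--;
		}
		while (count >= 128) {				// body: one 128B line per iteration
			for (int i = 0; i < 16; i++)
				((uint64_t *)(void *)d)[i] = v;
			d += 128;  count -= 128;
		}
		while (count >= 8) {				// remaining 8B words
			*(uint64_t *)(void *)d = v;
			d += 8;  count -= 8;
		}
		while (count-- > 0)				// final bytes
			*d++ = (uint8_t)value;
		return dest;
	}
*/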

#include <asm/asmmacro.h>
#undef ret

#define dest		in0
#define value		in1
#define	cnt		in2

#define tmp		r31
#define save_lc		r30
#define ptr0		r29
#define ptr1		r28
#define ptr2		r27
#define ptr3		r26
#define ptr9		r24
#define	loopcnt		r23
#define linecnt		r22
#define bytecnt		r21

#define fvalue		f6

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6			// default register for same-cycle branches
#define p_nz		p7
#define p_zr		p8
#define p_unalgn	p9
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

#define MIN1		15			// 16B alignment mask
#define MIN1P1HALF	8			// (MIN1+1) / 2: midpoint for head stores
#define LINE_SIZE	128			// cache line size in bytes
#define LSIZE_SH	7			// shift amount (log2 of LINE_SIZE)
#define PREF_AHEAD	8			// lines to store ahead of the fill loop

GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]			//
	.save   ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body
} { .mmi
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst		// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		//
} { .mib
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }
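
// Head alignment: bytecnt = 16 - (dest & 15) is the number of bytes
// needed to reach a 16B boundary.  ptr2 starts at the 8B midpoint of
// the destination line; each bit of bytecnt (8, 4, 2, 1) selects one
// naturally aligned conditional store, moving ptr2 down after a store
// and up after a skip, so at most four stores fill the head.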
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value,-4		//
(p_n)	add	ptr2 = 4, ptr2			//
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt			//
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value,-2		//
(p_nn)	add	ptr2 = 2, ptr2			//
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value,-1		//
(p_n)	add	ptr2 = 1, ptr2			//
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value			//
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt			//
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
;; }
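
// L1A (general value): allocate cache lines ahead of the fill loop by
// issuing one stf8 per 128B line, PREF_AHEAD lines deep, then fill
// each line with 8B stores while keeping one such store a line ahead.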

	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		//
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			//
;; }
.pref_l1a:
{ .mib
	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			//
;; }
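// Each .l1ax iteration fills one 128B line: ptr2 and ptr0 run 16B
// apart, and the alternating +8/+24 post-increments interleave their
// fifteen 8B stores; the sixteenth word (offset 0) was already written
// by the line-ahead store through ptr9.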
.l1ax:
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf8 [ptr2] = fvalue, 24
(p_scr)	stf8 [ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
;; }

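// L1B (value == 0): same store-ahead structure as L1A, but each
// stf.spill writes all 16 bytes of f0 (zero), halving the number of
// stores needed per line.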
	TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i   0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp
;; }
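// Each .l1bx iteration fills one 128B line with seven 16B spills
// through ptr2 and ptr0 (16B apart); the first 16B of the line were
// already written by the line-ahead spill through ptr9.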
.l1bx:
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
;; }

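// At most one cache line (cnt <= LINE_SIZE) remains: store 32B chunks
// through two pointers 16B apart, then fall into .store_words for the
// leftovers.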
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i   ar.lc = loopcnt
;; }
	TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

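// 8 <= cnt < 32 here: each bundle below conditionally stores one more
// 8B word and decrements cnt, so one to three words are written.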
{ .mmi
	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

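// Up to 7 bytes remain: bits 2..0 of cnt select a final st4/st2/st1.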
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
;; }

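// Whole job is at most 15 bytes and alignment is unknown (p_y/p_n
// still hold the dest-is-odd test from above): one st1 if dest is odd,
// then up to three rounds of paired st2 stores (two pointers 2B apart,
// each advancing 4B per round), and a final st2/st1 through ptr3,
// which is computed to address the last byte.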
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt
(p_y)	cmp.le  p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)
    363