Home | History | Annotate | Download | only in string
      1 /*
      2 Copyright (c) 2014, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     /* Name of the function emitted by this file.  It defaults to strlen and
        is only defined here when nothing defined it earlier, so the same
        code can be assembled under a different symbol name.  */
     31 #ifndef STRLEN
     32 # define STRLEN strlen
     33 #endif
     34 
     /* L(label) pastes a ".L" prefix onto label; all branch targets inside
        the function are written through this macro.  */
     35 #ifndef L
     36 # define L(label)	.L##label
     37 #endif
     38 
     /* Thin wrappers around the assembler's .cfi_* directives.  Each one is
        defined only if it has not been provided already.  */
     39 #ifndef cfi_startproc
     40 # define cfi_startproc	.cfi_startproc
     41 #endif
     42 
     43 #ifndef cfi_endproc
     44 # define cfi_endproc	.cfi_endproc
     45 #endif
     46 
     47 #ifndef cfi_rel_offset
     48 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     49 #endif
     50 
     51 #ifndef cfi_restore
     52 # define cfi_restore(reg)	.cfi_restore reg
     53 #endif
     54 
     55 #ifndef cfi_adjust_cfa_offset
     56 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     57 #endif
     58 
     /* ENTRY(name): declare name as a global symbol of type function, align
        it to 2^4 = 16 bytes, place the label and open a CFI region.  */
     59 #ifndef ENTRY
     60 # define ENTRY(name)             \
     61 	.type name,  @function;  \
     62 	.globl name;             \
     63 	.p2align 4;              \
     64 name:                            \
     65 	cfi_startproc
     66 #endif
     67 
     /* END(name): close the CFI region and set the symbol size to the
        distance from name to the current location.  */
     68 #ifndef END
     69 # define END(name)               \
     70 	cfi_endproc;             \
     71 	.size name,	.-name
     72 #endif
     73 
     /* CFI bookkeeping for a 4-byte push: the CFA offset grows by 4 and REG
        is recorded as saved at offset 0.  */
     74 #define CFI_PUSH(REG)                   \
     75 	cfi_adjust_cfa_offset (4);      \
     76 	cfi_rel_offset (REG, 0)
     77 
     /* CFI bookkeeping for a 4-byte pop: the CFA offset shrinks by 4 and REG
        is marked as restored.  */
     78 #define CFI_POP(REG)                    \
     79 	cfi_adjust_cfa_offset (-4);     \
     80 	cfi_restore (REG)
     81 
     /* pushl / popl of REG followed by the matching CFI annotation.  */
     82 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
     83 #define POP(REG) popl REG; CFI_POP (REG)
     84 
     85 	.section .text.sse2,"ax",@progbits
        /*
         * STRLEN: return in %eax the number of bytes that precede the first
         * zero byte of the string whose address is loaded from 4(%esp)
         * (presumably the first stack argument of the i386 C calling
         * convention -- confirm against the callers).
         *
         * Register roles:
         *   %edx         original string pointer s; never modified after load
         *   %eax         16-byte aligned scan cursor (the result at the exits)
         *   %ecx         scratch; holds the pmovmskb mask of zero bytes
         *   %edi         temporary mask in L(next) only; saved and restored
         *   %xmm0-%xmm3  compare registers; each is all-zero before a pcmpeqb
         *                and stays all-zero when no zero byte was found
         *   %xmm4-%xmm5  accumulators of the 64-byte loop
         *
         * Every exit computes (cursor - s) + index of the lowest set mask
         * bit + a fixed block offset.
         */
     86 ENTRY (STRLEN)
     87 	mov	4(%esp), %edx
        	/* %ecx = offset of s inside its 64-byte block.  Above 0x30 a
        	   16-byte load starting at s would run past the end of that
        	   block, so the aligned path at L(next) is used instead.  */
     88 	mov	%edx, %ecx
     89 	and	$0x3f, %ecx
     90 	pxor	%xmm0, %xmm0
     91 	cmp	$0x30, %ecx
     92 	ja	L(next)
        	/* Unaligned check of the first 16 bytes starting exactly at s.  */
     93 	movdqu	(%edx), %xmm1
     94 	pcmpeqb	%xmm1, %xmm0
     95 	pmovmskb %xmm0, %ecx
     96 	test	%ecx, %ecx
     97 	jnz	L(exit_less16)
        	/* No zero byte (so %xmm0 is still zero).  Continue from s rounded
        	   down to 16; the block at 16(%eax) may overlap bytes that were
        	   already checked above.  */
     98 	mov	%edx, %eax
     99 	and	$-16, %eax
    100 	jmp	L(align16_start)
    101 L(next):
        	/* Check the aligned 16-byte block containing s and discard matches
        	   that lie before s.  %edi = -1 << %cl; %cl holds
        	   (s & 0x3f) - (s & -16), and since shl only uses the low 5 bits
        	   of %cl the effective shift count is s & 15.  */
    102 	mov	%edx, %eax
    103 	and	$-16, %eax
    104 	PUSH	(%edi)
    105 	pcmpeqb	(%eax), %xmm0
    106 	mov	$-1, %edi
    107 	sub	%eax, %ecx
    108 	shl	%cl, %edi
    109 	pmovmskb %xmm0, %ecx
    110 	and	%edi, %ecx
        	/* popl leaves the flags alone, so jnz still tests the and above.  */
    111 	POP	(%edi)
    112 	jnz	L(exit_unaligned)
        	/* %xmm0 may still flag zero bytes located before s; clear it.  */
    113 	pxor	%xmm0, %xmm0
    114 L(align16_start):
        	/* Unrolled scan: four rounds of four aligned 16-byte compares at
        	   16/32/48/64(%eax).  Between rounds the next compare is issued at
        	   80(%eax) and %eax is then advanced by 64, so that block becomes
        	   16(%eax) and L(exit16) applies to it.  */
    115 	pxor	%xmm1, %xmm1
    116 	pxor	%xmm2, %xmm2
    117 	pxor	%xmm3, %xmm3
    118 	pcmpeqb	16(%eax), %xmm0
    119 	pmovmskb %xmm0, %ecx
    120 	test	%ecx, %ecx
    121 	jnz	L(exit16)
    122 
    123 	pcmpeqb	32(%eax), %xmm1
    124 	pmovmskb %xmm1, %ecx
    125 	test	%ecx, %ecx
    126 	jnz	L(exit32)
    127 
    128 	pcmpeqb	48(%eax), %xmm2
    129 	pmovmskb %xmm2, %ecx
    130 	test	%ecx, %ecx
    131 	jnz	L(exit48)
    132 
    133 	pcmpeqb	64(%eax), %xmm3
    134 	pmovmskb %xmm3, %ecx
    135 	test	%ecx, %ecx
    136 	jnz	L(exit64)
    137 
        	/* Round 2.  */
    138 	pcmpeqb	80(%eax), %xmm0
    139 	add	$64, %eax
    140 	pmovmskb %xmm0, %ecx
    141 	test	%ecx, %ecx
    142 	jnz	L(exit16)
    143 
    144 	pcmpeqb	32(%eax), %xmm1
    145 	pmovmskb %xmm1, %ecx
    146 	test	%ecx, %ecx
    147 	jnz	L(exit32)
    148 
    149 	pcmpeqb	48(%eax), %xmm2
    150 	pmovmskb %xmm2, %ecx
    151 	test	%ecx, %ecx
    152 	jnz	L(exit48)
    153 
    154 	pcmpeqb	64(%eax), %xmm3
    155 	pmovmskb %xmm3, %ecx
    156 	test	%ecx, %ecx
    157 	jnz	L(exit64)
    158 
        	/* Round 3.  */
    159 	pcmpeqb	80(%eax), %xmm0
    160 	add	$64, %eax
    161 	pmovmskb %xmm0, %ecx
    162 	test	%ecx, %ecx
    163 	jnz	L(exit16)
    164 
    165 	pcmpeqb	32(%eax), %xmm1
    166 	pmovmskb %xmm1, %ecx
    167 	test	%ecx, %ecx
    168 	jnz	L(exit32)
    169 
    170 	pcmpeqb	48(%eax), %xmm2
    171 	pmovmskb %xmm2, %ecx
    172 	test	%ecx, %ecx
    173 	jnz	L(exit48)
    174 
    175 	pcmpeqb	64(%eax), %xmm3
    176 	pmovmskb %xmm3, %ecx
    177 	test	%ecx, %ecx
    178 	jnz	L(exit64)
    179 
        	/* Round 4.  */
    180 	pcmpeqb	80(%eax), %xmm0
    181 	add	$64, %eax
    182 	pmovmskb %xmm0, %ecx
    183 	test	%ecx, %ecx
    184 	jnz	L(exit16)
    185 
    186 	pcmpeqb	32(%eax), %xmm1
    187 	pmovmskb %xmm1, %ecx
    188 	test	%ecx, %ecx
    189 	jnz	L(exit32)
    190 
    191 	pcmpeqb	48(%eax), %xmm2
    192 	pmovmskb %xmm2, %ecx
    193 	test	%ecx, %ecx
    194 	jnz	L(exit48)
    195 
    196 	pcmpeqb	64(%eax), %xmm3
    197 	pmovmskb %xmm3, %ecx
    198 	test	%ecx, %ecx
    199 	jnz	L(exit64)
    200 
    201 
        	/* Bring the cursor to a 64-byte boundary before entering the main
        	   loop.  If %eax is already 64-byte aligned the loop starts at
        	   (%eax) and re-reads bytes checked above, which is harmless
        	   because they contain no zero byte.  */
    202 	test	$0x3f, %eax
    203 	jz	L(align64_loop)
    204 
        	/* From here on %eax is moved onto the block that was just compared,
        	   so L(exit) needs no extra block offset.  */
    205 	pcmpeqb	80(%eax), %xmm0
    206 	add	$80, %eax
    207 	pmovmskb %xmm0, %ecx
    208 	test	%ecx, %ecx
    209 	jnz	L(exit)
    210 
    211 	test	$0x3f, %eax
    212 	jz	L(align64_loop)
    213 
    214 	pcmpeqb	16(%eax), %xmm1
    215 	add	$16, %eax
    216 	pmovmskb %xmm1, %ecx
    217 	test	%ecx, %ecx
    218 	jnz	L(exit)
    219 
    220 	test	$0x3f, %eax
    221 	jz	L(align64_loop)
    222 
    223 	pcmpeqb	16(%eax), %xmm2
    224 	add	$16, %eax
    225 	pmovmskb %xmm2, %ecx
    226 	test	%ecx, %ecx
    227 	jnz	L(exit)
    228 
    229 	test	$0x3f, %eax
    230 	jz	L(align64_loop)
    231 
        	/* NOTE(review): %eax is a multiple of 16 throughout, so after the
        	   80 + 16 + 16 byte advance above the preceding test appears to
        	   always branch, which would make this last step unreachable --
        	   confirm before relying on it.  */
    232 	pcmpeqb	16(%eax), %xmm3
    233 	add	$16, %eax
    234 	pmovmskb %xmm3, %ecx
    235 	test	%ecx, %ecx
    236 	jnz	L(exit)
    237 
        	/* Step past the block just checked.  */
    238 	add	$16, %eax
    239 	.p2align 4
    240 L(align64_loop):
        	/* Main loop: 64 bytes per iteration using aligned loads.  The
        	   byte-wise unsigned minimum of the four 16-byte blocks has a zero
        	   lane exactly when one of the blocks has a zero in that lane.
        	   %xmm0 is all-zero here because every earlier pcmpeqb into it
        	   fell through without a match, so the pcmpeqb below tests the
        	   minimum against zero without modifying %xmm0.  */
    241 	movaps	(%eax),	%xmm4
    242 	pminub	16(%eax), 	%xmm4
    243 	movaps	32(%eax), 	%xmm5
    244 	pminub	48(%eax), 	%xmm5
    245 	add	$64, 	%eax
    246 	pminub	%xmm4,	%xmm5
    247 	pcmpeqb	%xmm0,	%xmm5
    248 	pmovmskb %xmm5,	%ecx
    249 	test	%ecx,	%ecx
    250 	jz	L(align64_loop)
    251 
    252 
        	/* A zero byte lies in the 64 bytes just consumed, which now start
        	   at -64(%eax).  Re-test them 16 bytes at a time; after sub $80 the
        	   four quarters sit at 16/32/48/64(%eax), matching the offsets the
        	   shared exit labels add.  */
    253 	pcmpeqb	-64(%eax), %xmm0
    254 	sub	$80, 	%eax
    255 	pmovmskb %xmm0, %ecx
    256 	test	%ecx, %ecx
    257 	jnz	L(exit16)
    258 
    259 	pcmpeqb	32(%eax), %xmm1
    260 	pmovmskb %xmm1, %ecx
    261 	test	%ecx, %ecx
    262 	jnz	L(exit32)
    263 
    264 	pcmpeqb	48(%eax), %xmm2
    265 	pmovmskb %xmm2, %ecx
    266 	test	%ecx, %ecx
    267 	jnz	L(exit48)
    268 
        	/* The first three quarters had no zero byte, so it is in the last
        	   one: no test is needed, and the same arithmetic as L(exit64) is
        	   done inline.  */
    269 	pcmpeqb	64(%eax), %xmm3
    270 	pmovmskb %xmm3, %ecx
    271 	sub	%edx, %eax
    272 	bsf	%ecx, %ecx
    273 	add	%ecx, %eax
    274 	add	$64, %eax
    275 	ret
    276 
        /* Zero byte in the block at (%eax): length = (%eax - s) + bit index.  */
    277 	.p2align 4
    278 L(exit):
    279 	sub	%edx, %eax
    280 	bsf	%ecx, %ecx
    281 	add	%ecx, %eax
    282 	ret
    283 
        /* The mask comes from the unaligned load at s itself, so the index of
           its lowest set bit is the length.  */
    284 L(exit_less16):
    285 	bsf	%ecx, %eax
    286 	ret
    287 
        /* %eax is s rounded down to 16, so %eax - s is minus the misalignment;
           the mask had the bits below s cleared, so the sum is the length.  */
    288 	.p2align 4
    289 L(exit_unaligned):
    290 	sub	%edx, %eax
    291 	bsf	%ecx, %ecx
    292 	add	%ecx, %eax
    293 	ret
    294 
        /* Zero byte in the block at 16(%eax).  */
    295 	.p2align 4
    296 L(exit16):
    297 	sub	%edx, %eax
    298 	bsf	%ecx, %ecx
    299 	add	%ecx, %eax
    300 	add	$16, %eax
    301 	ret
    302 
        /* Zero byte in the block at 32(%eax).  */
    303 	.p2align 4
    304 L(exit32):
    305 	sub	%edx, %eax
    306 	bsf	%ecx, %ecx
    307 	add	%ecx, %eax
    308 	add	$32, %eax
    309 	ret
    310 
        /* Zero byte in the block at 48(%eax).  */
    311 	.p2align 4
    312 L(exit48):
    313 	sub	%edx, %eax
    314 	bsf	%ecx, %ecx
    315 	add	%ecx, %eax
    316 	add	$48, %eax
    317 	ret
    318 
        /* Zero byte in the block at 64(%eax).  */
    319 	.p2align 4
    320 L(exit64):
    321 	sub	%edx, %eax
    322 	bsf	%ecx, %ecx
    323 	add	%ecx, %eax
    324 	add	$64, %eax
    325 	ret
    326 
    327 END (STRLEN)
    328 
    329