/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
     30 
     31 #ifndef L
     32 # define L(label)	.L##label
     33 #endif
     34 
     35 #ifndef cfi_startproc
     36 # define cfi_startproc	.cfi_startproc
     37 #endif
     38 
     39 #ifndef cfi_endproc
     40 # define cfi_endproc	.cfi_endproc
     41 #endif
     42 
     43 #ifndef cfi_rel_offset
     44 # define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
     45 #endif
     46 
     47 #ifndef cfi_restore
     48 # define cfi_restore(reg)	.cfi_restore reg
     49 #endif
     50 
     51 #ifndef cfi_adjust_cfa_offset
     52 # define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
     53 #endif
     54 
     55 #ifndef ENTRY
     56 # define ENTRY(name)	\
     57 	.type name,  @function;	\
     58 	.globl name;	\
     59 	.p2align 4;	\
     60 name:	\
     61 	cfi_startproc
     62 #endif
     63 
     64 #ifndef END
     65 # define END(name)	\
     66 	cfi_endproc;	\
     67 	.size name,	.-name
     68 #endif
     69 
     70 #define CFI_PUSH(REG)	\
     71 	cfi_adjust_cfa_offset (4);	\
     72 	cfi_rel_offset (REG, 0)
     73 
     74 #define CFI_POP(REG)	\
     75 	cfi_adjust_cfa_offset (-4);	\
     76 	cfi_restore (REG)
     77 
     78 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
     79 #define POP(REG) popl REG; CFI_POP (REG)
     80 
     81 #define ENTRANCE PUSH (%edi);
     82 #define PARMS  8
     83 #define RETURN  POP (%edi); ret; CFI_PUSH (%edi);
     84 
     85 #define STR1  PARMS
     86 #define STR2  STR1+4
     87 #define LEN   STR2+4
     88 
     89 	.text
     90 ENTRY (memchr)
     91 	ENTRANCE
     92 	mov	STR1(%esp), %ecx
     93 	movd	STR2(%esp), %xmm1
     94 	mov	LEN(%esp), %edx
     95 	test	%edx, %edx
     96 	jz	L(return_null)
     97 
     98 	punpcklbw %xmm1, %xmm1
     99 	mov	%ecx, %edi
    100 	punpcklbw %xmm1, %xmm1
    101 
    102 	and	$63, %ecx
    103 	pshufd	$0, %xmm1, %xmm1
    104 	cmp	$48, %ecx
    105 	ja	L(crosscache)
    106 
    107 	movdqu	(%edi), %xmm0
    108 	pcmpeqb	%xmm1, %xmm0
    109 	pmovmskb %xmm0, %eax
    110 	test	%eax, %eax
    111 	jnz	L(match_case2_prolog)
    112 
    113 	sub	$16, %edx
    114 	jbe	L(return_null)
    115 	lea	16(%edi), %edi
    116 	and	$15, %ecx
    117 	and	$-16, %edi
    118 	add	%ecx, %edx
    119 	sub	$64, %edx
    120 	jbe	L(exit_loop)
    121 	jmp	L(loop_prolog)
    122 
    123 	.p2align 4
    124 L(crosscache):
    125 	and	$15, %ecx
    126 	and	$-16, %edi
    127 	movdqa	(%edi), %xmm0
    128 	pcmpeqb	%xmm1, %xmm0
    129 	pmovmskb %xmm0, %eax
    130 	sar	%cl, %eax
    131 	test	%eax, %eax
    132 
    133 	jnz	L(match_case2_prolog1)
    134 	lea	-16(%edx), %edx
    135 	add	%ecx, %edx
    136 	jle	L(return_null)
    137 	lea	16(%edi), %edi
    138 	sub	$64, %edx
    139 	jbe	L(exit_loop)
    140 
    141 	.p2align 4
    142 L(loop_prolog):
    143 	movdqa	(%edi), %xmm0
    144 	pcmpeqb	%xmm1, %xmm0
    145 	xor	%ecx, %ecx
    146 	pmovmskb %xmm0, %eax
    147 	test	%eax, %eax
    148 	jnz	L(match_case1)
    149 
    150 	movdqa	16(%edi), %xmm2
    151 	pcmpeqb	%xmm1, %xmm2
    152 	lea	16(%ecx), %ecx
    153 	pmovmskb %xmm2, %eax
    154 	test	%eax, %eax
    155 	jnz	L(match_case1)
    156 
    157 	movdqa	32(%edi), %xmm3
    158 	pcmpeqb	%xmm1, %xmm3
    159 	lea	16(%ecx), %ecx
    160 	pmovmskb %xmm3, %eax
    161 	test	%eax, %eax
    162 	jnz	L(match_case1)
    163 
    164 	movdqa	48(%edi), %xmm4
    165 	pcmpeqb	%xmm1, %xmm4
    166 	lea	16(%ecx), %ecx
    167 	pmovmskb %xmm4, %eax
    168 	test	%eax, %eax
    169 	jnz	L(match_case1)
    170 
    171 	lea	64(%edi), %edi
    172 	sub	$64, %edx
    173 	jbe	L(exit_loop)
    174 
    175 	movdqa	(%edi), %xmm0
    176 	pcmpeqb	%xmm1, %xmm0
    177 	xor	%ecx, %ecx
    178 	pmovmskb %xmm0, %eax
    179 	test	%eax, %eax
    180 	jnz	L(match_case1)
    181 
    182 	movdqa	16(%edi), %xmm2
    183 	pcmpeqb	%xmm1, %xmm2
    184 	lea	16(%ecx), %ecx
    185 	pmovmskb %xmm2, %eax
    186 	test	%eax, %eax
    187 	jnz	L(match_case1)
    188 
    189 	movdqa	32(%edi), %xmm3
    190 	pcmpeqb	%xmm1, %xmm3
    191 	lea	16(%ecx), %ecx
    192 	pmovmskb %xmm3, %eax
    193 	test	%eax, %eax
    194 	jnz	L(match_case1)
    195 
    196 	movdqa	48(%edi), %xmm4
    197 	pcmpeqb	%xmm1, %xmm4
    198 	lea	16(%ecx), %ecx
    199 	pmovmskb %xmm4, %eax
    200 	test	%eax, %eax
    201 	jnz	L(match_case1)
    202 
    203 	lea	64(%edi), %edi
    204 	mov	%edi, %ecx
    205 	and	$-64, %edi
    206 	and	$63, %ecx
    207 	add	%ecx, %edx
    208 
    209 	.p2align 4
    210 L(align64_loop):
    211 	sub	$64, %edx
    212 	jbe	L(exit_loop)
    213 	movdqa	(%edi), %xmm0
    214 	movdqa	16(%edi), %xmm2
    215 	movdqa	32(%edi), %xmm3
    216 	movdqa	48(%edi), %xmm4
    217 	pcmpeqb	%xmm1, %xmm0
    218 	pcmpeqb	%xmm1, %xmm2
    219 	pcmpeqb	%xmm1, %xmm3
    220 	pcmpeqb	%xmm1, %xmm4
    221 
    222 	pmaxub	%xmm0, %xmm3
    223 	pmaxub	%xmm2, %xmm4
    224 	pmaxub	%xmm3, %xmm4
    225 	add	$64, %edi
    226 	pmovmskb %xmm4, %eax
    227 
    228 	test	%eax, %eax
    229 	jz	L(align64_loop)
    230 
    231 	sub	$64, %edi
    232 
    233 	pmovmskb %xmm0, %eax
    234 	xor	%ecx, %ecx
    235 	test	%eax, %eax
    236 	jnz	L(match_case1)
    237 
    238 	pmovmskb %xmm2, %eax
    239 	lea	16(%ecx), %ecx
    240 	test	%eax, %eax
    241 	jnz	L(match_case1)
    242 
    243 	movdqa	32(%edi), %xmm3
    244 	pcmpeqb	%xmm1, %xmm3
    245 	pmovmskb %xmm3, %eax
    246 	lea	16(%ecx), %ecx
    247 	test	%eax, %eax
    248 	jnz	L(match_case1)
    249 
    250 	pcmpeqb	48(%edi), %xmm1
    251 	pmovmskb %xmm1, %eax
    252 	lea	16(%ecx), %ecx
    253 
    254 	.p2align 4
    255 L(match_case1):
    256 	add	%ecx, %edi
    257 	test	%al, %al
    258 	jz	L(match_case1_high)
    259 	mov	%al, %cl
    260 	and	$15, %cl
    261 	jz	L(match_case1_8)
    262 	test	$0x01, %al
    263 	jnz	L(exit_case1_1)
    264 	test	$0x02, %al
    265 	jnz	L(exit_case1_2)
    266 	test	$0x04, %al
    267 	jnz	L(exit_case1_3)
    268 	lea	3(%edi), %eax
    269 	RETURN
    270 
    271 	.p2align 4
    272 L(match_case1_8):
    273 	test	$0x10, %al
    274 	jnz	L(exit_case1_5)
    275 	test	$0x20, %al
    276 	jnz	L(exit_case1_6)
    277 	test	$0x40, %al
    278 	jnz	L(exit_case1_7)
    279 	lea	7(%edi), %eax
    280 	RETURN
    281 
    282 	.p2align 4
    283 L(match_case1_high):
    284 	mov	%ah, %ch
    285 	and	$15, %ch
    286 	jz	L(match_case1_high_8)
    287 	test	$0x01, %ah
    288 	jnz	L(exit_case1_9)
    289 	test	$0x02, %ah
    290 	jnz	L(exit_case1_10)
    291 	test	$0x04, %ah
    292 	jnz	L(exit_case1_11)
    293 	lea	11(%edi), %eax
    294 	RETURN
    295 
    296 	.p2align 4
    297 L(match_case1_high_8):
    298 	test	$0x10, %ah
    299 	jnz	L(exit_case1_13)
    300 	test	$0x20, %ah
    301 	jnz	L(exit_case1_14)
    302 	test	$0x40, %ah
    303 	jnz	L(exit_case1_15)
    304 	lea	15(%edi), %eax
    305 	RETURN
    306 
    307 	.p2align 4
    308 L(exit_loop):
    309 	add	$64, %edx
    310 
    311 	movdqa	(%edi), %xmm0
    312 	pcmpeqb	%xmm1, %xmm0
    313 	xor	%ecx, %ecx
    314 	pmovmskb %xmm0, %eax
    315 	test	%eax, %eax
    316 	jnz	L(match_case2)
    317 	cmp	$16, %edx
    318 	jbe	L(return_null)
    319 
    320 	movdqa	16(%edi), %xmm2
    321 	pcmpeqb	%xmm1, %xmm2
    322 	lea	16(%ecx), %ecx
    323 	pmovmskb %xmm2, %eax
    324 	test	%eax, %eax
    325 	jnz	L(match_case2)
    326 	cmp	$32, %edx
    327 	jbe	L(return_null)
    328 
    329 	movdqa	32(%edi), %xmm3
    330 	pcmpeqb	%xmm1, %xmm3
    331 	lea	16(%ecx), %ecx
    332 	pmovmskb %xmm3, %eax
    333 	test	%eax, %eax
    334 	jnz	L(match_case2)
    335 	cmp	$48, %edx
    336 	jbe	L(return_null)
    337 
    338 	pcmpeqb	48(%edi), %xmm1
    339 	lea	16(%ecx), %ecx
    340 	pmovmskb %xmm1, %eax
    341 	test	%eax, %eax
    342 	jnz	L(match_case2)
    343 
    344 	xor	%eax, %eax
    345 	RETURN
    346 
    347 	.p2align 4
    348 L(exit_case1_1):
    349 	mov	%edi, %eax
    350 	RETURN
    351 
    352 	.p2align 4
    353 L(exit_case1_2):
    354 	lea	1(%edi), %eax
    355 	RETURN
    356 
    357 	.p2align 4
    358 L(exit_case1_3):
    359 	lea	2(%edi), %eax
    360 	RETURN
    361 
    362 	.p2align 4
    363 L(exit_case1_5):
    364 	lea	4(%edi), %eax
    365 	RETURN
    366 
    367 	.p2align 4
    368 L(exit_case1_6):
    369 	lea	5(%edi), %eax
    370 	RETURN
    371 
    372 	.p2align 4
    373 L(exit_case1_7):
    374 	lea	6(%edi), %eax
    375 	RETURN
    376 
    377 	.p2align 4
    378 L(exit_case1_9):
    379 	lea	8(%edi), %eax
    380 	RETURN
    381 
    382 	.p2align 4
    383 L(exit_case1_10):
    384 	lea	9(%edi), %eax
    385 	RETURN
    386 
    387 	.p2align 4
    388 L(exit_case1_11):
    389 	lea	10(%edi), %eax
    390 	RETURN
    391 
    392 	.p2align 4
    393 L(exit_case1_13):
    394 	lea	12(%edi), %eax
    395 	RETURN
    396 
    397 	.p2align 4
    398 L(exit_case1_14):
    399 	lea	13(%edi), %eax
    400 	RETURN
    401 
    402 	.p2align 4
    403 L(exit_case1_15):
    404 	lea	14(%edi), %eax
    405 	RETURN
    406 
    407 	.p2align 4
    408 L(match_case2):
    409 	sub	%ecx, %edx
    410 L(match_case2_prolog1):
    411 	add	%ecx, %edi
    412 L(match_case2_prolog):
    413 	test	%al, %al
    414 	jz	L(match_case2_high)
    415 	mov	%al, %cl
    416 	and	$15, %cl
    417 	jz	L(match_case2_8)
    418 	test	$0x01, %al
    419 	jnz	L(exit_case2_1)
    420 	test	$0x02, %al
    421 	jnz	L(exit_case2_2)
    422 	test	$0x04, %al
    423 	jnz	L(exit_case2_3)
    424 	sub	$4, %edx
    425 	jb	L(return_null)
    426 	lea	3(%edi), %eax
    427 	RETURN
    428 
    429 	.p2align 4
    430 L(match_case2_8):
    431 	test	$0x10, %al
    432 	jnz	L(exit_case2_5)
    433 	test	$0x20, %al
    434 	jnz	L(exit_case2_6)
    435 	test	$0x40, %al
    436 	jnz	L(exit_case2_7)
    437 	sub	$8, %edx
    438 	jb	L(return_null)
    439 	lea	7(%edi), %eax
    440 	RETURN
    441 
    442 	.p2align 4
    443 L(match_case2_high):
    444 	mov	%ah, %ch
    445 	and	$15, %ch
    446 	jz	L(match_case2_high_8)
    447 	test	$0x01, %ah
    448 	jnz	L(exit_case2_9)
    449 	test	$0x02, %ah
    450 	jnz	L(exit_case2_10)
    451 	test	$0x04, %ah
    452 	jnz	L(exit_case2_11)
    453 	sub	$12, %edx
    454 	jb	L(return_null)
    455 	lea	11(%edi), %eax
    456 	RETURN
    457 
    458 	.p2align 4
    459 L(match_case2_high_8):
    460 	test	$0x10, %ah
    461 	jnz	L(exit_case2_13)
    462 	test	$0x20, %ah
    463 	jnz	L(exit_case2_14)
    464 	test	$0x40, %ah
    465 	jnz	L(exit_case2_15)
    466 	sub	$16, %edx
    467 	jb	L(return_null)
    468 	lea	15(%edi), %eax
    469 	RETURN
    470 
    471 	.p2align 4
    472 L(exit_case2_1):
    473 	mov	%edi, %eax
    474 	RETURN
    475 
    476 	.p2align 4
    477 L(exit_case2_2):
    478 	sub	$2, %edx
    479 	jb	L(return_null)
    480 	lea	1(%edi), %eax
    481 	RETURN
    482 
    483 	.p2align 4
    484 L(exit_case2_3):
    485 	sub	$3, %edx
    486 	jb	L(return_null)
    487 	lea	2(%edi), %eax
    488 	RETURN
    489 
    490 	.p2align 4
    491 L(exit_case2_5):
    492 	sub	$5, %edx
    493 	jb	L(return_null)
    494 	lea	4(%edi), %eax
    495 	RETURN
    496 
    497 	.p2align 4
    498 L(exit_case2_6):
    499 	sub	$6, %edx
    500 	jb	L(return_null)
    501 	lea	5(%edi), %eax
    502 	RETURN
    503 
    504 	.p2align 4
    505 L(exit_case2_7):
    506 	sub	$7, %edx
    507 	jb	L(return_null)
    508 	lea	6(%edi), %eax
    509 	RETURN
    510 
    511 	.p2align 4
    512 L(exit_case2_9):
    513 	sub	$9, %edx
    514 	jb	L(return_null)
    515 	lea	8(%edi), %eax
    516 	RETURN
    517 
    518 	.p2align 4
    519 L(exit_case2_10):
    520 	sub	$10, %edx
    521 	jb	L(return_null)
    522 	lea	9(%edi), %eax
    523 	RETURN
    524 
    525 	.p2align 4
    526 L(exit_case2_11):
    527 	sub	$11, %edx
    528 	jb	L(return_null)
    529 	lea	10(%edi), %eax
    530 	RETURN
    531 
    532 	.p2align 4
    533 L(exit_case2_13):
    534 	sub	$13, %edx
    535 	jb	L(return_null)
    536 	lea	12(%edi), %eax
    537 	RETURN
    538 
    539 	.p2align 4
    540 L(exit_case2_14):
    541 	sub	$14, %edx
    542 	jb	L(return_null)
    543 	lea	13(%edi), %eax
    544 	RETURN
    545 
    546 	.p2align 4
    547 L(exit_case2_15):
    548 	sub	$15, %edx
    549 	jb	L(return_null)
    550 	lea	14(%edi), %eax
    551 	RETURN
    552 	.p2align 4
    553 L(return_null):
    554 	xor	%eax, %eax
    555 	RETURN
    556 END (memchr)
    557