/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: GNU as, AT&T operand order (op src, dst), run through the C
   preprocessor.  Target: 32-bit x86 with SSE2; arguments are read from
   the stack.  */

/* File-local label helper: L(foo) expands to .Lfoo.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's CFI directives.  Each one is
   only defined when the including environment has not supplied it.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY: declare a global function symbol, align it to 16 bytes and
   open its CFI region.  END: close the CFI region and set the symbol
   size.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* %edi is the only register saved by this routine.  PARMS is the
   offset of the first stack argument once %edi has been pushed
   (4 bytes return address + 4 bytes saved %edi).  */
#define ENTRANCE	PUSH (%edi);
#define PARMS		8
/* The CFI_PUSH after `ret' re-establishes the "%edi is on the stack"
   unwind state for the instructions that follow the return in the
   file, which are still reached with %edi pushed.  */
#define RETURN		POP (%edi); ret; CFI_PUSH (%edi);

/* Stack offsets of the three arguments, valid after ENTRANCE.  */
#define STR1	PARMS
#define STR2	STR1+4
#define LEN	STR2+4

/*----------------------------------------------------------------------
   memchr: find the first occurrence of a byte in a memory range.

   C view:  void *memchr (const void *s, int c, size_t n)

   In (stack, after ENTRANCE):
	STR1(%esp) = s	start of the range
	STR2(%esp) = c	only the low byte is compared
	LEN(%esp)  = n	number of bytes to examine
   Out:
	%eax = address of the first byte equal to (unsigned char) c
	       within [s, s + n), or 0 when there is none (or n == 0).
   Saved/restored: %edi.
   Clobbered:      %eax, %ecx, %edx, %xmm0-%xmm4, flags.

   Register roles in the body:
	%edi  = current scan address
	%edx  = remaining length (biased by -64 inside the loops)
	%xmm1 = c broadcast to all 16 byte lanes
	%ecx  = misalignment, or byte offset of the vector being tested
	%eax  = pmovmskb match mask: bit i set <=> byte i matched

   All length arithmetic is unsigned so that any n up to 2^32 - 1 is
   handled.  Whole 16-byte vectors are loaded, so bytes beyond s + n
   may be read, but a match is reported only when it lies inside the
   requested range (see the "case2" exits).
  ----------------------------------------------------------------------*/
	.text
ENTRY (memchr)
	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx
	test	%edx, %edx
	jz	L(return_null)			/* n == 0: nothing to scan */

	/* Broadcast the low byte of c to all 16 lanes of %xmm1.  */
	punpcklbw %xmm1, %xmm1
	mov	%ecx, %edi			/* %edi = s */
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx			/* offset of s in its 64-byte block */
	pshufd	$0, %xmm1, %xmm1
	cmp	$48, %ecx
	ja	L(crosscache)			/* 16 bytes at s would leave the block */

	/* The unaligned 16-byte load stays inside the 64-byte block.  */
	movdqu	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case2_prolog)		/* %edi = s, %edx = n */

	sub	$16, %edx
	jbe	L(return_null)			/* n <= 16 and no match */
	/* Advance to the next 16-byte boundary and give back the bytes
	   that will be examined a second time.  */
	lea	16(%edi), %edi
	and	$15, %ecx
	and	$-16, %edi
	add	%ecx, %edx			/* %edx = s + n - %edi */
	sub	$64, %edx
	jbe	L(exit_loop)			/* at most 64 bytes left */
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* Load the aligned 16-byte chunk containing s and shift the match
	   mask so that bit 0 corresponds to s itself.  */
	and	$15, %ecx			/* %ecx = s & 15 */
	and	$-16, %edi
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	sar	%cl, %eax
	test	%eax, %eax

	jnz	L(match_case2_prolog1)		/* adds %ecx back to %edi */
	/* 16 - (s & 15) bytes have been examined.  Compute
	   n - (16 - (s & 15)) with an unsigned subtract and branch: the
	   previous form, (n - 16) + (s & 15) tested with the signed
	   `jle', treated every n >= 0x80000000 as "no bytes left" and
	   returned 0 without scanning further.  %ecx is dead after this
	   point (both successors zero it).  */
	neg	%ecx
	add	$16, %ecx			/* %ecx = bytes already examined */
	sub	%ecx, %edx			/* %edx = bytes still to examine */
	jbe	L(return_null)			/* n <= bytes already examined */
	lea	16(%edi), %edi
	sub	$64, %edx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* %edi is 16-byte aligned and more than 64 bytes remain.  Test up
	   to two groups of four vectors one at a time; %ecx tracks the
	   offset of the vector under test for L(match_case1).  */
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	48(%edi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	lea	16(%ecx), %ecx
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	lea	64(%edi), %edi
	sub	$64, %edx
	jbe	L(exit_loop)

	/* Second group of four vectors.  */
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	48(%edi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	lea	16(%ecx), %ecx
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(match_case1)

	/* Round %edi down to a 64-byte boundary for the main loop and add
	   the bytes that will be examined again back to the length.  */
	lea	64(%edi), %edi
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx

	.p2align 4
L(align64_loop):
	/* Main loop: 64 bytes per iteration from a 64-byte aligned %edi.  */
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Compare results are 0x00 or 0xff per byte, so an unsigned max
	   merges the four results into %xmm4.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	add	$64, %edi
	pmovmskb %xmm4, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies somewhere in the 64 bytes just tested.  %xmm0 and
	   %xmm2 still hold their own compare results; %xmm3 and %xmm4
	   were overwritten by the merge and are recomputed.  */
	sub	$64, %edi

	pmovmskb %xmm0, %eax
	xor	%ecx, %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	pmovmskb %xmm2, %eax
	lea	16(%ecx), %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	lea	16(%ecx), %ecx
	test	%eax, %eax
	jnz	L(match_case1)

	/* Must be in the last vector; fall through with its mask.  */
	pcmpeqb	48(%edi), %xmm1
	pmovmskb %xmm1, %eax
	lea	16(%ecx), %ecx

	.p2align 4
L(match_case1):
	/* "case1": the whole vector lies inside the range, so the lowest
	   set bit of %eax is the answer and no length check is needed.
	   %edi + %ecx is the address of the vector.  */
	add	%ecx, %edi
	test	%al, %al
	jz	L(match_case1_high)		/* no match in bytes 0..7 */
	mov	%al, %cl
	and	$15, %cl
	jz	L(match_case1_8)		/* no match in bytes 0..3 */
	test	$0x01, %al
	jnz	L(exit_case1_1)
	test	$0x02, %al
	jnz	L(exit_case1_2)
	test	$0x04, %al
	jnz	L(exit_case1_3)
	lea	3(%edi), %eax
	RETURN

	.p2align 4
L(match_case1_8):
	test	$0x10, %al
	jnz	L(exit_case1_5)
	test	$0x20, %al
	jnz	L(exit_case1_6)
	test	$0x40, %al
	jnz	L(exit_case1_7)
	lea	7(%edi), %eax
	RETURN

	.p2align 4
L(match_case1_high):
	mov	%ah, %ch
	and	$15, %ch
	jz	L(match_case1_high_8)		/* no match in bytes 8..11 */
	test	$0x01, %ah
	jnz	L(exit_case1_9)
	test	$0x02, %ah
	jnz	L(exit_case1_10)
	test	$0x04, %ah
	jnz	L(exit_case1_11)
	lea	11(%edi), %eax
	RETURN

	.p2align 4
L(match_case1_high_8):
	test	$0x10, %ah
	jnz	L(exit_case1_13)
	test	$0x20, %ah
	jnz	L(exit_case1_14)
	test	$0x40, %ah
	jnz	L(exit_case1_15)
	lea	15(%edi), %eax
	RETURN

	.p2align 4
L(exit_loop):
	/* Tail: undo the -64 bias so that %edx is the number of bytes left
	   at %edi (1..64).  Each vector is tested in turn; a match goes
	   through the length-checked "case2" exits.  */
	add	$64, %edx

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	xor	%ecx, %ecx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$16, %edx
	jbe	L(return_null)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	lea	16(%ecx), %ecx
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$32, %edx
	jbe	L(return_null)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	lea	16(%ecx), %ecx
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(match_case2)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	48(%edi), %xmm1
	lea	16(%ecx), %ecx
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(match_case2)

	xor	%eax, %eax
	RETURN

	/* "case1" exits: return %edi + index of the matching byte.  */
	.p2align 4
L(exit_case1_1):
	mov	%edi, %eax
	RETURN

	.p2align 4
L(exit_case1_2):
	lea	1(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_3):
	lea	2(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_5):
	lea	4(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_6):
	lea	5(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_7):
	lea	6(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_9):
	lea	8(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_10):
	lea	9(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_11):
	lea	10(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_13):
	lea	12(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_14):
	lea	13(%edi), %eax
	RETURN

	.p2align 4
L(exit_case1_15):
	lea	14(%edi), %eax
	RETURN

	/* "case2": the vector may extend past the end of the range.  Three
	   entry points share the decode below:
	     match_case2          %edx counts from %edi; vector at %edi + %ecx
	     match_case2_prolog1  %edx already counts from %edi + %ecx
	     match_case2_prolog   %edi is already the address for mask bit 0
	   After entry %edx is the number of valid bytes starting at %edi,
	   and a match at index k is accepted only when %edx >= k + 1.  */
	.p2align 4
L(match_case2):
	sub	%ecx, %edx
L(match_case2_prolog1):
	add	%ecx, %edi
L(match_case2_prolog):
	test	%al, %al
	jz	L(match_case2_high)		/* no match in bytes 0..7 */
	mov	%al, %cl
	and	$15, %cl
	jz	L(match_case2_8)		/* no match in bytes 0..3 */
	test	$0x01, %al
	jnz	L(exit_case2_1)
	test	$0x02, %al
	jnz	L(exit_case2_2)
	test	$0x04, %al
	jnz	L(exit_case2_3)
	sub	$4, %edx
	jb	L(return_null)
	lea	3(%edi), %eax
	RETURN

	.p2align 4
L(match_case2_8):
	test	$0x10, %al
	jnz	L(exit_case2_5)
	test	$0x20, %al
	jnz	L(exit_case2_6)
	test	$0x40, %al
	jnz	L(exit_case2_7)
	sub	$8, %edx
	jb	L(return_null)
	lea	7(%edi), %eax
	RETURN

	.p2align 4
L(match_case2_high):
	mov	%ah, %ch
	and	$15, %ch
	jz	L(match_case2_high_8)		/* no match in bytes 8..11 */
	test	$0x01, %ah
	jnz	L(exit_case2_9)
	test	$0x02, %ah
	jnz	L(exit_case2_10)
	test	$0x04, %ah
	jnz	L(exit_case2_11)
	sub	$12, %edx
	jb	L(return_null)
	lea	11(%edi), %eax
	RETURN

	.p2align 4
L(match_case2_high_8):
	test	$0x10, %ah
	jnz	L(exit_case2_13)
	test	$0x20, %ah
	jnz	L(exit_case2_14)
	test	$0x40, %ah
	jnz	L(exit_case2_15)
	sub	$16, %edx
	jb	L(return_null)
	lea	15(%edi), %eax
	RETURN

	/* "case2" exits: index k is returned only when at least k + 1 bytes
	   are valid.  Index 0 needs no check because every path that gets
	   here has at least one valid byte at %edi.  */
	.p2align 4
L(exit_case2_1):
	mov	%edi, %eax
	RETURN

	.p2align 4
L(exit_case2_2):
	sub	$2, %edx
	jb	L(return_null)
	lea	1(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_3):
	sub	$3, %edx
	jb	L(return_null)
	lea	2(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_5):
	sub	$5, %edx
	jb	L(return_null)
	lea	4(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_6):
	sub	$6, %edx
	jb	L(return_null)
	lea	5(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_7):
	sub	$7, %edx
	jb	L(return_null)
	lea	6(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_9):
	sub	$9, %edx
	jb	L(return_null)
	lea	8(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_10):
	sub	$10, %edx
	jb	L(return_null)
	lea	9(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_11):
	sub	$11, %edx
	jb	L(return_null)
	lea	10(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_13):
	sub	$13, %edx
	jb	L(return_null)
	lea	12(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_14):
	sub	$14, %edx
	jb	L(return_null)
	lea	13(%edi), %eax
	RETURN

	.p2align 4
L(exit_case2_15):
	sub	$15, %edx
	jb	L(return_null)
	lea	14(%edi), %eax
	RETURN

	.p2align 4
L(return_null):
	xor	%eax, %eax			/* no match: return 0 */
	RETURN
END (memchr)