Home | History | Annotate | Download | only in x86
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "asm_support_x86.S"
     18 
     19 #define MEMCMP  __memcmp16    /* symbol name used for the routine defined below */
     20 
     21 /* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
        /*
         * The prototype above gives the C-level view of the routine. Note that the
         * symbol actually defined in this file is MEMCMP, i.e. __memcmp16, not
         * memcmp16_compare.
         *
         * count is a number of 16-bit units (the code doubles it to get bytes).
         * The result is 0 when all units are equal; otherwise it is the first
         * differing unit of s0 minus the corresponding unit of s1, both
         * zero-extended to 32 bits.
         */
     22 
     23 #ifndef L
        /*
         * L(name) pastes ".L" in front of the label name. In GAS a ".L" prefix
         * marks a label as local to the object file (it is not exported).
         */
     24 # define L(label)    .L##label
     25 #endif
     26 
     /*
      * Stack push/pop helpers paired with unwind (CFI) bookkeeping.
      *
      * CFI_PUSH(REG): bookkeeping for a 4-byte push of REG - grow the CFA offset
      *                by 4 and record REG as saved at offset 0.
      * CFI_POP(REG):  the inverse - shrink the CFA offset by 4 and mark REG restored.
      * PUSH(REG) / POP(REG): the real pushl / popl followed by the matching
      *                bookkeeping macro.
      *
      * CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET and CFI_RESTORE are not defined in
      * this file (presumably they come from asm_support_x86.S - TODO confirm).
      */
     27 #define CFI_PUSH(REG)    \
     28     CFI_ADJUST_CFA_OFFSET(4);    \
     29     CFI_REL_OFFSET(REG, 0)
     30 
     31 #define CFI_POP(REG)    \
     32     CFI_ADJUST_CFA_OFFSET(-4);    \
     33     CFI_RESTORE(REG)
     34 
     35 #define PUSH(REG)    pushl REG; CFI_PUSH (REG)
     36 #define POP(REG)    popl REG; CFI_POP (REG)
     37 
     38 #define PARMS        4                  /* offset of the first argument from %esp at entry, past the 4-byte return address */
     39 #define BLK1        PARMS               /* first buffer pointer (s0):  4(%esp) */
     40 #define BLK2        BLK1+4              /* second buffer pointer (s1): 8(%esp) */
     41 #define LEN        BLK2+4               /* count of 16-bit units:      12(%esp) */
        /*
         * The offsets above are only valid before anything is pushed; the code
         * reads all three arguments before its first PUSH.
         *
         * RETURN_END: pop %edi, %esi, %ebx (reverse of the push order used by the
         *             48-bytes-or-more path) and return.
         * RETURN:     RETURN_END, then restore and re-remember the saved CFI state
         *             so the code that follows the ret is annotated as still
         *             having the three registers on the stack.
         */
     42 #define RETURN_END    POP (%edi); POP (%esi); POP (%ebx); ret
     43 #define RETURN        RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
     44 
     45 DEFINE_FUNCTION MEMCMP
        /*
         * Entry point. Arguments are read from the stack: s0 at BLK1(%esp),
         * s1 at BLK2(%esp) and the count of 16-bit units at LEN(%esp).
         * The result is returned in %eax.
         *
         * This prologue turns the count into a byte length, sends an empty
         * range to L(zero), and routes lengths of 48 bytes or more to the SSE
         * path. Shorter inputs go directly to the word-by-word tail, which
         * expects %eax/%edx to point one past the END of each buffer, %ecx to
         * hold the byte length, and %ebx to be saved on the stack.
         */
     46     movl       LEN(%esp), %ecx          /* ecx = count in 16-bit units */
     47 
     48     shl        $1, %ecx                 /* ecx = length in bytes */
     49     jz         L(zero)                  /* nothing to compare */
     50 
     51     movl       BLK1(%esp), %eax         /* eax = s0 (movl leaves the flags alone) */
     52     cmp        $48, %ecx
     53     movl       BLK2(%esp), %edx         /* edx = s1; flags from the cmp are still live */
     54     jae        L(48bytesormore)
     55 
     56     PUSH       (%ebx)                   /* the tail code uses %ebx as scratch */
     57     add        %ecx, %edx               /* edx = end of s1 */
     58     add        %ecx, %eax               /* eax = end of s0 */
     59     jmp        L(less48bytes)
     60 
     61     CFI_POP    (%ebx)                   /* bookkeeping only (no popl): the code below is entered before %ebx is pushed */
     62 
     63     .p2align 4
     64 L(zero):
        /* Zero-length input: report "equal". Reached straight from the entry, so nothing has been pushed. */
     65     xor        %eax, %eax
     66     ret
     67 
     68     .p2align 4
     69 L(48bytesormore):
        /*
         * SSE path for inputs of at least 48 bytes.
         * In:  %eax = s0, %edx = s1, %ecx = length in bytes.
         *
         * 1. Save %ebx/%esi/%edi and compare the first 16 bytes with unaligned
         *    loads. On a mismatch go to L(less16bytes) with %edi/%esi pointing
         *    16 bytes past the start and %edx = (equality mask - 0xffff).
         * 2. Otherwise round %edi down to a 16-byte boundary, move %esi back by
         *    the same amount and add that amount to %ecx, so the bytes that get
         *    compared again are accounted for in the length.
         * 3. Dispatch on the remaining misalignment of %esi (0, 2, ..., 14) to
         *    the matching L(shr_N) variant. For N != 0, %esi is rounded down as
         *    well and %edx carries N into the variant.
         *
         * NOTE(review): only even misalignments are tested; any other non-zero
         * value ends up in L(shr_14). This looks like it relies on both
         * pointers being 2-byte aligned - confirm against callers.
         */
     70     PUSH       (%ebx)
     71     PUSH       (%esi)
     72     PUSH       (%edi)
     73     CFI_REMEMBER_STATE
     74     movdqu     (%eax), %xmm3
     75     movdqu     (%edx), %xmm0
     76     movl       %eax, %edi
     77     movl       %edx, %esi
     78     pcmpeqb    %xmm0, %xmm3             /* 0xff in every byte lane that matches */
     79     pmovmskb   %xmm3, %edx              /* edx = 16-bit mask, one bit per byte lane */
     80     lea        16(%edi), %edi
     81 
     82     sub        $0xffff, %edx            /* zero iff all 16 bytes matched */
     83     lea        16(%esi), %esi           /* lea keeps the flags of the sub */
     84     jnz        L(less16bytes)
     85     mov        %edi, %edx
     86     and        $0xf, %edx               /* edx = misalignment of edi */
     87     xor        %edx, %edi               /* edi rounded down to 16 bytes */
     88     sub        %edx, %esi               /* keep esi in step with edi */
     89     add        %edx, %ecx               /* the backed-up bytes count again */
     90     mov        %esi, %edx
     91     and        $0xf, %edx               /* edx = misalignment of esi relative to edi */
     92     jz         L(shr_0)
     93     xor        %edx, %esi               /* esi rounded down; edx (non-zero here) keeps the shift */
     94 
            /*
             * %edx is known to be non-zero at this point (the jz above took the
             * zero case and nothing has written %edx since), so no test against
             * 0 is needed before the dispatch.
             */
     97     cmp        $2, %edx
     98     je         L(shr_2)
     99     cmp        $4, %edx
    100     je         L(shr_4)
    101     cmp        $6, %edx
    102     je         L(shr_6)
    103     cmp        $8, %edx
    104     je         L(shr_8)
    105     cmp        $10, %edx
    106     je         L(shr_10)
    107     cmp        $12, %edx
    108     je         L(shr_12)
    109     jmp        L(shr_14)
    110 
    111     .p2align 4
    112 L(shr_0):
        /*
         * Both streams are 16-byte aligned relative to each other (shift 0).
         * In:  %edi/%esi = aligned positions in s0/s1, %ecx = byte length still
         *      including the 16 bytes already compared by L(48bytesormore).
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared here and the rest is left to the word tail.
         */
    113     cmp        $80, %ecx
    114     jae        L(shr_0_gobble)
    115     lea        -48(%ecx), %ecx          /* drop the first 16 bytes and the 32 compared below */
    116     xor        %eax, %eax               /* eax = shift (0); L(exit) adds it to esi */
    117     movaps     (%esi), %xmm1
    118     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    119     movaps     16(%esi), %xmm2
    120     pcmpeqb    16(%edi), %xmm2
    121     pand       %xmm1, %xmm2             /* lane is 0xff only if equal in both halves */
    122     pmovmskb   %xmm2, %edx
    123     add        $32, %edi
    124     add        $32, %esi
    125     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    126     jnz        L(exit)
    127 
            /* No mismatch: form end pointers for the word tail and drop %edi/%esi. */
    128     lea        (%ecx, %edi,1), %eax
    129     lea        (%ecx, %esi,1), %edx
    130     POP        (%edi)
    131     POP        (%esi)
    132     jmp        L(less48bytes)
    133 
    134     CFI_RESTORE_STATE
    135     CFI_REMEMBER_STATE
    136     .p2align 4
    137 L(shr_0_gobble):
        /*
         * Shift-0 loop for lengths of 80 bytes or more.
         * The loop checks one 32-byte group per iteration while the compare of
         * the following group is already being issued. xmm1 always holds the
         * first-half result of the group whose mask is in %edx, because
         * L(exit) needs it to tell which half mismatched.
         */
    138     lea        -48(%ecx), %ecx
    139     movdqa     (%esi), %xmm0
    140     xor        %eax, %eax               /* eax = shift (0); L(exit) adds it to esi */
    141     pcmpeqb    (%edi), %xmm0
    142     sub        $32, %ecx
    143     movdqa     16(%esi), %xmm2
    144     pcmpeqb    16(%edi), %xmm2
    145 L(shr_0_gobble_loop):
    146     pand       %xmm0, %xmm2
    147     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    148     pmovmskb   %xmm2, %edx
    149     movdqa     %xmm0, %xmm1
    150     movdqa     32(%esi), %xmm0
    151     movdqa     48(%esi), %xmm2
    152     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    153     pcmpeqb    32(%edi), %xmm0
    154     pcmpeqb    48(%edi), %xmm2
    155     lea        32(%edi), %edi           /* lea keeps the flags of the sbb */
    156     lea        32(%esi), %esi
    157     jz         L(shr_0_gobble_loop)
    158 
    159     pand       %xmm0, %xmm2
    160     cmp        $0, %ecx
    161     jge        L(shr_0_gobble_loop_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    162     inc        %edx
    163     add        $32, %ecx
    164 L(shr_0_gobble_loop_next):
    165     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    166     jnz        L(exit)
    167 
            /* Check the group whose compare was already issued inside the loop. */
    168     pmovmskb %xmm2, %edx
    169     movdqa     %xmm0, %xmm1
    170     lea        32(%edi), %edi
    171     lea        32(%esi), %esi
    172     sub        $0xffff, %edx
    173     jnz        L(exit)
            /* No mismatch: form end pointers for the word tail and drop %edi/%esi. */
    174     lea        (%ecx, %edi,1), %eax
    175     lea        (%ecx, %esi,1), %edx
    176     POP        (%edi)
    177     POP        (%esi)
    178     jmp        L(less48bytes)
    179 
    180     CFI_RESTORE_STATE
    181     CFI_REMEMBER_STATE
    182     .p2align 4
    183 L(shr_2):
        /*
         * s1 data sits 2 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 2 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $2 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+2.
         */
    184     cmp        $80, %ecx
    185     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    186     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    187     jae        L(shr_2_gobble)
    188 
    189     movdqa     16(%esi), %xmm1
    190     movdqa     %xmm1, %xmm2
    191     palignr    $2,(%esi), %xmm1
    192     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    193 
    194     movdqa     32(%esi), %xmm3
    195     palignr    $2,%xmm2, %xmm3
    196     pcmpeqb    16(%edi), %xmm3
    197 
    198     pand       %xmm1, %xmm3
    199     pmovmskb   %xmm3, %edx
    200     lea        32(%edi), %edi
    201     lea        32(%esi), %esi
    202     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    203     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    204     lea        (%ecx, %edi,1), %eax
    205     lea        2(%ecx, %esi,1), %edx
    206     POP        (%edi)
    207     POP        (%esi)
    208     jmp        L(less48bytes)
    209 
    210     CFI_RESTORE_STATE
    211     CFI_REMEMBER_STATE
    212     .p2align 4
    213 L(shr_2_gobble):
        /*
         * Shift-2 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_2), %eax = 2).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    214     sub        $32, %ecx
    215     movdqa     16(%esi), %xmm0
    216     palignr    $2,(%esi), %xmm0
    217     pcmpeqb    (%edi), %xmm0
    218 
    219     movdqa     32(%esi), %xmm3
    220     palignr    $2,16(%esi), %xmm3
    221     pcmpeqb    16(%edi), %xmm3
    222 
    223 L(shr_2_gobble_loop):
    224     pand       %xmm0, %xmm3
    225     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    226     pmovmskb   %xmm3, %edx
    227     movdqa     %xmm0, %xmm1
    228 
    229     movdqa     64(%esi), %xmm3
    230     palignr    $2,48(%esi), %xmm3
    231     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    232     movdqa     48(%esi), %xmm0
    233     palignr    $2,32(%esi), %xmm0
    234     pcmpeqb    32(%edi), %xmm0
    235     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    236     pcmpeqb    48(%edi), %xmm3
    237 
    238     lea        32(%edi), %edi
    239     jz         L(shr_2_gobble_loop)
    240     pand       %xmm0, %xmm3
    241 
    242     cmp        $0, %ecx
    243     jge        L(shr_2_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    244     inc        %edx
    245     add        $32, %ecx
    246 L(shr_2_gobble_next):
    247     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    248     jnz        L(exit)
    249 
            /* Check the group whose compare was already issued inside the loop. */
    250     pmovmskb   %xmm3, %edx
    251     movdqa     %xmm0, %xmm1
    252     lea        32(%edi), %edi
    253     lea        32(%esi), %esi
    254     sub        $0xffff, %edx
    255     jnz        L(exit)
    256 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    257     lea        (%ecx, %edi,1), %eax
    258     lea        2(%ecx, %esi,1), %edx
    259     POP        (%edi)
    260     POP        (%esi)
    261     jmp        L(less48bytes)
    262 
    263     CFI_RESTORE_STATE
    264     CFI_REMEMBER_STATE
    265     .p2align 4
    266 L(shr_4):
        /*
         * s1 data sits 4 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 4 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $4 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+4.
         */
    267     cmp        $80, %ecx
    268     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    269     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    270     jae        L(shr_4_gobble)
    271 
    272     movdqa     16(%esi), %xmm1
    273     movdqa     %xmm1, %xmm2
    274     palignr    $4,(%esi), %xmm1
    275     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    276 
    277     movdqa     32(%esi), %xmm3
    278     palignr    $4,%xmm2, %xmm3
    279     pcmpeqb    16(%edi), %xmm3
    280 
    281     pand       %xmm1, %xmm3
    282     pmovmskb   %xmm3, %edx
    283     lea        32(%edi), %edi
    284     lea        32(%esi), %esi
    285     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    286     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    287     lea        (%ecx, %edi,1), %eax
    288     lea        4(%ecx, %esi,1), %edx
    289     POP        (%edi)
    290     POP        (%esi)
    291     jmp        L(less48bytes)
    292 
    293     CFI_RESTORE_STATE
    294     CFI_REMEMBER_STATE
    295     .p2align 4
    296 L(shr_4_gobble):
        /*
         * Shift-4 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_4), %eax = 4).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    297     sub        $32, %ecx
    298     movdqa     16(%esi), %xmm0
    299     palignr    $4,(%esi), %xmm0
    300     pcmpeqb    (%edi), %xmm0
    301 
    302     movdqa     32(%esi), %xmm3
    303     palignr    $4,16(%esi), %xmm3
    304     pcmpeqb    16(%edi), %xmm3
    305 
    306 L(shr_4_gobble_loop):
    307     pand       %xmm0, %xmm3
    308     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    309     pmovmskb   %xmm3, %edx
    310     movdqa     %xmm0, %xmm1
    311 
    312     movdqa     64(%esi), %xmm3
    313     palignr    $4,48(%esi), %xmm3
    314     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    315     movdqa     48(%esi), %xmm0
    316     palignr    $4,32(%esi), %xmm0
    317     pcmpeqb    32(%edi), %xmm0
    318     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    319     pcmpeqb    48(%edi), %xmm3
    320 
    321     lea        32(%edi), %edi
    322     jz         L(shr_4_gobble_loop)
    323     pand       %xmm0, %xmm3
    324 
    325     cmp        $0, %ecx
    326     jge        L(shr_4_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    327     inc        %edx
    328     add        $32, %ecx
    329 L(shr_4_gobble_next):
    330     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    331     jnz        L(exit)
    332 
            /* Check the group whose compare was already issued inside the loop. */
    333     pmovmskb   %xmm3, %edx
    334     movdqa     %xmm0, %xmm1
    335     lea        32(%edi), %edi
    336     lea        32(%esi), %esi
    337     sub        $0xffff, %edx
    338     jnz        L(exit)
    339 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    340     lea        (%ecx, %edi,1), %eax
    341     lea        4(%ecx, %esi,1), %edx
    342     POP        (%edi)
    343     POP        (%esi)
    344     jmp        L(less48bytes)
    345 
    346     CFI_RESTORE_STATE
    347     CFI_REMEMBER_STATE
    348     .p2align 4
    349 L(shr_6):
        /*
         * s1 data sits 6 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 6 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $6 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+6.
         */
    350     cmp        $80, %ecx
    351     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    352     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    353     jae        L(shr_6_gobble)
    354 
    355     movdqa     16(%esi), %xmm1
    356     movdqa     %xmm1, %xmm2
    357     palignr    $6,(%esi), %xmm1
    358     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    359 
    360     movdqa     32(%esi), %xmm3
    361     palignr    $6,%xmm2, %xmm3
    362     pcmpeqb    16(%edi), %xmm3
    363 
    364     pand       %xmm1, %xmm3
    365     pmovmskb   %xmm3, %edx
    366     lea        32(%edi), %edi
    367     lea        32(%esi), %esi
    368     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    369     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    370     lea        (%ecx, %edi,1), %eax
    371     lea        6(%ecx, %esi,1), %edx
    372     POP        (%edi)
    373     POP        (%esi)
    374     jmp        L(less48bytes)
    375 
    376     CFI_RESTORE_STATE
    377     CFI_REMEMBER_STATE
    378     .p2align 4
    379 L(shr_6_gobble):
        /*
         * Shift-6 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_6), %eax = 6).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    380     sub        $32, %ecx
    381     movdqa     16(%esi), %xmm0
    382     palignr    $6,(%esi), %xmm0
    383     pcmpeqb    (%edi), %xmm0
    384 
    385     movdqa     32(%esi), %xmm3
    386     palignr    $6,16(%esi), %xmm3
    387     pcmpeqb    16(%edi), %xmm3
    388 
    389 L(shr_6_gobble_loop):
    390     pand       %xmm0, %xmm3
    391     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    392     pmovmskb   %xmm3, %edx
    393     movdqa     %xmm0, %xmm1
    394 
    395     movdqa     64(%esi), %xmm3
    396     palignr    $6,48(%esi), %xmm3
    397     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    398     movdqa     48(%esi), %xmm0
    399     palignr    $6,32(%esi), %xmm0
    400     pcmpeqb    32(%edi), %xmm0
    401     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    402     pcmpeqb    48(%edi), %xmm3
    403 
    404     lea        32(%edi), %edi
    405     jz         L(shr_6_gobble_loop)
    406     pand       %xmm0, %xmm3
    407 
    408     cmp        $0, %ecx
    409     jge        L(shr_6_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    410     inc        %edx
    411     add        $32, %ecx
    412 L(shr_6_gobble_next):
    413     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    414     jnz        L(exit)
    415 
            /* Check the group whose compare was already issued inside the loop. */
    416     pmovmskb   %xmm3, %edx
    417     movdqa     %xmm0, %xmm1
    418     lea        32(%edi), %edi
    419     lea        32(%esi), %esi
    420     sub        $0xffff, %edx
    421     jnz        L(exit)
    422 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    423     lea        (%ecx, %edi,1), %eax
    424     lea        6(%ecx, %esi,1), %edx
    425     POP        (%edi)
    426     POP        (%esi)
    427     jmp        L(less48bytes)
    428 
    429     CFI_RESTORE_STATE
    430     CFI_REMEMBER_STATE
    431     .p2align 4
    432 L(shr_8):
        /*
         * s1 data sits 8 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 8 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $8 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+8.
         */
    433     cmp        $80, %ecx
    434     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    435     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    436     jae        L(shr_8_gobble)
    437 
    438     movdqa     16(%esi), %xmm1
    439     movdqa     %xmm1, %xmm2
    440     palignr    $8,(%esi), %xmm1
    441     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    442 
    443     movdqa     32(%esi), %xmm3
    444     palignr    $8,%xmm2, %xmm3
    445     pcmpeqb    16(%edi), %xmm3
    446 
    447     pand       %xmm1, %xmm3
    448     pmovmskb   %xmm3, %edx
    449     lea        32(%edi), %edi
    450     lea        32(%esi), %esi
    451     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    452     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    453     lea        (%ecx, %edi,1), %eax
    454     lea        8(%ecx, %esi,1), %edx
    455     POP        (%edi)
    456     POP        (%esi)
    457     jmp        L(less48bytes)
    458 
    459     CFI_RESTORE_STATE
    460     CFI_REMEMBER_STATE
    461     .p2align 4
    462 L(shr_8_gobble):
        /*
         * Shift-8 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_8), %eax = 8).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    463     sub        $32, %ecx
    464     movdqa     16(%esi), %xmm0
    465     palignr    $8,(%esi), %xmm0
    466     pcmpeqb    (%edi), %xmm0
    467 
    468     movdqa     32(%esi), %xmm3
    469     palignr    $8,16(%esi), %xmm3
    470     pcmpeqb    16(%edi), %xmm3
    471 
    472 L(shr_8_gobble_loop):
    473     pand       %xmm0, %xmm3
    474     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    475     pmovmskb   %xmm3, %edx
    476     movdqa     %xmm0, %xmm1
    477 
    478     movdqa     64(%esi), %xmm3
    479     palignr    $8,48(%esi), %xmm3
    480     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    481     movdqa     48(%esi), %xmm0
    482     palignr    $8,32(%esi), %xmm0
    483     pcmpeqb    32(%edi), %xmm0
    484     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    485     pcmpeqb    48(%edi), %xmm3
    486 
    487     lea        32(%edi), %edi
    488     jz         L(shr_8_gobble_loop)
    489     pand       %xmm0, %xmm3
    490 
    491     cmp        $0, %ecx
    492     jge        L(shr_8_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    493     inc        %edx
    494     add        $32, %ecx
    495 L(shr_8_gobble_next):
    496     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    497     jnz        L(exit)
    498 
            /* Check the group whose compare was already issued inside the loop. */
    499     pmovmskb   %xmm3, %edx
    500     movdqa     %xmm0, %xmm1
    501     lea        32(%edi), %edi
    502     lea        32(%esi), %esi
    503     sub        $0xffff, %edx
    504     jnz        L(exit)
    505 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    506     lea        (%ecx, %edi,1), %eax
    507     lea        8(%ecx, %esi,1), %edx
    508     POP        (%edi)
    509     POP        (%esi)
    510     jmp        L(less48bytes)
    511 
    512     CFI_RESTORE_STATE
    513     CFI_REMEMBER_STATE
    514     .p2align 4
    515 L(shr_10):
        /*
         * s1 data sits 10 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 10 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $10 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+10.
         */
    516     cmp        $80, %ecx
    517     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    518     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    519     jae        L(shr_10_gobble)
    520 
    521     movdqa     16(%esi), %xmm1
    522     movdqa     %xmm1, %xmm2
    523     palignr    $10, (%esi), %xmm1
    524     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    525 
    526     movdqa     32(%esi), %xmm3
    527     palignr    $10,%xmm2, %xmm3
    528     pcmpeqb    16(%edi), %xmm3
    529 
    530     pand       %xmm1, %xmm3
    531     pmovmskb   %xmm3, %edx
    532     lea        32(%edi), %edi
    533     lea        32(%esi), %esi
    534     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    535     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    536     lea        (%ecx, %edi,1), %eax
    537     lea        10(%ecx, %esi,1), %edx
    538     POP        (%edi)
    539     POP        (%esi)
    540     jmp        L(less48bytes)
    541 
    542     CFI_RESTORE_STATE
    543     CFI_REMEMBER_STATE
    544     .p2align 4
    545 L(shr_10_gobble):
        /*
         * Shift-10 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_10), %eax = 10).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    546     sub        $32, %ecx
    547     movdqa     16(%esi), %xmm0
    548     palignr    $10, (%esi), %xmm0
    549     pcmpeqb    (%edi), %xmm0
    550 
    551     movdqa     32(%esi), %xmm3
    552     palignr    $10, 16(%esi), %xmm3
    553     pcmpeqb    16(%edi), %xmm3
    554 
    555 L(shr_10_gobble_loop):
    556     pand       %xmm0, %xmm3
    557     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    558     pmovmskb   %xmm3, %edx
    559     movdqa     %xmm0, %xmm1
    560 
    561     movdqa     64(%esi), %xmm3
    562     palignr    $10,48(%esi), %xmm3
    563     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    564     movdqa     48(%esi), %xmm0
    565     palignr    $10,32(%esi), %xmm0
    566     pcmpeqb    32(%edi), %xmm0
    567     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    568     pcmpeqb    48(%edi), %xmm3
    569 
    570     lea        32(%edi), %edi
    571     jz         L(shr_10_gobble_loop)
    572     pand       %xmm0, %xmm3
    573 
    574     cmp        $0, %ecx
    575     jge        L(shr_10_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    576     inc        %edx
    577     add        $32, %ecx
    578 L(shr_10_gobble_next):
    579     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    580     jnz        L(exit)
    581 
            /* Check the group whose compare was already issued inside the loop. */
    582     pmovmskb   %xmm3, %edx
    583     movdqa     %xmm0, %xmm1
    584     lea        32(%edi), %edi
    585     lea        32(%esi), %esi
    586     sub        $0xffff, %edx
    587     jnz        L(exit)
    588 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    589     lea        (%ecx, %edi,1), %eax
    590     lea        10(%ecx, %esi,1), %edx
    591     POP        (%edi)
    592     POP        (%esi)
    593     jmp        L(less48bytes)
    594 
    595     CFI_RESTORE_STATE
    596     CFI_REMEMBER_STATE
    597     .p2align 4
    598 L(shr_12):
        /*
         * s1 data sits 12 bytes past the rounded-down %esi; %edi is aligned.
         * In:  %edx = 12 (the shift), %ecx = byte length still including the
         *      first 16 bytes already compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $12 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+12.
         */
    599     cmp        $80, %ecx
    600     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    601     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    602     jae        L(shr_12_gobble)
    603 
    604     movdqa     16(%esi), %xmm1
    605     movdqa     %xmm1, %xmm2
    606     palignr    $12, (%esi), %xmm1
    607     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    608 
    609     movdqa     32(%esi), %xmm3
    610     palignr    $12, %xmm2, %xmm3
    611     pcmpeqb    16(%edi), %xmm3
    612 
    613     pand       %xmm1, %xmm3
    614     pmovmskb   %xmm3, %edx
    615     lea        32(%edi), %edi
    616     lea        32(%esi), %esi
    617     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    618     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    619     lea        (%ecx, %edi,1), %eax
    620     lea        12(%ecx, %esi,1), %edx
    621     POP        (%edi)
    622     POP        (%esi)
    623     jmp        L(less48bytes)
    624 
    625     CFI_RESTORE_STATE
    626     CFI_REMEMBER_STATE
    627     .p2align 4
    628 L(shr_12_gobble):
        /*
         * Shift-12 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_12), %eax = 12).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    629     sub        $32, %ecx
    630     movdqa     16(%esi), %xmm0
    631     palignr    $12, (%esi), %xmm0
    632     pcmpeqb    (%edi), %xmm0
    633 
    634     movdqa     32(%esi), %xmm3
    635     palignr    $12, 16(%esi), %xmm3
    636     pcmpeqb    16(%edi), %xmm3
    637 
    638 L(shr_12_gobble_loop):
    639     pand       %xmm0, %xmm3
    640     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    641     pmovmskb   %xmm3, %edx
    642     movdqa     %xmm0, %xmm1
    643 
    644     movdqa     64(%esi), %xmm3
    645     palignr    $12,48(%esi), %xmm3
    646     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    647     movdqa     48(%esi), %xmm0
    648     palignr    $12,32(%esi), %xmm0
    649     pcmpeqb    32(%edi), %xmm0
    650     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    651     pcmpeqb    48(%edi), %xmm3
    652 
    653     lea        32(%edi), %edi
    654     jz         L(shr_12_gobble_loop)
    655     pand       %xmm0, %xmm3
    656 
    657     cmp        $0, %ecx
    658     jge        L(shr_12_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    659     inc        %edx
    660     add        $32, %ecx
    661 L(shr_12_gobble_next):
    662     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    663     jnz        L(exit)
    664 
            /* Check the group whose compare was already issued inside the loop. */
    665     pmovmskb   %xmm3, %edx
    666     movdqa     %xmm0, %xmm1
    667     lea        32(%edi), %edi
    668     lea        32(%esi), %esi
    669     sub        $0xffff, %edx
    670     jnz        L(exit)
    671 
            /* No mismatch: form end pointers (s1 gets the shift added back) for the word tail. */
    672     lea        (%ecx, %edi,1), %eax
    673     lea        12(%ecx, %esi,1), %edx
    674     POP        (%edi)
    675     POP        (%esi)
    676     jmp        L(less48bytes)
    677 
    678     CFI_RESTORE_STATE
    679     CFI_REMEMBER_STATE
    680     .p2align 4
    681 L(shr_14):
        /*
         * s1 data is treated as sitting 14 bytes past the rounded-down %esi;
         * %edi is aligned. This label is also the default target of the
         * dispatch in L(48bytesormore), so %edx (copied to %eax below) is
         * whatever shift that dispatch did not match - normally 14.
         * In:  %ecx = byte length still including the first 16 bytes already
         *      compared.
         * Lengths of 80 or more go to the loop variant. Otherwise one 32-byte
         * group is compared: palignr $14 joins two aligned 16-byte loads from
         * %esi into the 16 bytes that start at %esi+14.
         */
    682     cmp        $80, %ecx
    683     lea        -48(%ecx), %ecx          /* lea and mov keep the flags of the cmp for the jae */
    684     mov        %edx, %eax               /* eax = shift; L(exit) adds it to esi */
    685     jae        L(shr_14_gobble)
    686 
    687     movdqa     16(%esi), %xmm1
    688     movdqa     %xmm1, %xmm2
    689     palignr    $14, (%esi), %xmm1
    690     pcmpeqb    (%edi), %xmm1            /* xmm1 = equality of bytes 0..15 (L(exit) reads it) */
    691 
    692     movdqa     32(%esi), %xmm3
    693     palignr    $14, %xmm2, %xmm3
    694     pcmpeqb    16(%edi), %xmm3
    695 
    696     pand       %xmm1, %xmm3
    697     pmovmskb   %xmm3, %edx
    698     lea        32(%edi), %edi
    699     lea        32(%esi), %esi
    700     sub        $0xffff, %edx            /* zero iff all 32 bytes matched */
    701     jnz        L(exit)
            /* No mismatch: form end pointers (s1 gets 14 added back) for the word tail. */
    702     lea        (%ecx, %edi,1), %eax
    703     lea        14(%ecx, %esi,1), %edx
    704     POP        (%edi)
    705     POP        (%esi)
    706     jmp        L(less48bytes)
    707 
    708     CFI_RESTORE_STATE
    709     CFI_REMEMBER_STATE
    710     .p2align 4
    711 L(shr_14_gobble):
        /*
         * Shift-14 loop for lengths of 80 bytes or more (%ecx was already reduced
         * by 48 in L(shr_14), which also set %eax to the shift).
         * One 32-byte group is checked per iteration while the compare of the
         * next group is already being issued. xmm1 keeps the first-half result
         * of the group whose mask is in %edx, for use by L(exit).
         */
    712     sub        $32, %ecx
    713     movdqa     16(%esi), %xmm0
    714     palignr    $14, (%esi), %xmm0
    715     pcmpeqb    (%edi), %xmm0
    716 
    717     movdqa     32(%esi), %xmm3
    718     palignr    $14, 16(%esi), %xmm3
    719     pcmpeqb    16(%edi), %xmm3
    720 
    721 L(shr_14_gobble_loop):
    722     pand       %xmm0, %xmm3
    723     sub        $32, %ecx                /* CF = borrow; consumed by the sbb below */
    724     pmovmskb   %xmm3, %edx
    725     movdqa     %xmm0, %xmm1
    726 
    727     movdqa     64(%esi), %xmm3
    728     palignr    $14,48(%esi), %xmm3
    729     sbb        $0xffff, %edx            /* zero iff all 32 bytes matched and the sub did not borrow */
    730     movdqa     48(%esi), %xmm0
    731     palignr    $14,32(%esi), %xmm0
    732     pcmpeqb    32(%edi), %xmm0
    733     lea        32(%esi), %esi           /* lea and the SSE ops keep the flags of the sbb */
    734     pcmpeqb    48(%edi), %xmm3
    735 
    736     lea        32(%edi), %edi
    737     jz         L(shr_14_gobble_loop)
    738     pand       %xmm0, %xmm3
    739 
    740     cmp        $0, %ecx
    741     jge        L(shr_14_gobble_next)
            /* ecx went negative: cancel the borrow that sbb folded into edx and restore ecx. */
    742     inc        %edx
    743     add        $32, %ecx
    744 L(shr_14_gobble_next):
    745     test       %edx, %edx               /* non-zero: the group just checked has a mismatch */
    746     jnz        L(exit)
    747 
            /* Check the group whose compare was already issued inside the loop. */
    748     pmovmskb   %xmm3, %edx
    749     movdqa     %xmm0, %xmm1
    750     lea        32(%edi), %edi
    751     lea        32(%esi), %esi
    752     sub        $0xffff, %edx
    753     jnz        L(exit)
    754 
            /* No mismatch: form end pointers (s1 gets 14 added back) for the word tail. */
    755     lea        (%ecx, %edi,1), %eax
    756     lea        14(%ecx, %esi,1), %edx
    757     POP        (%edi)
    758     POP        (%esi)
    759     jmp        L(less48bytes)
    760 
    761     CFI_RESTORE_STATE
    762     CFI_REMEMBER_STATE
    763     .p2align 4
    764 L(exit):
        /*
         * A 32-byte group contained a mismatch.
         * In:  %edi/%esi point just past that group (%esi still rounded down),
         *      %eax = shift to add to %esi, xmm1 = byte-equality of the group's
         *      first 16 bytes, %edx = (combined equality mask - 0xffff).
         * If the first half has a mismatch, step both pointers back 16 bytes
         * and use its mask instead. Otherwise %edx already describes the
         * second half, because the first half contributed all-ones to the AND.
         */
    765     pmovmskb   %xmm1, %ebx
    766     sub        $0xffff, %ebx
    767     jz         L(first16bytes)          /* first half fully equal: keep pointers and edx */
    768     lea        -16(%esi), %esi
    769     lea        -16(%edi), %edi
    770     mov        %ebx, %edx
    771 
    772 L(first16bytes):
    773     add        %eax, %esi               /* turn the rounded-down esi into the real s1 position */
    774 L(less16bytes):
        /*
         * In:  -16(%edi) / -16(%esi) address the 16-byte chunk that holds the
         *      mismatch, and the low 16 bits of %edx are (equality mask - 0xffff).
         * Subtracting 0xffff equals adding 1 modulo 2^16, so the carry ripples
         * through the run of low "equal" bits and the lowest set bit of %dx is
         * at the index of the first differing byte. The tests below narrow
         * that index down to one 16-bit word, whose two values are
         * zero-extended and subtracted (s0 word minus s1 word) as the result.
         * Every RETURN pops %edi, %esi, %ebx.
         */
    775     test       %dl, %dl                 /* bits 0..7 clear: difference is in bytes 8..15 */
    776     jz         L(next_four_words)
    777     test       $15, %dl                 /* bits 0..3 clear: difference is in bytes 4..7 */
    778     jz         L(second_two_words)
    779     test       $3, %dl                  /* bits 0..1 clear: difference is in bytes 2..3 */
    780     jz         L(second_word)
    781     movzwl     -16(%edi), %eax
    782     movzwl     -16(%esi), %ebx
    783     subl       %ebx, %eax
    784     RETURN
    785 
    786     .p2align 4
    787 L(second_word):
    788     movzwl     -14(%edi), %eax
    789     movzwl     -14(%esi), %ebx
    790     subl       %ebx, %eax
    791     RETURN
    792 
    793     .p2align 4
    794 L(second_two_words):
    795     test       $63, %dl                 /* bits 0..5 clear: difference is in bytes 6..7 */
    796     jz         L(fourth_word)
    797     movzwl     -12(%edi), %eax
    798     movzwl     -12(%esi), %ebx
    799     subl       %ebx, %eax
    800     RETURN
    801 
    802     .p2align 4
    803 L(fourth_word):
    804     movzwl     -10(%edi), %eax
    805     movzwl     -10(%esi), %ebx
    806     subl       %ebx, %eax
    807     RETURN
    808 
    809     .p2align 4
    810 L(next_four_words):
            /* Same narrowing as above, applied to %dh (bytes 8..15 of the chunk). */
    811     test       $15, %dh
    812     jz         L(fourth_two_words)
    813     test       $3, %dh
    814     jz         L(sixth_word)
    815     movzwl     -8(%edi), %eax
    816     movzwl     -8(%esi), %ebx
    817     subl       %ebx, %eax
    818     RETURN
    819 
    820     .p2align 4
    821 L(sixth_word):
    822     movzwl     -6(%edi), %eax
    823     movzwl     -6(%esi), %ebx
    824     subl       %ebx, %eax
    825     RETURN
    826 
    827     .p2align 4
    828 L(fourth_two_words):
    829     test       $63, %dh
    830     jz         L(eighth_word)
    831     movzwl     -4(%edi), %eax
    832     movzwl     -4(%esi), %ebx
    833     subl       %ebx, %eax
    834     RETURN
    835 
    836     .p2align 4
    837 L(eighth_word):
    838     movzwl     -2(%edi), %eax
    839     movzwl     -2(%esi), %ebx
    840     subl       %ebx, %eax
    841     RETURN
    842 
    843 
    844     CFI_PUSH (%ebx)                     /* bookkeeping only (no pushl). NOTE(review): it follows the state restored by the last RETURN; the code below is entered with only %ebx on the stack - confirm the resulting CFA offset */
    845 
        /*
         * Length dispatch for the word-by-word tail. The entry point is
         * L(less48bytes); the L(moreNbytes) labels are its continuation.
         * In:  %ecx = remaining length in bytes, %eax/%edx = one past the end
         *      of s0/s1, %ebx saved on the stack.
         * Each step first peels off lengths at or above the next multiple of
         * 8, then compares %ecx with the three smaller even values and falls
         * back to the largest one, jumping to the matching L(Nbytes) label.
         */
    846     .p2align 4
    847 L(more8bytes):
    848     cmp        $16, %ecx
    849     jae        L(more16bytes)
    850     cmp        $8, %ecx
    851     je         L(8bytes)
    852     cmp        $10, %ecx
    853     je         L(10bytes)
    854     cmp        $12, %ecx
    855     je         L(12bytes)
    856     jmp        L(14bytes)
    857 
    858     .p2align 4
    859 L(more16bytes):
    860     cmp        $24, %ecx
    861     jae        L(more24bytes)
    862     cmp        $16, %ecx
    863     je         L(16bytes)
    864     cmp        $18, %ecx
    865     je         L(18bytes)
    866     cmp        $20, %ecx
    867     je         L(20bytes)
    868     jmp        L(22bytes)
    869 
    870     .p2align 4
    871 L(more24bytes):
    872     cmp        $32, %ecx
    873     jae        L(more32bytes)
    874     cmp        $24, %ecx
    875     je         L(24bytes)
    876     cmp        $26, %ecx
    877     je         L(26bytes)
    878     cmp        $28, %ecx
    879     je         L(28bytes)
    880     jmp        L(30bytes)
    881 
    882     .p2align 4
    883 L(more32bytes):
    884     cmp        $40, %ecx
    885     jae        L(more40bytes)
    886     cmp        $32, %ecx
    887     je         L(32bytes)
    888     cmp        $34, %ecx
    889     je         L(34bytes)
    890     cmp        $36, %ecx
    891     je         L(36bytes)
    892     jmp        L(38bytes)
    893 
    894     .p2align 4
    895 L(less48bytes):
            /*
             * Entry of the dispatch. Any value below 8 other than 2 or 4 goes
             * to L(6bytes); that includes 0, in which case the last three
             * words before the end pointers are compared again (presumably
             * harmless because the caller already found them equal - TODO
             * confirm).
             */
    896     cmp        $8, %ecx
    897     jae        L(more8bytes)
    898     cmp        $2, %ecx
    899     je         L(2bytes)
    900     cmp        $4, %ecx
    901     je         L(4bytes)
    902     jmp        L(6bytes)
    903 
    904     .p2align 4
    905 L(more40bytes):
            /* Anything not equal to 40, 42 or 44 is handled as 46 bytes. */
    906     cmp        $40, %ecx
    907     je         L(40bytes)
    908     cmp        $42, %ecx
    909     je         L(42bytes)
    910     cmp        $44, %ecx
    911     je         L(44bytes)
    912     jmp        L(46bytes)
    913 
    914     .p2align 4
        /*
         * Word-by-word tail: a fall-through chain entered at L(Nbytes) when N
         * bytes remain.
         * In:  %eax/%edx = one past the end of s0/s1, %ebx saved on the stack.
         * Each step loads the 16-bit words N bytes before the two end pointers,
         * zero-extends them and subtracts (s0 word minus s1 word) into %ecx.
         * A non-zero difference leaves through L(memcmp16_exit), which returns
         * %ecx. The last step computes the difference straight into %eax and
         * returns it, so fully equal inputs return 0.
         */
    915 L(46bytes):
    916     movzwl     -46(%eax), %ecx
    917     movzwl     -46(%edx), %ebx
    918     subl       %ebx, %ecx
    919     jne        L(memcmp16_exit)
    920 L(44bytes):
    921     movzwl     -44(%eax), %ecx
    922     movzwl     -44(%edx), %ebx
    923     subl       %ebx, %ecx
    924     jne        L(memcmp16_exit)
    925 L(42bytes):
    926     movzwl     -42(%eax), %ecx
    927     movzwl     -42(%edx), %ebx
    928     subl       %ebx, %ecx
    929     jne        L(memcmp16_exit)
    930 L(40bytes):
    931     movzwl     -40(%eax), %ecx
    932     movzwl     -40(%edx), %ebx
    933     subl       %ebx, %ecx
    934     jne        L(memcmp16_exit)
    935 L(38bytes):
    936     movzwl     -38(%eax), %ecx
    937     movzwl     -38(%edx), %ebx
    938     subl       %ebx, %ecx
    939     jne        L(memcmp16_exit)
    940 L(36bytes):
    941     movzwl     -36(%eax), %ecx
    942     movzwl     -36(%edx), %ebx
    943     subl       %ebx, %ecx
    944     jne        L(memcmp16_exit)
    945 L(34bytes):
    946     movzwl     -34(%eax), %ecx
    947     movzwl     -34(%edx), %ebx
    948     subl       %ebx, %ecx
    949     jne        L(memcmp16_exit)
    950 L(32bytes):
    951     movzwl     -32(%eax), %ecx
    952     movzwl     -32(%edx), %ebx
    953     subl       %ebx, %ecx
    954     jne        L(memcmp16_exit)
    955 L(30bytes):
    956     movzwl     -30(%eax), %ecx
    957     movzwl     -30(%edx), %ebx
    958     subl       %ebx, %ecx
    959     jne        L(memcmp16_exit)
    960 L(28bytes):
    961     movzwl     -28(%eax), %ecx
    962     movzwl     -28(%edx), %ebx
    963     subl       %ebx, %ecx
    964     jne        L(memcmp16_exit)
    965 L(26bytes):
    966     movzwl     -26(%eax), %ecx
    967     movzwl     -26(%edx), %ebx
    968     subl       %ebx, %ecx
    969     jne        L(memcmp16_exit)
    970 L(24bytes):
    971     movzwl     -24(%eax), %ecx
    972     movzwl     -24(%edx), %ebx
    973     subl       %ebx, %ecx
    974     jne        L(memcmp16_exit)
    975 L(22bytes):
    976     movzwl     -22(%eax), %ecx
    977     movzwl     -22(%edx), %ebx
    978     subl       %ebx, %ecx
    979     jne        L(memcmp16_exit)
    980 L(20bytes):
    981     movzwl     -20(%eax), %ecx
    982     movzwl     -20(%edx), %ebx
    983     subl       %ebx, %ecx
    984     jne        L(memcmp16_exit)
    985 L(18bytes):
    986     movzwl     -18(%eax), %ecx
    987     movzwl     -18(%edx), %ebx
    988     subl       %ebx, %ecx
    989     jne        L(memcmp16_exit)
    990 L(16bytes):
    991     movzwl     -16(%eax), %ecx
    992     movzwl     -16(%edx), %ebx
    993     subl       %ebx, %ecx
    994     jne        L(memcmp16_exit)
    995 L(14bytes):
    996     movzwl     -14(%eax), %ecx
    997     movzwl     -14(%edx), %ebx
    998     subl       %ebx, %ecx
    999     jne        L(memcmp16_exit)
   1000 L(12bytes):
   1001     movzwl     -12(%eax), %ecx
   1002     movzwl     -12(%edx), %ebx
   1003     subl       %ebx, %ecx
   1004     jne        L(memcmp16_exit)
   1005 L(10bytes):
   1006     movzwl     -10(%eax), %ecx
   1007     movzwl     -10(%edx), %ebx
   1008     subl       %ebx, %ecx
   1009     jne        L(memcmp16_exit)
   1010 L(8bytes):
   1011     movzwl     -8(%eax), %ecx
   1012     movzwl     -8(%edx), %ebx
   1013     subl       %ebx, %ecx
   1014     jne        L(memcmp16_exit)
   1015 L(6bytes):
   1016     movzwl     -6(%eax), %ecx
   1017     movzwl     -6(%edx), %ebx
   1018     subl       %ebx, %ecx
   1019     jne        L(memcmp16_exit)
   1020 L(4bytes):
   1021     movzwl     -4(%eax), %ecx
   1022     movzwl     -4(%edx), %ebx
   1023     subl       %ebx, %ecx
   1024     jne        L(memcmp16_exit)
   1025 L(2bytes):
            /* Last word: the difference is the return value whether or not it is zero. */
   1026     movzwl     -2(%eax), %eax
   1027     movzwl     -2(%edx), %ebx
   1028     subl       %ebx, %eax
   1029     POP        (%ebx)
   1030     ret
   1031     CFI_PUSH   (%ebx)                   /* bookkeeping only (no pushl): the code below is entered with %ebx still on the stack */
   1032 
   1033     .p2align 4
   1034 L(memcmp16_exit):
        /*
         * Early exit of the word-by-word tail.
         * In:  %ecx = non-zero difference of the first differing pair of words.
         * Restores %ebx and returns that difference in %eax.
         */
   1035     POP        (%ebx)
   1036     mov        %ecx, %eax
   1037     ret
   1038 END_FUNCTION MEMCMP
   1039