Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
      4 
      ; Declarations of the SSE4.2 string-compare intrinsics referenced by this test.
      ; The pcmpestr* forms take an i32 length after each <16 x i8> operand; the
      ; pcmpistr* forms take only the two vectors. Every form ends with an i8 immediate.
      5 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
      6 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
      7 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
      8 declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
      9 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
     10 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
     11 
     ; Explicit-length compare, vector operands passed by value: returns true when
     ; the pcmpestric128 result (immediate 24) is zero.
     ; The checks expect the lengths in %eax/%edx, one pcmpestri, and the zero test
     ; lowered to setae on the flags it produces.
     12 define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
     13 ; X32-LABEL: pcmpestri_reg_eq_i8:
     14 ; X32:       # %bb.0: # %entry
     15 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
     16 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
     17 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
     18 ; X32-NEXT:    setae %al
     19 ; X32-NEXT:    retl
     20 ;
     21 ; X64-LABEL: pcmpestri_reg_eq_i8:
     22 ; X64:       # %bb.0: # %entry
     23 ; X64-NEXT:    movl %edi, %eax
     24 ; X64-NEXT:    movl %esi, %edx
     25 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
     26 ; X64-NEXT:    setae %al
     27 ; X64-NEXT:    retq
     28 entry:
     29   %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
     30   %result = icmp eq i32 %c, 0
     31   ret i1 %result
     32 }
     33 
     ; Explicit-length compare, vector operands passed by value: returns the i32
     ; result of pcmpestri128 (immediate 24) unchanged.
     ; The checks expect the result to be copied from %ecx into %eax after the pcmpestri.
     34 define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
     35 ; X32-LABEL: pcmpestri_reg_idx_i8:
     36 ; X32:       # %bb.0: # %entry
     37 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
     38 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
     39 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
     40 ; X32-NEXT:    movl %ecx, %eax
     41 ; X32-NEXT:    retl
     42 ;
     43 ; X64-LABEL: pcmpestri_reg_idx_i8:
     44 ; X64:       # %bb.0: # %entry
     45 ; X64-NEXT:    movl %edi, %eax
     46 ; X64-NEXT:    movl %esi, %edx
     47 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
     48 ; X64-NEXT:    movl %ecx, %eax
     49 ; X64-NEXT:    retq
     50 entry:
     51   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
     52   ret i32 %idx
     53 }
     54 
     ; Explicit-length compare followed by a byte difference at the returned index.
     ; If pcmpestri128 (immediate 24) returns 16 the function returns 0; otherwise it
     ; extracts element %idx from both vectors, subtracts rhs from lhs as i8 and
     ; zero-extends the result to i32.
     ; The checks expect both vectors to be stored to the stack and read back with
     ; the index masked by 15.
     55 define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
     56 ; X32-LABEL: pcmpestri_reg_diff_i8:
     57 ; X32:       # %bb.0: # %entry
     58 ; X32-NEXT:    pushl %ebp
     59 ; X32-NEXT:    movl %esp, %ebp
     60 ; X32-NEXT:    andl $-16, %esp
     61 ; X32-NEXT:    subl $48, %esp
     62 ; X32-NEXT:    movl 8(%ebp), %eax
     63 ; X32-NEXT:    movl 12(%ebp), %edx
     64 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
     65 ; X32-NEXT:    cmpl $16, %ecx
     66 ; X32-NEXT:    jne .LBB2_2
     67 ; X32-NEXT:  # %bb.1:
     68 ; X32-NEXT:    xorl %eax, %eax
     69 ; X32-NEXT:    jmp .LBB2_3
     70 ; X32-NEXT:  .LBB2_2: # %compare
     71 ; X32-NEXT:    movdqa %xmm0, (%esp)
     72 ; X32-NEXT:    andl $15, %ecx
     73 ; X32-NEXT:    movb (%esp,%ecx), %al
     74 ; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
     75 ; X32-NEXT:    subb 16(%esp,%ecx), %al
     76 ; X32-NEXT:  .LBB2_3: # %exit
     77 ; X32-NEXT:    movzbl %al, %eax
     78 ; X32-NEXT:    movl %ebp, %esp
     79 ; X32-NEXT:    popl %ebp
     80 ; X32-NEXT:    retl
     81 ;
     82 ; X64-LABEL: pcmpestri_reg_diff_i8:
     83 ; X64:       # %bb.0: # %entry
     84 ; X64-NEXT:    movl %edi, %eax
     85 ; X64-NEXT:    movl %esi, %edx
     86 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
     87 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
     88 ; X64-NEXT:    cmpl $16, %ecx
     89 ; X64-NEXT:    jne .LBB2_2
     90 ; X64-NEXT:  # %bb.1:
     91 ; X64-NEXT:    xorl %eax, %eax
     92 ; X64-NEXT:    movzbl %al, %eax
     93 ; X64-NEXT:    retq
     94 ; X64-NEXT:  .LBB2_2: # %compare
     95 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
     96 ; X64-NEXT:    andl $15, %ecx
     97 ; X64-NEXT:    movb -24(%rsp,%rcx), %al
     98 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
     99 ; X64-NEXT:    subb -40(%rsp,%rcx), %al
    100 ; X64-NEXT:    movzbl %al, %eax
    101 ; X64-NEXT:    retq
    102 entry:
    103   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
    104   %eq = icmp eq i32 %idx, 16
    105   br i1 %eq, label %exit, label %compare
    106 
    107 compare:
    108   %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
    109   %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
    110   %sub = sub i8 %lhs_c, %rhs_c
    111   br label %exit
    112 
    113 exit:
    114   %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
    115   %result_ext = zext i8 %result to i32
    116   ret i32 %result_ext
    117 }
    118 
    ; Explicit-length compare with both operands loaded from memory (align 1):
    ; returns true when the pcmpestric128 result (immediate 24) is zero.
    ; The checks expect lhs to be loaded with movdqu and the rhs load to be used
    ; directly as the memory operand of pcmpestri. On X64 the rhs pointer is copied
    ; from %rdx to %r8 before %edx is overwritten with a length.
    119 define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
    120 ; X32-LABEL: pcmpestri_mem_eq_i8:
    121 ; X32:       # %bb.0: # %entry
    122 ; X32-NEXT:    pushl %esi
    123 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    124 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    125 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    126 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
    127 ; X32-NEXT:    movdqu (%esi), %xmm0
    128 ; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0
    129 ; X32-NEXT:    setae %al
    130 ; X32-NEXT:    popl %esi
    131 ; X32-NEXT:    retl
    132 ;
    133 ; X64-LABEL: pcmpestri_mem_eq_i8:
    134 ; X64:       # %bb.0: # %entry
    135 ; X64-NEXT:    movq %rdx, %r8
    136 ; X64-NEXT:    movdqu (%rdi), %xmm0
    137 ; X64-NEXT:    movl %esi, %eax
    138 ; X64-NEXT:    movl %ecx, %edx
    139 ; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
    140 ; X64-NEXT:    setae %al
    141 ; X64-NEXT:    retq
    142 entry:
    143   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    144   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    145   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    146   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    147   %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
    148   %result = icmp eq i32 %c, 0
    149   ret i1 %result
    150 }
    151 
    ; Explicit-length compare with both operands loaded from memory (align 1):
    ; returns the i32 result of pcmpestri128 (immediate 24) unchanged.
    ; The checks expect the rhs load to be the memory operand of pcmpestri and the
    ; result to be copied from %ecx into %eax.
    152 define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
    153 ; X32-LABEL: pcmpestri_mem_idx_i8:
    154 ; X32:       # %bb.0: # %entry
    155 ; X32-NEXT:    pushl %esi
    156 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    157 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    158 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    159 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
    160 ; X32-NEXT:    movdqu (%esi), %xmm0
    161 ; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0
    162 ; X32-NEXT:    movl %ecx, %eax
    163 ; X32-NEXT:    popl %esi
    164 ; X32-NEXT:    retl
    165 ;
    166 ; X64-LABEL: pcmpestri_mem_idx_i8:
    167 ; X64:       # %bb.0: # %entry
    168 ; X64-NEXT:    movq %rdx, %r8
    169 ; X64-NEXT:    movdqu (%rdi), %xmm0
    170 ; X64-NEXT:    movl %esi, %eax
    171 ; X64-NEXT:    movl %ecx, %edx
    172 ; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
    173 ; X64-NEXT:    movl %ecx, %eax
    174 ; X64-NEXT:    retq
    175 entry:
    176   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    177   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    178   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    179   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    180   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
    181   ret i32 %idx
    182 }
    183 
    ; Memory-operand version of the byte-difference test: loads both vectors
    ; (align 1), runs pcmpestri128 with immediate 24, returns 0 when the index is 16,
    ; and otherwise returns zext(lhs[idx] - rhs[idx]).
    ; Both loaded vectors are also used by the extractelement instructions, and the
    ; checks expect both in registers (two movdqu) with a register-register pcmpestri.
    184 define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
    185 ; X32-LABEL: pcmpestri_mem_diff_i8:
    186 ; X32:       # %bb.0: # %entry
    187 ; X32-NEXT:    pushl %ebp
    188 ; X32-NEXT:    movl %esp, %ebp
    189 ; X32-NEXT:    pushl %esi
    190 ; X32-NEXT:    andl $-16, %esp
    191 ; X32-NEXT:    subl $48, %esp
    192 ; X32-NEXT:    movl 12(%ebp), %eax
    193 ; X32-NEXT:    movl 20(%ebp), %edx
    194 ; X32-NEXT:    movl 16(%ebp), %ecx
    195 ; X32-NEXT:    movl 8(%ebp), %esi
    196 ; X32-NEXT:    movdqu (%esi), %xmm1
    197 ; X32-NEXT:    movdqu (%ecx), %xmm0
    198 ; X32-NEXT:    pcmpestri $24, %xmm0, %xmm1
    199 ; X32-NEXT:    cmpl $16, %ecx
    200 ; X32-NEXT:    jne .LBB5_2
    201 ; X32-NEXT:  # %bb.1:
    202 ; X32-NEXT:    xorl %eax, %eax
    203 ; X32-NEXT:    jmp .LBB5_3
    204 ; X32-NEXT:  .LBB5_2: # %compare
    205 ; X32-NEXT:    movdqa %xmm1, (%esp)
    206 ; X32-NEXT:    andl $15, %ecx
    207 ; X32-NEXT:    movb (%esp,%ecx), %al
    208 ; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
    209 ; X32-NEXT:    subb 16(%esp,%ecx), %al
    210 ; X32-NEXT:  .LBB5_3: # %exit
    211 ; X32-NEXT:    movzbl %al, %eax
    212 ; X32-NEXT:    leal -4(%ebp), %esp
    213 ; X32-NEXT:    popl %esi
    214 ; X32-NEXT:    popl %ebp
    215 ; X32-NEXT:    retl
    216 ;
    217 ; X64-LABEL: pcmpestri_mem_diff_i8:
    218 ; X64:       # %bb.0: # %entry
    219 ; X64-NEXT:    movdqu (%rdi), %xmm1
    220 ; X64-NEXT:    movdqu (%rdx), %xmm0
    221 ; X64-NEXT:    movl %esi, %eax
    222 ; X64-NEXT:    movl %ecx, %edx
    223 ; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
    224 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    225 ; X64-NEXT:    cmpl $16, %ecx
    226 ; X64-NEXT:    jne .LBB5_2
    227 ; X64-NEXT:  # %bb.1:
    228 ; X64-NEXT:    xorl %eax, %eax
    229 ; X64-NEXT:    movzbl %al, %eax
    230 ; X64-NEXT:    retq
    231 ; X64-NEXT:  .LBB5_2: # %compare
    232 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    233 ; X64-NEXT:    andl $15, %ecx
    234 ; X64-NEXT:    movb -24(%rsp,%rcx), %al
    235 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    236 ; X64-NEXT:    subb -40(%rsp,%rcx), %al
    237 ; X64-NEXT:    movzbl %al, %eax
    238 ; X64-NEXT:    retq
    239 entry:
    240   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    241   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    242   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    243   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    244   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
    245   %eq = icmp eq i32 %idx, 16
    246   br i1 %eq, label %exit, label %compare
    247 
    248 compare:
    249   %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
    250   %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
    251   %sub = sub i8 %lhs_c, %rhs_c
    252   br label %exit
    253 
    254 exit:
    255   %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
    256   %result_ext = zext i8 %result to i32
    257   ret i32 %result_ext
    258 }
    259 
    ; <8 x i16> version of the register "eq" test: both operands are bitcast to
    ; <16 x i8> before the pcmpestric128 call, and the result is compared with zero.
    ; NOTE(review): this uses immediate 24, whereas pcmpestri_mem_eq_i16 uses 25 -
    ; confirm the difference is intentional.
    260 define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
    261 ; X32-LABEL: pcmpestri_reg_eq_i16:
    262 ; X32:       # %bb.0: # %entry
    263 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    264 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    265 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
    266 ; X32-NEXT:    setae %al
    267 ; X32-NEXT:    retl
    268 ;
    269 ; X64-LABEL: pcmpestri_reg_eq_i16:
    270 ; X64:       # %bb.0: # %entry
    271 ; X64-NEXT:    movl %edi, %eax
    272 ; X64-NEXT:    movl %esi, %edx
    273 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
    274 ; X64-NEXT:    setae %al
    275 ; X64-NEXT:    retq
    276 entry:
    277   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    278   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    279   %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
    280   %result = icmp eq i32 %c, 0
    281   ret i1 %result
    282 }
    283 
    ; <8 x i16> version of the register "idx" test: both operands are bitcast to
    ; <16 x i8> and the i32 result of pcmpestri128 is returned unchanged.
    ; NOTE(review): this uses immediate 24, whereas pcmpestri_mem_idx_i16 uses 25 -
    ; confirm the difference is intentional.
    284 define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
    285 ; X32-LABEL: pcmpestri_reg_idx_i16:
    286 ; X32:       # %bb.0: # %entry
    287 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    288 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    289 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
    290 ; X32-NEXT:    movl %ecx, %eax
    291 ; X32-NEXT:    retl
    292 ;
    293 ; X64-LABEL: pcmpestri_reg_idx_i16:
    294 ; X64:       # %bb.0: # %entry
    295 ; X64-NEXT:    movl %edi, %eax
    296 ; X64-NEXT:    movl %esi, %edx
    297 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
    298 ; X64-NEXT:    movl %ecx, %eax
    299 ; X64-NEXT:    retq
    300 entry:
    301   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    302   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    303   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
    304   ret i32 %idx
    305 }
    306 
    ; <8 x i16> version of the register "diff" test: operands are bitcast to
    ; <16 x i8> for pcmpestri128; if the result is 16 the function returns 0,
    ; otherwise it returns zext(lhs[idx] - rhs[idx]) using the original <8 x i16>
    ; vectors. The checks expect the index to be masked to the 8-element range
    ; (andl $14 after doubling on X32, andl $7 with scale 2 on X64).
    ; NOTE(review): this uses immediate 24 and a "no match" value of 16 while
    ; indexing 8-element vectors; pcmpestri_mem_diff_i16 uses immediate 25 and 8.
    ; Confirm whether the mismatch is intentional. Changing it requires
    ; regenerating the assertions.
    307 define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
    308 ; X32-LABEL: pcmpestri_reg_diff_i16:
    309 ; X32:       # %bb.0: # %entry
    310 ; X32-NEXT:    pushl %ebp
    311 ; X32-NEXT:    movl %esp, %ebp
    312 ; X32-NEXT:    andl $-16, %esp
    313 ; X32-NEXT:    subl $48, %esp
    314 ; X32-NEXT:    movl 8(%ebp), %eax
    315 ; X32-NEXT:    movl 12(%ebp), %edx
    316 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
    317 ; X32-NEXT:    cmpl $16, %ecx
    318 ; X32-NEXT:    jne .LBB8_2
    319 ; X32-NEXT:  # %bb.1:
    320 ; X32-NEXT:    xorl %eax, %eax
    321 ; X32-NEXT:    jmp .LBB8_3
    322 ; X32-NEXT:  .LBB8_2: # %compare
    323 ; X32-NEXT:    movdqa %xmm0, (%esp)
    324 ; X32-NEXT:    addl %ecx, %ecx
    325 ; X32-NEXT:    andl $14, %ecx
    326 ; X32-NEXT:    movzwl (%esp,%ecx), %eax
    327 ; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
    328 ; X32-NEXT:    subw 16(%esp,%ecx), %ax
    329 ; X32-NEXT:  .LBB8_3: # %exit
    330 ; X32-NEXT:    movzwl %ax, %eax
    331 ; X32-NEXT:    movl %ebp, %esp
    332 ; X32-NEXT:    popl %ebp
    333 ; X32-NEXT:    retl
    334 ;
    335 ; X64-LABEL: pcmpestri_reg_diff_i16:
    336 ; X64:       # %bb.0: # %entry
    337 ; X64-NEXT:    movl %edi, %eax
    338 ; X64-NEXT:    movl %esi, %edx
    339 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
    340 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    341 ; X64-NEXT:    cmpl $16, %ecx
    342 ; X64-NEXT:    jne .LBB8_2
    343 ; X64-NEXT:  # %bb.1:
    344 ; X64-NEXT:    xorl %eax, %eax
    345 ; X64-NEXT:    movzwl %ax, %eax
    346 ; X64-NEXT:    retq
    347 ; X64-NEXT:  .LBB8_2: # %compare
    348 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    349 ; X64-NEXT:    andl $7, %ecx
    350 ; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
    351 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    352 ; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
    353 ; X64-NEXT:    movzwl %ax, %eax
    354 ; X64-NEXT:    retq
    355 entry:
    356   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    357   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    358   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
    359   %eq = icmp eq i32 %idx, 16
    360   br i1 %eq, label %exit, label %compare
    361 
    362 compare:
    363   %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
    364   %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
    365   %sub = sub i16 %lhs_c, %rhs_c
    366   br label %exit
    367 
    368 exit:
    369   %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
    370   %result_ext = zext i16 %result to i32
    371   ret i32 %result_ext
    372 }
    373 
    ; <8 x i16> memory-operand "eq" test: loads both vectors (align 1), bitcasts
    ; them to <16 x i8>, calls pcmpestric128 with immediate 25 and returns true
    ; when the result is zero.
    ; The checks expect the rhs load to be the memory operand of pcmpestri $25.
    374 define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
    375 ; X32-LABEL: pcmpestri_mem_eq_i16:
    376 ; X32:       # %bb.0: # %entry
    377 ; X32-NEXT:    pushl %esi
    378 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    379 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    380 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    381 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
    382 ; X32-NEXT:    movdqu (%esi), %xmm0
    383 ; X32-NEXT:    pcmpestri $25, (%ecx), %xmm0
    384 ; X32-NEXT:    setae %al
    385 ; X32-NEXT:    popl %esi
    386 ; X32-NEXT:    retl
    387 ;
    388 ; X64-LABEL: pcmpestri_mem_eq_i16:
    389 ; X64:       # %bb.0: # %entry
    390 ; X64-NEXT:    movq %rdx, %r8
    391 ; X64-NEXT:    movdqu (%rdi), %xmm0
    392 ; X64-NEXT:    movl %esi, %eax
    393 ; X64-NEXT:    movl %ecx, %edx
    394 ; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
    395 ; X64-NEXT:    setae %al
    396 ; X64-NEXT:    retq
    397 entry:
    398   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    399   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    400   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    401   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
    402   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    403   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    404   %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
    405   %result = icmp eq i32 %c, 0
    406   ret i1 %result
    407 }
    408 
    ; <8 x i16> memory-operand "idx" test: loads both vectors (align 1), bitcasts
    ; them to <16 x i8> and returns the i32 result of pcmpestri128 with immediate 25.
    ; The checks expect the rhs load to be the memory operand of pcmpestri and the
    ; result to be copied from %ecx into %eax.
    409 define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
    410 ; X32-LABEL: pcmpestri_mem_idx_i16:
    411 ; X32:       # %bb.0: # %entry
    412 ; X32-NEXT:    pushl %esi
    413 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    414 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    415 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    416 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
    417 ; X32-NEXT:    movdqu (%esi), %xmm0
    418 ; X32-NEXT:    pcmpestri $25, (%ecx), %xmm0
    419 ; X32-NEXT:    movl %ecx, %eax
    420 ; X32-NEXT:    popl %esi
    421 ; X32-NEXT:    retl
    422 ;
    423 ; X64-LABEL: pcmpestri_mem_idx_i16:
    424 ; X64:       # %bb.0: # %entry
    425 ; X64-NEXT:    movq %rdx, %r8
    426 ; X64-NEXT:    movdqu (%rdi), %xmm0
    427 ; X64-NEXT:    movl %esi, %eax
    428 ; X64-NEXT:    movl %ecx, %edx
    429 ; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
    430 ; X64-NEXT:    movl %ecx, %eax
    431 ; X64-NEXT:    retq
    432 entry:
    433   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    434   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    435   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    436   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
    437   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    438   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    439   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
    440   ret i32 %idx
    441 }
    442 
    ; <8 x i16> memory-operand "diff" test: loads both vectors (align 1), runs
    ; pcmpestri128 with immediate 25 on their <16 x i8> bitcasts, returns 0 when the
    ; index equals 8, and otherwise returns zext(lhs[idx] - rhs[idx]) as i16.
    ; The checks expect both vectors in registers, a compare against 8, and stack
    ; spills indexed by the masked index.
    443 define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
    444 ; X32-LABEL: pcmpestri_mem_diff_i16:
    445 ; X32:       # %bb.0: # %entry
    446 ; X32-NEXT:    pushl %ebp
    447 ; X32-NEXT:    movl %esp, %ebp
    448 ; X32-NEXT:    pushl %esi
    449 ; X32-NEXT:    andl $-16, %esp
    450 ; X32-NEXT:    subl $48, %esp
    451 ; X32-NEXT:    movl 12(%ebp), %eax
    452 ; X32-NEXT:    movl 20(%ebp), %edx
    453 ; X32-NEXT:    movl 16(%ebp), %ecx
    454 ; X32-NEXT:    movl 8(%ebp), %esi
    455 ; X32-NEXT:    movdqu (%esi), %xmm1
    456 ; X32-NEXT:    movdqu (%ecx), %xmm0
    457 ; X32-NEXT:    pcmpestri $25, %xmm0, %xmm1
    458 ; X32-NEXT:    cmpl $8, %ecx
    459 ; X32-NEXT:    jne .LBB11_2
    460 ; X32-NEXT:  # %bb.1:
    461 ; X32-NEXT:    xorl %eax, %eax
    462 ; X32-NEXT:    jmp .LBB11_3
    463 ; X32-NEXT:  .LBB11_2: # %compare
    464 ; X32-NEXT:    movdqa %xmm1, (%esp)
    465 ; X32-NEXT:    addl %ecx, %ecx
    466 ; X32-NEXT:    andl $14, %ecx
    467 ; X32-NEXT:    movzwl (%esp,%ecx), %eax
    468 ; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
    469 ; X32-NEXT:    subw 16(%esp,%ecx), %ax
    470 ; X32-NEXT:  .LBB11_3: # %exit
    471 ; X32-NEXT:    movzwl %ax, %eax
    472 ; X32-NEXT:    leal -4(%ebp), %esp
    473 ; X32-NEXT:    popl %esi
    474 ; X32-NEXT:    popl %ebp
    475 ; X32-NEXT:    retl
    476 ;
    477 ; X64-LABEL: pcmpestri_mem_diff_i16:
    478 ; X64:       # %bb.0: # %entry
    479 ; X64-NEXT:    movdqu (%rdi), %xmm1
    480 ; X64-NEXT:    movdqu (%rdx), %xmm0
    481 ; X64-NEXT:    movl %esi, %eax
    482 ; X64-NEXT:    movl %ecx, %edx
    483 ; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
    484 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    485 ; X64-NEXT:    cmpl $8, %ecx
    486 ; X64-NEXT:    jne .LBB11_2
    487 ; X64-NEXT:  # %bb.1:
    488 ; X64-NEXT:    xorl %eax, %eax
    489 ; X64-NEXT:    movzwl %ax, %eax
    490 ; X64-NEXT:    retq
    491 ; X64-NEXT:  .LBB11_2: # %compare
    492 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    493 ; X64-NEXT:    andl $7, %ecx
    494 ; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
    495 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    496 ; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
    497 ; X64-NEXT:    movzwl %ax, %eax
    498 ; X64-NEXT:    retq
    499 entry:
    500   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    501   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    502   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    503   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
    504   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    505   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    506   %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
    507   %eq = icmp eq i32 %idx, 8
    508   br i1 %eq, label %exit, label %compare
    509 
    510 compare:
    511   %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
    512   %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
    513   %sub = sub i16 %lhs_c, %rhs_c
    514   br label %exit
    515 
    516 exit:
    517   %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
    518   %result_ext = zext i16 %result to i32
    519   ret i32 %result_ext
    520 }
    521 
    ; Implicit-length compare (no length arguments), vector operands passed by
    ; value: returns true when the pcmpistric128 result (immediate 24) is zero.
    ; The checks expect only pcmpistri followed by setae, with no length setup.
    522 define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
    523 ; X32-LABEL: pcmpistri_reg_eq_i8:
    524 ; X32:       # %bb.0: # %entry
    525 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    526 ; X32-NEXT:    setae %al
    527 ; X32-NEXT:    retl
    528 ;
    529 ; X64-LABEL: pcmpistri_reg_eq_i8:
    530 ; X64:       # %bb.0: # %entry
    531 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    532 ; X64-NEXT:    setae %al
    533 ; X64-NEXT:    retq
    534 entry:
    535   %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    536   %result = icmp eq i32 %c, 0
    537   ret i1 %result
    538 }
    539 
    ; Implicit-length compare, vector operands passed by value: returns the i32
    ; result of pcmpistri128 (immediate 24) unchanged.
    ; The checks expect the result to be copied from %ecx into %eax.
    540 define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
    541 ; X32-LABEL: pcmpistri_reg_idx_i8:
    542 ; X32:       # %bb.0: # %entry
    543 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    544 ; X32-NEXT:    movl %ecx, %eax
    545 ; X32-NEXT:    retl
    546 ;
    547 ; X64-LABEL: pcmpistri_reg_idx_i8:
    548 ; X64:       # %bb.0: # %entry
    549 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    550 ; X64-NEXT:    movl %ecx, %eax
    551 ; X64-NEXT:    retq
    552 entry:
    553   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    554   ret i32 %idx
    555 }
    556 
    ; Implicit-length compare followed by a byte difference at the returned index:
    ; returns 0 when pcmpistri128 (immediate 24) yields 16, and otherwise returns
    ; zext(lhs[idx] - rhs[idx]).
    ; In the X32 checks the aligned stack frame is set up only on the %compare path
    ; (after .LBB14_2); the idx == 16 path returns without touching %ebp.
    557 define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
    558 ; X32-LABEL: pcmpistri_reg_diff_i8:
    559 ; X32:       # %bb.0: # %entry
    560 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    561 ; X32-NEXT:    cmpl $16, %ecx
    562 ; X32-NEXT:    jne .LBB14_2
    563 ; X32-NEXT:  # %bb.1:
    564 ; X32-NEXT:    xorl %eax, %eax
    565 ; X32-NEXT:    movzbl %al, %eax
    566 ; X32-NEXT:    retl
    567 ; X32-NEXT:  .LBB14_2: # %compare
    568 ; X32-NEXT:    pushl %ebp
    569 ; X32-NEXT:    movl %esp, %ebp
    570 ; X32-NEXT:    andl $-16, %esp
    571 ; X32-NEXT:    subl $48, %esp
    572 ; X32-NEXT:    movdqa %xmm0, (%esp)
    573 ; X32-NEXT:    andl $15, %ecx
    574 ; X32-NEXT:    movb (%esp,%ecx), %al
    575 ; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
    576 ; X32-NEXT:    subb 16(%esp,%ecx), %al
    577 ; X32-NEXT:    movl %ebp, %esp
    578 ; X32-NEXT:    popl %ebp
    579 ; X32-NEXT:    movzbl %al, %eax
    580 ; X32-NEXT:    retl
    581 ;
    582 ; X64-LABEL: pcmpistri_reg_diff_i8:
    583 ; X64:       # %bb.0: # %entry
    584 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    585 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    586 ; X64-NEXT:    cmpl $16, %ecx
    587 ; X64-NEXT:    jne .LBB14_2
    588 ; X64-NEXT:  # %bb.1:
    589 ; X64-NEXT:    xorl %eax, %eax
    590 ; X64-NEXT:    movzbl %al, %eax
    591 ; X64-NEXT:    retq
    592 ; X64-NEXT:  .LBB14_2: # %compare
    593 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    594 ; X64-NEXT:    andl $15, %ecx
    595 ; X64-NEXT:    movb -24(%rsp,%rcx), %al
    596 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    597 ; X64-NEXT:    subb -40(%rsp,%rcx), %al
    598 ; X64-NEXT:    movzbl %al, %eax
    599 ; X64-NEXT:    retq
    600 entry:
    601   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    602   %eq = icmp eq i32 %idx, 16
    603   br i1 %eq, label %exit, label %compare
    604 
    605 compare:
    606   %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
    607   %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
    608   %sub = sub i8 %lhs_c, %rhs_c
    609   br label %exit
    610 
    611 exit:
    612   %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
    613   %result_ext = zext i8 %result to i32
    614   ret i32 %result_ext
    615 }
    616 
    ; Implicit-length compare with both operands loaded from memory (align 1):
    ; returns true when the pcmpistric128 result (immediate 24) is zero.
    ; The checks expect lhs to be loaded with movdqu and the rhs load to be the
    ; memory operand of pcmpistri.
    617 define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
    618 ; X32-LABEL: pcmpistri_mem_eq_i8:
    619 ; X32:       # %bb.0: # %entry
    620 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    621 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    622 ; X32-NEXT:    movdqu (%ecx), %xmm0
    623 ; X32-NEXT:    pcmpistri $24, (%eax), %xmm0
    624 ; X32-NEXT:    setae %al
    625 ; X32-NEXT:    retl
    626 ;
    627 ; X64-LABEL: pcmpistri_mem_eq_i8:
    628 ; X64:       # %bb.0: # %entry
    629 ; X64-NEXT:    movdqu (%rdi), %xmm0
    630 ; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
    631 ; X64-NEXT:    setae %al
    632 ; X64-NEXT:    retq
    633 entry:
    634   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    635   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    636   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    637   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    638   %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    639   %result = icmp eq i32 %c, 0
    640   ret i1 %result
    641 }
    642 
    ; Implicit-length compare with both operands loaded from memory (align 1):
    ; returns the i32 result of pcmpistri128 (immediate 24) unchanged.
    ; The checks expect the rhs load to be the memory operand of pcmpistri and the
    ; result to be copied from %ecx into %eax.
    643 define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
    644 ; X32-LABEL: pcmpistri_mem_idx_i8:
    645 ; X32:       # %bb.0: # %entry
    646 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    647 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    648 ; X32-NEXT:    movdqu (%ecx), %xmm0
    649 ; X32-NEXT:    pcmpistri $24, (%eax), %xmm0
    650 ; X32-NEXT:    movl %ecx, %eax
    651 ; X32-NEXT:    retl
    652 ;
    653 ; X64-LABEL: pcmpistri_mem_idx_i8:
    654 ; X64:       # %bb.0: # %entry
    655 ; X64-NEXT:    movdqu (%rdi), %xmm0
    656 ; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
    657 ; X64-NEXT:    movl %ecx, %eax
    658 ; X64-NEXT:    retq
    659 entry:
    660   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    661   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    662   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    663   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    664   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    665   ret i32 %idx
    666 }
    667 
    ; Implicit-length, memory-operand version of the byte-difference test: loads
    ; both vectors (align 1), runs pcmpistri128 with immediate 24, returns 0 when
    ; the index is 16, and otherwise returns zext(lhs[idx] - rhs[idx]).
    ; Both loaded vectors are reused by the extractelement instructions, and the
    ; checks expect both in registers with a register-register pcmpistri.
    668 define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
    669 ; X32-LABEL: pcmpistri_mem_diff_i8:
    670 ; X32:       # %bb.0: # %entry
    671 ; X32-NEXT:    pushl %ebp
    672 ; X32-NEXT:    movl %esp, %ebp
    673 ; X32-NEXT:    andl $-16, %esp
    674 ; X32-NEXT:    subl $48, %esp
    675 ; X32-NEXT:    movl 12(%ebp), %eax
    676 ; X32-NEXT:    movl 8(%ebp), %ecx
    677 ; X32-NEXT:    movdqu (%ecx), %xmm1
    678 ; X32-NEXT:    movdqu (%eax), %xmm0
    679 ; X32-NEXT:    pcmpistri $24, %xmm0, %xmm1
    680 ; X32-NEXT:    cmpl $16, %ecx
    681 ; X32-NEXT:    jne .LBB17_2
    682 ; X32-NEXT:  # %bb.1:
    683 ; X32-NEXT:    xorl %eax, %eax
    684 ; X32-NEXT:    jmp .LBB17_3
    685 ; X32-NEXT:  .LBB17_2: # %compare
    686 ; X32-NEXT:    movdqa %xmm1, (%esp)
    687 ; X32-NEXT:    andl $15, %ecx
    688 ; X32-NEXT:    movb (%esp,%ecx), %al
    689 ; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
    690 ; X32-NEXT:    subb 16(%esp,%ecx), %al
    691 ; X32-NEXT:  .LBB17_3: # %exit
    692 ; X32-NEXT:    movzbl %al, %eax
    693 ; X32-NEXT:    movl %ebp, %esp
    694 ; X32-NEXT:    popl %ebp
    695 ; X32-NEXT:    retl
    696 ;
    697 ; X64-LABEL: pcmpistri_mem_diff_i8:
    698 ; X64:       # %bb.0: # %entry
    699 ; X64-NEXT:    movdqu (%rdi), %xmm1
    700 ; X64-NEXT:    movdqu (%rsi), %xmm0
    701 ; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
    702 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    703 ; X64-NEXT:    cmpl $16, %ecx
    704 ; X64-NEXT:    jne .LBB17_2
    705 ; X64-NEXT:  # %bb.1:
    706 ; X64-NEXT:    xorl %eax, %eax
    707 ; X64-NEXT:    movzbl %al, %eax
    708 ; X64-NEXT:    retq
    709 ; X64-NEXT:  .LBB17_2: # %compare
    710 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    711 ; X64-NEXT:    andl $15, %ecx
    712 ; X64-NEXT:    movb -24(%rsp,%rcx), %al
    713 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    714 ; X64-NEXT:    subb -40(%rsp,%rcx), %al
    715 ; X64-NEXT:    movzbl %al, %eax
    716 ; X64-NEXT:    retq
    717 entry:
    718   %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
    719   %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
    720   %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
    721   %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
    722   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
    723   %eq = icmp eq i32 %idx, 16
    724   br i1 %eq, label %exit, label %compare
    725 
    726 compare:
    727   %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
    728   %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
    729   %sub = sub i8 %lhs_c, %rhs_c
    730   br label %exit
    731 
    732 exit:
    733   %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
    734   %result_ext = zext i8 %result to i32
    735   ret i32 %result_ext
    736 }
    737 
    ; <8 x i16> version of the implicit-length register "eq" test: both operands
    ; are bitcast to <16 x i8>, passed to pcmpistric128 with immediate 24, and the
    ; result is compared with zero.
    ; The checks expect the bitcasts to produce no instructions: only pcmpistri
    ; and setae appear.
    738 define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
    739 ; X32-LABEL: pcmpistri_reg_eq_i16:
    740 ; X32:       # %bb.0: # %entry
    741 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    742 ; X32-NEXT:    setae %al
    743 ; X32-NEXT:    retl
    744 ;
    745 ; X64-LABEL: pcmpistri_reg_eq_i16:
    746 ; X64:       # %bb.0: # %entry
    747 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    748 ; X64-NEXT:    setae %al
    749 ; X64-NEXT:    retq
    750 entry:
    751   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    752   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    753   %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
    754   %result = icmp eq i32 %c, 0
    755   ret i1 %result
    756 }
    757 
    ; <8 x i16> version of the implicit-length register "idx" test: both operands
    ; are bitcast to <16 x i8> and the i32 result of pcmpistri128 (immediate 24) is
    ; returned unchanged.
    ; The checks expect only pcmpistri and a copy of %ecx into %eax.
    758 define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
    759 ; X32-LABEL: pcmpistri_reg_idx_i16:
    760 ; X32:       # %bb.0: # %entry
    761 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    762 ; X32-NEXT:    movl %ecx, %eax
    763 ; X32-NEXT:    retl
    764 ;
    765 ; X64-LABEL: pcmpistri_reg_idx_i16:
    766 ; X64:       # %bb.0: # %entry
    767 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    768 ; X64-NEXT:    movl %ecx, %eax
    769 ; X64-NEXT:    retq
    770 entry:
    771   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    772   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    773   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
    774   ret i32 %idx
    775 }
    776 
    777 define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
        ; Register-operand case for word vectors: calls pcmpistri128 with
        ; immediate 24; when the returned index equals 16 the function returns 0,
        ; otherwise it extracts the i16 element at that index from each original
        ; vector, subtracts rhs from lhs and zero-extends the difference to i32.
        ; The check lines expect the variable-index extracts to go through the
        ; stack: both vectors are stored with movdqa and the element is loaded
        ; with the index limited to the 8 words (addl + andl $14 on i686,
        ; andl $7 with a scale of 2 on x86_64).
        ; NOTE(review): immediate 24 and sentinel 16 are the values the
        ; byte-element tests use, whereas pcmpistri_mem_diff_i16 uses 25 and 8 for
        ; the same <8 x i16> element type. Presumably carried over as-is; confirm
        ; before changing, since the check lines would have to be regenerated.
    778 ; X32-LABEL: pcmpistri_reg_diff_i16:
    779 ; X32:       # %bb.0: # %entry
    780 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
    781 ; X32-NEXT:    cmpl $16, %ecx
    782 ; X32-NEXT:    jne .LBB20_2
    783 ; X32-NEXT:  # %bb.1:
    784 ; X32-NEXT:    xorl %eax, %eax
    785 ; X32-NEXT:    movzwl %ax, %eax
    786 ; X32-NEXT:    retl
    787 ; X32-NEXT:  .LBB20_2: # %compare
    788 ; X32-NEXT:    pushl %ebp
    789 ; X32-NEXT:    movl %esp, %ebp
    790 ; X32-NEXT:    andl $-16, %esp
    791 ; X32-NEXT:    subl $48, %esp
    792 ; X32-NEXT:    movdqa %xmm0, (%esp)
    793 ; X32-NEXT:    addl %ecx, %ecx
    794 ; X32-NEXT:    andl $14, %ecx
    795 ; X32-NEXT:    movzwl (%esp,%ecx), %eax
    796 ; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
    797 ; X32-NEXT:    subw 16(%esp,%ecx), %ax
    798 ; X32-NEXT:    movl %ebp, %esp
    799 ; X32-NEXT:    popl %ebp
    800 ; X32-NEXT:    movzwl %ax, %eax
    801 ; X32-NEXT:    retl
    802 ;
    803 ; X64-LABEL: pcmpistri_reg_diff_i16:
    804 ; X64:       # %bb.0: # %entry
    805 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
    806 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    807 ; X64-NEXT:    cmpl $16, %ecx
    808 ; X64-NEXT:    jne .LBB20_2
    809 ; X64-NEXT:  # %bb.1:
    810 ; X64-NEXT:    xorl %eax, %eax
    811 ; X64-NEXT:    movzwl %ax, %eax
    812 ; X64-NEXT:    retq
    813 ; X64-NEXT:  .LBB20_2: # %compare
    814 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    815 ; X64-NEXT:    andl $7, %ecx
    816 ; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
    817 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    818 ; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
    819 ; X64-NEXT:    movzwl %ax, %eax
    820 ; X64-NEXT:    retq
    821 entry:
          ; The intrinsic is declared on <16 x i8>, hence the bitcasts.
    822   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    823   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    824   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
          ; An index of 16 skips the element comparison and yields 0 via the phi.
    825   %eq = icmp eq i32 %idx, 16
    826   br i1 %eq, label %exit, label %compare
    827 
    828 compare:
          ; %idx is a runtime value, so these are variable-index extracts taken
          ; from the original <8 x i16> arguments rather than the bitcast copies.
    829   %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
    830   %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
    831   %sub = sub i16 %lhs_c, %rhs_c
    832   br label %exit
    833 
    834 exit:
          ; 0 when arriving from %entry, the element difference from %compare.
    835   %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
    836   %result_ext = zext i16 %result to i32
    837   ret i32 %result_ext
    838 }
    839 
    840 define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
        ; Memory-operand case for word vectors: both <8 x i16> values are loaded
        ; through the pointer arguments with align 1, bitcast to <16 x i8> and
        ; passed to pcmpistric128 with immediate 25; the i32 result is compared
        ; against 0. The check lines expect one vector to be loaded with movdqu,
        ; the other load to be folded into the pcmpistri memory operand, and the
        ; comparison to become a setae.
        ; NOTE(review): 25 differs from the 24 used by the register tests only in
        ; bit 0, which per the Intel SDM control-byte layout selects word instead
        ; of byte elements - confirm against the instruction reference.
    841 ; X32-LABEL: pcmpistri_mem_eq_i16:
    842 ; X32:       # %bb.0: # %entry
    843 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    844 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    845 ; X32-NEXT:    movdqu (%ecx), %xmm0
    846 ; X32-NEXT:    pcmpistri $25, (%eax), %xmm0
    847 ; X32-NEXT:    setae %al
    848 ; X32-NEXT:    retl
    849 ;
    850 ; X64-LABEL: pcmpistri_mem_eq_i16:
    851 ; X64:       # %bb.0: # %entry
    852 ; X64-NEXT:    movdqu (%rdi), %xmm0
    853 ; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
    854 ; X64-NEXT:    setae %al
    855 ; X64-NEXT:    retq
    856 entry:
          ; Reinterpret the scalar pointers as vector pointers; the loads only
          ; promise 1-byte alignment.
    857   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    858   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    859   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    860   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
          ; The intrinsic is declared on <16 x i8>, hence the bitcasts.
    861   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    862   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    863   %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
    864   %result = icmp eq i32 %c, 0
    865   ret i1 %result
    866 }
    867 
    868 define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
        ; Memory-operand case for word vectors that returns the i32 result of
        ; pcmpistri128 (immediate 25) unchanged. Both <8 x i16> values are loaded
        ; with align 1. The check lines expect one movdqu, the other load folded
        ; into the pcmpistri memory operand, and a copy of %ecx into %eax.
    869 ; X32-LABEL: pcmpistri_mem_idx_i16:
    870 ; X32:       # %bb.0: # %entry
    871 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    872 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    873 ; X32-NEXT:    movdqu (%ecx), %xmm0
    874 ; X32-NEXT:    pcmpistri $25, (%eax), %xmm0
    875 ; X32-NEXT:    movl %ecx, %eax
    876 ; X32-NEXT:    retl
    877 ;
    878 ; X64-LABEL: pcmpistri_mem_idx_i16:
    879 ; X64:       # %bb.0: # %entry
    880 ; X64-NEXT:    movdqu (%rdi), %xmm0
    881 ; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
    882 ; X64-NEXT:    movl %ecx, %eax
    883 ; X64-NEXT:    retq
    884 entry:
          ; Reinterpret the scalar pointers as vector pointers; the loads only
          ; promise 1-byte alignment.
    885   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    886   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    887   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    888   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
          ; The intrinsic is declared on <16 x i8>, hence the bitcasts.
    889   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    890   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    891   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
    892   ret i32 %idx
    893 }
    894 
    895 define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
        ; Memory-operand case for word vectors: loads both <8 x i16> values with
        ; align 1 and calls pcmpistri128 with immediate 25; when the returned
        ; index equals 8 the function returns 0, otherwise it extracts the i16
        ; element at that index from each loaded vector, subtracts rhs from lhs
        ; and zero-extends the difference to i32.
        ; The check lines expect both vectors to be loaded into registers with
        ; movdqu and pcmpistri to use register operands (no folded load), then the
        ; variable-index extracts to go through stack copies of the vectors.
        ; NOTE(review): the loads presumably stay unfolded because both vectors
        ; are needed again for the extracts - confirm.
    896 ; X32-LABEL: pcmpistri_mem_diff_i16:
    897 ; X32:       # %bb.0: # %entry
    898 ; X32-NEXT:    pushl %ebp
    899 ; X32-NEXT:    movl %esp, %ebp
    900 ; X32-NEXT:    andl $-16, %esp
    901 ; X32-NEXT:    subl $48, %esp
    902 ; X32-NEXT:    movl 12(%ebp), %eax
    903 ; X32-NEXT:    movl 8(%ebp), %ecx
    904 ; X32-NEXT:    movdqu (%ecx), %xmm1
    905 ; X32-NEXT:    movdqu (%eax), %xmm0
    906 ; X32-NEXT:    pcmpistri $25, %xmm0, %xmm1
    907 ; X32-NEXT:    cmpl $8, %ecx
    908 ; X32-NEXT:    jne .LBB23_2
    909 ; X32-NEXT:  # %bb.1:
    910 ; X32-NEXT:    xorl %eax, %eax
    911 ; X32-NEXT:    jmp .LBB23_3
    912 ; X32-NEXT:  .LBB23_2: # %compare
    913 ; X32-NEXT:    movdqa %xmm1, (%esp)
    914 ; X32-NEXT:    addl %ecx, %ecx
    915 ; X32-NEXT:    andl $14, %ecx
    916 ; X32-NEXT:    movzwl (%esp,%ecx), %eax
    917 ; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
    918 ; X32-NEXT:    subw 16(%esp,%ecx), %ax
    919 ; X32-NEXT:  .LBB23_3: # %exit
    920 ; X32-NEXT:    movzwl %ax, %eax
    921 ; X32-NEXT:    movl %ebp, %esp
    922 ; X32-NEXT:    popl %ebp
    923 ; X32-NEXT:    retl
    924 ;
    925 ; X64-LABEL: pcmpistri_mem_diff_i16:
    926 ; X64:       # %bb.0: # %entry
    927 ; X64-NEXT:    movdqu (%rdi), %xmm1
    928 ; X64-NEXT:    movdqu (%rsi), %xmm0
    929 ; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
    930 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
    931 ; X64-NEXT:    cmpl $8, %ecx
    932 ; X64-NEXT:    jne .LBB23_2
    933 ; X64-NEXT:  # %bb.1:
    934 ; X64-NEXT:    xorl %eax, %eax
    935 ; X64-NEXT:    movzwl %ax, %eax
    936 ; X64-NEXT:    retq
    937 ; X64-NEXT:  .LBB23_2: # %compare
    938 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
    939 ; X64-NEXT:    andl $7, %ecx
    940 ; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
    941 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
    942 ; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
    943 ; X64-NEXT:    movzwl %ax, %eax
    944 ; X64-NEXT:    retq
    945 entry:
          ; Reinterpret the scalar pointers as vector pointers; the loads only
          ; promise 1-byte alignment.
    946   %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
    947   %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
    948   %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
    949   %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
          ; The intrinsic is declared on <16 x i8>, hence the bitcasts.
    950   %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
    951   %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
    952   %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
          ; An index of 8 (the element count of <8 x i16>) skips the element
          ; comparison and yields 0 via the phi.
    953   %eq = icmp eq i32 %idx, 8
    954   br i1 %eq, label %exit, label %compare
    955 
    956 compare:
          ; %idx is a runtime value, so these are variable-index extracts taken
          ; from the loaded <8 x i16> values rather than the bitcast copies.
    957   %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
    958   %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
    959   %sub = sub i16 %lhs_c, %rhs_c
    960   br label %exit
    961 
    962 exit:
          ; 0 when arriving from %entry, the element difference from %compare.
    963   %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
    964   %result_ext = zext i16 %result to i32
    965   ret i32 %result_ext
    966 }
    967 
    968 define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
        ; Requests both the flag (pcmpestric128) and the index (pcmpestri128) for
        ; the same operands and immediate 24, storing the index through %iptr and
        ; the flag through %fptr. The check lines expect a single pcmpestri to
        ; serve both: %ecx is stored as the index and a setb result as the flag.
        ; The register written by setb is zeroed with xorl ahead of the pcmpestri,
        ; and the two explicit lengths are moved into %eax and %edx first.
    969 ; X32-LABEL: pcmpestr_index_flag:
    970 ; X32:       # %bb.0: # %entry
    971 ; X32-NEXT:    pushl %ebx
    972 ; X32-NEXT:    pushl %edi
    973 ; X32-NEXT:    pushl %esi
    974 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
    975 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
    976 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    977 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    978 ; X32-NEXT:    xorl %ebx, %ebx
    979 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
    980 ; X32-NEXT:    setb %bl
    981 ; X32-NEXT:    movl %ecx, (%edi)
    982 ; X32-NEXT:    movl %ebx, (%esi)
    983 ; X32-NEXT:    popl %esi
    984 ; X32-NEXT:    popl %edi
    985 ; X32-NEXT:    popl %ebx
    986 ; X32-NEXT:    retl
    987 ;
    988 ; X64-LABEL: pcmpestr_index_flag:
    989 ; X64:       # %bb.0: # %entry
    990 ; X64-NEXT:    movq %rcx, %r8
    991 ; X64-NEXT:    movq %rdx, %r9
    992 ; X64-NEXT:    xorl %r10d, %r10d
    993 ; X64-NEXT:    movl %edi, %eax
    994 ; X64-NEXT:    movl %esi, %edx
    995 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
    996 ; X64-NEXT:    setb %r10b
    997 ; X64-NEXT:    movl %ecx, (%r9)
    998 ; X64-NEXT:    movl %r10d, (%r8)
    999 ; X64-NEXT:    retq
   1000 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1001   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1002   %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1003   store i32 %index, i32* %iptr
   1004   store i32 %flag, i32* %fptr
   1005   ret void
   1006 }
   1007 
   1008 define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
        ; Requests the flag (pcmpestric128) and the mask (pcmpestrm128) for the
        ; same operands and immediate 24, storing the mask through %mptr and the
        ; flag through %fptr. The check lines expect a single pcmpestrm to serve
        ; both: the mask is stored from %xmm0 with movdqa and the flag comes from
        ; a setb into a register that was zeroed with xorl beforehand.
   1009 ; X32-LABEL: pcmpestr_mask_flag:
   1010 ; X32:       # %bb.0: # %entry
   1011 ; X32-NEXT:    pushl %ebx
   1012 ; X32-NEXT:    pushl %esi
   1013 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1014 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
   1015 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1016 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1017 ; X32-NEXT:    xorl %ebx, %ebx
   1018 ; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1019 ; X32-NEXT:    setb %bl
   1020 ; X32-NEXT:    movdqa %xmm0, (%esi)
   1021 ; X32-NEXT:    movl %ebx, (%ecx)
   1022 ; X32-NEXT:    popl %esi
   1023 ; X32-NEXT:    popl %ebx
   1024 ; X32-NEXT:    retl
   1025 ;
   1026 ; X64-LABEL: pcmpestr_mask_flag:
   1027 ; X64:       # %bb.0: # %entry
   1028 ; X64-NEXT:    movq %rdx, %r8
   1029 ; X64-NEXT:    xorl %r9d, %r9d
   1030 ; X64-NEXT:    movl %edi, %eax
   1031 ; X64-NEXT:    movl %esi, %edx
   1032 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1033 ; X64-NEXT:    setb %r9b
   1034 ; X64-NEXT:    movdqa %xmm0, (%r8)
   1035 ; X64-NEXT:    movl %r9d, (%rcx)
   1036 ; X64-NEXT:    retq
   1037 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1038   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1039   %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1040   store <16 x i8> %mask, <16 x i8>* %mptr
   1041   store i32 %flag, i32* %fptr
   1042   ret void
   1043 }
   1044 
   1045 define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
        ; Requests the index (pcmpestri128) and the mask (pcmpestrm128) for the
        ; same operands and immediate 24, storing the mask through %mptr and the
        ; index through %iptr. The check lines expect both a pcmpestrm and a
        ; pcmpestri here: %lhs is copied to %xmm2 beforehand and the pcmpestri
        ; runs on that copy, while the mask is stored from %xmm0 and the index
        ; from %ecx.
        ; NOTE(review): the copy is presumably needed because pcmpestrm leaves its
        ; result in %xmm0 - confirm against the instruction reference.
   1046 ; X32-LABEL: pcmpestr_mask_index:
   1047 ; X32:       # %bb.0: # %entry
   1048 ; X32-NEXT:    pushl %edi
   1049 ; X32-NEXT:    pushl %esi
   1050 ; X32-NEXT:    movdqa %xmm0, %xmm2
   1051 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1052 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1053 ; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1054 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
   1055 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
   1056 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2
   1057 ; X32-NEXT:    movdqa %xmm0, (%edi)
   1058 ; X32-NEXT:    movl %ecx, (%esi)
   1059 ; X32-NEXT:    popl %esi
   1060 ; X32-NEXT:    popl %edi
   1061 ; X32-NEXT:    retl
   1062 ;
   1063 ; X64-LABEL: pcmpestr_mask_index:
   1064 ; X64:       # %bb.0: # %entry
   1065 ; X64-NEXT:    movq %rcx, %r8
   1066 ; X64-NEXT:    movq %rdx, %r9
   1067 ; X64-NEXT:    movdqa %xmm0, %xmm2
   1068 ; X64-NEXT:    movl %edi, %eax
   1069 ; X64-NEXT:    movl %esi, %edx
   1070 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1071 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
   1072 ; X64-NEXT:    movdqa %xmm0, (%r9)
   1073 ; X64-NEXT:    movl %ecx, (%r8)
   1074 ; X64-NEXT:    retq
   1075 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1076   %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1077   %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1078   store <16 x i8> %mask, <16 x i8>* %mptr
   1079   store i32 %index, i32* %iptr
   1080   ret void
   1081 }
   1082 
   1083 define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
        ; Requests index, mask and flag for the same operands and immediate 24 and
        ; stores each through its own pointer argument. The check lines expect two
        ; string instructions for the three requests: pcmpestrm for the mask, then
        ; pcmpestri on a copy of %lhs (in %xmm2) for the index, with the setb that
        ; follows the pcmpestri supplying the flag.
   1084 ; X32-LABEL: pcmpestr_mask_index_flag:
   1085 ; X32:       # %bb.0: # %entry
   1086 ; X32-NEXT:    pushl %ebp
   1087 ; X32-NEXT:    pushl %ebx
   1088 ; X32-NEXT:    pushl %edi
   1089 ; X32-NEXT:    pushl %esi
   1090 ; X32-NEXT:    movdqa %xmm0, %xmm2
   1091 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1092 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1093 ; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1094 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
   1095 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
   1096 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
   1097 ; X32-NEXT:    xorl %ebx, %ebx
   1098 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2
   1099 ; X32-NEXT:    setb %bl
   1100 ; X32-NEXT:    movdqa %xmm0, (%ebp)
   1101 ; X32-NEXT:    movl %ecx, (%edi)
   1102 ; X32-NEXT:    movl %ebx, (%esi)
   1103 ; X32-NEXT:    popl %esi
   1104 ; X32-NEXT:    popl %edi
   1105 ; X32-NEXT:    popl %ebx
   1106 ; X32-NEXT:    popl %ebp
   1107 ; X32-NEXT:    retl
   1108 ;
   1109 ; X64-LABEL: pcmpestr_mask_index_flag:
   1110 ; X64:       # %bb.0: # %entry
   1111 ; X64-NEXT:    movq %rcx, %r9
   1112 ; X64-NEXT:    movq %rdx, %r10
   1113 ; X64-NEXT:    movdqa %xmm0, %xmm2
   1114 ; X64-NEXT:    movl %edi, %eax
   1115 ; X64-NEXT:    movl %esi, %edx
   1116 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
   1117 ; X64-NEXT:    xorl %esi, %esi
   1118 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
   1119 ; X64-NEXT:    setb %sil
   1120 ; X64-NEXT:    movdqa %xmm0, (%r10)
   1121 ; X64-NEXT:    movl %ecx, (%r9)
   1122 ; X64-NEXT:    movl %esi, (%r8)
   1123 ; X64-NEXT:    retq
   1124 entry:
          ; Three intrinsic calls with identical arguments; only the requested
          ; result differs.
   1125   %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1126   %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1127   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1128   store <16 x i8> %mask, <16 x i8>* %mptr
   1129   store i32 %index, i32* %iptr
   1130   store i32 %flag, i32* %fptr
   1131   ret void
   1132 }
   1133 
   1134 define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
        ; Requests both the flag (pcmpistric128) and the index (pcmpistri128) for
        ; the same operands and immediate 24, storing the index through %iptr and
        ; the flag through %fptr. The check lines expect a single pcmpistri to
        ; serve both: %ecx is stored as the index and a setb result as the flag,
        ; with the register written by setb zeroed via xorl beforehand.
   1135 ; X32-LABEL: pcmpistr_index_flag:
   1136 ; X32:       # %bb.0: # %entry
   1137 ; X32-NEXT:    pushl %ebx
   1138 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1139 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1140 ; X32-NEXT:    xorl %ebx, %ebx
   1141 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
   1142 ; X32-NEXT:    setb %bl
   1143 ; X32-NEXT:    movl %ecx, (%edx)
   1144 ; X32-NEXT:    movl %ebx, (%eax)
   1145 ; X32-NEXT:    popl %ebx
   1146 ; X32-NEXT:    retl
   1147 ;
   1148 ; X64-LABEL: pcmpistr_index_flag:
   1149 ; X64:       # %bb.0: # %entry
   1150 ; X64-NEXT:    xorl %eax, %eax
   1151 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
   1152 ; X64-NEXT:    setb %al
   1153 ; X64-NEXT:    movl %ecx, (%rdi)
   1154 ; X64-NEXT:    movl %eax, (%rsi)
   1155 ; X64-NEXT:    retq
   1156 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1157   %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1158   %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1159   store i32 %index, i32* %iptr
   1160   store i32 %flag, i32* %fptr
   1161   ret void
   1162 }
   1163 
   1164 define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
        ; Requests the flag (pcmpistric128) and the mask (pcmpistrm128) for the
        ; same operands and immediate 24, storing the mask through %mptr and the
        ; flag through %fptr. The check lines expect a single pcmpistrm to serve
        ; both: the mask is stored from %xmm0 with movdqa and the flag comes from
        ; a setb into a register that was zeroed with xorl beforehand.
   1165 ; X32-LABEL: pcmpistr_mask_flag:
   1166 ; X32:       # %bb.0: # %entry
   1167 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1168 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1169 ; X32-NEXT:    xorl %edx, %edx
   1170 ; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1171 ; X32-NEXT:    setb %dl
   1172 ; X32-NEXT:    movdqa %xmm0, (%ecx)
   1173 ; X32-NEXT:    movl %edx, (%eax)
   1174 ; X32-NEXT:    retl
   1175 ;
   1176 ; X64-LABEL: pcmpistr_mask_flag:
   1177 ; X64:       # %bb.0: # %entry
   1178 ; X64-NEXT:    xorl %eax, %eax
   1179 ; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1180 ; X64-NEXT:    setb %al
   1181 ; X64-NEXT:    movdqa %xmm0, (%rdi)
   1182 ; X64-NEXT:    movl %eax, (%rsi)
   1183 ; X64-NEXT:    retq
   1184 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1185   %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1186   %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1187   store <16 x i8> %mask, <16 x i8>* %mptr
   1188   store i32 %flag, i32* %fptr
   1189   ret void
   1190 }
   1191 
   1192 define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
        ; Requests the index (pcmpistri128) and the mask (pcmpistrm128) for the
        ; same operands and immediate 24, storing the mask through %mptr and the
        ; index through %iptr. The check lines expect both instructions to be
        ; emitted back to back on %xmm0, pcmpistri first and pcmpistrm second,
        ; with no register copy; the mask is stored from %xmm0 and the index from
        ; %ecx.
   1193 ; X32-LABEL: pcmpistr_mask_index:
   1194 ; X32:       # %bb.0: # %entry
   1195 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1196 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1197 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
   1198 ; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1199 ; X32-NEXT:    movdqa %xmm0, (%edx)
   1200 ; X32-NEXT:    movl %ecx, (%eax)
   1201 ; X32-NEXT:    retl
   1202 ;
   1203 ; X64-LABEL: pcmpistr_mask_index:
   1204 ; X64:       # %bb.0: # %entry
   1205 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
   1206 ; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1207 ; X64-NEXT:    movdqa %xmm0, (%rdi)
   1208 ; X64-NEXT:    movl %ecx, (%rsi)
   1209 ; X64-NEXT:    retq
   1210 entry:
          ; Two intrinsic calls with identical arguments; only the requested
          ; result differs.
   1211   %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1212   %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1213   store <16 x i8> %mask, <16 x i8>* %mptr
   1214   store i32 %index, i32* %iptr
   1215   ret void
   1216 }
   1217 
   1218 define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
        ; Requests index, mask and flag for the same operands and immediate 24 and
        ; stores each through its own pointer argument. The check lines expect two
        ; string instructions for the three requests: pcmpistrm for the mask, then
        ; pcmpistri on a copy of %lhs (in %xmm2) for the index, with the setb that
        ; follows the pcmpistri supplying the flag.
   1219 ; X32-LABEL: pcmpistr_mask_index_flag:
   1220 ; X32:       # %bb.0: # %entry
   1221 ; X32-NEXT:    pushl %ebx
   1222 ; X32-NEXT:    pushl %esi
   1223 ; X32-NEXT:    movdqa %xmm0, %xmm2
   1224 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1225 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1226 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
   1227 ; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1228 ; X32-NEXT:    xorl %ebx, %ebx
   1229 ; X32-NEXT:    pcmpistri $24, %xmm1, %xmm2
   1230 ; X32-NEXT:    setb %bl
   1231 ; X32-NEXT:    movdqa %xmm0, (%esi)
   1232 ; X32-NEXT:    movl %ecx, (%edx)
   1233 ; X32-NEXT:    movl %ebx, (%eax)
   1234 ; X32-NEXT:    popl %esi
   1235 ; X32-NEXT:    popl %ebx
   1236 ; X32-NEXT:    retl
   1237 ;
   1238 ; X64-LABEL: pcmpistr_mask_index_flag:
   1239 ; X64:       # %bb.0: # %entry
   1240 ; X64-NEXT:    movdqa %xmm0, %xmm2
   1241 ; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
   1242 ; X64-NEXT:    xorl %eax, %eax
   1243 ; X64-NEXT:    pcmpistri $24, %xmm1, %xmm2
   1244 ; X64-NEXT:    setb %al
   1245 ; X64-NEXT:    movdqa %xmm0, (%rdi)
   1246 ; X64-NEXT:    movl %ecx, (%rsi)
   1247 ; X64-NEXT:    movl %eax, (%rdx)
   1248 ; X64-NEXT:    retq
   1249 entry:
          ; Three intrinsic calls with identical arguments; only the requested
          ; result differs.
   1250   %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1251   %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1252   %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1253   store <16 x i8> %mask, <16 x i8>* %mptr
   1254   store i32 %index, i32* %iptr
   1255   store i32 %flag, i32* %fptr
   1256   ret void
   1257 }
   1258 
   1259 ; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
   1260 define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
        ; Requests index, mask and flag with immediate 24 where %rhs is loaded
        ; from %rhsptr (align 1) and shared by all three intrinsic calls. The
        ; check lines expect one movdqu into %xmm2 that is then used as a register
        ; operand by both pcmpistrm and pcmpistri, rather than a memory operand on
        ; either instruction; %lhs is copied to %xmm1 for the pcmpistri.
   1261 ; X32-LABEL: pcmpistr_mask_index_flag_load:
   1262 ; X32:       # %bb.0: # %entry
   1263 ; X32-NEXT:    pushl %ebx
   1264 ; X32-NEXT:    pushl %esi
   1265 ; X32-NEXT:    movdqa %xmm0, %xmm1
   1266 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1267 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1268 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
   1269 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1270 ; X32-NEXT:    movdqu (%ecx), %xmm2
   1271 ; X32-NEXT:    pcmpistrm $24, %xmm2, %xmm0
   1272 ; X32-NEXT:    xorl %ebx, %ebx
   1273 ; X32-NEXT:    pcmpistri $24, %xmm2, %xmm1
   1274 ; X32-NEXT:    setb %bl
   1275 ; X32-NEXT:    movdqa %xmm0, (%esi)
   1276 ; X32-NEXT:    movl %ecx, (%edx)
   1277 ; X32-NEXT:    movl %ebx, (%eax)
   1278 ; X32-NEXT:    popl %esi
   1279 ; X32-NEXT:    popl %ebx
   1280 ; X32-NEXT:    retl
   1281 ;
   1282 ; X64-LABEL: pcmpistr_mask_index_flag_load:
   1283 ; X64:       # %bb.0: # %entry
   1284 ; X64-NEXT:    movq %rcx, %rax
   1285 ; X64-NEXT:    movdqa %xmm0, %xmm1
   1286 ; X64-NEXT:    movdqu (%rdi), %xmm2
   1287 ; X64-NEXT:    pcmpistrm $24, %xmm2, %xmm0
   1288 ; X64-NEXT:    xorl %edi, %edi
   1289 ; X64-NEXT:    pcmpistri $24, %xmm2, %xmm1
   1290 ; X64-NEXT:    setb %dil
   1291 ; X64-NEXT:    movdqa %xmm0, (%rsi)
   1292 ; X64-NEXT:    movl %ecx, (%rdx)
   1293 ; X64-NEXT:    movl %edi, (%rax)
   1294 ; X64-NEXT:    retq
   1295 entry:
          ; A single load feeds all three intrinsic calls below.
   1296   %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
   1297   %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1298   %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1299   %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
   1300   store <16 x i8> %mask, <16 x i8>* %mptr
   1301   store i32 %index, i32* %iptr
   1302   store i32 %flag, i32* %fptr
   1303   ret void
   1304 }
   1305 
   1306 ; Make sure we don't fold nontemporal loads.
   1307 define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
        ; Loads %rhs from %rhsptr with align 16 and !nontemporal metadata, then
        ; returns the i32 flag produced by pcmpestric128 with immediate 24. The
        ; check lines expect the load to remain a separate movntdqa into %xmm1,
        ; followed by a register-form pcmpestri and a setb whose result is
        ; returned in %eax.
   1308 ; X32-LABEL: pcmpestri_nontemporal:
   1309 ; X32:       # %bb.0: # %entry
   1310 ; X32-NEXT:    pushl %ebx
   1311 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1312 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1313 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1314 ; X32-NEXT:    movntdqa (%ecx), %xmm1
   1315 ; X32-NEXT:    xorl %ebx, %ebx
   1316 ; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
   1317 ; X32-NEXT:    setb %bl
   1318 ; X32-NEXT:    movl %ebx, %eax
   1319 ; X32-NEXT:    popl %ebx
   1320 ; X32-NEXT:    retl
   1321 ;
   1322 ; X64-LABEL: pcmpestri_nontemporal:
   1323 ; X64:       # %bb.0: # %entry
   1324 ; X64-NEXT:    movntdqa (%rsi), %xmm1
   1325 ; X64-NEXT:    xorl %esi, %esi
   1326 ; X64-NEXT:    movl %edi, %eax
   1327 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
   1328 ; X64-NEXT:    setb %sil
   1329 ; X64-NEXT:    movl %esi, %eax
   1330 ; X64-NEXT:    retq
   1331 entry:
          ; !0 is the metadata node defined at the end of this file.
   1332   %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
   1333   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
   1334   ret i32 %flag
   1335 }
   1336 
   1337 !0 = !{ i32 1 } ; Metadata node referenced by the !nontemporal attachment on the load in @pcmpestri_nontemporal.
   1338