; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2   | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx      | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512

;
; 128-bit Vectors
;

     16 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
     17 ; X86-SSE2-LABEL: test_reduce_v2i64:
     18 ; X86-SSE2:       ## %bb.0:
     19 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     20 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
     21 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
     22 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
     23 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
     24 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
     25 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
     26 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
     27 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
     28 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
     29 ; X86-SSE2-NEXT:    pand %xmm5, %xmm2
     30 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
     31 ; X86-SSE2-NEXT:    por %xmm2, %xmm3
     32 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
     33 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
     34 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
     35 ; X86-SSE2-NEXT:    movd %xmm3, %eax
     36 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
     37 ; X86-SSE2-NEXT:    movd %xmm0, %edx
     38 ; X86-SSE2-NEXT:    retl
     39 ;
     40 ; X86-SSE42-LABEL: test_reduce_v2i64:
     41 ; X86-SSE42:       ## %bb.0:
     42 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
     43 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
     44 ; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
     45 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
     46 ; X86-SSE42-NEXT:    movd %xmm2, %eax
     47 ; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
     48 ; X86-SSE42-NEXT:    retl
     49 ;
     50 ; X86-AVX-LABEL: test_reduce_v2i64:
     51 ; X86-AVX:       ## %bb.0:
     52 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     53 ; X86-AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
     54 ; X86-AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
     55 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
     56 ; X86-AVX-NEXT:    vpextrd $1, %xmm0, %edx
     57 ; X86-AVX-NEXT:    retl
     58 ;
     59 ; X64-SSE2-LABEL: test_reduce_v2i64:
     60 ; X64-SSE2:       ## %bb.0:
     61 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     62 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
     63 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
     64 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
     65 ; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
     66 ; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
     67 ; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
     68 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
     69 ; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
     70 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
     71 ; X64-SSE2-NEXT:    pand %xmm5, %xmm2
     72 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
     73 ; X64-SSE2-NEXT:    por %xmm2, %xmm3
     74 ; X64-SSE2-NEXT:    pand %xmm3, %xmm0
     75 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
     76 ; X64-SSE2-NEXT:    por %xmm0, %xmm3
     77 ; X64-SSE2-NEXT:    movq %xmm3, %rax
     78 ; X64-SSE2-NEXT:    retq
     79 ;
     80 ; X64-SSE42-LABEL: test_reduce_v2i64:
     81 ; X64-SSE42:       ## %bb.0:
     82 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
     83 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
     84 ; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
     85 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
     86 ; X64-SSE42-NEXT:    movq %xmm2, %rax
     87 ; X64-SSE42-NEXT:    retq
     88 ;
     89 ; X64-AVX1-LABEL: test_reduce_v2i64:
     90 ; X64-AVX1:       ## %bb.0:
     91 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     92 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
     93 ; X64-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
     94 ; X64-AVX1-NEXT:    vmovq %xmm0, %rax
     95 ; X64-AVX1-NEXT:    retq
     96 ;
     97 ; X64-AVX2-LABEL: test_reduce_v2i64:
     98 ; X64-AVX2:       ## %bb.0:
     99 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    100 ; X64-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
    101 ; X64-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
    102 ; X64-AVX2-NEXT:    vmovq %xmm0, %rax
    103 ; X64-AVX2-NEXT:    retq
    104 ;
    105 ; X64-AVX512-LABEL: test_reduce_v2i64:
    106 ; X64-AVX512:       ## %bb.0:
    107 ; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    108 ; X64-AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
    109 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax
    110 ; X64-AVX512-NEXT:    retq
    111   %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    112   %2 = icmp sgt <2 x i64> %a0, %1
    113   %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
    114   %4 = extractelement <2 x i64> %3, i32 0
    115   ret i64 %4
    116 }
    118 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
    119 ; X86-SSE2-LABEL: test_reduce_v4i32:
    120 ; X86-SSE2:       ## %bb.0:
    121 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    122 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
    123 ; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
    124 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
    125 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
    126 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
    127 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    128 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
    129 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
    130 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
    131 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
    132 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
    133 ; X86-SSE2-NEXT:    movd %xmm1, %eax
    134 ; X86-SSE2-NEXT:    retl
    135 ;
    136 ; X86-SSE42-LABEL: test_reduce_v4i32:
    137 ; X86-SSE42:       ## %bb.0:
    138 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    139 ; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
    140 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    141 ; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
    142 ; X86-SSE42-NEXT:    movd %xmm0, %eax
    143 ; X86-SSE42-NEXT:    retl
    144 ;
    145 ; X86-AVX-LABEL: test_reduce_v4i32:
    146 ; X86-AVX:       ## %bb.0:
    147 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    148 ; X86-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
    149 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    150 ; X86-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
    151 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
    152 ; X86-AVX-NEXT:    retl
    153 ;
    154 ; X64-SSE2-LABEL: test_reduce_v4i32:
    155 ; X64-SSE2:       ## %bb.0:
    156 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    157 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
    158 ; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
    159 ; X64-SSE2-NEXT:    pand %xmm2, %xmm0
    160 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
    161 ; X64-SSE2-NEXT:    por %xmm0, %xmm2
    162 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    163 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
    164 ; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
    165 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
    166 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
    167 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
    168 ; X64-SSE2-NEXT:    movd %xmm1, %eax
    169 ; X64-SSE2-NEXT:    retq
    170 ;
    171 ; X64-SSE42-LABEL: test_reduce_v4i32:
    172 ; X64-SSE42:       ## %bb.0:
    173 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    174 ; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
    175 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    176 ; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
    177 ; X64-SSE42-NEXT:    movd %xmm0, %eax
    178 ; X64-SSE42-NEXT:    retq
    179 ;
    180 ; X64-AVX-LABEL: test_reduce_v4i32:
    181 ; X64-AVX:       ## %bb.0:
    182 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    183 ; X64-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
    184 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    185 ; X64-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
    186 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
    187 ; X64-AVX-NEXT:    retq
    188   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    189   %2 = icmp sgt <4 x i32> %a0, %1
    190   %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
    191   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    192   %5 = icmp sgt <4 x i32> %3, %4
    193   %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
    194   %7 = extractelement <4 x i32> %6, i32 0
    195   ret i32 %7
    196 }
    198 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
    199 ; X86-SSE2-LABEL: test_reduce_v8i16:
    200 ; X86-SSE2:       ## %bb.0:
    201 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    202 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
    203 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    204 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
    205 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
    206 ; X86-SSE2-NEXT:    psrld $16, %xmm1
    207 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
    208 ; X86-SSE2-NEXT:    movd %xmm1, %eax
    209 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
    210 ; X86-SSE2-NEXT:    retl
    211 ;
    212 ; X86-SSE42-LABEL: test_reduce_v8i16:
    213 ; X86-SSE42:       ## %bb.0:
    214 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
    215 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
    216 ; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
    217 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
    218 ; X86-SSE42-NEXT:    movd %xmm0, %eax
    219 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
    220 ; X86-SSE42-NEXT:    retl
    221 ;
    222 ; X86-AVX-LABEL: test_reduce_v8i16:
    223 ; X86-AVX:       ## %bb.0:
    224 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
    225 ; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    226 ; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
    227 ; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    228 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
    229 ; X86-AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
    230 ; X86-AVX-NEXT:    retl
    231 ;
    232 ; X64-SSE2-LABEL: test_reduce_v8i16:
    233 ; X64-SSE2:       ## %bb.0:
    234 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    235 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
    236 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    237 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
    238 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
    239 ; X64-SSE2-NEXT:    psrld $16, %xmm1
    240 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
    241 ; X64-SSE2-NEXT:    movd %xmm1, %eax
    242 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
    243 ; X64-SSE2-NEXT:    retq
    244 ;
    245 ; X64-SSE42-LABEL: test_reduce_v8i16:
    246 ; X64-SSE42:       ## %bb.0:
    247 ; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
    248 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
    249 ; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
    250 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
    251 ; X64-SSE42-NEXT:    movd %xmm0, %eax
    252 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
    253 ; X64-SSE42-NEXT:    retq
    254 ;
    255 ; X64-AVX-LABEL: test_reduce_v8i16:
    256 ; X64-AVX:       ## %bb.0:
    257 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
    258 ; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    259 ; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
    260 ; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    261 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
    262 ; X64-AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
    263 ; X64-AVX-NEXT:    retq
    264   %1  = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    265   %2  = icmp sgt <8 x i16> %a0, %1
    266   %3  = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
    267   %4  = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    268   %5  = icmp sgt <8 x i16> %3, %4
    269   %6  = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
    270   %7  = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    271   %8  = icmp sgt <8 x i16> %6, %7
    272   %9  = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
    273   %10 = extractelement <8 x i16> %9, i32 0
    274   ret i16 %10
    275 }
    277 define i8 @test_reduce_v16i8(<16 x i8> %a0) {
    278 ; X86-SSE2-LABEL: test_reduce_v16i8:
    279 ; X86-SSE2:       ## %bb.0:
    280 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    281 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
    282 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
    283 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
    284 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
    285 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
    286 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    287 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
    288 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
    289 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
    290 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
    291 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
    292 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
    293 ; X86-SSE2-NEXT:    psrld $16, %xmm0
    294 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
    295 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
    296 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
    297 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
    298 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
    299 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
    300 ; X86-SSE2-NEXT:    psrlw $8, %xmm0
    301 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
    302 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
    303 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
    304 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
    305 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
    306 ; X86-SSE2-NEXT:    movd %xmm1, %eax
    307 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
    308 ; X86-SSE2-NEXT:    retl
    309 ;
    310 ; X86-SSE42-LABEL: test_reduce_v16i8:
    311 ; X86-SSE42:       ## %bb.0:
    312 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    313 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
    314 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
    315 ; X86-SSE42-NEXT:    psrlw $8, %xmm2
    316 ; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
    317 ; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
    318 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
    319 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
    320 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
    321 ; X86-SSE42-NEXT:    retl
    322 ;
    323 ; X86-AVX-LABEL: test_reduce_v16i8:
    324 ; X86-AVX:       ## %bb.0:
    325 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    326 ; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    327 ; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
    328 ; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
    329 ; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
    330 ; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    331 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
    332 ; X86-AVX-NEXT:    ## kill: def $al killed $al killed $eax
    333 ; X86-AVX-NEXT:    retl
    334 ;
    335 ; X64-SSE2-LABEL: test_reduce_v16i8:
    336 ; X64-SSE2:       ## %bb.0:
    337 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    338 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
    339 ; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
    340 ; X64-SSE2-NEXT:    pand %xmm2, %xmm0
    341 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
    342 ; X64-SSE2-NEXT:    por %xmm0, %xmm2
    343 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    344 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
    345 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
    346 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
    347 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
    348 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
    349 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
    350 ; X64-SSE2-NEXT:    psrld $16, %xmm0
    351 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
    352 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
    353 ; X64-SSE2-NEXT:    pand %xmm2, %xmm1
    354 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
    355 ; X64-SSE2-NEXT:    por %xmm1, %xmm2
    356 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
    357 ; X64-SSE2-NEXT:    psrlw $8, %xmm0
    358 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
    359 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
    360 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
    361 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
    362 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
    363 ; X64-SSE2-NEXT:    movd %xmm1, %eax
    364 ; X64-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
    365 ; X64-SSE2-NEXT:    retq
    366 ;
    367 ; X64-SSE42-LABEL: test_reduce_v16i8:
    368 ; X64-SSE42:       ## %bb.0:
    369 ; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    370 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
    371 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
    372 ; X64-SSE42-NEXT:    psrlw $8, %xmm2
    373 ; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
    374 ; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
    375 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
    376 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
    377 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
    378 ; X64-SSE42-NEXT:    retq
    379 ;
    380 ; X64-AVX-LABEL: test_reduce_v16i8:
    381 ; X64-AVX:       ## %bb.0:
    382 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    383 ; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    384 ; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
    385 ; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
    386 ; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
    387 ; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    388 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
    389 ; X64-AVX-NEXT:    ## kill: def $al killed $al killed $eax
    390 ; X64-AVX-NEXT:    retq
    391   %1  = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    392   %2  = icmp sgt <16 x i8> %a0, %1
    393   %3  = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
    394   %4  = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    395   %5  = icmp sgt <16 x i8> %3, %4
    396   %6  = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
    397   %7  = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    398   %8  = icmp sgt <16 x i8> %6, %7
    399   %9  = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
    400   %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    401   %11 = icmp sgt <16 x i8> %9, %10
    402   %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
    403   %13 = extractelement <16 x i8> %12, i32 0
    404   ret i8 %13
    405 }

;
; 256-bit Vectors
;

    411 define i64 @test_reduce_v4i64(<4 x i64> %a0) {
    412 ; X86-SSE2-LABEL: test_reduce_v4i64:
    413 ; X86-SSE2:       ## %bb.0:
    414 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
    415 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
    416 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
    417 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
    418 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
    419 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
    420 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
    421 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
    422 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
    423 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
    424 ; X86-SSE2-NEXT:    pand %xmm6, %xmm3
    425 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
    426 ; X86-SSE2-NEXT:    por %xmm3, %xmm4
    427 ; X86-SSE2-NEXT:    pand %xmm4, %xmm0
    428 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
    429 ; X86-SSE2-NEXT:    por %xmm0, %xmm4
    430 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
    431 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
    432 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
    433 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
    434 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
    435 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
    436 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
    437 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
    438 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
    439 ; X86-SSE2-NEXT:    pand %xmm5, %xmm1
    440 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
    441 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
    442 ; X86-SSE2-NEXT:    pand %xmm2, %xmm4
    443 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
    444 ; X86-SSE2-NEXT:    por %xmm4, %xmm2
    445 ; X86-SSE2-NEXT:    movd %xmm2, %eax
    446 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    447 ; X86-SSE2-NEXT:    movd %xmm0, %edx
    448 ; X86-SSE2-NEXT:    retl
    449 ;
    450 ; X86-SSE42-LABEL: test_reduce_v4i64:
    451 ; X86-SSE42:       ## %bb.0:
    452 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
    453 ; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
    454 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
    455 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    456 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
    457 ; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
    458 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
    459 ; X86-SSE42-NEXT:    movd %xmm2, %eax
    460 ; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
    461 ; X86-SSE42-NEXT:    retl
    462 ;
    463 ; X86-AVX1-LABEL: test_reduce_v4i64:
    464 ; X86-AVX1:       ## %bb.0:
    465 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    466 ; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
    467 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
    468 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
    469 ; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    470 ; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
    471 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
    472 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
    473 ; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
    474 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    475 ; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    476 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
    477 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
    478 ; X86-AVX1-NEXT:    vzeroupper
    479 ; X86-AVX1-NEXT:    retl
    480 ;
    481 ; X86-AVX2-LABEL: test_reduce_v4i64:
    482 ; X86-AVX2:       ## %bb.0:
    483 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    484 ; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
    485 ; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    486 ; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
    487 ; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
    488 ; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    489 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
    490 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
    491 ; X86-AVX2-NEXT:    vzeroupper
    492 ; X86-AVX2-NEXT:    retl
    493 ;
    494 ; X64-SSE2-LABEL: test_reduce_v4i64:
    495 ; X64-SSE2:       ## %bb.0:
    496 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
    497 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
    498 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
    499 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
    500 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
    501 ; X64-SSE2-NEXT:    movdqa %xmm4, %xmm5
    502 ; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
    503 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
    504 ; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
    505 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
    506 ; X64-SSE2-NEXT:    pand %xmm6, %xmm3
    507 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
    508 ; X64-SSE2-NEXT:    por %xmm3, %xmm4
    509 ; X64-SSE2-NEXT:    pand %xmm4, %xmm0
    510 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
    511 ; X64-SSE2-NEXT:    por %xmm0, %xmm4
    512 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
    513 ; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
    514 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
    515 ; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
    516 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
    517 ; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
    518 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
    519 ; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
    520 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
    521 ; X64-SSE2-NEXT:    pand %xmm5, %xmm1
    522 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
    523 ; X64-SSE2-NEXT:    por %xmm1, %xmm2
    524 ; X64-SSE2-NEXT:    pand %xmm2, %xmm4
    525 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
    526 ; X64-SSE2-NEXT:    por %xmm4, %xmm2
    527 ; X64-SSE2-NEXT:    movq %xmm2, %rax
    528 ; X64-SSE2-NEXT:    retq
    529 ;
    530 ; X64-SSE42-LABEL: test_reduce_v4i64:
    531 ; X64-SSE42:       ## %bb.0:
    532 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
    533 ; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
    534 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
    535 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    536 ; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
    537 ; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
    538 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
    539 ; X64-SSE42-NEXT:    movq %xmm2, %rax
    540 ; X64-SSE42-NEXT:    retq
    541 ;
    542 ; X64-AVX1-LABEL: test_reduce_v4i64:
    543 ; X64-AVX1:       ## %bb.0:
    544 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    545 ; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
    546 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
    547 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
    548 ; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    549 ; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
    550 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
    551 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
    552 ; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
    553 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    554 ; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    555 ; X64-AVX1-NEXT:    vmovq %xmm0, %rax
    556 ; X64-AVX1-NEXT:    vzeroupper
    557 ; X64-AVX1-NEXT:    retq
    558 ;
    559 ; X64-AVX2-LABEL: test_reduce_v4i64:
    560 ; X64-AVX2:       ## %bb.0:
    561 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    562 ; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
    563 ; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    564 ; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
    565 ; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
    566 ; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
    567 ; X64-AVX2-NEXT:    vmovq %xmm0, %rax
    568 ; X64-AVX2-NEXT:    vzeroupper
    569 ; X64-AVX2-NEXT:    retq
    570 ;
    571 ; X64-AVX512-LABEL: test_reduce_v4i64:
    572 ; X64-AVX512:       ## %bb.0:
    573 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
    574 ; X64-AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
    575 ; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    576 ; X64-AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
    577 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax
    578 ; X64-AVX512-NEXT:    vzeroupper
    579 ; X64-AVX512-NEXT:    retq
    580   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    581   %2 = icmp sgt <4 x i64> %a0, %1
    582   %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
    583   %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    584   %5 = icmp sgt <4 x i64> %3, %4
    585   %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
    586   %7 = extractelement <4 x i64> %6, i32 0
    587   ret i64 %7
    588 }
; Signed-max reduction of <8 x i32> to i32, expressed as an explicit
; shuffle / icmp-sgt / select halving ladder (8 -> 4 -> 2 -> 1 lanes).
; SSE2 has no pmaxsd, so each max lowers to pcmpgtd + pand/pandn/por;
; SSE4.2 uses pmaxsd directly, and AVX first folds the upper 128-bit lane
; into the lower one with vextract{f,i}128 before the in-lane shuffles.
; NOTE: the CHECK lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file) -- regenerate, don't hand-edit.
define i32 @test_reduce_v8i32(<8 x i32> %a0) {
; X86-SSE2-LABEL: test_reduce_v8i32:
; X86-SSE2:       ## %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm0
; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
; X86-SSE2-NEXT:    por %xmm0, %xmm2
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; X86-SSE2-NEXT:    pand %xmm1, %xmm2
; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
; X86-SSE2-NEXT:    por %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
; X86-SSE2-NEXT:    por %xmm1, %xmm2
; X86-SSE2-NEXT:    movd %xmm2, %eax
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: test_reduce_v8i32:
; X86-SSE42:       ## %bb.0:
; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; X86-SSE42-NEXT:    movd %xmm0, %eax
; X86-SSE42-NEXT:    retl
;
; X86-AVX1-LABEL: test_reduce_v8i32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovd %xmm0, %eax
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: test_reduce_v8i32:
; X86-AVX2:       ## %bb.0:
; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vmovd %xmm0, %eax
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE2-LABEL: test_reduce_v8i32:
; X64-SSE2:       ## %bb.0:
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; X64-SSE2-NEXT:    pand %xmm2, %xmm0
; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
; X64-SSE2-NEXT:    por %xmm0, %xmm2
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
; X64-SSE2-NEXT:    pand %xmm1, %xmm2
; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
; X64-SSE2-NEXT:    por %xmm2, %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
; X64-SSE2-NEXT:    pand %xmm2, %xmm1
; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
; X64-SSE2-NEXT:    por %xmm1, %xmm2
; X64-SSE2-NEXT:    movd %xmm2, %eax
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: test_reduce_v8i32:
; X64-SSE42:       ## %bb.0:
; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; X64-SSE42-NEXT:    movd %xmm0, %eax
; X64-SSE42-NEXT:    retq
;
; X64-AVX1-LABEL: test_reduce_v8i32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovd %xmm0, %eax
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_reduce_v8i32:
; X64-AVX2:       ## %bb.0:
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vmovd %xmm0, %eax
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: test_reduce_v8i32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT:    vmovd %xmm0, %eax
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
; IR input: each step shuffles the upper half down, compares sgt, and selects
; the lane-wise max; lane 0 of the final select holds the overall maximum.
  %1  = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %2  = icmp sgt <8 x i32> %a0, %1
  %3  = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
  %4  = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5  = icmp sgt <8 x i32> %3, %4
  %6  = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
  %7  = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8  = icmp sgt <8 x i32> %6, %7
  %9  = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}
    726 
; Signed-max reduction of <16 x i16> to i16 via the same halving ladder
; (16 -> 8 -> 4 -> 2 -> 1 lanes).
; SSE2 has pmaxsw, so the shuffle+max steps lower directly; SSE4.2/AVX
; instead XOR every lane with 0x7FFF to map the signed-max problem onto the
; unsigned-min instruction phminposuw, then XOR the result back.
; NOTE: the CHECK lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file) -- regenerate, don't hand-edit.
define i16 @test_reduce_v16i16(<16 x i16> %a0) {
; X86-SSE2-LABEL: test_reduce_v16i16:
; X86-SSE2:       ## %bb.0:
; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $16, %xmm1
; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
; X86-SSE2-NEXT:    movd %xmm1, %eax
; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: test_reduce_v16i16:
; X86-SSE42:       ## %bb.0:
; X86-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
; X86-SSE42-NEXT:    movd %xmm0, %eax
; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
; X86-SSE42-NEXT:    retl
;
; X86-AVX1-LABEL: test_reduce_v16i16:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovd %xmm0, %eax
; X86-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2:       ## %bb.0:
; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vmovd %xmm0, %eax
; X86-AVX2-NEXT:    ## kill: def $ax killed $ax killed $eax
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE2-LABEL: test_reduce_v16i16:
; X64-SSE2:       ## %bb.0:
; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE2-NEXT:    psrld $16, %xmm1
; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
; X64-SSE2-NEXT:    movd %xmm1, %eax
; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: test_reduce_v16i16:
; X64-SSE42:       ## %bb.0:
; X64-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
; X64-SSE42-NEXT:    movd %xmm0, %eax
; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-SSE42-NEXT:    retq
;
; X64-AVX1-LABEL: test_reduce_v16i16:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovd %xmm0, %eax
; X64-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2:       ## %bb.0:
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vmovd %xmm0, %eax
; X64-AVX2-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vmovd %xmm0, %eax
; X64-AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
; IR input: each step shuffles the upper half down, compares sgt, and selects
; the lane-wise max; lane 0 of the final select holds the overall maximum.
  %1  = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2  = icmp sgt <16 x i16> %a0, %1
  %3  = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
  %4  = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5  = icmp sgt <16 x i16> %3, %4
  %6  = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
  %7  = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8  = icmp sgt <16 x i16> %6, %7
  %9  = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <16 x i16> %9, %10
  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
  %13 = extractelement <16 x i16> %12, i32 0
  ret i16 %13
}
    857 
; Signed-max reduction of <32 x i8> to i8 via the halving ladder
; (32 -> 16 -> 8 -> 4 -> 2 -> 1 lanes).
; SSE2 has no pmaxsb, so every max step lowers to pcmpgtb + pand/pandn/por.
; SSE4.2/AVX use pmaxsb for the 16-byte fold, then XOR with 0x7F to move
; into the unsigned domain, fold odd/even bytes with psrlw $8 + pminub,
; and finish with the word-wise phminposuw before XORing back.
; NOTE: the CHECK lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file) -- regenerate, don't hand-edit.
define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE2-LABEL: test_reduce_v32i8:
; X86-SSE2:       ## %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm0
; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
; X86-SSE2-NEXT:    por %xmm0, %xmm2
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT:    pand %xmm1, %xmm2
; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
; X86-SSE2-NEXT:    por %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
; X86-SSE2-NEXT:    por %xmm1, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
; X86-SSE2-NEXT:    psrld $16, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
; X86-SSE2-NEXT:    pand %xmm1, %xmm2
; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
; X86-SSE2-NEXT:    por %xmm2, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE2-NEXT:    psrlw $8, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
; X86-SSE2-NEXT:    por %xmm1, %xmm2
; X86-SSE2-NEXT:    movd %xmm2, %eax
; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42:       ## %bb.0:
; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE42-NEXT:    psrlw $8, %xmm2
; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
; X86-SSE42-NEXT:    retl
;
; X86-AVX1-LABEL: test_reduce_v32i8:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT:    ## kill: def $al killed $al killed $eax
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2:       ## %bb.0:
; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT:    ## kill: def $al killed $al killed $eax
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE2-LABEL: test_reduce_v32i8:
; X64-SSE2:       ## %bb.0:
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; X64-SSE2-NEXT:    pand %xmm2, %xmm0
; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
; X64-SSE2-NEXT:    por %xmm0, %xmm2
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT:    pand %xmm1, %xmm2
; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
; X64-SSE2-NEXT:    por %xmm2, %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT:    pand %xmm2, %xmm1
; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
; X64-SSE2-NEXT:    por %xmm1, %xmm2
; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
; X64-SSE2-NEXT:    psrld $16, %xmm0
; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
; X64-SSE2-NEXT:    pand %xmm1, %xmm2
; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
; X64-SSE2-NEXT:    por %xmm2, %xmm1
; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE2-NEXT:    psrlw $8, %xmm0
; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; X64-SSE2-NEXT:    pand %xmm2, %xmm1
; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
; X64-SSE2-NEXT:    por %xmm1, %xmm2
; X64-SSE2-NEXT:    movd %xmm2, %eax
; X64-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42:       ## %bb.0:
; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE42-NEXT:    psrlw $8, %xmm2
; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
; X64-SSE42-NEXT:    retq
;
; X64-AVX1-LABEL: test_reduce_v32i8:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT:    ## kill: def $al killed $al killed $eax
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2:       ## %bb.0:
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT:    ## kill: def $al killed $al killed $eax
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT:    ## kill: def $al killed $al killed $eax
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
; IR input: each step shuffles the upper half down, compares sgt, and selects
; the lane-wise max; lane 0 of the final select holds the overall maximum.
  %1  = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2  = icmp sgt <32 x i8> %a0, %1
  %3  = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
  %4  = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5  = icmp sgt <32 x i8> %3, %4
  %6  = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
  %7  = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %8  = icmp sgt <32 x i8> %6, %7
  %9  = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %11 = icmp sgt <32 x i8> %9, %10
  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %14 = icmp sgt <32 x i8> %12, %13
  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
  %16 = extractelement <32 x i8> %15, i32 0
  ret i8 %16
}
   1053 
   1054 ;
   1055 ; 512-bit Vectors
   1056 ;
   1057 
   1058 define i64 @test_reduce_v8i64(<8 x i64> %a0) {
   1059 ; X86-SSE2-LABEL: test_reduce_v8i64:
   1060 ; X86-SSE2:       ## %bb.0:
   1061 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
   1062 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
   1063 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
   1064 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
   1065 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
   1066 ; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
   1067 ; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
   1068 ; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
   1069 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
   1070 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1071 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
   1072 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
   1073 ; X86-SSE2-NEXT:    por %xmm6, %xmm5
   1074 ; X86-SSE2-NEXT:    pand %xmm5, %xmm0
   1075 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
   1076 ; X86-SSE2-NEXT:    por %xmm0, %xmm5
   1077 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
   1078 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
   1079 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1080 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
   1081 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
   1082 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
   1083 ; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
   1084 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
   1085 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
   1086 ; X86-SSE2-NEXT:    pand %xmm0, %xmm2
   1087 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
   1088 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
   1089 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
   1090 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
   1091 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
   1092 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1093 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
   1094 ; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
   1095 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
   1096 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
   1097 ; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
   1098 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
   1099 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
   1100 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
   1101 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
   1102 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
   1103 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
   1104 ; X86-SSE2-NEXT:    pand %xmm1, %xmm5
   1105 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
   1106 ; X86-SSE2-NEXT:    por %xmm5, %xmm1
   1107 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1108 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1109 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
   1110 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
   1111 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
   1112 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
   1113 ; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
   1114 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
   1115 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   1116 ; X86-SSE2-NEXT:    pand %xmm2, %xmm4
   1117 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
   1118 ; X86-SSE2-NEXT:    por %xmm4, %xmm2
   1119 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
   1120 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
   1121 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
   1122 ; X86-SSE2-NEXT:    movd %xmm2, %eax
   1123 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1124 ; X86-SSE2-NEXT:    movd %xmm0, %edx
   1125 ; X86-SSE2-NEXT:    retl
   1126 ;
   1127 ; X86-SSE42-LABEL: test_reduce_v8i64:
   1128 ; X86-SSE42:       ## %bb.0:
   1129 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
   1130 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
   1131 ; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
   1132 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
   1133 ; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
   1134 ; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
   1135 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
   1136 ; X86-SSE42-NEXT:    movapd %xmm2, %xmm0
   1137 ; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
   1138 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
   1139 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
   1140 ; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
   1141 ; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
   1142 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
   1143 ; X86-SSE42-NEXT:    movd %xmm1, %eax
   1144 ; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
   1145 ; X86-SSE42-NEXT:    retl
   1146 ;
   1147 ; X86-AVX1-LABEL: test_reduce_v8i64:
   1148 ; X86-AVX1:       ## %bb.0:
   1149 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1150 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1151 ; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
   1152 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
   1153 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
   1154 ; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1155 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1156 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
   1157 ; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
   1158 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1159 ; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1160 ; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1161 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
   1162 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1163 ; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
   1164 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1165 ; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1166 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
   1167 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
   1168 ; X86-AVX1-NEXT:    vzeroupper
   1169 ; X86-AVX1-NEXT:    retl
   1170 ;
   1171 ; X86-AVX2-LABEL: test_reduce_v8i64:
   1172 ; X86-AVX2:       ## %bb.0:
   1173 ; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1174 ; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1175 ; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1176 ; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1177 ; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1178 ; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1179 ; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1180 ; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1181 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
   1182 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
   1183 ; X86-AVX2-NEXT:    vzeroupper
   1184 ; X86-AVX2-NEXT:    retl
   1185 ;
   1186 ; X64-SSE2-LABEL: test_reduce_v8i64:
   1187 ; X64-SSE2:       ## %bb.0:
   1188 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
   1189 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
   1190 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
   1191 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm6
   1192 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
   1193 ; X64-SSE2-NEXT:    movdqa %xmm6, %xmm7
   1194 ; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
   1195 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
   1196 ; X64-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
   1197 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1198 ; X64-SSE2-NEXT:    pand %xmm8, %xmm6
   1199 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
   1200 ; X64-SSE2-NEXT:    por %xmm6, %xmm5
   1201 ; X64-SSE2-NEXT:    pand %xmm5, %xmm0
   1202 ; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
   1203 ; X64-SSE2-NEXT:    por %xmm0, %xmm5
   1204 ; X64-SSE2-NEXT:    movdqa %xmm3, %xmm0
   1205 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
   1206 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1207 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
   1208 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm6
   1209 ; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
   1210 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
   1211 ; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
   1212 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
   1213 ; X64-SSE2-NEXT:    pand %xmm7, %xmm0
   1214 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
   1215 ; X64-SSE2-NEXT:    por %xmm0, %xmm2
   1216 ; X64-SSE2-NEXT:    pand %xmm2, %xmm1
   1217 ; X64-SSE2-NEXT:    pandn %xmm3, %xmm2
   1218 ; X64-SSE2-NEXT:    por %xmm1, %xmm2
   1219 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
   1220 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
   1221 ; X64-SSE2-NEXT:    movdqa %xmm5, %xmm1
   1222 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
   1223 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
   1224 ; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
   1225 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
   1226 ; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
   1227 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
   1228 ; X64-SSE2-NEXT:    pand %xmm6, %xmm0
   1229 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
   1230 ; X64-SSE2-NEXT:    por %xmm0, %xmm1
   1231 ; X64-SSE2-NEXT:    pand %xmm1, %xmm5
   1232 ; X64-SSE2-NEXT:    pandn %xmm2, %xmm1
   1233 ; X64-SSE2-NEXT:    por %xmm5, %xmm1
   1234 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1235 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1236 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
   1237 ; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
   1238 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
   1239 ; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
   1240 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
   1241 ; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
   1242 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
   1243 ; X64-SSE2-NEXT:    pand %xmm5, %xmm2
   1244 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
   1245 ; X64-SSE2-NEXT:    por %xmm2, %xmm3
   1246 ; X64-SSE2-NEXT:    pand %xmm3, %xmm1
   1247 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
   1248 ; X64-SSE2-NEXT:    por %xmm1, %xmm3
   1249 ; X64-SSE2-NEXT:    movq %xmm3, %rax
   1250 ; X64-SSE2-NEXT:    retq
   1251 ;
   1252 ; X64-SSE42-LABEL: test_reduce_v8i64:
   1253 ; X64-SSE42:       ## %bb.0:
   1254 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm4
   1255 ; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
   1256 ; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
   1257 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
   1258 ; X64-SSE42-NEXT:    movdqa %xmm4, %xmm0
   1259 ; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
   1260 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
   1261 ; X64-SSE42-NEXT:    movapd %xmm2, %xmm0
   1262 ; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
   1263 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
   1264 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
   1265 ; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
   1266 ; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
   1267 ; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
   1268 ; X64-SSE42-NEXT:    movq %xmm1, %rax
   1269 ; X64-SSE42-NEXT:    retq
   1270 ;
   1271 ; X64-AVX1-LABEL: test_reduce_v8i64:
   1272 ; X64-AVX1:       ## %bb.0:
   1273 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1274 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1275 ; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
   1276 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
   1277 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
   1278 ; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1279 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1280 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
   1281 ; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
   1282 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1283 ; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1284 ; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1285 ; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
   1286 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1287 ; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
   1288 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1289 ; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1290 ; X64-AVX1-NEXT:    vmovq %xmm0, %rax
   1291 ; X64-AVX1-NEXT:    vzeroupper
   1292 ; X64-AVX1-NEXT:    retq
   1293 ;
   1294 ; X64-AVX2-LABEL: test_reduce_v8i64:
   1295 ; X64-AVX2:       ## %bb.0:
   1296 ; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1297 ; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1298 ; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1299 ; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1300 ; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1301 ; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1302 ; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
   1303 ; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
   1304 ; X64-AVX2-NEXT:    vmovq %xmm0, %rax
   1305 ; X64-AVX2-NEXT:    vzeroupper
   1306 ; X64-AVX2-NEXT:    retq
   1307 ;
   1308 ; X64-AVX512-LABEL: test_reduce_v8i64:
   1309 ; X64-AVX512:       ## %bb.0:
   1310 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1311 ; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   1312 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1313 ; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   1314 ; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1315 ; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   1316 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax
   1317 ; X64-AVX512-NEXT:    vzeroupper
   1318 ; X64-AVX512-NEXT:    retq
   1319   %1  = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   1320   %2  = icmp sgt <8 x i64> %a0, %1
   1321   %3  = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
   1322   %4  = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1323   %5  = icmp sgt <8 x i64> %3, %4
   1324   %6  = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
   1325   %7  = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1326   %8  = icmp sgt <8 x i64> %6, %7
   1327   %9  = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
   1328   %10 = extractelement <8 x i64> %9, i32 0
   1329   ret i64 %10
   1330 }
   1331 
; Horizontal signed-max reduction: folds the 16 x i32 lanes of %a0 down to a
; single i32 via a log2 tree of shufflevector + icmp sgt + select, halving the
; live lanes each step (8, 4, 2, 1).  The per-target CHECK lines inside the
; function were autogenerated by utils/update_llc_test_checks.py (see the RUN
; lines at the top of the file) -- regenerate them with that script instead of
; editing them by hand.
   1332 define i32 @test_reduce_v16i32(<16 x i32> %a0) {
   1333 ; X86-SSE2-LABEL: test_reduce_v16i32:
   1334 ; X86-SSE2:       ## %bb.0:
   1335 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
   1336 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
   1337 ; X86-SSE2-NEXT:    pand %xmm4, %xmm1
   1338 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
   1339 ; X86-SSE2-NEXT:    por %xmm1, %xmm4
   1340 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1341 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
   1342 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
   1343 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
   1344 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
   1345 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1346 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
   1347 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
   1348 ; X86-SSE2-NEXT:    pandn %xmm4, %xmm0
   1349 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
   1350 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1351 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
   1352 ; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
   1353 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
   1354 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
   1355 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
   1356 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1357 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1358 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
   1359 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
   1360 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
   1361 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
   1362 ; X86-SSE2-NEXT:    movd %xmm1, %eax
   1363 ; X86-SSE2-NEXT:    retl
   1364 ;
   1365 ; X86-SSE42-LABEL: test_reduce_v16i32:
   1366 ; X86-SSE42:       ## %bb.0:
   1367 ; X86-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
   1368 ; X86-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
   1369 ; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
   1370 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1371 ; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
   1372 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1373 ; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
   1374 ; X86-SSE42-NEXT:    movd %xmm0, %eax
   1375 ; X86-SSE42-NEXT:    retl
   1376 ;
   1377 ; X86-AVX1-LABEL: test_reduce_v16i32:
   1378 ; X86-AVX1:       ## %bb.0:
   1379 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1380 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1381 ; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
   1382 ; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1383 ; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
   1384 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1385 ; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1386 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1387 ; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1388 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
   1389 ; X86-AVX1-NEXT:    vzeroupper
   1390 ; X86-AVX1-NEXT:    retl
   1391 ;
   1392 ; X86-AVX2-LABEL: test_reduce_v16i32:
   1393 ; X86-AVX2:       ## %bb.0:
   1394 ; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1395 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1396 ; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1397 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1398 ; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1399 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1400 ; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1401 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
   1402 ; X86-AVX2-NEXT:    vzeroupper
   1403 ; X86-AVX2-NEXT:    retl
   1404 ;
   1405 ; X64-SSE2-LABEL: test_reduce_v16i32:
   1406 ; X64-SSE2:       ## %bb.0:
   1407 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
   1408 ; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
   1409 ; X64-SSE2-NEXT:    pand %xmm4, %xmm1
   1410 ; X64-SSE2-NEXT:    pandn %xmm3, %xmm4
   1411 ; X64-SSE2-NEXT:    por %xmm1, %xmm4
   1412 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1413 ; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
   1414 ; X64-SSE2-NEXT:    pand %xmm1, %xmm0
   1415 ; X64-SSE2-NEXT:    pandn %xmm2, %xmm1
   1416 ; X64-SSE2-NEXT:    por %xmm0, %xmm1
   1417 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1418 ; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
   1419 ; X64-SSE2-NEXT:    pand %xmm0, %xmm1
   1420 ; X64-SSE2-NEXT:    pandn %xmm4, %xmm0
   1421 ; X64-SSE2-NEXT:    por %xmm1, %xmm0
   1422 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1423 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
   1424 ; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
   1425 ; X64-SSE2-NEXT:    pand %xmm2, %xmm0
   1426 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
   1427 ; X64-SSE2-NEXT:    por %xmm0, %xmm2
   1428 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1429 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1430 ; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
   1431 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
   1432 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
   1433 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
   1434 ; X64-SSE2-NEXT:    movd %xmm1, %eax
   1435 ; X64-SSE2-NEXT:    retq
   1436 ;
   1437 ; X64-SSE42-LABEL: test_reduce_v16i32:
   1438 ; X64-SSE42:       ## %bb.0:
   1439 ; X64-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
   1440 ; X64-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
   1441 ; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
   1442 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1443 ; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
   1444 ; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1445 ; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
   1446 ; X64-SSE42-NEXT:    movd %xmm0, %eax
   1447 ; X64-SSE42-NEXT:    retq
   1448 ;
   1449 ; X64-AVX1-LABEL: test_reduce_v16i32:
   1450 ; X64-AVX1:       ## %bb.0:
   1451 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1452 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1453 ; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
   1454 ; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1455 ; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
   1456 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1457 ; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1458 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1459 ; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   1460 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
   1461 ; X64-AVX1-NEXT:    vzeroupper
   1462 ; X64-AVX1-NEXT:    retq
   1463 ;
   1464 ; X64-AVX2-LABEL: test_reduce_v16i32:
   1465 ; X64-AVX2:       ## %bb.0:
   1466 ; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1467 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1468 ; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1469 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1470 ; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1471 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1472 ; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   1473 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
   1474 ; X64-AVX2-NEXT:    vzeroupper
   1475 ; X64-AVX2-NEXT:    retq
   1476 ;
   1477 ; X64-AVX512-LABEL: test_reduce_v16i32:
   1478 ; X64-AVX512:       ## %bb.0:
   1479 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1480 ; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
   1481 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1482 ; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
   1483 ; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1484 ; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
   1485 ; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1486 ; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
   1487 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
   1488 ; X64-AVX512-NEXT:    vzeroupper
   1489 ; X64-AVX512-NEXT:    retq
; Reduction tree: each shufflevector swings the upper half of the remaining
; live lanes into the low half (8, then 4, then 2, then 1 lane); the paired
; icmp sgt + select keeps the per-lane signed maximum.  Only lane 0 of the
; final select is meaningful, hence the extractelement at index 0.
   1490   %1  = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1491   %2  = icmp sgt <16 x i32> %a0, %1
   1492   %3  = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
   1493   %4  = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1494   %5  = icmp sgt <16 x i32> %3, %4
   1495   %6  = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
   1496   %7  = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1497   %8  = icmp sgt <16 x i32> %6, %7
   1498   %9  = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
   1499   %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1500   %11 = icmp sgt <16 x i32> %9, %10
   1501   %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
   1502   %13 = extractelement <16 x i32> %12, i32 0
   1503   ret i32 %13
   1504 }
   1505 
; Horizontal signed-max reduction of 32 x i16 down to a scalar i16.  Same
; log2 shuffle/icmp-sgt/select tree as the i32 case, with one extra halving
; step for the 16-bit element size.  In the SSE4.2+/AVX/AVX512 expected
; output the final 128-bit step is done with phminposuw: XOR-ing every lane
; with 32767 flips the sign-bias so an unsigned horizontal *min* locates the
; signed *max*, undone by the second pxor.  All CHECK lines were
; autogenerated by utils/update_llc_test_checks.py -- regenerate with that
; script rather than hand-editing.
   1506 define i16 @test_reduce_v32i16(<32 x i16> %a0) {
   1507 ; X86-SSE2-LABEL: test_reduce_v32i16:
   1508 ; X86-SSE2:       ## %bb.0:
   1509 ; X86-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
   1510 ; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
   1511 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
   1512 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1513 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
   1514 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1515 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
   1516 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1517 ; X86-SSE2-NEXT:    psrld $16, %xmm1
   1518 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
   1519 ; X86-SSE2-NEXT:    movd %xmm1, %eax
   1520 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
   1521 ; X86-SSE2-NEXT:    retl
   1522 ;
   1523 ; X86-SSE42-LABEL: test_reduce_v32i16:
   1524 ; X86-SSE42:       ## %bb.0:
   1525 ; X86-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
   1526 ; X86-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
   1527 ; X86-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
   1528 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1529 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
   1530 ; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
   1531 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
   1532 ; X86-SSE42-NEXT:    movd %xmm0, %eax
   1533 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
   1534 ; X86-SSE42-NEXT:    retl
   1535 ;
   1536 ; X86-AVX1-LABEL: test_reduce_v32i16:
   1537 ; X86-AVX1:       ## %bb.0:
   1538 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1539 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1540 ; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
   1541 ; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
   1542 ; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
   1543 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1544 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1545 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
   1546 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1547 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
   1548 ; X86-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
   1549 ; X86-AVX1-NEXT:    vzeroupper
   1550 ; X86-AVX1-NEXT:    retl
   1551 ;
   1552 ; X86-AVX2-LABEL: test_reduce_v32i16:
   1553 ; X86-AVX2:       ## %bb.0:
   1554 ; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
   1555 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1556 ; X86-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
   1557 ; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1558 ; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1559 ; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
   1560 ; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1561 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
   1562 ; X86-AVX2-NEXT:    ## kill: def $ax killed $ax killed $eax
   1563 ; X86-AVX2-NEXT:    vzeroupper
   1564 ; X86-AVX2-NEXT:    retl
   1565 ;
   1566 ; X64-SSE2-LABEL: test_reduce_v32i16:
   1567 ; X64-SSE2:       ## %bb.0:
   1568 ; X64-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
   1569 ; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
   1570 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
   1571 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1572 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
   1573 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1574 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
   1575 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1576 ; X64-SSE2-NEXT:    psrld $16, %xmm1
   1577 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
   1578 ; X64-SSE2-NEXT:    movd %xmm1, %eax
   1579 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
   1580 ; X64-SSE2-NEXT:    retq
   1581 ;
   1582 ; X64-SSE42-LABEL: test_reduce_v32i16:
   1583 ; X64-SSE42:       ## %bb.0:
   1584 ; X64-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
   1585 ; X64-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
   1586 ; X64-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
   1587 ; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1588 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
   1589 ; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
   1590 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
   1591 ; X64-SSE42-NEXT:    movd %xmm0, %eax
   1592 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
   1593 ; X64-SSE42-NEXT:    retq
   1594 ;
   1595 ; X64-AVX1-LABEL: test_reduce_v32i16:
   1596 ; X64-AVX1:       ## %bb.0:
   1597 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1598 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1599 ; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
   1600 ; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
   1601 ; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
   1602 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1603 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1604 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
   1605 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1606 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
   1607 ; X64-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
   1608 ; X64-AVX1-NEXT:    vzeroupper
   1609 ; X64-AVX1-NEXT:    retq
   1610 ;
   1611 ; X64-AVX2-LABEL: test_reduce_v32i16:
   1612 ; X64-AVX2:       ## %bb.0:
   1613 ; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
   1614 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1615 ; X64-AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
   1616 ; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1617 ; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1618 ; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
   1619 ; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1620 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
   1621 ; X64-AVX2-NEXT:    ## kill: def $ax killed $ax killed $eax
   1622 ; X64-AVX2-NEXT:    vzeroupper
   1623 ; X64-AVX2-NEXT:    retq
   1624 ;
   1625 ; X64-AVX512-LABEL: test_reduce_v32i16:
   1626 ; X64-AVX512:       ## %bb.0:
   1627 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1628 ; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
   1629 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1630 ; X64-AVX512-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
   1631 ; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
   1632 ; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1633 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
   1634 ; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1635 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
   1636 ; X64-AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
   1637 ; X64-AVX512-NEXT:    vzeroupper
   1638 ; X64-AVX512-NEXT:    retq
; Reduction tree: five halving steps (16, 8, 4, 2, 1 live lanes); each step
; is shufflevector (upper half to front) + icmp sgt + select, and only lane 0
; of the final select is extracted.
   1639   %1  = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1640   %2  = icmp sgt <32 x i16> %a0, %1
   1641   %3  = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
   1642   %4  = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1643   %5  = icmp sgt <32 x i16> %3, %4
   1644   %6  = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
   1645   %7  = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1646   %8  = icmp sgt <32 x i16> %6, %7
   1647   %9  = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
   1648   %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1649   %11 = icmp sgt <32 x i16> %9, %10
   1650   %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
   1651   %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1652   %14 = icmp sgt <32 x i16> %12, %13
   1653   %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
   1654   %16 = extractelement <32 x i16> %15, i32 0
   1655   ret i16 %16
   1656 }
   1657 
   1658 define i8 @test_reduce_v64i8(<64 x i8> %a0) {
   1659 ; X86-SSE2-LABEL: test_reduce_v64i8:
   1660 ; X86-SSE2:       ## %bb.0:
   1661 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
   1662 ; X86-SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
   1663 ; X86-SSE2-NEXT:    pand %xmm4, %xmm1
   1664 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
   1665 ; X86-SSE2-NEXT:    por %xmm1, %xmm4
   1666 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1667 ; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
   1668 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
   1669 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
   1670 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
   1671 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1672 ; X86-SSE2-NEXT:    pcmpgtb %xmm4, %xmm0
   1673 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
   1674 ; X86-SSE2-NEXT:    pandn %xmm4, %xmm0
   1675 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
   1676 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1677 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
   1678 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
   1679 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
   1680 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
   1681 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
   1682 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1683 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1684 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
   1685 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
   1686 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
   1687 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
   1688 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1689 ; X86-SSE2-NEXT:    psrld $16, %xmm0
   1690 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1691 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
   1692 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
   1693 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
   1694 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
   1695 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
   1696 ; X86-SSE2-NEXT:    psrlw $8, %xmm0
   1697 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1698 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
   1699 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
   1700 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
   1701 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
   1702 ; X86-SSE2-NEXT:    movd %xmm1, %eax
   1703 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
   1704 ; X86-SSE2-NEXT:    retl
   1705 ;
   1706 ; X86-SSE42-LABEL: test_reduce_v64i8:
   1707 ; X86-SSE42:       ## %bb.0:
   1708 ; X86-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
   1709 ; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
   1710 ; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
   1711 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1712 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
   1713 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
   1714 ; X86-SSE42-NEXT:    psrlw $8, %xmm2
   1715 ; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
   1716 ; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
   1717 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
   1718 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
   1719 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
   1720 ; X86-SSE42-NEXT:    retl
   1721 ;
   1722 ; X86-AVX1-LABEL: test_reduce_v64i8:
   1723 ; X86-AVX1:       ## %bb.0:
   1724 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1725 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1726 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
   1727 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
   1728 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
   1729 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1730 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1731 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1732 ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
   1733 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
   1734 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1735 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
   1736 ; X86-AVX1-NEXT:    ## kill: def $al killed $al killed $eax
   1737 ; X86-AVX1-NEXT:    vzeroupper
   1738 ; X86-AVX1-NEXT:    retl
   1739 ;
   1740 ; X86-AVX2-LABEL: test_reduce_v64i8:
   1741 ; X86-AVX2:       ## %bb.0:
   1742 ; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
   1743 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1744 ; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
   1745 ; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1746 ; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1747 ; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1748 ; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
   1749 ; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
   1750 ; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1751 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
   1752 ; X86-AVX2-NEXT:    ## kill: def $al killed $al killed $eax
   1753 ; X86-AVX2-NEXT:    vzeroupper
   1754 ; X86-AVX2-NEXT:    retl
   1755 ;
   1756 ; X64-SSE2-LABEL: test_reduce_v64i8:
   1757 ; X64-SSE2:       ## %bb.0:
   1758 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
   1759 ; X64-SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
   1760 ; X64-SSE2-NEXT:    pand %xmm4, %xmm1
   1761 ; X64-SSE2-NEXT:    pandn %xmm3, %xmm4
   1762 ; X64-SSE2-NEXT:    por %xmm1, %xmm4
   1763 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
   1764 ; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
   1765 ; X64-SSE2-NEXT:    pand %xmm1, %xmm0
   1766 ; X64-SSE2-NEXT:    pandn %xmm2, %xmm1
   1767 ; X64-SSE2-NEXT:    por %xmm0, %xmm1
   1768 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1769 ; X64-SSE2-NEXT:    pcmpgtb %xmm4, %xmm0
   1770 ; X64-SSE2-NEXT:    pand %xmm0, %xmm1
   1771 ; X64-SSE2-NEXT:    pandn %xmm4, %xmm0
   1772 ; X64-SSE2-NEXT:    por %xmm1, %xmm0
   1773 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1774 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
   1775 ; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
   1776 ; X64-SSE2-NEXT:    pand %xmm2, %xmm0
   1777 ; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
   1778 ; X64-SSE2-NEXT:    por %xmm0, %xmm2
   1779 ; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1780 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1781 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
   1782 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
   1783 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
   1784 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
   1785 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
   1786 ; X64-SSE2-NEXT:    psrld $16, %xmm0
   1787 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
   1788 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
   1789 ; X64-SSE2-NEXT:    pand %xmm2, %xmm1
   1790 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
   1791 ; X64-SSE2-NEXT:    por %xmm1, %xmm2
   1792 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
   1793 ; X64-SSE2-NEXT:    psrlw $8, %xmm0
   1794 ; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
   1795 ; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
   1796 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
   1797 ; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
   1798 ; X64-SSE2-NEXT:    por %xmm2, %xmm1
   1799 ; X64-SSE2-NEXT:    movd %xmm1, %eax
   1800 ; X64-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
   1801 ; X64-SSE2-NEXT:    retq
   1802 ;
   1803 ; X64-SSE42-LABEL: test_reduce_v64i8:
   1804 ; X64-SSE42:       ## %bb.0:
   1805 ; X64-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
   1806 ; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
   1807 ; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
   1808 ; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1809 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
   1810 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
   1811 ; X64-SSE42-NEXT:    psrlw $8, %xmm2
   1812 ; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
   1813 ; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
   1814 ; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
   1815 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
   1816 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
   1817 ; X64-SSE42-NEXT:    retq
   1818 ;
   1819 ; X64-AVX1-LABEL: test_reduce_v64i8:
   1820 ; X64-AVX1:       ## %bb.0:
   1821 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1822 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1823 ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
   1824 ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
   1825 ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
   1826 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1827 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1828 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1829 ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
   1830 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
   1831 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1832 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
   1833 ; X64-AVX1-NEXT:    ## kill: def $al killed $al killed $eax
   1834 ; X64-AVX1-NEXT:    vzeroupper
   1835 ; X64-AVX1-NEXT:    retq
   1836 ;
   1837 ; X64-AVX2-LABEL: test_reduce_v64i8:
   1838 ; X64-AVX2:       ## %bb.0:
   1839 ; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
   1840 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1841 ; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
   1842 ; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1843 ; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1844 ; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1845 ; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
   1846 ; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
   1847 ; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1848 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
   1849 ; X64-AVX2-NEXT:    ## kill: def $al killed $al killed $eax
   1850 ; X64-AVX2-NEXT:    vzeroupper
   1851 ; X64-AVX2-NEXT:    retq
   1852 ;
   1853 ; X64-AVX512-LABEL: test_reduce_v64i8:
   1854 ; X64-AVX512:       ## %bb.0:
   1855 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1856 ; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
   1857 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1858 ; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
   1859 ; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1860 ; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1861 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1862 ; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
   1863 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
   1864 ; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
   1865 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
   1866 ; X64-AVX512-NEXT:    ## kill: def $al killed $al killed $eax
   1867 ; X64-AVX512-NEXT:    vzeroupper
   1868 ; X64-AVX512-NEXT:    retq
   1869   %1  = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1870   %2  = icmp sgt <64 x i8> %a0, %1
   1871   %3  = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
   1872   %4  = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1873   %5  = icmp sgt <64 x i8> %3, %4
   1874   %6  = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
   1875   %7  = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1876   %8  = icmp sgt <64 x i8> %6, %7
   1877   %9  = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
   1878   %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1879   %11 = icmp sgt <64 x i8> %9, %10
   1880   %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
   1881   %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1882   %14 = icmp sgt <64 x i8> %12, %13
   1883   %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
   1884   %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1885   %17 = icmp sgt <64 x i8> %15, %16
   1886   %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
   1887   %19 = extractelement <64 x i8> %18, i32 0
   1888   ret i8 %19
   1889 }
   1890