Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
      9 
      10 define <8 x i16> @test1(<8 x i16> %x) nounwind {
         ; Unsigned saturating subtract of splat(32768) from %x (8 x i16), written
         ; as a sign-bit test: a lane that compares slt 0 has bit 15 set, i.e. it
         ; is u>= 32768, and xor with 0x8000 clears that bit (== lane - 32768).
         ; Every other lane selects 0.
         ; Both check prefixes expect one (v)psubusw with a RIP-relative operand.
      11 ; SSE-LABEL: test1:
      12 ; SSE:       # %bb.0: # %vector.ph
      13 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
      14 ; SSE-NEXT:    retq
      15 ;
      16 ; AVX-LABEL: test1:
      17 ; AVX:       # %bb.0: # %vector.ph
      18 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
      19 ; AVX-NEXT:    retq
      20 vector.ph:
      21   %0 = icmp slt <8 x i16> %x, zeroinitializer
      22   %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
      23   %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
      24   ret <8 x i16> %res
      25 }
     26 
      27 define <8 x i16> @test2(<8 x i16> %x) nounwind {
         ; Unsigned saturating subtract of splat(32767) from %x (8 x i16), written
         ; as compare-then-add: lanes that are ugt 32766 (i.e. u>= 32767) select
         ; lane + (-32767), all other lanes select 0.
         ; Both check prefixes expect one (v)psubusw with a RIP-relative operand.
      28 ; SSE-LABEL: test2:
      29 ; SSE:       # %bb.0: # %vector.ph
      30 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
      31 ; SSE-NEXT:    retq
      32 ;
      33 ; AVX-LABEL: test2:
      34 ; AVX:       # %bb.0: # %vector.ph
      35 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
      36 ; AVX-NEXT:    retq
      37 vector.ph:
      38   %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
      39   %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
      40   %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
      41   ret <8 x i16> %res
      42 }
     43 
      44 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
         ; Unsigned saturating subtract of a splatted scalar from %x (8 x i16).
         ; %w is inserted into lane 0 and shuffled with an all-zero mask to fill
         ; every lane; lanes where %x is ult the splat select 0, the rest select
         ; %x - splat.
         ; Checked output: the splat is built per subtarget (movd + pshuflw +
         ; pshufd for the SSE and AVX1 prefixes, vmovd + vpbroadcastw for AVX2,
         ; vpbroadcastw directly from %edi for AVX512), followed by a single
         ; register-register (v)psubusw.
      45 ; SSE-LABEL: test3:
      46 ; SSE:       # %bb.0: # %vector.ph
      47 ; SSE-NEXT:    movd %edi, %xmm1
      48 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
      49 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
      50 ; SSE-NEXT:    psubusw %xmm1, %xmm0
      51 ; SSE-NEXT:    retq
      52 ;
      53 ; AVX1-LABEL: test3:
      54 ; AVX1:       # %bb.0: # %vector.ph
      55 ; AVX1-NEXT:    vmovd %edi, %xmm1
      56 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
      57 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
      58 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
      59 ; AVX1-NEXT:    retq
      60 ;
      61 ; AVX2-LABEL: test3:
      62 ; AVX2:       # %bb.0: # %vector.ph
      63 ; AVX2-NEXT:    vmovd %edi, %xmm1
      64 ; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
      65 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
      66 ; AVX2-NEXT:    retq
      67 ;
      68 ; AVX512-LABEL: test3:
      69 ; AVX512:       # %bb.0: # %vector.ph
      70 ; AVX512-NEXT:    vpbroadcastw %edi, %xmm1
      71 ; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
      72 ; AVX512-NEXT:    retq
      73 vector.ph:
      74   %0 = insertelement <8 x i16> undef, i16 %w, i32 0
      75   %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
      76   %1 = icmp ult <8 x i16> %x, %broadcast15
      77   %2 = sub <8 x i16> %x, %broadcast15
      78   %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
      79   ret <8 x i16> %res
      80 }
     81 
      82 define <16 x i8> @test4(<16 x i8> %x) nounwind {
         ; Byte-element counterpart of the sign-bit form: a lane that compares
         ; slt 0 has bit 7 set (u>= 128) and xor with 0x80 clears it
         ; (== lane - 128); every other lane selects 0. That is an unsigned
         ; saturating subtract of splat(128) from %x (16 x i8).
         ; Both check prefixes expect one (v)psubusb with a RIP-relative operand.
      83 ; SSE-LABEL: test4:
      84 ; SSE:       # %bb.0: # %vector.ph
      85 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
      86 ; SSE-NEXT:    retq
      87 ;
      88 ; AVX-LABEL: test4:
      89 ; AVX:       # %bb.0: # %vector.ph
      90 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
      91 ; AVX-NEXT:    retq
      92 vector.ph:
      93   %0 = icmp slt <16 x i8> %x, zeroinitializer
      94   %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
      95   %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
      96   ret <16 x i8> %res
      97 }
     98 
      99 define <16 x i8> @test5(<16 x i8> %x) nounwind {
         ; Byte-element counterpart of the compare-then-add form: lanes that are
         ; ugt 126 (i.e. u>= 127) select lane + (-127), all other lanes select 0.
         ; That is an unsigned saturating subtract of splat(127) from %x (16 x i8).
         ; Both check prefixes expect one (v)psubusb with a RIP-relative operand.
     100 ; SSE-LABEL: test5:
     101 ; SSE:       # %bb.0: # %vector.ph
     102 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
     103 ; SSE-NEXT:    retq
     104 ;
     105 ; AVX-LABEL: test5:
     106 ; AVX:       # %bb.0: # %vector.ph
     107 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
     108 ; AVX-NEXT:    retq
     109 vector.ph:
     110   %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
     111   %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
     112   %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
     113   ret <16 x i8> %res
     114 }
    115 
     116 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
         ; Unsigned saturating subtract of a splatted scalar from %x (16 x i8).
         ; %w is inserted into lane 0 and shuffled with an all-zero mask to fill
         ; every lane; lanes where %x is ult the splat select 0, the rest select
         ; %x - splat.
         ; Checked output differs only in how the byte splat is built: the SSE2
         ; prefix uses punpcklbw + pshuflw + pshufd, the SSSE3, SSE41 and AVX1
         ; prefixes use (v)pshufb with a zeroed mask register, AVX2 uses
         ; vmovd + vpbroadcastb, and AVX512 broadcasts directly from %edi.
         ; Each sequence ends in a single register-register (v)psubusb.
     117 ; SSE2-LABEL: test6:
     118 ; SSE2:       # %bb.0: # %vector.ph
     119 ; SSE2-NEXT:    movd %edi, %xmm1
     120 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
     121 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
     122 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
     123 ; SSE2-NEXT:    psubusb %xmm1, %xmm0
     124 ; SSE2-NEXT:    retq
     125 ;
     126 ; SSSE3-LABEL: test6:
     127 ; SSSE3:       # %bb.0: # %vector.ph
     128 ; SSSE3-NEXT:    movd %edi, %xmm1
     129 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
     130 ; SSSE3-NEXT:    pshufb %xmm2, %xmm1
     131 ; SSSE3-NEXT:    psubusb %xmm1, %xmm0
     132 ; SSSE3-NEXT:    retq
     133 ;
     134 ; SSE41-LABEL: test6:
     135 ; SSE41:       # %bb.0: # %vector.ph
     136 ; SSE41-NEXT:    movd %edi, %xmm1
     137 ; SSE41-NEXT:    pxor %xmm2, %xmm2
     138 ; SSE41-NEXT:    pshufb %xmm2, %xmm1
     139 ; SSE41-NEXT:    psubusb %xmm1, %xmm0
     140 ; SSE41-NEXT:    retq
     141 ;
     142 ; AVX1-LABEL: test6:
     143 ; AVX1:       # %bb.0: # %vector.ph
     144 ; AVX1-NEXT:    vmovd %edi, %xmm1
     145 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     146 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     147 ; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
     148 ; AVX1-NEXT:    retq
     149 ;
     150 ; AVX2-LABEL: test6:
     151 ; AVX2:       # %bb.0: # %vector.ph
     152 ; AVX2-NEXT:    vmovd %edi, %xmm1
     153 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
     154 ; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
     155 ; AVX2-NEXT:    retq
     156 ;
     157 ; AVX512-LABEL: test6:
     158 ; AVX512:       # %bb.0: # %vector.ph
     159 ; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
     160 ; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
     161 ; AVX512-NEXT:    retq
     162 vector.ph:
     163   %0 = insertelement <16 x i8> undef, i8 %w, i32 0
     164   %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
     165   %1 = icmp ult <16 x i8> %x, %broadcast15
     166   %2 = sub <16 x i8> %x, %broadcast15
     167   %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
     168   ret <16 x i8> %res
     169 }
    170 
     171 define <16 x i16> @test7(<16 x i16> %x) nounwind {
         ; 256-bit (16 x i16) version of the sign-bit form: lanes that compare
         ; slt 0 select lane xor 0x8000 (== lane - 32768), other lanes select 0,
         ; i.e. an unsigned saturating subtract of splat(32768).
         ; Checked output: the SSE prefix loads the 32768 splat once into xmm2
         ; and applies psubusw to both 128-bit halves (xmm0, xmm1); AVX1 extracts
         ; the upper half, subtracts per half and re-inserts; the AVX2 and AVX512
         ; prefixes use one ymm vpsubusw with a RIP-relative operand.
     172 ; SSE-LABEL: test7:
     173 ; SSE:       # %bb.0: # %vector.ph
     174 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
     175 ; SSE-NEXT:    psubusw %xmm2, %xmm0
     176 ; SSE-NEXT:    psubusw %xmm2, %xmm1
     177 ; SSE-NEXT:    retq
     178 ;
     179 ; AVX1-LABEL: test7:
     180 ; AVX1:       # %bb.0: # %vector.ph
     181 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     182 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
     183 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
     184 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
     185 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     186 ; AVX1-NEXT:    retq
     187 ;
     188 ; AVX2-LABEL: test7:
     189 ; AVX2:       # %bb.0: # %vector.ph
     190 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     191 ; AVX2-NEXT:    retq
     192 ;
     193 ; AVX512-LABEL: test7:
     194 ; AVX512:       # %bb.0: # %vector.ph
     195 ; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     196 ; AVX512-NEXT:    retq
     197 vector.ph:
     198   %0 = icmp slt <16 x i16> %x, zeroinitializer
     199   %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
     200   %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
     201   ret <16 x i16> %res
     202 }
    203 
     204 define <16 x i16> @test8(<16 x i16> %x) nounwind {
         ; 256-bit (16 x i16) version of the compare-then-add form: lanes that
         ; are ugt 32766 select lane + (-32767), other lanes select 0, i.e. an
         ; unsigned saturating subtract of splat(32767).
         ; Checked output: the SSE prefix loads the 32767 splat once into xmm2
         ; and applies psubusw to both 128-bit halves; AVX1 extracts the upper
         ; half, subtracts per half and re-inserts; the AVX2 and AVX512 prefixes
         ; use one ymm vpsubusw with a RIP-relative operand.
     205 ; SSE-LABEL: test8:
     206 ; SSE:       # %bb.0: # %vector.ph
     207 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
     208 ; SSE-NEXT:    psubusw %xmm2, %xmm0
     209 ; SSE-NEXT:    psubusw %xmm2, %xmm1
     210 ; SSE-NEXT:    retq
     211 ;
     212 ; AVX1-LABEL: test8:
     213 ; AVX1:       # %bb.0: # %vector.ph
     214 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     215 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
     216 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
     217 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
     218 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     219 ; AVX1-NEXT:    retq
     220 ;
     221 ; AVX2-LABEL: test8:
     222 ; AVX2:       # %bb.0: # %vector.ph
     223 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     224 ; AVX2-NEXT:    retq
     225 ;
     226 ; AVX512-LABEL: test8:
     227 ; AVX512:       # %bb.0: # %vector.ph
     228 ; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     229 ; AVX512-NEXT:    retq
     230 vector.ph:
     231   %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
     232   %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
     233   %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
     234   ret <16 x i16> %res
     235 }
    236 
     237 define <16 x i16> @test8a(<16 x i16> %x) nounwind {
         ; Compare-then-add form with a different constant in every lane
         ; (16 x i16). Lane i compares ugt (32766 - i) and adds -(32767 - i), so
         ; each lane is an unsigned saturating subtract of C_i = 32767 - i
         ; (32767 down to 32752).
         ; Checked output: because the two 128-bit halves need different
         ; constants, the SSE and AVX1 prefixes use a separate RIP-relative
         ; operand for each half; the AVX2 and AVX512 prefixes use one ymm
         ; vpsubusw with a RIP-relative operand.
     238 ; SSE-LABEL: test8a:
     239 ; SSE:       # %bb.0: # %vector.ph
     240 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
     241 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm1
     242 ; SSE-NEXT:    retq
     243 ;
     244 ; AVX1-LABEL: test8a:
     245 ; AVX1:       # %bb.0: # %vector.ph
     246 ; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm1
     247 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
     248 ; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
     249 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
     250 ; AVX1-NEXT:    retq
     251 ;
     252 ; AVX2-LABEL: test8a:
     253 ; AVX2:       # %bb.0: # %vector.ph
     254 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     255 ; AVX2-NEXT:    retq
     256 ;
     257 ; AVX512-LABEL: test8a:
     258 ; AVX512:       # %bb.0: # %vector.ph
     259 ; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
     260 ; AVX512-NEXT:    retq
     261 vector.ph:
     262   %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
     263   %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
     264   %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
     265   ret <16 x i16> %res
     266 }
    267 
     268 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
         ; 256-bit (16 x i16) unsigned saturating subtract of a splatted scalar.
         ; %w is inserted into lane 0 and shuffled with an all-zero mask to fill
         ; every lane; lanes where %x is ult the splat select 0, the rest select
         ; %x - splat.
         ; Checked output: the SSE prefix builds a 128-bit splat in xmm2 and
         ; applies psubusw to both halves; AVX1 builds the 128-bit splat and
         ; subtracts per extracted half before re-inserting; AVX2 broadcasts to
         ; ymm from xmm, AVX512 broadcasts to ymm directly from %edi, each
         ; followed by one ymm vpsubusw.
     269 ; SSE-LABEL: test9:
     270 ; SSE:       # %bb.0: # %vector.ph
     271 ; SSE-NEXT:    movd %edi, %xmm2
     272 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
     273 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
     274 ; SSE-NEXT:    psubusw %xmm2, %xmm0
     275 ; SSE-NEXT:    psubusw %xmm2, %xmm1
     276 ; SSE-NEXT:    retq
     277 ;
     278 ; AVX1-LABEL: test9:
     279 ; AVX1:       # %bb.0: # %vector.ph
     280 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     281 ; AVX1-NEXT:    vmovd %edi, %xmm2
     282 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
     283 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
     284 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
     285 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
     286 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     287 ; AVX1-NEXT:    retq
     288 ;
     289 ; AVX2-LABEL: test9:
     290 ; AVX2:       # %bb.0: # %vector.ph
     291 ; AVX2-NEXT:    vmovd %edi, %xmm1
     292 ; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
     293 ; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
     294 ; AVX2-NEXT:    retq
     295 ;
     296 ; AVX512-LABEL: test9:
     297 ; AVX512:       # %bb.0: # %vector.ph
     298 ; AVX512-NEXT:    vpbroadcastw %edi, %ymm1
     299 ; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
     300 ; AVX512-NEXT:    retq
     301 vector.ph:
     302   %0 = insertelement <16 x i16> undef, i16 %w, i32 0
     303   %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
     304   %1 = icmp ult <16 x i16> %x, %broadcast15
     305   %2 = sub <16 x i16> %x, %broadcast15
     306   %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
     307   ret <16 x i16> %res
     308 }
    309 
     310 define <32 x i8> @test10(<32 x i8> %x) nounwind {
         ; 256-bit (32 x i8) version of the sign-bit form: lanes that compare
         ; slt 0 select lane xor 0x80 (== lane - 128), other lanes select 0,
         ; i.e. an unsigned saturating subtract of splat(128).
         ; Checked output: the SSE prefix loads the 128 splat once into xmm2 and
         ; applies psubusb to both 128-bit halves; AVX1 extracts the upper half,
         ; subtracts per half and re-inserts; the AVX2 and AVX512 prefixes use
         ; one ymm vpsubusb with a RIP-relative operand.
     311 ; SSE-LABEL: test10:
     312 ; SSE:       # %bb.0: # %vector.ph
     313 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
     314 ; SSE-NEXT:    psubusb %xmm2, %xmm0
     315 ; SSE-NEXT:    psubusb %xmm2, %xmm1
     316 ; SSE-NEXT:    retq
     317 ;
     318 ; AVX1-LABEL: test10:
     319 ; AVX1:       # %bb.0: # %vector.ph
     320 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     321 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
     322 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm1
     323 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
     324 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     325 ; AVX1-NEXT:    retq
     326 ;
     327 ; AVX2-LABEL: test10:
     328 ; AVX2:       # %bb.0: # %vector.ph
     329 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     330 ; AVX2-NEXT:    retq
     331 ;
     332 ; AVX512-LABEL: test10:
     333 ; AVX512:       # %bb.0: # %vector.ph
     334 ; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     335 ; AVX512-NEXT:    retq
     336 vector.ph:
     337   %0 = icmp slt <32 x i8> %x, zeroinitializer
     338   %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
     339   %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
     340   ret <32 x i8> %res
     341 }
    342 
     343 define <32 x i8> @test11(<32 x i8> %x) nounwind {
         ; 256-bit (32 x i8) version of the compare-then-add form: lanes that are
         ; ugt 126 select lane + (-127), other lanes select 0, i.e. an unsigned
         ; saturating subtract of splat(127).
         ; Checked output: the SSE prefix loads the 127 splat once into xmm2 and
         ; applies psubusb to both 128-bit halves; AVX1 extracts the upper half,
         ; subtracts per half and re-inserts; the AVX2 and AVX512 prefixes use
         ; one ymm vpsubusb with a RIP-relative operand.
     344 ; SSE-LABEL: test11:
     345 ; SSE:       # %bb.0: # %vector.ph
     346 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
     347 ; SSE-NEXT:    psubusb %xmm2, %xmm0
     348 ; SSE-NEXT:    psubusb %xmm2, %xmm1
     349 ; SSE-NEXT:    retq
     350 ;
     351 ; AVX1-LABEL: test11:
     352 ; AVX1:       # %bb.0: # %vector.ph
     353 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     354 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
     355 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm1
     356 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
     357 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     358 ; AVX1-NEXT:    retq
     359 ;
     360 ; AVX2-LABEL: test11:
     361 ; AVX2:       # %bb.0: # %vector.ph
     362 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     363 ; AVX2-NEXT:    retq
     364 ;
     365 ; AVX512-LABEL: test11:
     366 ; AVX512:       # %bb.0: # %vector.ph
     367 ; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     368 ; AVX512-NEXT:    retq
     369 vector.ph:
     370   %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
     371   %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
     372   %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
     373   ret <32 x i8> %res
     374 }
    375 
     376 define <32 x i8> @test11a(<32 x i8> %x) nounwind {
         ; Compare-then-add form with a different constant in every lane
         ; (32 x i8). Lane i compares ugt (126 - i) and adds -(127 - i), so each
         ; lane is an unsigned saturating subtract of C_i = 127 - i
         ; (127 down to 96).
         ; Checked output: because the two 128-bit halves need different
         ; constants, the SSE and AVX1 prefixes use a separate RIP-relative
         ; operand for each half; the AVX2 and AVX512 prefixes use one ymm
         ; vpsubusb with a RIP-relative operand.
     377 ; SSE-LABEL: test11a:
     378 ; SSE:       # %bb.0: # %vector.ph
     379 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
     380 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm1
     381 ; SSE-NEXT:    retq
     382 ;
     383 ; AVX1-LABEL: test11a:
     384 ; AVX1:       # %bb.0: # %vector.ph
     385 ; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm1
     386 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
     387 ; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
     388 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
     389 ; AVX1-NEXT:    retq
     390 ;
     391 ; AVX2-LABEL: test11a:
     392 ; AVX2:       # %bb.0: # %vector.ph
     393 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     394 ; AVX2-NEXT:    retq
     395 ;
     396 ; AVX512-LABEL: test11a:
     397 ; AVX512:       # %bb.0: # %vector.ph
     398 ; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
     399 ; AVX512-NEXT:    retq
     400 vector.ph:
     401   %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
     402   %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
     403   %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
     404   ret <32 x i8> %res
     405 }
    406 
     407 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
         ; 256-bit (32 x i8) unsigned saturating subtract of a splatted scalar.
         ; %w is inserted into lane 0 and shuffled with an all-zero mask to fill
         ; every lane; lanes where %x is ult the splat select 0, the rest select
         ; %x - splat.
         ; Checked output: the SSE2 prefix builds the byte splat with punpcklbw +
         ; pshuflw + pshufd; the SSSE3, SSE41 and AVX1 prefixes use (v)pshufb
         ; with a zeroed mask register; all of those subtract per 128-bit half.
         ; AVX2 broadcasts to ymm from xmm and AVX512 broadcasts to ymm directly
         ; from %edi, each followed by one ymm vpsubusb.
     408 ; SSE2-LABEL: test12:
     409 ; SSE2:       # %bb.0: # %vector.ph
     410 ; SSE2-NEXT:    movd %edi, %xmm2
     411 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
     412 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
     413 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
     414 ; SSE2-NEXT:    psubusb %xmm2, %xmm0
     415 ; SSE2-NEXT:    psubusb %xmm2, %xmm1
     416 ; SSE2-NEXT:    retq
     417 ;
     418 ; SSSE3-LABEL: test12:
     419 ; SSSE3:       # %bb.0: # %vector.ph
     420 ; SSSE3-NEXT:    movd %edi, %xmm2
     421 ; SSSE3-NEXT:    pxor %xmm3, %xmm3
     422 ; SSSE3-NEXT:    pshufb %xmm3, %xmm2
     423 ; SSSE3-NEXT:    psubusb %xmm2, %xmm0
     424 ; SSSE3-NEXT:    psubusb %xmm2, %xmm1
     425 ; SSSE3-NEXT:    retq
     426 ;
     427 ; SSE41-LABEL: test12:
     428 ; SSE41:       # %bb.0: # %vector.ph
     429 ; SSE41-NEXT:    movd %edi, %xmm2
     430 ; SSE41-NEXT:    pxor %xmm3, %xmm3
     431 ; SSE41-NEXT:    pshufb %xmm3, %xmm2
     432 ; SSE41-NEXT:    psubusb %xmm2, %xmm0
     433 ; SSE41-NEXT:    psubusb %xmm2, %xmm1
     434 ; SSE41-NEXT:    retq
     435 ;
     436 ; AVX1-LABEL: test12:
     437 ; AVX1:       # %bb.0: # %vector.ph
     438 ; AVX1-NEXT:    vmovd %edi, %xmm1
     439 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     440 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     441 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
     442 ; AVX1-NEXT:    vpsubusb %xmm1, %xmm2, %xmm2
     443 ; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
     444 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
     445 ; AVX1-NEXT:    retq
     446 ;
     447 ; AVX2-LABEL: test12:
     448 ; AVX2:       # %bb.0: # %vector.ph
     449 ; AVX2-NEXT:    vmovd %edi, %xmm1
     450 ; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
     451 ; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
     452 ; AVX2-NEXT:    retq
     453 ;
     454 ; AVX512-LABEL: test12:
     455 ; AVX512:       # %bb.0: # %vector.ph
     456 ; AVX512-NEXT:    vpbroadcastb %edi, %ymm1
     457 ; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
     458 ; AVX512-NEXT:    retq
     459 vector.ph:
     460   %0 = insertelement <32 x i8> undef, i8 %w, i32 0
     461   %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
     462   %1 = icmp ult <32 x i8> %x, %broadcast15
     463   %2 = sub <32 x i8> %x, %broadcast15
     464   %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
     465   ret <32 x i8> %res
     466 }
    467 
     468 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
         ; Mixed-width saturating subtract: %x (8 x i16) is zero-extended to i32,
         ; compared ult against %y (8 x i32) and subtracted from at i32 width; the
         ; difference is truncated back to i16, and lanes where the extended %x is
         ; ult %y select 0 instead.
         ; Checked output contains no psubusw for any prefix. The SSE2 and SSSE3
         ; prefixes bias both sides with 0x80000000 to do an unsigned compare with
         ; pcmpgtd; SSE41, AVX1 and AVX2 use pmaxud + pcmpeqd and invert the
         ; result; all of them psubd, narrow to i16 and pandn with the compare
         ; mask. AVX512 compares into %k1 and uses a zero-masked vpmovdw.
         ; NOTE(review): presumably this records current (unfolded) codegen as a
         ; baseline for a future psubus fold - confirm against the commit history.
     469 ; SSE2-LABEL: test13:
     470 ; SSE2:       # %bb.0: # %vector.ph
     471 ; SSE2-NEXT:    pxor %xmm4, %xmm4
     472 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
     473 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
     474 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
     475 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
     476 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
     477 ; SSE2-NEXT:    psubd %xmm2, %xmm0
     478 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
     479 ; SSE2-NEXT:    pxor %xmm4, %xmm6
     480 ; SSE2-NEXT:    por %xmm4, %xmm5
     481 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
     482 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
     483 ; SSE2-NEXT:    pxor %xmm4, %xmm2
     484 ; SSE2-NEXT:    por %xmm3, %xmm4
     485 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
     486 ; SSE2-NEXT:    packssdw %xmm6, %xmm2
     487 ; SSE2-NEXT:    psubd %xmm1, %xmm3
     488 ; SSE2-NEXT:    pslld $16, %xmm0
     489 ; SSE2-NEXT:    psrad $16, %xmm0
     490 ; SSE2-NEXT:    pslld $16, %xmm3
     491 ; SSE2-NEXT:    psrad $16, %xmm3
     492 ; SSE2-NEXT:    packssdw %xmm0, %xmm3
     493 ; SSE2-NEXT:    pandn %xmm3, %xmm2
     494 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
     495 ; SSE2-NEXT:    retq
     496 ;
     497 ; SSSE3-LABEL: test13:
     498 ; SSSE3:       # %bb.0: # %vector.ph
     499 ; SSSE3-NEXT:    pxor %xmm3, %xmm3
     500 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
     501 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
     502 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
     503 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
     504 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
     505 ; SSSE3-NEXT:    psubd %xmm2, %xmm0
     506 ; SSSE3-NEXT:    movdqa %xmm2, %xmm6
     507 ; SSSE3-NEXT:    pxor %xmm3, %xmm6
     508 ; SSSE3-NEXT:    por %xmm3, %xmm5
     509 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
     510 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
     511 ; SSSE3-NEXT:    pxor %xmm3, %xmm2
     512 ; SSSE3-NEXT:    por %xmm4, %xmm3
     513 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
     514 ; SSSE3-NEXT:    packssdw %xmm6, %xmm2
     515 ; SSSE3-NEXT:    psubd %xmm1, %xmm4
     516 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     517 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
     518 ; SSSE3-NEXT:    pshufb %xmm1, %xmm4
     519 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
     520 ; SSSE3-NEXT:    pandn %xmm4, %xmm2
     521 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
     522 ; SSSE3-NEXT:    retq
     523 ;
     524 ; SSE41-LABEL: test13:
     525 ; SSE41:       # %bb.0: # %vector.ph
     526 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
     527 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
     528 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
     529 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
     530 ; SSE41-NEXT:    pmaxud %xmm1, %xmm0
     531 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
     532 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
     533 ; SSE41-NEXT:    pxor %xmm5, %xmm0
     534 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     535 ; SSE41-NEXT:    pshufb %xmm6, %xmm0
     536 ; SSE41-NEXT:    movdqa %xmm3, %xmm7
     537 ; SSE41-NEXT:    pmaxud %xmm2, %xmm7
     538 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
     539 ; SSE41-NEXT:    pxor %xmm5, %xmm7
     540 ; SSE41-NEXT:    pshufb %xmm6, %xmm7
     541 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
     542 ; SSE41-NEXT:    psubd %xmm2, %xmm3
     543 ; SSE41-NEXT:    psubd %xmm1, %xmm4
     544 ; SSE41-NEXT:    pshufb %xmm6, %xmm4
     545 ; SSE41-NEXT:    pshufb %xmm6, %xmm3
     546 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
     547 ; SSE41-NEXT:    pandn %xmm4, %xmm0
     548 ; SSE41-NEXT:    retq
     549 ;
     550 ; AVX1-LABEL: test13:
     551 ; AVX1:       # %bb.0: # %vector.ph
     552 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
     553 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
     554 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
     555 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm3
     556 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
     557 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
     558 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
     559 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
     560 ; AVX1-NEXT:    vpmaxud %xmm5, %xmm2, %xmm6
     561 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
     562 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
     563 ; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
     564 ; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
     565 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
     566 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     567 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
     568 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
     569 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     570 ; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
     571 ; AVX1-NEXT:    vzeroupper
     572 ; AVX1-NEXT:    retq
     573 ;
     574 ; AVX2-LABEL: test13:
     575 ; AVX2:       # %bb.0: # %vector.ph
     576 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
     577 ; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm2
     578 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
     579 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
     580 ; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
     581 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
     582 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
     583 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
     584 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
     585 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
     586 ; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
     587 ; AVX2-NEXT:    vzeroupper
     588 ; AVX2-NEXT:    retq
     589 ;
     590 ; AVX512-LABEL: test13:
     591 ; AVX512:       # %bb.0: # %vector.ph
     592 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
     593 ; AVX512-NEXT:    vpcmpnltud %ymm1, %ymm0, %k1
     594 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
     595 ; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
     596 ; AVX512-NEXT:    vzeroupper
     597 ; AVX512-NEXT:    retq
     598 vector.ph:
     599   %lhs = zext <8 x i16> %x to <8 x i32>
     600   %cond = icmp ult <8 x i32> %lhs, %y
     601   %sub = sub <8 x i32> %lhs, %y
     602   %trunc = trunc <8 x i32> %sub to <8 x i16>
     603   %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
     604   ret <8 x i16> %res
     605 }
    606 
    607 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
    608 ; SSE2-LABEL: test14:
    609 ; SSE2:       # %bb.0: # %vector.ph
    610 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
    611 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    612 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
    613 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
    614 ; SSE2-NEXT:    movdqa %xmm6, %xmm8
    615 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
    616 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
    617 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
    618 ; SSE2-NEXT:    movdqa %xmm5, %xmm10
    619 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
    620 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
    621 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
    622 ; SSE2-NEXT:    movdqa %xmm4, %xmm9
    623 ; SSE2-NEXT:    pxor %xmm0, %xmm9
    624 ; SSE2-NEXT:    psubd %xmm5, %xmm4
    625 ; SSE2-NEXT:    por %xmm0, %xmm5
    626 ; SSE2-NEXT:    pcmpgtd %xmm9, %xmm5
    627 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255]
    628 ; SSE2-NEXT:    pand %xmm9, %xmm5
    629 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
    630 ; SSE2-NEXT:    pxor %xmm0, %xmm7
    631 ; SSE2-NEXT:    psubd %xmm10, %xmm3
    632 ; SSE2-NEXT:    por %xmm0, %xmm10
    633 ; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
    634 ; SSE2-NEXT:    pand %xmm9, %xmm10
    635 ; SSE2-NEXT:    packuswb %xmm5, %xmm10
    636 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
    637 ; SSE2-NEXT:    pxor %xmm0, %xmm5
    638 ; SSE2-NEXT:    psubd %xmm6, %xmm2
    639 ; SSE2-NEXT:    por %xmm0, %xmm6
    640 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
    641 ; SSE2-NEXT:    pand %xmm9, %xmm6
    642 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
    643 ; SSE2-NEXT:    pxor %xmm0, %xmm5
    644 ; SSE2-NEXT:    por %xmm8, %xmm0
    645 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
    646 ; SSE2-NEXT:    pand %xmm9, %xmm0
    647 ; SSE2-NEXT:    packuswb %xmm6, %xmm0
    648 ; SSE2-NEXT:    packuswb %xmm10, %xmm0
    649 ; SSE2-NEXT:    psubd %xmm8, %xmm1
    650 ; SSE2-NEXT:    pand %xmm9, %xmm4
    651 ; SSE2-NEXT:    pand %xmm9, %xmm3
    652 ; SSE2-NEXT:    packuswb %xmm4, %xmm3
    653 ; SSE2-NEXT:    pand %xmm9, %xmm2
    654 ; SSE2-NEXT:    pand %xmm9, %xmm1
    655 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
    656 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
    657 ; SSE2-NEXT:    pandn %xmm1, %xmm0
    658 ; SSE2-NEXT:    retq
    659 ;
    660 ; SSSE3-LABEL: test14:
    661 ; SSSE3:       # %bb.0: # %vector.ph
    662 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
    663 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
    664 ; SSSE3-NEXT:    movdqa %xmm5, %xmm7
    665 ; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
    666 ; SSSE3-NEXT:    movdqa %xmm7, %xmm8
    667 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
    668 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
    669 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
    670 ; SSSE3-NEXT:    movdqa %xmm5, %xmm10
    671 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
    672 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
    673 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
    674 ; SSSE3-NEXT:    movdqa %xmm2, %xmm9
    675 ; SSSE3-NEXT:    pxor %xmm0, %xmm9
    676 ; SSSE3-NEXT:    psubd %xmm5, %xmm2
    677 ; SSSE3-NEXT:    por %xmm0, %xmm5
    678 ; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm5
    679 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    680 ; SSSE3-NEXT:    pshufb %xmm9, %xmm5
    681 ; SSSE3-NEXT:    movdqa %xmm1, %xmm6
    682 ; SSSE3-NEXT:    pxor %xmm0, %xmm6
    683 ; SSSE3-NEXT:    psubd %xmm10, %xmm1
    684 ; SSSE3-NEXT:    por %xmm0, %xmm10
    685 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm10
    686 ; SSSE3-NEXT:    pshufb %xmm9, %xmm10
    687 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
    688 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
    689 ; SSSE3-NEXT:    pxor %xmm0, %xmm5
    690 ; SSSE3-NEXT:    psubd %xmm7, %xmm4
    691 ; SSSE3-NEXT:    por %xmm0, %xmm7
    692 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
    693 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    694 ; SSSE3-NEXT:    pshufb %xmm5, %xmm7
    695 ; SSSE3-NEXT:    movdqa %xmm3, %xmm6
    696 ; SSSE3-NEXT:    pxor %xmm0, %xmm6
    697 ; SSSE3-NEXT:    por %xmm8, %xmm0
    698 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm0
    699 ; SSSE3-NEXT:    pshufb %xmm5, %xmm0
    700 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
    701 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
    702 ; SSSE3-NEXT:    psubd %xmm8, %xmm3
    703 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
    704 ; SSSE3-NEXT:    pand %xmm5, %xmm4
    705 ; SSSE3-NEXT:    pand %xmm5, %xmm3
    706 ; SSSE3-NEXT:    packuswb %xmm4, %xmm3
    707 ; SSSE3-NEXT:    pand %xmm5, %xmm2
    708 ; SSSE3-NEXT:    pand %xmm5, %xmm1
    709 ; SSSE3-NEXT:    packuswb %xmm2, %xmm1
    710 ; SSSE3-NEXT:    packuswb %xmm3, %xmm1
    711 ; SSSE3-NEXT:    andnpd %xmm1, %xmm0
    712 ; SSSE3-NEXT:    retq
    713 ;
    714 ; SSE41-LABEL: test14:
    715 ; SSE41:       # %bb.0: # %vector.ph
    716 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
    717 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    718 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    719 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
    720 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
    721 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    722 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    723 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
    724 ; SSE41-NEXT:    pmaxud %xmm10, %xmm0
    725 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
    726 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
    727 ; SSE41-NEXT:    pxor %xmm6, %xmm0
    728 ; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    729 ; SSE41-NEXT:    pshufb %xmm7, %xmm0
    730 ; SSE41-NEXT:    movdqa %xmm3, %xmm5
    731 ; SSE41-NEXT:    pmaxud %xmm9, %xmm5
    732 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
    733 ; SSE41-NEXT:    pxor %xmm6, %xmm5
    734 ; SSE41-NEXT:    pshufb %xmm7, %xmm5
    735 ; SSE41-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
    736 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
    737 ; SSE41-NEXT:    pmaxud %xmm8, %xmm0
    738 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
    739 ; SSE41-NEXT:    pxor %xmm6, %xmm0
    740 ; SSE41-NEXT:    movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    741 ; SSE41-NEXT:    pshufb %xmm12, %xmm0
    742 ; SSE41-NEXT:    movdqa %xmm2, %xmm7
    743 ; SSE41-NEXT:    pmaxud %xmm11, %xmm7
    744 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
    745 ; SSE41-NEXT:    pxor %xmm6, %xmm7
    746 ; SSE41-NEXT:    pshufb %xmm12, %xmm7
    747 ; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
    748 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
    749 ; SSE41-NEXT:    psubd %xmm11, %xmm2
    750 ; SSE41-NEXT:    psubd %xmm8, %xmm1
    751 ; SSE41-NEXT:    psubd %xmm9, %xmm3
    752 ; SSE41-NEXT:    psubd %xmm10, %xmm4
    753 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
    754 ; SSE41-NEXT:    pand %xmm5, %xmm4
    755 ; SSE41-NEXT:    pand %xmm5, %xmm3
    756 ; SSE41-NEXT:    packusdw %xmm4, %xmm3
    757 ; SSE41-NEXT:    pand %xmm5, %xmm1
    758 ; SSE41-NEXT:    pand %xmm5, %xmm2
    759 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
    760 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
    761 ; SSE41-NEXT:    pandn %xmm1, %xmm0
    762 ; SSE41-NEXT:    retq
    763 ;
    764 ; AVX1-LABEL: test14:
    765 ; AVX1:       # %bb.0: # %vector.ph
    766 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
    767 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    768 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    769 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
    770 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    771 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    772 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    773 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
    774 ; AVX1-NEXT:    vpmaxud %xmm0, %xmm6, %xmm7
    775 ; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm7
    776 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    777 ; AVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
    778 ; AVX1-NEXT:    vpmaxud %xmm11, %xmm2, %xmm4
    779 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
    780 ; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
    781 ; AVX1-NEXT:    vpackssdw %xmm7, %xmm4, %xmm10
    782 ; AVX1-NEXT:    vpmaxud %xmm9, %xmm1, %xmm7
    783 ; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm7
    784 ; AVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
    785 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
    786 ; AVX1-NEXT:    vpmaxud %xmm8, %xmm4, %xmm5
    787 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
    788 ; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm3
    789 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm7, %xmm3
    790 ; AVX1-NEXT:    vpacksswb %xmm10, %xmm3, %xmm3
    791 ; AVX1-NEXT:    vpsubd %xmm8, %xmm4, %xmm4
    792 ; AVX1-NEXT:    vpsubd %xmm9, %xmm1, %xmm1
    793 ; AVX1-NEXT:    vpsubd %xmm11, %xmm2, %xmm2
    794 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
    795 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
    796 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
    797 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
    798 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
    799 ; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
    800 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm2
    801 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
    802 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
    803 ; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
    804 ; AVX1-NEXT:    vzeroupper
    805 ; AVX1-NEXT:    retq
    806 ;
    807 ; AVX2-LABEL: test14:
    808 ; AVX2:       # %bb.0: # %vector.ph
    809 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
    810 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
    811 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
    812 ; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm4
    813 ; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm1, %ymm4
    814 ; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
    815 ; AVX2-NEXT:    vpxor %ymm5, %ymm4, %ymm4
    816 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
    817 ; AVX2-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
    818 ; AVX2-NEXT:    vpmaxud %ymm3, %ymm2, %ymm6
    819 ; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm2, %ymm6
    820 ; AVX2-NEXT:    vpxor %ymm5, %ymm6, %ymm5
    821 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
    822 ; AVX2-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
    823 ; AVX2-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
    824 ; AVX2-NEXT:    vpsubd %ymm3, %ymm2, %ymm2
    825 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
    826 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    827 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
    828 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    829 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    830 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    831 ; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
    832 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
    833 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
    834 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    835 ; AVX2-NEXT:    vpandn %xmm0, %xmm4, %xmm0
    836 ; AVX2-NEXT:    vzeroupper
    837 ; AVX2-NEXT:    retq
    838 ;
    839 ; AVX512-LABEL: test14:
    840 ; AVX512:       # %bb.0: # %vector.ph
    841 ; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    842 ; AVX512-NEXT:    vpcmpnltud %zmm0, %zmm1, %k1
    843 ; AVX512-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
    844 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
    845 ; AVX512-NEXT:    vzeroupper
    846 ; AVX512-NEXT:    retq
    847 vector.ph:
    848   %rhs = zext <16 x i8> %x to <16 x i32>
    849   %cond = icmp ult <16 x i32> %y, %rhs
    850   %sub = sub <16 x i32> %y, %rhs
    851   %truncsub = trunc <16 x i32> %sub to <16 x i8>
    852   %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
    853   ret <16 x i8> %res
    854 }
    855 
    856 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
        ; Mixed-width subtract-and-clamp pattern: %x is zero-extended to i32, %y is
        ; subtracted from it, and the truncated difference is kept only in lanes
        ; where zext(%x) > %y (unsigned); every other lane is zero.
        ; The assertions below expect a separate compare, subtract and mask/select
        ; on every tested target; none of them contains a psubusw.
    857 ; SSE2-LABEL: test15:
    858 ; SSE2:       # %bb.0: # %vector.ph
    859 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    860 ; SSE2-NEXT:    pxor %xmm4, %xmm4
    861 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    862 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    863 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
    864 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
    865 ; SSE2-NEXT:    psubd %xmm2, %xmm3
    866 ; SSE2-NEXT:    pxor %xmm4, %xmm2
    867 ; SSE2-NEXT:    por %xmm4, %xmm5
    868 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
    869 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
    870 ; SSE2-NEXT:    pxor %xmm4, %xmm2
    871 ; SSE2-NEXT:    por %xmm0, %xmm4
    872 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
    873 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
    874 ; SSE2-NEXT:    psubd %xmm1, %xmm0
    875 ; SSE2-NEXT:    pslld $16, %xmm3
    876 ; SSE2-NEXT:    psrad $16, %xmm3
    877 ; SSE2-NEXT:    pslld $16, %xmm0
    878 ; SSE2-NEXT:    psrad $16, %xmm0
    879 ; SSE2-NEXT:    packssdw %xmm3, %xmm0
    880 ; SSE2-NEXT:    pand %xmm4, %xmm0
    881 ; SSE2-NEXT:    retq
    882 ;
    883 ; SSSE3-LABEL: test15:
    884 ; SSSE3:       # %bb.0: # %vector.ph
    885 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
    886 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
    887 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
    888 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
    889 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
    890 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
    891 ; SSSE3-NEXT:    psubd %xmm2, %xmm0
    892 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
    893 ; SSSE3-NEXT:    por %xmm4, %xmm5
    894 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
    895 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
    896 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
    897 ; SSSE3-NEXT:    por %xmm3, %xmm4
    898 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
    899 ; SSSE3-NEXT:    packssdw %xmm5, %xmm4
    900 ; SSSE3-NEXT:    psubd %xmm1, %xmm3
    901 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    902 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    903 ; SSSE3-NEXT:    pshufb %xmm1, %xmm3
    904 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
    905 ; SSSE3-NEXT:    pand %xmm4, %xmm3
    906 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
    907 ; SSSE3-NEXT:    retq
    908 ;
    909 ; SSE41-LABEL: test15:
    910 ; SSE41:       # %bb.0: # %vector.ph
    911 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
    912 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
    913 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    914 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
    915 ; SSE41-NEXT:    pminud %xmm1, %xmm4
    916 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
    917 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
    918 ; SSE41-NEXT:    pxor %xmm5, %xmm4
    919 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    920 ; SSE41-NEXT:    pshufb %xmm6, %xmm4
    921 ; SSE41-NEXT:    movdqa %xmm3, %xmm7
    922 ; SSE41-NEXT:    pminud %xmm2, %xmm7
    923 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
    924 ; SSE41-NEXT:    pxor %xmm5, %xmm7
    925 ; SSE41-NEXT:    pshufb %xmm6, %xmm7
    926 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
    927 ; SSE41-NEXT:    psubd %xmm2, %xmm3
    928 ; SSE41-NEXT:    psubd %xmm1, %xmm0
    929 ; SSE41-NEXT:    pshufb %xmm6, %xmm0
    930 ; SSE41-NEXT:    pshufb %xmm6, %xmm3
    931 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
    932 ; SSE41-NEXT:    pand %xmm4, %xmm0
    933 ; SSE41-NEXT:    retq
    934 ;
    935 ; AVX1-LABEL: test15:
    936 ; AVX1:       # %bb.0: # %vector.ph
    937 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    938 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
    939 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    940 ; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
    941 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
    942 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
    943 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
    944 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
    945 ; AVX1-NEXT:    vpminud %xmm5, %xmm2, %xmm6
    946 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
    947 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
    948 ; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
    949 ; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
    950 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
    951 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    952 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    953 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
    954 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    955 ; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
    956 ; AVX1-NEXT:    vzeroupper
    957 ; AVX1-NEXT:    retq
    958 ;
    959 ; AVX2-LABEL: test15:
    960 ; AVX2:       # %bb.0: # %vector.ph
    961 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    962 ; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
    963 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
    964 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    965 ; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
    966 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
    967 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
    968 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
    969 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    970 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    971 ; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
    972 ; AVX2-NEXT:    vzeroupper
    973 ; AVX2-NEXT:    retq
    974 ;
    975 ; AVX512-LABEL: test15:
    976 ; AVX512:       # %bb.0: # %vector.ph
    977 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    978 ; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
    979 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
    980 ; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
    981 ; AVX512-NEXT:    vzeroupper
    982 ; AVX512-NEXT:    retq
    983 vector.ph:
    984   %lhs = zext <8 x i16> %x to <8 x i32>
    985   %cond = icmp ugt <8 x i32> %lhs, %y
    986   %sub = sub <8 x i32> %lhs, %y
    987   %truncsub = trunc <8 x i32> %sub to <8 x i16>
    988   %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
    989   ret <8 x i16> %res
    990 }
    991 
    992 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
        ; Same subtract-and-clamp computation as @test15, but the guard is spelled
        ; with swapped operands: `icmp ult %y, %lhs` instead of `icmp ugt %lhs, %y`.
        ; The expected SSE4.1/AVX1/AVX2 sequences use (v)pmaxud for the compare where
        ; @test15 uses (v)pminud, and AVX512 uses vpcmpltud with swapped operands;
        ; as in @test15, no psubusw appears for any tested target.
    993 ; SSE2-LABEL: test16:
    994 ; SSE2:       # %bb.0: # %vector.ph
    995 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    996 ; SSE2-NEXT:    pxor %xmm4, %xmm4
    997 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    998 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    999 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   1000 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   1001 ; SSE2-NEXT:    psubd %xmm2, %xmm3
   1002 ; SSE2-NEXT:    pxor %xmm4, %xmm2
   1003 ; SSE2-NEXT:    por %xmm4, %xmm5
   1004 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
   1005 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   1006 ; SSE2-NEXT:    pxor %xmm4, %xmm2
   1007 ; SSE2-NEXT:    por %xmm0, %xmm4
   1008 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
   1009 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
   1010 ; SSE2-NEXT:    psubd %xmm1, %xmm0
   1011 ; SSE2-NEXT:    pslld $16, %xmm3
   1012 ; SSE2-NEXT:    psrad $16, %xmm3
   1013 ; SSE2-NEXT:    pslld $16, %xmm0
   1014 ; SSE2-NEXT:    psrad $16, %xmm0
   1015 ; SSE2-NEXT:    packssdw %xmm3, %xmm0
   1016 ; SSE2-NEXT:    pand %xmm4, %xmm0
   1017 ; SSE2-NEXT:    retq
   1018 ;
   1019 ; SSSE3-LABEL: test16:
   1020 ; SSSE3:       # %bb.0: # %vector.ph
   1021 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
   1022 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
   1023 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
   1024 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
   1025 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   1026 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
   1027 ; SSSE3-NEXT:    psubd %xmm2, %xmm0
   1028 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
   1029 ; SSSE3-NEXT:    por %xmm4, %xmm5
   1030 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
   1031 ; SSSE3-NEXT:    movdqa %xmm1, %xmm2
   1032 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
   1033 ; SSSE3-NEXT:    por %xmm3, %xmm4
   1034 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
   1035 ; SSSE3-NEXT:    packssdw %xmm5, %xmm4
   1036 ; SSSE3-NEXT:    psubd %xmm1, %xmm3
   1037 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1038 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   1039 ; SSSE3-NEXT:    pshufb %xmm1, %xmm3
   1040 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
   1041 ; SSSE3-NEXT:    pand %xmm4, %xmm3
   1042 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   1043 ; SSSE3-NEXT:    retq
   1044 ;
   1045 ; SSE41-LABEL: test16:
   1046 ; SSE41:       # %bb.0: # %vector.ph
   1047 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
   1048 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   1049 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1050 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
   1051 ; SSE41-NEXT:    pmaxud %xmm0, %xmm4
   1052 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm4
   1053 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
   1054 ; SSE41-NEXT:    pxor %xmm5, %xmm4
   1055 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1056 ; SSE41-NEXT:    pshufb %xmm6, %xmm4
   1057 ; SSE41-NEXT:    movdqa %xmm2, %xmm7
   1058 ; SSE41-NEXT:    pmaxud %xmm3, %xmm7
   1059 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
   1060 ; SSE41-NEXT:    pxor %xmm5, %xmm7
   1061 ; SSE41-NEXT:    pshufb %xmm6, %xmm7
   1062 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
   1063 ; SSE41-NEXT:    psubd %xmm2, %xmm3
   1064 ; SSE41-NEXT:    psubd %xmm1, %xmm0
   1065 ; SSE41-NEXT:    pshufb %xmm6, %xmm0
   1066 ; SSE41-NEXT:    pshufb %xmm6, %xmm3
   1067 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
   1068 ; SSE41-NEXT:    pand %xmm4, %xmm0
   1069 ; SSE41-NEXT:    retq
   1070 ;
   1071 ; AVX1-LABEL: test16:
   1072 ; AVX1:       # %bb.0: # %vector.ph
   1073 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1074 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1075 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1076 ; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm3
   1077 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
   1078 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
   1079 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
   1080 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
   1081 ; AVX1-NEXT:    vpmaxud %xmm2, %xmm5, %xmm6
   1082 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
   1083 ; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
   1084 ; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
   1085 ; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
   1086 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
   1087 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1088 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
   1089 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
   1090 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1091 ; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
   1092 ; AVX1-NEXT:    vzeroupper
   1093 ; AVX1-NEXT:    retq
   1094 ;
   1095 ; AVX2-LABEL: test16:
   1096 ; AVX2:       # %bb.0: # %vector.ph
   1097 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1098 ; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm2
   1099 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
   1100 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
   1101 ; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
   1102 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   1103 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
   1104 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
   1105 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1106 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1107 ; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
   1108 ; AVX2-NEXT:    vzeroupper
   1109 ; AVX2-NEXT:    retq
   1110 ;
   1111 ; AVX512-LABEL: test16:
   1112 ; AVX512:       # %bb.0: # %vector.ph
   1113 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1114 ; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
   1115 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
   1116 ; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
   1117 ; AVX512-NEXT:    vzeroupper
   1118 ; AVX512-NEXT:    retq
   1119 vector.ph:
   1120   %lhs = zext <8 x i16> %x to <8 x i32>
   1121   %cond = icmp ult <8 x i32> %y, %lhs
   1122   %sub = sub <8 x i32> %lhs, %y
   1123   %truncsub = trunc <8 x i32> %sub to <8 x i16>
   1124   %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
   1125   ret <8 x i16> %res
   1126 }
   1127 
   1128 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
        ; Computes umax(%x, %y) - %y per i16 lane, with the unsigned max spelled as
        ; icmp ult + select. The result is %x - %y when %x >= %y and 0 otherwise.
        ; The assertions expect this to become a single psubusw / vpsubusw.
   1129 ; SSE-LABEL: psubus_8i16_max:
   1130 ; SSE:       # %bb.0: # %vector.ph
   1131 ; SSE-NEXT:    psubusw %xmm1, %xmm0
   1132 ; SSE-NEXT:    retq
   1133 ;
   1134 ; AVX-LABEL: psubus_8i16_max:
   1135 ; AVX:       # %bb.0: # %vector.ph
   1136 ; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1137 ; AVX-NEXT:    retq
   1138 vector.ph:
   1139   %cmp = icmp ult <8 x i16> %x, %y
   1140   %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
   1141   %res = sub <8 x i16> %max, %y
   1142   ret <8 x i16> %res
   1143 }
   1144 
   1145 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
        ; Computes umax(%x, %y) - %y per i8 lane, with the unsigned max spelled as
        ; icmp ult + select. The result is %x - %y when %x >= %y and 0 otherwise.
        ; The assertions expect this to become a single psubusb / vpsubusb.
   1146 ; SSE-LABEL: psubus_16i8_max:
   1147 ; SSE:       # %bb.0: # %vector.ph
   1148 ; SSE-NEXT:    psubusb %xmm1, %xmm0
   1149 ; SSE-NEXT:    retq
   1150 ;
   1151 ; AVX-LABEL: psubus_16i8_max:
   1152 ; AVX:       # %bb.0: # %vector.ph
   1153 ; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
   1154 ; AVX-NEXT:    retq
   1155 vector.ph:
   1156   %cmp = icmp ult <16 x i8> %x, %y
   1157   %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
   1158   %res = sub <16 x i8> %max, %y
   1159   ret <16 x i8> %res
   1160 }
   1161 
   1162 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
        ; 256-bit variant of the umax(%x, %y) - %y pattern on i16 lanes.
        ; Expected lowering per the assertions: two psubusw on SSE, two 128-bit
        ; vpsubusw joined with vinsertf128 on AVX1, and a single ymm vpsubusw on
        ; AVX2 and AVX512.
   1163 ; SSE-LABEL: psubus_16i16_max:
   1164 ; SSE:       # %bb.0: # %vector.ph
   1165 ; SSE-NEXT:    psubusw %xmm2, %xmm0
   1166 ; SSE-NEXT:    psubusw %xmm3, %xmm1
   1167 ; SSE-NEXT:    retq
   1168 ;
   1169 ; AVX1-LABEL: psubus_16i16_max:
   1170 ; AVX1:       # %bb.0: # %vector.ph
   1171 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1172 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1173 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
   1174 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1175 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1176 ; AVX1-NEXT:    retq
   1177 ;
   1178 ; AVX2-LABEL: psubus_16i16_max:
   1179 ; AVX2:       # %bb.0: # %vector.ph
   1180 ; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
   1181 ; AVX2-NEXT:    retq
   1182 ;
   1183 ; AVX512-LABEL: psubus_16i16_max:
   1184 ; AVX512:       # %bb.0: # %vector.ph
   1185 ; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
   1186 ; AVX512-NEXT:    retq
   1187 vector.ph:
   1188   %cmp = icmp ult <16 x i16> %x, %y
   1189   %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
   1190   %res = sub <16 x i16> %max, %y
   1191   ret <16 x i16> %res
   1192 }
   1193 
   1194 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
        ; 512-bit variant of the umax(%x, %y) - %y pattern on i16 lanes.
        ; Expected lowering per the assertions: four psubusw on SSE, four 128-bit
        ; vpsubusw (recombined with vinsertf128) on AVX1, two ymm vpsubusw on AVX2,
        ; and a single zmm vpsubusw on AVX512.
   1195 ; SSE-LABEL: psubus_32i16_max:
   1196 ; SSE:       # %bb.0: # %vector.ph
   1197 ; SSE-NEXT:    psubusw %xmm4, %xmm0
   1198 ; SSE-NEXT:    psubusw %xmm5, %xmm1
   1199 ; SSE-NEXT:    psubusw %xmm6, %xmm2
   1200 ; SSE-NEXT:    psubusw %xmm7, %xmm3
   1201 ; SSE-NEXT:    retq
   1202 ;
   1203 ; AVX1-LABEL: psubus_32i16_max:
   1204 ; AVX1:       # %bb.0: # %vector.ph
   1205 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   1206 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
   1207 ; AVX1-NEXT:    vpsubusw %xmm4, %xmm5, %xmm4
   1208 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
   1209 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1210 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
   1211 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1212 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm4, %xmm2
   1213 ; AVX1-NEXT:    vpsubusw %xmm3, %xmm1, %xmm1
   1214 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1215 ; AVX1-NEXT:    retq
   1216 ;
   1217 ; AVX2-LABEL: psubus_32i16_max:
   1218 ; AVX2:       # %bb.0: # %vector.ph
   1219 ; AVX2-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
   1220 ; AVX2-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1
   1221 ; AVX2-NEXT:    retq
   1222 ;
   1223 ; AVX512-LABEL: psubus_32i16_max:
   1224 ; AVX512:       # %bb.0: # %vector.ph
   1225 ; AVX512-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
   1226 ; AVX512-NEXT:    retq
   1227 vector.ph:
   1228   %cmp = icmp ult <32 x i16> %x, %y
   1229   %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
   1230   %res = sub <32 x i16> %max, %y
   1231   ret <32 x i16> %res
   1232 }
   1233 
   1234 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
        ; 512-bit variant of the umax(%x, %y) - %y pattern on i8 lanes.
        ; Expected lowering per the assertions: four psubusb on SSE, four 128-bit
        ; vpsubusb (recombined with vinsertf128) on AVX1, two ymm vpsubusb on AVX2,
        ; and a single zmm vpsubusb on AVX512.
   1235 ; SSE-LABEL: psubus_64i8_max:
   1236 ; SSE:       # %bb.0: # %vector.ph
   1237 ; SSE-NEXT:    psubusb %xmm4, %xmm0
   1238 ; SSE-NEXT:    psubusb %xmm5, %xmm1
   1239 ; SSE-NEXT:    psubusb %xmm6, %xmm2
   1240 ; SSE-NEXT:    psubusb %xmm7, %xmm3
   1241 ; SSE-NEXT:    retq
   1242 ;
   1243 ; AVX1-LABEL: psubus_64i8_max:
   1244 ; AVX1:       # %bb.0: # %vector.ph
   1245 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   1246 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
   1247 ; AVX1-NEXT:    vpsubusb %xmm4, %xmm5, %xmm4
   1248 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
   1249 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1250 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
   1251 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1252 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm4, %xmm2
   1253 ; AVX1-NEXT:    vpsubusb %xmm3, %xmm1, %xmm1
   1254 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1255 ; AVX1-NEXT:    retq
   1256 ;
   1257 ; AVX2-LABEL: psubus_64i8_max:
   1258 ; AVX2:       # %bb.0: # %vector.ph
   1259 ; AVX2-NEXT:    vpsubusb %ymm2, %ymm0, %ymm0
   1260 ; AVX2-NEXT:    vpsubusb %ymm3, %ymm1, %ymm1
   1261 ; AVX2-NEXT:    retq
   1262 ;
   1263 ; AVX512-LABEL: psubus_64i8_max:
   1264 ; AVX512:       # %bb.0: # %vector.ph
   1265 ; AVX512-NEXT:    vpsubusb %zmm1, %zmm0, %zmm0
   1266 ; AVX512-NEXT:    retq
   1267 vector.ph:
   1268   %cmp = icmp ult <64 x i8> %x, %y
   1269   %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
   1270   %res = sub <64 x i8> %max, %y
   1271   ret <64 x i8> %res
   1272 }
   1273 
   1274 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
        ; Unsigned saturating subtract of <32 x i8>, written as umax(%x, %y) - %y.
        ; Expected lowering is (v)psubusb only: two 128-bit ops for the SSE runs,
        ; 128-bit halves joined by extract/insert for AVX1, and one 256-bit op
        ; for both the AVX2 and AVX512 runs.
   1275 ; SSE-LABEL: psubus_32i8_max:
   1276 ; SSE:       # %bb.0: # %vector.ph
   1277 ; SSE-NEXT:    psubusb %xmm2, %xmm0
   1278 ; SSE-NEXT:    psubusb %xmm3, %xmm1
   1279 ; SSE-NEXT:    retq
   1280 ;
   1281 ; AVX1-LABEL: psubus_32i8_max:
   1282 ; AVX1:       # %bb.0: # %vector.ph
   1283 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1284 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1285 ; AVX1-NEXT:    vpsubusb %xmm2, %xmm3, %xmm2
   1286 ; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
   1287 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1288 ; AVX1-NEXT:    retq
   1289 ;
   1290 ; AVX2-LABEL: psubus_32i8_max:
   1291 ; AVX2:       # %bb.0: # %vector.ph
   1292 ; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
   1293 ; AVX2-NEXT:    retq
   1294 ;
   1295 ; AVX512-LABEL: psubus_32i8_max:
   1296 ; AVX512:       # %bb.0: # %vector.ph
   1297 ; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
   1298 ; AVX512-NEXT:    retq
   1299 vector.ph:
   1300   %cmp = icmp ult <32 x i8> %x, %y
          ; %max = umax(%x, %y): takes %y where %x <u %y, otherwise %x.
   1301   %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
          ; %max >= %y (unsigned), so this never wraps: 0 where %x <u %y, else %x - %y.
   1302   %res = sub <32 x i8> %max, %y
   1303   ret <32 x i8> %res
   1304 }
   1305 
   1306 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
        ; Mixed-width form of the umax(%x, %y) - %y idiom: %x is <8 x i16>
        ; zero-extended to i32, %y is <8 x i32>, and the difference is truncated
        ; back to i16.
        ;
        ; The zero-extended %x is at most 65535, so clamping %y to 65535 first
        ; gives the same truncated result (a lane with %y > 65535 yields 0 either
        ; way). Every run except plain SSE2 is expected to use that: clamp %y,
        ; narrow it to 16 bits, then a single (v)psubusw. The clamp is a
        ; compare+select on SSSE3, pminud on SSE4.1, AVX1 and AVX2, and vpmovusdw
        ; on AVX512.
        ;
        ; The plain SSE2 run is expected to keep the widened form: 32-bit unsigned
        ; max, psubd, then truncation via pslld/psrad/packssdw.
        ;
        ; Constants in the expected output: 2147483648 (0x80000000) is XORed/ORed
        ; into both compare operands so the signed pcmpgtd orders them as unsigned
        ; values; 2147549183 is 65535 carrying that same bias.
   1307 ; SSE2-LABEL: psubus_8i32_max:
   1308 ; SSE2:       # %bb.0: # %vector.ph
   1309 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1310 ; SSE2-NEXT:    pxor %xmm4, %xmm4
   1311 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
   1312 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   1313 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
   1314 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1315 ; SSE2-NEXT:    pxor %xmm5, %xmm6
   1316 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   1317 ; SSE2-NEXT:    por %xmm5, %xmm4
   1318 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
   1319 ; SSE2-NEXT:    pand %xmm4, %xmm3
   1320 ; SSE2-NEXT:    pandn %xmm2, %xmm4
   1321 ; SSE2-NEXT:    por %xmm3, %xmm4
   1322 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1323 ; SSE2-NEXT:    pxor %xmm5, %xmm3
   1324 ; SSE2-NEXT:    por %xmm0, %xmm5
   1325 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
   1326 ; SSE2-NEXT:    pand %xmm5, %xmm0
   1327 ; SSE2-NEXT:    pandn %xmm1, %xmm5
   1328 ; SSE2-NEXT:    por %xmm5, %xmm0
   1329 ; SSE2-NEXT:    psubd %xmm1, %xmm0
   1330 ; SSE2-NEXT:    psubd %xmm2, %xmm4
   1331 ; SSE2-NEXT:    pslld $16, %xmm4
   1332 ; SSE2-NEXT:    psrad $16, %xmm4
   1333 ; SSE2-NEXT:    pslld $16, %xmm0
   1334 ; SSE2-NEXT:    psrad $16, %xmm0
   1335 ; SSE2-NEXT:    packssdw %xmm4, %xmm0
   1336 ; SSE2-NEXT:    retq
   1337 ;
   1338 ; SSSE3-LABEL: psubus_8i32_max:
   1339 ; SSSE3:       # %bb.0: # %vector.ph
   1340 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1341 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   1342 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   1343 ; SSSE3-NEXT:    pxor %xmm4, %xmm5
   1344 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
   1345 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   1346 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
   1347 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
   1348 ; SSSE3-NEXT:    pand %xmm7, %xmm2
   1349 ; SSSE3-NEXT:    pandn %xmm5, %xmm7
   1350 ; SSSE3-NEXT:    por %xmm2, %xmm7
   1351 ; SSSE3-NEXT:    pshufb %xmm3, %xmm7
   1352 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
   1353 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
   1354 ; SSSE3-NEXT:    pand %xmm6, %xmm1
   1355 ; SSSE3-NEXT:    pandn %xmm5, %xmm6
   1356 ; SSSE3-NEXT:    por %xmm1, %xmm6
   1357 ; SSSE3-NEXT:    pshufb %xmm3, %xmm6
   1358 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1359 ; SSSE3-NEXT:    psubusw %xmm6, %xmm0
   1360 ; SSSE3-NEXT:    retq
   1361 ;
   1362 ; SSE41-LABEL: psubus_8i32_max:
   1363 ; SSE41:       # %bb.0: # %vector.ph
   1364 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   1365 ; SSE41-NEXT:    pminud %xmm3, %xmm2
   1366 ; SSE41-NEXT:    pminud %xmm3, %xmm1
   1367 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
   1368 ; SSE41-NEXT:    psubusw %xmm1, %xmm0
   1369 ; SSE41-NEXT:    retq
   1370 ;
   1371 ; AVX1-LABEL: psubus_8i32_max:
   1372 ; AVX1:       # %bb.0: # %vector.ph
   1373 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1374 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   1375 ; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
   1376 ; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
   1377 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1378 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1379 ; AVX1-NEXT:    vzeroupper
   1380 ; AVX1-NEXT:    retq
   1381 ;
   1382 ; AVX2-LABEL: psubus_8i32_max:
   1383 ; AVX2:       # %bb.0: # %vector.ph
   1384 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
   1385 ; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
   1386 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1387 ; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1388 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1389 ; AVX2-NEXT:    vzeroupper
   1390 ; AVX2-NEXT:    retq
   1391 ;
   1392 ; AVX512-LABEL: psubus_8i32_max:
   1393 ; AVX512:       # %bb.0: # %vector.ph
   1394 ; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
   1395 ; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1396 ; AVX512-NEXT:    vzeroupper
   1397 ; AVX512-NEXT:    retq
   1398 vector.ph:
          ; Widen %x so it can be compared with the 32-bit %y; each lane is <= 65535.
   1399   %lhs = zext <8 x i16> %x to <8 x i32>
   1400   %cond = icmp ult <8 x i32> %lhs, %y
          ; %max = umax(%lhs, %y).
   1401   %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
          ; Never wraps: 0 where %lhs <u %y, else %lhs - %y, which is <= %lhs <= 65535.
   1402   %sub = sub <8 x i32> %max, %y
          ; The difference fits in 16 bits, so this truncation drops only zero bits.
   1403   %res = trunc <8 x i32> %sub to <8 x i16>
   1404   ret <8 x i16> %res
   1405 }
   1406 
   1407 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
        ; Same idiom as psubus_8i32_max but with a 64-bit %y: %x is <8 x i16>
        ; zero-extended to i64, combined as umax(%x, %y) - %y, and truncated back
        ; to i16.
        ;
        ; The zero-extended %x is at most 65535, so clamping %y to 65535 first
        ; gives the same truncated result. Every run except plain SSE2 is
        ; expected to clamp %y, narrow it to 16 bits and finish with (v)psubusw;
        ; on AVX512 the clamp+narrow is a single vpmovusqw.
        ;
        ; The plain SSE2 run is expected to keep the widened form: a 64-bit
        ; unsigned max built from pcmpgtd/pcmpeqd/pshufd, psubq, then shuffles
        ; that gather the low 16 bits of each lane.
        ;
        ; Constants in the expected output:
        ;   2147483648 x4        - 0x80000000 per dword, the bias that lets the
        ;                          signed 32-bit compares act as unsigned ones.
        ;   9223372039002324991  - 0x800000008000FFFF, i.e. 65535 with that bias
        ;                          applied to both dwords.
        ;   9223372036854775808  - 2^63, the 64-bit bias used with vpcmpgtq.
        ;   9223372036854841343  - 2^63 + 65535, the biased clamp threshold.
        ;
        ; NOTE(review): the SSE4.1 and AVX1 sequences end by zero-extending the
        ; psubusw result to dwords and packing it straight back with packusdw;
        ; that round trip looks redundant - confirm whether it is a known missed
        ; optimization before relying on these lines.
   1408 ; SSE2-LABEL: psubus_8i64_max:
   1409 ; SSE2:       # %bb.0: # %vector.ph
   1410 ; SSE2-NEXT:    pxor %xmm5, %xmm5
   1411 ; SSE2-NEXT:    movdqa %xmm0, %xmm10
   1412 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
   1413 ; SSE2-NEXT:    movdqa %xmm10, %xmm8
   1414 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
   1415 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
   1416 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
   1417 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
   1418 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
   1419 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
   1420 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
   1421 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1422 ; SSE2-NEXT:    pxor %xmm11, %xmm6
   1423 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1424 ; SSE2-NEXT:    por %xmm11, %xmm7
   1425 ; SSE2-NEXT:    movdqa %xmm7, %xmm5
   1426 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
   1427 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
   1428 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
   1429 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
   1430 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1431 ; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
   1432 ; SSE2-NEXT:    por %xmm7, %xmm13
   1433 ; SSE2-NEXT:    pand %xmm13, %xmm0
   1434 ; SSE2-NEXT:    pandn %xmm2, %xmm13
   1435 ; SSE2-NEXT:    por %xmm0, %xmm13
   1436 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1437 ; SSE2-NEXT:    pxor %xmm11, %xmm0
   1438 ; SSE2-NEXT:    movdqa %xmm9, %xmm5
   1439 ; SSE2-NEXT:    por %xmm11, %xmm5
   1440 ; SSE2-NEXT:    movdqa %xmm5, %xmm7
   1441 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
   1442 ; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2]
   1443 ; SSE2-NEXT:    pcmpeqd %xmm0, %xmm5
   1444 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
   1445 ; SSE2-NEXT:    pand %xmm12, %xmm5
   1446 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
   1447 ; SSE2-NEXT:    por %xmm5, %xmm0
   1448 ; SSE2-NEXT:    pand %xmm0, %xmm9
   1449 ; SSE2-NEXT:    pandn %xmm1, %xmm0
   1450 ; SSE2-NEXT:    por %xmm9, %xmm0
   1451 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
   1452 ; SSE2-NEXT:    pxor %xmm11, %xmm5
   1453 ; SSE2-NEXT:    movdqa %xmm10, %xmm7
   1454 ; SSE2-NEXT:    por %xmm11, %xmm7
   1455 ; SSE2-NEXT:    movdqa %xmm7, %xmm6
   1456 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
   1457 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
   1458 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
   1459 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
   1460 ; SSE2-NEXT:    pand %xmm9, %xmm5
   1461 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
   1462 ; SSE2-NEXT:    por %xmm5, %xmm7
   1463 ; SSE2-NEXT:    pand %xmm7, %xmm10
   1464 ; SSE2-NEXT:    pandn %xmm4, %xmm7
   1465 ; SSE2-NEXT:    por %xmm10, %xmm7
   1466 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   1467 ; SSE2-NEXT:    pxor %xmm11, %xmm5
   1468 ; SSE2-NEXT:    por %xmm8, %xmm11
   1469 ; SSE2-NEXT:    movdqa %xmm11, %xmm6
   1470 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
   1471 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
   1472 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm11
   1473 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
   1474 ; SSE2-NEXT:    pand %xmm9, %xmm5
   1475 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1476 ; SSE2-NEXT:    por %xmm5, %xmm6
   1477 ; SSE2-NEXT:    pand %xmm6, %xmm8
   1478 ; SSE2-NEXT:    pandn %xmm3, %xmm6
   1479 ; SSE2-NEXT:    por %xmm8, %xmm6
   1480 ; SSE2-NEXT:    psubq %xmm3, %xmm6
   1481 ; SSE2-NEXT:    psubq %xmm4, %xmm7
   1482 ; SSE2-NEXT:    psubq %xmm1, %xmm0
   1483 ; SSE2-NEXT:    psubq %xmm2, %xmm13
   1484 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
   1485 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1486 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1487 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
   1488 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
   1489 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
   1490 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   1491 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
   1492 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   1493 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1494 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
   1495 ; SSE2-NEXT:    retq
   1496 ;
   1497 ; SSSE3-LABEL: psubus_8i64_max:
   1498 ; SSSE3:       # %bb.0: # %vector.ph
   1499 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
   1500 ; SSSE3-NEXT:    movdqa %xmm2, %xmm7
   1501 ; SSSE3-NEXT:    pxor %xmm5, %xmm7
   1502 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
   1503 ; SSSE3-NEXT:    movdqa %xmm8, %xmm6
   1504 ; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm6
   1505 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
   1506 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm7
   1507 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
   1508 ; SSSE3-NEXT:    pand %xmm9, %xmm7
   1509 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1510 ; SSSE3-NEXT:    por %xmm7, %xmm6
   1511 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535]
   1512 ; SSSE3-NEXT:    pand %xmm6, %xmm2
   1513 ; SSSE3-NEXT:    pandn %xmm9, %xmm6
   1514 ; SSSE3-NEXT:    por %xmm2, %xmm6
   1515 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
   1516 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7]
   1517 ; SSSE3-NEXT:    movdqa %xmm1, %xmm6
   1518 ; SSSE3-NEXT:    pxor %xmm5, %xmm6
   1519 ; SSSE3-NEXT:    movdqa %xmm8, %xmm7
   1520 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm7
   1521 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
   1522 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm6
   1523 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1524 ; SSSE3-NEXT:    pand %xmm2, %xmm6
   1525 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
   1526 ; SSSE3-NEXT:    por %xmm6, %xmm2
   1527 ; SSSE3-NEXT:    pand %xmm2, %xmm1
   1528 ; SSSE3-NEXT:    pandn %xmm9, %xmm2
   1529 ; SSSE3-NEXT:    por %xmm1, %xmm2
   1530 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
   1531 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1532 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
   1533 ; SSSE3-NEXT:    movdqa %xmm4, %xmm2
   1534 ; SSSE3-NEXT:    pxor %xmm5, %xmm2
   1535 ; SSSE3-NEXT:    movdqa %xmm8, %xmm6
   1536 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
   1537 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
   1538 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
   1539 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
   1540 ; SSSE3-NEXT:    pand %xmm7, %xmm2
   1541 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
   1542 ; SSSE3-NEXT:    por %xmm2, %xmm6
   1543 ; SSSE3-NEXT:    pand %xmm6, %xmm4
   1544 ; SSSE3-NEXT:    pandn %xmm9, %xmm6
   1545 ; SSSE3-NEXT:    por %xmm4, %xmm6
   1546 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
   1547 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
   1548 ; SSSE3-NEXT:    pxor %xmm3, %xmm5
   1549 ; SSSE3-NEXT:    movdqa %xmm8, %xmm4
   1550 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
   1551 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
   1552 ; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm5
   1553 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
   1554 ; SSSE3-NEXT:    pand %xmm6, %xmm5
   1555 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   1556 ; SSSE3-NEXT:    por %xmm5, %xmm4
   1557 ; SSSE3-NEXT:    pand %xmm4, %xmm3
   1558 ; SSSE3-NEXT:    pandn %xmm9, %xmm4
   1559 ; SSSE3-NEXT:    por %xmm3, %xmm4
   1560 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
   1561 ; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
   1562 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   1563 ; SSSE3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
   1564 ; SSSE3-NEXT:    psubusw %xmm3, %xmm0
   1565 ; SSSE3-NEXT:    retq
   1566 ;
   1567 ; SSE41-LABEL: psubus_8i64_max:
   1568 ; SSE41:       # %bb.0: # %vector.ph
   1569 ; SSE41-NEXT:    movdqa %xmm0, %xmm10
   1570 ; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
   1571 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
   1572 ; SSE41-NEXT:    pxor %xmm6, %xmm0
   1573 ; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
   1574 ; SSE41-NEXT:    movdqa %xmm8, %xmm7
   1575 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
   1576 ; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
   1577 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
   1578 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
   1579 ; SSE41-NEXT:    pand %xmm9, %xmm5
   1580 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
   1581 ; SSE41-NEXT:    por %xmm5, %xmm0
   1582 ; SSE41-NEXT:    movapd {{.*#+}} xmm7 = [65535,65535]
   1583 ; SSE41-NEXT:    movapd %xmm7, %xmm11
   1584 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm11
   1585 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
   1586 ; SSE41-NEXT:    pxor %xmm6, %xmm0
   1587 ; SSE41-NEXT:    movdqa %xmm8, %xmm4
   1588 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
   1589 ; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
   1590 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
   1591 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
   1592 ; SSE41-NEXT:    pand %xmm9, %xmm5
   1593 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
   1594 ; SSE41-NEXT:    por %xmm5, %xmm0
   1595 ; SSE41-NEXT:    movapd %xmm7, %xmm4
   1596 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
   1597 ; SSE41-NEXT:    packusdw %xmm11, %xmm4
   1598 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
   1599 ; SSE41-NEXT:    pxor %xmm6, %xmm0
   1600 ; SSE41-NEXT:    movdqa %xmm8, %xmm3
   1601 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
   1602 ; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
   1603 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
   1604 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
   1605 ; SSE41-NEXT:    pand %xmm9, %xmm5
   1606 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
   1607 ; SSE41-NEXT:    por %xmm5, %xmm0
   1608 ; SSE41-NEXT:    movapd %xmm7, %xmm3
   1609 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
   1610 ; SSE41-NEXT:    pxor %xmm1, %xmm6
   1611 ; SSE41-NEXT:    movdqa %xmm8, %xmm0
   1612 ; SSE41-NEXT:    pcmpgtd %xmm6, %xmm0
   1613 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
   1614 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm6
   1615 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
   1616 ; SSE41-NEXT:    pand %xmm2, %xmm5
   1617 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
   1618 ; SSE41-NEXT:    por %xmm5, %xmm0
   1619 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm7
   1620 ; SSE41-NEXT:    packusdw %xmm3, %xmm7
   1621 ; SSE41-NEXT:    packusdw %xmm4, %xmm7
   1622 ; SSE41-NEXT:    psubusw %xmm7, %xmm10
   1623 ; SSE41-NEXT:    pxor %xmm1, %xmm1
   1624 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
   1625 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
   1626 ; SSE41-NEXT:    packusdw %xmm10, %xmm0
   1627 ; SSE41-NEXT:    retq
   1628 ;
   1629 ; AVX1-LABEL: psubus_8i64_max:
   1630 ; AVX1:       # %bb.0: # %vector.ph
   1631 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
   1632 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
   1633 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
   1634 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
   1635 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
   1636 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm6
   1637 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm6
   1638 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
   1639 ; AVX1-NEXT:    vmovapd {{.*#+}} ymm6 = [65535,65535,65535,65535]
   1640 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
   1641 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
   1642 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   1643 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1644 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
   1645 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
   1646 ; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm4
   1647 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
   1648 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
   1649 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
   1650 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1651 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   1652 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1653 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1654 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1655 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1656 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1657 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   1658 ; AVX1-NEXT:    vzeroupper
   1659 ; AVX1-NEXT:    retq
   1660 ;
   1661 ; AVX2-LABEL: psubus_8i64_max:
   1662 ; AVX2:       # %bb.0: # %vector.ph
   1663 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
   1664 ; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm4
   1665 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
   1666 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
   1667 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
   1668 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
   1669 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm3
   1670 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
   1671 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
   1672 ; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
   1673 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
   1674 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1675 ; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1676 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1677 ; AVX2-NEXT:    vzeroupper
   1678 ; AVX2-NEXT:    retq
   1679 ;
   1680 ; AVX512-LABEL: psubus_8i64_max:
   1681 ; AVX512:       # %bb.0: # %vector.ph
   1682 ; AVX512-NEXT:    vpmovusqw %zmm1, %xmm1
   1683 ; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1684 ; AVX512-NEXT:    vzeroupper
   1685 ; AVX512-NEXT:    retq
   1686 vector.ph:
          ; Widen %x so it can be compared with the 64-bit %y; each lane is <= 65535.
   1687   %lhs = zext <8 x i16> %x to <8 x i64>
   1688   %cond = icmp ult <8 x i64> %lhs, %y
          ; %max = umax(%lhs, %y).
   1689   %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
          ; Never wraps: 0 where %lhs <u %y, else %lhs - %y, which is <= %lhs <= 65535.
   1690   %sub = sub <8 x i64> %max, %y
          ; The difference fits in 16 bits, so this truncation drops only zero bits.
   1691   %res = trunc <8 x i64> %sub to <8 x i16>
   1692   ret <8 x i16> %res
   1693 }
   1694 
   1695 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
        ; 16-lane version of psubus_8i32_max: %x is <16 x i16> zero-extended to
        ; i32, combined as umax(%x, %y) - %y, and truncated back to i16.
        ;
        ; Only the AVX1, AVX2 and AVX512 runs are expected to reach the narrow
        ; form here (clamp %y to 65535 with vpminud or vpmovusdw, then vpsubusw).
        ; The SSE2, SSSE3 and SSE4.1 runs are all expected to keep the widened
        ; form - 32-bit unsigned max, psubd, then truncation - even though the
        ; 8-lane test above reaches psubusw on SSSE3 and SSE4.1.
        ;
        ; 2147483648 (0x80000000) in the expected output is the bias that lets
        ; the signed pcmpgtd compare act as an unsigned one.
        ;
        ; NOTE(review): the AVX2 sequence zero-extends each half of %x to dwords
        ; and immediately packs it back to words before vpsubusw; that looks
        ; redundant - confirm whether it is a known missed optimization.
   1696 ; SSE2-LABEL: psubus_16i32_max:
   1697 ; SSE2:       # %bb.0: # %vector.ph
   1698 ; SSE2-NEXT:    movdqa %xmm1, %xmm8
   1699 ; SSE2-NEXT:    pxor %xmm7, %xmm7
   1700 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
   1701 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
   1702 ; SSE2-NEXT:    movdqa %xmm0, %xmm10
   1703 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
   1704 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
   1705 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
   1706 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1707 ; SSE2-NEXT:    pxor %xmm7, %xmm6
   1708 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
   1709 ; SSE2-NEXT:    por %xmm7, %xmm9
   1710 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm9
   1711 ; SSE2-NEXT:    pand %xmm9, %xmm0
   1712 ; SSE2-NEXT:    pandn %xmm3, %xmm9
   1713 ; SSE2-NEXT:    por %xmm0, %xmm9
   1714 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1715 ; SSE2-NEXT:    pxor %xmm7, %xmm6
   1716 ; SSE2-NEXT:    movdqa %xmm10, %xmm0
   1717 ; SSE2-NEXT:    por %xmm7, %xmm0
   1718 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
   1719 ; SSE2-NEXT:    pand %xmm0, %xmm10
   1720 ; SSE2-NEXT:    pandn %xmm2, %xmm0
   1721 ; SSE2-NEXT:    por %xmm10, %xmm0
   1722 ; SSE2-NEXT:    movdqa %xmm5, %xmm10
   1723 ; SSE2-NEXT:    pxor %xmm7, %xmm10
   1724 ; SSE2-NEXT:    movdqa %xmm8, %xmm6
   1725 ; SSE2-NEXT:    por %xmm7, %xmm6
   1726 ; SSE2-NEXT:    pcmpgtd %xmm10, %xmm6
   1727 ; SSE2-NEXT:    pand %xmm6, %xmm8
   1728 ; SSE2-NEXT:    pandn %xmm5, %xmm6
   1729 ; SSE2-NEXT:    por %xmm8, %xmm6
   1730 ; SSE2-NEXT:    movdqa %xmm4, %xmm8
   1731 ; SSE2-NEXT:    pxor %xmm7, %xmm8
   1732 ; SSE2-NEXT:    por %xmm1, %xmm7
   1733 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
   1734 ; SSE2-NEXT:    pand %xmm7, %xmm1
   1735 ; SSE2-NEXT:    pandn %xmm4, %xmm7
   1736 ; SSE2-NEXT:    por %xmm7, %xmm1
   1737 ; SSE2-NEXT:    psubd %xmm4, %xmm1
   1738 ; SSE2-NEXT:    psubd %xmm5, %xmm6
   1739 ; SSE2-NEXT:    psubd %xmm2, %xmm0
   1740 ; SSE2-NEXT:    psubd %xmm3, %xmm9
   1741 ; SSE2-NEXT:    pslld $16, %xmm9
   1742 ; SSE2-NEXT:    psrad $16, %xmm9
   1743 ; SSE2-NEXT:    pslld $16, %xmm0
   1744 ; SSE2-NEXT:    psrad $16, %xmm0
   1745 ; SSE2-NEXT:    packssdw %xmm9, %xmm0
   1746 ; SSE2-NEXT:    pslld $16, %xmm6
   1747 ; SSE2-NEXT:    psrad $16, %xmm6
   1748 ; SSE2-NEXT:    pslld $16, %xmm1
   1749 ; SSE2-NEXT:    psrad $16, %xmm1
   1750 ; SSE2-NEXT:    packssdw %xmm6, %xmm1
   1751 ; SSE2-NEXT:    retq
   1752 ;
   1753 ; SSSE3-LABEL: psubus_16i32_max:
   1754 ; SSSE3:       # %bb.0: # %vector.ph
   1755 ; SSSE3-NEXT:    movdqa %xmm1, %xmm8
   1756 ; SSSE3-NEXT:    pxor %xmm7, %xmm7
   1757 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
   1758 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
   1759 ; SSSE3-NEXT:    movdqa %xmm0, %xmm10
   1760 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
   1761 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
   1762 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
   1763 ; SSSE3-NEXT:    movdqa %xmm3, %xmm6
   1764 ; SSSE3-NEXT:    pxor %xmm7, %xmm6
   1765 ; SSSE3-NEXT:    movdqa %xmm0, %xmm9
   1766 ; SSSE3-NEXT:    por %xmm7, %xmm9
   1767 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm9
   1768 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   1769 ; SSSE3-NEXT:    pandn %xmm3, %xmm9
   1770 ; SSSE3-NEXT:    por %xmm0, %xmm9
   1771 ; SSSE3-NEXT:    movdqa %xmm2, %xmm6
   1772 ; SSSE3-NEXT:    pxor %xmm7, %xmm6
   1773 ; SSSE3-NEXT:    movdqa %xmm10, %xmm0
   1774 ; SSSE3-NEXT:    por %xmm7, %xmm0
   1775 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm0
   1776 ; SSSE3-NEXT:    pand %xmm0, %xmm10
   1777 ; SSSE3-NEXT:    pandn %xmm2, %xmm0
   1778 ; SSSE3-NEXT:    por %xmm10, %xmm0
   1779 ; SSSE3-NEXT:    movdqa %xmm5, %xmm10
   1780 ; SSSE3-NEXT:    pxor %xmm7, %xmm10
   1781 ; SSSE3-NEXT:    movdqa %xmm8, %xmm6
   1782 ; SSSE3-NEXT:    por %xmm7, %xmm6
   1783 ; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm6
   1784 ; SSSE3-NEXT:    pand %xmm6, %xmm8
   1785 ; SSSE3-NEXT:    pandn %xmm5, %xmm6
   1786 ; SSSE3-NEXT:    por %xmm8, %xmm6
   1787 ; SSSE3-NEXT:    movdqa %xmm4, %xmm8
   1788 ; SSSE3-NEXT:    pxor %xmm7, %xmm8
   1789 ; SSSE3-NEXT:    por %xmm1, %xmm7
   1790 ; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
   1791 ; SSSE3-NEXT:    pand %xmm7, %xmm1
   1792 ; SSSE3-NEXT:    pandn %xmm4, %xmm7
   1793 ; SSSE3-NEXT:    por %xmm7, %xmm1
   1794 ; SSSE3-NEXT:    psubd %xmm4, %xmm1
   1795 ; SSSE3-NEXT:    psubd %xmm5, %xmm6
   1796 ; SSSE3-NEXT:    psubd %xmm2, %xmm0
   1797 ; SSSE3-NEXT:    psubd %xmm3, %xmm9
   1798 ; SSSE3-NEXT:    pslld $16, %xmm9
   1799 ; SSSE3-NEXT:    psrad $16, %xmm9
   1800 ; SSSE3-NEXT:    pslld $16, %xmm0
   1801 ; SSSE3-NEXT:    psrad $16, %xmm0
   1802 ; SSSE3-NEXT:    packssdw %xmm9, %xmm0
   1803 ; SSSE3-NEXT:    pslld $16, %xmm6
   1804 ; SSSE3-NEXT:    psrad $16, %xmm6
   1805 ; SSSE3-NEXT:    pslld $16, %xmm1
   1806 ; SSSE3-NEXT:    psrad $16, %xmm1
   1807 ; SSSE3-NEXT:    packssdw %xmm6, %xmm1
   1808 ; SSSE3-NEXT:    retq
   1809 ;
   1810 ; SSE41-LABEL: psubus_16i32_max:
   1811 ; SSE41:       # %bb.0: # %vector.ph
   1812 ; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
   1813 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
   1814 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1815 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
   1816 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
   1817 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1818 ; SSE41-NEXT:    pmaxud %xmm2, %xmm0
   1819 ; SSE41-NEXT:    pmaxud %xmm3, %xmm7
   1820 ; SSE41-NEXT:    pmaxud %xmm4, %xmm1
   1821 ; SSE41-NEXT:    pmaxud %xmm5, %xmm6
   1822 ; SSE41-NEXT:    psubd %xmm5, %xmm6
   1823 ; SSE41-NEXT:    psubd %xmm4, %xmm1
   1824 ; SSE41-NEXT:    psubd %xmm3, %xmm7
   1825 ; SSE41-NEXT:    psubd %xmm2, %xmm0
   1826 ; SSE41-NEXT:    pxor %xmm2, %xmm2
   1827 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
   1828 ; SSE41-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
   1829 ; SSE41-NEXT:    packusdw %xmm7, %xmm0
   1830 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
   1831 ; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7]
   1832 ; SSE41-NEXT:    packusdw %xmm6, %xmm1
   1833 ; SSE41-NEXT:    retq
   1834 ;
   1835 ; AVX1-LABEL: psubus_16i32_max:
   1836 ; AVX1:       # %bb.0: # %vector.ph
   1837 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1838 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
   1839 ; AVX1-NEXT:    vpminud %xmm4, %xmm3, %xmm3
   1840 ; AVX1-NEXT:    vpminud %xmm4, %xmm1, %xmm1
   1841 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   1842 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
   1843 ; AVX1-NEXT:    vpminud %xmm4, %xmm3, %xmm3
   1844 ; AVX1-NEXT:    vpminud %xmm4, %xmm2, %xmm2
   1845 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   1846 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1847 ; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
   1848 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1849 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1850 ; AVX1-NEXT:    retq
   1851 ;
   1852 ; AVX2-LABEL: psubus_16i32_max:
   1853 ; AVX2:       # %bb.0: # %vector.ph
   1854 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
   1855 ; AVX2-NEXT:    vpminud %ymm3, %ymm1, %ymm1
   1856 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
   1857 ; AVX2-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
   1858 ; AVX2-NEXT:    vpminud %ymm3, %ymm2, %ymm2
   1859 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   1860 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   1861 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1862 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
   1863 ; AVX2-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   1864 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm3, %xmm1
   1865 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   1866 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1867 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
   1868 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   1869 ; AVX2-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
   1870 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
   1871 ; AVX2-NEXT:    retq
   1872 ;
   1873 ; AVX512-LABEL: psubus_16i32_max:
   1874 ; AVX512:       # %bb.0: # %vector.ph
   1875 ; AVX512-NEXT:    vpmovusdw %zmm1, %ymm1
   1876 ; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
   1877 ; AVX512-NEXT:    retq
   1878 vector.ph:
          ; Widen %x so it can be compared with the 32-bit %y; each lane is <= 65535.
   1879   %lhs = zext <16 x i16> %x to <16 x i32>
   1880   %cond = icmp ult <16 x i32> %lhs, %y
          ; %max = umax(%lhs, %y).
   1881   %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
          ; Never wraps: 0 where %lhs <u %y, else %lhs - %y, which is <= %lhs <= 65535.
   1882   %sub = sub <16 x i32> %max, %y
          ; The difference fits in 16 bits, so this truncation drops only zero bits.
   1883   %res = trunc <16 x i32> %sub to <16 x i16>
   1884   ret <16 x i16> %res
   1885 }
   1886 
   1887 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
        ; Unsigned saturating subtract of %y from zext(%x), written in IR as
        ; trunc(umax(zext(%x), %y) - %y). "Swapped" refers to the compare/select
        ; operand order: this test uses `icmp ult %y, %lhs` selecting (%lhs, %y),
        ; the mirror image of `icmp ult %lhs, %y` selecting (%y, %lhs).
        ; In the expectations below, every subtarget except plain SSE2 narrows %y
        ; to 16 bits (compare or pminud against 65535, or vpmovusdw) and then
        ; emits a single (v)psubusw. The plain SSE2 expectations keep the work in
        ; 32-bit lanes (sign-biased pcmpgtd, pand/pandn/por select, psubd) and
        ; only pack down to words at the end.
        ; The assertions are autogenerated (see the note on the first line of the
        ; file); regenerate them instead of editing them by hand.
   1888 ; SSE2-LABEL: psubus_i16_i32_max_swapped:
   1889 ; SSE2:       # %bb.0: # %vector.ph
   1890 ; SSE2-NEXT:    pxor %xmm3, %xmm3
   1891 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1892 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
   1893 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
   1894 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
   1895 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
   1896 ; SSE2-NEXT:    pxor %xmm5, %xmm3
   1897 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   1898 ; SSE2-NEXT:    por %xmm5, %xmm6
   1899 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm3
   1900 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1901 ; SSE2-NEXT:    pand %xmm3, %xmm6
   1902 ; SSE2-NEXT:    pandn %xmm0, %xmm3
   1903 ; SSE2-NEXT:    por %xmm6, %xmm3
   1904 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1905 ; SSE2-NEXT:    pxor %xmm5, %xmm0
   1906 ; SSE2-NEXT:    por %xmm4, %xmm5
   1907 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
   1908 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1909 ; SSE2-NEXT:    pand %xmm0, %xmm5
   1910 ; SSE2-NEXT:    pandn %xmm4, %xmm0
   1911 ; SSE2-NEXT:    por %xmm5, %xmm0
   1912 ; SSE2-NEXT:    psubd %xmm1, %xmm0
   1913 ; SSE2-NEXT:    psubd %xmm2, %xmm3
   1914 ; SSE2-NEXT:    pslld $16, %xmm3
   1915 ; SSE2-NEXT:    psrad $16, %xmm3
   1916 ; SSE2-NEXT:    pslld $16, %xmm0
   1917 ; SSE2-NEXT:    psrad $16, %xmm0
   1918 ; SSE2-NEXT:    packssdw %xmm3, %xmm0
   1919 ; SSE2-NEXT:    retq
   1920 ;
   1921 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
   1922 ; SSSE3:       # %bb.0: # %vector.ph
   1923 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1924 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   1925 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   1926 ; SSSE3-NEXT:    pxor %xmm4, %xmm5
   1927 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
   1928 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   1929 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
   1930 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
   1931 ; SSSE3-NEXT:    pand %xmm7, %xmm2
   1932 ; SSSE3-NEXT:    pandn %xmm5, %xmm7
   1933 ; SSSE3-NEXT:    por %xmm2, %xmm7
   1934 ; SSSE3-NEXT:    pshufb %xmm3, %xmm7
   1935 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
   1936 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
   1937 ; SSSE3-NEXT:    pand %xmm6, %xmm1
   1938 ; SSSE3-NEXT:    pandn %xmm5, %xmm6
   1939 ; SSSE3-NEXT:    por %xmm1, %xmm6
   1940 ; SSSE3-NEXT:    pshufb %xmm3, %xmm6
   1941 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1942 ; SSSE3-NEXT:    psubusw %xmm6, %xmm0
   1943 ; SSSE3-NEXT:    retq
   1944 ;
   1945 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
   1946 ; SSE41:       # %bb.0: # %vector.ph
   1947 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   1948 ; SSE41-NEXT:    pminud %xmm3, %xmm2
   1949 ; SSE41-NEXT:    pminud %xmm3, %xmm1
   1950 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
   1951 ; SSE41-NEXT:    psubusw %xmm1, %xmm0
   1952 ; SSE41-NEXT:    retq
   1953 ;
   1954 ; AVX1-LABEL: psubus_i16_i32_max_swapped:
   1955 ; AVX1:       # %bb.0: # %vector.ph
   1956 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1957 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   1958 ; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
   1959 ; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
   1960 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1961 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1962 ; AVX1-NEXT:    vzeroupper
   1963 ; AVX1-NEXT:    retq
   1964 ;
   1965 ; AVX2-LABEL: psubus_i16_i32_max_swapped:
   1966 ; AVX2:       # %bb.0: # %vector.ph
   1967 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
   1968 ; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
   1969 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1970 ; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   1971 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1972 ; AVX2-NEXT:    vzeroupper
   1973 ; AVX2-NEXT:    retq
   1974 ;
   1975 ; AVX512-LABEL: psubus_i16_i32_max_swapped:
   1976 ; AVX512:       # %bb.0: # %vector.ph
   1977 ; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
   1978 ; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   1979 ; AVX512-NEXT:    vzeroupper
   1980 ; AVX512-NEXT:    retq
   1981 vector.ph:
          ; Widen %x to 32 bits so it can be compared against %y.
   1982   %lhs = zext <8 x i16> %x to <8 x i32>
          ; %max = umax(%lhs, %y): %lhs where %y <u %lhs, otherwise %y.
   1983   %cond = icmp ult <8 x i32> %y, %lhs
   1984   %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
          ; %max - %y is %lhs - %y where %lhs is the larger value and 0 elsewhere,
          ; so each lane is at most 65535 and the trunc below drops no set bits.
   1985   %sub = sub <8 x i32> %max, %y
   1986   %res = trunc <8 x i32> %sub to <8 x i16>
   1987   ret <8 x i16> %res
   1988 }
   1989 
   1990 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
        ; Unsigned saturating subtract of %y from zext(%x), written in IR with a
        ; minimum instead of a maximum: trunc(zext(%x) - umin(zext(%x), %y)).
        ; In the expectations below, every subtarget except plain SSE2 narrows %y
        ; to 16 bits (compare or pminud against 65535, or vpmovusdw) and then
        ; emits a single (v)psubusw. The plain SSE2 expectations keep the work in
        ; 32-bit lanes (sign-biased pcmpgtd, pand/pandn/por select, psubd) and
        ; only pack down to words at the end.
        ; The assertions are autogenerated (see the note on the first line of the
        ; file); regenerate them instead of editing them by hand.
   1991 ; SSE2-LABEL: psubus_i16_i32_min:
   1992 ; SSE2:       # %bb.0: # %vector.ph
   1993 ; SSE2-NEXT:    pxor %xmm4, %xmm4
   1994 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1995 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
   1996 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
   1997 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   1998 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   1999 ; SSE2-NEXT:    pxor %xmm4, %xmm5
   2000 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2001 ; SSE2-NEXT:    por %xmm4, %xmm6
   2002 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
   2003 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2004 ; SSE2-NEXT:    pand %xmm5, %xmm6
   2005 ; SSE2-NEXT:    pandn %xmm2, %xmm5
   2006 ; SSE2-NEXT:    por %xmm6, %xmm5
   2007 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   2008 ; SSE2-NEXT:    pxor %xmm4, %xmm2
   2009 ; SSE2-NEXT:    por %xmm3, %xmm4
   2010 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
   2011 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2012 ; SSE2-NEXT:    pand %xmm2, %xmm4
   2013 ; SSE2-NEXT:    pandn %xmm1, %xmm2
   2014 ; SSE2-NEXT:    por %xmm4, %xmm2
   2015 ; SSE2-NEXT:    psubd %xmm2, %xmm3
   2016 ; SSE2-NEXT:    psubd %xmm5, %xmm0
   2017 ; SSE2-NEXT:    pslld $16, %xmm0
   2018 ; SSE2-NEXT:    psrad $16, %xmm0
   2019 ; SSE2-NEXT:    pslld $16, %xmm3
   2020 ; SSE2-NEXT:    psrad $16, %xmm3
   2021 ; SSE2-NEXT:    packssdw %xmm0, %xmm3
   2022 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
   2023 ; SSE2-NEXT:    retq
   2024 ;
   2025 ; SSSE3-LABEL: psubus_i16_i32_min:
   2026 ; SSSE3:       # %bb.0: # %vector.ph
   2027 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2028 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
   2029 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   2030 ; SSSE3-NEXT:    pxor %xmm4, %xmm5
   2031 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
   2032 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   2033 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
   2034 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
   2035 ; SSSE3-NEXT:    pand %xmm7, %xmm2
   2036 ; SSSE3-NEXT:    pandn %xmm5, %xmm7
   2037 ; SSSE3-NEXT:    por %xmm2, %xmm7
   2038 ; SSSE3-NEXT:    pshufb %xmm3, %xmm7
   2039 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
   2040 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
   2041 ; SSSE3-NEXT:    pand %xmm6, %xmm1
   2042 ; SSSE3-NEXT:    pandn %xmm5, %xmm6
   2043 ; SSSE3-NEXT:    por %xmm1, %xmm6
   2044 ; SSSE3-NEXT:    pshufb %xmm3, %xmm6
   2045 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   2046 ; SSSE3-NEXT:    psubusw %xmm6, %xmm0
   2047 ; SSSE3-NEXT:    retq
   2048 ;
   2049 ; SSE41-LABEL: psubus_i16_i32_min:
   2050 ; SSE41:       # %bb.0: # %vector.ph
   2051 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   2052 ; SSE41-NEXT:    pminud %xmm3, %xmm2
   2053 ; SSE41-NEXT:    pminud %xmm3, %xmm1
   2054 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
   2055 ; SSE41-NEXT:    psubusw %xmm1, %xmm0
   2056 ; SSE41-NEXT:    retq
   2057 ;
   2058 ; AVX1-LABEL: psubus_i16_i32_min:
   2059 ; AVX1:       # %bb.0: # %vector.ph
   2060 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2061 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
   2062 ; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
   2063 ; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
   2064 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   2065 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   2066 ; AVX1-NEXT:    vzeroupper
   2067 ; AVX1-NEXT:    retq
   2068 ;
   2069 ; AVX2-LABEL: psubus_i16_i32_min:
   2070 ; AVX2:       # %bb.0: # %vector.ph
   2071 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
   2072 ; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
   2073 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2074 ; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   2075 ; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   2076 ; AVX2-NEXT:    vzeroupper
   2077 ; AVX2-NEXT:    retq
   2078 ;
   2079 ; AVX512-LABEL: psubus_i16_i32_min:
   2080 ; AVX512:       # %bb.0: # %vector.ph
   2081 ; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
   2082 ; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
   2083 ; AVX512-NEXT:    vzeroupper
   2084 ; AVX512-NEXT:    retq
   2085 vector.ph:
          ; Widen %x to 32 bits so it can be compared against %y.
   2086   %lhs = zext <8 x i16> %x to <8 x i32>
          ; %min = umin(%lhs, %y): %lhs where %lhs <u %y, otherwise %y.
   2087   %cond = icmp ult <8 x i32> %lhs, %y
   2088   %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
          ; %lhs - %min is 0 where %lhs was the minimum and %lhs - %y elsewhere,
          ; so each lane is at most 65535 and the trunc below drops no set bits.
   2089   %sub = sub <8 x i32> %lhs, %min
   2090   %res = trunc <8 x i32> %sub to <8 x i16>
   2091   ret <8 x i16> %res
   2092 }
   2093