; Home | History | Annotate | Download | only in X86  (code-browser navigation residue; kept as a comment)
      1 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      2 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
      3 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      4 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      5 
      6 define void @test1(i16* nocapture %head) nounwind {
; test1: unsigned saturating subtract of the constant 32768, <8 x i16>.
; IR pattern: (x slt 0) ? (x ^ 0x8000) : 0.  For i16, "x slt 0" is the same
; condition as "x uge 32768", and for such x the xor equals x - 32768, so the
; whole select is usub.sat(x, 32768) and must lower to one PSUBUSW against a
; constant-pool splat (checked for both SSE and AVX below).
      7 ; SSE-LABEL: test1:
      8 ; SSE:       ## BB#0: ## %vector.ph
      9 ; SSE-NEXT:    movdqu (%rdi), %xmm0
     10 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
     11 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
     12 ; SSE-NEXT:    retq
     13 ;
     14 ; AVX-LABEL: test1:
     15 ; AVX:       ## BB#0: ## %vector.ph
     16 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
     17 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
     18 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
     19 ; AVX-NEXT:    retq
     20 vector.ph:
     21   %0 = getelementptr inbounds i16, i16* %head, i64 0
     22   %1 = bitcast i16* %0 to <8 x i16>*
     23   %2 = load <8 x i16>, <8 x i16>* %1, align 2
     24   %3 = icmp slt <8 x i16> %2, zeroinitializer
     25   %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
     26   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
     27   store <8 x i16> %5, <8 x i16>* %1, align 2
     28   ret void
     29 }
     30 
     31 define void @test2(i16* nocapture %head) nounwind {
; test2: unsigned saturating subtract of the constant 32767, <8 x i16>.
; IR pattern: (x ugt 32766) ? (x + (-32767)) : 0, i.e. usub.sat(x, 32767).
; Like test1 this must fold to a single PSUBUSW with a constant-pool operand.
     32 ; SSE-LABEL: test2:
     33 ; SSE:       ## BB#0: ## %vector.ph
     34 ; SSE-NEXT:    movdqu (%rdi), %xmm0
     35 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
     36 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
     37 ; SSE-NEXT:    retq
     38 ;
     39 ; AVX-LABEL: test2:
     40 ; AVX:       ## BB#0: ## %vector.ph
     41 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
     42 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
     43 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
     44 ; AVX-NEXT:    retq
     45 vector.ph:
     46   %0 = getelementptr inbounds i16, i16* %head, i64 0
     47   %1 = bitcast i16* %0 to <8 x i16>*
     48   %2 = load <8 x i16>, <8 x i16>* %1, align 2
     49   %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
     50   %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
     51   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
     52   store <8 x i16> %5, <8 x i16>* %1, align 2
     53   ret void
     54 }
     55 
     56 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; test3: unsigned saturating subtract of a variable splat, <8 x i16>.
; IR pattern: (x ult w) ? 0 : (x - w), i.e. usub.sat(x, splat(w)).
; The splat is built from %esi: pshuflw+pshufd on SSE/AVX1, a single
; vpbroadcastw on AVX2; the subtract itself must still be one PSUBUSW.
     57 ; SSE-LABEL: test3:
     58 ; SSE:       ## BB#0: ## %vector.ph
     59 ; SSE-NEXT:    movd %esi, %xmm0
     60 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
     61 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
     62 ; SSE-NEXT:    movdqu (%rdi), %xmm1
     63 ; SSE-NEXT:    psubusw %xmm0, %xmm1
     64 ; SSE-NEXT:    movdqu %xmm1, (%rdi)
     65 ; SSE-NEXT:    retq
     66 ;
     67 ; AVX1-LABEL: test3:
     68 ; AVX1:       ## BB#0: ## %vector.ph
     69 ; AVX1-NEXT:    vmovd %esi, %xmm0
     70 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
     71 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
     72 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
     73 ; AVX1-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
     74 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
     75 ; AVX1-NEXT:    retq
     76 ;
     77 ; AVX2-LABEL: test3:
     78 ; AVX2:       ## BB#0: ## %vector.ph
     79 ; AVX2-NEXT:    vmovd %esi, %xmm0
     80 ; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
     81 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
     82 ; AVX2-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
     83 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
     84 ; AVX2-NEXT:    retq
vector.ph:
     85 vector.ph:
     86   %0 = insertelement <8 x i16> undef, i16 %w, i32 0
     87   %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
     88   %1 = getelementptr inbounds i16, i16* %head, i64 0
     89   %2 = bitcast i16* %1 to <8 x i16>*
     90   %3 = load <8 x i16>, <8 x i16>* %2, align 2
     91   %4 = icmp ult <8 x i16> %3, %broadcast15
     92   %5 = sub <8 x i16> %3, %broadcast15
     93   %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
     94   store <8 x i16> %6, <8 x i16>* %2, align 2
     95   ret void
     96 }
     97 
     98 define void @test4(i8* nocapture %head) nounwind {
; test4: byte-element twin of test1 — usub.sat(x, 128) on <16 x i8>.
; (x slt 0) ? (x ^ 0x80) : 0 is the sign-bit formulation of an unsigned
; saturating subtract of 128; expected lowering is a single PSUBUSB.
     99 ; SSE-LABEL: test4:
    100 ; SSE:       ## BB#0: ## %vector.ph
    101 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    102 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
    103 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    104 ; SSE-NEXT:    retq
    105 ;
    106 ; AVX-LABEL: test4:
    107 ; AVX:       ## BB#0: ## %vector.ph
    108 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
    109 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
    110 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
    111 ; AVX-NEXT:    retq
    112 vector.ph:
    113   %0 = getelementptr inbounds i8, i8* %head, i64 0
    114   %1 = bitcast i8* %0 to <16 x i8>*
    115   %2 = load <16 x i8>, <16 x i8>* %1, align 1
    116   %3 = icmp slt <16 x i8> %2, zeroinitializer
    117   %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    118   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
    119   store <16 x i8> %5, <16 x i8>* %1, align 1
    120   ret void
    121 }
    122 
    123 define void @test5(i8* nocapture %head) nounwind {
; test5: byte-element twin of test2 — usub.sat(x, 127) on <16 x i8>.
; (x ugt 126) ? (x + (-127)) : 0 must collapse to a single PSUBUSB with a
; constant-pool splat of 127.
    124 ; SSE-LABEL: test5:
    125 ; SSE:       ## BB#0: ## %vector.ph
    126 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    127 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
    128 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    129 ; SSE-NEXT:    retq
    130 ;
    131 ; AVX-LABEL: test5:
    132 ; AVX:       ## BB#0: ## %vector.ph
    133 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
    134 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
    135 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
    136 ; AVX-NEXT:    retq
    137 vector.ph:
    138   %0 = getelementptr inbounds i8, i8* %head, i64 0
    139   %1 = bitcast i8* %0 to <16 x i8>*
    140   %2 = load <16 x i8>, <16 x i8>* %1, align 1
    141   %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    142   %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    143   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
    144   store <16 x i8> %5, <16 x i8>* %1, align 1
    145   ret void
    146 }
    147 
    148 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; test6: byte-element twin of test3 — usub.sat(x, splat(w)) on <16 x i8>.
; Exercises the byte-splat strategies per subtarget: SSE2 uses
; punpcklbw+pshuflw+pshufd, SSSE3 uses pshufb with a zero mask, AVX2 uses
; vpbroadcastb.  In every case the arithmetic must be one PSUBUSB.
    149 ; SSE2-LABEL: test6:
    150 ; SSE2:       ## BB#0: ## %vector.ph
    151 ; SSE2-NEXT:    movd %esi, %xmm0
    152 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    153 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    154 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    155 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
    156 ; SSE2-NEXT:    psubusb %xmm0, %xmm1
    157 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
    158 ; SSE2-NEXT:    retq
    159 ;
    160 ; SSSE3-LABEL: test6:
    161 ; SSSE3:       ## BB#0: ## %vector.ph
    162 ; SSSE3-NEXT:    movd %esi, %xmm0
    163 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    164 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    165 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
    166 ; SSSE3-NEXT:    psubusb %xmm0, %xmm1
    167 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
    168 ; SSSE3-NEXT:    retq
    169 ;
    170 ; AVX1-LABEL: test6:
    171 ; AVX1:       ## BB#0: ## %vector.ph
    172 ; AVX1-NEXT:    vmovd %esi, %xmm0
    173 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    174 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    175 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
    176 ; AVX1-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
    177 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
    178 ; AVX1-NEXT:    retq
    179 ;
    180 ; AVX2-LABEL: test6:
    181 ; AVX2:       ## BB#0: ## %vector.ph
    182 ; AVX2-NEXT:    vmovd %esi, %xmm0
    183 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
    184 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
    185 ; AVX2-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
    186 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
    187 ; AVX2-NEXT:    retq
    188 vector.ph:
    189   %0 = insertelement <16 x i8> undef, i8 %w, i32 0
    190   %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
    191   %1 = getelementptr inbounds i8, i8* %head, i64 0
    192   %2 = bitcast i8* %1 to <16 x i8>*
    193   %3 = load <16 x i8>, <16 x i8>* %2, align 1
    194   %4 = icmp ult <16 x i8> %3, %broadcast15
    195   %5 = sub <16 x i8> %3, %broadcast15
    196   %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
    197   store <16 x i8> %6, <16 x i8>* %2, align 1
    198   ret void
    199 }
    200 
    201 define void @test7(i16* nocapture %head) nounwind {
; test7: 256-bit version of test1 — usub.sat(x, 32768) on <16 x i16>.
; SSE splits the vector and emits two PSUBUSW against a shared constant.
; AVX2 emits a single 256-bit VPSUBUSW.  The AVX1 checks record the current
; (unoptimized) expansion via pcmpgtw/xor/and rather than a psubus pair —
; NOTE(review): this documents existing output, not necessarily ideal code.
    202 ; SSE-LABEL: test7:
    203 ; SSE:       ## BB#0: ## %vector.ph
    204 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    205 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    206 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
    207 ; SSE-NEXT:    psubusw %xmm2, %xmm0
    208 ; SSE-NEXT:    psubusw %xmm2, %xmm1
    209 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    210 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    211 ; SSE-NEXT:    retq
    212 ;
    213 ; AVX1-LABEL: test7:
    214 ; AVX1:       ## BB#0: ## %vector.ph
    215 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    216 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    217 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    218 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
    219 ; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
    220 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    221 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
    222 ; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
    223 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    224 ; AVX1-NEXT:    vzeroupper
    225 ; AVX1-NEXT:    retq
    226 ;
    227 ; AVX2-LABEL: test7:
    228 ; AVX2:       ## BB#0: ## %vector.ph
    229 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    230 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
    231 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    232 ; AVX2-NEXT:    vzeroupper
    233 ; AVX2-NEXT:    retq
    234 vector.ph:
    235   %0 = getelementptr inbounds i16, i16* %head, i64 0
    236   %1 = bitcast i16* %0 to <16 x i16>*
    237   %2 = load <16 x i16>, <16 x i16>* %1, align 2
    238   %3 = icmp slt <16 x i16> %2, zeroinitializer
    239   %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
    240   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    241   store <16 x i16> %5, <16 x i16>* %1, align 2
    242   ret void
    243 }
    244 
    245 define void @test8(i16* nocapture %head) nounwind {
; test8: 256-bit version of test2 — usub.sat(x, 32767) on <16 x i16>.
; SSE: two PSUBUSW halves sharing one constant register.  AVX2: single
; 256-bit VPSUBUSW.  The AVX1 checks record the current expanded sequence
; (sign-flip xor, pcmpgtw vs 65534, paddw with 32769, then mask) —
; NOTE(review): documents existing output; the combine does not fire on AVX1.
    246 ; SSE-LABEL: test8:
    247 ; SSE:       ## BB#0: ## %vector.ph
    248 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    249 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    250 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
    251 ; SSE-NEXT:    psubusw %xmm2, %xmm0
    252 ; SSE-NEXT:    psubusw %xmm2, %xmm1
    253 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    254 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    255 ; SSE-NEXT:    retq
    256 ;
    257 ; AVX1-LABEL: test8:
    258 ; AVX1:       ## BB#0: ## %vector.ph
    259 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    260 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    261 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
    262 ; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
    263 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
    264 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
    265 ; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
    266 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
    267 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    268 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
    269 ; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
    270 ; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    271 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    272 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
    273 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    274 ; AVX1-NEXT:    vzeroupper
    275 ; AVX1-NEXT:    retq
    276 ;
    277 ; AVX2-LABEL: test8:
    278 ; AVX2:       ## BB#0: ## %vector.ph
    279 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    280 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
    281 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    282 ; AVX2-NEXT:    vzeroupper
    283 ; AVX2-NEXT:    retq
    284 vector.ph:
    285   %0 = getelementptr inbounds i16, i16* %head, i64 0
    286   %1 = bitcast i16* %0 to <16 x i16>*
    287   %2 = load <16 x i16>, <16 x i16>* %1, align 2
    288   %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
    289   %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
    290   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    291   store <16 x i16> %5, <16 x i16>* %1, align 2
    292   ret void
    293 
    294 }
    295 
    296 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; test9: 256-bit version of test3 — usub.sat(x, splat(w)) on <16 x i16>.
; SSE: splat once, two PSUBUSW halves.  AVX2: vpbroadcastw ymm + one
; VPSUBUSW.  The AVX1 checks record the current expansion (vpsubw plus a
; vpmaxuw/vpcmpeqw mask to zero lanes where x < w) —
; NOTE(review): documents existing output; no psubus is formed on AVX1.
    297 ; SSE-LABEL: test9:
    298 ; SSE:       ## BB#0: ## %vector.ph
    299 ; SSE-NEXT:    movd %esi, %xmm0
    300 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    301 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    302 ; SSE-NEXT:    movdqu (%rdi), %xmm1
    303 ; SSE-NEXT:    movdqu 16(%rdi), %xmm2
    304 ; SSE-NEXT:    psubusw %xmm0, %xmm1
    305 ; SSE-NEXT:    psubusw %xmm0, %xmm2
    306 ; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
    307 ; SSE-NEXT:    movdqu %xmm1, (%rdi)
    308 ; SSE-NEXT:    retq
    309 ;
    310 ; AVX1-LABEL: test9:
    311 ; AVX1:       ## BB#0: ## %vector.ph
    312 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    313 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    314 ; AVX1-NEXT:    vmovd %esi, %xmm2
    315 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
    316 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
    317 ; AVX1-NEXT:    vpsubw %xmm2, %xmm1, %xmm3
    318 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm4
    319 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
    320 ; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm4
    321 ; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm1, %xmm1
    322 ; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm2
    323 ; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
    324 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    325 ; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
    326 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    327 ; AVX1-NEXT:    vzeroupper
    328 ; AVX1-NEXT:    retq
    329 ;
    330 ; AVX2-LABEL: test9:
    331 ; AVX2:       ## BB#0: ## %vector.ph
    332 ; AVX2-NEXT:    vmovd %esi, %xmm0
    333 ; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
    334 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
    335 ; AVX2-NEXT:    vpsubusw %ymm0, %ymm1, %ymm0
    336 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    337 ; AVX2-NEXT:    vzeroupper
    338 ; AVX2-NEXT:    retq
    339 vector.ph:
    340   %0 = insertelement <16 x i16> undef, i16 %w, i32 0
    341   %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
    342   %1 = getelementptr inbounds i16, i16* %head, i64 0
    343   %2 = bitcast i16* %1 to <16 x i16>*
    344   %3 = load <16 x i16>, <16 x i16>* %2, align 2
    345   %4 = icmp ult <16 x i16> %3, %broadcast15
    346   %5 = sub <16 x i16> %3, %broadcast15
    347   %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
    348   store <16 x i16> %6, <16 x i16>* %2, align 2
    349   ret void
    350 }
    351 
    352 define void @test10(i8* nocapture %head) nounwind {
; test10: 256-bit version of test4 — usub.sat(x, 128) on <32 x i8>.
; SSE: two PSUBUSB halves.  AVX2: one 256-bit VPSUBUSB.  The AVX1 checks
; record the current pcmpgtb/xor/and expansion —
; NOTE(review): documents existing output, not an endorsement of it.
    353 ; SSE-LABEL: test10:
    354 ; SSE:       ## BB#0: ## %vector.ph
    355 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    356 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    357 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
    358 ; SSE-NEXT:    psubusb %xmm2, %xmm0
    359 ; SSE-NEXT:    psubusb %xmm2, %xmm1
    360 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    361 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    362 ; SSE-NEXT:    retq
    363 ;
    364 ; AVX1-LABEL: test10:
    365 ; AVX1:       ## BB#0: ## %vector.ph
    366 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    367 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    368 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    369 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
    370 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
    371 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    372 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
    373 ; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
    374 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    375 ; AVX1-NEXT:    vzeroupper
    376 ; AVX1-NEXT:    retq
    377 ;
    378 ; AVX2-LABEL: test10:
    379 ; AVX2:       ## BB#0: ## %vector.ph
    380 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    381 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
    382 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    383 ; AVX2-NEXT:    vzeroupper
    384 ; AVX2-NEXT:    retq
    385 vector.ph:
    386   %0 = getelementptr inbounds i8, i8* %head, i64 0
    387   %1 = bitcast i8* %0 to <32 x i8>*
    388   %2 = load <32 x i8>, <32 x i8>* %1, align 1
    389   %3 = icmp slt <32 x i8> %2, zeroinitializer
    390   %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    391   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    392   store <32 x i8> %5, <32 x i8>* %1, align 1
    393   ret void
    394 
    395 }
    396 
    397 define void @test11(i8* nocapture %head) nounwind {
; test11: 256-bit version of test5 — usub.sat(x, 127) on <32 x i8>.
; SSE: two PSUBUSB halves sharing the 127-splat.  AVX2: one VPSUBUSB ymm.
; The AVX1 checks record the current expansion (xor with 128, pcmpgtb vs
; 254, paddb with 129, then mask) —
; NOTE(review): documents existing output; the combine does not fire on AVX1.
    398 ; SSE-LABEL: test11:
    399 ; SSE:       ## BB#0: ## %vector.ph
    400 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    401 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    402 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    403 ; SSE-NEXT:    psubusb %xmm2, %xmm0
    404 ; SSE-NEXT:    psubusb %xmm2, %xmm1
    405 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    406 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    407 ; SSE-NEXT:    retq
    408 ;
    409 ; AVX1-LABEL: test11:
    410 ; AVX1:       ## BB#0: ## %vector.ph
    411 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    412 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    413 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
    414 ; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
    415 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
    416 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
    417 ; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
    418 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
    419 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    420 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
    421 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    422 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    423 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    424 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
    425 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    426 ; AVX1-NEXT:    vzeroupper
    427 ; AVX1-NEXT:    retq
    428 ;
    429 ; AVX2-LABEL: test11:
    430 ; AVX2:       ## BB#0: ## %vector.ph
    431 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    432 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
    433 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    434 ; AVX2-NEXT:    vzeroupper
    435 ; AVX2-NEXT:    retq
    436 vector.ph:
    437   %0 = getelementptr inbounds i8, i8* %head, i64 0
    438   %1 = bitcast i8* %0 to <32 x i8>*
    439   %2 = load <32 x i8>, <32 x i8>* %1, align 1
    440   %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    441   %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    442   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    443   store <32 x i8> %5, <32 x i8>* %1, align 1
    444   ret void
    445 }
    446 
    447 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; test12: 256-bit version of test6 — usub.sat(x, splat(w)) on <32 x i8>.
; SSE2/SSSE3: splat once (shuffle chain vs. pshufb), two PSUBUSB halves.
; AVX2: vpbroadcastb ymm + one VPSUBUSB.  The AVX1 checks record the current
; expansion (vpsubb plus a vpmaxub/vpcmpeqb mask to zero lanes where x < w) —
; NOTE(review): documents existing output; no psubus is formed on AVX1.
    448 ; SSE2-LABEL: test12:
    449 ; SSE2:       ## BB#0: ## %vector.ph
    450 ; SSE2-NEXT:    movd %esi, %xmm0
    451 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    452 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    453 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    454 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
    455 ; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
    456 ; SSE2-NEXT:    psubusb %xmm0, %xmm1
    457 ; SSE2-NEXT:    psubusb %xmm0, %xmm2
    458 ; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
    459 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
    460 ; SSE2-NEXT:    retq
    461 ;
    462 ; SSSE3-LABEL: test12:
    463 ; SSSE3:       ## BB#0: ## %vector.ph
    464 ; SSSE3-NEXT:    movd %esi, %xmm0
    465 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    466 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    467 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
    468 ; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
    469 ; SSSE3-NEXT:    psubusb %xmm0, %xmm1
    470 ; SSSE3-NEXT:    psubusb %xmm0, %xmm2
    471 ; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
    472 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
    473 ; SSSE3-NEXT:    retq
    474 ;
    475 ; AVX1-LABEL: test12:
    476 ; AVX1:       ## BB#0: ## %vector.ph
    477 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    478 ; AVX1-NEXT:    vmovd %esi, %xmm1
    479 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    480 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    481 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    482 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
    483 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm4
    484 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
    485 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm4
    486 ; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
    487 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
    488 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
    489 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    490 ; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
    491 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    492 ; AVX1-NEXT:    vzeroupper
    493 ; AVX1-NEXT:    retq
    494 ;
    495 ; AVX2-LABEL: test12:
    496 ; AVX2:       ## BB#0: ## %vector.ph
    497 ; AVX2-NEXT:    vmovd %esi, %xmm0
    498 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    499 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
    500 ; AVX2-NEXT:    vpsubusb %ymm0, %ymm1, %ymm0
    501 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    502 ; AVX2-NEXT:    vzeroupper
    503 ; AVX2-NEXT:    retq
    504 vector.ph:
    505   %0 = insertelement <32 x i8> undef, i8 %w, i32 0
    506   %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
    507   %1 = getelementptr inbounds i8, i8* %head, i64 0
    508   %2 = bitcast i8* %1 to <32 x i8>*
    509   %3 = load <32 x i8>, <32 x i8>* %2, align 1
    510   %4 = icmp ult <32 x i8> %3, %broadcast15
    511   %5 = sub <32 x i8> %3, %broadcast15
    512   %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
    513   store <32 x i8> %6, <32 x i8>* %2, align 1
    514   ret void
    515 }
    516