; (code-viewer navigation header, commented out so the file remains parseable:
;  Home | History | Annotate | Download | only in X86)
      1 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      2 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
      3 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      4 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      5 
      6 define void @test1(i16* nocapture %head) nounwind {
        ; "(x s< 0) ? x ^ 0x8000 : 0" equals unsigned saturating x - 32768
        ; (x s< 0 iff x u>= 0x8000, and then x ^ 0x8000 == x - 0x8000),
        ; so codegen should select a single PSUBUSW against a constant pool splat.
      7 ; SSE-LABEL: test1:
      8 ; SSE:       ## BB#0: ## %vector.ph
      9 ; SSE-NEXT:    movdqu (%rdi), %xmm0
     10 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
     11 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
     12 ; SSE-NEXT:    retq
     13 ;
     14 ; AVX-LABEL: test1:
     15 ; AVX:       ## BB#0: ## %vector.ph
     16 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
     17 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
     18 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
     19 ; AVX-NEXT:    retq
     20 vector.ph:
     21   %0 = getelementptr inbounds i16, i16* %head, i64 0
     22   %1 = bitcast i16* %0 to <8 x i16>*
     23   %2 = load <8 x i16>, <8 x i16>* %1, align 2
     24   %3 = icmp slt <8 x i16> %2, zeroinitializer
     25   %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
     26   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
     27   store <8 x i16> %5, <8 x i16>* %1, align 2
     28   ret void
     29 }
     30 
     31 define void @test2(i16* nocapture %head) nounwind {
        ; "(x u> 32766) ? x + (-32767) : 0" equals unsigned saturating x - 32767,
        ; so codegen should select a single PSUBUSW against a constant pool splat.
     32 ; SSE-LABEL: test2:
     33 ; SSE:       ## BB#0: ## %vector.ph
     34 ; SSE-NEXT:    movdqu (%rdi), %xmm0
     35 ; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
     36 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
     37 ; SSE-NEXT:    retq
     38 ;
     39 ; AVX-LABEL: test2:
     40 ; AVX:       ## BB#0: ## %vector.ph
     41 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
     42 ; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
     43 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
     44 ; AVX-NEXT:    retq
     45 vector.ph:
     46   %0 = getelementptr inbounds i16, i16* %head, i64 0
     47   %1 = bitcast i16* %0 to <8 x i16>*
     48   %2 = load <8 x i16>, <8 x i16>* %1, align 2
     49   %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
     50   %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
     51   %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
     52   store <8 x i16> %5, <8 x i16>* %1, align 2
     53   ret void
     54 }
     55 
     56 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
        ; Variable splat amount: "x u< w ? 0 : x - w" is unsigned saturating
        ; x - splat(w); only the splat idiom differs per subtarget (shuffles on
        ; SSE2, pshufb on SSSE3/AVX1, vpbroadcastw on AVX2) before the PSUBUSW.
     57 ; SSE2-LABEL: test3:
     58 ; SSE2:       ## BB#0: ## %vector.ph
     59 ; SSE2-NEXT:    movd %esi, %xmm0
     60 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
     61 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
     62 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
     63 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
     64 ; SSE2-NEXT:    psubusw %xmm0, %xmm1
     65 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
     66 ; SSE2-NEXT:    retq
     67 ;
     68 ; SSSE3-LABEL: test3:
     69 ; SSSE3:       ## BB#0: ## %vector.ph
     70 ; SSSE3-NEXT:    movd %esi, %xmm0
     71 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
     72 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
     73 ; SSSE3-NEXT:    psubusw %xmm0, %xmm1
     74 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
     75 ; SSSE3-NEXT:    retq
     76 ;
     77 ; AVX1-LABEL: test3:
     78 ; AVX1:       ## BB#0: ## %vector.ph
     79 ; AVX1-NEXT:    vmovd %esi, %xmm0
     80 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
     81 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
     82 ; AVX1-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
     83 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
     84 ; AVX1-NEXT:    retq
     85 ;
     86 ; AVX2-LABEL: test3:
     87 ; AVX2:       ## BB#0: ## %vector.ph
     88 ; AVX2-NEXT:    vmovd %esi, %xmm0
     89 ; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
     90 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
     91 ; AVX2-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
     92 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
     93 ; AVX2-NEXT:    retq
     94 vector.ph:
     95   %0 = insertelement <8 x i16> undef, i16 %w, i32 0
     96   %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
     97   %1 = getelementptr inbounds i16, i16* %head, i64 0
     98   %2 = bitcast i16* %1 to <8 x i16>*
     99   %3 = load <8 x i16>, <8 x i16>* %2, align 2
    100   %4 = icmp ult <8 x i16> %3, %broadcast15
    101   %5 = sub <8 x i16> %3, %broadcast15
    102   %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
    103   store <8 x i16> %6, <8 x i16>* %2, align 2
    104   ret void
    105 }
    106 
    107 define void @test4(i8* nocapture %head) nounwind {
        ; Byte variant of test1: "(x s< 0) ? x ^ 0x80 : 0" equals unsigned
        ; saturating x - 128, so codegen should select a single PSUBUSB.
    108 ; SSE-LABEL: test4:
    109 ; SSE:       ## BB#0: ## %vector.ph
    110 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    111 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
    112 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    113 ; SSE-NEXT:    retq
    114 ;
    115 ; AVX-LABEL: test4:
    116 ; AVX:       ## BB#0: ## %vector.ph
    117 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
    118 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
    119 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
    120 ; AVX-NEXT:    retq
    121 vector.ph:
    122   %0 = getelementptr inbounds i8, i8* %head, i64 0
    123   %1 = bitcast i8* %0 to <16 x i8>*
    124   %2 = load <16 x i8>, <16 x i8>* %1, align 1
    125   %3 = icmp slt <16 x i8> %2, zeroinitializer
    126   %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    127   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
    128   store <16 x i8> %5, <16 x i8>* %1, align 1
    129   ret void
    130 }
    131 
    132 define void @test5(i8* nocapture %head) nounwind {
        ; Byte variant of test2: "(x u> 126) ? x + (-127) : 0" equals unsigned
        ; saturating x - 127, so codegen should select a single PSUBUSB.
    133 ; SSE-LABEL: test5:
    134 ; SSE:       ## BB#0: ## %vector.ph
    135 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    136 ; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
    137 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    138 ; SSE-NEXT:    retq
    139 ;
    140 ; AVX-LABEL: test5:
    141 ; AVX:       ## BB#0: ## %vector.ph
    142 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
    143 ; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
    144 ; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
    145 ; AVX-NEXT:    retq
    146 vector.ph:
    147   %0 = getelementptr inbounds i8, i8* %head, i64 0
    148   %1 = bitcast i8* %0 to <16 x i8>*
    149   %2 = load <16 x i8>, <16 x i8>* %1, align 1
    150   %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    151   %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    152   %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
    153   store <16 x i8> %5, <16 x i8>* %1, align 1
    154   ret void
    155 }
    156 
    157 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
        ; Byte variant of test3: "x u< w ? 0 : x - w" is unsigned saturating
        ; x - splat(w) -> PSUBUSB; only the byte-splat idiom differs per subtarget
        ; (unpack+shuffles on SSE2, pshufb-by-zero on SSSE3/AVX1, vpbroadcastb on AVX2).
    158 ; SSE2-LABEL: test6:
    159 ; SSE2:       ## BB#0: ## %vector.ph
    160 ; SSE2-NEXT:    movd %esi, %xmm0
    161 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    162 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
    163 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    164 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
    165 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
    166 ; SSE2-NEXT:    psubusb %xmm0, %xmm1
    167 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
    168 ; SSE2-NEXT:    retq
    169 ;
    170 ; SSSE3-LABEL: test6:
    171 ; SSSE3:       ## BB#0: ## %vector.ph
    172 ; SSSE3-NEXT:    movd %esi, %xmm0
    173 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    174 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    175 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
    176 ; SSSE3-NEXT:    psubusb %xmm0, %xmm1
    177 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
    178 ; SSSE3-NEXT:    retq
    179 ;
    180 ; AVX1-LABEL: test6:
    181 ; AVX1:       ## BB#0: ## %vector.ph
    182 ; AVX1-NEXT:    vmovd %esi, %xmm0
    183 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    184 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    185 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
    186 ; AVX1-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
    187 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
    188 ; AVX1-NEXT:    retq
    189 ;
    190 ; AVX2-LABEL: test6:
    191 ; AVX2:       ## BB#0: ## %vector.ph
    192 ; AVX2-NEXT:    vmovd %esi, %xmm0
    193 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
    194 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
    195 ; AVX2-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
    196 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
    197 ; AVX2-NEXT:    retq
    198 vector.ph:
    199   %0 = insertelement <16 x i8> undef, i8 %w, i32 0
    200   %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
    201   %1 = getelementptr inbounds i8, i8* %head, i64 0
    202   %2 = bitcast i8* %1 to <16 x i8>*
    203   %3 = load <16 x i8>, <16 x i8>* %2, align 1
    204   %4 = icmp ult <16 x i8> %3, %broadcast15
    205   %5 = sub <16 x i8> %3, %broadcast15
    206   %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
    207   store <16 x i8> %6, <16 x i8>* %2, align 1
    208   ret void
    209 }
    210 
    211 define void @test7(i16* nocapture %head) nounwind {
        ; 256-bit (16 x i16) version of test1. SSE splits into two xmm PSUBUSW ops
        ; sharing one constant; AVX1 has no 256-bit integer PSUBUSW and expands to
        ; compare/xor/and; AVX2 uses a single ymm VPSUBUSW.
    212 ; SSE-LABEL: test7:
    213 ; SSE:       ## BB#0: ## %vector.ph
    214 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    215 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    216 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
    217 ; SSE-NEXT:    psubusw %xmm2, %xmm0
    218 ; SSE-NEXT:    psubusw %xmm2, %xmm1
    219 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    220 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    221 ; SSE-NEXT:    retq
    222 ;
    223 ; AVX1-LABEL: test7:
    224 ; AVX1:       ## BB#0: ## %vector.ph
    225 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    226 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    227 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    228 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
    229 ; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
    230 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    231 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
    232 ; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
    233 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    234 ; AVX1-NEXT:    vzeroupper
    235 ; AVX1-NEXT:    retq
    236 ;
    237 ; AVX2-LABEL: test7:
    238 ; AVX2:       ## BB#0: ## %vector.ph
    239 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    240 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
    241 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    242 ; AVX2-NEXT:    vzeroupper
    243 ; AVX2-NEXT:    retq
    244 vector.ph:
    245   %0 = getelementptr inbounds i16, i16* %head, i64 0
    246   %1 = bitcast i16* %0 to <16 x i16>*
    247   %2 = load <16 x i16>, <16 x i16>* %1, align 2
    248   %3 = icmp slt <16 x i16> %2, zeroinitializer
    249   %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
    250   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    251   store <16 x i16> %5, <16 x i16>* %1, align 2
    252   ret void
    253 }
    254 
    255 define void @test8(i16* nocapture %head) nounwind {
        ; 256-bit (16 x i16) version of test2 (usub.sat(x, 32767)). SSE splits into
        ; two xmm PSUBUSW ops; AVX1 expands to xor/compare/add/and on xmm halves;
        ; AVX2 uses a single ymm VPSUBUSW.
    256 ; SSE-LABEL: test8:
    257 ; SSE:       ## BB#0: ## %vector.ph
    258 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    259 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    260 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
    261 ; SSE-NEXT:    psubusw %xmm2, %xmm0
    262 ; SSE-NEXT:    psubusw %xmm2, %xmm1
    263 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    264 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    265 ; SSE-NEXT:    retq
    266 ;
    267 ; AVX1-LABEL: test8:
    268 ; AVX1:       ## BB#0: ## %vector.ph
    269 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    270 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    271 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
    272 ; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
    273 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
    274 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
    275 ; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
    276 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
    277 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    278 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
    279 ; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
    280 ; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    281 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    282 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
    283 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    284 ; AVX1-NEXT:    vzeroupper
    285 ; AVX1-NEXT:    retq
    286 ;
    287 ; AVX2-LABEL: test8:
    288 ; AVX2:       ## BB#0: ## %vector.ph
    289 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    290 ; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
    291 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    292 ; AVX2-NEXT:    vzeroupper
    293 ; AVX2-NEXT:    retq
    294 vector.ph:
    295   %0 = getelementptr inbounds i16, i16* %head, i64 0
    296   %1 = bitcast i16* %0 to <16 x i16>*
    297   %2 = load <16 x i16>, <16 x i16>* %1, align 2
    298   %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
    299   %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
    300   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    301   store <16 x i16> %5, <16 x i16>* %1, align 2
    302   ret void
    303 
    304 }
    305 
    306 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
        ; 256-bit (16 x i16) version of test3 (variable splat usub.sat). SSE forms
        ; the splat then issues two xmm PSUBUSW ops; AVX1 expands via
        ; sub + pmaxuw/pcmpeqw mask + and on xmm halves; AVX2 uses one ymm VPSUBUSW.
    307 ; SSE2-LABEL: test9:
    308 ; SSE2:       ## BB#0: ## %vector.ph
    309 ; SSE2-NEXT:    movd %esi, %xmm0
    310 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
    311 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    312 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
    313 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
    314 ; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
    315 ; SSE2-NEXT:    psubusw %xmm0, %xmm1
    316 ; SSE2-NEXT:    psubusw %xmm0, %xmm2
    317 ; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
    318 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
    319 ; SSE2-NEXT:    retq
    320 ;
    321 ; SSSE3-LABEL: test9:
    322 ; SSSE3:       ## BB#0: ## %vector.ph
    323 ; SSSE3-NEXT:    movd %esi, %xmm0
    324 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    325 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
    326 ; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
    327 ; SSSE3-NEXT:    psubusw %xmm0, %xmm1
    328 ; SSSE3-NEXT:    psubusw %xmm0, %xmm2
    329 ; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
    330 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
    331 ; SSSE3-NEXT:    retq
    332 ;
    333 ; AVX1-LABEL: test9:
    334 ; AVX1:       ## BB#0: ## %vector.ph
    335 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    336 ; AVX1-NEXT:    vmovd %esi, %xmm1
    337 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    338 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    339 ; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm3
    340 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm4
    341 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
    342 ; AVX1-NEXT:    vpmaxuw %xmm1, %xmm2, %xmm4
    343 ; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
    344 ; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
    345 ; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
    346 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    347 ; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
    348 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    349 ; AVX1-NEXT:    vzeroupper
    350 ; AVX1-NEXT:    retq
    351 ;
    352 ; AVX2-LABEL: test9:
    353 ; AVX2:       ## BB#0: ## %vector.ph
    354 ; AVX2-NEXT:    vmovd %esi, %xmm0
    355 ; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
    356 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
    357 ; AVX2-NEXT:    vpsubusw %ymm0, %ymm1, %ymm0
    358 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    359 ; AVX2-NEXT:    vzeroupper
    360 ; AVX2-NEXT:    retq
    361 vector.ph:
    362   %0 = insertelement <16 x i16> undef, i16 %w, i32 0
    363   %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
    364   %1 = getelementptr inbounds i16, i16* %head, i64 0
    365   %2 = bitcast i16* %1 to <16 x i16>*
    366   %3 = load <16 x i16>, <16 x i16>* %2, align 2
    367   %4 = icmp ult <16 x i16> %3, %broadcast15
    368   %5 = sub <16 x i16> %3, %broadcast15
    369   %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
    370   store <16 x i16> %6, <16 x i16>* %2, align 2
    371   ret void
    372 }
    373 
    374 define void @test10(i8* nocapture %head) nounwind {
        ; 256-bit (32 x i8) version of test4 (usub.sat(x, 128)). SSE splits into
        ; two xmm PSUBUSB ops; AVX1 expands to compare/xor/and; AVX2 uses a single
        ; ymm VPSUBUSB.
    375 ; SSE-LABEL: test10:
    376 ; SSE:       ## BB#0: ## %vector.ph
    377 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    378 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    379 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
    380 ; SSE-NEXT:    psubusb %xmm2, %xmm0
    381 ; SSE-NEXT:    psubusb %xmm2, %xmm1
    382 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    383 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    384 ; SSE-NEXT:    retq
    385 ;
    386 ; AVX1-LABEL: test10:
    387 ; AVX1:       ## BB#0: ## %vector.ph
    388 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    389 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    390 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    391 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
    392 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
    393 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    394 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
    395 ; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
    396 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    397 ; AVX1-NEXT:    vzeroupper
    398 ; AVX1-NEXT:    retq
    399 ;
    400 ; AVX2-LABEL: test10:
    401 ; AVX2:       ## BB#0: ## %vector.ph
    402 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    403 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
    404 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    405 ; AVX2-NEXT:    vzeroupper
    406 ; AVX2-NEXT:    retq
    407 vector.ph:
    408   %0 = getelementptr inbounds i8, i8* %head, i64 0
    409   %1 = bitcast i8* %0 to <32 x i8>*
    410   %2 = load <32 x i8>, <32 x i8>* %1, align 1
    411   %3 = icmp slt <32 x i8> %2, zeroinitializer
    412   %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    413   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    414   store <32 x i8> %5, <32 x i8>* %1, align 1
    415   ret void
    416 
    417 }
    418 
    419 define void @test11(i8* nocapture %head) nounwind {
        ; 256-bit (32 x i8) version of test5 (usub.sat(x, 127)). SSE splits into
        ; two xmm PSUBUSB ops; AVX1 expands to xor/compare/add/and on xmm halves;
        ; AVX2 uses a single ymm VPSUBUSB.
    420 ; SSE-LABEL: test11:
    421 ; SSE:       ## BB#0: ## %vector.ph
    422 ; SSE-NEXT:    movdqu (%rdi), %xmm0
    423 ; SSE-NEXT:    movdqu 16(%rdi), %xmm1
    424 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    425 ; SSE-NEXT:    psubusb %xmm2, %xmm0
    426 ; SSE-NEXT:    psubusb %xmm2, %xmm1
    427 ; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
    428 ; SSE-NEXT:    movdqu %xmm0, (%rdi)
    429 ; SSE-NEXT:    retq
    430 ;
    431 ; AVX1-LABEL: test11:
    432 ; AVX1:       ## BB#0: ## %vector.ph
    433 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    434 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    435 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
    436 ; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
    437 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
    438 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
    439 ; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
    440 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
    441 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    442 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
    443 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    444 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    445 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    446 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
    447 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    448 ; AVX1-NEXT:    vzeroupper
    449 ; AVX1-NEXT:    retq
    450 ;
    451 ; AVX2-LABEL: test11:
    452 ; AVX2:       ## BB#0: ## %vector.ph
    453 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
    454 ; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
    455 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    456 ; AVX2-NEXT:    vzeroupper
    457 ; AVX2-NEXT:    retq
    458 vector.ph:
    459   %0 = getelementptr inbounds i8, i8* %head, i64 0
    460   %1 = bitcast i8* %0 to <32 x i8>*
    461   %2 = load <32 x i8>, <32 x i8>* %1, align 1
    462   %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    463   %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    464   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    465   store <32 x i8> %5, <32 x i8>* %1, align 1
    466   ret void
    467 }
    468 
    469 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
        ; 256-bit (32 x i8) version of test6 (variable splat usub.sat). SSE forms
        ; the byte splat then issues two xmm PSUBUSB ops; AVX1 expands via
        ; sub + pmaxub/pcmpeqb mask + and on xmm halves; AVX2 uses one ymm VPSUBUSB.
    470 ; SSE2-LABEL: test12:
    471 ; SSE2:       ## BB#0: ## %vector.ph
    472 ; SSE2-NEXT:    movd %esi, %xmm0
    473 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    474 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
    475 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    476 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
    477 ; SSE2-NEXT:    movdqu (%rdi), %xmm1
    478 ; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
    479 ; SSE2-NEXT:    psubusb %xmm0, %xmm1
    480 ; SSE2-NEXT:    psubusb %xmm0, %xmm2
    481 ; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
    482 ; SSE2-NEXT:    movdqu %xmm1, (%rdi)
    483 ; SSE2-NEXT:    retq
    484 ;
    485 ; SSSE3-LABEL: test12:
    486 ; SSSE3:       ## BB#0: ## %vector.ph
    487 ; SSSE3-NEXT:    movd %esi, %xmm0
    488 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    489 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    490 ; SSSE3-NEXT:    movdqu (%rdi), %xmm1
    491 ; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
    492 ; SSSE3-NEXT:    psubusb %xmm0, %xmm1
    493 ; SSSE3-NEXT:    psubusb %xmm0, %xmm2
    494 ; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
    495 ; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
    496 ; SSSE3-NEXT:    retq
    497 ;
    498 ; AVX1-LABEL: test12:
    499 ; AVX1:       ## BB#0: ## %vector.ph
    500 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    501 ; AVX1-NEXT:    vmovd %esi, %xmm1
    502 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    503 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    504 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    505 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
    506 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm4
    507 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
    508 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm4
    509 ; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
    510 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
    511 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
    512 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    513 ; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
    514 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    515 ; AVX1-NEXT:    vzeroupper
    516 ; AVX1-NEXT:    retq
    517 ;
    518 ; AVX2-LABEL: test12:
    519 ; AVX2:       ## BB#0: ## %vector.ph
    520 ; AVX2-NEXT:    vmovd %esi, %xmm0
    521 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    522 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
    523 ; AVX2-NEXT:    vpsubusb %ymm0, %ymm1, %ymm0
    524 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    525 ; AVX2-NEXT:    vzeroupper
    526 ; AVX2-NEXT:    retq
    527 vector.ph:
    528   %0 = insertelement <32 x i8> undef, i8 %w, i32 0
    529   %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
    530   %1 = getelementptr inbounds i8, i8* %head, i64 0
    531   %2 = bitcast i8* %1 to <32 x i8>*
    532   %3 = load <32 x i8>, <32 x i8>* %2, align 1
    533   %4 = icmp ult <32 x i8> %3, %broadcast15
    534   %5 = sub <32 x i8> %3, %broadcast15
    535   %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
    536   store <32 x i8> %6, <32 x i8>* %2, align 1
    537   ret void
    538 }
    539