Home | History | Annotate | Download | only in X86
      1 ; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
      2 ; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
      3 ; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
      4 
      5 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
        ; The triple below selects 64-bit x86 on macOS 10.8.
        ; NOTE(review): the expected-output lines in this file spell constant-pool operands as
        ; LCPIn_m(%rip) with no leading dot, which presumably relies on this Darwin triple --
        ; confirm before changing it.
      6 target triple = "x86_64-apple-macosx10.8.0"
      7 
      8 define void @test1(i16* nocapture %head) nounwind {
        ; 128-bit i16 case, constant operand 32768.
        ; Per lane: (x <s 0) ? (x ^ 0x8000) : 0. When the sign bit is set, flipping it is the same
        ; as subtracting 32768, so each lane is an unsigned saturating subtract of 32768.
        ; All three runs expect a single (v)psubusw whose subtrahend comes from the constant pool.
      9 entry:
     10   br label %loop
     11 
     12 loop:                                             ; preds = %loop, %entry
     13   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
     14   %elt.ptr = getelementptr inbounds i16* %head, i64 %i
     15   %vec.ptr = bitcast i16* %elt.ptr to <8 x i16>*
     16   %x = load <8 x i16>* %vec.ptr, align 2
     17   %is.neg = icmp slt <8 x i16> %x, zeroinitializer
     18   %flipped = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
     19   %res = select <8 x i1> %is.neg, <8 x i16> %flipped, <8 x i16> zeroinitializer
     20   store <8 x i16> %res, <8 x i16>* %vec.ptr, align 2
     21   %i.next = add i64 %i, 8
     22   %done = icmp eq i64 %i.next, 16384
     23   br i1 %done, label %exit, label %loop
     24 
     25 exit:                                             ; preds = %loop
     26   ret void
     27 
     28 ; SSE2: @test1
     29 ; SSE2: psubusw LCPI0_0(%rip), %xmm0
     30 
     31 ; AVX1: @test1
     32 ; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
     33 
     34 ; AVX2: @test1
     35 ; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
     36 }
     37 
     38 define void @test2(i16* nocapture %head) nounwind {
        ; 128-bit i16 case, constant operand 32767.
        ; Per lane: (x >u 32766) ? (x + -32767) : 0, i.e. x - 32767 when x >= 32767 and 0 otherwise,
        ; which is an unsigned saturating subtract of 32767.
        ; All three runs expect a single (v)psubusw whose subtrahend comes from the constant pool.
     39 entry:
     40   br label %loop
     41 
     42 loop:                                             ; preds = %loop, %entry
     43   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
     44   %elt.ptr = getelementptr inbounds i16* %head, i64 %i
     45   %vec.ptr = bitcast i16* %elt.ptr to <8 x i16>*
     46   %x = load <8 x i16>* %vec.ptr, align 2
     47   %no.underflow = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
     48   %diff = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
     49   %res = select <8 x i1> %no.underflow, <8 x i16> %diff, <8 x i16> zeroinitializer
     50   store <8 x i16> %res, <8 x i16>* %vec.ptr, align 2
     51   %i.next = add i64 %i, 8
     52   %done = icmp eq i64 %i.next, 16384
     53   br i1 %done, label %exit, label %loop
     54 
     55 exit:                                             ; preds = %loop
     56   ret void
     57 
     58 ; SSE2: @test2
     59 ; SSE2: psubusw LCPI1_0(%rip), %xmm0
     60 
     61 ; AVX1: @test2
     62 ; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
     63 
     64 ; AVX2: @test2
     65 ; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
     66 }
     67 
     68 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
        ; 128-bit i16 case, variable operand.
        ; %w is splatted across all eight lanes before the loop. Per lane: (x <u w) ? 0 : (x - w),
        ; an unsigned saturating subtract of w.
        ; All three runs expect a single register-register (v)psubusw.
     69 entry:
     70   %w.lane0 = insertelement <8 x i16> undef, i16 %w, i32 0
     71   %w.splat = shufflevector <8 x i16> %w.lane0, <8 x i16> undef, <8 x i32> zeroinitializer
     72   br label %loop
     73 
     74 loop:                                             ; preds = %loop, %entry
     75   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
     76   %elt.ptr = getelementptr inbounds i16* %head, i64 %i
     77   %vec.ptr = bitcast i16* %elt.ptr to <8 x i16>*
     78   %x = load <8 x i16>* %vec.ptr, align 2
     79   %underflow = icmp ult <8 x i16> %x, %w.splat
     80   %diff = sub <8 x i16> %x, %w.splat
     81   %res = select <8 x i1> %underflow, <8 x i16> zeroinitializer, <8 x i16> %diff
     82   store <8 x i16> %res, <8 x i16>* %vec.ptr, align 2
     83   %i.next = add i64 %i, 8
     84   %done = icmp eq i64 %i.next, 16384
     85   br i1 %done, label %exit, label %loop
     86 
     87 exit:                                             ; preds = %loop
     88   ret void
     89 
     90 ; SSE2: @test3
     91 ; SSE2: psubusw %xmm0, %xmm1
     92 
     93 ; AVX1: @test3
     94 ; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
     95 
     96 ; AVX2: @test3
     97 ; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
     98 }
     99 
    100 define void @test4(i8* nocapture %head) nounwind {
        ; 128-bit i8 case, constant operand 128.
        ; Per lane: (x <s 0) ? (x ^ 0x80) : 0. When the sign bit is set, flipping it is the same as
        ; subtracting 128, so each lane is an unsigned saturating subtract of 128.
        ; All three runs expect a single (v)psubusb whose subtrahend comes from the constant pool.
    101 entry:
    102   br label %loop
    103 
    104 loop:                                             ; preds = %loop, %entry
    105   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    106   %elt.ptr = getelementptr inbounds i8* %head, i64 %i
    107   %vec.ptr = bitcast i8* %elt.ptr to <16 x i8>*
    108   %x = load <16 x i8>* %vec.ptr, align 1
    109   %is.neg = icmp slt <16 x i8> %x, zeroinitializer
    110   %flipped = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    111   %res = select <16 x i1> %is.neg, <16 x i8> %flipped, <16 x i8> zeroinitializer
    112   store <16 x i8> %res, <16 x i8>* %vec.ptr, align 1
    113   %i.next = add i64 %i, 16
    114   %done = icmp eq i64 %i.next, 16384
    115   br i1 %done, label %exit, label %loop
    116 
    117 exit:                                             ; preds = %loop
    118   ret void
    119 
    120 ; SSE2: @test4
    121 ; SSE2: psubusb LCPI3_0(%rip), %xmm0
    122 
    123 ; AVX1: @test4
    124 ; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
    125 
    126 ; AVX2: @test4
    127 ; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
    128 }
    129 
    130 define void @test5(i8* nocapture %head) nounwind {
        ; 128-bit i8 case, constant operand 127.
        ; Per lane: (x >u 126) ? (x + -127) : 0, i.e. x - 127 when x >= 127 and 0 otherwise,
        ; which is an unsigned saturating subtract of 127.
        ; All three runs expect a single (v)psubusb whose subtrahend comes from the constant pool.
    131 entry:
    132   br label %loop
    133 
    134 loop:                                             ; preds = %loop, %entry
    135   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    136   %elt.ptr = getelementptr inbounds i8* %head, i64 %i
    137   %vec.ptr = bitcast i8* %elt.ptr to <16 x i8>*
    138   %x = load <16 x i8>* %vec.ptr, align 1
    139   %no.underflow = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    140   %diff = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    141   %res = select <16 x i1> %no.underflow, <16 x i8> %diff, <16 x i8> zeroinitializer
    142   store <16 x i8> %res, <16 x i8>* %vec.ptr, align 1
    143   %i.next = add i64 %i, 16
    144   %done = icmp eq i64 %i.next, 16384
    145   br i1 %done, label %exit, label %loop
    146 
    147 exit:                                             ; preds = %loop
    148   ret void
    149 
    150 ; SSE2: @test5
    151 ; SSE2: psubusb LCPI4_0(%rip), %xmm0
    152 
    153 ; AVX1: @test5
    154 ; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
    155 
    156 ; AVX2: @test5
    157 ; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
    158 }
    159 
    160 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
        ; 128-bit i8 case, variable operand.
        ; %w is splatted across all sixteen lanes before the loop. Per lane: (x <u w) ? 0 : (x - w),
        ; an unsigned saturating subtract of w.
        ; All three runs expect a single register-register (v)psubusb.
    161 entry:
    162   %w.lane0 = insertelement <16 x i8> undef, i8 %w, i32 0
    163   %w.splat = shufflevector <16 x i8> %w.lane0, <16 x i8> undef, <16 x i32> zeroinitializer
    164   br label %loop
    165 
    166 loop:                                             ; preds = %loop, %entry
    167   %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    168   %elt.ptr = getelementptr inbounds i8* %head, i64 %i
    169   %vec.ptr = bitcast i8* %elt.ptr to <16 x i8>*
    170   %x = load <16 x i8>* %vec.ptr, align 1
    171   %underflow = icmp ult <16 x i8> %x, %w.splat
    172   %diff = sub <16 x i8> %x, %w.splat
    173   %res = select <16 x i1> %underflow, <16 x i8> zeroinitializer, <16 x i8> %diff
    174   store <16 x i8> %res, <16 x i8>* %vec.ptr, align 1
    175   %i.next = add i64 %i, 16
    176   %done = icmp eq i64 %i.next, 16384
    177   br i1 %done, label %exit, label %loop
    178 
    179 exit:                                             ; preds = %loop
    180   ret void
    181 
    182 ; SSE2: @test6
    183 ; SSE2: psubusb %xmm0, %xmm1
    184 
    185 ; AVX1: @test6
    186 ; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
    187 
    188 ; AVX2: @test6
    189 ; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
    190 }
    191 
    192 define void @test7(i16* nocapture %head) nounwind {
        ; 256-bit i16 case, constant operand 32768; only the AVX2 run has expectations here.
        ; Per lane: (x <s 0) ? (x ^ 0x8000) : 0. When the sign bit is set, flipping it is the same
        ; as subtracting 32768, so each lane is an unsigned saturating subtract of 32768.
        ; Expected: one ymm vpsubusw whose subtrahend comes from the constant pool.
    193 vector.ph:
    194   br label %vector.body
    195 
    196 vector.body:                                      ; preds = %vector.body, %vector.ph
    197   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    198   %0 = getelementptr inbounds i16* %head, i64 %index
    199   %1 = bitcast i16* %0 to <16 x i16>*
    200   %2 = load <16 x i16>* %1, align 2
    201   %3 = icmp slt <16 x i16> %2, zeroinitializer
    202   %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
    203   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    204   store <16 x i16> %5, <16 x i16>* %1, align 2
        ; Step by the lane count (16) so iterations cover disjoint slices and the last one ends
        ; exactly at element 16383 instead of running past the 16384-element bound.
    205   %index.next = add i64 %index, 16
    206   %6 = icmp eq i64 %index.next, 16384
    207   br i1 %6, label %for.end, label %vector.body
    208 
    209 for.end:                                          ; preds = %vector.body
    210   ret void
    211 
    212 ; AVX2: @test7
    213 ; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
    214 }
    215 
    216 define void @test8(i16* nocapture %head) nounwind {
        ; 256-bit i16 case, constant operand 32767; only the AVX2 run has expectations here.
        ; Per lane: (x >u 32766) ? (x + -32767) : 0, i.e. x - 32767 when x >= 32767 and 0 otherwise,
        ; which is an unsigned saturating subtract of 32767.
        ; Expected: one ymm vpsubusw whose subtrahend comes from the constant pool.
    217 vector.ph:
    218   br label %vector.body
    219 
    220 vector.body:                                      ; preds = %vector.body, %vector.ph
    221   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    222   %0 = getelementptr inbounds i16* %head, i64 %index
    223   %1 = bitcast i16* %0 to <16 x i16>*
    224   %2 = load <16 x i16>* %1, align 2
    225   %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
    226   %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
    227   %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
    228   store <16 x i16> %5, <16 x i16>* %1, align 2
        ; Step by the lane count (16) so iterations cover disjoint slices and the last one ends
        ; exactly at element 16383 instead of running past the 16384-element bound.
    229   %index.next = add i64 %index, 16
    230   %6 = icmp eq i64 %index.next, 16384
    231   br i1 %6, label %for.end, label %vector.body
    232 
    233 for.end:                                          ; preds = %vector.body
    234   ret void
    235 
    236 ; AVX2: @test8
    237 ; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
    238 }
    239 
    240 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
        ; 256-bit i16 case, variable operand; only the AVX2 run has expectations here.
        ; %w is splatted across all sixteen lanes before the loop. Per lane: (x <u w) ? 0 : (x - w),
        ; an unsigned saturating subtract of w.
        ; Expected: one register-register ymm vpsubusw.
    241 vector.ph:
    242   %0 = insertelement <16 x i16> undef, i16 %w, i32 0
    243   %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
    244   br label %vector.body
    245 
    246 vector.body:                                      ; preds = %vector.body, %vector.ph
    247   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    248   %1 = getelementptr inbounds i16* %head, i64 %index
    249   %2 = bitcast i16* %1 to <16 x i16>*
    250   %3 = load <16 x i16>* %2, align 2
    251   %4 = icmp ult <16 x i16> %3, %broadcast15
    252   %5 = sub <16 x i16> %3, %broadcast15
    253   %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
    254   store <16 x i16> %6, <16 x i16>* %2, align 2
        ; Step by the lane count (16) so iterations cover disjoint slices and the last one ends
        ; exactly at element 16383 instead of running past the 16384-element bound.
    255   %index.next = add i64 %index, 16
    256   %7 = icmp eq i64 %index.next, 16384
    257   br i1 %7, label %for.end, label %vector.body
    258 
    259 for.end:                                          ; preds = %vector.body
    260   ret void
    261 
    262 
    263 ; AVX2: @test9
    264 ; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
    265 }
    266 
    267 define void @test10(i8* nocapture %head) nounwind {
        ; 256-bit i8 case, constant operand 128; only the AVX2 run has expectations here.
        ; Per lane: (x <s 0) ? (x ^ 0x80) : 0. When the sign bit is set, flipping it is the same as
        ; subtracting 128, so each lane is an unsigned saturating subtract of 128.
        ; Expected: one ymm vpsubusb whose subtrahend comes from the constant pool.
    268 vector.ph:
    269   br label %vector.body
    270 
    271 vector.body:                                      ; preds = %vector.body, %vector.ph
    272   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    273   %0 = getelementptr inbounds i8* %head, i64 %index
    274   %1 = bitcast i8* %0 to <32 x i8>*
    275   %2 = load <32 x i8>* %1, align 1
    276   %3 = icmp slt <32 x i8> %2, zeroinitializer
    277   %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
    278   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    279   store <32 x i8> %5, <32 x i8>* %1, align 1
        ; Step by the lane count (32) so iterations cover disjoint slices and the last one ends
        ; exactly at byte 16383 instead of running past the 16384-byte bound.
    280   %index.next = add i64 %index, 32
    281   %6 = icmp eq i64 %index.next, 16384
    282   br i1 %6, label %for.end, label %vector.body
    283 
    284 for.end:                                          ; preds = %vector.body
    285   ret void
    286 
    287 
    288 ; AVX2: @test10
    289 ; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
    290 }
    291 
    292 define void @test11(i8* nocapture %head) nounwind {
        ; 256-bit i8 case, constant operand 127; only the AVX2 run has expectations here.
        ; Per lane: (x >u 126) ? (x + -127) : 0, i.e. x - 127 when x >= 127 and 0 otherwise,
        ; which is an unsigned saturating subtract of 127.
        ; Expected: one ymm vpsubusb whose subtrahend comes from the constant pool.
    293 vector.ph:
    294   br label %vector.body
    295 
    296 vector.body:                                      ; preds = %vector.body, %vector.ph
    297   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    298   %0 = getelementptr inbounds i8* %head, i64 %index
    299   %1 = bitcast i8* %0 to <32 x i8>*
    300   %2 = load <32 x i8>* %1, align 1
    301   %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
    302   %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
    303   %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
    304   store <32 x i8> %5, <32 x i8>* %1, align 1
        ; Step by the lane count (32) so iterations cover disjoint slices and the last one ends
        ; exactly at byte 16383 instead of running past the 16384-byte bound.
    305   %index.next = add i64 %index, 32
    306   %6 = icmp eq i64 %index.next, 16384
    307   br i1 %6, label %for.end, label %vector.body
    308 
    309 for.end:                                          ; preds = %vector.body
    310   ret void
    311 
    312 ; AVX2: @test11
    313 ; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
    314 }
    315 
    316 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
        ; 256-bit i8 case, variable operand; only the AVX2 run has expectations here.
        ; %w is splatted across all thirty-two lanes before the loop. Per lane:
        ; (x <u w) ? 0 : (x - w), an unsigned saturating subtract of w.
        ; Expected: one register-register ymm vpsubusb.
    317 vector.ph:
    318   %0 = insertelement <32 x i8> undef, i8 %w, i32 0
    319   %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
    320   br label %vector.body
    321 
    322 vector.body:                                      ; preds = %vector.body, %vector.ph
    323   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    324   %1 = getelementptr inbounds i8* %head, i64 %index
    325   %2 = bitcast i8* %1 to <32 x i8>*
    326   %3 = load <32 x i8>* %2, align 1
    327   %4 = icmp ult <32 x i8> %3, %broadcast15
    328   %5 = sub <32 x i8> %3, %broadcast15
    329   %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
    330   store <32 x i8> %6, <32 x i8>* %2, align 1
        ; Step by the lane count (32) so iterations cover disjoint slices and the last one ends
        ; exactly at byte 16383 instead of running past the 16384-byte bound.
    331   %index.next = add i64 %index, 32
    332   %7 = icmp eq i64 %index.next, 16384
    333   br i1 %7, label %for.end, label %vector.body
    334 
    335 for.end:                                          ; preds = %vector.body
    336   ret void
    337 
    338 ; AVX2: @test12
    339 ; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
    340 }
    341