; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512


; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the shift amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
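; As a rough sketch (assuming the usual shl-by-constant to mul-by-power-of-two
; rewrite), the shift amounts <1, 1, 2, 3, 7, 0, 9, 11> correspond to the
; multiplier vector <2, 2, 4, 8, 128, 1, 512, 2048>, i.e. something like:
;   %shl = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>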

define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
; CHECK-LABEL: test1
; CHECK: pmullw
; CHECK-NEXT: ret

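; test2 mixes in undef and out-of-range (-1, i.e. >= 16) shift amounts. Those
; lanes are undefined in the IR, so the backend can pick any multiplier for
; them; the defined lanes <0, 0, 0, 1, 1> map to multipliers <1, 1, 1, 2, 2>,
; and we still expect a single pmullw.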
define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
; CHECK-LABEL: test2
; CHECK: pmullw
; CHECK-NEXT: ret

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
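; Note that the amounts <1, -1, 2, -3> include negative values, which are out
; of range for i32 and therefore leave those lanes undefined; a plausible
; equivalent multiply is:
;   %shl = mul <4 x i32> %a, <i32 2, i32 undef, i32 4, i32 undef>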

define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test3
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK-NEXT: ret

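; test4 uses only small in-range amounts; <0, 0, 1, 1> corresponds to the
; multiplier vector <1, 1, 2, 2>, and we still expect a single pmulld
; (or vpsllvd with AVX2) rather than a scalarized sequence.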
define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test4
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK-NEXT: ret

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
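; A rough sketch of the expected SSE lowering: type legalization splits the
; <16 x i16> operand into two <8 x i16> halves, and each half is then
; multiplied by <2, 2, 4, 8, 128, 1, 512, 2048> with its own pmullw.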

define <16 x i16> @test5(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
; CHECK-LABEL: test5
; SSE: pmullw
; SSE-NEXT: pmullw
; AVX2: vpmullw
; AVX2-NOT: vpmullw
; CHECK: ret

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
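; Likewise, without AVX2 the <8 x i32> operand is split into two <4 x i32>
; halves, each presumably multiplied by <2, 2, 4, 8> with its own pmulld;
; with AVX2 the whole shift fits in one 256-bit vpsllvd.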

define <8 x i32> @test6(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
; CHECK-LABEL: test6
; SSE: pmulld
; SSE-NEXT: pmulld
; AVX2: vpsllvd
; CHECK: ret

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2, instead, we split the shift into four
; parts and convert each part into a pmullw.
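; Sketch of the expected legalization: <32 x i16> is not a legal type on any
; of these targets, so it is split into 256-bit halves with AVX2 (two vpmullw)
; or 128-bit quarters with SSE2 (four pmullw), each multiplying by the same
; <2, 2, 4, 8, 128, 1, 512, 2048> pattern.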

define <32 x i16> @test7(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
; CHECK-LABEL: test7
; SSE: pmullw
; SSE-NEXT: pmullw
; SSE-NEXT: pmullw
; SSE-NEXT: pmullw
; AVX2: vpmullw
; AVX2-NEXT: vpmullw
; CHECK: ret

; Similar to test7; the difference is that with AVX512 support we produce a
; single vpsllvd instead of a pair of vpsllvd instructions (and, in test9
; below, a single vpsllvq instead of a pair).
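; Sketch: <16 x i32> is a legal type with AVX512F, so a single 512-bit
; vpsllvd should suffice; with AVX2 only, the vector splits into two 256-bit
; halves (two vpsllvd), and with SSE4.1 into four 128-bit pmulld.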

define <16 x i32> @test8(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK-LABEL: test8
; SSE: pmulld
; SSE-NEXT: pmulld
; SSE-NEXT: pmulld
; SSE-NEXT: pmulld
; AVX2ONLY: vpsllvd
; AVX2ONLY-NEXT: vpsllvd
; AVX512: vpsllvd
; AVX512-NOT: vpsllvd
; CHECK: ret

; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512F support.
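; With AVX2 the <8 x i64> shift splits into two 256-bit vpsllvq; with AVX512F
; it should be a single 512-bit vpsllvq. Presumably the SSE path scalarizes
; because SSE has neither a per-lane variable 64-bit shift nor a full packed
; 64-bit multiply to rewrite the shift into.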

define <8 x i64> @test9(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK-LABEL: test9
; AVX2ONLY: vpsllvq
; AVX2ONLY-NEXT: vpsllvq
; AVX512: vpsllvq
; AVX512-NOT: vpsllvq
; CHECK: ret
    135