; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512


; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce a SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
; CHECK-LABEL: test1
; CHECK: pmullw
; CHECK-NEXT: ret


define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
; CHECK-LABEL: test2
; CHECK: pmullw
; CHECK-NEXT: ret


; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test3
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK-NEXT: ret


define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK-LABEL: test4
; CHECK-NOT: cvttps2dq
; SSE: pmulld
; AVX2: vpsllvd
; CHECK-NEXT: ret


; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
define <16 x i16> @test5(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
; CHECK-LABEL: test5
; SSE: pmullw
; SSE-NEXT: pmullw
; AVX2: vpmullw
; AVX2-NOT: vpmullw
; CHECK: ret


; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
; CHECK-LABEL: test6
; SSE: pmulld
; SSE-NEXT: pmulld
; AVX2: vpsllvd
; CHECK: ret


; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift in four
; parts and then we convert each part into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
; CHECK-LABEL: test7
; SSE: pmullw
; SSE-NEXT: pmullw
; SSE-NEXT: pmullw
; SSE-NEXT: pmullw
; AVX2: vpmullw
; AVX2-NEXT: vpmullw
; CHECK: ret


; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
define <16 x i32> @test8(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK-LABEL: test8
; SSE: pmulld
; SSE-NEXT: pmulld
; SSE-NEXT: pmulld
; SSE-NEXT: pmulld
; AVX2ONLY: vpsllvd
; AVX2ONLY-NEXT: vpsllvd
; AVX512: vpsllvd
; AVX512-NOT: vpsllvd
; CHECK: ret


; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.

define <8 x i64> @test9(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK-LABEL: test9
; AVX2ONLY: vpsllvq
; AVX2ONLY-NEXT: vpsllvq
; AVX512: vpsllvq
; AVX512-NOT: vpsllvq
; CHECK: ret