; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test3:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
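; Note: shifting left by a constant amount N is the same as multiplying by
; 2^N, so the per-element amounts <1,1,2,3,7,0,9,11> used by test1, test5
; and test7 correspond to the multiplier vector <2,2,4,8,128,1,512,2048>
; that the pmullw lowering loads below.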

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test6:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift into four
; parts and then convert each part into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm4, %xmm1
; SSE-NEXT: pmullw %xmm4, %xmm2
; SSE-NEXT: pmullw %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair.
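; Note: without a variable 32-bit shift instruction, the <1,1,2,3> amounts
; are again rewritten as a multiply by <2,2,4,8> (pmulld on SSE4.1,
; pmuludq plus shuffles on SSE2), while AVX2/AVX512 feed the constant shift
; vector directly to vpsllvd.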

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm4, %xmm1
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Each element of the shift in 'test9' gets shifted separately and the results
; are blended together if we don't have AVX2/AVX512F support.
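; Note: in the SSE lowerings below the <1,1> halves become paddq (adding a
; value to itself is a shift left by one), and the <2,3> halves are built
; from two psllq with different immediates followed by a blend.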

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm2
; SSE2-NEXT: retq
;
; SSE41-LABEL: test9:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm2
; SSE41-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}