1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ 9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL 11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL 12 ; 13 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 15 16 ; 17 ; Variable Shifts 18 ; 19 20 define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 21 ; SSE2-LABEL: var_shift_v2i64: 22 ; SSE2: # %bb.0: 23 ; SSE2-NEXT: movdqa %xmm0, %xmm2 24 ; SSE2-NEXT: psllq %xmm1, %xmm2 25 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 26 ; SSE2-NEXT: psllq %xmm1, %xmm0 27 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 28 ; SSE2-NEXT: retq 29 ; 30 ; SSE41-LABEL: var_shift_v2i64: 31 ; SSE41: # %bb.0: 32 ; SSE41-NEXT: movdqa %xmm0, %xmm2 33 ; SSE41-NEXT: psllq %xmm1, %xmm2 34 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 35 ; SSE41-NEXT: psllq %xmm1, %xmm0 36 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 37 ; SSE41-NEXT: retq 38 ; 39 ; AVX1-LABEL: var_shift_v2i64: 40 ; AVX1: # %bb.0: 41 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 42 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 43 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 44 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 45 ; AVX1-NEXT: retq 46 ; 47 ; AVX2-LABEL: var_shift_v2i64: 48 ; AVX2: # %bb.0: 49 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 50 ; AVX2-NEXT: retq 51 ; 52 ; XOPAVX1-LABEL: var_shift_v2i64: 53 ; XOPAVX1: # %bb.0: 54 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 55 ; XOPAVX1-NEXT: retq 56 ; 57 ; XOPAVX2-LABEL: var_shift_v2i64: 58 ; XOPAVX2: # %bb.0: 59 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 60 ; XOPAVX2-NEXT: retq 61 ; 62 ; AVX512-LABEL: var_shift_v2i64: 63 ; AVX512: # %bb.0: 64 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 65 ; AVX512-NEXT: retq 66 ; 67 ; AVX512VL-LABEL: var_shift_v2i64: 68 ; AVX512VL: # %bb.0: 69 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 70 ; AVX512VL-NEXT: retq 71 ; 72 ; X32-SSE-LABEL: var_shift_v2i64: 73 ; X32-SSE: # %bb.0: 74 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 75 ; X32-SSE-NEXT: psllq %xmm1, %xmm2 76 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] 77 ; X32-SSE-NEXT: psllq %xmm1, %xmm0 78 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 79 ; X32-SSE-NEXT: retl 80 %shift = shl <2 x i64> %a, %b 81 ret <2 x i64> %shift 82 } 83 84 define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 85 ; SSE2-LABEL: var_shift_v4i32: 86 ; SSE2: # %bb.0: 87 ; SSE2-NEXT: pslld $23, %xmm1 88 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 89 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 90 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 91 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 92 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 93 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 94 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 95 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 96 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 97 ; SSE2-NEXT: retq 98 ; 99 ; SSE41-LABEL: var_shift_v4i32: 100 ; SSE41: # %bb.0: 101 ; SSE41-NEXT: pslld $23, %xmm1 102 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 103 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 104 ; SSE41-NEXT: pmulld %xmm1, %xmm0 105 ; SSE41-NEXT: retq 106 ; 107 ; AVX1-LABEL: var_shift_v4i32: 108 ; AVX1: # %bb.0: 109 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 110 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 111 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 112 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 113 ; AVX1-NEXT: retq 114 ; 115 ; AVX2-LABEL: var_shift_v4i32: 116 ; AVX2: # %bb.0: 117 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 118 ; AVX2-NEXT: retq 119 ; 120 ; XOPAVX1-LABEL: var_shift_v4i32: 121 ; XOPAVX1: # %bb.0: 122 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 123 ; XOPAVX1-NEXT: retq 124 ; 125 ; XOPAVX2-LABEL: var_shift_v4i32: 126 ; XOPAVX2: # %bb.0: 127 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 128 ; XOPAVX2-NEXT: retq 129 ; 130 ; AVX512-LABEL: var_shift_v4i32: 131 ; AVX512: # %bb.0: 132 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 133 ; AVX512-NEXT: retq 134 ; 135 ; AVX512VL-LABEL: var_shift_v4i32: 136 ; AVX512VL: # %bb.0: 137 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 138 
; AVX512VL-NEXT: retq 139 ; 140 ; X32-SSE-LABEL: var_shift_v4i32: 141 ; X32-SSE: # %bb.0: 142 ; X32-SSE-NEXT: pslld $23, %xmm1 143 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 144 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 145 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 146 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 147 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 148 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 149 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 150 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 151 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 152 ; X32-SSE-NEXT: retl 153 %shift = shl <4 x i32> %a, %b 154 ret <4 x i32> %shift 155 } 156 157 define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 158 ; SSE2-LABEL: var_shift_v8i16: 159 ; SSE2: # %bb.0: 160 ; SSE2-NEXT: pxor %xmm2, %xmm2 161 ; SSE2-NEXT: movdqa %xmm1, %xmm3 162 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 163 ; SSE2-NEXT: pslld $23, %xmm3 164 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 165 ; SSE2-NEXT: paddd %xmm4, %xmm3 166 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 167 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 168 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 169 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 170 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 171 ; SSE2-NEXT: pslld $23, %xmm1 172 ; SSE2-NEXT: paddd %xmm4, %xmm1 173 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 174 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 175 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 176 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 177 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 178 ; SSE2-NEXT: pmullw %xmm1, %xmm0 179 ; SSE2-NEXT: retq 180 ; 181 ; SSE41-LABEL: var_shift_v8i16: 182 ; SSE41: # %bb.0: 183 ; SSE41-NEXT: pxor %xmm2, 
%xmm2 184 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 185 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 186 ; SSE41-NEXT: pslld $23, %xmm1 187 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] 188 ; SSE41-NEXT: paddd %xmm2, %xmm1 189 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 190 ; SSE41-NEXT: pslld $23, %xmm3 191 ; SSE41-NEXT: paddd %xmm2, %xmm3 192 ; SSE41-NEXT: cvttps2dq %xmm3, %xmm2 193 ; SSE41-NEXT: packusdw %xmm1, %xmm2 194 ; SSE41-NEXT: pmullw %xmm2, %xmm0 195 ; SSE41-NEXT: retq 196 ; 197 ; AVX1-LABEL: var_shift_v8i16: 198 ; AVX1: # %bb.0: 199 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 200 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 201 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 202 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 203 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 204 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 205 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 206 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 207 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 208 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 209 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 210 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 211 ; AVX1-NEXT: retq 212 ; 213 ; AVX2-LABEL: var_shift_v8i16: 214 ; AVX2: # %bb.0: 215 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 216 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 217 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 218 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 219 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 220 ; AVX2-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 221 ; AVX2-NEXT: vzeroupper 222 ; AVX2-NEXT: retq 223 ; 224 ; XOP-LABEL: var_shift_v8i16: 225 ; XOP: # %bb.0: 226 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 227 ; XOP-NEXT: retq 228 ; 229 ; AVX512DQ-LABEL: var_shift_v8i16: 230 ; AVX512DQ: # %bb.0: 231 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 232 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 233 ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 234 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 235 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 236 ; AVX512DQ-NEXT: vzeroupper 237 ; AVX512DQ-NEXT: retq 238 ; 239 ; AVX512BW-LABEL: var_shift_v8i16: 240 ; AVX512BW: # %bb.0: 241 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 242 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 243 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 244 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 245 ; AVX512BW-NEXT: vzeroupper 246 ; AVX512BW-NEXT: retq 247 ; 248 ; AVX512DQVL-LABEL: var_shift_v8i16: 249 ; AVX512DQVL: # %bb.0: 250 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 251 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 252 ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 253 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 254 ; AVX512DQVL-NEXT: vzeroupper 255 ; AVX512DQVL-NEXT: retq 256 ; 257 ; AVX512BWVL-LABEL: var_shift_v8i16: 258 ; AVX512BWVL: # %bb.0: 259 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 260 ; AVX512BWVL-NEXT: retq 261 ; 262 ; X32-SSE-LABEL: var_shift_v8i16: 263 ; X32-SSE: # %bb.0: 264 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 265 ; X32-SSE-NEXT: movdqa %xmm1, 
%xmm3 266 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 267 ; X32-SSE-NEXT: pslld $23, %xmm3 268 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 269 ; X32-SSE-NEXT: paddd %xmm4, %xmm3 270 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3 271 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 272 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 273 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 274 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 275 ; X32-SSE-NEXT: pslld $23, %xmm1 276 ; X32-SSE-NEXT: paddd %xmm4, %xmm1 277 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 278 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 279 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 280 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 281 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 282 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0 283 ; X32-SSE-NEXT: retl 284 %shift = shl <8 x i16> %a, %b 285 ret <8 x i16> %shift 286 } 287 288 define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 289 ; SSE2-LABEL: var_shift_v16i8: 290 ; SSE2: # %bb.0: 291 ; SSE2-NEXT: psllw $5, %xmm1 292 ; SSE2-NEXT: pxor %xmm2, %xmm2 293 ; SSE2-NEXT: pxor %xmm3, %xmm3 294 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 295 ; SSE2-NEXT: movdqa %xmm3, %xmm4 296 ; SSE2-NEXT: pandn %xmm0, %xmm4 297 ; SSE2-NEXT: psllw $4, %xmm0 298 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 299 ; SSE2-NEXT: pand %xmm3, %xmm0 300 ; SSE2-NEXT: por %xmm4, %xmm0 301 ; SSE2-NEXT: paddb %xmm1, %xmm1 302 ; SSE2-NEXT: pxor %xmm3, %xmm3 303 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 304 ; SSE2-NEXT: movdqa %xmm3, %xmm4 305 ; SSE2-NEXT: pandn %xmm0, %xmm4 306 ; SSE2-NEXT: psllw $2, %xmm0 307 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 308 ; SSE2-NEXT: pand %xmm3, %xmm0 309 ; SSE2-NEXT: por %xmm4, %xmm0 310 ; SSE2-NEXT: paddb %xmm1, %xmm1 311 ; SSE2-NEXT: 
pcmpgtb %xmm1, %xmm2 312 ; SSE2-NEXT: movdqa %xmm2, %xmm1 313 ; SSE2-NEXT: pandn %xmm0, %xmm1 314 ; SSE2-NEXT: paddb %xmm0, %xmm0 315 ; SSE2-NEXT: pand %xmm2, %xmm0 316 ; SSE2-NEXT: por %xmm1, %xmm0 317 ; SSE2-NEXT: retq 318 ; 319 ; SSE41-LABEL: var_shift_v16i8: 320 ; SSE41: # %bb.0: 321 ; SSE41-NEXT: movdqa %xmm0, %xmm2 322 ; SSE41-NEXT: psllw $5, %xmm1 323 ; SSE41-NEXT: movdqa %xmm0, %xmm3 324 ; SSE41-NEXT: psllw $4, %xmm3 325 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 326 ; SSE41-NEXT: movdqa %xmm1, %xmm0 327 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 328 ; SSE41-NEXT: movdqa %xmm2, %xmm3 329 ; SSE41-NEXT: psllw $2, %xmm3 330 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 331 ; SSE41-NEXT: paddb %xmm1, %xmm1 332 ; SSE41-NEXT: movdqa %xmm1, %xmm0 333 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 334 ; SSE41-NEXT: movdqa %xmm2, %xmm3 335 ; SSE41-NEXT: paddb %xmm2, %xmm3 336 ; SSE41-NEXT: paddb %xmm1, %xmm1 337 ; SSE41-NEXT: movdqa %xmm1, %xmm0 338 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 339 ; SSE41-NEXT: movdqa %xmm2, %xmm0 340 ; SSE41-NEXT: retq 341 ; 342 ; AVX-LABEL: var_shift_v16i8: 343 ; AVX: # %bb.0: 344 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 345 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 346 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 347 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 348 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 349 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 350 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 351 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 352 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 353 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 354 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 355 ; AVX-NEXT: retq 356 ; 357 ; XOP-LABEL: var_shift_v16i8: 358 ; XOP: # %bb.0: 359 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 360 ; XOP-NEXT: retq 361 ; 362 ; AVX512DQ-LABEL: var_shift_v16i8: 363 ; AVX512DQ: # %bb.0: 364 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 365 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 366 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 367 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 368 ; AVX512DQ-NEXT: vzeroupper 369 ; AVX512DQ-NEXT: retq 370 ; 371 ; AVX512BW-LABEL: var_shift_v16i8: 372 ; AVX512BW: # %bb.0: 373 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 374 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 375 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 376 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 377 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 378 ; AVX512BW-NEXT: vzeroupper 379 ; AVX512BW-NEXT: retq 380 ; 381 ; AVX512DQVL-LABEL: var_shift_v16i8: 382 ; AVX512DQVL: # %bb.0: 383 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 384 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 385 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 386 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 387 ; AVX512DQVL-NEXT: vzeroupper 388 ; AVX512DQVL-NEXT: retq 389 ; 390 ; AVX512BWVL-LABEL: var_shift_v16i8: 391 ; AVX512BWVL: # %bb.0: 392 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 393 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 394 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 395 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 396 ; AVX512BWVL-NEXT: vzeroupper 397 ; AVX512BWVL-NEXT: retq 398 ; 399 ; X32-SSE-LABEL: var_shift_v16i8: 400 ; X32-SSE: # %bb.0: 401 ; X32-SSE-NEXT: psllw $5, %xmm1 402 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 403 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 404 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 405 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 406 ; X32-SSE-NEXT: pandn 
%xmm0, %xmm4 407 ; X32-SSE-NEXT: psllw $4, %xmm0 408 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 409 ; X32-SSE-NEXT: pand %xmm3, %xmm0 410 ; X32-SSE-NEXT: por %xmm4, %xmm0 411 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 412 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 413 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 414 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 415 ; X32-SSE-NEXT: pandn %xmm0, %xmm4 416 ; X32-SSE-NEXT: psllw $2, %xmm0 417 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 418 ; X32-SSE-NEXT: pand %xmm3, %xmm0 419 ; X32-SSE-NEXT: por %xmm4, %xmm0 420 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 421 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 422 ; X32-SSE-NEXT: movdqa %xmm2, %xmm1 423 ; X32-SSE-NEXT: pandn %xmm0, %xmm1 424 ; X32-SSE-NEXT: paddb %xmm0, %xmm0 425 ; X32-SSE-NEXT: pand %xmm2, %xmm0 426 ; X32-SSE-NEXT: por %xmm1, %xmm0 427 ; X32-SSE-NEXT: retl 428 %shift = shl <16 x i8> %a, %b 429 ret <16 x i8> %shift 430 } 431 432 ; 433 ; Uniform Variable Shifts 434 ; 435 436 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 437 ; SSE-LABEL: splatvar_shift_v2i64: 438 ; SSE: # %bb.0: 439 ; SSE-NEXT: psllq %xmm1, %xmm0 440 ; SSE-NEXT: retq 441 ; 442 ; AVX-LABEL: splatvar_shift_v2i64: 443 ; AVX: # %bb.0: 444 ; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 445 ; AVX-NEXT: retq 446 ; 447 ; XOP-LABEL: splatvar_shift_v2i64: 448 ; XOP: # %bb.0: 449 ; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0 450 ; XOP-NEXT: retq 451 ; 452 ; AVX512-LABEL: splatvar_shift_v2i64: 453 ; AVX512: # %bb.0: 454 ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 455 ; AVX512-NEXT: retq 456 ; 457 ; AVX512VL-LABEL: splatvar_shift_v2i64: 458 ; AVX512VL: # %bb.0: 459 ; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 460 ; AVX512VL-NEXT: retq 461 ; 462 ; X32-SSE-LABEL: splatvar_shift_v2i64: 463 ; X32-SSE: # %bb.0: 464 ; X32-SSE-NEXT: psllq %xmm1, %xmm0 465 ; X32-SSE-NEXT: retl 466 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 467 %shift = shl <2 x i64> %a, %splat 468 ret <2 x i64> %shift 469 } 470 471 define <4 x i32> 
@splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 472 ; SSE2-LABEL: splatvar_shift_v4i32: 473 ; SSE2: # %bb.0: 474 ; SSE2-NEXT: xorps %xmm2, %xmm2 475 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 476 ; SSE2-NEXT: pslld %xmm2, %xmm0 477 ; SSE2-NEXT: retq 478 ; 479 ; SSE41-LABEL: splatvar_shift_v4i32: 480 ; SSE41: # %bb.0: 481 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 482 ; SSE41-NEXT: pslld %xmm1, %xmm0 483 ; SSE41-NEXT: retq 484 ; 485 ; AVX-LABEL: splatvar_shift_v4i32: 486 ; AVX: # %bb.0: 487 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 488 ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 489 ; AVX-NEXT: retq 490 ; 491 ; XOP-LABEL: splatvar_shift_v4i32: 492 ; XOP: # %bb.0: 493 ; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 494 ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 495 ; XOP-NEXT: retq 496 ; 497 ; AVX512-LABEL: splatvar_shift_v4i32: 498 ; AVX512: # %bb.0: 499 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 500 ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 501 ; AVX512-NEXT: retq 502 ; 503 ; AVX512VL-LABEL: splatvar_shift_v4i32: 504 ; AVX512VL: # %bb.0: 505 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 506 ; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 507 ; AVX512VL-NEXT: retq 508 ; 509 ; X32-SSE-LABEL: splatvar_shift_v4i32: 510 ; X32-SSE: # %bb.0: 511 ; X32-SSE-NEXT: xorps %xmm2, %xmm2 512 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 513 ; X32-SSE-NEXT: pslld %xmm2, %xmm0 514 ; X32-SSE-NEXT: retl 515 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 516 %shift = shl <4 x i32> %a, %splat 517 ret <4 x i32> %shift 518 } 519 520 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 521 ; SSE2-LABEL: splatvar_shift_v8i16: 522 ; SSE2: # %bb.0: 523 ; SSE2-NEXT: pextrw $0, %xmm1, %eax 524 ; SSE2-NEXT: movd %eax, %xmm1 525 ; SSE2-NEXT: psllw %xmm1, %xmm0 526 ; SSE2-NEXT: retq 527 ; 528 ; SSE41-LABEL: 
splatvar_shift_v8i16: 529 ; SSE41: # %bb.0: 530 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 531 ; SSE41-NEXT: psllw %xmm1, %xmm0 532 ; SSE41-NEXT: retq 533 ; 534 ; AVX-LABEL: splatvar_shift_v8i16: 535 ; AVX: # %bb.0: 536 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 537 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 538 ; AVX-NEXT: retq 539 ; 540 ; XOP-LABEL: splatvar_shift_v8i16: 541 ; XOP: # %bb.0: 542 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 543 ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 544 ; XOP-NEXT: retq 545 ; 546 ; AVX512-LABEL: splatvar_shift_v8i16: 547 ; AVX512: # %bb.0: 548 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 549 ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 550 ; AVX512-NEXT: retq 551 ; 552 ; AVX512VL-LABEL: splatvar_shift_v8i16: 553 ; AVX512VL: # %bb.0: 554 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 555 ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 556 ; AVX512VL-NEXT: retq 557 ; 558 ; X32-SSE-LABEL: splatvar_shift_v8i16: 559 ; X32-SSE: # %bb.0: 560 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 561 ; X32-SSE-NEXT: movd %eax, %xmm1 562 ; X32-SSE-NEXT: psllw %xmm1, %xmm0 563 ; X32-SSE-NEXT: retl 564 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 565 %shift = shl <8 x i16> %a, %splat 566 ret <8 x i16> %shift 567 } 568 569 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 570 ; SSE2-LABEL: splatvar_shift_v16i8: 571 ; SSE2: # %bb.0: 572 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 573 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 574 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 575 ; SSE2-NEXT: psllw $5, %xmm2 576 ; SSE2-NEXT: pxor %xmm1, %xmm1 577 ; SSE2-NEXT: pxor %xmm3, %xmm3 578 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 579 ; SSE2-NEXT: movdqa %xmm3, %xmm4 580 ; 
SSE2-NEXT: pandn %xmm0, %xmm4 581 ; SSE2-NEXT: psllw $4, %xmm0 582 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 583 ; SSE2-NEXT: pand %xmm3, %xmm0 584 ; SSE2-NEXT: por %xmm4, %xmm0 585 ; SSE2-NEXT: paddb %xmm2, %xmm2 586 ; SSE2-NEXT: pxor %xmm3, %xmm3 587 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 588 ; SSE2-NEXT: movdqa %xmm3, %xmm4 589 ; SSE2-NEXT: pandn %xmm0, %xmm4 590 ; SSE2-NEXT: psllw $2, %xmm0 591 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 592 ; SSE2-NEXT: pand %xmm3, %xmm0 593 ; SSE2-NEXT: por %xmm4, %xmm0 594 ; SSE2-NEXT: paddb %xmm2, %xmm2 595 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 596 ; SSE2-NEXT: movdqa %xmm1, %xmm2 597 ; SSE2-NEXT: pandn %xmm0, %xmm2 598 ; SSE2-NEXT: paddb %xmm0, %xmm0 599 ; SSE2-NEXT: pand %xmm1, %xmm0 600 ; SSE2-NEXT: por %xmm2, %xmm0 601 ; SSE2-NEXT: retq 602 ; 603 ; SSE41-LABEL: splatvar_shift_v16i8: 604 ; SSE41: # %bb.0: 605 ; SSE41-NEXT: movdqa %xmm0, %xmm2 606 ; SSE41-NEXT: pxor %xmm0, %xmm0 607 ; SSE41-NEXT: pshufb %xmm0, %xmm1 608 ; SSE41-NEXT: psllw $5, %xmm1 609 ; SSE41-NEXT: movdqa %xmm1, %xmm3 610 ; SSE41-NEXT: paddb %xmm1, %xmm3 611 ; SSE41-NEXT: movdqa %xmm2, %xmm4 612 ; SSE41-NEXT: psllw $4, %xmm4 613 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 614 ; SSE41-NEXT: movdqa %xmm1, %xmm0 615 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2 616 ; SSE41-NEXT: movdqa %xmm2, %xmm1 617 ; SSE41-NEXT: psllw $2, %xmm1 618 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 619 ; SSE41-NEXT: movdqa %xmm3, %xmm0 620 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 621 ; SSE41-NEXT: movdqa %xmm2, %xmm1 622 ; SSE41-NEXT: paddb %xmm2, %xmm1 623 ; SSE41-NEXT: paddb %xmm3, %xmm3 624 ; SSE41-NEXT: movdqa %xmm3, %xmm0 625 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 626 ; SSE41-NEXT: movdqa %xmm2, %xmm0 627 ; SSE41-NEXT: retq 628 ; 629 ; AVX1-LABEL: splatvar_shift_v16i8: 630 ; AVX1: # %bb.0: 631 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 632 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 633 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 634 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 635 ; AVX1-NEXT: vpsllw $4, %xmm0, 
%xmm3 636 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 637 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 638 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 639 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 640 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 641 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 642 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 643 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 644 ; AVX1-NEXT: retq 645 ; 646 ; AVX2-LABEL: splatvar_shift_v16i8: 647 ; AVX2: # %bb.0: 648 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 649 ; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 650 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 651 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 652 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 653 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 654 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 655 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 656 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 657 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 658 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 659 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 660 ; AVX2-NEXT: retq 661 ; 662 ; XOPAVX1-LABEL: splatvar_shift_v16i8: 663 ; XOPAVX1: # %bb.0: 664 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 665 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 666 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 667 ; XOPAVX1-NEXT: retq 668 ; 669 ; XOPAVX2-LABEL: splatvar_shift_v16i8: 670 ; XOPAVX2: # %bb.0: 671 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 672 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 673 ; XOPAVX2-NEXT: retq 674 ; 675 ; AVX512DQ-LABEL: splatvar_shift_v16i8: 676 ; AVX512DQ: # %bb.0: 677 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 678 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 679 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 680 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 681 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 682 ; AVX512DQ-NEXT: vzeroupper 683 ; AVX512DQ-NEXT: retq 684 ; 685 ; AVX512BW-LABEL: splatvar_shift_v16i8: 686 ; AVX512BW: # %bb.0: 687 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 688 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 689 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 690 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 691 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 692 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 693 ; AVX512BW-NEXT: vzeroupper 694 ; AVX512BW-NEXT: retq 695 ; 696 ; AVX512DQVL-LABEL: splatvar_shift_v16i8: 697 ; AVX512DQVL: # %bb.0: 698 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 699 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 700 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 701 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 702 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 703 ; AVX512DQVL-NEXT: vzeroupper 704 ; AVX512DQVL-NEXT: retq 705 ; 706 ; AVX512BWVL-LABEL: splatvar_shift_v16i8: 707 ; AVX512BWVL: # %bb.0: 708 ; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 709 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 710 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 711 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 712 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 713 ; AVX512BWVL-NEXT: vzeroupper 714 ; AVX512BWVL-NEXT: retq 715 ; 716 ; X32-SSE-LABEL: splatvar_shift_v16i8: 717 ; X32-SSE: # %bb.0: 718 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 719 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 720 ; 
X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = shl <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

;
; Constant Shifts
;

; Left-shift a <2 x i64> by the non-uniform constant vector <1, 7>.
; The SSE and AVX1 expectations perform two immediate shifts (by 1 and by 7)
; and blend the lanes; the AVX2, XOP and AVX512 expectations use a single
; variable-shift instruction with a memory operand.
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq $1, %xmm1
; SSE2-NEXT: psllq $7, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $7, %xmm1
; SSE41-NEXT: psllq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $1, %xmm1
; X32-SSE-NEXT: psllq $7, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
  %shift = shl <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

; Left-shift a <4 x i32> by the non-uniform constant vector <4, 5, 6, 7>.
; The SSE2 and 32-bit expectations multiply by [16,32,64,128] (1 << amount per
; lane) via pmuludq; SSE4.1 and AVX1 use a single multiply with a memory
; operand; the remaining runs use a variable-shift instruction.
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

; Left-shift a <8 x i16> by the non-uniform constant vector <0, 1, ..., 7>.
; Most expectations use a 16-bit multiply (pmullw/vpmullw) with a memory
; operand; the XOP and AVX512BW-based runs use variable word shifts instead.
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: constant_shift_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: constant_shift_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

; Left-shift a <16 x i8> by the non-uniform constant vector
; <0, 1, ..., 7, 7, ..., 1, 0>.
; The SSE and AVX1 expectations widen the bytes to words, multiply by the
; powers of two [1,2,4,...,128,128,...,2,1], mask with 255 and re-pack; the
; AVX512 runs extend to wider lanes, use a variable shift and truncate; XOP
; uses a single vpshlb.
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i8:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; X32-SSE-NEXT: pmullw %xmm2, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X32-SSE-NEXT: pand %xmm2, %xmm3
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pmullw %xmm1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: packuswb %xmm3, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;

; Left-shift every lane of a <2 x i64> by the same constant, 7.
; Every run is expected to emit a single immediate shift (psllq/vpsllq $7).
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: psllq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpsllq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllq $7, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

; Left-shift every lane of a <4 x i32> by the same constant, 5.
; Every run is expected to emit a single immediate shift (pslld/vpslld $5).
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $5, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

; Left-shift every lane of an <8 x i16> by the same constant, 3.
; Every run is expected to emit a single immediate shift (psllw/vpsllw $3).
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

; Left-shift every lane of a <16 x i8> by the same constant, 3.
; Except for XOP (a single vpshlb), the expectations use a 16-bit immediate
; shift (psllw/vpsllw $3) followed by a pand/vpand with a memory operand.
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}