; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI

define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}

define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}

define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %r8d
; SSE3-NEXT:    pextrw $1, %xmm1, %r9d
; SSE3-NEXT:    pextrw $2, %xmm1, %r10d
; SSE3-NEXT:    pextrw $3, %xmm1, %esi
; SSE3-NEXT:    pextrw $4, %xmm1, %edi
; SSE3-NEXT:    pextrw $5, %xmm1, %eax
; SSE3-NEXT:    pextrw $6, %xmm1, %ecx
; SSE3-NEXT:    pextrw $7, %xmm1, %edx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $7, %r8d
; SSE3-NEXT:    andl $7, %r9d
; SSE3-NEXT:    andl $7, %r10d
; SSE3-NEXT:    andl $7, %esi
; SSE3-NEXT:    andl $7, %edi
; SSE3-NEXT:    andl $7, %eax
; SSE3-NEXT:    andl $7, %ecx
; SSE3-NEXT:    andl $7, %edx
; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVXNOVLBW-LABEL: var_shuffle_v8i16:
; AVXNOVLBW:       # %bb.0:
; AVXNOVLBW-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpaddw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVXNOVLBW-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <8 x i16> %indices, i32 0
  %index1 = extractelement <8 x i16> %indices, i32 1
  %index2 = extractelement <8 x i16> %indices, i32 2
  %index3 = extractelement <8 x i16> %indices, i32 3
  %index4 = extractelement <8 x i16> %indices, i32 4
  %index5 = extractelement <8 x i16> %indices, i32 5
  %index6 = extractelement <8 x i16> %indices, i32 6
  %index7 = extractelement <8 x i16> %indices, i32 7
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
  ret <8 x i16> %ret7
}

define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2f64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %ret0 = insertelement <2 x double> undef, double %v0, i32 0
  %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
  ret <2 x double> %ret1
}

define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x float> %v, i32 %index0
  %v1 = extractelement <4 x float> %v, i32 %index1
  %v2 = extractelement <4 x float> %v, i32 %index2
  %v3 = extractelement <4 x float> %v, i32 %index3
  %ret0 = insertelement <4 x float> undef, float %v0, i32 0
  %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
  ret <4 x float> %ret3
}

define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    movq %rsp, %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    andq $-32, %rsp
; SSE3-NEXT:    subq $608, %rsp # imm = 0x260
; SSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT:    andl $31, %r9d
; SSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
; SSE3-NEXT:    movd %ebx, %xmm8
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    andl $31, %edx
; SSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    andl $31, %ecx
; SSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    andl $31, %esi
; SSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    andl $31, %edi
; SSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    andl $31, %r8d
; SSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    andl $31, %r10d
; SSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    andl $31, %r13d
; SSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    andl $31, %r12d
; SSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    andl $31, %r15d
; SSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    andl $31, %r14d
; SSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    andl $31, %r11d
; SSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    leaq -40(%rbp), %rsp
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    movq %rsp, %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r13
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    andq $-32, %rsp
; SSSE3-NEXT:    subq $608, %rsp # imm = 0x260
; SSSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT:    andl $31, %r9d
; SSSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
; SSSE3-NEXT:    movd %ebx, %xmm8
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm15
; SSSE3-NEXT:    andl $31, %edx
; SSSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm9
; SSSE3-NEXT:    andl $31, %ecx
; SSSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    andl $31, %esi
; SSSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm10
; SSSE3-NEXT:    andl $31, %edi
; SSSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
; SSSE3-NEXT:    movd %eax, %xmm7
; SSSE3-NEXT:    andl $31, %r8d
; SSSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
; SSSE3-NEXT:    movd %eax, %xmm11
; SSSE3-NEXT:    andl $31, %r10d
; SSSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
; SSSE3-NEXT:    movd %eax, %xmm6
; SSSE3-NEXT:    andl $31, %r13d
; SSSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
; SSSE3-NEXT:    movd %eax, %xmm12
; SSSE3-NEXT:    andl $31, %r12d
; SSSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
; SSSE3-NEXT:    movd %eax, %xmm5
; SSSE3-NEXT:    andl $31, %r15d
; SSSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
; SSSE3-NEXT:    movd %eax, %xmm13
; SSSE3-NEXT:    andl $31, %r14d
; SSSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
; SSSE3-NEXT:    movd %eax, %xmm4
; SSSE3-NEXT:    andl $31, %r11d
; SSSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
; SSSE3-NEXT:    movd %eax, %xmm14
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT:    leaq -40(%rbp), %rsp
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r13
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pushq %rbp
; SSE41-NEXT:    movq %rsp, %rbp
; SSE41-NEXT:    andq $-32, %rsp
; SSE41-NEXT:    subq $544, %rsp # imm = 0x220
; SSE41-NEXT:    pextrb $0, %xmm2, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, (%rsp)
; SSE41-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pextrb $1, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $1, 448(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $2, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $2, 416(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $3, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $3, 384(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $4, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $4, 352(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $5, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $5, 320(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $6, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $6, 288(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $7, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $7, 256(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $8, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $8, 224(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $9, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $9, 192(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $10, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $10, 160(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $11, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $11, 128(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $12, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $12, 96(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $13, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $13, 64(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $14, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $14, 32(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $15, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $15, (%rsp,%rax), %xmm0
; SSE41-NEXT:    movq %rbp, %rsp
; SSE41-NEXT:    popq %rbp
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VLVBMI-NEXT:    vzeroupper
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}