; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

;
; Unary shuffle indices from registers
;

define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double undef, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
  ret <4 x double> %r3
}

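; Note: with a 128-bit source the spill below is only 16 bytes, so it can go
; at a fixed offset below %rsp (in the red zone) and no stack realignment is
; needed, unlike the 32-byte %ymm spills above, where pushq %rbp / andq $-32,
; %rsp realign the stack so the aligned vmovaps store is legal.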
define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %x2 = extractelement <2 x double> %x, i64 %i2
  %x3 = extractelement <2 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

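; Note: elements 2 and 3 are constant zero here, so the upper 128 bits are
; materialized with vpxor rather than loaded from the spill slot.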
define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

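; Note: the i32 indices must be sign-extended to 64 bits (movslq) before
; they can be used as scaled offsets into the spill slot. With AVX2, each
; f32 element is instead selected with a variable cross-lane permute
; (vpermps) per index, avoiding the spill entirely.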
define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movslq %esi, %rsi
; AVX1-NEXT:    movslq %edx, %rdx
; AVX1-NEXT:    movslq %ecx, %r11
; AVX1-NEXT:    movslq %r8d, %r10
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq %r9d, %r8
; AVX1-NEXT:    movslq 16(%rbp), %rdi
; AVX1-NEXT:    movslq 24(%rbp), %rcx
; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
; AVX2-NEXT:    vmovd %edx, %xmm3
; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT:    vmovd %ecx, %xmm4
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
; AVX2-NEXT:    vmovd %r8d, %xmm5
; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
; AVX2-NEXT:    vmovd %r9d, %xmm6
; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
; AVX2-NEXT:    vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
; AVX2-NEXT:    vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 %i0
  %x1 = extractelement <8 x float> %x, i32 %i1
  %x2 = extractelement <8 x float> %x, i32 %i2
  %x3 = extractelement <8 x float> %x, i32 %i3
  %x4 = extractelement <8 x float> %x, i32 %i4
  %x5 = extractelement <8 x float> %x, i32 %i5
  %x6 = extractelement <8 x float> %x, i32 %i6
  %x7 = extractelement <8 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

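; Note: with a <4 x float> source, AVX1 and AVX2 currently produce identical
; code (no vpermps is formed), so a single set of ALL checks covers both
; RUN lines.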
define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL:       # BB#0:
; ALL-NEXT:    movslq %edi, %rax
; ALL-NEXT:    movslq %esi, %rsi
; ALL-NEXT:    movslq %edx, %rdx
; ALL-NEXT:    movslq %ecx, %r11
; ALL-NEXT:    movslq %r8d, %r10
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    movslq %r9d, %r8
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rdi
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rcx
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %x4 = extractelement <4 x float> %x, i32 %i4
  %x5 = extractelement <4 x float> %x, i32 %i5
  %x6 = extractelement <4 x float> %x, i32 %i6
  %x7 = extractelement <4 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

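; Note: there is no variable i16 permute on AVX1/AVX2, so every lane is a
; zero-extended word load (movzwl) from the spill slot followed by vpinsrw;
; the ten indices beyond the six register parameters are reloaded from the
; caller's frame relative to %rbp.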
define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq 32(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq 40(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 48(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 56(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 64(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 72(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 80(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 88(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq 16(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq 24(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    movslq 32(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq 40(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 48(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 56(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 64(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 72(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 80(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 88(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq 16(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq 24(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <16 x i16> %x, i32 %i0
  %x1 = extractelement <16 x i16> %x, i32 %i1
  %x2 = extractelement <16 x i16> %x, i32 %i2
  %x3 = extractelement <16 x i16> %x, i32 %i3
  %x4 = extractelement <16 x i16> %x, i32 %i4
  %x5 = extractelement <16 x i16> %x, i32 %i5
  %x6 = extractelement <16 x i16> %x, i32 %i6
  %x7 = extractelement <16 x i16> %x, i32 %i7
  %x8 = extractelement <16 x i16> %x, i32 %i8
  %x9 = extractelement <16 x i16> %x, i32 %i9
  %x10 = extractelement <16 x i16> %x, i32 %i10
  %x11 = extractelement <16 x i16> %x, i32 %i11
  %x12 = extractelement <16 x i16> %x, i32 %i12
  %x13 = extractelement <16 x i16> %x, i32 %i13
  %x14 = extractelement <16 x i16> %x, i32 %i14
  %x15 = extractelement <16 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

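; Note: same pattern with a 128-bit source: the 16-byte spill lands in the
; red zone, no frame pointer is established, and the stack-passed indices
; are addressed relative to %rsp instead of %rbp.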
define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <8 x i16> %x, i32 %i0
  %x1 = extractelement <8 x i16> %x, i32 %i1
  %x2 = extractelement <8 x i16> %x, i32 %i2
  %x3 = extractelement <8 x i16> %x, i32 %i3
  %x4 = extractelement <8 x i16> %x, i32 %i4
  %x5 = extractelement <8 x i16> %x, i32 %i5
  %x6 = extractelement <8 x i16> %x, i32 %i6
  %x7 = extractelement <8 x i16> %x, i32 %i7
  %x8 = extractelement <8 x i16> %x, i32 %i8
  %x9 = extractelement <8 x i16> %x, i32 %i9
  %x10 = extractelement <8 x i16> %x, i32 %i10
  %x11 = extractelement <8 x i16> %x, i32 %i11
  %x12 = extractelement <8 x i16> %x, i32 %i12
  %x13 = extractelement <8 x i16> %x, i32 %i13
  %x14 = extractelement <8 x i16> %x, i32 %i14
  %x15 = extractelement <8 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

;
; Unary shuffle indices from memory
;

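; Note: the four indices are first loaded as scalars (movq 0/8/16/24(%rdi));
; the extract-from-spill-slot sequence is then the same as when the indices
; arrive in registers.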
define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

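; Note: as above, but the <2 x i64> source needs only a 16-byte spill below
; %rsp, avoiding the frame setup and 32-byte stack realignment.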
define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}