; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP

; Tests x86-64 lowering of shufflevector operations whose results are
; odd-sized (non-power-of-two) vectors -- <3 x i64>, <5 x i16>, <7 x i8>,
; <12 x i32>, etc. -- built from power-of-two inputs, across the SSE2,
; SSE4.2, AVX, AVX2 (slow/fast variable-shuffle) and XOP feature levels.
; The CHECK lines are machine-generated; regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.

; Two <2 x i64> inputs -> <3 x i64> store, mask <0,2,1>.
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
; SSE2-LABEL: v3i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v3i64:
; SSE42: # %bb.0:
; SSE42-NEXT: pextrq $1, %xmm0, 16(%rdi)
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v3i64:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX-NEXT: vpextrq $1, %xmm0, 16(%rdi)
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v3i64:
; XOP: # %bb.0:
; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; XOP-NEXT: vpextrq $1, %xmm0, 16(%rdi)
; XOP-NEXT: vmovdqa %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x i64> %r, <3 x i64>* %p
  ret void
}
; Same <0,2,1> mask as @v3i64 but with floating-point element type.
define void @v3f64(<2 x double> %a, <2 x double> %b, <3 x double>* %p) nounwind {
; SSE-LABEL: v3f64:
; SSE: # %bb.0:
; SSE-NEXT: movhpd %xmm0, 16(%rdi)
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movapd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: v3f64:
; AVX: # %bb.0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovhpd %xmm0, 16(%rdi)
; AVX-NEXT: vmovapd %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v3f64:
; XOP: # %bb.0:
; XOP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; XOP-NEXT: vmovhpd %xmm0, 16(%rdi)
; XOP-NEXT: vmovapd %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x double> %r, <3 x double>* %p
  ret void
}

; Two <2 x i32> inputs -> <3 x i32> store, mask <0,2,1>.
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-LABEL: v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movd %xmm2, 8(%rdi)
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v3i32:
; SSE42: # %bb.0:
; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT: movlps %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v3i32:
; AVX: # %bb.0:
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovlps %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v3i32:
; XOP: # %bb.0:
; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovlps %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x i32> %r, <3 x i32>* %p
  ret void
}

; Two <4 x i16> inputs -> <5 x i16> store, mask <0,5,1,6,3>.
define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-LABEL: v5i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: movw %ax, 8(%rdi)
; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i16:
; SSE42: # %bb.0:
; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE42-NEXT: pextrw $6, %xmm0, 8(%rdi)
; SSE42-NEXT: movq %xmm2, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v5i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: v5i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v5i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v5i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[4,5],xmm1[4,5],xmm0[6,7],xmm1[6,7]
; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x i16> %r, <5 x i16>* %p
  ret void
}

; Two <4 x i32> inputs -> <5 x i32> store, mask <0,5,1,6,3>.
define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
; SSE2-LABEL: v5i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movd %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE42-NEXT: pextrd $3, %xmm0, 16(%rdi)
; SSE42-NEXT: movdqa %xmm2, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v5i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpextrd $3, %xmm0, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v5i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpextrd $3, %xmm0, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; XOP-LABEL: v5i32:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7],xmm0[4,5,6,7],xmm1[8,9,10,11]
; XOP-NEXT: vpextrd $3, %xmm0, 16(%rdi)
; XOP-NEXT: vmovdqa %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x i32> %r, <5 x i32>* %p
  ret void
}

; Same <0,5,1,6,3> mask as @v5i32 but with floating-point element type.
define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; SSE2-LABEL: v5f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movss %xmm0, 16(%rdi)
; SSE2-NEXT: movaps %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5f32:
; SSE42: # %bb.0:
; SSE42-NEXT: extractps $3, %xmm0, 16(%rdi)
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,2]
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE42-NEXT: movaps %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v5f32:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
; AVX-NEXT: vmovaps %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v5f32:
; XOP: # %bb.0:
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi)
; XOP-NEXT: vmovaps %xmm1, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x float> %r, <5 x float>* %p
  ret void
}

; Two <4 x i8> inputs -> <7 x i8> store, mask <0,6,3,6,1,7,4> (note the
; repeated element 6 and the awkward 7-byte store split 4+2+1).
define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-LABEL: v7i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: movb %al, 6(%rdi)
; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: pextrw $2, %xmm0, %eax
; SSE2-NEXT: movw %ax, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi)
; SSE42-NEXT: movd %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v7i8:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; XOP-NEXT: vmovd %xmm0, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i8> %r, <7 x i8>* %p
  ret void
}

; Two <4 x i16> inputs -> <7 x i16> store, mask <0,6,3,6,1,7,4>.
define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-LABEL: v7i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movw %ax, 12(%rdi)
; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i16:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi)
; SSE42-NEXT: movq %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v7i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i16> %r, <7 x i16>* %p
  ret void
}


; Two <4 x i32> inputs -> <7 x i32> store, mask <0,6,3,6,1,7,4>; AVX2 can
; use a single cross-lane vpermps.
define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE2-LABEL: v7i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movd %xmm1, 24(%rdi)
; SSE2-NEXT: movq %xmm0, 16(%rdi)
; SSE2-NEXT: movdqa %xmm3, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i32:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE42-NEXT: movd %xmm1, 24(%rdi)
; SSE42-NEXT: movq %xmm0, 16(%rdi)
; SSE42-NEXT: movdqa %xmm2, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v7i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; AVX1-NEXT: vmovss %xmm1, 24(%rdi)
; AVX1-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX1-NEXT: vmovaps %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v7i32:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: v7i32:
; XOP: # %bb.0:
; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; XOP-NEXT: vmovss %xmm1, 24(%rdi)
; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
; XOP-NEXT: vmovaps %xmm2, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i32> %r, <7 x i32>* %p
  ret void
}

; Interleave of two <8 x i8> inputs into a <12 x i8> (3-element groups
; a[i], a[i+4], b[i]); stored as 8 + 4 bytes.
define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-LABEL: v12i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT: movq %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v12i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v12i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <8 x i8> %a, <8 x i8> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p
  ret void
}

; Interleave of two <8 x i16> inputs into a <12 x i16>, same mask pattern
; as @v12i8; stored as 16 + 8 bytes.
define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE2-LABEL: v12i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm3, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i16:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; SSE42-NEXT: movdqa %xmm0, (%rdi)
; SSE42-NEXT: movq %xmm3, 16(%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v12i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: v12i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v12i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi)
; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,8,9],xmm1[0,1],xmm0[2,3,10,11],xmm1[2,3],xmm0[4,5,12,13]
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[4,5],xmm0[6,7,14,15],xmm1[6,7],xmm0[8,9,10,11,12,13,14,15]
; XOP-NEXT: vmovq %xmm0, 16(%rdi)
; XOP-NEXT: vmovdqa %xmm2, (%rdi)
; XOP-NEXT: retq
  %r = shufflevector <8 x i16> %a, <8 x i16> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i16> %r, <12 x i16>* %p
  ret void
}

; Interleave of two <8 x i32> inputs into a <12 x i32>, same mask pattern
; as @v12i8; stored as 32 + 16 bytes (or 16x3 for SSE).
define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-LABEL: v12i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2]
; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: movaps %xmm0, 32(%rdi)
; SSE2-NEXT: movaps %xmm4, 16(%rdi)
; SSE2-NEXT: movaps %xmm3, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; SSE42-NEXT: movdqa %xmm0, 32(%rdi)
; SSE42-NEXT: movdqa %xmm4, 16(%rdi)
; SSE42-NEXT: movdqa %xmm3, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v12i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: v12i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v12i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
; XOP-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; XOP-NEXT: vmovaps %xmm0, 32(%rdi)
; XOP-NEXT: vmovaps %ymm2, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %r, <12 x i32>* %p
  ret void
}

; PR29025: three <4 x i8> sources interleaved (via two concatenating
; shuffles) into a <12 x i8> store.
define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
; SSE2-LABEL: pr29025:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: pr29025:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: pshufb %xmm3, %xmm0
; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT: pshufb %xmm3, %xmm2
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT: movq %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: pr29025:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: pr29025:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
  %s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}

; De-interleave: loads 24 bytes from %p and (per the visible asm) stores
; three 8-byte results to %q1/%q2/%q3 (%rsi, %rdx, %rcx).
; NOTE(review): this definition is truncated at the end of the visible
; chunk (mid SSE42 check block); the remainder lies outside this view.
define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
; SSE2-LABEL: interleave_24i8_out:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm3
; SSE2-NEXT: movq %xmm3, (%rsi)
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
; SSE2-NEXT: packuswb %xmm0, %xmm3
; SSE2-NEXT: movq %xmm3, (%rdx)
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i8_out:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rdi), %xmm0
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm2, %xmm3
; SSE42-NEXT: movq %xmm3, (%rsi)
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 =
xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] 823 ; SSE42-NEXT: por %xmm2, %xmm3 824 ; SSE42-NEXT: movq %xmm3, (%rdx) 825 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] 826 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 827 ; SSE42-NEXT: por %xmm1, %xmm0 828 ; SSE42-NEXT: movq %xmm0, (%rcx) 829 ; SSE42-NEXT: retq 830 ; 831 ; AVX-LABEL: interleave_24i8_out: 832 ; AVX: # %bb.0: 833 ; AVX-NEXT: vmovdqu (%rdi), %xmm0 834 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 835 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] 836 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] 837 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 838 ; AVX-NEXT: vmovq %xmm2, (%rsi) 839 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] 840 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 841 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 842 ; AVX-NEXT: vmovq %xmm2, (%rdx) 843 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] 844 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 845 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 846 ; AVX-NEXT: vmovq %xmm0, (%rcx) 847 ; AVX-NEXT: retq 848 ; 849 ; XOP-LABEL: interleave_24i8_out: 850 ; XOP: # %bb.0: 851 ; XOP-NEXT: vmovdqu (%rdi), %xmm0 852 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 853 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] 854 ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] 855 ; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2 856 ; XOP-NEXT: vmovq %xmm2, (%rsi) 857 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] 858 ; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 859 ; 
XOP-NEXT: vpor %xmm2, %xmm3, %xmm2 860 ; XOP-NEXT: vmovq %xmm2, (%rdx) 861 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] 862 ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 863 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 864 ; XOP-NEXT: vmovq %xmm0, (%rcx) 865 ; XOP-NEXT: retq 866 %wide.vec = load <24 x i8>, <24 x i8>* %p, align 4 867 %s1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 868 %s2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 869 %s3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 870 store <8 x i8> %s1, <8 x i8>* %q1, align 4 871 store <8 x i8> %s2, <8 x i8>* %q2, align 4 872 store <8 x i8> %s3, <8 x i8>* %q3, align 4 873 ret void 874 } 875 876 define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind { 877 ; SSE2-LABEL: interleave_24i8_in: 878 ; SSE2: # %bb.0: 879 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 880 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 881 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 882 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 883 ; SSE2-NEXT: pxor %xmm2, %xmm2 884 ; SSE2-NEXT: movdqa %xmm1, %xmm3 885 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 886 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] 887 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] 888 ; SSE2-NEXT: pand %xmm5, %xmm4 889 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 890 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm1[0,1,3,3,4,5,6,7] 891 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] 892 ; SSE2-NEXT: pandn %xmm2, %xmm5 893 ; SSE2-NEXT: por %xmm4, %xmm5 894 ; SSE2-NEXT: movdqa %xmm3, %xmm2 895 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 896 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 897 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] 898 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] 899 ; SSE2-NEXT: packuswb %xmm5, %xmm2 900 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] 901 ; SSE2-NEXT: pand %xmm4, %xmm2 902 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] 903 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,3,4,5,6,7] 904 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] 905 ; SSE2-NEXT: pandn %xmm5, %xmm4 906 ; SSE2-NEXT: por %xmm2, %xmm4 907 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 908 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 909 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] 910 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 911 ; SSE2-NEXT: packuswb %xmm0, %xmm1 912 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] 913 ; SSE2-NEXT: pand %xmm2, %xmm1 914 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 915 ; SSE2-NEXT: pandn %xmm0, %xmm2 916 ; SSE2-NEXT: por %xmm1, %xmm2 917 ; SSE2-NEXT: movq %xmm2, 16(%rdi) 918 ; SSE2-NEXT: movdqu %xmm4, (%rdi) 919 ; SSE2-NEXT: retq 920 ; 921 ; SSE42-LABEL: interleave_24i8_in: 922 ; SSE42: # %bb.0: 923 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 924 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 925 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 926 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 927 ; SSE42-NEXT: movdqa %xmm0, %xmm2 928 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = 
xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] 929 ; SSE42-NEXT: movdqa %xmm1, %xmm3 930 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero 931 ; SSE42-NEXT: por %xmm2, %xmm3 932 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] 933 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] 934 ; SSE42-NEXT: por %xmm0, %xmm1 935 ; SSE42-NEXT: movq %xmm1, 16(%rdi) 936 ; SSE42-NEXT: movdqu %xmm3, (%rdi) 937 ; SSE42-NEXT: retq 938 ; 939 ; AVX-LABEL: interleave_24i8_in: 940 ; AVX: # %bb.0: 941 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 942 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 943 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 944 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 945 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] 946 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero 947 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 948 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] 949 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] 950 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 951 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) 952 ; AVX-NEXT: vmovdqu %xmm2, (%rdi) 953 ; AVX-NEXT: retq 954 ; 955 ; XOP-LABEL: interleave_24i8_in: 956 ; XOP: # %bb.0: 957 ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 958 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 959 ; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 960 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 961 ; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] 962 ; XOP-NEXT: vpshufb 
{{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero 963 ; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2 964 ; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] 965 ; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] 966 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 967 ; XOP-NEXT: vmovq %xmm0, 16(%rdi) 968 ; XOP-NEXT: vmovdqu %xmm2, (%rdi) 969 ; XOP-NEXT: retq 970 %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 971 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 972 %s3 = load <8 x i8>, <8 x i8>* %q3, align 4 973 %t1 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 974 %t2 = shufflevector <8 x i8> %s3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 975 %interleaved = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 976 store <24 x i8> %interleaved, <24 x i8>* %p, align 4 977 ret void 978 } 979 980 981 define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind { 982 ; SSE2-LABEL: interleave_24i16_out: 983 ; SSE2: # %bb.0: 984 ; SSE2-NEXT: movdqu (%rdi), %xmm3 985 ; SSE2-NEXT: movdqu 16(%rdi), %xmm2 986 ; SSE2-NEXT: movdqu 32(%rdi), %xmm8 987 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] 988 ; SSE2-NEXT: movdqa %xmm3, %xmm4 989 ; SSE2-NEXT: pand %xmm1, %xmm4 990 ; SSE2-NEXT: pandn %xmm2, %xmm1 991 ; SSE2-NEXT: por %xmm4, %xmm1 992 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 993 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,6,5,6,7] 994 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 995 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 996 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 997 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] 998 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 999 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] 1000 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] 1001 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] 1002 ; SSE2-NEXT: movdqa %xmm4, %xmm5 1003 ; SSE2-NEXT: pandn %xmm2, %xmm5 1004 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1005 ; SSE2-NEXT: pand %xmm4, %xmm6 1006 ; SSE2-NEXT: por %xmm5, %xmm6 1007 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7] 1008 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 1009 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 1010 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] 1011 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,7] 1012 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] 1013 ; SSE2-NEXT: pand %xmm6, %xmm5 1014 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7] 1015 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] 1016 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] 1017 ; SSE2-NEXT: movdqa %xmm6, %xmm0 1018 ; SSE2-NEXT: pandn %xmm7, %xmm0 1019 ; SSE2-NEXT: por %xmm5, %xmm0 1020 ; SSE2-NEXT: pand %xmm4, %xmm2 1021 ; SSE2-NEXT: pandn %xmm3, %xmm4 1022 ; SSE2-NEXT: por %xmm2, %xmm4 1023 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] 1024 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 1025 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 1026 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] 1027 ; SSE2-NEXT: pand %xmm6, %xmm2 1028 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7] 1029 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] 1030 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,4,5] 1031 ; 
SSE2-NEXT: pandn %xmm3, %xmm6 1032 ; SSE2-NEXT: por %xmm2, %xmm6 1033 ; SSE2-NEXT: movups %xmm1, (%rsi) 1034 ; SSE2-NEXT: movdqu %xmm0, (%rdx) 1035 ; SSE2-NEXT: movdqu %xmm6, (%rcx) 1036 ; SSE2-NEXT: retq 1037 ; 1038 ; SSE42-LABEL: interleave_24i16_out: 1039 ; SSE42: # %bb.0: 1040 ; SSE42-NEXT: movdqu (%rdi), %xmm0 1041 ; SSE42-NEXT: movdqu 16(%rdi), %xmm1 1042 ; SSE42-NEXT: movdqu 32(%rdi), %xmm2 1043 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] 1044 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] 1045 ; SSE42-NEXT: movdqa %xmm0, %xmm4 1046 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] 1047 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] 1048 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7] 1049 ; SSE42-NEXT: movdqa %xmm2, %xmm3 1050 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] 1051 ; SSE42-NEXT: movdqa %xmm0, %xmm5 1052 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7] 1053 ; SSE42-NEXT: pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15] 1054 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7] 1055 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] 1056 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] 1057 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] 1058 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 1059 ; SSE42-NEXT: movdqu %xmm4, (%rsi) 1060 ; SSE42-NEXT: movdqu %xmm5, (%rdx) 1061 ; SSE42-NEXT: movdqu %xmm1, (%rcx) 1062 ; SSE42-NEXT: retq 1063 ; 1064 ; AVX1-LABEL: interleave_24i16_out: 1065 ; AVX1: # %bb.0: 1066 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm0 1067 ; AVX1-NEXT: vmovdqu (%rdi), %ymm1 1068 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1069 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] 1070 ; 
AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] 1071 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] 1072 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 1073 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] 1074 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] 1075 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15] 1076 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] 1077 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 1078 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] 1079 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] 1080 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] 1081 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] 1082 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi) 1083 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx) 1084 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx) 1085 ; AVX1-NEXT: vzeroupper 1086 ; AVX1-NEXT: retq 1087 ; 1088 ; AVX2-LABEL: interleave_24i16_out: 1089 ; AVX2: # %bb.0: 1090 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 1091 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 1092 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1093 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1094 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] 1095 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11] 1096 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1097 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1098 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] 1099 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] 1100 
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1101 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1102 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] 1103 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] 1104 ; AVX2-NEXT: vmovdqu %xmm2, (%rsi) 1105 ; AVX2-NEXT: vmovdqu %xmm3, (%rdx) 1106 ; AVX2-NEXT: vmovdqu %xmm0, (%rcx) 1107 ; AVX2-NEXT: vzeroupper 1108 ; AVX2-NEXT: retq 1109 ; 1110 ; XOP-LABEL: interleave_24i16_out: 1111 ; XOP: # %bb.0: 1112 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm0 1113 ; XOP-NEXT: vmovdqu (%rdi), %ymm1 1114 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1115 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] 1116 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11] 1117 ; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] 1118 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13] 1119 ; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3] 1120 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15] 1121 ; XOP-NEXT: vmovdqu %xmm3, (%rsi) 1122 ; XOP-NEXT: vmovdqu %xmm4, (%rdx) 1123 ; XOP-NEXT: vmovdqu %xmm0, (%rcx) 1124 ; XOP-NEXT: vzeroupper 1125 ; XOP-NEXT: retq 1126 %wide.vec = load <24 x i16>, <24 x i16>* %p, align 4 1127 %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 1128 %s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 1129 %s3 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 1130 store <8 x i16> %s1, <8 x i16>* %q1, align 4 1131 store <8 x i16> %s2, <8 x 
i16>* %q2, align 4 1132 store <8 x i16> %s3, <8 x i16>* %q3, align 4 1133 ret void 1134 } 1135 1136 define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind { 1137 ; SSE2-LABEL: interleave_24i16_in: 1138 ; SSE2: # %bb.0: 1139 ; SSE2-NEXT: movdqu (%rsi), %xmm3 1140 ; SSE2-NEXT: movdqu (%rdx), %xmm2 1141 ; SSE2-NEXT: movdqu (%rcx), %xmm1 1142 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,3] 1143 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] 1144 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1145 ; SSE2-NEXT: pandn %xmm4, %xmm5 1146 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,3,3] 1147 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] 1148 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1149 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 1150 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] 1151 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] 1152 ; SSE2-NEXT: pand %xmm0, %xmm3 1153 ; SSE2-NEXT: por %xmm5, %xmm3 1154 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1155 ; SSE2-NEXT: pandn %xmm4, %xmm5 1156 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,3,4,5,6,7] 1157 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1158 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 1159 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,3,2,0,4,5,6,7] 1160 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] 1161 ; SSE2-NEXT: pand %xmm0, %xmm2 1162 ; SSE2-NEXT: por %xmm5, %xmm2 1163 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] 1164 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 1165 ; SSE2-NEXT: pand %xmm5, %xmm1 1166 ; SSE2-NEXT: pandn %xmm6, %xmm5 1167 ; SSE2-NEXT: por %xmm1, %xmm5 1168 ; SSE2-NEXT: pand %xmm0, %xmm5 1169 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,7] 1170 ; SSE2-NEXT: pandn %xmm1, %xmm0 1171 ; SSE2-NEXT: por %xmm5, %xmm0 1172 ; 
SSE2-NEXT: movdqu %xmm0, 16(%rdi) 1173 ; SSE2-NEXT: movdqu %xmm2, 32(%rdi) 1174 ; SSE2-NEXT: movdqu %xmm3, (%rdi) 1175 ; SSE2-NEXT: retq 1176 ; 1177 ; SSE42-LABEL: interleave_24i16_in: 1178 ; SSE42: # %bb.0: 1179 ; SSE42-NEXT: movdqu (%rsi), %xmm0 1180 ; SSE42-NEXT: movdqu (%rdx), %xmm1 1181 ; SSE42-NEXT: movdqu (%rcx), %xmm2 1182 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] 1183 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,3,3] 1184 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1185 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] 1186 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,3] 1187 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7] 1188 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] 1189 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] 1190 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,3,4,5,6,7] 1191 ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] 1192 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] 1193 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1194 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,6,7,4,5,8,9,10,11,10,11,12,13,14,15] 1195 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] 1196 ; SSE42-NEXT: movdqu %xmm4, 32(%rdi) 1197 ; SSE42-NEXT: movdqu %xmm3, 16(%rdi) 1198 ; SSE42-NEXT: movdqu %xmm5, (%rdi) 1199 ; SSE42-NEXT: retq 1200 ; 1201 ; AVX1-LABEL: interleave_24i16_in: 1202 ; AVX1: # %bb.0: 1203 ; AVX1-NEXT: vmovdqu (%rsi), %xmm0 1204 ; AVX1-NEXT: vmovdqu (%rdx), %xmm1 1205 ; AVX1-NEXT: vmovdqu (%rcx), %xmm2 1206 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] 1207 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7] 1208 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] 1209 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] 1210 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] 1211 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 1212 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1213 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] 1214 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,0,3] 1215 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] 1216 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 1217 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1218 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] 1219 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] 1220 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] 1221 ; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) 1222 ; AVX1-NEXT: vmovups %ymm3, (%rdi) 1223 ; AVX1-NEXT: vzeroupper 1224 ; AVX1-NEXT: retq 1225 ; 1226 ; AVX2-LABEL: interleave_24i16_in: 1227 ; AVX2: # %bb.0: 1228 ; AVX2-NEXT: vmovdqu (%rsi), %xmm0 1229 ; AVX2-NEXT: vmovdqu (%rdx), %xmm1 1230 ; AVX2-NEXT: vmovdqu (%rcx), %xmm2 1231 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 1232 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23] 1233 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] 1234 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] 1235 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] 1236 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2> 1237 ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4 1238 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] 1239 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 1240 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1241 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] 1242 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] 1243 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] 1244 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi) 1245 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi) 1246 ; AVX2-NEXT: vzeroupper 1247 ; AVX2-NEXT: retq 1248 ; 1249 ; XOP-LABEL: interleave_24i16_in: 1250 ; XOP: # %bb.0: 1251 ; XOP-NEXT: vmovdqu (%rsi), %xmm0 1252 ; XOP-NEXT: vmovdqu (%rdx), %xmm1 1253 ; XOP-NEXT: vmovdqu (%rcx), %xmm2 1254 ; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[4,5,6,7],xmm1[6,7],xmm0[6,7,8,9],xmm1[8,9],xmm0[8,9,10,11] 1255 ; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] 1256 ; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] 1257 ; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1258 ; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11] 1259 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 1260 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15] 1261 ; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] 1262 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] 1263 ; XOP-NEXT: vmovdqu %xmm0, 32(%rdi) 1264 ; XOP-NEXT: vmovups %ymm3, (%rdi) 1265 ; XOP-NEXT: vzeroupper 1266 ; XOP-NEXT: retq 1267 %s1 = load <8 x i16>, <8 x i16>* %q1, align 4 1268 %s2 = load <8 x i16>, <8 x i16>* %q2, align 4 1269 %s3 = load <8 x i16>, <8 x i16>* %q3, align 4 1270 %t1 = shufflevector <8 x i16> %s1, <8 x i16> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 
3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %s3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i16> %interleaved, <24 x i16>* %p, align 4
  ret void
}

; Deinterleave a stride-3 <24 x i32> loaded from %p into its three stride-3
; subsequences (elements 0,3,6,..., 1,4,7,..., 2,5,8,...), stored to %q1, %q2
; and %q3 respectively. Exercises lowering of 3-way i32 deinterleave shuffles
; on each target; the check lines below are autogenerated and must match llc
; output exactly.
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
; SSE2-NEXT: movups 80(%rdi), %xmm9
; SSE2-NEXT: movups 64(%rdi), %xmm10
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm11
; SSE2-NEXT: movups 32(%rdi), %xmm8
; SSE2-NEXT: movups 48(%rdi), %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
; SSE2-NEXT: movaps %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
; SSE2-NEXT: movaps %xmm0, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: movaps %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
; SSE2-NEXT: movups %xmm3, 16(%rsi)
; SSE2-NEXT: movups %xmm5, (%rsi)
; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
; SSE2-NEXT: movups %xmm7, 16(%rcx)
; SSE2-NEXT: movups %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqu 80(%rdi), %xmm9
; SSE42-NEXT: movdqu 64(%rdi), %xmm10
; SSE42-NEXT: movdqu (%rdi), %xmm4
; SSE42-NEXT: movdqu 16(%rdi), %xmm2
; SSE42-NEXT: movdqu 32(%rdi), %xmm11
; SSE42-NEXT: movdqu 48(%rdi), %xmm5
; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,1]
; SSE42-NEXT: movdqa %xmm2, %xmm7
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
; SSE42-NEXT: blendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
; SSE42-NEXT: movdqa %xmm10, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm10[2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,2]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,0,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,3],xmm6[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE42-NEXT: movdqu %xmm3, 16(%rsi)
; SSE42-NEXT: movups %xmm4, (%rsi)
; SSE42-NEXT: movdqu %xmm5, 16(%rdx)
; SSE42-NEXT: movdqu %xmm7, (%rdx)
; SSE42-NEXT: movdqu %xmm2, 16(%rcx)
; SSE42-NEXT: movdqu %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_out:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: vmovups 64(%rdi), %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; AVX1-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX1-NEXT: vmovups %ymm4, (%rsi)
; AVX1-NEXT: vmovups %ymm5, (%rdx)
; AVX1-NEXT: vmovups %ymm0, (%rcx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: interleave_24i32_out:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx)
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: interleave_24i32_out:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi)
; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx)
; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_out:
; XOP: # %bb.0:
; XOP-NEXT: vmovups (%rdi), %ymm0
; XOP-NEXT: vmovups 32(%rdi), %ymm1
; XOP-NEXT: vmovups 64(%rdi), %ymm2
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm3
; XOP-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; XOP-NEXT: vextractf128 $1, %ymm5, %xmm6
; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; XOP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT: vextractf128 $1, %ymm6, %xmm7
; XOP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; XOP-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; XOP-NEXT: vmovups %ymm4, (%rsi)
; XOP-NEXT: vmovups %ymm5, (%rdx)
; XOP-NEXT: vmovups %ymm0, (%rcx)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %wide.vec = load <24 x i32>, <24 x i32>* %p, align 4
  %s1 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i32> %s1, <8 x i32>* %q1, align 4
  store <8 x i32> %s2, <8 x i32>* %q2, align 4
  store <8 x i32> %s3, <8 x i32>* %q3, align 4
  ret void
}

; Inverse of the test above: load three <8 x i32> vectors from %q1/%q2/%q3 and
; interleave them element-by-element (a0,b0,c0,a1,b1,c1,...) into a single
; <24 x i32> stored to %p. Exercises lowering of 3-way i32 interleave shuffles.
define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_in:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rsi), %xmm5
; SSE2-NEXT: movdqu 16(%rsi), %xmm2
; SSE2-NEXT: movdqu (%rdx), %xmm6
; SSE2-NEXT: movdqu 16(%rdx), %xmm1
; SSE2-NEXT: movups (%rcx), %xmm7
; SSE2-NEXT: movups 16(%rcx), %xmm4
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE2-NEXT: movaps %xmm7, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
; SSE2-NEXT: movaps %xmm7, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2]
; SSE2-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm5[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
; SSE2-NEXT: movaps %xmm4, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm6[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2]
; SSE2-NEXT: movaps %xmm4, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2]
; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; SSE2-NEXT: movups %xmm2, 80(%rdi)
; SSE2-NEXT: movups %xmm7, 64(%rdi)
; SSE2-NEXT: movups %xmm6, 48(%rdi)
; SSE2-NEXT: movups %xmm5, 32(%rdi)
; SSE2-NEXT: movups %xmm3, 16(%rdi)
; SSE2-NEXT: movups %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_in:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rsi), %xmm5
; SSE42-NEXT: movdqu 16(%rsi), %xmm2
; SSE42-NEXT: movdqu (%rdx), %xmm6
; SSE42-NEXT: movdqu 16(%rdx), %xmm1
; SSE42-NEXT: movdqu (%rcx), %xmm7
; SSE42-NEXT: movdqu 16(%rcx), %xmm4
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7]
; SSE42-NEXT: movdqu %xmm1, 80(%rdi)
; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
; SSE42-NEXT: movdqu %xmm6, 48(%rdi)
; SSE42-NEXT: movdqu %xmm5, 32(%rdi)
; SSE42-NEXT: movdqu %xmm3, 16(%rdi)
; SSE42-NEXT: movdqu %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_in:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rsi), %ymm0
; AVX1-NEXT: vmovups (%rdx), %ymm1
; AVX1-NEXT: vmovupd (%rcx), %ymm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm4, 64(%rdi)
; AVX1-NEXT: vmovups %ymm3, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: interleave_24i32_in:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi)
; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: interleave_24i32_in:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0
; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1
; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2
; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi)
; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_in:
; XOP: # %bb.0:
; XOP-NEXT: vmovups (%rsi), %ymm0
; XOP-NEXT: vmovups (%rdx), %ymm1
; XOP-NEXT: vmovupd (%rcx), %ymm2
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT: vextractf128 $1, %ymm2, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5]
; XOP-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT: vmovups %ymm0, 32(%rdi)
; XOP-NEXT: vmovups %ymm4, 64(%rdi)
; XOP-NEXT: vmovups %ymm3, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
  %s2 = load <8 x i32>, <8 x i32>* %q2, align 4
  %s3 = load <8 x i32>, <8 x i32>* %q3, align 4
  %t1 = shufflevector <8 x i32> %s1, <8 x i32> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %s3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i32> %interleaved, <24 x i32>* %p, align 4
  ret void
}

; Broadcast lane 0 of %A to all eight lanes, store the result, then reload it
; twice and extract lanes <2,0> (both equal to the broadcast value). Checks
; that the store/reload round trip does not confuse shuffle combining into
; picking elements in the wrong order (hence the test name).
define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE2-LABEL: wrongorder:
; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, 48(%rdi)
; SSE2-NEXT: movaps %xmm0, 32(%rdi)
; SSE2-NEXT: movaps %xmm0, 16(%rdi)
; SSE2-NEXT: movaps %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: wrongorder:
; SSE42: # %bb.0:
; SSE42-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE42-NEXT: movapd %xmm0, 48(%rdi)
; SSE42-NEXT: movapd %xmm0, 32(%rdi)
; SSE42-NEXT: movapd %xmm0, 16(%rdi)
; SSE42-NEXT: movapd %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: wrongorder:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm1, (%rdi)
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: wrongorder:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT: vmovapd %ymm1, (%rdi)
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: wrongorder:
; XOP: # %bb.0:
; XOP-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
; XOP-NEXT: vmovaps %ymm1, (%rdi)
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %shuffle = shufflevector <4 x double> %A, <4 x double> %A, <8 x i32> zeroinitializer
  store <8 x double> %shuffle, <8 x double>* %P, align 64
  %m2 = load <8 x double>, <8 x double>* %P, align 64
  store <8 x double> %m2, <8 x double>* %P, align 64
  %m3 = load <8 x double>, <8 x double>* %P, align 64
  %m4 = shufflevector <8 x double> %m3, <8 x double> undef, <2 x i32> <i32 2, i32 0>
  ret <2 x double> %m4
}