; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.
13 14 target triple = "x86_64-unknown-unknown" 15 16 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) 17 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) 18 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) 19 20 define <4 x i32> @combine_pshufd1(<4 x i32> %a) { 21 ; ALL-LABEL: combine_pshufd1: 22 ; ALL: # %bb.0: # %entry 23 ; ALL-NEXT: retq 24 entry: 25 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 26 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 27 ret <4 x i32> %c 28 } 29 30 define <4 x i32> @combine_pshufd2(<4 x i32> %a) { 31 ; ALL-LABEL: combine_pshufd2: 32 ; ALL: # %bb.0: # %entry 33 ; ALL-NEXT: retq 34 entry: 35 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 36 %b.cast = bitcast <4 x i32> %b to <8 x i16> 37 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) 38 %c.cast = bitcast <8 x i16> %c to <4 x i32> 39 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 40 ret <4 x i32> %d 41 } 42 43 define <4 x i32> @combine_pshufd3(<4 x i32> %a) { 44 ; ALL-LABEL: combine_pshufd3: 45 ; ALL: # %bb.0: # %entry 46 ; ALL-NEXT: retq 47 entry: 48 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 49 %b.cast = bitcast <4 x i32> %b to <8 x i16> 50 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) 51 %c.cast = bitcast <8 x i16> %c to <4 x i32> 52 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 53 ret <4 x i32> %d 54 } 55 56 define <4 x i32> @combine_pshufd4(<4 x i32> %a) { 57 ; SSE-LABEL: combine_pshufd4: 58 ; SSE: # %bb.0: # %entry 59 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 60 ; SSE-NEXT: retq 61 ; 62 ; AVX-LABEL: combine_pshufd4: 63 ; AVX: # %bb.0: # %entry 64 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 65 ; AVX-NEXT: retq 66 entry: 67 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 68 %b.cast = bitcast <4 x i32> %b to <8 x i16> 69 %c = call <8 x i16> 
@llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) 70 %c.cast = bitcast <8 x i16> %c to <4 x i32> 71 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 72 ret <4 x i32> %d 73 } 74 75 define <4 x i32> @combine_pshufd5(<4 x i32> %a) { 76 ; SSE-LABEL: combine_pshufd5: 77 ; SSE: # %bb.0: # %entry 78 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 79 ; SSE-NEXT: retq 80 ; 81 ; AVX-LABEL: combine_pshufd5: 82 ; AVX: # %bb.0: # %entry 83 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 84 ; AVX-NEXT: retq 85 entry: 86 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 87 %b.cast = bitcast <4 x i32> %b to <8 x i16> 88 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) 89 %c.cast = bitcast <8 x i16> %c to <4 x i32> 90 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) 91 ret <4 x i32> %d 92 } 93 94 define <4 x i32> @combine_pshufd6(<4 x i32> %a) { 95 ; SSE-LABEL: combine_pshufd6: 96 ; SSE: # %bb.0: # %entry 97 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 98 ; SSE-NEXT: retq 99 ; 100 ; AVX1-LABEL: combine_pshufd6: 101 ; AVX1: # %bb.0: # %entry 102 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 103 ; AVX1-NEXT: retq 104 ; 105 ; AVX2-LABEL: combine_pshufd6: 106 ; AVX2: # %bb.0: # %entry 107 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 108 ; AVX2-NEXT: retq 109 entry: 110 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) 111 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) 112 ret <4 x i32> %c 113 } 114 115 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { 116 ; ALL-LABEL: combine_pshuflw1: 117 ; ALL: # %bb.0: # %entry 118 ; ALL-NEXT: retq 119 entry: 120 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 121 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 122 ret <8 x i16> %c 123 } 124 125 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { 126 ; ALL-LABEL: combine_pshuflw2: 127 ; ALL: # %bb.0: # %entry 128 ; ALL-NEXT: retq 129 
entry: 130 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 131 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 132 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 133 ret <8 x i16> %d 134 } 135 136 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { 137 ; SSE-LABEL: combine_pshuflw3: 138 ; SSE: # %bb.0: # %entry 139 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 140 ; SSE-NEXT: retq 141 ; 142 ; AVX-LABEL: combine_pshuflw3: 143 ; AVX: # %bb.0: # %entry 144 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 145 ; AVX-NEXT: retq 146 entry: 147 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 148 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 149 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 150 ret <8 x i16> %d 151 } 152 153 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { 154 ; SSE-LABEL: combine_pshufhw1: 155 ; SSE: # %bb.0: # %entry 156 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 157 ; SSE-NEXT: retq 158 ; 159 ; AVX-LABEL: combine_pshufhw1: 160 ; AVX: # %bb.0: # %entry 161 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 162 ; AVX-NEXT: retq 163 entry: 164 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) 165 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 166 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 167 ret <8 x i16> %d 168 } 169 170 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 171 ; SSE-LABEL: combine_bitwise_ops_test1: 172 ; SSE: # %bb.0: 173 ; SSE-NEXT: pand %xmm1, %xmm0 174 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 175 ; SSE-NEXT: retq 176 ; 177 ; AVX-LABEL: combine_bitwise_ops_test1: 178 ; AVX: # %bb.0: 179 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 180 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 181 ; AVX-NEXT: retq 182 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 
3> 183 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 184 %and = and <4 x i32> %shuf1, %shuf2 185 ret <4 x i32> %and 186 } 187 188 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 189 ; SSE-LABEL: combine_bitwise_ops_test2: 190 ; SSE: # %bb.0: 191 ; SSE-NEXT: por %xmm1, %xmm0 192 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 193 ; SSE-NEXT: retq 194 ; 195 ; AVX-LABEL: combine_bitwise_ops_test2: 196 ; AVX: # %bb.0: 197 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 198 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 199 ; AVX-NEXT: retq 200 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 201 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 202 %or = or <4 x i32> %shuf1, %shuf2 203 ret <4 x i32> %or 204 } 205 206 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 207 ; SSE-LABEL: combine_bitwise_ops_test3: 208 ; SSE: # %bb.0: 209 ; SSE-NEXT: pxor %xmm1, %xmm0 210 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 211 ; SSE-NEXT: retq 212 ; 213 ; AVX-LABEL: combine_bitwise_ops_test3: 214 ; AVX: # %bb.0: 215 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 216 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 217 ; AVX-NEXT: retq 218 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 219 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 220 %xor = xor <4 x i32> %shuf1, %shuf2 221 ret <4 x i32> %xor 222 } 223 224 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 225 ; SSE-LABEL: combine_bitwise_ops_test4: 226 ; SSE: # %bb.0: 227 ; SSE-NEXT: pand %xmm1, %xmm0 228 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 229 ; SSE-NEXT: retq 230 ; 231 ; AVX-LABEL: combine_bitwise_ops_test4: 232 ; AVX: # %bb.0: 233 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 234 ; AVX-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[0,2,1,3] 235 ; AVX-NEXT: retq 236 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 237 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 238 %and = and <4 x i32> %shuf1, %shuf2 239 ret <4 x i32> %and 240 } 241 242 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 243 ; SSE-LABEL: combine_bitwise_ops_test5: 244 ; SSE: # %bb.0: 245 ; SSE-NEXT: por %xmm1, %xmm0 246 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 247 ; SSE-NEXT: retq 248 ; 249 ; AVX-LABEL: combine_bitwise_ops_test5: 250 ; AVX: # %bb.0: 251 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 252 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 253 ; AVX-NEXT: retq 254 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 255 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 256 %or = or <4 x i32> %shuf1, %shuf2 257 ret <4 x i32> %or 258 } 259 260 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 261 ; SSE-LABEL: combine_bitwise_ops_test6: 262 ; SSE: # %bb.0: 263 ; SSE-NEXT: pxor %xmm1, %xmm0 264 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 265 ; SSE-NEXT: retq 266 ; 267 ; AVX-LABEL: combine_bitwise_ops_test6: 268 ; AVX: # %bb.0: 269 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 270 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 271 ; AVX-NEXT: retq 272 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 273 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 274 %xor = xor <4 x i32> %shuf1, %shuf2 275 ret <4 x i32> %xor 276 } 277 278 279 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles 280 ; are not performing a swizzle operations. 
281 282 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 283 ; SSE2-LABEL: combine_bitwise_ops_test1b: 284 ; SSE2: # %bb.0: 285 ; SSE2-NEXT: pand %xmm1, %xmm0 286 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 287 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 288 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 289 ; SSE2-NEXT: retq 290 ; 291 ; SSSE3-LABEL: combine_bitwise_ops_test1b: 292 ; SSSE3: # %bb.0: 293 ; SSSE3-NEXT: pand %xmm1, %xmm0 294 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 295 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 296 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 297 ; SSSE3-NEXT: retq 298 ; 299 ; SSE41-LABEL: combine_bitwise_ops_test1b: 300 ; SSE41: # %bb.0: 301 ; SSE41-NEXT: andps %xmm1, %xmm0 302 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 303 ; SSE41-NEXT: retq 304 ; 305 ; AVX-LABEL: combine_bitwise_ops_test1b: 306 ; AVX: # %bb.0: 307 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 308 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 309 ; AVX-NEXT: retq 310 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 311 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 312 %and = and <4 x i32> %shuf1, %shuf2 313 ret <4 x i32> %and 314 } 315 316 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 317 ; SSE2-LABEL: combine_bitwise_ops_test2b: 318 ; SSE2: # %bb.0: 319 ; SSE2-NEXT: por %xmm1, %xmm0 320 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 321 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 322 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 323 ; SSE2-NEXT: retq 324 ; 325 ; SSSE3-LABEL: combine_bitwise_ops_test2b: 326 ; SSSE3: # %bb.0: 327 ; SSSE3-NEXT: por %xmm1, %xmm0 328 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 329 ; SSSE3-NEXT: pshufd {{.*#+}} 
xmm1 = xmm2[1,3,2,3] 330 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 331 ; SSSE3-NEXT: retq 332 ; 333 ; SSE41-LABEL: combine_bitwise_ops_test2b: 334 ; SSE41: # %bb.0: 335 ; SSE41-NEXT: orps %xmm1, %xmm0 336 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 337 ; SSE41-NEXT: retq 338 ; 339 ; AVX-LABEL: combine_bitwise_ops_test2b: 340 ; AVX: # %bb.0: 341 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 342 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 343 ; AVX-NEXT: retq 344 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 345 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 346 %or = or <4 x i32> %shuf1, %shuf2 347 ret <4 x i32> %or 348 } 349 350 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 351 ; SSE2-LABEL: combine_bitwise_ops_test3b: 352 ; SSE2: # %bb.0: 353 ; SSE2-NEXT: xorps %xmm1, %xmm0 354 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 355 ; SSE2-NEXT: retq 356 ; 357 ; SSSE3-LABEL: combine_bitwise_ops_test3b: 358 ; SSSE3: # %bb.0: 359 ; SSSE3-NEXT: xorps %xmm1, %xmm0 360 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 361 ; SSSE3-NEXT: retq 362 ; 363 ; SSE41-LABEL: combine_bitwise_ops_test3b: 364 ; SSE41: # %bb.0: 365 ; SSE41-NEXT: xorps %xmm1, %xmm0 366 ; SSE41-NEXT: xorps %xmm1, %xmm1 367 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 368 ; SSE41-NEXT: retq 369 ; 370 ; AVX-LABEL: combine_bitwise_ops_test3b: 371 ; AVX: # %bb.0: 372 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 373 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 374 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 375 ; AVX-NEXT: retq 376 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 377 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 378 %xor = xor <4 x i32> %shuf1, %shuf2 379 ret <4 x i32> %xor 380 } 381 382 define 
<4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 383 ; SSE2-LABEL: combine_bitwise_ops_test4b: 384 ; SSE2: # %bb.0: 385 ; SSE2-NEXT: pand %xmm1, %xmm0 386 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 387 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 388 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 389 ; SSE2-NEXT: retq 390 ; 391 ; SSSE3-LABEL: combine_bitwise_ops_test4b: 392 ; SSSE3: # %bb.0: 393 ; SSSE3-NEXT: pand %xmm1, %xmm0 394 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 395 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 396 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 397 ; SSSE3-NEXT: retq 398 ; 399 ; SSE41-LABEL: combine_bitwise_ops_test4b: 400 ; SSE41: # %bb.0: 401 ; SSE41-NEXT: andps %xmm1, %xmm0 402 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 403 ; SSE41-NEXT: retq 404 ; 405 ; AVX-LABEL: combine_bitwise_ops_test4b: 406 ; AVX: # %bb.0: 407 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 408 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 409 ; AVX-NEXT: retq 410 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 411 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 412 %and = and <4 x i32> %shuf1, %shuf2 413 ret <4 x i32> %and 414 } 415 416 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 417 ; SSE2-LABEL: combine_bitwise_ops_test5b: 418 ; SSE2: # %bb.0: 419 ; SSE2-NEXT: por %xmm1, %xmm0 420 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 421 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 422 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 423 ; SSE2-NEXT: retq 424 ; 425 ; SSSE3-LABEL: combine_bitwise_ops_test5b: 426 ; SSSE3: # %bb.0: 427 ; SSSE3-NEXT: por %xmm1, %xmm0 428 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 429 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[0,2,2,3] 430 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 431 ; SSSE3-NEXT: retq 432 ; 433 ; SSE41-LABEL: combine_bitwise_ops_test5b: 434 ; SSE41: # %bb.0: 435 ; SSE41-NEXT: orps %xmm1, %xmm0 436 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 437 ; SSE41-NEXT: retq 438 ; 439 ; AVX-LABEL: combine_bitwise_ops_test5b: 440 ; AVX: # %bb.0: 441 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 442 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 443 ; AVX-NEXT: retq 444 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 445 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 446 %or = or <4 x i32> %shuf1, %shuf2 447 ret <4 x i32> %or 448 } 449 450 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 451 ; SSE2-LABEL: combine_bitwise_ops_test6b: 452 ; SSE2: # %bb.0: 453 ; SSE2-NEXT: xorps %xmm1, %xmm0 454 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 455 ; SSE2-NEXT: retq 456 ; 457 ; SSSE3-LABEL: combine_bitwise_ops_test6b: 458 ; SSSE3: # %bb.0: 459 ; SSSE3-NEXT: xorps %xmm1, %xmm0 460 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 461 ; SSSE3-NEXT: retq 462 ; 463 ; SSE41-LABEL: combine_bitwise_ops_test6b: 464 ; SSE41: # %bb.0: 465 ; SSE41-NEXT: xorps %xmm1, %xmm0 466 ; SSE41-NEXT: xorps %xmm1, %xmm1 467 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 468 ; SSE41-NEXT: retq 469 ; 470 ; AVX-LABEL: combine_bitwise_ops_test6b: 471 ; AVX: # %bb.0: 472 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 473 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 474 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 475 ; AVX-NEXT: retq 476 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 477 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 478 %xor = xor <4 x i32> %shuf1, %shuf2 479 ret <4 x i32> %xor 480 } 481 482 define <4 x 
i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 483 ; SSE-LABEL: combine_bitwise_ops_test1c: 484 ; SSE: # %bb.0: 485 ; SSE-NEXT: andps %xmm1, %xmm0 486 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 487 ; SSE-NEXT: retq 488 ; 489 ; AVX-LABEL: combine_bitwise_ops_test1c: 490 ; AVX: # %bb.0: 491 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 492 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 493 ; AVX-NEXT: retq 494 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 495 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 496 %and = and <4 x i32> %shuf1, %shuf2 497 ret <4 x i32> %and 498 } 499 500 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 501 ; SSE-LABEL: combine_bitwise_ops_test2c: 502 ; SSE: # %bb.0: 503 ; SSE-NEXT: orps %xmm1, %xmm0 504 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 505 ; SSE-NEXT: retq 506 ; 507 ; AVX-LABEL: combine_bitwise_ops_test2c: 508 ; AVX: # %bb.0: 509 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 510 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 511 ; AVX-NEXT: retq 512 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 513 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 514 %or = or <4 x i32> %shuf1, %shuf2 515 ret <4 x i32> %or 516 } 517 518 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 519 ; SSE2-LABEL: combine_bitwise_ops_test3c: 520 ; SSE2: # %bb.0: 521 ; SSE2-NEXT: xorps %xmm1, %xmm0 522 ; SSE2-NEXT: xorps %xmm1, %xmm1 523 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 524 ; SSE2-NEXT: retq 525 ; 526 ; SSSE3-LABEL: combine_bitwise_ops_test3c: 527 ; SSSE3: # %bb.0: 528 ; SSSE3-NEXT: xorps %xmm1, %xmm0 529 ; SSSE3-NEXT: xorps %xmm1, %xmm1 530 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 531 ; SSSE3-NEXT: retq 532 ; 533 ; 
SSE41-LABEL: combine_bitwise_ops_test3c: 534 ; SSE41: # %bb.0: 535 ; SSE41-NEXT: xorps %xmm1, %xmm0 536 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 537 ; SSE41-NEXT: retq 538 ; 539 ; AVX-LABEL: combine_bitwise_ops_test3c: 540 ; AVX: # %bb.0: 541 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 542 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 543 ; AVX-NEXT: retq 544 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 545 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 546 %xor = xor <4 x i32> %shuf1, %shuf2 547 ret <4 x i32> %xor 548 } 549 550 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 551 ; SSE-LABEL: combine_bitwise_ops_test4c: 552 ; SSE: # %bb.0: 553 ; SSE-NEXT: andps %xmm1, %xmm0 554 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 555 ; SSE-NEXT: movaps %xmm2, %xmm0 556 ; SSE-NEXT: retq 557 ; 558 ; AVX-LABEL: combine_bitwise_ops_test4c: 559 ; AVX: # %bb.0: 560 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 561 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 562 ; AVX-NEXT: retq 563 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 564 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 565 %and = and <4 x i32> %shuf1, %shuf2 566 ret <4 x i32> %and 567 } 568 569 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 570 ; SSE-LABEL: combine_bitwise_ops_test5c: 571 ; SSE: # %bb.0: 572 ; SSE-NEXT: orps %xmm1, %xmm0 573 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 574 ; SSE-NEXT: movaps %xmm2, %xmm0 575 ; SSE-NEXT: retq 576 ; 577 ; AVX-LABEL: combine_bitwise_ops_test5c: 578 ; AVX: # %bb.0: 579 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 580 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 581 ; AVX-NEXT: retq 582 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 
583 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 584 %or = or <4 x i32> %shuf1, %shuf2 585 ret <4 x i32> %or 586 } 587 588 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 589 ; SSE2-LABEL: combine_bitwise_ops_test6c: 590 ; SSE2: # %bb.0: 591 ; SSE2-NEXT: xorps %xmm1, %xmm0 592 ; SSE2-NEXT: xorps %xmm1, %xmm1 593 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 594 ; SSE2-NEXT: movaps %xmm1, %xmm0 595 ; SSE2-NEXT: retq 596 ; 597 ; SSSE3-LABEL: combine_bitwise_ops_test6c: 598 ; SSSE3: # %bb.0: 599 ; SSSE3-NEXT: xorps %xmm1, %xmm0 600 ; SSSE3-NEXT: xorps %xmm1, %xmm1 601 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 602 ; SSSE3-NEXT: movaps %xmm1, %xmm0 603 ; SSSE3-NEXT: retq 604 ; 605 ; SSE41-LABEL: combine_bitwise_ops_test6c: 606 ; SSE41: # %bb.0: 607 ; SSE41-NEXT: xorps %xmm1, %xmm0 608 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 609 ; SSE41-NEXT: retq 610 ; 611 ; AVX-LABEL: combine_bitwise_ops_test6c: 612 ; AVX: # %bb.0: 613 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 614 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 615 ; AVX-NEXT: retq 616 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 617 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 618 %xor = xor <4 x i32> %shuf1, %shuf2 619 ret <4 x i32> %xor 620 } 621 622 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { 623 ; SSE-LABEL: combine_nested_undef_test1: 624 ; SSE: # %bb.0: 625 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 626 ; SSE-NEXT: retq 627 ; 628 ; AVX-LABEL: combine_nested_undef_test1: 629 ; AVX: # %bb.0: 630 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 631 ; AVX-NEXT: retq 632 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 633 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 634 ret <4 
x i32> %2 635 } 636 637 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { 638 ; SSE-LABEL: combine_nested_undef_test2: 639 ; SSE: # %bb.0: 640 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 641 ; SSE-NEXT: retq 642 ; 643 ; AVX-LABEL: combine_nested_undef_test2: 644 ; AVX: # %bb.0: 645 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 646 ; AVX-NEXT: retq 647 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 648 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 649 ret <4 x i32> %2 650 } 651 652 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { 653 ; SSE-LABEL: combine_nested_undef_test3: 654 ; SSE: # %bb.0: 655 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 656 ; SSE-NEXT: retq 657 ; 658 ; AVX-LABEL: combine_nested_undef_test3: 659 ; AVX: # %bb.0: 660 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 661 ; AVX-NEXT: retq 662 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 663 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 664 ret <4 x i32> %2 665 } 666 667 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { 668 ; SSE-LABEL: combine_nested_undef_test4: 669 ; SSE: # %bb.0: 670 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 671 ; SSE-NEXT: retq 672 ; 673 ; AVX1-LABEL: combine_nested_undef_test4: 674 ; AVX1: # %bb.0: 675 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 676 ; AVX1-NEXT: retq 677 ; 678 ; AVX2-LABEL: combine_nested_undef_test4: 679 ; AVX2: # %bb.0: 680 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 681 ; AVX2-NEXT: retq 682 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> 683 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> 684 ret <4 x i32> %2 685 } 686 687 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { 688 ; SSE-LABEL: 
combine_nested_undef_test5: 689 ; SSE: # %bb.0: 690 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 691 ; SSE-NEXT: retq 692 ; 693 ; AVX-LABEL: combine_nested_undef_test5: 694 ; AVX: # %bb.0: 695 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 696 ; AVX-NEXT: retq 697 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 698 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> 699 ret <4 x i32> %2 700 } 701 702 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { 703 ; SSE-LABEL: combine_nested_undef_test6: 704 ; SSE: # %bb.0: 705 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 706 ; SSE-NEXT: retq 707 ; 708 ; AVX-LABEL: combine_nested_undef_test6: 709 ; AVX: # %bb.0: 710 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] 711 ; AVX-NEXT: retq 712 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 713 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> 714 ret <4 x i32> %2 715 } 716 717 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { 718 ; SSE-LABEL: combine_nested_undef_test7: 719 ; SSE: # %bb.0: 720 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 721 ; SSE-NEXT: retq 722 ; 723 ; AVX-LABEL: combine_nested_undef_test7: 724 ; AVX: # %bb.0: 725 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] 726 ; AVX-NEXT: retq 727 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 728 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 729 ret <4 x i32> %2 730 } 731 732 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { 733 ; SSE-LABEL: combine_nested_undef_test8: 734 ; SSE: # %bb.0: 735 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 736 ; SSE-NEXT: retq 737 ; 738 ; AVX-LABEL: combine_nested_undef_test8: 739 ; AVX: # %bb.0: 740 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] 741 ; 
AVX-NEXT: retq 742 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 743 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 744 ret <4 x i32> %2 745 } 746 747 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { 748 ; SSE-LABEL: combine_nested_undef_test9: 749 ; SSE: # %bb.0: 750 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 751 ; SSE-NEXT: retq 752 ; 753 ; AVX-LABEL: combine_nested_undef_test9: 754 ; AVX: # %bb.0: 755 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] 756 ; AVX-NEXT: retq 757 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> 758 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 759 ret <4 x i32> %2 760 } 761 762 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { 763 ; SSE-LABEL: combine_nested_undef_test10: 764 ; SSE: # %bb.0: 765 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] 766 ; SSE-NEXT: retq 767 ; 768 ; AVX-LABEL: combine_nested_undef_test10: 769 ; AVX: # %bb.0: 770 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,3] 771 ; AVX-NEXT: retq 772 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 773 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> 774 ret <4 x i32> %2 775 } 776 777 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { 778 ; SSE-LABEL: combine_nested_undef_test11: 779 ; SSE: # %bb.0: 780 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 781 ; SSE-NEXT: retq 782 ; 783 ; AVX-LABEL: combine_nested_undef_test11: 784 ; AVX: # %bb.0: 785 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] 786 ; AVX-NEXT: retq 787 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> 788 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> 789 ret <4 x i32> %2 790 } 791 792 define <4 x i32> 
@combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { 793 ; SSE-LABEL: combine_nested_undef_test12: 794 ; SSE: # %bb.0: 795 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 796 ; SSE-NEXT: retq 797 ; 798 ; AVX1-LABEL: combine_nested_undef_test12: 799 ; AVX1: # %bb.0: 800 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 801 ; AVX1-NEXT: retq 802 ; 803 ; AVX2-LABEL: combine_nested_undef_test12: 804 ; AVX2: # %bb.0: 805 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 806 ; AVX2-NEXT: retq 807 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 808 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 809 ret <4 x i32> %2 810 } 811 812 ; The following pair of shuffles is folded into vector %A. 813 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 814 ; ALL-LABEL: combine_nested_undef_test13: 815 ; ALL: # %bb.0: 816 ; ALL-NEXT: retq 817 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 818 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 819 ret <4 x i32> %2 820 } 821 822 ; The following pair of shuffles is folded into vector %B. 823 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 824 ; SSE-LABEL: combine_nested_undef_test14: 825 ; SSE: # %bb.0: 826 ; SSE-NEXT: movaps %xmm1, %xmm0 827 ; SSE-NEXT: retq 828 ; 829 ; AVX-LABEL: combine_nested_undef_test14: 830 ; AVX: # %bb.0: 831 ; AVX-NEXT: vmovaps %xmm1, %xmm0 832 ; AVX-NEXT: retq 833 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 834 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 835 ret <4 x i32> %2 836 } 837 838 839 ; Verify that we don't optimize the following cases. We expect more than one shuffle. 840 ; 841 ; FIXME: Many of these already don't make sense, and the rest should stop 842 ; making sense with the new vector shuffle lowering.
Revisit at least testing for 843 ; it. 844 845 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { 846 ; SSE2-LABEL: combine_nested_undef_test15: 847 ; SSE2: # %bb.0: 848 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 849 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 850 ; SSE2-NEXT: movaps %xmm1, %xmm0 851 ; SSE2-NEXT: retq 852 ; 853 ; SSSE3-LABEL: combine_nested_undef_test15: 854 ; SSSE3: # %bb.0: 855 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 856 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 857 ; SSSE3-NEXT: movaps %xmm1, %xmm0 858 ; SSSE3-NEXT: retq 859 ; 860 ; SSE41-LABEL: combine_nested_undef_test15: 861 ; SSE41: # %bb.0: 862 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 863 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 864 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 865 ; SSE41-NEXT: retq 866 ; 867 ; AVX1-LABEL: combine_nested_undef_test15: 868 ; AVX1: # %bb.0: 869 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 870 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 871 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 872 ; AVX1-NEXT: retq 873 ; 874 ; AVX2-LABEL: combine_nested_undef_test15: 875 ; AVX2: # %bb.0: 876 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 877 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 878 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 879 ; AVX2-NEXT: retq 880 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 881 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 882 ret <4 x i32> %2 883 } 884 885 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { 886 ; SSE2-LABEL: combine_nested_undef_test16: 887 ; SSE2: # %bb.0: 888 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 889 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 890 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] 891 ; SSE2-NEXT: retq 892 ; 893 ; SSSE3-LABEL: combine_nested_undef_test16: 894 ; SSSE3: # %bb.0: 895 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 896 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 897 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 898 ; SSSE3-NEXT: retq 899 ; 900 ; SSE41-LABEL: combine_nested_undef_test16: 901 ; SSE41: # %bb.0: 902 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 903 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 904 ; SSE41-NEXT: retq 905 ; 906 ; AVX-LABEL: combine_nested_undef_test16: 907 ; AVX: # %bb.0: 908 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] 909 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 910 ; AVX-NEXT: retq 911 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 912 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 913 ret <4 x i32> %2 914 } 915 916 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { 917 ; SSE2-LABEL: combine_nested_undef_test17: 918 ; SSE2: # %bb.0: 919 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 920 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 921 ; SSE2-NEXT: retq 922 ; 923 ; SSSE3-LABEL: combine_nested_undef_test17: 924 ; SSSE3: # %bb.0: 925 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 926 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 927 ; SSSE3-NEXT: retq 928 ; 929 ; SSE41-LABEL: combine_nested_undef_test17: 930 ; SSE41: # %bb.0: 931 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 932 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 933 ; SSE41-NEXT: retq 934 ; 935 ; AVX-LABEL: combine_nested_undef_test17: 936 ; AVX: # %bb.0: 937 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 938 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 939 ; AVX-NEXT: retq 940 %1 = shufflevector <4 x i32> %A, <4 x 
i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 941 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 942 ret <4 x i32> %2 943 } 944 945 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 946 ; SSE-LABEL: combine_nested_undef_test18: 947 ; SSE: # %bb.0: 948 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 949 ; SSE-NEXT: retq 950 ; 951 ; AVX-LABEL: combine_nested_undef_test18: 952 ; AVX: # %bb.0: 953 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] 954 ; AVX-NEXT: retq 955 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 956 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 957 ret <4 x i32> %2 958 } 959 960 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 961 ; SSE2-LABEL: combine_nested_undef_test19: 962 ; SSE2: # %bb.0: 963 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 964 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 965 ; SSE2-NEXT: retq 966 ; 967 ; SSSE3-LABEL: combine_nested_undef_test19: 968 ; SSSE3: # %bb.0: 969 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 970 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 971 ; SSSE3-NEXT: retq 972 ; 973 ; SSE41-LABEL: combine_nested_undef_test19: 974 ; SSE41: # %bb.0: 975 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 976 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 977 ; SSE41-NEXT: retq 978 ; 979 ; AVX-LABEL: combine_nested_undef_test19: 980 ; AVX: # %bb.0: 981 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 982 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 983 ; AVX-NEXT: retq 984 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 985 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 986 ret <4 x i32> %2 987 } 988 989 define <4 x i32> @combine_nested_undef_test20(<4 x 
i32> %A, <4 x i32> %B) { 990 ; SSE2-LABEL: combine_nested_undef_test20: 991 ; SSE2: # %bb.0: 992 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 993 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 994 ; SSE2-NEXT: movaps %xmm1, %xmm0 995 ; SSE2-NEXT: retq 996 ; 997 ; SSSE3-LABEL: combine_nested_undef_test20: 998 ; SSSE3: # %bb.0: 999 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1000 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1001 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1002 ; SSSE3-NEXT: retq 1003 ; 1004 ; SSE41-LABEL: combine_nested_undef_test20: 1005 ; SSE41: # %bb.0: 1006 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1007 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1008 ; SSE41-NEXT: retq 1009 ; 1010 ; AVX-LABEL: combine_nested_undef_test20: 1011 ; AVX: # %bb.0: 1012 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1013 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] 1014 ; AVX-NEXT: retq 1015 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1016 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1017 ret <4 x i32> %2 1018 } 1019 1020 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1021 ; SSE2-LABEL: combine_nested_undef_test21: 1022 ; SSE2: # %bb.0: 1023 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1024 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1025 ; SSE2-NEXT: retq 1026 ; 1027 ; SSSE3-LABEL: combine_nested_undef_test21: 1028 ; SSSE3: # %bb.0: 1029 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1030 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1031 ; SSSE3-NEXT: retq 1032 ; 1033 ; SSE41-LABEL: combine_nested_undef_test21: 1034 ; SSE41: # %bb.0: 1035 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1036 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1037 ; SSE41-NEXT: retq 1038 ; 1039 ; AVX1-LABEL: 
combine_nested_undef_test21: 1040 ; AVX1: # %bb.0: 1041 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1042 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1043 ; AVX1-NEXT: retq 1044 ; 1045 ; AVX2-LABEL: combine_nested_undef_test21: 1046 ; AVX2: # %bb.0: 1047 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1048 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1049 ; AVX2-NEXT: retq 1050 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1051 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1052 ret <4 x i32> %2 1053 } 1054 1055 1056 ; Test that we correctly combine shuffles according to rule 1057 ; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1058 1059 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1060 ; SSE-LABEL: combine_nested_undef_test22: 1061 ; SSE: # %bb.0: 1062 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1063 ; SSE-NEXT: retq 1064 ; 1065 ; AVX-LABEL: combine_nested_undef_test22: 1066 ; AVX: # %bb.0: 1067 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] 1068 ; AVX-NEXT: retq 1069 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1070 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1071 ret <4 x i32> %2 1072 } 1073 1074 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1075 ; SSE-LABEL: combine_nested_undef_test23: 1076 ; SSE: # %bb.0: 1077 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1078 ; SSE-NEXT: retq 1079 ; 1080 ; AVX-LABEL: combine_nested_undef_test23: 1081 ; AVX: # %bb.0: 1082 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] 1083 ; AVX-NEXT: retq 1084 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1085 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1086 ret <4 x i32> %2 1087 } 1088 1089 define <4 x i32> 
@combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1090 ; SSE-LABEL: combine_nested_undef_test24: 1091 ; SSE: # %bb.0: 1092 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1093 ; SSE-NEXT: retq 1094 ; 1095 ; AVX-LABEL: combine_nested_undef_test24: 1096 ; AVX: # %bb.0: 1097 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] 1098 ; AVX-NEXT: retq 1099 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1100 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1101 ret <4 x i32> %2 1102 } 1103 1104 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1105 ; SSE-LABEL: combine_nested_undef_test25: 1106 ; SSE: # %bb.0: 1107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1108 ; SSE-NEXT: retq 1109 ; 1110 ; AVX1-LABEL: combine_nested_undef_test25: 1111 ; AVX1: # %bb.0: 1112 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1113 ; AVX1-NEXT: retq 1114 ; 1115 ; AVX2-LABEL: combine_nested_undef_test25: 1116 ; AVX2: # %bb.0: 1117 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1118 ; AVX2-NEXT: retq 1119 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1120 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1121 ret <4 x i32> %2 1122 } 1123 1124 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1125 ; SSE-LABEL: combine_nested_undef_test26: 1126 ; SSE: # %bb.0: 1127 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1128 ; SSE-NEXT: retq 1129 ; 1130 ; AVX-LABEL: combine_nested_undef_test26: 1131 ; AVX: # %bb.0: 1132 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1133 ; AVX-NEXT: retq 1134 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1135 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1136 ret <4 x i32> %2 1137 } 1138 1139 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) 
{ 1140 ; SSE-LABEL: combine_nested_undef_test27: 1141 ; SSE: # %bb.0: 1142 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1143 ; SSE-NEXT: retq 1144 ; 1145 ; AVX1-LABEL: combine_nested_undef_test27: 1146 ; AVX1: # %bb.0: 1147 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1148 ; AVX1-NEXT: retq 1149 ; 1150 ; AVX2-LABEL: combine_nested_undef_test27: 1151 ; AVX2: # %bb.0: 1152 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1153 ; AVX2-NEXT: retq 1154 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1155 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1156 ret <4 x i32> %2 1157 } 1158 1159 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1160 ; SSE-LABEL: combine_nested_undef_test28: 1161 ; SSE: # %bb.0: 1162 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1163 ; SSE-NEXT: retq 1164 ; 1165 ; AVX-LABEL: combine_nested_undef_test28: 1166 ; AVX: # %bb.0: 1167 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] 1168 ; AVX-NEXT: retq 1169 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1170 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1171 ret <4 x i32> %2 1172 } 1173 1174 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1175 ; SSE-LABEL: combine_test1: 1176 ; SSE: # %bb.0: 1177 ; SSE-NEXT: movaps %xmm1, %xmm0 1178 ; SSE-NEXT: retq 1179 ; 1180 ; AVX-LABEL: combine_test1: 1181 ; AVX: # %bb.0: 1182 ; AVX-NEXT: vmovaps %xmm1, %xmm0 1183 ; AVX-NEXT: retq 1184 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1185 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1186 ret <4 x float> %2 1187 } 1188 1189 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1190 ; SSE2-LABEL: combine_test2: 1191 ; SSE2: # %bb.0: 1192 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1193 ; 
SSE2-NEXT: movaps %xmm1, %xmm0 1194 ; SSE2-NEXT: retq 1195 ; 1196 ; SSSE3-LABEL: combine_test2: 1197 ; SSSE3: # %bb.0: 1198 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1199 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1200 ; SSSE3-NEXT: retq 1201 ; 1202 ; SSE41-LABEL: combine_test2: 1203 ; SSE41: # %bb.0: 1204 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1205 ; SSE41-NEXT: retq 1206 ; 1207 ; AVX-LABEL: combine_test2: 1208 ; AVX: # %bb.0: 1209 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1210 ; AVX-NEXT: retq 1211 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1212 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1213 ret <4 x float> %2 1214 } 1215 1216 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1217 ; SSE-LABEL: combine_test3: 1218 ; SSE: # %bb.0: 1219 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1220 ; SSE-NEXT: retq 1221 ; 1222 ; AVX-LABEL: combine_test3: 1223 ; AVX: # %bb.0: 1224 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1225 ; AVX-NEXT: retq 1226 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1227 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1228 ret <4 x float> %2 1229 } 1230 1231 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1232 ; SSE-LABEL: combine_test4: 1233 ; SSE: # %bb.0: 1234 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1235 ; SSE-NEXT: retq 1236 ; 1237 ; AVX-LABEL: combine_test4: 1238 ; AVX: # %bb.0: 1239 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1240 ; AVX-NEXT: retq 1241 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1242 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1243 ret <4 x float> %2 1244 } 1245 1246 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1247 ; 
SSE2-LABEL: combine_test5: 1248 ; SSE2: # %bb.0: 1249 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1250 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1251 ; SSE2-NEXT: retq 1252 ; 1253 ; SSSE3-LABEL: combine_test5: 1254 ; SSSE3: # %bb.0: 1255 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1256 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1257 ; SSSE3-NEXT: retq 1258 ; 1259 ; SSE41-LABEL: combine_test5: 1260 ; SSE41: # %bb.0: 1261 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1262 ; SSE41-NEXT: retq 1263 ; 1264 ; AVX-LABEL: combine_test5: 1265 ; AVX: # %bb.0: 1266 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1267 ; AVX-NEXT: retq 1268 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1269 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1270 ret <4 x float> %2 1271 } 1272 1273 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1274 ; SSE-LABEL: combine_test6: 1275 ; SSE: # %bb.0: 1276 ; SSE-NEXT: movaps %xmm1, %xmm0 1277 ; SSE-NEXT: retq 1278 ; 1279 ; AVX-LABEL: combine_test6: 1280 ; AVX: # %bb.0: 1281 ; AVX-NEXT: vmovaps %xmm1, %xmm0 1282 ; AVX-NEXT: retq 1283 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1284 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1285 ret <4 x i32> %2 1286 } 1287 1288 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1289 ; SSE2-LABEL: combine_test7: 1290 ; SSE2: # %bb.0: 1291 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1292 ; SSE2-NEXT: movaps %xmm1, %xmm0 1293 ; SSE2-NEXT: retq 1294 ; 1295 ; SSSE3-LABEL: combine_test7: 1296 ; SSSE3: # %bb.0: 1297 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1298 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1299 ; SSSE3-NEXT: retq 1300 ; 1301 ; SSE41-LABEL: combine_test7: 1302 ; SSE41: # %bb.0: 1303 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = 
xmm0[0],xmm1[1,2,3] 1304 ; SSE41-NEXT: retq 1305 ; 1306 ; AVX-LABEL: combine_test7: 1307 ; AVX: # %bb.0: 1308 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1309 ; AVX-NEXT: retq 1310 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1311 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1312 ret <4 x i32> %2 1313 } 1314 1315 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1316 ; SSE-LABEL: combine_test8: 1317 ; SSE: # %bb.0: 1318 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1319 ; SSE-NEXT: retq 1320 ; 1321 ; AVX-LABEL: combine_test8: 1322 ; AVX: # %bb.0: 1323 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1324 ; AVX-NEXT: retq 1325 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1326 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1327 ret <4 x i32> %2 1328 } 1329 1330 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1331 ; SSE-LABEL: combine_test9: 1332 ; SSE: # %bb.0: 1333 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1334 ; SSE-NEXT: movaps %xmm1, %xmm0 1335 ; SSE-NEXT: retq 1336 ; 1337 ; AVX-LABEL: combine_test9: 1338 ; AVX: # %bb.0: 1339 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1340 ; AVX-NEXT: retq 1341 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1342 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1343 ret <4 x i32> %2 1344 } 1345 1346 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1347 ; SSE2-LABEL: combine_test10: 1348 ; SSE2: # %bb.0: 1349 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1350 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1351 ; SSE2-NEXT: retq 1352 ; 1353 ; SSSE3-LABEL: combine_test10: 1354 ; SSSE3: # %bb.0: 1355 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1356 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[2,0],xmm1[2,3] 1357 ; SSSE3-NEXT: retq 1358 ; 1359 ; SSE41-LABEL: combine_test10: 1360 ; SSE41: # %bb.0: 1361 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1362 ; SSE41-NEXT: retq 1363 ; 1364 ; AVX-LABEL: combine_test10: 1365 ; AVX: # %bb.0: 1366 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1367 ; AVX-NEXT: retq 1368 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1369 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1370 ret <4 x i32> %2 1371 } 1372 1373 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1374 ; ALL-LABEL: combine_test11: 1375 ; ALL: # %bb.0: 1376 ; ALL-NEXT: retq 1377 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1378 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1379 ret <4 x float> %2 1380 } 1381 1382 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1383 ; SSE2-LABEL: combine_test12: 1384 ; SSE2: # %bb.0: 1385 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1386 ; SSE2-NEXT: movaps %xmm1, %xmm0 1387 ; SSE2-NEXT: retq 1388 ; 1389 ; SSSE3-LABEL: combine_test12: 1390 ; SSSE3: # %bb.0: 1391 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1392 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1393 ; SSSE3-NEXT: retq 1394 ; 1395 ; SSE41-LABEL: combine_test12: 1396 ; SSE41: # %bb.0: 1397 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1398 ; SSE41-NEXT: retq 1399 ; 1400 ; AVX-LABEL: combine_test12: 1401 ; AVX: # %bb.0: 1402 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1403 ; AVX-NEXT: retq 1404 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1405 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1406 ret <4 x float> %2 1407 } 1408 1409 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1410 ; 
SSE-LABEL: combine_test13: 1411 ; SSE: # %bb.0: 1412 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1413 ; SSE-NEXT: retq 1414 ; 1415 ; AVX-LABEL: combine_test13: 1416 ; AVX: # %bb.0: 1417 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1418 ; AVX-NEXT: retq 1419 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1420 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1421 ret <4 x float> %2 1422 } 1423 1424 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1425 ; SSE-LABEL: combine_test14: 1426 ; SSE: # %bb.0: 1427 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1428 ; SSE-NEXT: retq 1429 ; 1430 ; AVX-LABEL: combine_test14: 1431 ; AVX: # %bb.0: 1432 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1433 ; AVX-NEXT: retq 1434 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1435 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1436 ret <4 x float> %2 1437 } 1438 1439 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1440 ; SSE2-LABEL: combine_test15: 1441 ; SSE2: # %bb.0: 1442 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1443 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1444 ; SSE2-NEXT: retq 1445 ; 1446 ; SSSE3-LABEL: combine_test15: 1447 ; SSSE3: # %bb.0: 1448 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1449 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1450 ; SSSE3-NEXT: retq 1451 ; 1452 ; SSE41-LABEL: combine_test15: 1453 ; SSE41: # %bb.0: 1454 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1455 ; SSE41-NEXT: retq 1456 ; 1457 ; AVX-LABEL: combine_test15: 1458 ; AVX: # %bb.0: 1459 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1460 ; AVX-NEXT: retq 1461 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1462 %2 = shufflevector <4 x 
float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1463 ret <4 x float> %2 1464 } 1465 1466 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1467 ; ALL-LABEL: combine_test16: 1468 ; ALL: # %bb.0: 1469 ; ALL-NEXT: retq 1470 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1471 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1472 ret <4 x i32> %2 1473 } 1474 1475 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1476 ; SSE2-LABEL: combine_test17: 1477 ; SSE2: # %bb.0: 1478 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1479 ; SSE2-NEXT: movaps %xmm1, %xmm0 1480 ; SSE2-NEXT: retq 1481 ; 1482 ; SSSE3-LABEL: combine_test17: 1483 ; SSSE3: # %bb.0: 1484 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1485 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1486 ; SSSE3-NEXT: retq 1487 ; 1488 ; SSE41-LABEL: combine_test17: 1489 ; SSE41: # %bb.0: 1490 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1491 ; SSE41-NEXT: retq 1492 ; 1493 ; AVX-LABEL: combine_test17: 1494 ; AVX: # %bb.0: 1495 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1496 ; AVX-NEXT: retq 1497 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1498 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1499 ret <4 x i32> %2 1500 } 1501 1502 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1503 ; SSE-LABEL: combine_test18: 1504 ; SSE: # %bb.0: 1505 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1506 ; SSE-NEXT: retq 1507 ; 1508 ; AVX-LABEL: combine_test18: 1509 ; AVX: # %bb.0: 1510 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1511 ; AVX-NEXT: retq 1512 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1513 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1514 ret <4 x i32> %2 1515 } 1516 1517 define <4 x i32> 
@combine_test19(<4 x i32> %a, <4 x i32> %b) { 1518 ; SSE-LABEL: combine_test19: 1519 ; SSE: # %bb.0: 1520 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1521 ; SSE-NEXT: retq 1522 ; 1523 ; AVX-LABEL: combine_test19: 1524 ; AVX: # %bb.0: 1525 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1526 ; AVX-NEXT: retq 1527 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1528 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1529 ret <4 x i32> %2 1530 } 1531 1532 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { 1533 ; SSE2-LABEL: combine_test20: 1534 ; SSE2: # %bb.0: 1535 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1536 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1537 ; SSE2-NEXT: retq 1538 ; 1539 ; SSSE3-LABEL: combine_test20: 1540 ; SSSE3: # %bb.0: 1541 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1542 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1543 ; SSSE3-NEXT: retq 1544 ; 1545 ; SSE41-LABEL: combine_test20: 1546 ; SSE41: # %bb.0: 1547 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1548 ; SSE41-NEXT: retq 1549 ; 1550 ; AVX-LABEL: combine_test20: 1551 ; AVX: # %bb.0: 1552 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1553 ; AVX-NEXT: retq 1554 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1555 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1556 ret <4 x i32> %2 1557 } 1558 1559 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { 1560 ; SSE-LABEL: combine_test21: 1561 ; SSE: # %bb.0: 1562 ; SSE-NEXT: movaps %xmm0, %xmm2 1563 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1564 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1565 ; SSE-NEXT: movaps %xmm2, (%rdi) 1566 ; SSE-NEXT: retq 1567 ; 1568 ; AVX-LABEL: combine_test21: 1569 ; AVX: # %bb.0: 1570 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 
1571 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1572 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1573 ; AVX-NEXT: vmovaps %xmm2, (%rdi) 1574 ; AVX-NEXT: vzeroupper 1575 ; AVX-NEXT: retq 1576 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1577 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1578 store <4 x i32> %1, <4 x i32>* %ptr, align 16 1579 ret <4 x i32> %2 1580 } 1581 1582 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { 1583 ; SSE-LABEL: combine_test22: 1584 ; SSE: # %bb.0: 1585 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1586 ; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1587 ; SSE-NEXT: retq 1588 ; 1589 ; AVX-LABEL: combine_test22: 1590 ; AVX: # %bb.0: 1591 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1592 ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1593 ; AVX-NEXT: retq 1594 ; Current AVX2 lowering of this is still awful, not adding a test case. 1595 %1 = load <2 x float>, <2 x float>* %a, align 8 1596 %2 = load <2 x float>, <2 x float>* %b, align 8 1597 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1598 ret <8 x float> %3 1599 } 1600 1601 ; PR22359 1602 define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) { 1603 ; SSE-LABEL: combine_test23: 1604 ; SSE: # %bb.0: 1605 ; SSE-NEXT: movups %xmm0, (%rdi) 1606 ; SSE-NEXT: retq 1607 ; 1608 ; AVX-LABEL: combine_test23: 1609 ; AVX: # %bb.0: 1610 ; AVX-NEXT: vmovups %xmm0, (%rdi) 1611 ; AVX-NEXT: vzeroupper 1612 ; AVX-NEXT: retq 1613 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 1614 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1> 1615 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3> 1616 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8 1617 store <2 x float> %shuffle1, <2 x float>* 
%idx2, align 8 1618 ret void 1619 } 1620 1621 ; Check some negative cases. 1622 ; FIXME: Do any of these really make sense? Are they redundant with the above tests? 1623 1624 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1625 ; SSE-LABEL: combine_test1b: 1626 ; SSE: # %bb.0: 1627 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 1628 ; SSE-NEXT: movaps %xmm1, %xmm0 1629 ; SSE-NEXT: retq 1630 ; 1631 ; AVX-LABEL: combine_test1b: 1632 ; AVX: # %bb.0: 1633 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1634 ; AVX-NEXT: retq 1635 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1636 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1637 ret <4 x float> %2 1638 } 1639 1640 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1641 ; SSE2-LABEL: combine_test2b: 1642 ; SSE2: # %bb.0: 1643 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] 1644 ; SSE2-NEXT: movaps %xmm1, %xmm0 1645 ; SSE2-NEXT: retq 1646 ; 1647 ; SSSE3-LABEL: combine_test2b: 1648 ; SSSE3: # %bb.0: 1649 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1650 ; SSSE3-NEXT: retq 1651 ; 1652 ; SSE41-LABEL: combine_test2b: 1653 ; SSE41: # %bb.0: 1654 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1655 ; SSE41-NEXT: retq 1656 ; 1657 ; AVX-LABEL: combine_test2b: 1658 ; AVX: # %bb.0: 1659 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1660 ; AVX-NEXT: retq 1661 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1662 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1663 ret <4 x float> %2 1664 } 1665 1666 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1667 ; SSE2-LABEL: combine_test3b: 1668 ; SSE2: # %bb.0: 1669 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1670 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1671 ; SSE2-NEXT: retq 1672 ; 1673 ; SSSE3-LABEL: combine_test3b: 1674 ; 
SSSE3: # %bb.0: 1675 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1676 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1677 ; SSSE3-NEXT: retq 1678 ; 1679 ; SSE41-LABEL: combine_test3b: 1680 ; SSE41: # %bb.0: 1681 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1682 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1683 ; SSE41-NEXT: retq 1684 ; 1685 ; AVX-LABEL: combine_test3b: 1686 ; AVX: # %bb.0: 1687 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1688 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1689 ; AVX-NEXT: retq 1690 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1691 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1692 ret <4 x float> %2 1693 } 1694 1695 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1696 ; SSE-LABEL: combine_test4b: 1697 ; SSE: # %bb.0: 1698 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] 1699 ; SSE-NEXT: movaps %xmm1, %xmm0 1700 ; SSE-NEXT: retq 1701 ; 1702 ; AVX-LABEL: combine_test4b: 1703 ; AVX: # %bb.0: 1704 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1705 ; AVX-NEXT: retq 1706 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1707 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1708 ret <4 x float> %2 1709 } 1710 1711 1712 ; Verify that we correctly fold shuffles even when we use illegal vector types. 
; Masks <0,5,2,7> then <0,1,6,3> compose to [A0,B1,B2,B3]: a single blend
; taking element 0 from %A and elements 1-3 from %B (pblendw/vpblendd below).
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test1c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test1c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}

; Masks <0,5,1,5> then <0,2,4,1> compose to [A0,A1,B0,B1]: a low-half
; interleave of the two zero-extended inputs (punpcklqdq below).
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}

; Masks <2,3,5,5> then <6,7,0,1> compose to [B2,B3,A2,A3]: a high-half
; interleave of the zero-extended inputs (punpckhqdq below).
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}

; Masks <4,1,6,3> then <0,1,2,7> compose to [B0,A1,B2,B3]: a blend taking
; element 1 from %A and the rest from %B (pblendw/vpblendd below).
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test4c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test4c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}


; The following test cases are generated from this C++ code
;
;__m128 blend_01(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s = _mm_blend_ps( s, b, 1<<1 );
;  return s;
;}
;
;__m128 blend_02(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  return s;
;}
;
;__m128 blend_123(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<1 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  s = _mm_blend_ps( s, b, 1<<3 );
;  return s;
;}

; Ideally, we should collapse the following shuffles into a single one.
1927 1928 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 1929 ; SSE2-LABEL: combine_blend_01: 1930 ; SSE2: # %bb.0: 1931 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1932 ; SSE2-NEXT: retq 1933 ; 1934 ; SSSE3-LABEL: combine_blend_01: 1935 ; SSSE3: # %bb.0: 1936 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1937 ; SSSE3-NEXT: retq 1938 ; 1939 ; SSE41-LABEL: combine_blend_01: 1940 ; SSE41: # %bb.0: 1941 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1942 ; SSE41-NEXT: retq 1943 ; 1944 ; AVX-LABEL: combine_blend_01: 1945 ; AVX: # %bb.0: 1946 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1947 ; AVX-NEXT: retq 1948 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 1949 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1950 ret <4 x float> %shuffle6 1951 } 1952 1953 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 1954 ; SSE2-LABEL: combine_blend_02: 1955 ; SSE2: # %bb.0: 1956 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1957 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1958 ; SSE2-NEXT: movaps %xmm1, %xmm0 1959 ; SSE2-NEXT: retq 1960 ; 1961 ; SSSE3-LABEL: combine_blend_02: 1962 ; SSSE3: # %bb.0: 1963 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1964 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1965 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1966 ; SSSE3-NEXT: retq 1967 ; 1968 ; SSE41-LABEL: combine_blend_02: 1969 ; SSE41: # %bb.0: 1970 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1971 ; SSE41-NEXT: retq 1972 ; 1973 ; AVX-LABEL: combine_blend_02: 1974 ; AVX: # %bb.0: 1975 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1976 ; AVX-NEXT: retq 1977 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 1978 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> 
<i32 0, i32 1, i32 6, i32 3> 1979 ret <4 x float> %shuffle6 1980 } 1981 1982 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 1983 ; SSE2-LABEL: combine_blend_123: 1984 ; SSE2: # %bb.0: 1985 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1986 ; SSE2-NEXT: movaps %xmm1, %xmm0 1987 ; SSE2-NEXT: retq 1988 ; 1989 ; SSSE3-LABEL: combine_blend_123: 1990 ; SSSE3: # %bb.0: 1991 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1992 ; SSSE3-NEXT: movaps %xmm1, %xmm0 1993 ; SSSE3-NEXT: retq 1994 ; 1995 ; SSE41-LABEL: combine_blend_123: 1996 ; SSE41: # %bb.0: 1997 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1998 ; SSE41-NEXT: retq 1999 ; 2000 ; AVX-LABEL: combine_blend_123: 2001 ; AVX: # %bb.0: 2002 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2003 ; AVX-NEXT: retq 2004 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2005 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 2006 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2007 ret <4 x float> %shuffle12 2008 } 2009 2010 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 2011 ; SSE-LABEL: combine_test_movhl_1: 2012 ; SSE: # %bb.0: 2013 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2014 ; SSE-NEXT: movaps %xmm1, %xmm0 2015 ; SSE-NEXT: retq 2016 ; 2017 ; AVX-LABEL: combine_test_movhl_1: 2018 ; AVX: # %bb.0: 2019 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2020 ; AVX-NEXT: retq 2021 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 2022 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 2023 ret <4 x i32> %2 2024 } 2025 2026 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 2027 ; SSE-LABEL: combine_test_movhl_2: 2028 ; SSE: # %bb.0: 2029 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] 2030 ; SSE-NEXT: movaps %xmm1, %xmm0 2031 ; SSE-NEXT: retq 2032 ; 2033 ; AVX-LABEL: combine_test_movhl_2: 2034 ; AVX: # %bb.0: 2035 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2036 ; AVX-NEXT: retq 2037 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 2038 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 2039 ret <4 x i32> %2 2040 } 2041 2042 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 2043 ; SSE-LABEL: combine_test_movhl_3: 2044 ; SSE: # %bb.0: 2045 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2046 ; SSE-NEXT: movaps %xmm1, %xmm0 2047 ; SSE-NEXT: retq 2048 ; 2049 ; AVX-LABEL: combine_test_movhl_3: 2050 ; AVX: # %bb.0: 2051 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2052 ; AVX-NEXT: retq 2053 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 2054 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 2055 ret <4 x i32> %2 2056 } 2057 2058 2059 ; Verify that we fold shuffles according to rule: 2060 ; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2061 2062 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2063 ; SSE2-LABEL: combine_undef_input_test1: 2064 ; SSE2: # %bb.0: 2065 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2066 ; SSE2-NEXT: retq 2067 ; 2068 ; SSSE3-LABEL: combine_undef_input_test1: 2069 ; SSSE3: # %bb.0: 2070 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2071 ; SSSE3-NEXT: retq 2072 ; 2073 ; SSE41-LABEL: combine_undef_input_test1: 2074 ; SSE41: # %bb.0: 2075 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2076 ; SSE41-NEXT: retq 2077 ; 2078 ; AVX-LABEL: combine_undef_input_test1: 2079 ; AVX: # %bb.0: 2080 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2081 ; AVX-NEXT: retq 2082 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 
2083 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2084 ret <4 x float> %2 2085 } 2086 2087 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2088 ; SSE-LABEL: combine_undef_input_test2: 2089 ; SSE: # %bb.0: 2090 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2091 ; SSE-NEXT: retq 2092 ; 2093 ; AVX-LABEL: combine_undef_input_test2: 2094 ; AVX: # %bb.0: 2095 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2096 ; AVX-NEXT: retq 2097 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2098 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2099 ret <4 x float> %2 2100 } 2101 2102 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2103 ; SSE-LABEL: combine_undef_input_test3: 2104 ; SSE: # %bb.0: 2105 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2106 ; SSE-NEXT: retq 2107 ; 2108 ; AVX-LABEL: combine_undef_input_test3: 2109 ; AVX: # %bb.0: 2110 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2111 ; AVX-NEXT: retq 2112 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2113 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2114 ret <4 x float> %2 2115 } 2116 2117 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2118 ; SSE-LABEL: combine_undef_input_test4: 2119 ; SSE: # %bb.0: 2120 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2121 ; SSE-NEXT: retq 2122 ; 2123 ; AVX-LABEL: combine_undef_input_test4: 2124 ; AVX: # %bb.0: 2125 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2126 ; AVX-NEXT: retq 2127 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2128 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2129 ret <4 x float> %2 2130 } 2131 2132 define <4 x float> 
@combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2133 ; SSE2-LABEL: combine_undef_input_test5: 2134 ; SSE2: # %bb.0: 2135 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2136 ; SSE2-NEXT: movapd %xmm1, %xmm0 2137 ; SSE2-NEXT: retq 2138 ; 2139 ; SSSE3-LABEL: combine_undef_input_test5: 2140 ; SSSE3: # %bb.0: 2141 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2142 ; SSSE3-NEXT: movapd %xmm1, %xmm0 2143 ; SSSE3-NEXT: retq 2144 ; 2145 ; SSE41-LABEL: combine_undef_input_test5: 2146 ; SSE41: # %bb.0: 2147 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2148 ; SSE41-NEXT: retq 2149 ; 2150 ; AVX-LABEL: combine_undef_input_test5: 2151 ; AVX: # %bb.0: 2152 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2153 ; AVX-NEXT: retq 2154 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2155 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2156 ret <4 x float> %2 2157 } 2158 2159 2160 ; Verify that we fold shuffles according to rule: 2161 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2162 2163 define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2164 ; ALL-LABEL: combine_undef_input_test6: 2165 ; ALL: # %bb.0: 2166 ; ALL-NEXT: retq 2167 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2168 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2169 ret <4 x float> %2 2170 } 2171 2172 define <4 x float> @combine_undef_input_test7(<4 x float> %a) { 2173 ; SSE2-LABEL: combine_undef_input_test7: 2174 ; SSE2: # %bb.0: 2175 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2176 ; SSE2-NEXT: retq 2177 ; 2178 ; SSSE3-LABEL: combine_undef_input_test7: 2179 ; SSSE3: # %bb.0: 2180 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2181 ; SSSE3-NEXT: retq 2182 ; 2183 ; SSE41-LABEL: combine_undef_input_test7: 2184 ; SSE41: # %bb.0: 2185 ; SSE41-NEXT: movddup {{.*#+}} xmm0 
= xmm0[0,0] 2186 ; SSE41-NEXT: retq 2187 ; 2188 ; AVX-LABEL: combine_undef_input_test7: 2189 ; AVX: # %bb.0: 2190 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2191 ; AVX-NEXT: retq 2192 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2193 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2194 ret <4 x float> %2 2195 } 2196 2197 define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2198 ; SSE2-LABEL: combine_undef_input_test8: 2199 ; SSE2: # %bb.0: 2200 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2201 ; SSE2-NEXT: retq 2202 ; 2203 ; SSSE3-LABEL: combine_undef_input_test8: 2204 ; SSSE3: # %bb.0: 2205 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2206 ; SSSE3-NEXT: retq 2207 ; 2208 ; SSE41-LABEL: combine_undef_input_test8: 2209 ; SSE41: # %bb.0: 2210 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2211 ; SSE41-NEXT: retq 2212 ; 2213 ; AVX-LABEL: combine_undef_input_test8: 2214 ; AVX: # %bb.0: 2215 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2216 ; AVX-NEXT: retq 2217 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2218 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2219 ret <4 x float> %2 2220 } 2221 2222 define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2223 ; SSE-LABEL: combine_undef_input_test9: 2224 ; SSE: # %bb.0: 2225 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 2226 ; SSE-NEXT: retq 2227 ; 2228 ; AVX-LABEL: combine_undef_input_test9: 2229 ; AVX: # %bb.0: 2230 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2231 ; AVX-NEXT: retq 2232 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2233 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2234 ret <4 x float> %2 2235 } 2236 2237 define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2238 ; ALL-LABEL: 
combine_undef_input_test10: 2239 ; ALL: # %bb.0: 2240 ; ALL-NEXT: retq 2241 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2242 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2243 ret <4 x float> %2 2244 } 2245 2246 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2247 ; SSE2-LABEL: combine_undef_input_test11: 2248 ; SSE2: # %bb.0: 2249 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2250 ; SSE2-NEXT: retq 2251 ; 2252 ; SSSE3-LABEL: combine_undef_input_test11: 2253 ; SSSE3: # %bb.0: 2254 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2255 ; SSSE3-NEXT: retq 2256 ; 2257 ; SSE41-LABEL: combine_undef_input_test11: 2258 ; SSE41: # %bb.0: 2259 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2260 ; SSE41-NEXT: retq 2261 ; 2262 ; AVX-LABEL: combine_undef_input_test11: 2263 ; AVX: # %bb.0: 2264 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2265 ; AVX-NEXT: retq 2266 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2267 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2268 ret <4 x float> %2 2269 } 2270 2271 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2272 ; SSE-LABEL: combine_undef_input_test12: 2273 ; SSE: # %bb.0: 2274 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2275 ; SSE-NEXT: retq 2276 ; 2277 ; AVX-LABEL: combine_undef_input_test12: 2278 ; AVX: # %bb.0: 2279 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2280 ; AVX-NEXT: retq 2281 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2282 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2283 ret <4 x float> %2 2284 } 2285 2286 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2287 ; SSE-LABEL: combine_undef_input_test13: 2288 ; 
SSE: # %bb.0: 2289 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2290 ; SSE-NEXT: retq 2291 ; 2292 ; AVX-LABEL: combine_undef_input_test13: 2293 ; AVX: # %bb.0: 2294 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2295 ; AVX-NEXT: retq 2296 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2297 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2298 ret <4 x float> %2 2299 } 2300 2301 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2302 ; SSE-LABEL: combine_undef_input_test14: 2303 ; SSE: # %bb.0: 2304 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2305 ; SSE-NEXT: retq 2306 ; 2307 ; AVX-LABEL: combine_undef_input_test14: 2308 ; AVX: # %bb.0: 2309 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2310 ; AVX-NEXT: retq 2311 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2312 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2313 ret <4 x float> %2 2314 } 2315 2316 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2317 ; SSE2-LABEL: combine_undef_input_test15: 2318 ; SSE2: # %bb.0: 2319 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2320 ; SSE2-NEXT: movapd %xmm1, %xmm0 2321 ; SSE2-NEXT: retq 2322 ; 2323 ; SSSE3-LABEL: combine_undef_input_test15: 2324 ; SSSE3: # %bb.0: 2325 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2326 ; SSSE3-NEXT: movapd %xmm1, %xmm0 2327 ; SSSE3-NEXT: retq 2328 ; 2329 ; SSE41-LABEL: combine_undef_input_test15: 2330 ; SSE41: # %bb.0: 2331 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2332 ; SSE41-NEXT: retq 2333 ; 2334 ; AVX-LABEL: combine_undef_input_test15: 2335 ; AVX: # %bb.0: 2336 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2337 ; AVX-NEXT: retq 2338 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2339 %2 = 
shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2340 ret <4 x float> %2 2341 } 2342 2343 2344 ; Verify that shuffles are canonicalized according to rules: 2345 ; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2346 ; 2347 ; This allows to trigger the following combine rule: 2348 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2349 ; 2350 ; As a result, all the shuffle pairs in each function below should be 2351 ; combined into a single legal shuffle operation. 2352 2353 define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2354 ; ALL-LABEL: combine_undef_input_test16: 2355 ; ALL: # %bb.0: 2356 ; ALL-NEXT: retq 2357 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2358 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2359 ret <4 x float> %2 2360 } 2361 2362 define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2363 ; SSE2-LABEL: combine_undef_input_test17: 2364 ; SSE2: # %bb.0: 2365 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2366 ; SSE2-NEXT: retq 2367 ; 2368 ; SSSE3-LABEL: combine_undef_input_test17: 2369 ; SSSE3: # %bb.0: 2370 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2371 ; SSSE3-NEXT: retq 2372 ; 2373 ; SSE41-LABEL: combine_undef_input_test17: 2374 ; SSE41: # %bb.0: 2375 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2376 ; SSE41-NEXT: retq 2377 ; 2378 ; AVX-LABEL: combine_undef_input_test17: 2379 ; AVX: # %bb.0: 2380 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2381 ; AVX-NEXT: retq 2382 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2383 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2384 ret <4 x float> %2 2385 } 2386 2387 define <4 x float> @combine_undef_input_test18(<4 x float> %a) { 2388 ; SSE2-LABEL: combine_undef_input_test18: 2389 ; SSE2: # %bb.0: 2390 ; SSE2-NEXT: movlhps 
{{.*#+}} xmm0 = xmm0[0,0] 2391 ; SSE2-NEXT: retq 2392 ; 2393 ; SSSE3-LABEL: combine_undef_input_test18: 2394 ; SSSE3: # %bb.0: 2395 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2396 ; SSSE3-NEXT: retq 2397 ; 2398 ; SSE41-LABEL: combine_undef_input_test18: 2399 ; SSE41: # %bb.0: 2400 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2401 ; SSE41-NEXT: retq 2402 ; 2403 ; AVX-LABEL: combine_undef_input_test18: 2404 ; AVX: # %bb.0: 2405 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2406 ; AVX-NEXT: retq 2407 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2408 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2409 ret <4 x float> %2 2410 } 2411 2412 define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2413 ; SSE-LABEL: combine_undef_input_test19: 2414 ; SSE: # %bb.0: 2415 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 2416 ; SSE-NEXT: retq 2417 ; 2418 ; AVX-LABEL: combine_undef_input_test19: 2419 ; AVX: # %bb.0: 2420 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2421 ; AVX-NEXT: retq 2422 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2423 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2424 ret <4 x float> %2 2425 } 2426 2427 define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2428 ; ALL-LABEL: combine_undef_input_test20: 2429 ; ALL: # %bb.0: 2430 ; ALL-NEXT: retq 2431 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2432 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2433 ret <4 x float> %2 2434 } 2435 2436 ; These tests are designed to test the ability to combine away unnecessary 2437 ; operations feeding into a shuffle. 
The AVX cases are the important ones as 2438 ; they leverage operations which cannot be done naturally on the entire vector 2439 ; and thus are decomposed into multiple smaller operations. 2440 2441 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { 2442 ; SSE-LABEL: combine_unneeded_subvector1: 2443 ; SSE: # %bb.0: 2444 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2445 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] 2446 ; SSE-NEXT: movdqa %xmm0, %xmm1 2447 ; SSE-NEXT: retq 2448 ; 2449 ; AVX1-LABEL: combine_unneeded_subvector1: 2450 ; AVX1: # %bb.0: 2451 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2452 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2453 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2454 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2455 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2456 ; AVX1-NEXT: retq 2457 ; 2458 ; AVX2-SLOW-LABEL: combine_unneeded_subvector1: 2459 ; AVX2-SLOW: # %bb.0: 2460 ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2461 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2462 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2463 ; AVX2-SLOW-NEXT: retq 2464 ; 2465 ; AVX2-FAST-LABEL: combine_unneeded_subvector1: 2466 ; AVX2-FAST: # %bb.0: 2467 ; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2468 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] 2469 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2470 ; AVX2-FAST-NEXT: retq 2471 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2472 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> 2473 ret <8 x i32> %c 2474 } 2475 2476 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { 2477 ; SSE-LABEL: combine_unneeded_subvector2: 2478 ; SSE: # %bb.0: 2479 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2480 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] 2481 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] 2482 ; 
SSE-NEXT: retq 2483 ; 2484 ; AVX1-LABEL: combine_unneeded_subvector2: 2485 ; AVX1: # %bb.0: 2486 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2487 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2488 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2489 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2490 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2491 ; AVX1-NEXT: retq 2492 ; 2493 ; AVX2-LABEL: combine_unneeded_subvector2: 2494 ; AVX2: # %bb.0: 2495 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2496 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2497 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2498 ; AVX2-NEXT: retq 2499 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2500 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> 2501 ret <8 x i32> %d 2502 } 2503 2504 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { 2505 ; SSE2-LABEL: combine_insertps1: 2506 ; SSE2: # %bb.0: 2507 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2508 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2509 ; SSE2-NEXT: movaps %xmm1, %xmm0 2510 ; SSE2-NEXT: retq 2511 ; 2512 ; SSSE3-LABEL: combine_insertps1: 2513 ; SSSE3: # %bb.0: 2514 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2515 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2516 ; SSSE3-NEXT: movaps %xmm1, %xmm0 2517 ; SSSE3-NEXT: retq 2518 ; 2519 ; SSE41-LABEL: combine_insertps1: 2520 ; SSE41: # %bb.0: 2521 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2522 ; SSE41-NEXT: retq 2523 ; 2524 ; AVX-LABEL: combine_insertps1: 2525 ; AVX: # %bb.0: 2526 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2527 ; AVX-NEXT: retq 2528 2529 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> 2530 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 
2531 ret <4 x float> %d 2532 } 2533 2534 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { 2535 ; SSE2-LABEL: combine_insertps2: 2536 ; SSE2: # %bb.0: 2537 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2538 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2539 ; SSE2-NEXT: movaps %xmm1, %xmm0 2540 ; SSE2-NEXT: retq 2541 ; 2542 ; SSSE3-LABEL: combine_insertps2: 2543 ; SSSE3: # %bb.0: 2544 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2545 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2546 ; SSSE3-NEXT: movaps %xmm1, %xmm0 2547 ; SSSE3-NEXT: retq 2548 ; 2549 ; SSE41-LABEL: combine_insertps2: 2550 ; SSE41: # %bb.0: 2551 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2552 ; SSE41-NEXT: retq 2553 ; 2554 ; AVX-LABEL: combine_insertps2: 2555 ; AVX: # %bb.0: 2556 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2557 ; AVX-NEXT: retq 2558 2559 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> 2560 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2561 ret <4 x float> %d 2562 } 2563 2564 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { 2565 ; SSE2-LABEL: combine_insertps3: 2566 ; SSE2: # %bb.0: 2567 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2568 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2569 ; SSE2-NEXT: retq 2570 ; 2571 ; SSSE3-LABEL: combine_insertps3: 2572 ; SSSE3: # %bb.0: 2573 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2574 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2575 ; SSSE3-NEXT: retq 2576 ; 2577 ; SSE41-LABEL: combine_insertps3: 2578 ; SSE41: # %bb.0: 2579 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2580 ; SSE41-NEXT: retq 2581 ; 2582 ; AVX-LABEL: combine_insertps3: 2583 ; AVX: # %bb.0: 2584 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2585 ; AVX-NEXT: retq 2586 2587 %c = 
shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2588 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2589 ret <4 x float> %d 2590 } 2591 2592 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2593 ; SSE2-LABEL: combine_insertps4: 2594 ; SSE2: # %bb.0: 2595 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] 2596 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2597 ; SSE2-NEXT: retq 2598 ; 2599 ; SSSE3-LABEL: combine_insertps4: 2600 ; SSSE3: # %bb.0: 2601 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] 2602 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2603 ; SSSE3-NEXT: retq 2604 ; 2605 ; SSE41-LABEL: combine_insertps4: 2606 ; SSE41: # %bb.0: 2607 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2608 ; SSE41-NEXT: retq 2609 ; 2610 ; AVX-LABEL: combine_insertps4: 2611 ; AVX: # %bb.0: 2612 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2613 ; AVX-NEXT: retq 2614 2615 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2616 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2617 ret <4 x float> %d 2618 } 2619 2620 define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) { 2621 ; SSE-LABEL: combine_scalar_load_with_blend_with_zero: 2622 ; SSE: # %bb.0: 2623 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2624 ; SSE-NEXT: movaps %xmm0, (%rsi) 2625 ; SSE-NEXT: retq 2626 ; 2627 ; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2628 ; AVX: # %bb.0: 2629 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2630 ; AVX-NEXT: vmovaps %xmm0, (%rsi) 2631 ; AVX-NEXT: retq 2632 %1 = load double, double* %a0, align 8 2633 %2 = insertelement <2 x double> undef, double %1, i32 0 2634 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1 2635 %4 = bitcast <2 x double> %3 to <4 x float> 2636 %5 = shufflevector <4 x float> %4, <4 
x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 2637 store <4 x float> %5, <4 x float>* %a1, align 16 2638 ret void 2639 } 2640 2641 ; PR30371 2642 define <4 x float> @combine_constant_insertion_v4f32(float %f) { 2643 ; SSE2-LABEL: combine_constant_insertion_v4f32: 2644 ; SSE2: # %bb.0: 2645 ; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4,5,3> 2646 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2647 ; SSE2-NEXT: movaps %xmm1, %xmm0 2648 ; SSE2-NEXT: retq 2649 ; 2650 ; SSSE3-LABEL: combine_constant_insertion_v4f32: 2651 ; SSSE3: # %bb.0: 2652 ; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4,5,3> 2653 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2654 ; SSSE3-NEXT: movaps %xmm1, %xmm0 2655 ; SSSE3-NEXT: retq 2656 ; 2657 ; SSE41-LABEL: combine_constant_insertion_v4f32: 2658 ; SSE41: # %bb.0: 2659 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2660 ; SSE41-NEXT: retq 2661 ; 2662 ; AVX-LABEL: combine_constant_insertion_v4f32: 2663 ; AVX: # %bb.0: 2664 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2665 ; AVX-NEXT: retq 2666 %a0 = insertelement <4 x float> undef, float %f, i32 0 2667 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2668 ret <4 x float> %ret 2669 } 2670 2671 define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { 2672 ; SSE2-LABEL: combine_constant_insertion_v4i32: 2673 ; SSE2: # %bb.0: 2674 ; SSE2-NEXT: movd %edi, %xmm1 2675 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2676 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2677 ; SSE2-NEXT: retq 2678 ; 2679 ; SSSE3-LABEL: combine_constant_insertion_v4i32: 2680 ; SSSE3: # %bb.0: 2681 ; SSSE3-NEXT: movd %edi, %xmm1 2682 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2683 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2684 ; SSSE3-NEXT: retq 2685 ; 2686 ; SSE41-LABEL: combine_constant_insertion_v4i32: 2687 ; SSE41: # 
%bb.0: 2688 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30> 2689 ; SSE41-NEXT: pinsrd $0, %edi, %xmm0 2690 ; SSE41-NEXT: retq 2691 ; 2692 ; AVX-LABEL: combine_constant_insertion_v4i32: 2693 ; AVX: # %bb.0: 2694 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30> 2695 ; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 2696 ; AVX-NEXT: retq 2697 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 2698 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2699 ret <4 x i32> %ret 2700 } 2701 2702 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2703 ; SSE-LABEL: PR22377: 2704 ; SSE: # %bb.0: # %entry 2705 ; SSE-NEXT: movaps %xmm0, %xmm1 2706 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] 2707 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2708 ; SSE-NEXT: addps %xmm0, %xmm1 2709 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2710 ; SSE-NEXT: retq 2711 ; 2712 ; AVX-LABEL: PR22377: 2713 ; AVX: # %bb.0: # %entry 2714 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] 2715 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2716 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 2717 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2718 ; AVX-NEXT: retq 2719 entry: 2720 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2721 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2722 %r2 = fadd <4 x float> %s1, %s2 2723 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2724 ret <4 x float> %s3 2725 } 2726 2727 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2728 ; SSE2-LABEL: PR22390: 2729 ; SSE2: # %bb.0: # %entry 2730 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2731 ; SSE2-NEXT: movaps %xmm0, %xmm2 2732 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2733 ; SSE2-NEXT: addps %xmm0, %xmm2 2734 ; SSE2-NEXT: movaps 
%xmm2, %xmm0 2735 ; SSE2-NEXT: retq 2736 ; 2737 ; SSSE3-LABEL: PR22390: 2738 ; SSSE3: # %bb.0: # %entry 2739 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2740 ; SSSE3-NEXT: movaps %xmm0, %xmm2 2741 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2742 ; SSSE3-NEXT: addps %xmm0, %xmm2 2743 ; SSSE3-NEXT: movaps %xmm2, %xmm0 2744 ; SSSE3-NEXT: retq 2745 ; 2746 ; SSE41-LABEL: PR22390: 2747 ; SSE41: # %bb.0: # %entry 2748 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2749 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2750 ; SSE41-NEXT: addps %xmm1, %xmm0 2751 ; SSE41-NEXT: retq 2752 ; 2753 ; AVX-LABEL: PR22390: 2754 ; AVX: # %bb.0: # %entry 2755 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2756 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2757 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2758 ; AVX-NEXT: retq 2759 entry: 2760 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2761 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2762 %r2 = fadd <4 x float> %s1, %s2 2763 ret <4 x float> %r2 2764 } 2765 2766 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2767 ; SSE2-LABEL: PR22412: 2768 ; SSE2: # %bb.0: # %entry 2769 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2770 ; SSE2-NEXT: movapd %xmm2, %xmm0 2771 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2772 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] 2773 ; SSE2-NEXT: movaps %xmm3, %xmm1 2774 ; SSE2-NEXT: retq 2775 ; 2776 ; SSSE3-LABEL: PR22412: 2777 ; SSSE3: # %bb.0: # %entry 2778 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2779 ; SSSE3-NEXT: movapd %xmm2, %xmm0 2780 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2781 ; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] 2782 ; SSSE3-NEXT: movaps %xmm3, %xmm1 2783 ; SSSE3-NEXT: retq 2784 ; 2785 ; SSE41-LABEL: PR22412: 2786 ; SSE41: # %bb.0: # %entry 2787 ; SSE41-NEXT: blendps {{.*#+}} 
xmm0 = xmm0[0,1],xmm2[2,3] 2788 ; SSE41-NEXT: movaps %xmm0, %xmm1 2789 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2] 2790 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2] 2791 ; SSE41-NEXT: movaps %xmm1, %xmm0 2792 ; SSE41-NEXT: movaps %xmm3, %xmm1 2793 ; SSE41-NEXT: retq 2794 ; 2795 ; AVX1-LABEL: PR22412: 2796 ; AVX1: # %bb.0: # %entry 2797 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 2798 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 2799 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2800 ; AVX1-NEXT: retq 2801 ; 2802 ; AVX2-SLOW-LABEL: PR22412: 2803 ; AVX2-SLOW: # %bb.0: # %entry 2804 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 2805 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] 2806 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] 2807 ; AVX2-SLOW-NEXT: retq 2808 ; 2809 ; AVX2-FAST-LABEL: PR22412: 2810 ; AVX2-FAST: # %bb.0: # %entry 2811 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 2812 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2] 2813 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 2814 ; AVX2-FAST-NEXT: retq 2815 entry: 2816 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2817 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2818 ret <8 x float> %s2 2819 } 2820 2821 define <4 x float> @PR30264(<4 x float> %x) { 2822 ; SSE2-LABEL: PR30264: 2823 ; SSE2: # %bb.0: 2824 ; SSE2-NEXT: xorps %xmm1, %xmm1 2825 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 2826 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2827 ; SSE2-NEXT: movaps %xmm1, %xmm0 2828 ; SSE2-NEXT: retq 2829 ; 2830 ; SSSE3-LABEL: PR30264: 2831 ; SSSE3: # %bb.0: 2832 ; SSSE3-NEXT: xorps %xmm1, %xmm1 2833 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 2834 ; 
SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2835 ; SSSE3-NEXT: movaps %xmm1, %xmm0 2836 ; SSSE3-NEXT: retq 2837 ; 2838 ; SSE41-LABEL: PR30264: 2839 ; SSE41: # %bb.0: 2840 ; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4,1> 2841 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] 2842 ; SSE41-NEXT: movaps %xmm1, %xmm0 2843 ; SSE41-NEXT: retq 2844 ; 2845 ; AVX-LABEL: PR30264: 2846 ; AVX: # %bb.0: 2847 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4,1> 2848 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] 2849 ; AVX-NEXT: retq 2850 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2851 %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2852 ret <4 x float> %shuf2 2853 } 2854