; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,INT256,AVX512VL,VLVBMI

; Variable shuffle of <4 x i64>: each output lane is selected from %v by the
; runtime index in the corresponding lane of %indices.
define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

; Variable shuffle of <8 x i32> driven by per-lane runtime indices.
define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

; Variable shuffle of <16 x i16> driven by per-lane runtime indices.
define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

; Variable shuffle of <32 x i8> driven by per-lane runtime indices.
define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT: vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI: # %bb.0:
; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT: retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

; Variable shuffle of <4 x double>; indices are <4 x i64>.
define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

; Variable shuffle of <8 x float>; indices are <8 x i32>.
define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # %bb.0:
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

;
; PR35820 - Unequal source/destination vector sizes
;

; Gather 4 lanes out of a narrower <2 x i64> source into a <4 x i64> result.
define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

; Gather 8 lanes out of a narrower <4 x i32> source into an <8 x i32> result.
define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP: # %bb.0: # %entry
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256: # %bb.0: # %entry
; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

; NOTE(review): the remainder of this function's definition lies beyond the
; visible end of this chunk; CHECK lines below are reproduced up to the
; truncation point only.
define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2 754 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 755 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 756 ; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1 757 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 758 ; AVX512-NEXT: retq 759 ; 760 ; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16: 761 ; AVX512VLDQ: # %bb.0: 762 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 763 ; AVX512VLDQ-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 764 ; AVX512VLDQ-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 765 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 766 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 767 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 768 ; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1 769 ; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 770 ; AVX512VLDQ-NEXT: retq 771 ; 772 ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16: 773 ; AVX512VLBW: # %bb.0: 774 ; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 775 ; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0 776 ; AVX512VLBW-NEXT: retq 777 ; 778 ; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16: 779 ; VLVBMI: # %bb.0: 780 ; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 781 ; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0 782 ; VLVBMI-NEXT: retq 783 %index0 = extractelement <16 x i16> %indices, i32 0 784 %index1 = extractelement <16 x i16> %indices, i32 1 785 %index2 = extractelement <16 x i16> %indices, i32 2 786 %index3 = extractelement <16 x i16> %indices, i32 3 787 %index4 = extractelement <16 x i16> %indices, i32 4 788 %index5 = extractelement <16 x i16> %indices, i32 5 789 %index6 = extractelement <16 x i16> %indices, i32 6 790 %index7 = extractelement <16 x i16> %indices, i32 7 791 %index8 = extractelement <16 x i16> %indices, i32 8 792 %index9 = extractelement <16 x i16> %indices, i32 9 793 %index10 = extractelement <16 x i16> %indices, i32 10 794 %index11 = extractelement <16 x i16> %indices, i32 11 795 %index12 = 
extractelement <16 x i16> %indices, i32 12 796 %index13 = extractelement <16 x i16> %indices, i32 13 797 %index14 = extractelement <16 x i16> %indices, i32 14 798 %index15 = extractelement <16 x i16> %indices, i32 15 799 %v0 = extractelement <8 x i16> %v, i16 %index0 800 %v1 = extractelement <8 x i16> %v, i16 %index1 801 %v2 = extractelement <8 x i16> %v, i16 %index2 802 %v3 = extractelement <8 x i16> %v, i16 %index3 803 %v4 = extractelement <8 x i16> %v, i16 %index4 804 %v5 = extractelement <8 x i16> %v, i16 %index5 805 %v6 = extractelement <8 x i16> %v, i16 %index6 806 %v7 = extractelement <8 x i16> %v, i16 %index7 807 %v8 = extractelement <8 x i16> %v, i16 %index8 808 %v9 = extractelement <8 x i16> %v, i16 %index9 809 %v10 = extractelement <8 x i16> %v, i16 %index10 810 %v11 = extractelement <8 x i16> %v, i16 %index11 811 %v12 = extractelement <8 x i16> %v, i16 %index12 812 %v13 = extractelement <8 x i16> %v, i16 %index13 813 %v14 = extractelement <8 x i16> %v, i16 %index14 814 %v15 = extractelement <8 x i16> %v, i16 %index15 815 %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0 816 %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1 817 %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2 818 %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3 819 %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4 820 %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5 821 %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6 822 %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7 823 %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8 824 %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9 825 %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10 826 %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11 827 %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12 828 %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13 829 %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14 830 %ret15 = insertelement <16 x i16> %ret14, i16 
%v15, i32 15 831 ret <16 x i16> %ret15 832 } 833 834 define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind { 835 ; XOP-LABEL: var_shuffle_v32i8_from_v16i8: 836 ; XOP: # %bb.0: 837 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 838 ; XOP-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm2 839 ; XOP-NEXT: vpperm %xmm1, %xmm0, %xmm0, %xmm0 840 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 841 ; XOP-NEXT: retq 842 ; 843 ; AVX1-LABEL: var_shuffle_v32i8_from_v16i8: 844 ; AVX1: # %bb.0: 845 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 846 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 847 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 848 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5 849 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 850 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm2, %xmm2 851 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3 852 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm4 853 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 854 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm0, %xmm0 855 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 856 ; AVX1-NEXT: retq 857 ; 858 ; AVX2-LABEL: var_shuffle_v32i8_from_v16i8: 859 ; AVX2: # %bb.0: 860 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 861 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 862 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 863 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 864 ; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1 865 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 866 ; AVX2-NEXT: retq 867 ; 868 ; AVX512-LABEL: var_shuffle_v32i8_from_v16i8: 869 ; AVX512: # %bb.0: 870 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 871 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2 872 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 873 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 874 ; AVX512-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1 875 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 876 ; AVX512-NEXT: retq 877 ; 878 ; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8: 879 ; 
AVX512VLDQ: # %bb.0: 880 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 881 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 882 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 883 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 884 ; AVX512VLDQ-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1 885 ; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 886 ; AVX512VLDQ-NEXT: retq 887 ; 888 ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8: 889 ; AVX512VLBW: # %bb.0: 890 ; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 891 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 892 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 893 ; AVX512VLBW-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %k1 894 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} 895 ; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0 896 ; AVX512VLBW-NEXT: retq 897 ; 898 ; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8: 899 ; VLVBMI: # %bb.0: 900 ; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 901 ; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 902 ; VLVBMI-NEXT: retq 903 %index0 = extractelement <32 x i8> %indices, i32 0 904 %index1 = extractelement <32 x i8> %indices, i32 1 905 %index2 = extractelement <32 x i8> %indices, i32 2 906 %index3 = extractelement <32 x i8> %indices, i32 3 907 %index4 = extractelement <32 x i8> %indices, i32 4 908 %index5 = extractelement <32 x i8> %indices, i32 5 909 %index6 = extractelement <32 x i8> %indices, i32 6 910 %index7 = extractelement <32 x i8> %indices, i32 7 911 %index8 = extractelement <32 x i8> %indices, i32 8 912 %index9 = extractelement <32 x i8> %indices, i32 9 913 %index10 = extractelement <32 x i8> %indices, i32 10 914 %index11 = extractelement <32 x i8> %indices, i32 11 915 %index12 = extractelement <32 x i8> %indices, i32 12 916 %index13 = extractelement <32 x i8> %indices, i32 13 917 %index14 = extractelement <32 x i8> %indices, i32 14 918 %index15 = extractelement <32 x i8> %indices, i32 15 919 %index16 = extractelement <32 x i8> %indices, i32 16 
920 %index17 = extractelement <32 x i8> %indices, i32 17 921 %index18 = extractelement <32 x i8> %indices, i32 18 922 %index19 = extractelement <32 x i8> %indices, i32 19 923 %index20 = extractelement <32 x i8> %indices, i32 20 924 %index21 = extractelement <32 x i8> %indices, i32 21 925 %index22 = extractelement <32 x i8> %indices, i32 22 926 %index23 = extractelement <32 x i8> %indices, i32 23 927 %index24 = extractelement <32 x i8> %indices, i32 24 928 %index25 = extractelement <32 x i8> %indices, i32 25 929 %index26 = extractelement <32 x i8> %indices, i32 26 930 %index27 = extractelement <32 x i8> %indices, i32 27 931 %index28 = extractelement <32 x i8> %indices, i32 28 932 %index29 = extractelement <32 x i8> %indices, i32 29 933 %index30 = extractelement <32 x i8> %indices, i32 30 934 %index31 = extractelement <32 x i8> %indices, i32 31 935 %v0 = extractelement <16 x i8> %v, i8 %index0 936 %v1 = extractelement <16 x i8> %v, i8 %index1 937 %v2 = extractelement <16 x i8> %v, i8 %index2 938 %v3 = extractelement <16 x i8> %v, i8 %index3 939 %v4 = extractelement <16 x i8> %v, i8 %index4 940 %v5 = extractelement <16 x i8> %v, i8 %index5 941 %v6 = extractelement <16 x i8> %v, i8 %index6 942 %v7 = extractelement <16 x i8> %v, i8 %index7 943 %v8 = extractelement <16 x i8> %v, i8 %index8 944 %v9 = extractelement <16 x i8> %v, i8 %index9 945 %v10 = extractelement <16 x i8> %v, i8 %index10 946 %v11 = extractelement <16 x i8> %v, i8 %index11 947 %v12 = extractelement <16 x i8> %v, i8 %index12 948 %v13 = extractelement <16 x i8> %v, i8 %index13 949 %v14 = extractelement <16 x i8> %v, i8 %index14 950 %v15 = extractelement <16 x i8> %v, i8 %index15 951 %v16 = extractelement <16 x i8> %v, i8 %index16 952 %v17 = extractelement <16 x i8> %v, i8 %index17 953 %v18 = extractelement <16 x i8> %v, i8 %index18 954 %v19 = extractelement <16 x i8> %v, i8 %index19 955 %v20 = extractelement <16 x i8> %v, i8 %index20 956 %v21 = extractelement <16 x i8> %v, i8 %index21 957 %v22 = 
extractelement <16 x i8> %v, i8 %index22 958 %v23 = extractelement <16 x i8> %v, i8 %index23 959 %v24 = extractelement <16 x i8> %v, i8 %index24 960 %v25 = extractelement <16 x i8> %v, i8 %index25 961 %v26 = extractelement <16 x i8> %v, i8 %index26 962 %v27 = extractelement <16 x i8> %v, i8 %index27 963 %v28 = extractelement <16 x i8> %v, i8 %index28 964 %v29 = extractelement <16 x i8> %v, i8 %index29 965 %v30 = extractelement <16 x i8> %v, i8 %index30 966 %v31 = extractelement <16 x i8> %v, i8 %index31 967 %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0 968 %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1 969 %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2 970 %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3 971 %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4 972 %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5 973 %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6 974 %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7 975 %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8 976 %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9 977 %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10 978 %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11 979 %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12 980 %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13 981 %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14 982 %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15 983 %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16 984 %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17 985 %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18 986 %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19 987 %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20 988 %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21 989 %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22 990 %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23 991 %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24 992 
%ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25 993 %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26 994 %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27 995 %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28 996 %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29 997 %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30 998 %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31 999 ret <32 x i8> %ret31 1000 } 1001 1002 define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind { 1003 ; XOP-LABEL: var_shuffle_v4f64_from_v2f64: 1004 ; XOP: # %bb.0: 1005 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1006 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] 1007 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1008 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3 1009 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 1010 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1011 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1012 ; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0 1013 ; XOP-NEXT: retq 1014 ; 1015 ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64: 1016 ; AVX1: # %bb.0: 1017 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1018 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] 1019 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1020 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1021 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 1022 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 1023 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1024 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 1025 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1026 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] 1027 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 1028 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 1029 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1030 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 1031 ; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 1032 ; AVX1-NEXT: retq 1033 ; 1034 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64: 1035 ; 
AVX2: # %bb.0: 1036 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1037 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 1038 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] 1039 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 1040 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] 1041 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 1042 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] 1043 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 1044 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 1045 ; AVX2-NEXT: retq 1046 ; 1047 ; AVX512-LABEL: var_shuffle_v4f64_from_v2f64: 1048 ; AVX512: # %bb.0: 1049 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1050 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1051 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 1052 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1053 ; AVX512-NEXT: retq 1054 ; 1055 ; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64: 1056 ; AVX512VL: # %bb.0: 1057 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1058 ; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 1059 ; AVX512VL-NEXT: retq 1060 %index0 = extractelement <4 x i64> %indices, i32 0 1061 %index1 = extractelement <4 x i64> %indices, i32 1 1062 %index2 = extractelement <4 x i64> %indices, i32 2 1063 %index3 = extractelement <4 x i64> %indices, i32 3 1064 %v0 = extractelement <2 x double> %v, i64 %index0 1065 %v1 = extractelement <2 x double> %v, i64 %index1 1066 %v2 = extractelement <2 x double> %v, i64 %index2 1067 %v3 = extractelement <2 x double> %v, i64 %index3 1068 %ret0 = insertelement <4 x double> undef, double %v0, i32 0 1069 %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1 1070 %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2 1071 %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3 1072 ret <4 x double> %ret3 1073 } 1074 1075 define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind { 1076 ; XOP-LABEL: var_shuffle_v8f32_from_v4f32: 1077 ; XOP: # 
%bb.0: # %entry 1078 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1079 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] 1080 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1081 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0 1082 ; XOP-NEXT: retq 1083 ; 1084 ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32: 1085 ; AVX1: # %bb.0: # %entry 1086 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1087 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] 1088 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1089 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1090 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 1091 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 1092 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1093 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] 1094 ; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 1095 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1096 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 1097 ; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 1098 ; AVX1-NEXT: retq 1099 ; 1100 ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: 1101 ; INT256: # %bb.0: # %entry 1102 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1103 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 1104 ; INT256-NEXT: retq 1105 entry: 1106 %tmp1 = extractelement <8 x i32> %indices, i32 0 1107 %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1 1108 %tmp2 = extractelement <8 x i32> %indices, i32 1 1109 %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2 1110 %tmp3 = extractelement <8 x i32> %indices, i32 2 1111 %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3 1112 %tmp4 = extractelement <8 x i32> %indices, i32 3 1113 %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4 1114 %tmp5 = extractelement <8 x i32> %indices, i32 4 1115 %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5 1116 %tmp6 = extractelement <8 x i32> %indices, i32 5 1117 %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6 1118 %tmp7 = extractelement <8 x i32> %indices, i32 6 1119 %vecext2.14 = 
extractelement <4 x float> %v, i32 %tmp7 1120 %tmp8 = extractelement <8 x i32> %indices, i32 7 1121 %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8 1122 %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0 1123 %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1 1124 %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2 1125 %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3 1126 %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4 1127 %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5 1128 %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6 1129 %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7 1130 ret <8 x float> %tmp16 1131 } 1132 1133 define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind { 1134 ; XOP-LABEL: var_shuffle_v4i32_from_v8i32: 1135 ; XOP: # %bb.0: # %entry 1136 ; XOP-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1137 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] 1138 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1139 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0 1140 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1141 ; XOP-NEXT: vzeroupper 1142 ; XOP-NEXT: retq 1143 ; 1144 ; AVX1-LABEL: var_shuffle_v4i32_from_v8i32: 1145 ; AVX1: # %bb.0: # %entry 1146 ; AVX1-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1147 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] 1148 ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 1149 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1150 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 1151 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3] 1152 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 1153 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm4 1154 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 1155 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 1156 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 1157 ; AVX1-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 1158 ; AVX1-NEXT: vzeroupper 1159 ; AVX1-NEXT: retq 1160 ; 1161 ; INT256-LABEL: var_shuffle_v4i32_from_v8i32: 1162 ; INT256: # %bb.0: # %entry 1163 ; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1164 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 1165 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1166 ; INT256-NEXT: vzeroupper 1167 ; INT256-NEXT: retq 1168 entry: 1169 %tmp1 = extractelement <4 x i32> %indices, i32 0 1170 %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1 1171 %tmp2 = extractelement <4 x i32> %indices, i32 1 1172 %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2 1173 %tmp3 = extractelement <4 x i32> %indices, i32 2 1174 %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3 1175 %tmp4 = extractelement <4 x i32> %indices, i32 3 1176 %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4 1177 %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0 1178 %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1 1179 %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2 1180 %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3 1181 ret <4 x i32> %tmp12 1182 } 1183