; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ

; Codegen checks for shufflevector on vectors of i1. Every function is
; compiled twice: once with only +avx512f enabled, and once with
; +avx512bw, +avx512vl and +avx512dq enabled.

target triple = "x86_64-unknown-unknown"

; Swap the two lanes of a <2 x i1> argument (mask <1, 0>).
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %swapped = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %swapped
}

; Two-operand shuffle: result lane 0 is element 1 of %a, result lane 1 is
; element 0 of the constant vector <1, 0> (mask index 2).
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: movl $1, %eax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: movb $1, %al
; VL_BW_DQ-NEXT: kmovb %eax, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %mixed = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %mixed
}


; Reverse the four lanes of a <4 x i1> argument (mask <3, 2, 1, 0>).
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %reversed = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %reversed
}

; Shuffle of two <8 x i1> compare results. Every mask index is below 8, so
; each result lane is taken from the first compare (%a == %a1); the second
; compare is an operand of the shuffle but none of its lanes is selected.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %cmp.a = icmp eq <8 x i64> %a, %a1
  %cmp.b = icmp eq <8 x i64> %b, %b1
  %shuf = shufflevector <8 x i1> %cmp.a, <8 x i1> %cmp.b, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %shuf
}

; Shuffle of two <16 x i1> compare results. Mask indices 22 and 21 are >= 16
; and therefore select lanes of the second compare (%b == %b1); all other
; indices select lanes of the first compare (%a == %a1).
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %cmp.a = icmp eq <16 x i32> %a, %a1
  %cmp.b = icmp eq <16 x i32> %b, %b1
  %shuf = shufflevector <16 x i1> %cmp.a, <16 x i1> %cmp.b, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %shuf
}

; Single-input shuffle of a <32 x i1> argument; the 16-entry index pattern
; <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> is used for both halves of the mask.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT: retq
  %shuf = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %shuf
}

; Reinterpret an i8 as <8 x i1> and shuffle it with a mask whose defined
; entries are all index 2; the remaining mask entries are undef.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %shuf
}
; Shuffle the bits of an i8 (viewed as <8 x i1>) against an all-zero second
; operand. Mask indices 10 and 9 are >= 8 and so select lanes of the zero
; vector; indices 2 and 3 select bits of %a; the rest are undef. The result
; is packed back into an i8.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}

; Single-input shuffle of the bits of an i8: result lanes 0..3 take bits
; 0, 1, 4 and 5 of %a; lanes 4..7 are undef. Returned as an i8.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}

; Bits of %a are the first shuffle operand and an all-zero vector is the
; second. Mask index 9 (>= 8) selects a lane of the zero vector; every other
; index selects a bit of %a.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> %bits, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}

; Operand order is flipped relative to the previous test: the all-zero
; vector is the first shuffle operand and the bits of %a are the second.
; Only mask indices 9 and 10 (>= 8) select bits of %a; every other index
; selects a lane of the zero vector.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> zeroinitializer, <8 x i1> %bits, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}

; The first shuffle operand is the constant <1,1,0,0,1,1,0,0> and the second
; is the bits of %a. Only mask index 9 selects a bit of %a; the remaining
; indices select lanes of the constant.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_1, but
; the mask in the body has 0 (not 10) in the fourth position. Confirm which
; of the two was intended before regenerating the checks.
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: movzbl %al, %eax
; AVX512F-NEXT: kmovw %eax, %k2
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: movb $51, %al
; VL_BW_DQ-NEXT: kmovb %eax, %k1
; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i8 %a to <8 x i1>
  %shuf = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %bits, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}

; The first shuffle operand is an all-ones <8 x i1> constant and the second
; is the <8 x i1> argument. Only mask index 9 selects a lane of %a; every
; other index selects a lane of the all-ones constant.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_0, but
; the mask in the body has 0 (not 10) in the fourth position. Confirm which
; of the two was intended before regenerating the checks.
define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm3
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
  %shuf = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %packed = bitcast <8 x i1> %shuf to i8
  ret i8 %packed
}


; Splat bit 0 of an i16 (viewed as <16 x i1>) across all sixteen lanes using
; an all-zero shuffle mask, then return the result as an i16.
define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovw %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i16 %a to <16 x i1>
  %splat = shufflevector <16 x i1> %bits, <16 x i1> undef, <16 x i32> zeroinitializer
  %packed = bitcast <16 x i1> %splat to i16
  ret i16 %packed
}

; Splat lane 0 of an all-zero <64 x i1> vector and return it as an i64. The
; expected code is a zeroed mask register moved to the return register.
; Checks exist only for the run with +avx512bw/+avx512vl/+avx512dq.
; NOTE(review): the bitcast of %a below is never used; the shuffle reads
; zeroinitializer rather than the bitcast value, so the argument does not
; influence the result. Confirm whether the shuffle was meant to take the
; bitcast value as its first operand.
define i64 @shuf64i1_zero(i64 %a) {
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0
; VL_BW_DQ-NEXT: kmovq %k0, %rax
; VL_BW_DQ-NEXT: retq
  %bits = bitcast i64 %a to <64 x i1>
  %splat = shufflevector <64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
  %packed = bitcast <64 x i1> %splat to i64
  ret i64 %packed
}