; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2

;
; Variable Rotates
;

define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_rotate_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE2-NEXT: psubq %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm1, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: psrlq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE41-NEXT: psubq %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psllq %xmm1, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psllq %xmm1, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_rotate_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm1, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm1, %xmm4
; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm4, %xmm0
; X32-SSE-NEXT: retl
  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
  %shl = shl <2 x i64> %a, %b
  %lshr = lshr <2 x i64> %a, %b64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_rotate_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_rotate_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <4 x i32> %a, %b
  %lshr = lshr <4 x i32> %a, %b32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_rotate_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pslld $23, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pslld $23, %xmm3
; SSE41-NEXT: paddd %xmm2, %xmm3
; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmulhuw %xmm2, %xmm1
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_rotate_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X32-SSE-NEXT: pslld $23, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; X32-SSE-NEXT: paddd %xmm4, %xmm3
; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd %xmm4, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2
; X32-SSE-NEXT: pmullw %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <8 x i16> %a, %b
  %lshr = lshr <8 x i16> %a, %b16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_rotate_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: pand {{.*}}(%rip), %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: psrlw $6, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlw $7, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psllw $4, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: psllw $5, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $6, %xmm0
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psllw $2, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlw $7, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_rotate_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512F-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512VL-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX512VL-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpmovwb %ymm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm0, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: movdqa %xmm2, %xmm5
; X32-SSE-NEXT: psllw $4, %xmm5
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5
; X32-SSE-NEXT: por %xmm4, %xmm5
; X32-SSE-NEXT: pand %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm2, %xmm3
; X32-SSE-NEXT: por %xmm5, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm2
; X32-SSE-NEXT: psrlw $6, %xmm2
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm2, %xmm4
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm4
; X32-SSE-NEXT: pandn %xmm3, %xmm2
; X32-SSE-NEXT: por %xmm4, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: paddb %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: psrlw $7, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm3, %xmm4
; X32-SSE-NEXT: paddb %xmm1, %xmm1
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm0
; X32-SSE-NEXT: pand %xmm0, %xmm4
; X32-SSE-NEXT: pandn %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: retl
  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <16 x i8> %a, %b
  %lshr = lshr <16 x i8> %a, %b8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Uniform Variable Rotates
;

define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_rotate_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
; SSE-NEXT: psubq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq %xmm1, %xmm2
; SSE-NEXT: psrlq %xmm3, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: splatvar_rotate_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_rotate_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_rotate_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_rotate_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psllq %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq %xmm3, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
  %shl = shl <2 x i64> %a, %splat
  %lshr = lshr <2 x i64> %a, %splat64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pslld %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_rotate_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pslld %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_rotate_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpslld
%xmm2, %xmm0, %xmm2 806 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 807 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 808 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 809 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 810 ; AVX2-NEXT: retq 811 ; 812 ; AVX512F-LABEL: splatvar_rotate_v4i32: 813 ; AVX512F: # %bb.0: 814 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 815 ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 816 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 817 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 818 ; AVX512F-NEXT: vzeroupper 819 ; AVX512F-NEXT: retq 820 ; 821 ; AVX512VL-LABEL: splatvar_rotate_v4i32: 822 ; AVX512VL: # %bb.0: 823 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 824 ; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 825 ; AVX512VL-NEXT: retq 826 ; 827 ; AVX512BW-LABEL: splatvar_rotate_v4i32: 828 ; AVX512BW: # %bb.0: 829 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 830 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 831 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 832 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 833 ; AVX512BW-NEXT: vzeroupper 834 ; AVX512BW-NEXT: retq 835 ; 836 ; AVX512VLBW-LABEL: splatvar_rotate_v4i32: 837 ; AVX512VLBW: # %bb.0: 838 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 839 ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 840 ; AVX512VLBW-NEXT: retq 841 ; 842 ; XOPAVX1-LABEL: splatvar_rotate_v4i32: 843 ; XOPAVX1: # %bb.0: 844 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 845 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 846 ; XOPAVX1-NEXT: retq 847 ; 848 ; XOPAVX2-LABEL: splatvar_rotate_v4i32: 849 ; XOPAVX2: # %bb.0: 850 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 851 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 852 ; XOPAVX2-NEXT: retq 853 ; 854 ; X32-SSE-LABEL: splatvar_rotate_v4i32: 855 ; X32-SSE: # %bb.0: 856 ; X32-SSE-NEXT: xorps %xmm2, %xmm2 857 ; X32-SSE-NEXT: xorps %xmm3, %xmm3 858 ; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] 859 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,0,0,0] 860 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 861 ; X32-SSE-NEXT: pslld %xmm3, %xmm4 862 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32] 863 ; X32-SSE-NEXT: psubd %xmm1, %xmm3 864 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] 865 ; X32-SSE-NEXT: psrld %xmm2, %xmm0 866 ; X32-SSE-NEXT: por %xmm4, %xmm0 867 ; X32-SSE-NEXT: retl 868 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 869 %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat 870 %shl = shl <4 x i32> %a, %splat 871 %lshr = lshr <4 x i32> %a, %splat32 872 %or = or <4 x i32> %shl, %lshr 873 ret <4 x i32> %or 874 } 875 876 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 877 ; SSE2-LABEL: splatvar_rotate_v8i16: 878 ; SSE2: # %bb.0: 879 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7] 880 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 881 ; SSE2-NEXT: pextrw $0, %xmm1, %eax 882 ; SSE2-NEXT: movd %eax, %xmm1 883 ; SSE2-NEXT: movdqa %xmm0, %xmm3 884 ; SSE2-NEXT: psllw %xmm1, %xmm3 885 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 886 ; SSE2-NEXT: psubw %xmm2, %xmm1 887 ; SSE2-NEXT: pextrw $0, %xmm1, %eax 888 ; SSE2-NEXT: movd %eax, %xmm1 889 ; SSE2-NEXT: psrlw %xmm1, %xmm0 890 ; SSE2-NEXT: por %xmm3, %xmm0 891 ; SSE2-NEXT: retq 892 ; 893 ; SSE41-LABEL: splatvar_rotate_v8i16: 894 ; SSE41: # %bb.0: 895 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 896 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 897 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 898 ; SSE41-NEXT: movdqa %xmm0, %xmm3 899 ; SSE41-NEXT: psllw %xmm2, %xmm3 900 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 901 ; SSE41-NEXT: psubw %xmm1, %xmm2 902 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 903 ; SSE41-NEXT: psrlw %xmm1, %xmm0 904 ; SSE41-NEXT: por %xmm3, %xmm0 905 ; SSE41-NEXT: retq 906 ; 907 ; AVX1-LABEL: 
splatvar_rotate_v8i16: 908 ; AVX1: # %bb.0: 909 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 910 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 911 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 912 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 913 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 914 ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 915 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 916 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 917 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 918 ; AVX1-NEXT: retq 919 ; 920 ; AVX2-LABEL: splatvar_rotate_v8i16: 921 ; AVX2: # %bb.0: 922 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 923 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 924 ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 925 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 926 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 927 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 928 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 929 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 930 ; AVX2-NEXT: retq 931 ; 932 ; AVX512F-LABEL: splatvar_rotate_v8i16: 933 ; AVX512F: # %bb.0: 934 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 935 ; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 936 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2 937 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 938 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 939 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 940 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 941 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 942 ; AVX512F-NEXT: retq 943 ; 944 ; AVX512VL-LABEL: splatvar_rotate_v8i16: 945 ; AVX512VL: # %bb.0: 946 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 947 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 948 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, 
%xmm2 949 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 950 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 951 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 952 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 953 ; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 954 ; AVX512VL-NEXT: retq 955 ; 956 ; AVX512BW-LABEL: splatvar_rotate_v8i16: 957 ; AVX512BW: # %bb.0: 958 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 959 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 960 ; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 961 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 962 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 963 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 964 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 965 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 966 ; AVX512BW-NEXT: vzeroupper 967 ; AVX512BW-NEXT: retq 968 ; 969 ; AVX512VLBW-LABEL: splatvar_rotate_v8i16: 970 ; AVX512VLBW: # %bb.0: 971 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 972 ; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 973 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 974 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 975 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 976 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 977 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 978 ; AVX512VLBW-NEXT: retq 979 ; 980 ; XOPAVX1-LABEL: splatvar_rotate_v8i16: 981 ; XOPAVX1: # %bb.0: 982 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 983 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 984 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 985 ; XOPAVX1-NEXT: retq 986 ; 987 ; XOPAVX2-LABEL: splatvar_rotate_v8i16: 988 ; XOPAVX2: # %bb.0: 989 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 990 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 991 ; XOPAVX2-NEXT: retq 992 ; 993 ; X32-SSE-LABEL: splatvar_rotate_v8i16: 994 ; X32-SSE: # %bb.0: 
995 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7] 996 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 997 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 998 ; X32-SSE-NEXT: movd %eax, %xmm1 999 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 1000 ; X32-SSE-NEXT: psllw %xmm1, %xmm3 1001 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 1002 ; X32-SSE-NEXT: psubw %xmm2, %xmm1 1003 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax 1004 ; X32-SSE-NEXT: movd %eax, %xmm1 1005 ; X32-SSE-NEXT: psrlw %xmm1, %xmm0 1006 ; X32-SSE-NEXT: por %xmm3, %xmm0 1007 ; X32-SSE-NEXT: retl 1008 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 1009 %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 1010 %shl = shl <8 x i16> %a, %splat 1011 %lshr = lshr <8 x i16> %a, %splat16 1012 %or = or <8 x i16> %shl, %lshr 1013 ret <8 x i16> %or 1014 } 1015 1016 define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 1017 ; SSE2-LABEL: splatvar_rotate_v16i8: 1018 ; SSE2: # %bb.0: 1019 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1020 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1021 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 1022 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 1023 ; SSE2-NEXT: movdqa %xmm2, %xmm0 1024 ; SSE2-NEXT: psrlw $4, %xmm0 1025 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1026 ; SSE2-NEXT: movdqa %xmm2, %xmm3 1027 ; SSE2-NEXT: psllw $4, %xmm3 1028 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1029 ; SSE2-NEXT: por %xmm0, %xmm3 1030 ; SSE2-NEXT: psllw $5, %xmm1 1031 ; SSE2-NEXT: pxor %xmm0, %xmm0 1032 ; SSE2-NEXT: pxor %xmm4, %xmm4 1033 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm4 1034 ; SSE2-NEXT: pand %xmm4, %xmm3 1035 ; SSE2-NEXT: pandn %xmm2, %xmm4 1036 ; SSE2-NEXT: por %xmm3, %xmm4 1037 ; SSE2-NEXT: movdqa %xmm4, %xmm2 1038 ; SSE2-NEXT: psrlw $6, %xmm2 1039 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1040 ; SSE2-NEXT: movdqa %xmm4, %xmm3 1041 ; SSE2-NEXT: psllw 
$2, %xmm3 1042 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 1043 ; SSE2-NEXT: por %xmm2, %xmm3 1044 ; SSE2-NEXT: paddb %xmm1, %xmm1 1045 ; SSE2-NEXT: pxor %xmm2, %xmm2 1046 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 1047 ; SSE2-NEXT: pand %xmm2, %xmm3 1048 ; SSE2-NEXT: pandn %xmm4, %xmm2 1049 ; SSE2-NEXT: por %xmm3, %xmm2 1050 ; SSE2-NEXT: movdqa %xmm2, %xmm3 1051 ; SSE2-NEXT: paddb %xmm2, %xmm3 1052 ; SSE2-NEXT: movdqa %xmm2, %xmm4 1053 ; SSE2-NEXT: psrlw $7, %xmm4 1054 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1055 ; SSE2-NEXT: por %xmm3, %xmm4 1056 ; SSE2-NEXT: paddb %xmm1, %xmm1 1057 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 1058 ; SSE2-NEXT: pand %xmm0, %xmm4 1059 ; SSE2-NEXT: pandn %xmm2, %xmm0 1060 ; SSE2-NEXT: por %xmm4, %xmm0 1061 ; SSE2-NEXT: retq 1062 ; 1063 ; SSE41-LABEL: splatvar_rotate_v16i8: 1064 ; SSE41: # %bb.0: 1065 ; SSE41-NEXT: movdqa %xmm1, %xmm2 1066 ; SSE41-NEXT: movdqa %xmm0, %xmm1 1067 ; SSE41-NEXT: pxor %xmm0, %xmm0 1068 ; SSE41-NEXT: pshufb %xmm0, %xmm2 1069 ; SSE41-NEXT: movdqa %xmm1, %xmm0 1070 ; SSE41-NEXT: psrlw $4, %xmm0 1071 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1072 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1073 ; SSE41-NEXT: psllw $4, %xmm3 1074 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1075 ; SSE41-NEXT: por %xmm0, %xmm3 1076 ; SSE41-NEXT: psllw $5, %xmm2 1077 ; SSE41-NEXT: movdqa %xmm2, %xmm0 1078 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1079 ; SSE41-NEXT: movdqa %xmm1, %xmm0 1080 ; SSE41-NEXT: psrlw $6, %xmm0 1081 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1082 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1083 ; SSE41-NEXT: psllw $2, %xmm3 1084 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1085 ; SSE41-NEXT: por %xmm0, %xmm3 1086 ; SSE41-NEXT: paddb %xmm2, %xmm2 1087 ; SSE41-NEXT: movdqa %xmm2, %xmm0 1088 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1089 ; SSE41-NEXT: movdqa %xmm1, %xmm0 1090 ; SSE41-NEXT: paddb %xmm1, %xmm0 1091 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1092 ; SSE41-NEXT: psrlw $7, %xmm3 1093 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1094 ; SSE41-NEXT: por %xmm0, %xmm3 1095 ; 
SSE41-NEXT: paddb %xmm2, %xmm2 1096 ; SSE41-NEXT: movdqa %xmm2, %xmm0 1097 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1098 ; SSE41-NEXT: movdqa %xmm1, %xmm0 1099 ; SSE41-NEXT: retq 1100 ; 1101 ; AVX1-LABEL: splatvar_rotate_v16i8: 1102 ; AVX1: # %bb.0: 1103 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1104 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1105 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 1106 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1107 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 1108 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1109 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 1110 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 1111 ; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1112 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm2 1113 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1114 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 1115 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1116 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 1117 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1118 ; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1119 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1120 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 1121 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1122 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 1123 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1124 ; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1125 ; AVX1-NEXT: retq 1126 ; 1127 ; AVX2-LABEL: splatvar_rotate_v16i8: 1128 ; AVX2: # %bb.0: 1129 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1130 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 1131 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1132 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3 1133 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1134 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 1135 ; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 1136 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1137 ; AVX2-NEXT: vpsrlw $6, %xmm0, %xmm2 1138 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1139 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3 1140 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1141 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 1142 ; AVX2-NEXT: vpaddb 
%xmm1, %xmm1, %xmm1 1143 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1144 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1145 ; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm3 1146 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1147 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 1148 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1149 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1150 ; AVX2-NEXT: retq 1151 ; 1152 ; AVX512F-LABEL: splatvar_rotate_v16i8: 1153 ; AVX512F: # %bb.0: 1154 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 1155 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm2 1156 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1157 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm3 1158 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1159 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 1160 ; AVX512F-NEXT: vpsllw $5, %xmm1, %xmm1 1161 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1162 ; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm2 1163 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1164 ; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3 1165 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1166 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 1167 ; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1168 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1169 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1170 ; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3 1171 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1172 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 1173 ; AVX512F-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1174 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1175 ; AVX512F-NEXT: retq 1176 ; 1177 ; AVX512VL-LABEL: splatvar_rotate_v16i8: 1178 ; AVX512VL: # %bb.0: 1179 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 1180 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 1181 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1182 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm3 1183 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1184 ; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 1185 ; AVX512VL-NEXT: vpsllw $5, %xmm1, %xmm1 1186 ; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, 
%xmm0 1187 ; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm2 1188 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1189 ; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3 1190 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1191 ; AVX512VL-NEXT: vpor %xmm2, %xmm3, %xmm2 1192 ; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1193 ; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1194 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2 1195 ; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3 1196 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1197 ; AVX512VL-NEXT: vpor %xmm3, %xmm2, %xmm2 1198 ; AVX512VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 1199 ; AVX512VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1200 ; AVX512VL-NEXT: retq 1201 ; 1202 ; AVX512BW-LABEL: splatvar_rotate_v16i8: 1203 ; AVX512BW: # %bb.0: 1204 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 1205 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1206 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1207 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 1208 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 1209 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1210 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1211 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1212 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 1213 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1214 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 1215 ; AVX512BW-NEXT: 
vzeroupper 1216 ; AVX512BW-NEXT: retq 1217 ; 1218 ; AVX512VLBW-LABEL: splatvar_rotate_v16i8: 1219 ; AVX512VLBW: # %bb.0: 1220 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 1221 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1222 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1223 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm2 1224 ; AVX512VLBW-NEXT: vpmovwb %ymm2, %xmm2 1225 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1226 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1227 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1228 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 1229 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1230 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 1231 ; AVX512VLBW-NEXT: vzeroupper 1232 ; AVX512VLBW-NEXT: retq 1233 ; 1234 ; XOPAVX1-LABEL: splatvar_rotate_v16i8: 1235 ; XOPAVX1: # %bb.0: 1236 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1237 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1238 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 1239 ; XOPAVX1-NEXT: retq 1240 ; 1241 ; XOPAVX2-LABEL: splatvar_rotate_v16i8: 1242 ; XOPAVX2: # %bb.0: 1243 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1244 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 1245 ; XOPAVX2-NEXT: retq 1246 ; 1247 ; X32-SSE-LABEL: splatvar_rotate_v16i8: 1248 ; X32-SSE: # %bb.0: 1249 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1250 ; X32-SSE-NEXT: punpcklbw 
{{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1251 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] 1252 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 1253 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0 1254 ; X32-SSE-NEXT: psrlw $4, %xmm0 1255 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 1256 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1257 ; X32-SSE-NEXT: psllw $4, %xmm3 1258 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 1259 ; X32-SSE-NEXT: por %xmm0, %xmm3 1260 ; X32-SSE-NEXT: psllw $5, %xmm1 1261 ; X32-SSE-NEXT: pxor %xmm0, %xmm0 1262 ; X32-SSE-NEXT: pxor %xmm4, %xmm4 1263 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm4 1264 ; X32-SSE-NEXT: pand %xmm4, %xmm3 1265 ; X32-SSE-NEXT: pandn %xmm2, %xmm4 1266 ; X32-SSE-NEXT: por %xmm3, %xmm4 1267 ; X32-SSE-NEXT: movdqa %xmm4, %xmm2 1268 ; X32-SSE-NEXT: psrlw $6, %xmm2 1269 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 1270 ; X32-SSE-NEXT: movdqa %xmm4, %xmm3 1271 ; X32-SSE-NEXT: psllw $2, %xmm3 1272 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 1273 ; X32-SSE-NEXT: por %xmm2, %xmm3 1274 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 1275 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 1276 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 1277 ; X32-SSE-NEXT: pand %xmm2, %xmm3 1278 ; X32-SSE-NEXT: pandn %xmm4, %xmm2 1279 ; X32-SSE-NEXT: por %xmm3, %xmm2 1280 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3 1281 ; X32-SSE-NEXT: paddb %xmm2, %xmm3 1282 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4 1283 ; X32-SSE-NEXT: psrlw $7, %xmm4 1284 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1285 ; X32-SSE-NEXT: por %xmm3, %xmm4 1286 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 1287 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm0 1288 ; X32-SSE-NEXT: pand %xmm0, %xmm4 1289 ; X32-SSE-NEXT: pandn %xmm2, %xmm0 1290 ; X32-SSE-NEXT: por %xmm4, %xmm0 1291 ; X32-SSE-NEXT: retl 1292 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 1293 %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 1294 %shl = shl <16 x i8> %a, %splat 1295 %lshr 
= lshr <16 x i8> %a, %splat8 1296 %or = or <16 x i8> %shl, %lshr 1297 ret <16 x i8> %or 1298 } 1299 1300 ; 1301 ; Constant Rotates 1302 ; 1303 1304 define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { 1305 ; SSE2-LABEL: constant_rotate_v2i64: 1306 ; SSE2: # %bb.0: 1307 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1308 ; SSE2-NEXT: psllq $4, %xmm1 1309 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1310 ; SSE2-NEXT: psllq $14, %xmm2 1311 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1312 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1313 ; SSE2-NEXT: psrlq $60, %xmm1 1314 ; SSE2-NEXT: psrlq $50, %xmm0 1315 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1316 ; SSE2-NEXT: orpd %xmm2, %xmm0 1317 ; SSE2-NEXT: retq 1318 ; 1319 ; SSE41-LABEL: constant_rotate_v2i64: 1320 ; SSE41: # %bb.0: 1321 ; SSE41-NEXT: movdqa %xmm0, %xmm1 1322 ; SSE41-NEXT: psllq $14, %xmm1 1323 ; SSE41-NEXT: movdqa %xmm0, %xmm2 1324 ; SSE41-NEXT: psllq $4, %xmm2 1325 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1326 ; SSE41-NEXT: movdqa %xmm0, %xmm1 1327 ; SSE41-NEXT: psrlq $50, %xmm1 1328 ; SSE41-NEXT: psrlq $60, %xmm0 1329 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1330 ; SSE41-NEXT: por %xmm2, %xmm0 1331 ; SSE41-NEXT: retq 1332 ; 1333 ; AVX1-LABEL: constant_rotate_v2i64: 1334 ; AVX1: # %bb.0: 1335 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1 1336 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2 1337 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1338 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2 1339 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 1340 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1341 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1342 ; AVX1-NEXT: retq 1343 ; 1344 ; AVX2-LABEL: constant_rotate_v2i64: 1345 ; AVX2: # %bb.0: 1346 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 1347 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1348 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 1349 ; AVX2-NEXT: retq 1350 ; 1351 ; AVX512F-LABEL: constant_rotate_v2i64: 1352 ; AVX512F: # 
%bb.0: 1353 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1354 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1355 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1356 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1357 ; AVX512F-NEXT: vzeroupper 1358 ; AVX512F-NEXT: retq 1359 ; 1360 ; AVX512VL-LABEL: constant_rotate_v2i64: 1361 ; AVX512VL: # %bb.0: 1362 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1363 ; AVX512VL-NEXT: retq 1364 ; 1365 ; AVX512BW-LABEL: constant_rotate_v2i64: 1366 ; AVX512BW: # %bb.0: 1367 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1368 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1369 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1370 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1371 ; AVX512BW-NEXT: vzeroupper 1372 ; AVX512BW-NEXT: retq 1373 ; 1374 ; AVX512VLBW-LABEL: constant_rotate_v2i64: 1375 ; AVX512VLBW: # %bb.0: 1376 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 1377 ; AVX512VLBW-NEXT: retq 1378 ; 1379 ; XOP-LABEL: constant_rotate_v2i64: 1380 ; XOP: # %bb.0: 1381 ; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 1382 ; XOP-NEXT: retq 1383 ; 1384 ; X32-SSE-LABEL: constant_rotate_v2i64: 1385 ; X32-SSE: # %bb.0: 1386 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1387 ; X32-SSE-NEXT: psllq $4, %xmm1 1388 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1389 ; X32-SSE-NEXT: psllq $14, %xmm2 1390 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1391 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1392 ; X32-SSE-NEXT: psrlq $60, %xmm1 1393 ; X32-SSE-NEXT: psrlq $50, %xmm0 1394 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1395 ; X32-SSE-NEXT: orpd %xmm2, %xmm0 1396 ; X32-SSE-NEXT: retl 1397 %shl = shl <2 x i64> %a, <i64 4, i64 14> 1398 %lshr = lshr <2 x i64> %a, <i64 60, i64 50> 1399 %or = or <2 x i64> %shl, %lshr 1400 ret <2 x i64> %or 1401 } 1402 1403 define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { 1404 ; SSE2-LABEL: constant_rotate_v4i32: 1405 ; SSE2: # %bb.0: 1406 ; SSE2-NEXT: movdqa 
{{.*#+}} xmm1 = [16,32,64,128] 1407 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1408 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 1409 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1410 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1411 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 1412 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1413 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1414 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1415 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1416 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1417 ; SSE2-NEXT: por %xmm3, %xmm0 1418 ; SSE2-NEXT: retq 1419 ; 1420 ; SSE41-LABEL: constant_rotate_v4i32: 1421 ; SSE41: # %bb.0: 1422 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1423 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1424 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1425 ; SSE41-NEXT: pmuludq %xmm2, %xmm3 1426 ; SSE41-NEXT: pmuludq %xmm1, %xmm0 1427 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1428 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1429 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1430 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1431 ; SSE41-NEXT: por %xmm1, %xmm0 1432 ; SSE41-NEXT: retq 1433 ; 1434 ; AVX1-LABEL: constant_rotate_v4i32: 1435 ; AVX1: # %bb.0: 1436 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,64,128] 1437 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1438 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1439 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 1440 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1441 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1442 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1443 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 1444 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1445 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1446 ; AVX1-NEXT: retq 1447 ; 1448 
; AVX2-LABEL: constant_rotate_v4i32: 1449 ; AVX2: # %bb.0: 1450 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 1451 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 1452 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1453 ; AVX2-NEXT: retq 1454 ; 1455 ; AVX512F-LABEL: constant_rotate_v4i32: 1456 ; AVX512F: # %bb.0: 1457 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1458 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1459 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1460 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1461 ; AVX512F-NEXT: vzeroupper 1462 ; AVX512F-NEXT: retq 1463 ; 1464 ; AVX512VL-LABEL: constant_rotate_v4i32: 1465 ; AVX512VL: # %bb.0: 1466 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1467 ; AVX512VL-NEXT: retq 1468 ; 1469 ; AVX512BW-LABEL: constant_rotate_v4i32: 1470 ; AVX512BW: # %bb.0: 1471 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1472 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1473 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1474 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1475 ; AVX512BW-NEXT: vzeroupper 1476 ; AVX512BW-NEXT: retq 1477 ; 1478 ; AVX512VLBW-LABEL: constant_rotate_v4i32: 1479 ; AVX512VLBW: # %bb.0: 1480 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 1481 ; AVX512VLBW-NEXT: retq 1482 ; 1483 ; XOP-LABEL: constant_rotate_v4i32: 1484 ; XOP: # %bb.0: 1485 ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1486 ; XOP-NEXT: retq 1487 ; 1488 ; X32-SSE-LABEL: constant_rotate_v4i32: 1489 ; X32-SSE: # %bb.0: 1490 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1491 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1492 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 1493 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1494 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1495 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 1496 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1497 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1498 ; X32-SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1499 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1500 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1501 ; X32-SSE-NEXT: por %xmm3, %xmm0 1502 ; X32-SSE-NEXT: retl 1503 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1504 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25> 1505 %or = or <4 x i32> %shl, %lshr 1506 ret <4 x i32> %or 1507 } 1508 1509 define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { 1510 ; SSE-LABEL: constant_rotate_v8i16: 1511 ; SSE: # %bb.0: 1512 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1513 ; SSE-NEXT: movdqa %xmm0, %xmm2 1514 ; SSE-NEXT: pmulhuw %xmm1, %xmm2 1515 ; SSE-NEXT: pmullw %xmm1, %xmm0 1516 ; SSE-NEXT: por %xmm2, %xmm0 1517 ; SSE-NEXT: retq 1518 ; 1519 ; AVX-LABEL: constant_rotate_v8i16: 1520 ; AVX: # %bb.0: 1521 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1522 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1523 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1524 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1525 ; AVX-NEXT: retq 1526 ; 1527 ; AVX512F-LABEL: constant_rotate_v8i16: 1528 ; AVX512F: # %bb.0: 1529 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1530 ; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1531 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1532 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1533 ; AVX512F-NEXT: retq 1534 ; 1535 ; AVX512VL-LABEL: constant_rotate_v8i16: 1536 ; AVX512VL: # %bb.0: 1537 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1538 ; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1539 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1540 ; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1541 ; AVX512VL-NEXT: retq 1542 ; 1543 ; AVX512BW-LABEL: constant_rotate_v8i16: 1544 ; AVX512BW: # %bb.0: 1545 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1546 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1547 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] 
1548 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1549 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1550 ; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 1551 ; AVX512BW-NEXT: vzeroupper 1552 ; AVX512BW-NEXT: retq 1553 ; 1554 ; AVX512VLBW-LABEL: constant_rotate_v8i16: 1555 ; AVX512VLBW: # %bb.0: 1556 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 1557 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 1558 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1559 ; AVX512VLBW-NEXT: retq 1560 ; 1561 ; XOP-LABEL: constant_rotate_v8i16: 1562 ; XOP: # %bb.0: 1563 ; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1564 ; XOP-NEXT: retq 1565 ; 1566 ; X32-SSE-LABEL: constant_rotate_v8i16: 1567 ; X32-SSE: # %bb.0: 1568 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1569 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 1570 ; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2 1571 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0 1572 ; X32-SSE-NEXT: por %xmm2, %xmm0 1573 ; X32-SSE-NEXT: retl 1574 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1575 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9> 1576 %or = or <8 x i16> %shl, %lshr 1577 ret <8 x i16> %or 1578 } 1579 1580 define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { 1581 ; SSE2-LABEL: constant_rotate_v16i8: 1582 ; SSE2: # %bb.0: 1583 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1584 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1585 ; SSE2-NEXT: pxor %xmm0, %xmm0 1586 ; SSE2-NEXT: pxor %xmm3, %xmm3 1587 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 1588 ; SSE2-NEXT: movdqa %xmm1, %xmm4 1589 ; SSE2-NEXT: psrlw $4, %xmm4 1590 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1591 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1592 ; SSE2-NEXT: psllw $4, %xmm5 1593 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 1594 ; SSE2-NEXT: por %xmm4, %xmm5 1595 ; SSE2-NEXT: pand %xmm3, %xmm5 1596 ; SSE2-NEXT: pandn %xmm1, %xmm3 1597 ; SSE2-NEXT: por %xmm5, %xmm3 1598 ; SSE2-NEXT: movdqa %xmm3, %xmm1 
1599 ; SSE2-NEXT: psrlw $6, %xmm1 1600 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 1601 ; SSE2-NEXT: movdqa %xmm3, %xmm4 1602 ; SSE2-NEXT: psllw $2, %xmm4 1603 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1604 ; SSE2-NEXT: por %xmm1, %xmm4 1605 ; SSE2-NEXT: paddb %xmm2, %xmm2 1606 ; SSE2-NEXT: pxor %xmm1, %xmm1 1607 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 1608 ; SSE2-NEXT: pand %xmm1, %xmm4 1609 ; SSE2-NEXT: pandn %xmm3, %xmm1 1610 ; SSE2-NEXT: por %xmm4, %xmm1 1611 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1612 ; SSE2-NEXT: paddb %xmm1, %xmm3 1613 ; SSE2-NEXT: movdqa %xmm1, %xmm4 1614 ; SSE2-NEXT: psrlw $7, %xmm4 1615 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 1616 ; SSE2-NEXT: por %xmm3, %xmm4 1617 ; SSE2-NEXT: paddb %xmm2, %xmm2 1618 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 1619 ; SSE2-NEXT: pand %xmm0, %xmm4 1620 ; SSE2-NEXT: pandn %xmm1, %xmm0 1621 ; SSE2-NEXT: por %xmm4, %xmm0 1622 ; SSE2-NEXT: retq 1623 ; 1624 ; SSE41-LABEL: constant_rotate_v16i8: 1625 ; SSE41: # %bb.0: 1626 ; SSE41-NEXT: movdqa %xmm0, %xmm1 1627 ; SSE41-NEXT: psrlw $4, %xmm0 1628 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 1629 ; SSE41-NEXT: movdqa %xmm1, %xmm2 1630 ; SSE41-NEXT: psllw $4, %xmm2 1631 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1632 ; SSE41-NEXT: por %xmm0, %xmm2 1633 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256] 1634 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1635 ; SSE41-NEXT: movdqa %xmm1, %xmm2 1636 ; SSE41-NEXT: psrlw $6, %xmm2 1637 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1638 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1639 ; SSE41-NEXT: psllw $2, %xmm3 1640 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1641 ; SSE41-NEXT: por %xmm2, %xmm3 1642 ; SSE41-NEXT: paddb %xmm0, %xmm0 1643 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1644 ; SSE41-NEXT: movdqa %xmm1, %xmm2 1645 ; SSE41-NEXT: paddb %xmm1, %xmm2 1646 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1647 ; SSE41-NEXT: psrlw $7, %xmm3 1648 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1649 ; SSE41-NEXT: por %xmm2, %xmm3 1650 ; SSE41-NEXT: paddb %xmm0, %xmm0 1651 ; 
SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 1652 ; SSE41-NEXT: movdqa %xmm1, %xmm0 1653 ; SSE41-NEXT: retq 1654 ; 1655 ; AVX-LABEL: constant_rotate_v16i8: 1656 ; AVX: # %bb.0: 1657 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 1658 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1659 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 1660 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1661 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 1662 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1663 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1664 ; AVX-NEXT: vpsrlw $6, %xmm0, %xmm1 1665 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1666 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 1667 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1668 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 1669 ; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1670 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1671 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1672 ; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 1673 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1674 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 1675 ; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1676 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1677 ; AVX-NEXT: retq 1678 ; 1679 ; AVX512F-LABEL: constant_rotate_v16i8: 1680 ; AVX512F: # %bb.0: 1681 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm1 1682 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1683 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 1684 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1685 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 1686 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1687 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1688 ; AVX512F-NEXT: vpsrlw $6, %xmm0, %xmm1 1689 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1690 ; AVX512F-NEXT: vpsllw $2, %xmm0, %xmm3 1691 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1692 ; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 1693 ; AVX512F-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1694 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1695 ; 
AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1696 ; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm3 1697 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1698 ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 1699 ; AVX512F-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1700 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1701 ; AVX512F-NEXT: retq 1702 ; 1703 ; AVX512VL-LABEL: constant_rotate_v16i8: 1704 ; AVX512VL: # %bb.0: 1705 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm1 1706 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1707 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 1708 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 1709 ; AVX512VL-NEXT: vpor %xmm1, %xmm2, %xmm1 1710 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1711 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1712 ; AVX512VL-NEXT: vpsrlw $6, %xmm0, %xmm1 1713 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1714 ; AVX512VL-NEXT: vpsllw $2, %xmm0, %xmm3 1715 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1716 ; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 1717 ; AVX512VL-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1718 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1719 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm1 1720 ; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm3 1721 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 1722 ; AVX512VL-NEXT: vpor %xmm3, %xmm1, %xmm1 1723 ; AVX512VL-NEXT: vpaddb %xmm2, %xmm2, %xmm2 1724 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1725 ; AVX512VL-NEXT: retq 1726 ; 1727 ; AVX512BW-LABEL: constant_rotate_v16i8: 1728 ; AVX512BW: # %bb.0: 1729 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1730 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1731 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1732 ; AVX512BW-NEXT: vpmovwb %zmm1, 
%ymm1 1733 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1734 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1735 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1736 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1737 ; AVX512BW-NEXT: vzeroupper 1738 ; AVX512BW-NEXT: retq 1739 ; 1740 ; AVX512VLBW-LABEL: constant_rotate_v16i8: 1741 ; AVX512VLBW: # %bb.0: 1742 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1743 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 1744 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 1745 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1746 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1747 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1748 ; AVX512VLBW-NEXT: vzeroupper 1749 ; AVX512VLBW-NEXT: retq 1750 ; 1751 ; XOP-LABEL: constant_rotate_v16i8: 1752 ; XOP: # %bb.0: 1753 ; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 1754 ; XOP-NEXT: retq 1755 ; 1756 ; X32-SSE-LABEL: constant_rotate_v16i8: 1757 ; X32-SSE: # %bb.0: 1758 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1759 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256] 1760 ; X32-SSE-NEXT: pxor %xmm0, %xmm0 1761 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 1762 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3 1763 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1764 ; X32-SSE-NEXT: psrlw $4, %xmm4 1765 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1766 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5 1767 ; X32-SSE-NEXT: psllw $4, %xmm5 1768 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5 1769 ; X32-SSE-NEXT: por %xmm4, %xmm5 1770 ; X32-SSE-NEXT: pand %xmm3, %xmm5 1771 ; X32-SSE-NEXT: pandn %xmm1, %xmm3 1772 ; X32-SSE-NEXT: por %xmm5, %xmm3 1773 ; X32-SSE-NEXT: movdqa %xmm3, %xmm1 1774 ; X32-SSE-NEXT: psrlw $6, %xmm1 1775 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 1776 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1777 
; X32-SSE-NEXT: psllw $2, %xmm4 1778 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1779 ; X32-SSE-NEXT: por %xmm1, %xmm4 1780 ; X32-SSE-NEXT: paddb %xmm2, %xmm2 1781 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 1782 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1 1783 ; X32-SSE-NEXT: pand %xmm1, %xmm4 1784 ; X32-SSE-NEXT: pandn %xmm3, %xmm1 1785 ; X32-SSE-NEXT: por %xmm4, %xmm1 1786 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1787 ; X32-SSE-NEXT: paddb %xmm1, %xmm3 1788 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4 1789 ; X32-SSE-NEXT: psrlw $7, %xmm4 1790 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4 1791 ; X32-SSE-NEXT: por %xmm3, %xmm4 1792 ; X32-SSE-NEXT: paddb %xmm2, %xmm2 1793 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm0 1794 ; X32-SSE-NEXT: pand %xmm0, %xmm4 1795 ; X32-SSE-NEXT: pandn %xmm1, %xmm0 1796 ; X32-SSE-NEXT: por %xmm4, %xmm0 1797 ; X32-SSE-NEXT: retl 1798 %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1799 %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1800 %or = or <16 x i8> %shl, %lshr 1801 ret <16 x i8> %or 1802 } 1803 1804 ; 1805 ; Uniform Constant Rotates 1806 ; 1807 1808 define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { 1809 ; SSE-LABEL: splatconstant_rotate_v2i64: 1810 ; SSE: # %bb.0: 1811 ; SSE-NEXT: movdqa %xmm0, %xmm1 1812 ; SSE-NEXT: psllq $14, %xmm1 1813 ; SSE-NEXT: psrlq $50, %xmm0 1814 ; SSE-NEXT: por %xmm1, %xmm0 1815 ; SSE-NEXT: retq 1816 ; 1817 ; AVX-LABEL: splatconstant_rotate_v2i64: 1818 ; AVX: # %bb.0: 1819 ; AVX-NEXT: vpsllq $14, %xmm0, %xmm1 1820 ; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0 1821 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1822 ; AVX-NEXT: retq 1823 ; 1824 ; AVX512F-LABEL: splatconstant_rotate_v2i64: 1825 ; AVX512F: # %bb.0: 1826 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1827 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0 1828 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1829 ; 
AVX512F-NEXT: vzeroupper 1830 ; AVX512F-NEXT: retq 1831 ; 1832 ; AVX512VL-LABEL: splatconstant_rotate_v2i64: 1833 ; AVX512VL: # %bb.0: 1834 ; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 1835 ; AVX512VL-NEXT: retq 1836 ; 1837 ; AVX512BW-LABEL: splatconstant_rotate_v2i64: 1838 ; AVX512BW: # %bb.0: 1839 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1840 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 1841 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1842 ; AVX512BW-NEXT: vzeroupper 1843 ; AVX512BW-NEXT: retq 1844 ; 1845 ; AVX512VLBW-LABEL: splatconstant_rotate_v2i64: 1846 ; AVX512VLBW: # %bb.0: 1847 ; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0 1848 ; AVX512VLBW-NEXT: retq 1849 ; 1850 ; XOP-LABEL: splatconstant_rotate_v2i64: 1851 ; XOP: # %bb.0: 1852 ; XOP-NEXT: vprotq $14, %xmm0, %xmm0 1853 ; XOP-NEXT: retq 1854 ; 1855 ; X32-SSE-LABEL: splatconstant_rotate_v2i64: 1856 ; X32-SSE: # %bb.0: 1857 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1858 ; X32-SSE-NEXT: psllq $14, %xmm1 1859 ; X32-SSE-NEXT: psrlq $50, %xmm0 1860 ; X32-SSE-NEXT: por %xmm1, %xmm0 1861 ; X32-SSE-NEXT: retl 1862 %shl = shl <2 x i64> %a, <i64 14, i64 14> 1863 %lshr = lshr <2 x i64> %a, <i64 50, i64 50> 1864 %or = or <2 x i64> %shl, %lshr 1865 ret <2 x i64> %or 1866 } 1867 1868 define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { 1869 ; SSE-LABEL: splatconstant_rotate_v4i32: 1870 ; SSE: # %bb.0: 1871 ; SSE-NEXT: movdqa %xmm0, %xmm1 1872 ; SSE-NEXT: psrld $28, %xmm1 1873 ; SSE-NEXT: pslld $4, %xmm0 1874 ; SSE-NEXT: por %xmm1, %xmm0 1875 ; SSE-NEXT: retq 1876 ; 1877 ; AVX-LABEL: splatconstant_rotate_v4i32: 1878 ; AVX: # %bb.0: 1879 ; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 1880 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0 1881 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1882 ; AVX-NEXT: retq 1883 ; 1884 ; AVX512F-LABEL: splatconstant_rotate_v4i32: 1885 ; AVX512F: # %bb.0: 1886 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1887 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 1888 ; AVX512F-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $zmm0 1889 ; AVX512F-NEXT: vzeroupper 1890 ; AVX512F-NEXT: retq 1891 ; 1892 ; AVX512VL-LABEL: splatconstant_rotate_v4i32: 1893 ; AVX512VL: # %bb.0: 1894 ; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 1895 ; AVX512VL-NEXT: retq 1896 ; 1897 ; AVX512BW-LABEL: splatconstant_rotate_v4i32: 1898 ; AVX512BW: # %bb.0: 1899 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1900 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 1901 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1902 ; AVX512BW-NEXT: vzeroupper 1903 ; AVX512BW-NEXT: retq 1904 ; 1905 ; AVX512VLBW-LABEL: splatconstant_rotate_v4i32: 1906 ; AVX512VLBW: # %bb.0: 1907 ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 1908 ; AVX512VLBW-NEXT: retq 1909 ; 1910 ; XOP-LABEL: splatconstant_rotate_v4i32: 1911 ; XOP: # %bb.0: 1912 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0 1913 ; XOP-NEXT: retq 1914 ; 1915 ; X32-SSE-LABEL: splatconstant_rotate_v4i32: 1916 ; X32-SSE: # %bb.0: 1917 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1918 ; X32-SSE-NEXT: psrld $28, %xmm1 1919 ; X32-SSE-NEXT: pslld $4, %xmm0 1920 ; X32-SSE-NEXT: por %xmm1, %xmm0 1921 ; X32-SSE-NEXT: retl 1922 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 1923 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 1924 %or = or <4 x i32> %shl, %lshr 1925 ret <4 x i32> %or 1926 } 1927 1928 define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind { 1929 ; SSE-LABEL: splatconstant_rotate_v8i16: 1930 ; SSE: # %bb.0: 1931 ; SSE-NEXT: movdqa %xmm0, %xmm1 1932 ; SSE-NEXT: psrlw $9, %xmm1 1933 ; SSE-NEXT: psllw $7, %xmm0 1934 ; SSE-NEXT: por %xmm1, %xmm0 1935 ; SSE-NEXT: retq 1936 ; 1937 ; AVX-LABEL: splatconstant_rotate_v8i16: 1938 ; AVX: # %bb.0: 1939 ; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1 1940 ; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 1941 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1942 ; AVX-NEXT: retq 1943 ; 1944 ; AVX512-LABEL: splatconstant_rotate_v8i16: 1945 ; AVX512: # %bb.0: 1946 ; AVX512-NEXT: vpsrlw $9, %xmm0, %xmm1 1947 ; AVX512-NEXT: 
vpsllw $7, %xmm0, %xmm0 1948 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 1949 ; AVX512-NEXT: retq 1950 ; 1951 ; XOP-LABEL: splatconstant_rotate_v8i16: 1952 ; XOP: # %bb.0: 1953 ; XOP-NEXT: vprotw $7, %xmm0, %xmm0 1954 ; XOP-NEXT: retq 1955 ; 1956 ; X32-SSE-LABEL: splatconstant_rotate_v8i16: 1957 ; X32-SSE: # %bb.0: 1958 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 1959 ; X32-SSE-NEXT: psrlw $9, %xmm1 1960 ; X32-SSE-NEXT: psllw $7, %xmm0 1961 ; X32-SSE-NEXT: por %xmm1, %xmm0 1962 ; X32-SSE-NEXT: retl 1963 %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 1964 %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 1965 %or = or <8 x i16> %shl, %lshr 1966 ret <8 x i16> %or 1967 } 1968 1969 define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { 1970 ; SSE-LABEL: splatconstant_rotate_v16i8: 1971 ; SSE: # %bb.0: 1972 ; SSE-NEXT: movdqa %xmm0, %xmm1 1973 ; SSE-NEXT: psrlw $4, %xmm1 1974 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1 1975 ; SSE-NEXT: psllw $4, %xmm0 1976 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 1977 ; SSE-NEXT: por %xmm1, %xmm0 1978 ; SSE-NEXT: retq 1979 ; 1980 ; AVX-LABEL: splatconstant_rotate_v16i8: 1981 ; AVX: # %bb.0: 1982 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 1983 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1984 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 1985 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1986 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1987 ; AVX-NEXT: retq 1988 ; 1989 ; AVX512-LABEL: splatconstant_rotate_v16i8: 1990 ; AVX512: # %bb.0: 1991 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 1992 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1993 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 1994 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 1995 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 1996 ; AVX512-NEXT: retq 1997 ; 1998 ; XOP-LABEL: splatconstant_rotate_v16i8: 1999 ; XOP: # %bb.0: 2000 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2001 ; XOP-NEXT: retq 2002 ; 2003 ; X32-SSE-LABEL: splatconstant_rotate_v16i8: 2004 ; X32-SSE: # 
%bb.0: 2005 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2006 ; X32-SSE-NEXT: psrlw $4, %xmm1 2007 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 2008 ; X32-SSE-NEXT: psllw $4, %xmm0 2009 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2010 ; X32-SSE-NEXT: por %xmm1, %xmm0 2011 ; X32-SSE-NEXT: retl 2012 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2013 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2014 %or = or <16 x i8> %shl, %lshr 2015 ret <16 x i8> %or 2016 } 2017 2018 ; 2019 ; Masked Uniform Constant Rotates 2020 ; 2021 2022 define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { 2023 ; SSE-LABEL: splatconstant_rotate_mask_v2i64: 2024 ; SSE: # %bb.0: 2025 ; SSE-NEXT: psrlq $49, %xmm0 2026 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2027 ; SSE-NEXT: retq 2028 ; 2029 ; AVX-LABEL: splatconstant_rotate_mask_v2i64: 2030 ; AVX: # %bb.0: 2031 ; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0 2032 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2033 ; AVX-NEXT: retq 2034 ; 2035 ; AVX512F-LABEL: splatconstant_rotate_mask_v2i64: 2036 ; AVX512F: # %bb.0: 2037 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2038 ; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0 2039 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2040 ; AVX512F-NEXT: vzeroupper 2041 ; AVX512F-NEXT: retq 2042 ; 2043 ; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: 2044 ; AVX512VL: # %bb.0: 2045 ; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 2046 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2047 ; AVX512VL-NEXT: retq 2048 ; 2049 ; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: 2050 ; AVX512BW: # %bb.0: 2051 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2052 ; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 2053 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2054 ; AVX512BW-NEXT: vzeroupper 2055 ; AVX512BW-NEXT: retq 2056 ; 2057 ; AVX512VLBW-LABEL: 
splatconstant_rotate_mask_v2i64: 2058 ; AVX512VLBW: # %bb.0: 2059 ; AVX512VLBW-NEXT: vprolq $15, %xmm0, %xmm0 2060 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2061 ; AVX512VLBW-NEXT: retq 2062 ; 2063 ; XOP-LABEL: splatconstant_rotate_mask_v2i64: 2064 ; XOP: # %bb.0: 2065 ; XOP-NEXT: vprotq $15, %xmm0, %xmm0 2066 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2067 ; XOP-NEXT: retq 2068 ; 2069 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64: 2070 ; X32-SSE: # %bb.0: 2071 ; X32-SSE-NEXT: psrlq $49, %xmm0 2072 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2073 ; X32-SSE-NEXT: retl 2074 %shl = shl <2 x i64> %a, <i64 15, i64 15> 2075 %lshr = lshr <2 x i64> %a, <i64 49, i64 49> 2076 %rmask = and <2 x i64> %lshr, <i64 255, i64 127> 2077 %lmask = and <2 x i64> %shl, <i64 65, i64 33> 2078 %or = or <2 x i64> %lmask, %rmask 2079 ret <2 x i64> %or 2080 } 2081 2082 define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { 2083 ; SSE-LABEL: splatconstant_rotate_mask_v4i32: 2084 ; SSE: # %bb.0: 2085 ; SSE-NEXT: movdqa %xmm0, %xmm1 2086 ; SSE-NEXT: psrld $28, %xmm1 2087 ; SSE-NEXT: pslld $4, %xmm0 2088 ; SSE-NEXT: por %xmm1, %xmm0 2089 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2090 ; SSE-NEXT: retq 2091 ; 2092 ; AVX-LABEL: splatconstant_rotate_mask_v4i32: 2093 ; AVX: # %bb.0: 2094 ; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 2095 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0 2096 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2097 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2098 ; AVX-NEXT: retq 2099 ; 2100 ; AVX512F-LABEL: splatconstant_rotate_mask_v4i32: 2101 ; AVX512F: # %bb.0: 2102 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2103 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 2104 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2105 ; AVX512F-NEXT: vzeroupper 2106 ; AVX512F-NEXT: retq 2107 ; 2108 ; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: 2109 ; AVX512VL: # %bb.0: 2110 ; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 2111 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2112 ; 
AVX512VL-NEXT: retq 2113 ; 2114 ; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: 2115 ; AVX512BW: # %bb.0: 2116 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2117 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 2118 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2119 ; AVX512BW-NEXT: vzeroupper 2120 ; AVX512BW-NEXT: retq 2121 ; 2122 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32: 2123 ; AVX512VLBW: # %bb.0: 2124 ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 2125 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2126 ; AVX512VLBW-NEXT: retq 2127 ; 2128 ; XOP-LABEL: splatconstant_rotate_mask_v4i32: 2129 ; XOP: # %bb.0: 2130 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0 2131 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2132 ; XOP-NEXT: retq 2133 ; 2134 ; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32: 2135 ; X32-SSE: # %bb.0: 2136 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2137 ; X32-SSE-NEXT: psrld $28, %xmm1 2138 ; X32-SSE-NEXT: pslld $4, %xmm0 2139 ; X32-SSE-NEXT: por %xmm1, %xmm0 2140 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2141 ; X32-SSE-NEXT: retl 2142 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 2143 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 2144 %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023> 2145 %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127> 2146 %or = or <4 x i32> %lmask, %rmask 2147 ret <4 x i32> %or 2148 } 2149 2150 define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { 2151 ; SSE-LABEL: splatconstant_rotate_mask_v8i16: 2152 ; SSE: # %bb.0: 2153 ; SSE-NEXT: movdqa %xmm0, %xmm1 2154 ; SSE-NEXT: psrlw $11, %xmm1 2155 ; SSE-NEXT: psllw $5, %xmm0 2156 ; SSE-NEXT: por %xmm1, %xmm0 2157 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2158 ; SSE-NEXT: retq 2159 ; 2160 ; AVX-LABEL: splatconstant_rotate_mask_v8i16: 2161 ; AVX: # %bb.0: 2162 ; AVX-NEXT: vpsrlw $11, %xmm0, %xmm1 2163 ; AVX-NEXT: vpsllw $5, %xmm0, %xmm0 2164 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2165 ; AVX-NEXT: vpand 
{{.*}}(%rip), %xmm0, %xmm0 2166 ; AVX-NEXT: retq 2167 ; 2168 ; AVX512-LABEL: splatconstant_rotate_mask_v8i16: 2169 ; AVX512: # %bb.0: 2170 ; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1 2171 ; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 2172 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 2173 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2174 ; AVX512-NEXT: retq 2175 ; 2176 ; XOP-LABEL: splatconstant_rotate_mask_v8i16: 2177 ; XOP: # %bb.0: 2178 ; XOP-NEXT: vprotw $5, %xmm0, %xmm0 2179 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2180 ; XOP-NEXT: retq 2181 ; 2182 ; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16: 2183 ; X32-SSE: # %bb.0: 2184 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2185 ; X32-SSE-NEXT: psrlw $11, %xmm1 2186 ; X32-SSE-NEXT: psllw $5, %xmm0 2187 ; X32-SSE-NEXT: por %xmm1, %xmm0 2188 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2189 ; X32-SSE-NEXT: retl 2190 %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> 2191 %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11> 2192 %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55> 2193 %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33> 2194 %or = or <8 x i16> %lmask, %rmask 2195 ret <8 x i16> %or 2196 } 2197 2198 define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { 2199 ; SSE-LABEL: splatconstant_rotate_mask_v16i8: 2200 ; SSE: # %bb.0: 2201 ; SSE-NEXT: movdqa %xmm0, %xmm1 2202 ; SSE-NEXT: psrlw $4, %xmm1 2203 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1 2204 ; SSE-NEXT: psllw $4, %xmm0 2205 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2206 ; SSE-NEXT: por %xmm1, %xmm0 2207 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 2208 ; SSE-NEXT: retq 2209 ; 2210 ; AVX-LABEL: splatconstant_rotate_mask_v16i8: 2211 ; AVX: # %bb.0: 2212 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 2213 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2214 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 2215 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, 
%xmm0 2216 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2217 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2218 ; AVX-NEXT: retq 2219 ; 2220 ; AVX512-LABEL: splatconstant_rotate_mask_v16i8: 2221 ; AVX512: # %bb.0: 2222 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 2223 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 2224 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 2225 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2226 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 2227 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2228 ; AVX512-NEXT: retq 2229 ; 2230 ; XOP-LABEL: splatconstant_rotate_mask_v16i8: 2231 ; XOP: # %bb.0: 2232 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2233 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 2234 ; XOP-NEXT: retq 2235 ; 2236 ; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8: 2237 ; X32-SSE: # %bb.0: 2238 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 2239 ; X32-SSE-NEXT: psrlw $4, %xmm1 2240 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 2241 ; X32-SSE-NEXT: psllw $4, %xmm0 2242 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2243 ; X32-SSE-NEXT: por %xmm1, %xmm0 2244 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 2245 ; X32-SSE-NEXT: retl 2246 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2247 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2248 %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> 2249 %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> 2250 %or = or <16 x i8> %lmask, %rmask 2251 ret <16 x i8> %or 2252 } 2253