; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST

; Each function below exercises one DAGCombiner fold of a vector logical
; shift right (lshr / ISD::SRL).  The CHECK lines are autogenerated by
; update_llc_test_checks.py -- regenerate them with that script rather than
; editing them by hand.

; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (srl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

; fold (srl x, 0) -> x
define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (srl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $11, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $9, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $10, %xmm1
; SSE-NEXT:    psrld $8, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 8, i32 9, i32 10, i32 11>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $10, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $6, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $8, %xmm1
; SSE-NEXT:    psrld $4, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = lshr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> 0
define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $48, %xmm1
; SSE-NEXT:    psrlq $48, %xmm0
; SSE-NEXT:    packusdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlq $35, %xmm2
; SSE-NEXT:    psrlq $34, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $33, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    psrld $19, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    psrld $17, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    psrld $18, %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
  ret <4 x i32> %3
}

; fold (srl (trunc (srl x, c1)), c2) -> 0
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlq $51, %xmm2
; SSE-NEXT:    psrlq $50, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $49, %xmm2
; SSE-NEXT:    psrlq $48, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    packusdw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $27, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $25, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $26, %xmm1
; SSE-NEXT:    psrld $24, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 49, i64 50, i64 51>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 25, i32 26, i32 27>
  ret <4 x i32> %3
}

; fold (srl (shl x, c), c) -> (and x, cst2)
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
  %2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i32> %2
}

; fold (srl (sra X, Y), 31) -> (srl X, 31)
define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_lshr_ashr_sign:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_ashr_sign:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, %y
  %2 = lshr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %2
}

; fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit0:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $4, %xmm0
; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $4, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}

; Negative variant: the masked value may have a bit set anywhere, so the
; ctlz cannot be folded away and a full vectorized lzcnt sequence is emitted.
define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pshufb %xmm1, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pshufb %xmm1, %xmm3
; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    paddb %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pcmpeqb %xmm2, %xmm3
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrld $5, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_lshr_trunc_and:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %x, %2
  ret <4 x i32> %3
}