; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST

; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sra -1, x) -> -1
define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_allones:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_allones:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
  ret <4 x i32> %1
}

; fold (sra x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

; fold (sra x, 0) -> x
define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = ashr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $10, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $6, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $8, %xmm1
; SSE-NEXT:    psrad $4, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = ashr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr2:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $27, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $5, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $10, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>
  %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>
  ret <4 x i32> %2
}

; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_ashr_trunc_and:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrad %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrad %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    psrad $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    psrad $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

; If the sign bit is known to be zero, switch this to a SRL.
define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 255, i32 4095, i32 65535>
  %2 = ashr <4 x i32> %1, %y
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
  %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
  ret <4 x i32> %2
}