; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP

; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    negl %edi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; TODO fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    cltd
; CHECK-NEXT:    idivl %edi
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrd $1, %xmm0, %ecx
; SSE-NEXT:    movl %ecx, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    movl %esi, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %esi
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
; SSE-NEXT:    pextrd $2, %xmm0, %ecx
; SSE-NEXT:    movl %ecx, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    pinsrd $2, %eax, %xmm1
; SSE-NEXT:    pextrd $3, %xmm0, %ecx
; SSE-NEXT:    movl %ecx, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    pinsrd $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    movl %ecx, %eax
; AVX-NEXT:    cltd
; AVX-NEXT:    idivl %ecx
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    vmovd %xmm0, %esi
; AVX-NEXT:    movl %esi, %eax
; AVX-NEXT:    cltd
; AVX-NEXT:    idivl %esi
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    movl %ecx, %eax
; AVX-NEXT:    cltd
; AVX-NEXT:    idivl %ecx
; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX-NEXT:    movl %ecx, %eax
; AVX-NEXT:    cltd
; AVX-NEXT:    idivl %ecx
; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pos0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrld $4, %xmm0
; SSE-NEXT:    psrld $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %2
}

; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
  ret <4 x i32> %1
}

define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm3
; SSE-NEXT:    paddb %xmm1, %xmm3
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psraw $4, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384]
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psraw $2, %xmm4
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psraw $1, %xmm4
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psraw $4, %xmm4
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psraw $2, %xmm4
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psraw $1, %xmm4
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    packuswb %xmm2, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16384,32800,41056,8384,16384,32800,41056,8384]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm3
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT:    vpsraw $4, %xmm2, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16384,32800,41056,8384,16384,32800,41056,8384]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $2, %xmm2, %xmm3
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $1, %xmm2, %xmm3
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT:    vpsraw $4, %xmm1, %xmm3
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm1, %xmm3
; AVX2-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsraw $1, %xmm1, %xmm3
; AVX2-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    movw $257, %ax # imm = 0x101
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
  ret <16 x i8> %1
}

define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $15, %xmm1
; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psraw $4, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psraw $2, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <8 x i16> %1
}

define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psraw $15, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
; SSE-NEXT:    pmulhuw %xmm3, %xmm2
; SSE-NEXT:    paddw %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psraw $4, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4],xmm4[5,6],xmm2[7]
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $2, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    psraw $1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psraw $4, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3],xmm0[4],xmm3[5,6],xmm0[7]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psraw $1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7]
; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm4
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm4, %ymm4
; AVX2-NEXT:    vpaddw %ymm4, %ymm0, %ymm4
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX2-NEXT:    vpsravd %ymm3, %ymm5, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX2-NEXT:    vpsravd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vpsraw $15, %xmm1, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [65520,65522,65521,65524,65523,65525,65526,65521]
; XOP-NEXT:    vpshlw %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535]
; XOP-NEXT:    vpshaw %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsraw $15, %xmm0, %xmm4
; XOP-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <16 x i16> %1
}

define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2]
; SSE-NEXT:    pmulhuw %xmm5, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm6
; SSE-NEXT:    psraw $4, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3],xmm0[4],xmm6[5,6],xmm0[7]
; SSE-NEXT:    movdqa %xmm6, %xmm7
; SSE-NEXT:    psraw $2, %xmm7
; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
; SSE-NEXT:    movdqa %xmm7, %xmm0
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    psraw $15, %xmm1
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    paddw %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm6
; SSE-NEXT:    psraw $4, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3],xmm1[4],xmm6[5,6],xmm1[7]
; SSE-NEXT:    movdqa %xmm6, %xmm7
; SSE-NEXT:    psraw $2, %xmm7
; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
; SSE-NEXT:    movdqa %xmm7, %xmm1
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psraw $15, %xmm4
; SSE-NEXT:    pmulhuw %xmm5, %xmm4
; SSE-NEXT:    paddw %xmm2, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm6
; SSE-NEXT:    psraw $4, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4],xmm6[5,6],xmm4[7]
; SSE-NEXT:    movdqa %xmm6, %xmm7
; SSE-NEXT:    psraw $2, %xmm7
; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
; SSE-NEXT:    movdqa %xmm7, %xmm4
; SSE-NEXT:    psraw $1, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    psraw $15, %xmm2
; SSE-NEXT:    pmulhuw %xmm5, %xmm2
; SSE-NEXT:    paddw %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psraw $4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7]
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    psraw $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5],xmm2[6],xmm5[7]
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psraw $1, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm4, %xmm2
; SSE-NEXT:    movdqa %xmm5, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsraw $15, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7]
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm4
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7]
; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vandnps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsraw $15, %xmm3, %xmm5
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4],xmm5[5,6],xmm3[7]
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7]
770 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm5 771 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm2 772 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm2 773 ; AVX1-NEXT: vpsraw $4, %xmm2, %xmm5 774 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7] 775 ; AVX1-NEXT: vpsraw $2, %xmm2, %xmm5 776 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] 777 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm5 778 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] 779 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 780 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 781 ; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1 782 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 783 ; AVX1-NEXT: retq 784 ; 785 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 786 ; AVX2: # %bb.0: 787 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 788 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] 789 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] 790 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm5 791 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2] 792 ; AVX2-NEXT: # ymm6 = mem[0,1,0,1] 793 ; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5 794 ; AVX2-NEXT: vpaddw %ymm5, %ymm0, %ymm5 795 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] 796 ; AVX2-NEXT: vpsravd %ymm4, %ymm7, %ymm7 797 ; AVX2-NEXT: vpsrld $16, %ymm7, %ymm7 798 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] 799 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] 
800 ; AVX2-NEXT: vpsravd %ymm3, %ymm5, %ymm5 801 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 802 ; AVX2-NEXT: vpackusdw %ymm7, %ymm5, %ymm5 803 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] 804 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm5 805 ; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5 806 ; AVX2-NEXT: vpaddw %ymm5, %ymm1, %ymm5 807 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] 808 ; AVX2-NEXT: vpsravd %ymm4, %ymm6, %ymm4 809 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 810 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] 811 ; AVX2-NEXT: vpsravd %ymm3, %ymm2, %ymm2 812 ; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 813 ; AVX2-NEXT: vpackusdw %ymm4, %ymm2, %ymm2 814 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 815 ; AVX2-NEXT: retq 816 ; 817 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 818 ; AVX512F: # %bb.0: 819 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 820 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2] 821 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] 822 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 823 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 824 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 825 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] 826 ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] 827 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2 828 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 829 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 830 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2 831 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 832 ; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2 833 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 834 ; 
AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2 835 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 836 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 837 ; AVX512F-NEXT: retq 838 ; 839 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 840 ; AVX512BW: # %bb.0: 841 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1 842 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 843 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 844 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1 845 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 846 ; AVX512BW-NEXT: kmovd %eax, %k1 847 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} 848 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 849 ; AVX512BW-NEXT: retq 850 ; 851 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 852 ; XOP: # %bb.0: 853 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 854 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 855 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [65520,65522,65521,65524,65523,65525,65526,65521] 856 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 857 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 858 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535] 859 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 860 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 861 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 862 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 863 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 864 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 865 ; XOP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 866 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 867 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 868 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 869 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6 870 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2 871 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 872 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6 873 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4 874 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4 875 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3 876 ; XOP-NEXT: 
vinsertf128 $1, %xmm2, %ymm3, %ymm2 877 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1 878 ; XOP-NEXT: retq 879 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 880 ret <32 x i16> %1 881 } 882 883 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { 884 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 885 ; SSE: # %bb.0: 886 ; SSE-NEXT: movdqa %xmm0, %xmm1 887 ; SSE-NEXT: psrad $31, %xmm1 888 ; SSE-NEXT: movdqa %xmm1, %xmm2 889 ; SSE-NEXT: psrld $28, %xmm2 890 ; SSE-NEXT: movdqa %xmm1, %xmm3 891 ; SSE-NEXT: psrld $30, %xmm3 892 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 893 ; SSE-NEXT: psrld $29, %xmm1 894 ; SSE-NEXT: pxor %xmm2, %xmm2 895 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 896 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 897 ; SSE-NEXT: paddd %xmm0, %xmm2 898 ; SSE-NEXT: movdqa %xmm2, %xmm1 899 ; SSE-NEXT: movdqa %xmm2, %xmm3 900 ; SSE-NEXT: psrad $3, %xmm3 901 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] 902 ; SSE-NEXT: psrad $4, %xmm2 903 ; SSE-NEXT: psrad $2, %xmm1 904 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 905 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] 906 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] 907 ; SSE-NEXT: retq 908 ; 909 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 910 ; AVX1: # %bb.0: 911 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 912 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 913 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 914 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 915 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 916 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 917 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] 918 ; AVX1-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 919 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 920 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 921 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 922 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 923 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 924 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 925 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 926 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 927 ; AVX1-NEXT: retq 928 ; 929 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 930 ; AVX2ORLATER: # %bb.0: 931 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 932 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 933 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 934 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 935 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 936 ; AVX2ORLATER-NEXT: retq 937 ; 938 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 939 ; XOP: # %bb.0: 940 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 941 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1 942 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 943 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1 944 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 945 ; XOP-NEXT: retq 946 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16> 947 ret <4 x i32> %1 948 } 949 950 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { 951 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 952 ; SSE: # %bb.0: 953 ; SSE-NEXT: movdqa %xmm0, %xmm2 954 ; SSE-NEXT: movdqa %xmm0, %xmm3 955 ; SSE-NEXT: psrad $31, %xmm3 956 ; SSE-NEXT: movdqa %xmm3, %xmm0 957 ; SSE-NEXT: psrld $28, %xmm0 958 ; SSE-NEXT: movdqa %xmm3, %xmm4 959 ; SSE-NEXT: psrld $30, %xmm4 960 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5,6,7] 961 ; SSE-NEXT: psrld $29, %xmm3 962 ; SSE-NEXT: pxor %xmm5, %xmm5 963 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 964 ; SSE-NEXT: pblendw 
{{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 965 ; SSE-NEXT: paddd %xmm2, %xmm3 966 ; SSE-NEXT: movdqa %xmm3, %xmm4 967 ; SSE-NEXT: movdqa %xmm3, %xmm0 968 ; SSE-NEXT: psrad $3, %xmm0 969 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 970 ; SSE-NEXT: psrad $4, %xmm3 971 ; SSE-NEXT: psrad $2, %xmm4 972 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 973 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 974 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] 975 ; SSE-NEXT: movdqa %xmm1, %xmm3 976 ; SSE-NEXT: psrad $31, %xmm3 977 ; SSE-NEXT: movdqa %xmm3, %xmm2 978 ; SSE-NEXT: psrld $28, %xmm2 979 ; SSE-NEXT: movdqa %xmm3, %xmm4 980 ; SSE-NEXT: psrld $30, %xmm4 981 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] 982 ; SSE-NEXT: psrld $29, %xmm3 983 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 984 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 985 ; SSE-NEXT: paddd %xmm1, %xmm3 986 ; SSE-NEXT: movdqa %xmm3, %xmm4 987 ; SSE-NEXT: movdqa %xmm3, %xmm2 988 ; SSE-NEXT: psrad $3, %xmm2 989 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 990 ; SSE-NEXT: psrad $4, %xmm3 991 ; SSE-NEXT: psrad $2, %xmm4 992 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 993 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 994 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] 995 ; SSE-NEXT: movdqa %xmm2, %xmm1 996 ; SSE-NEXT: retq 997 ; 998 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 999 ; AVX1: # %bb.0: 1000 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1001 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 1002 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1003 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1004 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1005 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1006 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 1007 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm4[0,1,2,3],xmm2[4,5,6,7] 1008 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1009 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1010 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1011 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1012 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1013 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 1014 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1015 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1016 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 1017 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1018 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm5 1019 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 1020 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1021 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 1022 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1023 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 1024 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1025 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1026 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1027 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 1028 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1029 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1030 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1031 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1032 ; AVX1-NEXT: retq 1033 ; 1034 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1035 ; AVX2ORLATER: # %bb.0: 1036 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1 1037 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 1038 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1 1039 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1 1040 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1041 ; AVX2ORLATER-NEXT: retq 1042 ; 1043 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1044 ; XOP: # %bb.0: 1045 ; XOP-NEXT: 
vextractf128 $1, %ymm0, %xmm1 1046 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 1047 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967264,4294967266,4294967267,4294967268] 1048 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 1049 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1050 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] 1051 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1 1052 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 1053 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 1054 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1055 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2 1056 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1057 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1058 ; XOP-NEXT: retq 1059 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1060 ret <8 x i32> %1 1061 } 1062 1063 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { 1064 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1065 ; SSE: # %bb.0: 1066 ; SSE-NEXT: movdqa %xmm1, %xmm4 1067 ; SSE-NEXT: movdqa %xmm0, %xmm1 1068 ; SSE-NEXT: movdqa %xmm0, %xmm6 1069 ; SSE-NEXT: psrad $31, %xmm6 1070 ; SSE-NEXT: movdqa %xmm6, %xmm0 1071 ; SSE-NEXT: psrld $28, %xmm0 1072 ; SSE-NEXT: movdqa %xmm6, %xmm7 1073 ; SSE-NEXT: psrld $30, %xmm7 1074 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4,5,6,7] 1075 ; SSE-NEXT: psrld $29, %xmm6 1076 ; SSE-NEXT: pxor %xmm5, %xmm5 1077 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1078 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] 1079 ; SSE-NEXT: paddd %xmm1, %xmm6 1080 ; SSE-NEXT: movdqa %xmm6, %xmm7 1081 ; SSE-NEXT: movdqa %xmm6, %xmm0 1082 ; SSE-NEXT: psrad $3, %xmm0 1083 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] 1084 ; SSE-NEXT: psrad $4, %xmm6 1085 ; SSE-NEXT: psrad $2, %xmm7 1086 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 1087 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] 1088 ; SSE-NEXT: pblendw 
{{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1089 ; SSE-NEXT: movdqa %xmm4, %xmm6 1090 ; SSE-NEXT: psrad $31, %xmm6 1091 ; SSE-NEXT: movdqa %xmm6, %xmm1 1092 ; SSE-NEXT: psrld $28, %xmm1 1093 ; SSE-NEXT: movdqa %xmm6, %xmm7 1094 ; SSE-NEXT: psrld $30, %xmm7 1095 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] 1096 ; SSE-NEXT: psrld $29, %xmm6 1097 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1098 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] 1099 ; SSE-NEXT: paddd %xmm4, %xmm6 1100 ; SSE-NEXT: movdqa %xmm6, %xmm7 1101 ; SSE-NEXT: movdqa %xmm6, %xmm1 1102 ; SSE-NEXT: psrad $3, %xmm1 1103 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] 1104 ; SSE-NEXT: psrad $4, %xmm6 1105 ; SSE-NEXT: psrad $2, %xmm7 1106 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 1107 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7] 1108 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] 1109 ; SSE-NEXT: movdqa %xmm2, %xmm6 1110 ; SSE-NEXT: psrad $31, %xmm6 1111 ; SSE-NEXT: movdqa %xmm6, %xmm4 1112 ; SSE-NEXT: psrld $28, %xmm4 1113 ; SSE-NEXT: movdqa %xmm6, %xmm7 1114 ; SSE-NEXT: psrld $30, %xmm7 1115 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5,6,7] 1116 ; SSE-NEXT: psrld $29, %xmm6 1117 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1118 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] 1119 ; SSE-NEXT: paddd %xmm2, %xmm6 1120 ; SSE-NEXT: movdqa %xmm6, %xmm7 1121 ; SSE-NEXT: movdqa %xmm6, %xmm4 1122 ; SSE-NEXT: psrad $3, %xmm4 1123 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] 1124 ; SSE-NEXT: psrad $4, %xmm6 1125 ; SSE-NEXT: psrad $2, %xmm7 1126 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 1127 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7] 1128 ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] 1129 ; SSE-NEXT: movdqa 
%xmm3, %xmm2 1130 ; SSE-NEXT: psrad $31, %xmm2 1131 ; SSE-NEXT: movdqa %xmm2, %xmm6 1132 ; SSE-NEXT: psrld $28, %xmm6 1133 ; SSE-NEXT: movdqa %xmm2, %xmm7 1134 ; SSE-NEXT: psrld $30, %xmm7 1135 ; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 1136 ; SSE-NEXT: psrld $29, %xmm2 1137 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] 1138 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7] 1139 ; SSE-NEXT: paddd %xmm3, %xmm2 1140 ; SSE-NEXT: movdqa %xmm2, %xmm6 1141 ; SSE-NEXT: movdqa %xmm2, %xmm5 1142 ; SSE-NEXT: psrad $3, %xmm5 1143 ; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7] 1144 ; SSE-NEXT: psrad $4, %xmm2 1145 ; SSE-NEXT: psrad $2, %xmm6 1146 ; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] 1147 ; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1148 ; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] 1149 ; SSE-NEXT: movdqa %xmm4, %xmm2 1150 ; SSE-NEXT: movdqa %xmm5, %xmm3 1151 ; SSE-NEXT: retq 1152 ; 1153 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1154 ; AVX1: # %bb.0: 1155 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1156 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm2 1157 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm4 1158 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm5 1159 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1160 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm5 1161 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1162 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7] 1163 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] 1164 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 1165 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1166 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1167 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1168 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm5 1169 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1170 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1171 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 1172 ; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5 1173 ; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6 1174 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1175 ; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4 1176 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1177 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1178 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm4 1179 ; AVX1-NEXT: vpsrad $4, %xmm4, %xmm5 1180 ; AVX1-NEXT: vpsrad $2, %xmm4, %xmm6 1181 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1182 ; AVX1-NEXT: vpsrad $3, %xmm4, %xmm6 1183 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] 1184 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1185 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 1186 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] 1187 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1188 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 1189 ; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5 1190 ; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6 1191 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1192 ; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4 1193 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1194 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1195 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 1196 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1197 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1198 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1199 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm5 1200 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1201 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1202 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 1203 ; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5 1204 ; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6 1205 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm6[0,1,2,3],xmm5[4,5,6,7] 1206 ; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4 1207 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1208 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] 1209 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm2 1210 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm4 1211 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm5 1212 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1213 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm5 1214 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] 1215 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1216 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1217 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1218 ; AVX1-NEXT: retq 1219 ; 1220 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1221 ; AVX2: # %bb.0: 1222 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 1223 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [32,30,29,28,32,30,29,28] 1224 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1225 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1226 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 1227 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4] 1228 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1229 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1230 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1231 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 1232 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1233 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2 1234 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1235 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1236 ; AVX2-NEXT: retq 1237 ; 1238 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1239 ; AVX512F: # %bb.0: 1240 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1 1241 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 1242 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1243 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 1244 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 1245 ; 
AVX512F-NEXT: kmovw %eax, %k1 1246 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1247 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1248 ; AVX512F-NEXT: retq 1249 ; 1250 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1251 ; AVX512BW: # %bb.0: 1252 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1 1253 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 1254 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1255 ; AVX512BW-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 1256 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111 1257 ; AVX512BW-NEXT: kmovd %eax, %k1 1258 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1259 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1260 ; AVX512BW-NEXT: retq 1261 ; 1262 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1263 ; XOP: # %bb.0: 1264 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1265 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 1266 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967264,4294967266,4294967267,4294967268] 1267 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 1268 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1269 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292] 1270 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1271 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 1272 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1273 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5 1274 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5 1275 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 1276 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1277 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1278 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5 1279 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1280 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2 1281 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1282 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5 1283 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4 1284 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4 1285 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3 1286 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1287 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1288 ; XOP-NEXT: retq 1289 %1 
= sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1290 ret <16 x i32> %1 1291 } 1292 1293 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { 1294 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1295 ; SSE: # %bb.0: 1296 ; SSE-NEXT: movdqa %xmm0, %xmm1 1297 ; SSE-NEXT: psrad $31, %xmm1 1298 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1299 ; SSE-NEXT: psrlq $62, %xmm1 1300 ; SSE-NEXT: pxor %xmm2, %xmm2 1301 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1302 ; SSE-NEXT: paddq %xmm0, %xmm2 1303 ; SSE-NEXT: movdqa %xmm2, %xmm1 1304 ; SSE-NEXT: psrlq $2, %xmm1 1305 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1306 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] 1307 ; SSE-NEXT: pxor %xmm2, %xmm1 1308 ; SSE-NEXT: psubq %xmm2, %xmm1 1309 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1310 ; SSE-NEXT: retq 1311 ; 1312 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1313 ; AVX1: # %bb.0: 1314 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1315 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1316 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1317 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1318 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1319 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm2 1320 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1321 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] 1322 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 1323 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1324 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1325 ; AVX1-NEXT: retq 1326 ; 1327 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1328 ; AVX2: # %bb.0: 1329 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1330 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1331 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1 1332 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1333 ; AVX2-NEXT: 
movl $2, %eax 1334 ; AVX2-NEXT: vmovq %rax, %xmm2 1335 ; AVX2-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 1336 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1 1337 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] 1338 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 1339 ; AVX2-NEXT: vpsubq %xmm2, %xmm1, %xmm1 1340 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1341 ; AVX2-NEXT: retq 1342 ; 1343 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1344 ; AVX512F: # %bb.0: 1345 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1346 ; AVX512F-NEXT: movl $2, %eax 1347 ; AVX512F-NEXT: vmovq %rax, %xmm1 1348 ; AVX512F-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] 1349 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 1350 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm2, %xmm2 1351 ; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1352 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1 1353 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1354 ; AVX512F-NEXT: vzeroupper 1355 ; AVX512F-NEXT: retq 1356 ; 1357 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1358 ; AVX512BW: # %bb.0: 1359 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1 1360 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1 1361 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1362 ; AVX512BW-NEXT: movl $2, %eax 1363 ; AVX512BW-NEXT: vmovq %rax, %xmm2 1364 ; AVX512BW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 1365 ; AVX512BW-NEXT: vpsravq %xmm2, %xmm1, %xmm1 1366 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1367 ; AVX512BW-NEXT: retq 1368 ; 1369 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1370 ; XOP: # %bb.0: 1371 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1 1372 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1 1373 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1374 ; XOP-NEXT: movq $-2, %rax 1375 ; XOP-NEXT: vmovq %rax, %xmm2 1376 ; XOP-NEXT: vpslldq 
{{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 1377 ; XOP-NEXT: vpshaq %xmm2, %xmm1, %xmm1 1378 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1379 ; XOP-NEXT: retq 1380 %1 = sdiv <2 x i64> %x, <i64 1, i64 4> 1381 ret <2 x i64> %1 1382 } 1383 1384 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { 1385 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1386 ; SSE: # %bb.0: 1387 ; SSE-NEXT: movdqa %xmm1, %xmm2 1388 ; SSE-NEXT: psrad $31, %xmm1 1389 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1390 ; SSE-NEXT: movdqa %xmm1, %xmm3 1391 ; SSE-NEXT: psrlq $60, %xmm3 1392 ; SSE-NEXT: psrlq $61, %xmm1 1393 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1394 ; SSE-NEXT: paddq %xmm2, %xmm1 1395 ; SSE-NEXT: movdqa %xmm1, %xmm2 1396 ; SSE-NEXT: psrlq $4, %xmm2 1397 ; SSE-NEXT: psrlq $3, %xmm1 1398 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1399 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488] 1400 ; SSE-NEXT: pxor %xmm2, %xmm1 1401 ; SSE-NEXT: psubq %xmm2, %xmm1 1402 ; SSE-NEXT: movdqa %xmm0, %xmm2 1403 ; SSE-NEXT: psrad $31, %xmm2 1404 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1405 ; SSE-NEXT: psrlq $62, %xmm2 1406 ; SSE-NEXT: pxor %xmm3, %xmm3 1407 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1408 ; SSE-NEXT: paddq %xmm0, %xmm3 1409 ; SSE-NEXT: movdqa %xmm3, %xmm2 1410 ; SSE-NEXT: psrlq $2, %xmm2 1411 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1412 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952] 1413 ; SSE-NEXT: pxor %xmm3, %xmm2 1414 ; SSE-NEXT: psubq %xmm3, %xmm2 1415 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1416 ; SSE-NEXT: retq 1417 ; 1418 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1419 ; AVX1: # %bb.0: 1420 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1421 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1422 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 
; AVX1-NEXT:    vpsrlq $60, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlq $61, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlq $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vpsrlq $62, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,3,4]
; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm2
; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsravq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraq $63, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
; XOP-NEXT:    movq $-2, %rax
; XOP-NEXT:    vmovq %rax, %xmm3
; XOP-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
  ret <4 x i64> %1
}

; Same non-uniform power-of-two divisors as the v4i64 case above, repeated
; across two 256-bit halves: <1,4,8,16,1,4,8,16>. Lanes divided by 1 are
; blended back in unchanged.
define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    psrlq $60, %xmm5
; SSE-NEXT:    psrlq $61, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $4, %xmm3
; SSE-NEXT:    psrlq $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
; SSE-NEXT:    pxor %xmm5, %xmm1
; SSE-NEXT:    psubq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm4, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrlq $60, %xmm6
; SSE-NEXT:    psrlq $61, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    paddq %xmm4, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $4, %xmm4
; SSE-NEXT:    psrlq $3, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    pxor %xmm5, %xmm3
; SSE-NEXT:    psubq %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrad $31, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm6
; SSE-NEXT:    psrlq $2, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,2305843009213693952]
; SSE-NEXT:    pxor %xmm4, %xmm6
; SSE-NEXT:    psubq %xmm4, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm6
; SSE-NEXT:    psrad $31, %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    paddq %xmm2, %xmm6
; SSE-NEXT:    movdqa %xmm6, %xmm5
; SSE-NEXT:    psrlq $2, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pxor %xmm4, %xmm5
; SSE-NEXT:    psubq %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $60, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlq $61, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT:    vpsrlq $62, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm5
; AVX1-NEXT:    vpsrlq $2, %xmm5, %xmm6
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm5
; AVX1-NEXT:    vpsrlq $60, %xmm5, %xmm7
; AVX1-NEXT:    vpsrlq $61, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm5
; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $62, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [64,62,61,60]
; AVX2-NEXT:    vpsrlvq %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,2,3,4]
; AVX2-NEXT:    vpsrlvq %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm3
; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    movb $17, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraq $63, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    movb $17, %al
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm4
; XOP-NEXT:    vmovdqa {{.*#+}} xmm8 = [18446744073709551555,18446744073709551556]
; XOP-NEXT:    vpshlq %xmm8, %xmm4, %xmm4
; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm0, %xmm6
; XOP-NEXT:    vmovdqa {{.*#+}} xmm7 = [18446744073709551552,18446744073709551554]
; XOP-NEXT:    vpshlq %xmm7, %xmm6, %xmm6
; XOP-NEXT:    vpaddq %xmm6, %xmm0, %xmm6
; XOP-NEXT:    movq $-2, %rax
; XOP-NEXT:    vmovq %rax, %xmm5
; XOP-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7]
; XOP-NEXT:    vpshaq %xmm5, %xmm6, %xmm6
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm6
; XOP-NEXT:    vpshlq %xmm8, %xmm6, %xmm6
; XOP-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm1, %xmm3
; XOP-NEXT:    vpshlq %xmm7, %xmm3, %xmm3
; XOP-NEXT:    vpaddq %xmm3, %xmm1, %xmm3
; XOP-NEXT:    vpshaq %xmm5, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
  ret <8 x i64> %1
}

; Mixed positive and negative power-of-two divisors <1,-4,8,-16>: after the
; arithmetic-shift sequence the lanes with negative divisors are negated
; (psubd from zero) and blended back in; the divide-by-1 lane keeps %x.
define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrld $28, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psrld $30, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrld $29, %xmm2
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrad $4, %xmm2
; SSE-NEXT:    psrad $2, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE-NEXT:    psubd %xmm1, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
  ret <4 x i32> %1
}

; With any undef divisor element the whole sdiv folds away: all three undef
; variants below are expected to compile to a bare ret.
define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
  ret <4 x i32> %1
}

; PR37119
; Divisors are only +1/-1, so the whole division reduces to a negate (psubb
; from zero) blended with the original value on the +1 lanes.
define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
; SSE-LABEL: non_splat_minus_one_divisor_0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psubb %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: non_splat_minus_one_divisor_0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpsubb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_0:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %div
}

; Divisors mix -1, 2 and -128, so both the power-of-two shift lowering (no
; i8 shifts on x86, hence the widen-to-i16 psraw sequences) and the
; negate-plus-blend for the negative lanes are required.
define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE-LABEL: non_splat_minus_one_divisor_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm4
; SSE-NEXT:    psrlw $8, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm4
; SSE-NEXT:    paddb %xmm1, %xmm4
; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $4, %xmm5
; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376]
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $2, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $1, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $4, %xmm5
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $2, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $1, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    psrlw $8, %xmm4
; SSE-NEXT:    packuswb %xmm3, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
; SSE-NEXT:    psubb %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,32,0,32,8192,8224,57376,57376]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: non_splat_minus_one_divisor_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX2-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,32,0,32,8192,8224,57376,57376]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT:    vpsraw $4, %xmm2, %xmm4
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $2, %xmm2, %xmm4
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $1, %xmm2, %xmm4
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm0
; AVX512BW-NEXT:    movw $24132, %ax # imm = 0x5E44
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm2, %xmm0 {%k1}
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_1:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
  ret <16 x i8> %div
}

; Divisors <-1,1,2,-2>: the power-of-two lanes use the shift-with-rounding
; sequence, and the negative-divisor lanes are negated (psubd from zero) and
; blended back in.
define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
; SSE-LABEL: non_splat_minus_one_divisor_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $31, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $1, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psubd %xmm1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
  ret <4 x i32> %div
}