; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

; Exercises udiv combines (scalar and <4 x i32>/<8 x i16> vector) on x86-64
; across SSE4.1, AVX1 and AVX2. Assertion lines below are generated output;
; regenerate with update_llc_test_checks.py rather than editing by hand.

; fold (udiv x, 1) -> x
define i32 @combine_udiv_by_one(i32 %x) {
; CHECK-LABEL: combine_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (udiv x, -1) -> select((icmp eq x, -1), 1, 0)
define i32 @combine_udiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_udiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (udiv x, INT_MIN) -> (srl x, 31)
define i32 @combine_udiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_udiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    shrl $31, %edi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_minsigned:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; TODO fold (udiv x, x) -> 1
; (currently still emits a real division; the scalarized vector case below
; shows the per-lane div expansion that the missing fold would remove)
define i32 @combine_udiv_dupe(i32 %x) {
; CHECK-LABEL: combine_udiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %edx, %edx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    divl %edi
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrd $1, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
; SSE-NEXT:    pextrd $2, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $2, %eax, %xmm1
; SSE-NEXT:    pextrd $3, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrd $1, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $2, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (udiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_udiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

; non-uniform power-of-2 divisors: per-lane shifts (blended on SSE/AVX1,
; single variable shift on AVX2)
define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrld $4, %xmm0
; SSE-NEXT:    psrld $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

; variable power-of-2 divisor (1 << y): becomes a variable logical shift right
define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_shl_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; same fold with a non-uniform power-of-2 base vector
define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_shl_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (udiv x, c1)
; constant divisor lowered to multiply-by-magic-constant plus shift
define i32 @combine_udiv_uniform(i32 %x) {
; CHECK-LABEL: combine_udiv_uniform:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %ecx
; CHECK-NEXT:    movl $2987803337, %eax # imm = 0xB21642C9
; CHECK-NEXT:    imulq %rcx, %rax
; CHECK-NEXT:    shrq $36, %rax
; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 23
  ret i32 %1
}

define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_uniform:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_uniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
  ret <8 x i16> %1
}

; non-uniform constant divisors: currently scalarized, one magic-multiply
; (or shift / compare) sequence per lane
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    movzwl %ax, %ecx
; SSE-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    movzwl %ax, %eax
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    addl %ecx, %eax
; SSE-NEXT:    shrl $4, %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
; SSE-NEXT:    shrl $21, %eax
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
; SSE-NEXT:    shrl $29, %eax
; SSE-NEXT:    pinsrw $2, %eax, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    shrl $3, %eax
; SSE-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    pextrw $4, %xmm0, %eax
; SSE-NEXT:    shrl $7, %eax
; SSE-NEXT:    pinsrw $4, %eax, %xmm1
; SSE-NEXT:    pextrw $5, %xmm0, %eax
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT:    sete %cl
; SSE-NEXT:    pinsrw $5, %ecx, %xmm1
; SSE-NEXT:    pextrw $6, %xmm0, %eax
; SSE-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
; SSE-NEXT:    shrl $31, %eax
; SSE-NEXT:    pinsrw $6, %eax, %xmm1
; SSE-NEXT:    pextrw $7, %xmm0, %eax
; SSE-NEXT:    shrl $15, %eax
; SSE-NEXT:    pinsrw $7, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    movzwl %ax, %ecx
; AVX-NEXT:    imull $25645, %ecx, %ecx # imm = 0x642D
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    addl %ecx, %eax
; AVX-NEXT:    shrl $4, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    imull $61681, %eax, %eax # imm = 0xF0F1
; AVX-NEXT:    shrl $21, %eax
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    imull $8195, %eax, %eax # imm = 0x2003
; AVX-NEXT:    shrl $29, %eax
; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    shrl $3, %eax
; AVX-NEXT:    imull $9363, %eax, %eax # imm = 0x2493
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $4, %xmm0, %eax
; AVX-NEXT:    shrl $7, %eax
; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $5, %xmm0, %eax
; AVX-NEXT:    xorl %ecx, %ecx
; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
; AVX-NEXT:    sete %cl
; AVX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $6, %xmm0, %eax
; AVX-NEXT:    imull $32897, %eax, %eax # imm = 0x8081
; AVX-NEXT:    shrl $31, %eax
; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $7, %xmm0, %eax
; AVX-NEXT:    shrl $15, %eax
; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
  ret <8 x i16> %1
}