; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom

define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $44, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %zmm0, %zmm0
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X86-NEXT:    addl $44, %esp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vprolq $57, %zmm0, %zmm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %zmm0, %zmm0
; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm1
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
; CHECK-NEXT:    vpslld $25, %xmm0, %xmm2
; CHECK-NEXT:    vpand %xmm1, %xmm2, %xmm1
; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $23, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpsllq $56, %xmm1, %xmm1
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}
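
; The worked arithmetic below is an illustrative sketch (not part of the
; autogenerated assertions above) of why the positive cases can be rewritten
; as a rotate, using @vroll_v4i32_extract_shl as the example; all values are
; per 32-bit lane:
;
;   (a << 3) >> 25 | (a << 10)
;     = ((a << 3) >> 25) | ((a << 3) << 7)   ; since 10 = 3 + 7
;     = rotl(a << 3, 7)                      ; since 25 + 7 == 32, the lane width
;
; which matches the emitted vpslld $3 followed by vprold $7. In the negative
; cases the two shift amounts do not sum to the lane width (e.g. in
; @no_extract_shl, 50 + (24 - 11) == 63 != 64), so no rotate is formed and the
; or of the two shifted values is kept instead.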