; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
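; For example (the first test below, vroll_v4i32_extract_shl, with 32-bit lanes):
;   ((x << 3) >> 25) | (x << 10)  ==  rotl(x << 3, 7)
; because 10 == 3 + 7 and 25 == 32 - 7, so one shift plus a rotate suffices.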

define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

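; Same idea with the lshr form: (x >> 40) | ((x >> 5) << 29) == rotl(x >> 5, 29),
; since 40 == 5 + (64 - 29).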
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

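; Multiply form: 640 == 10 << 6 and 26 + 6 == 32, so
; (x * 640) | ((x * 10) >> 26) == rotl(x * 10, 6) and only the multiply by 10 survives.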
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

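; udiv form: 384 == 3 << 7, so udiv by 384 is udiv by 3 followed by lshr 7, and
; ((x udiv 3) << 57) | (x udiv 384) == rotl(x udiv 3, 57) since 57 + 7 == 64. The
; udiv by 3 itself remains: a __udivdi3 libcall on X86, multiply-by-reciprocal on X64.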
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $44, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %zmm0, %zmm0
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X86-NEXT:    addl $44, %esp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vprolq $57, %zmm0, %zmm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

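; 1152 == 9 << 7 and 25 + 7 == 32, so the rotate of x * 9 by 7 is still extracted;
; the masking 'and' on the multiplied/shifted side becomes a single vpand applied
; after the rotate.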
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %zmm0, %zmm0
; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

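; The amounts line up here too (640 == 10 << 6, 10 + 6 == 16), but the elements are
; i16 and AVX512 only has 32/64-bit vector rotates (vprold/vprolq), so no rotate is
; formed and the mul/lshr/or sequence is kept.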
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm1
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
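; (50 + (24 - 11) == 63, one short of the 64-bit element width)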
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
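; (28 + (9 - 3) == 34, past the 32-bit element width)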
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
; CHECK-NEXT:    vpslld $25, %xmm0, %xmm2
; CHECK-NEXT:    vpand %xmm1, %xmm2, %xmm1
; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
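; (1536 == 3 << 9 and 23 + 9 == 32, but that leaves a multiply by 3 on one side and
; a multiply by 9 on the other, so there is no common value to rotate)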
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $23, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
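; (770 is not 3 * 2^k; 3 * 256 == 768, so x / 770 is not (x / 3) >> 8)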
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpsllq $56, %xmm1, %xmm1
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}