; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512VL
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512DQVL
      7 
       8 define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; Take the even 32-bit lanes of each operand, sign-extend them to i64, and
; multiply.  That is exactly the lane-wise operation (v)pmuldq performs, so
; the whole sequence should fold to a single instruction, as the
; autogenerated assertions below confirm for both run configurations.
; NOTE(review): the `mul` is flagged `nuw` although its operands are
; sign-extended; `nsw` would seem the matching flag for a sext-based
; combine -- confirm whether the PMULDQ fold depends on either flag.
       9 ; SSE-LABEL: combine_shuffle_sext_pmuldq:
      10 ; SSE:       # %bb.0:
      11 ; SSE-NEXT:    pmuldq %xmm1, %xmm0
      12 ; SSE-NEXT:    retq
      13 ;
      14 ; AVX-LABEL: combine_shuffle_sext_pmuldq:
      15 ; AVX:       # %bb.0:
      16 ; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
      17 ; AVX-NEXT:    retq
      18   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
      19   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
      20   %3 = sext <2 x i32> %1 to <2 x i64>
      21   %4 = sext <2 x i32> %2 to <2 x i64>
      22   %5 = mul nuw <2 x i64> %3, %4
      23   ret <2 x i64> %5
      24 }
     25 
      26 define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; Unsigned counterpart of the test above: take the even 32-bit lanes of
; each operand, zero-extend them to i64, and multiply.  (v)pmuludq performs
; exactly that zero-extending lane multiply, so the assertions below expect
; the whole sequence to collapse to one instruction per run configuration.
      27 ; SSE-LABEL: combine_shuffle_zext_pmuludq:
      28 ; SSE:       # %bb.0:
      29 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
      30 ; SSE-NEXT:    retq
      31 ;
      32 ; AVX-LABEL: combine_shuffle_zext_pmuludq:
      33 ; AVX:       # %bb.0:
      34 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      35 ; AVX-NEXT:    retq
      36   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
      37   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
      38   %3 = zext <2 x i32> %1 to <2 x i64>
      39   %4 = zext <2 x i32> %2 to <2 x i64>
      40   %5 = mul nuw <2 x i64> %3, %4
      41   ret <2 x i64> %5
      42 }
     43 
     44 ; TODO - blends are superfluous
      45 define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; Each shuffle keeps the even 32-bit lanes of its source and pulls the odd
; lanes from zeroinitializer, so after the bitcast every 64-bit lane holds
; a zero-extended 32-bit value; the i64 multiply is then equivalent to
; (v)pmuludq.  The current codegen (captured below) still materializes a
; zero register and blends it into both operands before the multiply --
; redundant work, since pmuludq only reads the low 32 bits of each 64-bit
; lane, which is what the file-level TODO about superfluous blends tracks.
      46 ; SSE-LABEL: combine_shuffle_zero_pmuludq:
      47 ; SSE:       # %bb.0:
      48 ; SSE-NEXT:    pxor %xmm2, %xmm2
      49 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
      50 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
      51 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
      52 ; SSE-NEXT:    retq
      53 ;
      54 ; AVX2-LABEL: combine_shuffle_zero_pmuludq:
      55 ; AVX2:       # %bb.0:
      56 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
      57 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
      58 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
      59 ; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      60 ; AVX2-NEXT:    retq
      61 ;
      62 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
      63 ; AVX512VL:       # %bb.0:
      64 ; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
      65 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
      66 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
      67 ; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      68 ; AVX512VL-NEXT:    retq
      69 ;
      70 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
      71 ; AVX512DQVL:       # %bb.0:
      72 ; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
      73 ; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
      74 ; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
      75 ; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      76 ; AVX512DQVL-NEXT:    retq
      77   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
      78   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
      79   %3 = bitcast <4 x i32> %1 to <2 x i64>
      80   %4 = bitcast <4 x i32> %2 to <2 x i64>
      81   %5 = mul <2 x i64> %3, %4
      82   ret <2 x i64> %5
      83 }
     84 
     85 ; TODO - blends are superfluous
      86 define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; 256-bit variant of the zero-blend pattern: odd 32-bit lanes are replaced
; with zero, the vectors are reinterpreted as <4 x i64>, and multiplied --
; i.e. a lane-wise zero-extended 32x32->64 multiply, matching (v)pmuludq.
; The captured codegen still emits the zeroing blends before the multiply
; (redundant, since pmuludq ignores the high half of each 64-bit lane).
; Under the SSE run line the 256-bit operation is legalized into two
; 128-bit halves (xmm0/xmm2 and xmm1/xmm3), hence the paired pblendw /
; pmuludq sequences below.
      87 ; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
      88 ; SSE:       # %bb.0:
      89 ; SSE-NEXT:    pxor %xmm4, %xmm4
      90 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
      91 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
      92 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
      93 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
      94 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
      95 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
      96 ; SSE-NEXT:    retq
      97 ;
      98 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
      99 ; AVX2:       # %bb.0:
     100 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     101 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
     102 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
     103 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     104 ; AVX2-NEXT:    retq
     105 ;
     106 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
     107 ; AVX512VL:       # %bb.0:
     108 ; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     109 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
     110 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
     111 ; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     112 ; AVX512VL-NEXT:    retq
     113 ;
     114 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
     115 ; AVX512DQVL:       # %bb.0:
     116 ; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     117 ; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
     118 ; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
     119 ; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     120 ; AVX512DQVL-NEXT:    retq
     121   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
     122   %2 = shufflevector <8 x i32> %a1, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
     123   %3 = bitcast <8 x i32> %1 to <4 x i64>
     124   %4 = bitcast <8 x i32> %2 to <4 x i64>
     125   %5 = mul <4 x i64> %3, %4
     126   ret <4 x i64> %5
     127 }
    128 
     129 define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; Zero-extend eight 32-bit lanes to i64 and multiply by a splatted 32-bit
; constant (715827883 = 0x2AAAAAAB).  Because both multiplicands have their
; upper 32 bits known zero, the multiply should select (v)pmuludq rather
; than a general 64-bit multiply.  The captured lowering differs by width:
; the SSE run splits the <8 x i64> result into four 128-bit pmuludq ops,
; AVX2 into two 256-bit ops, and both AVX512 runs use a single 512-bit
; vpmuludq with the constant broadcast from memory ({1to8}).
     130 ; SSE-LABEL: combine_zext_pmuludq_256:
     131 ; SSE:       # %bb.0:
     132 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
     133 ; SSE-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
     134 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
     135 ; SSE-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
     136 ; SSE-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
     137 ; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
     138 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [715827883,715827883]
     139 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
     140 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
     141 ; SSE-NEXT:    pmuludq %xmm1, %xmm4
     142 ; SSE-NEXT:    pmuludq %xmm1, %xmm3
     143 ; SSE-NEXT:    movdqa %xmm4, %xmm1
     144 ; SSE-NEXT:    retq
     145 ;
     146 ; AVX2-LABEL: combine_zext_pmuludq_256:
     147 ; AVX2:       # %bb.0:
     148 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     149 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
     150 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
     151 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [715827883,715827883,715827883,715827883]
     152 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
     153 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
     154 ; AVX2-NEXT:    retq
     155 ;
     156 ; AVX512VL-LABEL: combine_zext_pmuludq_256:
     157 ; AVX512VL:       # %bb.0:
     158 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
     159 ; AVX512VL-NEXT:    vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0
     160 ; AVX512VL-NEXT:    retq
     161 ;
     162 ; AVX512DQVL-LABEL: combine_zext_pmuludq_256:
     163 ; AVX512DQVL:       # %bb.0:
     164 ; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
     165 ; AVX512DQVL-NEXT:    vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0
     166 ; AVX512DQVL-NEXT:    retq
     167   %1 = zext <8 x i32> %a to <8 x i64>
     168   %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883>
     169   ret <8 x i64> %2
     170 }
    171