; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving AVX target shuffles

declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)

declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)

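; Applying the self-inverse <3,2,1,0> mask twice should fold away completely,
; leaving no shuffle instruction.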
define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_identity:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %2
}

define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_movddup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %1
}
define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
; ALL-LABEL: combine_vpermilvar_4f32_movddup_load:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%a0
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %2
}

define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_movshdup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_movsldup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_unpckh:
; ALL:       # BB#0:
; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_unpckl:
; ALL:       # BB#0:
; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
  ret <4 x float> %1
}

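; The 256-bit reversal mask applied twice should also fold to an identity,
; even with an undef index in the first mask.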
define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_identity:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
  ret <8 x float> %2
}

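; Two variable masks should combine into a single vpermilps using the merged
; 1,0,3,2,6,u,4,u pattern.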
define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_10326u4u:
; ALL:       # BB#0:
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
  ret <8 x float> %2
}

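; The two in-lane reversals cancel, so only the 128-bit lane swap from the
; shufflevector should remain, as a single vperm2f128.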
define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32:
; ALL:       # BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

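; As above, but blending with zero: the combined shuffle should zero the low
; 128-bit lane and move the original low lane into the high half.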
define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; ALL:       # BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

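; Keeping the low lane of the source and zeroing the high lane is matched as a
; blend with a zeroed register instead of a vperm2f128.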
define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
; ALL-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; ALL:       # BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %3
}

define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movddup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %1
}
define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movddup_load:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; ALL-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%a0
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %2
}

define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movshdup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
  ret <8 x float> %1
}

define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movsldup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
  ret <8 x float> %1
}

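; For vpermilvar.pd only bit 1 of each i64 index is used, so the <2,0> mask
; swaps the two elements; applying it twice should fold to an identity.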
define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_2f64_identity:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>  %1, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %2
}

define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_2f64_movddup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; ALL-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
  ret <2 x double> %1
}

define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_4f64_identity:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>  %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %2
}

define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_4f64_movddup:
; ALL:       # BB#0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
  ret <4 x double> %1
}

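; A chain of four vpermilvar shuffles should be merged into a single
; vpermilps with the composed 2,0,3,1 mask.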
define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_4stage:
; ALL:       # BB#0:
; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; ALL-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %4
}

define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_4stage:
; ALL:       # BB#0:
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; ALL-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %4
}

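; The reversal plus a zeroing blend should be recognized as a single vinsertps,
; which can zero the unused elements directly.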
define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
; ALL:       # BB#0:
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; ALL-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
  ret <4 x float> %2
}
    243