; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
;
; Combine tests involving AVX target shuffles

declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)

declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)

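; Back-to-back vpermilvar.ps shuffles with the reversing mask <3,2,1,0> cancel
; each other out, so no shuffle instruction should be emitted.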
define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %2
}

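; A vpermilvar.ps mask of <0,1,0,1> duplicates the low 64 bits and should lower
; to vmovddup; the _load variant checks that the memory operand is folded.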
define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %1
}
define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%a0
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %2
}

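; The <undef,1,3,3> mask duplicates the odd elements and should select vmovshdup.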
define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movshdup:
; X32:       # %bb.0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movshdup:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
  ret <4 x float> %1
}

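; The <0,0,2,undef> mask duplicates the even elements and should select vmovsldup.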
define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movsldup:
; X32:       # %bb.0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movsldup:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
  ret <4 x float> %1
}

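; The unpckh/unpckl-style masks in the next two tests are emitted as a single
; immediate vpermilps rather than a variable shuffle.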
define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckh:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckh:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckl:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckl:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
  ret <4 x float> %1
}

define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
  ret <8 x float> %2
}

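; Two variable vpermilvar.ps.256 shuffles should combine into a single immediate
; vpermilps, with the undef mask elements preserved.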
define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_10326u4u:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_10326u4u:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
  ret <8 x float> %2
}

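; The in-lane reversals cancel out, leaving only the 128-bit lane swap: a single
; vperm2f128 on AVX1 and a single vpermpd on AVX2/AVX512.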
define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
; X32-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX1:       # %bb.0:
; X32-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX2:       # %bb.0:
; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX512-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

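; As above, but blending with zero: everything should fold into a single
; vperm2f128 that zeroes the low 128-bit lane.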
define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; X32:       # %bb.0:
; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
; X32-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT:    vmovapd %xmm0, %xmm0
; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT:    vmovapd %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %3
}

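; The 256-bit movddup/movshdup/movsldup patterns below mirror the 128-bit cases;
; the _load variant again checks that the memory operand is folded.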
define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %1
}
define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X64-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%a0
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %2
}

define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movshdup:
; X32:       # %bb.0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movshdup:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
  ret <8 x float> %1
}

define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movsldup:
; X32:       # %bb.0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movsldup:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
  ret <8 x float> %1
}

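; The same identity and movddup combines apply to the vpermilvar.pd forms.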
define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_2f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>  %1, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %2
}

define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_2f64_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
  ret <2 x double> %1
}

define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>  %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %2
}

define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f64_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
  ret <4 x double> %1
}

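; A chain of four variable shuffles should still combine into a single immediate
; vpermilps.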
define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_4stage:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_4stage:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %4
}

define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_4stage:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_4stage:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %4
}

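; A vpermilvar shuffle followed by a blend with zero should combine into a single
; vinsertps that zeroes the unused elements.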
define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_as_insertps:
; X32:       # %bb.0:
; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_as_insertps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
  ret <4 x float> %2
}

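; vpermilvar shuffles of constant vectors should be constant folded, leaving only
; a load of the pre-shuffled constant.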
define <2 x double> @constant_fold_vpermilvar_pd() {
; X32-LABEL: constant_fold_vpermilvar_pd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %1
}

define <4 x double> @constant_fold_vpermilvar_pd_256() {
; X32-LABEL: constant_fold_vpermilvar_pd_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_pd_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
  ret <4 x double> %1
}

define <4 x float> @constant_fold_vpermilvar_ps() {
; X32-LABEL: constant_fold_vpermilvar_ps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
  ret <4 x float> %1
}

define <8 x float> @constant_fold_vpermilvar_ps_256() {
; X32-LABEL: constant_fold_vpermilvar_ps_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_ps_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
  ret <8 x float> %1
}