; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
;
; Combine tests involving AVX target shuffles

declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)

declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)

define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %2
}

define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %1
}
define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%a0
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
  ret <4 x float> %2
}

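; Masks containing undef elements can still match fixed shuffle patterns:
; <u,1,3,3> combines to vmovshdup (duplicate the odd lanes) below.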
define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movshdup:
; X32:       # %bb.0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movshdup:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movsldup:
; X32:       # %bb.0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_movsldup:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckh:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckh:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
  ret <4 x float> %1
}

define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckl:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckl:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
  ret <4 x float> %1
}

define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
  ret <8 x float> %2
}

define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_10326u4u:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_10326u4u:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
  ret <8 x float> %2
}

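; vpermilvar + cross-lane shuffle chains: AVX1 has no single lane-crossing ymm
; permute, so the 128-bit halves are swapped with vperm2f128, while AVX2 and
; AVX512 fold the whole chain into one vpermpd.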
define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
; X32-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX1:       # %bb.0:
; X32-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX2:       # %bb.0:
; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; X64-AVX512-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; X32:       # %bb.0:
; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %3
}

define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
; X32-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT:    vmovapd %xmm0, %xmm0
; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT:    vmovapd %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %3
}

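; 256-bit movddup: vpermilvar.ps only uses the low 2 bits of each control dword
; within its own 128-bit lane, so <0,1,0,1,4,5,4,5> pairs the floats into the
; [0,0,2,2] double pattern.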
define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %1
}
define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X64-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%a0
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
  ret <8 x float> %2
}

define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movshdup:
; X32:       # %bb.0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movshdup:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
  ret <8 x float> %1
}

define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movsldup:
; X32:       # %bb.0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_movsldup:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
  ret <8 x float> %1
}

define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_2f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %2
}

define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_2f64_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
  ret <2 x double> %1
}

define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
  ret <4 x double> %2
}

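; vpermilvar.pd reads only bit 1 of each control qword, so <0,0,4,4> (bit 1
; clear in every element) still selects the low double of each lane and
; combines to vmovddup.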
define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f64_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
  ret <4 x double> %1
}

define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_4stage:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_4stage:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X64-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %4
}

define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_4stage:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_8f32_4stage:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X64-NEXT:    retq
  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
  %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
  %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %4
}

define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_as_insertps:
; X32:       # %bb.0:
; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_4f32_as_insertps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
  ret <4 x float> %2
}

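; Constant folding: with all-constant operands the permutes are evaluated at
; compile time, leaving only a load of the shuffled constant.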
define <2 x double> @constant_fold_vpermilvar_pd() {
; X32-LABEL: constant_fold_vpermilvar_pd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %1
}

define <4 x double> @constant_fold_vpermilvar_pd_256() {
; X32-LABEL: constant_fold_vpermilvar_pd_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_pd_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
  ret <4 x double> %1
}

define <4 x float> @constant_fold_vpermilvar_ps() {
; X32-LABEL: constant_fold_vpermilvar_ps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
  ret <4 x float> %1
}

define <8 x float> @constant_fold_vpermilvar_ps_256() {
; X32-LABEL: constant_fold_vpermilvar_ps_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_vpermilvar_ps_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
  ret <8 x float> %1
}