; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=FMA

; The four llc invocations above cover SSE4.2, AVX, AVX2 with FMA, and
; AVX512F with AVX512VL. The last two invocations are checked against the same
; set of assertions (the ones under the 'FMA' prefix).
;
; Every function below is scalar fast-math IR operating on the two lanes of a
; 2-element vector. The assertions pin the exact scalar instruction sequence
; llc emits for each configuration; regenerate them with the script named in
; the NOTE line rather than editing them by hand.

; PR31866
; complex float complex_square_f32(complex float x) {
;   return x*x;
; }

; Squares a value held in a <2 x float>. Writing a = element 0 and
; b = element 1 of the argument, the result holds a*a - b*b in element 0 and
; (b*2)*a in element 1, i.e. the real and imaginary parts of (a + b*i)^2 when
; element 0 is taken as the real part, matching the C snippet above.
define <2 x float> @complex_square_f32(<2 x float>) #0 {
; SSE-LABEL: complex_square_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    addss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm0, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_square_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_square_f32:
; FMA:       # %bb.0:
; FMA-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; FMA-NEXT:    vaddss %xmm0, %xmm0, %xmm2
; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; FMA-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; FMA-NEXT:    vfmsub231ss %xmm0, %xmm0, %xmm1
; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[2,3]
; FMA-NEXT:    retq
  %2 = extractelement <2 x float> %0, i32 0 ; a
  %3 = extractelement <2 x float> %0, i32 1 ; b
  %4 = fmul fast float %3, 2.000000e+00 ; b*2
  %5 = fmul fast float %4, %2 ; (b*2)*a -> result element 1
  %6 = fmul fast float %2, %2 ; a*a
  %7 = fmul fast float %3, %3 ; b*b
  %8 = fsub fast float %6, %7 ; a*a - b*b -> result element 0
  %9 = insertelement <2 x float> undef, float %8, i32 0
  %10 = insertelement <2 x float> %9, float %5, i32 1
  ret <2 x float> %10
}

; Same computation as complex_square_f32, on a <2 x double>: element 0 of the
; result is a*a - b*b and element 1 is (b*2)*a, with a = element 0 and
; b = element 1 of the argument.
define <2 x double> @complex_square_f64(<2 x double>) #0 {
; SSE-LABEL: complex_square_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    addsd %xmm0, %xmm2
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm0, %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm1
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_square_f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vmulsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_square_f64:
; FMA:       # %bb.0:
; FMA-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; FMA-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
; FMA-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
; FMA-NEXT:    vfmsub231sd %xmm0, %xmm0, %xmm1
; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm2[0]
; FMA-NEXT:    retq
  %2 = extractelement <2 x double> %0, i32 0 ; a
  %3 = extractelement <2 x double> %0, i32 1 ; b
  %4 = fmul fast double %3, 2.000000e+00 ; b*2
  %5 = fmul fast double %4, %2 ; (b*2)*a -> result element 1
  %6 = fmul fast double %2, %2 ; a*a
  %7 = fmul fast double %3, %3 ; b*b
  %8 = fsub fast double %6, %7 ; a*a - b*b -> result element 0
  %9 = insertelement <2 x double> undef, double %8, i32 0
  %10 = insertelement <2 x double> %9, double %5, i32 1
  ret <2 x double> %10
}

; complex float complex_mul_f32(complex float x, complex float y) {
;   return x*y;
; }

; Multiplies two values held in <2 x float> vectors. Writing a, b for
; elements 0, 1 of the first argument and c, d for elements 0, 1 of the second,
; the result holds c*a - d*b in element 0 and d*a + c*b in element 1, i.e. the
; real and imaginary parts of (a + b*i)*(c + d*i) when element 0 is taken as
; the real part, matching the C snippet above.
define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
; SSE-LABEL: complex_mul_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    mulss %xmm0, %xmm4
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm4, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_mul_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vmulss %xmm0, %xmm3, %xmm4
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm5
; AVX1-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmulss %xmm2, %xmm3, %xmm1
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_mul_f32:
; FMA:       # %bb.0:
; FMA-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; FMA-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm4
; FMA-NEXT:    vfmadd231ss %xmm0, %xmm3, %xmm4
; FMA-NEXT:    vmulss %xmm2, %xmm3, %xmm2
; FMA-NEXT:    vfmsub231ss %xmm0, %xmm1, %xmm2
; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[2,3]
; FMA-NEXT:    retq
  %3 = extractelement <2 x float> %0, i32 0 ; a
  %4 = extractelement <2 x float> %0, i32 1 ; b
  %5 = extractelement <2 x float> %1, i32 0 ; c
  %6 = extractelement <2 x float> %1, i32 1 ; d
  %7 = fmul fast float %6, %3 ; d*a
  %8 = fmul fast float %5, %4 ; c*b
  %9 = fadd fast float %7, %8 ; d*a + c*b -> result element 1
  %10 = fmul fast float %5, %3 ; c*a
  %11 = fmul fast float %6, %4 ; d*b
  %12 = fsub fast float %10, %11 ; c*a - d*b -> result element 0
  %13 = insertelement <2 x float> undef, float %12, i32 0
  %14 = insertelement <2 x float> %13, float %9, i32 1
  ret <2 x float> %14
}

; Same computation as complex_mul_f32, on <2 x double> operands: element 0 of
; the result is c*a - d*b and element 1 is d*a + c*b, with a, b taken from the
; first argument and c, d from the second.
define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
; SSE-LABEL: complex_mul_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    mulsd %xmm0, %xmm4
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    addsd %xmm4, %xmm1
; SSE-NEXT:    mulsd %xmm2, %xmm3
; SSE-NEXT:    subsd %xmm3, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_mul_f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX1-NEXT:    vmulsd %xmm0, %xmm3, %xmm4
; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm5
; AVX1-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmulsd %xmm2, %xmm3, %xmm1
; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_mul_f64:
; FMA:       # %bb.0:
; FMA-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; FMA-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm4
; FMA-NEXT:    vfmadd231sd %xmm0, %xmm3, %xmm4
; FMA-NEXT:    vmulsd %xmm2, %xmm3, %xmm2
; FMA-NEXT:    vfmsub231sd %xmm0, %xmm1, %xmm2
; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm4[0]
; FMA-NEXT:    retq
  %3 = extractelement <2 x double> %0, i32 0 ; a
  %4 = extractelement <2 x double> %0, i32 1 ; b
  %5 = extractelement <2 x double> %1, i32 0 ; c
  %6 = extractelement <2 x double> %1, i32 1 ; d
  %7 = fmul fast double %6, %3 ; d*a
  %8 = fmul fast double %5, %4 ; c*b
  %9 = fadd fast double %7, %8 ; d*a + c*b -> result element 1
  %10 = fmul fast double %5, %3 ; c*a
  %11 = fmul fast double %6, %4 ; d*b
  %12 = fsub fast double %10, %11 ; c*a - d*b -> result element 0
  %13 = insertelement <2 x double> undef, double %12, i32 0
  %14 = insertelement <2 x double> %13, double %9, i32 1
  ret <2 x double> %14
}

; Attribute group shared by all four functions: nounwind plus string
; attributes that turn on the relaxed floating-point modes (no infs, no NaNs,
; no signed zeros, no trapping math, unsafe fp math).
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" }