1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud (at) inria.fr> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_COMPLEX_NEON_H 11 #define EIGEN_COMPLEX_NEON_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000); 18 static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000); 19 20 //---------- float ---------- 21 struct Packet2cf 22 { 23 EIGEN_STRONG_INLINE Packet2cf() {} 24 EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} 25 Packet4f v; 26 }; 27 28 template<> struct packet_traits<std::complex<float> > : default_packet_traits 29 { 30 typedef Packet2cf type; 31 enum { 32 Vectorizable = 1, 33 AlignedOnScalar = 1, 34 size = 2, 35 36 HasAdd = 1, 37 HasSub = 1, 38 HasMul = 1, 39 HasDiv = 1, 40 HasNegate = 1, 41 HasAbs = 0, 42 HasAbs2 = 0, 43 HasMin = 0, 44 HasMax = 0, 45 HasSetLinear = 0 46 }; 47 }; 48 49 template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; 50 51 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) 52 { 53 float32x2_t r64; 54 r64 = vld1_f32((float *)&from); 55 56 return Packet2cf(vcombine_f32(r64, r64)); 57 } 58 59 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); } 60 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); } 61 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); } 62 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) 63 { 64 Packet4ui b = vreinterpretq_u32_f32(a.v); 65 return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR))); 66 } 67 68 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) 69 { 70 Packet4f v1, v2; 71 float32x2_t a_lo, a_hi; 72 73 // Get the real values of a | a1_re | a1_re | a2_re | a2_re | 74 v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0)); 75 // Get the real values of a | a1_im | a1_im | a2_im | a2_im | 76 v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1)); 77 // Multiply the real a with b 78 v1 = vmulq_f32(v1, b.v); 79 // Multiply the imag a with b 80 v2 = vmulq_f32(v2, b.v); 81 // Conjugate v2 82 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR)); 83 // Swap real/imag elements in v2. 84 a_lo = vrev64_f32(vget_low_f32(v2)); 85 a_hi = vrev64_f32(vget_high_f32(v2)); 86 v2 = vcombine_f32(a_lo, a_hi); 87 // Add and return the result 88 return Packet2cf(vaddq_f32(v1, v2)); 89 } 90 91 template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) 92 { 93 return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); 94 } 95 template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) 96 { 97 return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); 98 } 99 template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) 100 { 101 return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); 102 } 103 template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) 104 { 105 return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); 106 } 107 108 template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); } 109 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); } 110 111 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } 112 113 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } 114 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } 115 116 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { __pld((float *)addr); } 117 118 template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) 119 { 120 std::complex<float> EIGEN_ALIGN16 x[2]; 121 vst1q_f32((float *)x, a.v); 122 return x[0]; 123 } 124 125 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) 126 { 127 float32x2_t a_lo, a_hi; 128 Packet4f a_r128; 129 130 a_lo = vget_low_f32(a.v); 131 a_hi = vget_high_f32(a.v); 132 a_r128 = vcombine_f32(a_hi, a_lo); 133 134 return Packet2cf(a_r128); 135 } 136 137 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) 138 { 139 return Packet2cf(vrev64q_f32(a.v)); 140 } 141 142 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) 143 { 144 float32x2_t a1, a2; 145 std::complex<float> s; 146 147 a1 = vget_low_f32(a.v); 148 a2 = vget_high_f32(a.v); 149 a2 = vadd_f32(a1, a2); 150 vst1_f32((float *)&s, a2); 151 152 return s; 153 } 154 155 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) 156 { 157 Packet4f sum1, sum2, sum; 158 159 // Add the first two 64-bit float32x2_t of vecs[0] 160 sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); 161 sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); 162 sum = vaddq_f32(sum1, sum2); 163 164 return Packet2cf(sum); 165 } 166 167 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) 168 { 169 float32x2_t a1, a2, v1, v2, prod; 170 std::complex<float> s; 171 172 a1 = vget_low_f32(a.v); 173 a2 = vget_high_f32(a.v); 174 // Get the real values of a | a1_re | a1_re | a2_re | a2_re | 175 v1 = vdup_lane_f32(a1, 0); 176 // Get the real values of a | a1_im | a1_im | a2_im | a2_im | 177 v2 = vdup_lane_f32(a1, 1); 178 // Multiply the real a with b 179 v1 = vmul_f32(v1, a2); 180 // Multiply the imag a with b 181 v2 = vmul_f32(v2, a2); 182 // Conjugate v2 183 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR)); 184 // Swap real/imag elements in v2. 185 v2 = vrev64_f32(v2); 186 // Add v1, v2 187 prod = vadd_f32(v1, v2); 188 189 vst1_f32((float *)&s, prod); 190 191 return s; 192 } 193 194 template<int Offset> 195 struct palign_impl<Offset,Packet2cf> 196 { 197 EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) 198 { 199 if (Offset==1) 200 { 201 first.v = vextq_f32(first.v, second.v, 2); 202 } 203 } 204 }; 205 206 template<> struct conj_helper<Packet2cf, Packet2cf, false,true> 207 { 208 EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const 209 { return padd(pmul(x,y),c); } 210 211 EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const 212 { 213 return internal::pmul(a, pconj(b)); 214 } 215 }; 216 217 template<> struct conj_helper<Packet2cf, Packet2cf, true,false> 218 { 219 EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const 220 { return padd(pmul(x,y),c); } 221 222 EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const 223 { 224 return internal::pmul(pconj(a), b); 225 } 226 }; 227 228 template<> struct conj_helper<Packet2cf, Packet2cf, true,true> 229 { 230 EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const 231 { return padd(pmul(x,y),c); } 232 233 EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const 234 { 235 return pconj(internal::pmul(a, b)); 236 } 237 }; 238 239 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) 240 { 241 // TODO optimize it for AltiVec 242 Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); 243 Packet4f s, rev_s; 244 float32x2_t a_lo, a_hi; 245 246 // this computes the norm 247 s = vmulq_f32(b.v, b.v); 248 a_lo = vrev64_f32(vget_low_f32(s)); 249 a_hi = vrev64_f32(vget_high_f32(s)); 250 rev_s = vcombine_f32(a_lo, a_hi); 251 252 return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); 253 } 254 255 } // end namespace internal 256 257 } // end namespace Eigen 258 259 #endif // EIGEN_COMPLEX_NEON_H 260