Home | History | Annotate | Download | only in NEON
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // This Source Code Form is subject to the terms of the Mozilla
      5 // Public License v. 2.0. If a copy of the MPL was not distributed
      6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      7 
      8 /* The sin, cos, exp, and log functions of this file come from
      9  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
     10  */
     11 
     12 #ifndef EIGEN_MATH_FUNCTIONS_NEON_H
     13 #define EIGEN_MATH_FUNCTIONS_NEON_H
     14 
     15 namespace Eigen {
     16 
     17 namespace internal {
     18 
     19 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
     20 Packet4f pexp<Packet4f>(const Packet4f& _x)
     21 {
     22   Packet4f x = _x;
     23   Packet4f tmp, fx;
     24 
     25   _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
     26   _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
     27   _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
     28   _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
     29   _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
     30   _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
     31   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
     32   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
     33   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
     34   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
     35   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
     36   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
     37   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
     38   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
     39 
     40   x = vminq_f32(x, p4f_exp_hi);
     41   x = vmaxq_f32(x, p4f_exp_lo);
     42 
     43   /* express exp(x) as exp(g + n*log(2)) */
     44   fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
     45 
     46   /* perform a floorf */
     47   tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
     48 
     49   /* if greater, substract 1 */
     50   Packet4ui mask = vcgtq_f32(tmp, fx);
     51   mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
     52 
     53   fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
     54 
     55   tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
     56   Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
     57   x = vsubq_f32(x, tmp);
     58   x = vsubq_f32(x, z);
     59 
     60   Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
     61   z = vmulq_f32(x, x);
     62   y = vaddq_f32(y, p4f_cephes_exp_p1);
     63   y = vmulq_f32(y, x);
     64   y = vaddq_f32(y, p4f_cephes_exp_p2);
     65   y = vmulq_f32(y, x);
     66   y = vaddq_f32(y, p4f_cephes_exp_p3);
     67   y = vmulq_f32(y, x);
     68   y = vaddq_f32(y, p4f_cephes_exp_p4);
     69   y = vmulq_f32(y, x);
     70   y = vaddq_f32(y, p4f_cephes_exp_p5);
     71 
     72   y = vmulq_f32(y, z);
     73   y = vaddq_f32(y, x);
     74   y = vaddq_f32(y, p4f_1);
     75 
     76   /* build 2^n */
     77   int32x4_t mm;
     78   mm = vcvtq_s32_f32(fx);
     79   mm = vaddq_s32(mm, p4i_0x7f);
     80   mm = vshlq_n_s32(mm, 23);
     81   Packet4f pow2n = vreinterpretq_f32_s32(mm);
     82 
     83   y = vmulq_f32(y, pow2n);
     84   return y;
     85 }
     86 
     87 } // end namespace internal
     88 
     89 } // end namespace Eigen
     90 
     91 #endif // EIGEN_MATH_FUNCTIONS_NEON_H
     92