1 #include <arm_neon.h> 2 3 namespace math { 4 namespace internal { 5 #define _IOS_SHUFFLE_1032(vec) vrev64q_f32(vec) 6 #define _IOS_SHUFFLE_2301(vec) vcombine_f32(vget_high_f32(vec), vget_low_f32(vec)) 7 inline float32x4_t dot4VecResult(const float32x4_t& vec1, const float32x4_t& vec2) { 8 float32x4_t result = vmulq_f32(vec1, vec2); 9 result = vaddq_f32(result, _IOS_SHUFFLE_1032(result)); 10 result = vaddq_f32(result, _IOS_SHUFFLE_2301(result)); 11 return result; 12 } 13 14 inline float32x4_t fastRSqrt(const float32x4_t& vec) { 15 float32x4_t result; 16 result = vrsqrteq_f32(vec); 17 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(result, result), vec), result); 18 return result; 19 } 20 21 } 22 typedef float32x4_t Vector3; 23 24 inline Vector3 normalize(const Vector3& v1) { 25 float32x4_t dot; 26 dot = vsetq_lane_f32(0.0f, v1, 3); 27 dot = internal::dot4VecResult(dot, dot); 28 29 if (vgetq_lane_f32(dot, 0) == 0.0f) { 30 return v1; 31 } else { 32 Vector3 result; 33 result = vmulq_f32(v1, internal::fastRSqrt(dot)); 34 return result; 35 } 36 } 37 38 inline Vector3 cross(const Vector3& v1, const Vector3& v2) { 39 float32x4x2_t v_1203 = vzipq_f32(vcombine_f32(vrev64_f32(vget_low_f32(v1)), vrev64_f32(vget_low_f32(v2))), vcombine_f32(vget_high_f32(v1), vget_high_f32(v2))); 40 float32x4x2_t v_2013 = vzipq_f32(vcombine_f32(vrev64_f32(vget_low_f32(v_1203.val[0])), vrev64_f32(vget_low_f32(v_1203.val[1]))), vcombine_f32(vget_high_f32(v_1203.val[0]), vget_high_f32(v_1203.val[1]))); 41 42 Vector3 result; 43 result = vmlsq_f32(vmulq_f32(v_1203.val[0], v_2013.val[1]), v_1203.val[1], v_2013.val[0]); 44 return result; 45 } 46 } 47 48 void _f_with_internal_compiler_error_in_reload_cse_simplify_operands(const math::Vector3& v1, const math::Vector3& v2) { 49 math::normalize(math::cross(v1, v2)); 50 } 51