Home | History | Annotate | Download | only in LinearMath
      1 /*
      2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
      3 
      4 This software is provided 'as-is', without any express or implied warranty.
      5 In no event will the authors be held liable for any damages arising from the use of this software.
      6 Permission is granted to anyone to use this software for any purpose,
      7 including commercial applications, and to alter it and redistribute it freely,
      8 subject to the following restrictions:
      9 
     10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
     11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
     12 3. This notice may not be removed or altered from any source distribution.
     13 */
     14 
     15 
     16 
     17 #ifndef BT_SIMD__QUATERNION_H_
     18 #define BT_SIMD__QUATERNION_H_
     19 
     20 
     21 #include "btVector3.h"
     22 #include "btQuadWord.h"
     23 
     24 
     25 #ifdef BT_USE_DOUBLE_PRECISION
     26 #define btQuaternionData btQuaternionDoubleData
     27 #define btQuaternionDataName "btQuaternionDoubleData"
     28 #else
     29 #define btQuaternionData btQuaternionFloatData
     30 #define btQuaternionDataName "btQuaternionFloatData"
     31 #endif //BT_USE_DOUBLE_PRECISION
     32 
     33 
     34 
     35 #ifdef BT_USE_SSE
     36 
     37 //const __m128 ATTRIBUTE_ALIGNED16(vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
     38 #define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))
     39 
     40 #endif
     41 
     42 #if defined(BT_USE_SSE)
     43 
     44 #define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
     45 #define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
     46 
     47 #elif defined(BT_USE_NEON)
     48 
     49 const btSimdFloat4 ATTRIBUTE_ALIGNED16(vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};
     50 const btSimdFloat4 ATTRIBUTE_ALIGNED16(vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};
     51 
     52 #endif
     53 
     54 /**@brief The btQuaternion implements quaternion to perform linear algebra rotations in combination with btMatrix3x3, btVector3 and btTransform. */
     55 class btQuaternion : public btQuadWord {
     56 public:
     57   /**@brief No initialization constructor */
     58 	btQuaternion() {}
     59 
     60 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))|| defined(BT_USE_NEON)
     61 	// Set Vector
     62 	SIMD_FORCE_INLINE btQuaternion(const btSimdFloat4 vec)
     63 	{
     64 		mVec128 = vec;
     65 	}
     66 
     67 	// Copy constructor
     68 	SIMD_FORCE_INLINE btQuaternion(const btQuaternion& rhs)
     69 	{
     70 		mVec128 = rhs.mVec128;
     71 	}
     72 
     73 	// Assignment Operator
     74 	SIMD_FORCE_INLINE btQuaternion&
     75 	operator=(const btQuaternion& v)
     76 	{
     77 		mVec128 = v.mVec128;
     78 
     79 		return *this;
     80 	}
     81 
     82 #endif
     83 
     84 	//		template <typename btScalar>
     85 	//		explicit Quaternion(const btScalar *v) : Tuple4<btScalar>(v) {}
     86   /**@brief Constructor from scalars */
     87 	btQuaternion(const btScalar& _x, const btScalar& _y, const btScalar& _z, const btScalar& _w)
     88 		: btQuadWord(_x, _y, _z, _w)
     89 	{}
     90   /**@brief Axis angle Constructor
     91    * @param axis The axis which the rotation is around
     92    * @param angle The magnitude of the rotation around the angle (Radians) */
     93 	btQuaternion(const btVector3& _axis, const btScalar& _angle)
     94 	{
     95 		setRotation(_axis, _angle);
     96 	}
     97   /**@brief Constructor from Euler angles
     98    * @param yaw Angle around Y unless BT_EULER_DEFAULT_ZYX defined then Z
     99    * @param pitch Angle around X unless BT_EULER_DEFAULT_ZYX defined then Y
    100    * @param roll Angle around Z unless BT_EULER_DEFAULT_ZYX defined then X */
    101 	btQuaternion(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
    102 	{
    103 #ifndef BT_EULER_DEFAULT_ZYX
    104 		setEuler(yaw, pitch, roll);
    105 #else
    106 		setEulerZYX(yaw, pitch, roll);
    107 #endif
    108 	}
    109   /**@brief Set the rotation using axis angle notation
    110    * @param axis The axis around which to rotate
    111    * @param angle The magnitude of the rotation in Radians */
    112 	void setRotation(const btVector3& axis, const btScalar& _angle)
    113 	{
    114 		btScalar d = axis.length();
    115 		btAssert(d != btScalar(0.0));
    116 		btScalar s = btSin(_angle * btScalar(0.5)) / d;
    117 		setValue(axis.x() * s, axis.y() * s, axis.z() * s,
    118 			btCos(_angle * btScalar(0.5)));
    119 	}
    120   /**@brief Set the quaternion using Euler angles
    121    * @param yaw Angle around Y
    122    * @param pitch Angle around X
    123    * @param roll Angle around Z */
    124 	void setEuler(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
    125 	{
    126 		btScalar halfYaw = btScalar(yaw) * btScalar(0.5);
    127 		btScalar halfPitch = btScalar(pitch) * btScalar(0.5);
    128 		btScalar halfRoll = btScalar(roll) * btScalar(0.5);
    129 		btScalar cosYaw = btCos(halfYaw);
    130 		btScalar sinYaw = btSin(halfYaw);
    131 		btScalar cosPitch = btCos(halfPitch);
    132 		btScalar sinPitch = btSin(halfPitch);
    133 		btScalar cosRoll = btCos(halfRoll);
    134 		btScalar sinRoll = btSin(halfRoll);
    135 		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
    136 			cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
    137 			sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
    138 			cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
    139 	}
    140   /**@brief Set the quaternion using euler angles
    141    * @param yaw Angle around Z
    142    * @param pitch Angle around Y
    143    * @param roll Angle around X */
    144 	void setEulerZYX(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
    145 	{
    146 		btScalar halfYaw = btScalar(yaw) * btScalar(0.5);
    147 		btScalar halfPitch = btScalar(pitch) * btScalar(0.5);
    148 		btScalar halfRoll = btScalar(roll) * btScalar(0.5);
    149 		btScalar cosYaw = btCos(halfYaw);
    150 		btScalar sinYaw = btSin(halfYaw);
    151 		btScalar cosPitch = btCos(halfPitch);
    152 		btScalar sinPitch = btSin(halfPitch);
    153 		btScalar cosRoll = btCos(halfRoll);
    154 		btScalar sinRoll = btSin(halfRoll);
    155 		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw, //x
    156                          cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw, //y
    157                          cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw, //z
    158                          cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw); //formerly yzx
    159 	}
    160   /**@brief Add two quaternions
    161    * @param q The quaternion to add to this one */
    162 	SIMD_FORCE_INLINE	btQuaternion& operator+=(const btQuaternion& q)
    163 	{
    164 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    165 		mVec128 = _mm_add_ps(mVec128, q.mVec128);
    166 #elif defined(BT_USE_NEON)
    167 		mVec128 = vaddq_f32(mVec128, q.mVec128);
    168 #else
    169 		m_floats[0] += q.x();
    170         m_floats[1] += q.y();
    171         m_floats[2] += q.z();
    172         m_floats[3] += q.m_floats[3];
    173 #endif
    174 		return *this;
    175 	}
    176 
    177   /**@brief Subtract out a quaternion
    178    * @param q The quaternion to subtract from this one */
    179 	btQuaternion& operator-=(const btQuaternion& q)
    180 	{
    181 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    182 		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
    183 #elif defined(BT_USE_NEON)
    184 		mVec128 = vsubq_f32(mVec128, q.mVec128);
    185 #else
    186 		m_floats[0] -= q.x();
    187         m_floats[1] -= q.y();
    188         m_floats[2] -= q.z();
    189         m_floats[3] -= q.m_floats[3];
    190 #endif
    191         return *this;
    192 	}
    193 
    194   /**@brief Scale this quaternion
    195    * @param s The scalar to scale by */
    196 	btQuaternion& operator*=(const btScalar& s)
    197 	{
    198 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    199 		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
    200 		vs = bt_pshufd_ps(vs, 0);	//	(S S S S)
    201 		mVec128 = _mm_mul_ps(mVec128, vs);
    202 #elif defined(BT_USE_NEON)
    203 		mVec128 = vmulq_n_f32(mVec128, s);
    204 #else
    205 		m_floats[0] *= s;
    206         m_floats[1] *= s;
    207         m_floats[2] *= s;
    208         m_floats[3] *= s;
    209 #endif
    210 		return *this;
    211 	}
    212 
    213   /**@brief Multiply this quaternion by q on the right
    214    * @param q The other quaternion
    215    * Equivilant to this = this * q */
    216 	btQuaternion& operator*=(const btQuaternion& q)
    217 	{
    218 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    219 		__m128 vQ2 = q.get128();
    220 
    221 		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
    222 		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
    223 
    224 		A1 = A1 * B1;
    225 
    226 		__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));
    227 		__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
    228 
    229 		A2 = A2 * B2;
    230 
    231 		B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));
    232 		B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
    233 
    234 		B1 = B1 * B2;	//	A3 *= B3
    235 
    236 		mVec128 = bt_splat_ps(mVec128, 3);	//	A0
    237 		mVec128 = mVec128 * vQ2;	//	A0 * B0
    238 
    239 		A1 = A1 + A2;	//	AB12
    240 		mVec128 = mVec128 - B1;	//	AB03 = AB0 - AB3
    241 		A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
    242 		mVec128 = mVec128+ A1;	//	AB03 + AB12
    243 
    244 #elif defined(BT_USE_NEON)
    245 
    246         float32x4_t vQ1 = mVec128;
    247         float32x4_t vQ2 = q.get128();
    248         float32x4_t A0, A1, B1, A2, B2, A3, B3;
    249         float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
    250 
    251         {
    252         float32x2x2_t tmp;
    253         tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
    254         vQ1zx = tmp.val[0];
    255 
    256         tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
    257         vQ2zx = tmp.val[0];
    258         }
    259         vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
    260 
    261         vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
    262 
    263         vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
    264         vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
    265 
    266         A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x
    267         B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X
    268 
    269         A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
    270         B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
    271 
    272         A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
    273         B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
    274 
    275         A1 = vmulq_f32(A1, B1);
    276         A2 = vmulq_f32(A2, B2);
    277         A3 = vmulq_f32(A3, B3);	//	A3 *= B3
    278         A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
    279 
    280         A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
    281         A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3
    282 
    283         //	change the sign of the last element
    284         A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
    285         A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
    286 
    287         mVec128 = A0;
    288 #else
    289 		setValue(
    290             m_floats[3] * q.x() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.z() - m_floats[2] * q.y(),
    291 			m_floats[3] * q.y() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.x() - m_floats[0] * q.z(),
    292 			m_floats[3] * q.z() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.y() - m_floats[1] * q.x(),
    293 			m_floats[3] * q.m_floats[3] - m_floats[0] * q.x() - m_floats[1] * q.y() - m_floats[2] * q.z());
    294 #endif
    295 		return *this;
    296 	}
    297   /**@brief Return the dot product between this quaternion and another
    298    * @param q The other quaternion */
    299 	btScalar dot(const btQuaternion& q) const
    300 	{
    301 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    302 		__m128	vd;
    303 
    304 		vd = _mm_mul_ps(mVec128, q.mVec128);
    305 
    306         __m128 t = _mm_movehl_ps(vd, vd);
    307 		vd = _mm_add_ps(vd, t);
    308 		t = _mm_shuffle_ps(vd, vd, 0x55);
    309 		vd = _mm_add_ss(vd, t);
    310 
    311         return _mm_cvtss_f32(vd);
    312 #elif defined(BT_USE_NEON)
    313 		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
    314 		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
    315 		x = vpadd_f32(x, x);
    316 		return vget_lane_f32(x, 0);
    317 #else
    318 		return  m_floats[0] * q.x() +
    319                 m_floats[1] * q.y() +
    320                 m_floats[2] * q.z() +
    321                 m_floats[3] * q.m_floats[3];
    322 #endif
    323 	}
    324 
    325   /**@brief Return the length squared of the quaternion */
    326 	btScalar length2() const
    327 	{
    328 		return dot(*this);
    329 	}
    330 
    331   /**@brief Return the length of the quaternion */
    332 	btScalar length() const
    333 	{
    334 		return btSqrt(length2());
    335 	}
    336 
    337   /**@brief Normalize the quaternion
    338    * Such that x^2 + y^2 + z^2 +w^2 = 1 */
    339 	btQuaternion& normalize()
    340 	{
    341 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    342 		__m128	vd;
    343 
    344 		vd = _mm_mul_ps(mVec128, mVec128);
    345 
    346         __m128 t = _mm_movehl_ps(vd, vd);
    347 		vd = _mm_add_ps(vd, t);
    348 		t = _mm_shuffle_ps(vd, vd, 0x55);
    349 		vd = _mm_add_ss(vd, t);
    350 
    351 		vd = _mm_sqrt_ss(vd);
    352 		vd = _mm_div_ss(vOnes, vd);
    353         vd = bt_pshufd_ps(vd, 0); // splat
    354 		mVec128 = _mm_mul_ps(mVec128, vd);
    355 
    356 		return *this;
    357 #else
    358 		return *this /= length();
    359 #endif
    360 	}
    361 
    362   /**@brief Return a scaled version of this quaternion
    363    * @param s The scale factor */
    364 	SIMD_FORCE_INLINE btQuaternion
    365 	operator*(const btScalar& s) const
    366 	{
    367 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    368 		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
    369 		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
    370 
    371 		return btQuaternion(_mm_mul_ps(mVec128, vs));
    372 #elif defined(BT_USE_NEON)
    373 		return btQuaternion(vmulq_n_f32(mVec128, s));
    374 #else
    375 		return btQuaternion(x() * s, y() * s, z() * s, m_floats[3] * s);
    376 #endif
    377 	}
    378 
    379   /**@brief Return an inversely scaled versionof this quaternion
    380    * @param s The inverse scale factor */
    381 	btQuaternion operator/(const btScalar& s) const
    382 	{
    383 		btAssert(s != btScalar(0.0));
    384 		return *this * (btScalar(1.0) / s);
    385 	}
    386 
    387   /**@brief Inversely scale this quaternion
    388    * @param s The scale factor */
    389 	btQuaternion& operator/=(const btScalar& s)
    390 	{
    391 		btAssert(s != btScalar(0.0));
    392 		return *this *= btScalar(1.0) / s;
    393 	}
    394 
    395   /**@brief Return a normalized version of this quaternion */
    396 	btQuaternion normalized() const
    397 	{
    398 		return *this / length();
    399 	}
    400 	/**@brief Return the ***half*** angle between this quaternion and the other
    401    * @param q The other quaternion */
    402 	btScalar angle(const btQuaternion& q) const
    403 	{
    404 		btScalar s = btSqrt(length2() * q.length2());
    405 		btAssert(s != btScalar(0.0));
    406 		return btAcos(dot(q) / s);
    407 	}
    408 
    409 	/**@brief Return the angle between this quaternion and the other along the shortest path
    410 	* @param q The other quaternion */
    411 	btScalar angleShortestPath(const btQuaternion& q) const
    412 	{
    413 		btScalar s = btSqrt(length2() * q.length2());
    414 		btAssert(s != btScalar(0.0));
    415 		if (dot(q) < 0) // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
    416 			return btAcos(dot(-q) / s) * btScalar(2.0);
    417 		else
    418 			return btAcos(dot(q) / s) * btScalar(2.0);
    419 	}
    420 
    421   /**@brief Return the angle of rotation represented by this quaternion */
    422 	btScalar getAngle() const
    423 	{
    424 		btScalar s = btScalar(2.) * btAcos(m_floats[3]);
    425 		return s;
    426 	}
    427 
    428 	/**@brief Return the angle of rotation represented by this quaternion along the shortest path*/
    429 	btScalar getAngleShortestPath() const
    430 	{
    431 		btScalar s;
    432 		if (dot(*this) < 0)
    433 			s = btScalar(2.) * btAcos(m_floats[3]);
    434 		else
    435 			s = btScalar(2.) * btAcos(-m_floats[3]);
    436 
    437 		return s;
    438 	}
    439 
    440 
    441 	/**@brief Return the axis of the rotation represented by this quaternion */
    442 	btVector3 getAxis() const
    443 	{
    444 		btScalar s_squared = 1.f-m_floats[3]*m_floats[3];
    445 
    446 		if (s_squared < btScalar(10.) * SIMD_EPSILON) //Check for divide by zero
    447 			return btVector3(1.0, 0.0, 0.0);  // Arbitrary
    448 		btScalar s = 1.f/btSqrt(s_squared);
    449 		return btVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
    450 	}
    451 
    452 	/**@brief Return the inverse of this quaternion */
    453 	btQuaternion inverse() const
    454 	{
    455 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    456 		return btQuaternion(_mm_xor_ps(mVec128, vQInv));
    457 #elif defined(BT_USE_NEON)
    458         return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
    459 #else
    460 		return btQuaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
    461 #endif
    462 	}
    463 
    464   /**@brief Return the sum of this quaternion and the other
    465    * @param q2 The other quaternion */
    466 	SIMD_FORCE_INLINE btQuaternion
    467 	operator+(const btQuaternion& q2) const
    468 	{
    469 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    470 		return btQuaternion(_mm_add_ps(mVec128, q2.mVec128));
    471 #elif defined(BT_USE_NEON)
    472         return btQuaternion(vaddq_f32(mVec128, q2.mVec128));
    473 #else
    474 		const btQuaternion& q1 = *this;
    475 		return btQuaternion(q1.x() + q2.x(), q1.y() + q2.y(), q1.z() + q2.z(), q1.m_floats[3] + q2.m_floats[3]);
    476 #endif
    477 	}
    478 
    479   /**@brief Return the difference between this quaternion and the other
    480    * @param q2 The other quaternion */
    481 	SIMD_FORCE_INLINE btQuaternion
    482 	operator-(const btQuaternion& q2) const
    483 	{
    484 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    485 		return btQuaternion(_mm_sub_ps(mVec128, q2.mVec128));
    486 #elif defined(BT_USE_NEON)
    487         return btQuaternion(vsubq_f32(mVec128, q2.mVec128));
    488 #else
    489 		const btQuaternion& q1 = *this;
    490 		return btQuaternion(q1.x() - q2.x(), q1.y() - q2.y(), q1.z() - q2.z(), q1.m_floats[3] - q2.m_floats[3]);
    491 #endif
    492 	}
    493 
    494   /**@brief Return the negative of this quaternion
    495    * This simply negates each element */
    496 	SIMD_FORCE_INLINE btQuaternion operator-() const
    497 	{
    498 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    499 		return btQuaternion(_mm_xor_ps(mVec128, btvMzeroMask));
    500 #elif defined(BT_USE_NEON)
    501 		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask) );
    502 #else
    503 		const btQuaternion& q2 = *this;
    504 		return btQuaternion( - q2.x(), - q2.y(),  - q2.z(),  - q2.m_floats[3]);
    505 #endif
    506 	}
    507   /**@todo document this and it's use */
    508 	SIMD_FORCE_INLINE btQuaternion farthest( const btQuaternion& qd) const
    509 	{
    510 		btQuaternion diff,sum;
    511 		diff = *this - qd;
    512 		sum = *this + qd;
    513 		if( diff.dot(diff) > sum.dot(sum) )
    514 			return qd;
    515 		return (-qd);
    516 	}
    517 
    518 	/**@todo document this and it's use */
    519 	SIMD_FORCE_INLINE btQuaternion nearest( const btQuaternion& qd) const
    520 	{
    521 		btQuaternion diff,sum;
    522 		diff = *this - qd;
    523 		sum = *this + qd;
    524 		if( diff.dot(diff) < sum.dot(sum) )
    525 			return qd;
    526 		return (-qd);
    527 	}
    528 
    529 
    530   /**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion
    531    * @param q The other quaternion to interpolate with
    532    * @param t The ratio between this and q to interpolate.  If t = 0 the result is this, if t=1 the result is q.
    533    * Slerp interpolates assuming constant velocity.  */
    534 	btQuaternion slerp(const btQuaternion& q, const btScalar& t) const
    535 	{
    536 	  btScalar magnitude = btSqrt(length2() * q.length2());
    537 	  btAssert(magnitude > btScalar(0));
    538 
    539     btScalar product = dot(q) / magnitude;
    540     if (btFabs(product) < btScalar(1))
    541 		{
    542       // Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
    543       const btScalar sign = (product < 0) ? btScalar(-1) : btScalar(1);
    544 
    545       const btScalar theta = btAcos(sign * product);
    546       const btScalar s1 = btSin(sign * t * theta);
    547       const btScalar d = btScalar(1.0) / btSin(theta);
    548       const btScalar s0 = btSin((btScalar(1.0) - t) * theta);
    549 
    550       return btQuaternion(
    551           (m_floats[0] * s0 + q.x() * s1) * d,
    552           (m_floats[1] * s0 + q.y() * s1) * d,
    553           (m_floats[2] * s0 + q.z() * s1) * d,
    554           (m_floats[3] * s0 + q.m_floats[3] * s1) * d);
    555 		}
    556 		else
    557 		{
    558 			return *this;
    559 		}
    560 	}
    561 
    562 	static const btQuaternion&	getIdentity()
    563 	{
    564 		static const btQuaternion identityQuat(btScalar(0.),btScalar(0.),btScalar(0.),btScalar(1.));
    565 		return identityQuat;
    566 	}
    567 
    568 	SIMD_FORCE_INLINE const btScalar& getW() const { return m_floats[3]; }
    569 
    570 	SIMD_FORCE_INLINE	void	serialize(struct	btQuaternionData& dataOut) const;
    571 
    572 	SIMD_FORCE_INLINE	void	deSerialize(const struct	btQuaternionData& dataIn);
    573 
    574 	SIMD_FORCE_INLINE	void	serializeFloat(struct	btQuaternionFloatData& dataOut) const;
    575 
    576 	SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btQuaternionFloatData& dataIn);
    577 
    578 	SIMD_FORCE_INLINE	void	serializeDouble(struct	btQuaternionDoubleData& dataOut) const;
    579 
    580 	SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btQuaternionDoubleData& dataIn);
    581 
    582 };
    583 
    584 
    585 
    586 
    587 
    588 /**@brief Return the product of two quaternions */
    589 SIMD_FORCE_INLINE btQuaternion
    590 operator*(const btQuaternion& q1, const btQuaternion& q2)
    591 {
    592 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    593 	__m128 vQ1 = q1.get128();
    594 	__m128 vQ2 = q2.get128();
    595 	__m128 A0, A1, B1, A2, B2;
    596 
    597 	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0)); // X Y  z x     //      vtrn
    598 	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0)); // W W  W X     // vdup vext
    599 
    600 	A1 = A1 * B1;
    601 
    602 	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1)); // Y Z  X Y     // vext
    603 	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1)); // z x  Y Y     // vtrn vdup
    604 
    605 	A2 = A2 * B2;
    606 
    607 	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2)); // z x Y Z      // vtrn vext
    608 	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2)); // Y Z x z      // vext vtrn
    609 
    610 	B1 = B1 * B2;	//	A3 *= B3
    611 
    612 	A0 = bt_splat_ps(vQ1, 3);	//	A0
    613 	A0 = A0 * vQ2;	//	A0 * B0
    614 
    615 	A1 = A1 + A2;	//	AB12
    616 	A0 =  A0 - B1;	//	AB03 = AB0 - AB3
    617 
    618     A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
    619 	A0 = A0 + A1;	//	AB03 + AB12
    620 
    621 	return btQuaternion(A0);
    622 
    623 #elif defined(BT_USE_NEON)
    624 
    625 	float32x4_t vQ1 = q1.get128();
    626 	float32x4_t vQ2 = q2.get128();
    627 	float32x4_t A0, A1, B1, A2, B2, A3, B3;
    628     float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
    629 
    630     {
    631     float32x2x2_t tmp;
    632     tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
    633     vQ1zx = tmp.val[0];
    634 
    635     tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
    636     vQ2zx = tmp.val[0];
    637     }
    638     vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
    639 
    640     vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
    641 
    642     vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
    643     vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
    644 
    645     A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x
    646     B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X
    647 
    648 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
    649     B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
    650 
    651     A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
    652     B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
    653 
    654 	A1 = vmulq_f32(A1, B1);
    655 	A2 = vmulq_f32(A2, B2);
    656 	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
    657 	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1); //	A0 * B0
    658 
    659 	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
    660 	A0 = vsubq_f32(A0, A3);	//	AB03 = AB0 - AB3
    661 
    662     //	change the sign of the last element
    663     A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
    664 	A0 = vaddq_f32(A0, A1);	//	AB03 + AB12
    665 
    666 	return btQuaternion(A0);
    667 
    668 #else
    669 	return btQuaternion(
    670         q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
    671 		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
    672 		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
    673 		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
    674 #endif
    675 }
    676 
    677 SIMD_FORCE_INLINE btQuaternion
    678 operator*(const btQuaternion& q, const btVector3& w)
    679 {
    680 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    681 	__m128 vQ1 = q.get128();
    682 	__m128 vQ2 = w.get128();
    683 	__m128 A1, B1, A2, B2, A3, B3;
    684 
    685 	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));
    686 	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));
    687 
    688 	A1 = A1 * B1;
    689 
    690 	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
    691 	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
    692 
    693 	A2 = A2 * B2;
    694 
    695 	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
    696 	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
    697 
    698 	A3 = A3 * B3;	//	A3 *= B3
    699 
    700 	A1 = A1 + A2;	//	AB12
    701 	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
    702     A1 = A1 - A3;	//	AB123 = AB12 - AB3
    703 
    704 	return btQuaternion(A1);
    705 
    706 #elif defined(BT_USE_NEON)
    707 
    708 	float32x4_t vQ1 = q.get128();
    709 	float32x4_t vQ2 = w.get128();
    710 	float32x4_t A1, B1, A2, B2, A3, B3;
    711     float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
    712 
    713     vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
    714     {
    715     float32x2x2_t tmp;
    716 
    717     tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
    718     vQ2zx = tmp.val[0];
    719 
    720     tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
    721     vQ1zx = tmp.val[0];
    722     }
    723 
    724     vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
    725 
    726     vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
    727     vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
    728 
    729     A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx); // W W  W X
    730     B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                    // X Y  z x
    731 
    732 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
    733     B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
    734 
    735     A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
    736     B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
    737 
    738 	A1 = vmulq_f32(A1, B1);
    739 	A2 = vmulq_f32(A2, B2);
    740 	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
    741 
    742 	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
    743 
    744     //	change the sign of the last element
    745     A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
    746 
    747     A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
    748 
    749 	return btQuaternion(A1);
    750 
    751 #else
    752 	return btQuaternion(
    753          q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
    754 		 q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
    755 		 q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
    756 		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
    757 #endif
    758 }
    759 
    760 SIMD_FORCE_INLINE btQuaternion
    761 operator*(const btVector3& w, const btQuaternion& q)
    762 {
    763 #if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    764 	__m128 vQ1 = w.get128();
    765 	__m128 vQ2 = q.get128();
    766 	__m128 A1, B1, A2, B2, A3, B3;
    767 
    768 	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));  // X Y  z x
    769 	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));  // W W  W X
    770 
    771 	A1 = A1 * B1;
    772 
    773 	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
    774 	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
    775 
    776 	A2 = A2 *B2;
    777 
    778 	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
    779 	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
    780 
    781 	A3 = A3 * B3;	//	A3 *= B3
    782 
    783 	A1 = A1 + A2;	//	AB12
    784 	A1 = _mm_xor_ps(A1, vPPPM);	//	change sign of the last element
    785 	A1 = A1 - A3;	//	AB123 = AB12 - AB3
    786 
    787 	return btQuaternion(A1);
    788 
    789 #elif defined(BT_USE_NEON)
    790 
    791 	float32x4_t vQ1 = w.get128();
    792 	float32x4_t vQ2 = q.get128();
    793 	float32x4_t  A1, B1, A2, B2, A3, B3;
    794     float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
    795 
    796     {
    797     float32x2x2_t tmp;
    798 
    799     tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );       // {z x}, {w y}
    800     vQ1zx = tmp.val[0];
    801 
    802     tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );       // {z x}, {w y}
    803     vQ2zx = tmp.val[0];
    804     }
    805     vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
    806 
    807     vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
    808 
    809     vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
    810     vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
    811 
    812     A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                    // X Y  z x
    813     B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx); // W W  W X
    814 
    815 	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
    816     B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
    817 
    818     A3 = vcombine_f32(vQ1zx, vQ1yz);        // Z X Y Z
    819     B3 = vcombine_f32(vQ2yz, vQ2xz);        // Y Z x z
    820 
    821 	A1 = vmulq_f32(A1, B1);
    822 	A2 = vmulq_f32(A2, B2);
    823 	A3 = vmulq_f32(A3, B3);	//	A3 *= B3
    824 
    825 	A1 = vaddq_f32(A1, A2);	//	AB12 = AB1 + AB2
    826 
    827     //	change the sign of the last element
    828     A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
    829 
    830     A1 = vsubq_f32(A1, A3);	//	AB123 = AB12 - AB3
    831 
    832 	return btQuaternion(A1);
    833 
    834 #else
    835 	return btQuaternion(
    836         +w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
    837 		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
    838 		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
    839 		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
    840 #endif
    841 }
    842 
    843 /**@brief Calculate the dot product between two quaternions */
    844 SIMD_FORCE_INLINE btScalar
    845 dot(const btQuaternion& q1, const btQuaternion& q2)
    846 {
    847 	return q1.dot(q2);
    848 }
    849 
    850 
    851 /**@brief Return the length of a quaternion */
    852 SIMD_FORCE_INLINE btScalar
    853 length(const btQuaternion& q)
    854 {
    855 	return q.length();
    856 }
    857 
    858 /**@brief Return the angle between two quaternions*/
    859 SIMD_FORCE_INLINE btScalar
    860 btAngle(const btQuaternion& q1, const btQuaternion& q2)
    861 {
    862 	return q1.angle(q2);
    863 }
    864 
    865 /**@brief Return the inverse of a quaternion*/
    866 SIMD_FORCE_INLINE btQuaternion
    867 inverse(const btQuaternion& q)
    868 {
    869 	return q.inverse();
    870 }
    871 
    872 /**@brief Return the result of spherical linear interpolation betwen two quaternions
    873  * @param q1 The first quaternion
    874  * @param q2 The second quaternion
    875  * @param t The ration between q1 and q2.  t = 0 return q1, t=1 returns q2
    876  * Slerp assumes constant velocity between positions. */
    877 SIMD_FORCE_INLINE btQuaternion
    878 slerp(const btQuaternion& q1, const btQuaternion& q2, const btScalar& t)
    879 {
    880 	return q1.slerp(q2, t);
    881 }
    882 
    883 SIMD_FORCE_INLINE btVector3
    884 quatRotate(const btQuaternion& rotation, const btVector3& v)
    885 {
    886 	btQuaternion q = rotation * v;
    887 	q *= rotation.inverse();
    888 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    889 	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
    890 #elif defined(BT_USE_NEON)
    891     return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
    892 #else
    893 	return btVector3(q.getX(),q.getY(),q.getZ());
    894 #endif
    895 }
    896 
    897 SIMD_FORCE_INLINE btQuaternion
    898 shortestArcQuat(const btVector3& v0, const btVector3& v1) // Game Programming Gems 2.10. make sure v0,v1 are normalized
    899 {
    900 	btVector3 c = v0.cross(v1);
    901 	btScalar  d = v0.dot(v1);
    902 
    903 	if (d < -1.0 + SIMD_EPSILON)
    904 	{
    905 		btVector3 n,unused;
    906 		btPlaneSpace1(v0,n,unused);
    907 		return btQuaternion(n.x(),n.y(),n.z(),0.0f); // just pick any vector that is orthogonal to v0
    908 	}
    909 
    910 	btScalar  s = btSqrt((1.0f + d) * 2.0f);
    911 	btScalar rs = 1.0f / s;
    912 
    913 	return btQuaternion(c.getX()*rs,c.getY()*rs,c.getZ()*rs,s * 0.5f);
    914 }
    915 
    916 SIMD_FORCE_INLINE btQuaternion
    917 shortestArcQuatNormalize2(btVector3& v0,btVector3& v1)
    918 {
    919 	v0.normalize();
    920 	v1.normalize();
    921 	return shortestArcQuat(v0,v1);
    922 }
    923 
    924 
    925 
    926 
    927 struct	btQuaternionFloatData
    928 {
    929 	float	m_floats[4];
    930 };
    931 
    932 struct	btQuaternionDoubleData
    933 {
    934 	double	m_floats[4];
    935 
    936 };
    937 
    938 SIMD_FORCE_INLINE	void	btQuaternion::serializeFloat(struct	btQuaternionFloatData& dataOut) const
    939 {
    940 	///could also do a memcpy, check if it is worth it
    941 	for (int i=0;i<4;i++)
    942 		dataOut.m_floats[i] = float(m_floats[i]);
    943 }
    944 
    945 SIMD_FORCE_INLINE void	btQuaternion::deSerializeFloat(const struct	btQuaternionFloatData& dataIn)
    946 {
    947 	for (int i=0;i<4;i++)
    948 		m_floats[i] = btScalar(dataIn.m_floats[i]);
    949 }
    950 
    951 
    952 SIMD_FORCE_INLINE	void	btQuaternion::serializeDouble(struct	btQuaternionDoubleData& dataOut) const
    953 {
    954 	///could also do a memcpy, check if it is worth it
    955 	for (int i=0;i<4;i++)
    956 		dataOut.m_floats[i] = double(m_floats[i]);
    957 }
    958 
    959 SIMD_FORCE_INLINE void	btQuaternion::deSerializeDouble(const struct	btQuaternionDoubleData& dataIn)
    960 {
    961 	for (int i=0;i<4;i++)
    962 		m_floats[i] = btScalar(dataIn.m_floats[i]);
    963 }
    964 
    965 
    966 SIMD_FORCE_INLINE	void	btQuaternion::serialize(struct	btQuaternionData& dataOut) const
    967 {
    968 	///could also do a memcpy, check if it is worth it
    969 	for (int i=0;i<4;i++)
    970 		dataOut.m_floats[i] = m_floats[i];
    971 }
    972 
    973 SIMD_FORCE_INLINE void	btQuaternion::deSerialize(const struct	btQuaternionData& dataIn)
    974 {
    975 	for (int i=0;i<4;i++)
    976 		m_floats[i] = dataIn.m_floats[i];
    977 }
    978 
    979 
    980 #endif //BT_SIMD__QUATERNION_H_
    981 
    982 
    983 
    984