Home | History | Annotate | Download | only in LinearMath
      1 /*
      2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
      3 
      4 This software is provided 'as-is', without any express or implied warranty.
      5 In no event will the authors be held liable for any damages arising from the use of this software.
      6 Permission is granted to anyone to use this software for any purpose,
      7 including commercial applications, and to alter it and redistribute it freely,
      8 subject to the following restrictions:
      9 
     10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
     11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
     12 3. This notice may not be removed or altered from any source distribution.
     13 */
     14 
     15 
     16 #ifndef	BT_MATRIX3x3_H
     17 #define BT_MATRIX3x3_H
     18 
     19 #include "btVector3.h"
     20 #include "btQuaternion.h"
     21 #include <stdio.h>
     22 
     23 #ifdef BT_USE_SSE
     24 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
     25 //const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
     26 #define vMPPP (_mm_set_ps (+0.0f, +0.0f, +0.0f, -0.0f))
     27 #endif
     28 
     29 #if defined(BT_USE_SSE)
     30 #define v1000 (_mm_set_ps(0.0f,0.0f,0.0f,1.0f))
     31 #define v0100 (_mm_set_ps(0.0f,0.0f,1.0f,0.0f))
     32 #define v0010 (_mm_set_ps(0.0f,1.0f,0.0f,0.0f))
     33 #elif defined(BT_USE_NEON)
     34 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
     35 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
     36 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
     37 #endif
     38 
     39 #ifdef BT_USE_DOUBLE_PRECISION
     40 #define btMatrix3x3Data	btMatrix3x3DoubleData
     41 #else
     42 #define btMatrix3x3Data	btMatrix3x3FloatData
     43 #endif //BT_USE_DOUBLE_PRECISION
     44 
     45 
     46 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
     47 * Make sure to only include a pure orthogonal matrix without scaling. */
     48 ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
     49 
     50 	///Data storage for the matrix, each vector is a row of the matrix
     51 	btVector3 m_el[3];
     52 
     53 public:
     54 	/** @brief No initializaion constructor */
     55 	btMatrix3x3 () {}
     56 
     57 	//		explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }
     58 
     59 	/**@brief Constructor from Quaternion */
     60 	explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
     61 	/*
     62 	template <typename btScalar>
     63 	Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
     64 	{
     65 	setEulerYPR(yaw, pitch, roll);
     66 	}
     67 	*/
     68 	/** @brief Constructor with row major formatting */
     69 	btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
     70 		const btScalar& yx, const btScalar& yy, const btScalar& yz,
     71 		const btScalar& zx, const btScalar& zy, const btScalar& zz)
     72 	{
     73 		setValue(xx, xy, xz,
     74 			yx, yy, yz,
     75 			zx, zy, zz);
     76 	}
     77 
     78 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
     79 	SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 )
     80 	{
     81         m_el[0].mVec128 = v0;
     82         m_el[1].mVec128 = v1;
     83         m_el[2].mVec128 = v2;
     84 	}
     85 
     86 	SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 )
     87 	{
     88         m_el[0] = v0;
     89         m_el[1] = v1;
     90         m_el[2] = v2;
     91 	}
     92 
     93 	// Copy constructor
     94 	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
     95 	{
     96 		m_el[0].mVec128 = rhs.m_el[0].mVec128;
     97 		m_el[1].mVec128 = rhs.m_el[1].mVec128;
     98 		m_el[2].mVec128 = rhs.m_el[2].mVec128;
     99 	}
    100 
    101 	// Assignment Operator
    102 	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m)
    103 	{
    104 		m_el[0].mVec128 = m.m_el[0].mVec128;
    105 		m_el[1].mVec128 = m.m_el[1].mVec128;
    106 		m_el[2].mVec128 = m.m_el[2].mVec128;
    107 
    108 		return *this;
    109 	}
    110 
    111 #else
    112 
    113 	/** @brief Copy constructor */
    114 	SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
    115 	{
    116 		m_el[0] = other.m_el[0];
    117 		m_el[1] = other.m_el[1];
    118 		m_el[2] = other.m_el[2];
    119 	}
    120 
    121 	/** @brief Assignment Operator */
    122 	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
    123 	{
    124 		m_el[0] = other.m_el[0];
    125 		m_el[1] = other.m_el[1];
    126 		m_el[2] = other.m_el[2];
    127 		return *this;
    128 	}
    129 
    130 #endif
    131 
    132 	/** @brief Get a column of the matrix as a vector
    133 	*  @param i Column number 0 indexed */
    134 	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
    135 	{
    136 		return btVector3(m_el[0][i],m_el[1][i],m_el[2][i]);
    137 	}
    138 
    139 
    140 	/** @brief Get a row of the matrix as a vector
    141 	*  @param i Row number 0 indexed */
    142 	SIMD_FORCE_INLINE const btVector3& getRow(int i) const
    143 	{
    144 		btFullAssert(0 <= i && i < 3);
    145 		return m_el[i];
    146 	}
    147 
    148 	/** @brief Get a mutable reference to a row of the matrix as a vector
    149 	*  @param i Row number 0 indexed */
    150 	SIMD_FORCE_INLINE btVector3&  operator[](int i)
    151 	{
    152 		btFullAssert(0 <= i && i < 3);
    153 		return m_el[i];
    154 	}
    155 
    156 	/** @brief Get a const reference to a row of the matrix as a vector
    157 	*  @param i Row number 0 indexed */
    158 	SIMD_FORCE_INLINE const btVector3& operator[](int i) const
    159 	{
    160 		btFullAssert(0 <= i && i < 3);
    161 		return m_el[i];
    162 	}
    163 
    164 	/** @brief Multiply by the target matrix on the right
    165 	*  @param m Rotation matrix to be applied
    166 	* Equivilant to this = this * m */
    167 	btMatrix3x3& operator*=(const btMatrix3x3& m);
    168 
    169 	/** @brief Adds by the target matrix on the right
    170 	*  @param m matrix to be applied
    171 	* Equivilant to this = this + m */
    172 	btMatrix3x3& operator+=(const btMatrix3x3& m);
    173 
    174 	/** @brief Substractss by the target matrix on the right
    175 	*  @param m matrix to be applied
    176 	* Equivilant to this = this - m */
    177 	btMatrix3x3& operator-=(const btMatrix3x3& m);
    178 
    179 	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
    180 	*  @param m A pointer to the beginning of the array of scalars*/
    181 	void setFromOpenGLSubMatrix(const btScalar *m)
    182 	{
    183 		m_el[0].setValue(m[0],m[4],m[8]);
    184 		m_el[1].setValue(m[1],m[5],m[9]);
    185 		m_el[2].setValue(m[2],m[6],m[10]);
    186 
    187 	}
    188 	/** @brief Set the values of the matrix explicitly (row major)
    189 	*  @param xx Top left
    190 	*  @param xy Top Middle
    191 	*  @param xz Top Right
    192 	*  @param yx Middle Left
    193 	*  @param yy Middle Middle
    194 	*  @param yz Middle Right
    195 	*  @param zx Bottom Left
    196 	*  @param zy Bottom Middle
    197 	*  @param zz Bottom Right*/
    198 	void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz,
    199 		const btScalar& yx, const btScalar& yy, const btScalar& yz,
    200 		const btScalar& zx, const btScalar& zy, const btScalar& zz)
    201 	{
    202 		m_el[0].setValue(xx,xy,xz);
    203 		m_el[1].setValue(yx,yy,yz);
    204 		m_el[2].setValue(zx,zy,zz);
    205 	}
    206 
    207 	/** @brief Set the matrix from a quaternion
    208 	*  @param q The Quaternion to match */
    209 	void setRotation(const btQuaternion& q)
    210 	{
    211 		btScalar d = q.length2();
    212 		btFullAssert(d != btScalar(0.0));
    213 		btScalar s = btScalar(2.0) / d;
    214 
    215     #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    216         __m128	vs, Q = q.get128();
    217 		__m128i Qi = btCastfTo128i(Q);
    218         __m128	Y, Z;
    219         __m128	V1, V2, V3;
    220         __m128	V11, V21, V31;
    221         __m128	NQ = _mm_xor_ps(Q, btvMzeroMask);
    222 		__m128i NQi = btCastfTo128i(NQ);
    223 
    224         V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));	// Y X Z W
    225 		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));     // -X -X  Y  W
    226         V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));	// Z Y X W
    227         V1 = _mm_xor_ps(V1, vMPPP);	//	change the sign of the first element
    228 
    229         V11	= btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));	// Y Y X W
    230 		V21 = _mm_unpackhi_ps(Q, Q);                    //  Z  Z  W  W
    231 		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));	//  X  Z -X -W
    232 
    233 		V2 = V2 * V1;	//
    234 		V1 = V1 * V11;	//
    235 		V3 = V3 * V31;	//
    236 
    237         V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));	//	-Z -W  Y  W
    238 		V11 = V11 * V21;	//
    239         V21 = _mm_xor_ps(V21, vMPPP);	//	change the sign of the first element
    240 		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));	//	 W  W -Y -W
    241         V31 = _mm_xor_ps(V31, vMPPP);	//	change the sign of the first element
    242 		Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
    243 		Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3)));	//  Y  X  Y  W
    244 
    245 		vs = _mm_load_ss(&s);
    246 		V21 = V21 * Y;
    247 		V31 = V31 * Z;
    248 
    249 		V1 = V1 + V11;
    250         V2 = V2 + V21;
    251         V3 = V3 + V31;
    252 
    253         vs = bt_splat3_ps(vs, 0);
    254             //	s ready
    255         V1 = V1 * vs;
    256         V2 = V2 * vs;
    257         V3 = V3 * vs;
    258 
    259         V1 = V1 + v1000;
    260         V2 = V2 + v0100;
    261         V3 = V3 + v0010;
    262 
    263         m_el[0] = V1;
    264         m_el[1] = V2;
    265         m_el[2] = V3;
    266     #else
    267 		btScalar xs = q.x() * s,   ys = q.y() * s,   zs = q.z() * s;
    268 		btScalar wx = q.w() * xs,  wy = q.w() * ys,  wz = q.w() * zs;
    269 		btScalar xx = q.x() * xs,  xy = q.x() * ys,  xz = q.x() * zs;
    270 		btScalar yy = q.y() * ys,  yz = q.y() * zs,  zz = q.z() * zs;
    271 		setValue(
    272             btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
    273 			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
    274 			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
    275 	#endif
    276     }
    277 
    278 
    279 	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
    280 	*  @param yaw Yaw about Y axis
    281 	*  @param pitch Pitch about X axis
    282 	*  @param roll Roll about Z axis
    283 	*/
    284 	void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
    285 	{
    286 		setEulerZYX(roll, pitch, yaw);
    287 	}
    288 
    289 	/** @brief Set the matrix from euler angles YPR around ZYX axes
    290 	* @param eulerX Roll about X axis
    291 	* @param eulerY Pitch around Y axis
    292 	* @param eulerZ Yaw aboud Z axis
    293 	*
    294 	* These angles are used to produce a rotation matrix. The euler
    295 	* angles are applied in ZYX order. I.e a vector is first rotated
    296 	* about X then Y and then Z
    297 	**/
    298 	void setEulerZYX(btScalar eulerX,btScalar eulerY,btScalar eulerZ) {
    299 		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
    300 		btScalar ci ( btCos(eulerX));
    301 		btScalar cj ( btCos(eulerY));
    302 		btScalar ch ( btCos(eulerZ));
    303 		btScalar si ( btSin(eulerX));
    304 		btScalar sj ( btSin(eulerY));
    305 		btScalar sh ( btSin(eulerZ));
    306 		btScalar cc = ci * ch;
    307 		btScalar cs = ci * sh;
    308 		btScalar sc = si * ch;
    309 		btScalar ss = si * sh;
    310 
    311 		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
    312 			cj * sh, sj * ss + cc, sj * cs - sc,
    313 			-sj,      cj * si,      cj * ci);
    314 	}
    315 
    316 	/**@brief Set the matrix to the identity */
    317 	void setIdentity()
    318 	{
    319 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
    320 			m_el[0] = v1000;
    321 			m_el[1] = v0100;
    322 			m_el[2] = v0010;
    323 #else
    324 		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0),
    325 			btScalar(0.0), btScalar(1.0), btScalar(0.0),
    326 			btScalar(0.0), btScalar(0.0), btScalar(1.0));
    327 #endif
    328 	}
    329 
    330 	static const btMatrix3x3&	getIdentity()
    331 	{
    332 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
    333         static const btMatrix3x3
    334         identityMatrix(v1000, v0100, v0010);
    335 #else
    336 		static const btMatrix3x3
    337         identityMatrix(
    338             btScalar(1.0), btScalar(0.0), btScalar(0.0),
    339 			btScalar(0.0), btScalar(1.0), btScalar(0.0),
    340 			btScalar(0.0), btScalar(0.0), btScalar(1.0));
    341 #endif
    342 		return identityMatrix;
    343 	}
    344 
    345 	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
    346 	* @param m The array to be filled */
    347 	void getOpenGLSubMatrix(btScalar *m) const
    348 	{
    349 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    350         __m128 v0 = m_el[0].mVec128;
    351         __m128 v1 = m_el[1].mVec128;
    352         __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
    353         __m128 *vm = (__m128 *)m;
    354         __m128 vT;
    355 
    356         v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
    357 
    358         vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
    359         v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
    360 
    361         v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
    362         v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
    363         v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
    364 
    365         vm[0] = v0;
    366         vm[1] = v1;
    367         vm[2] = v2;
    368 #elif defined(BT_USE_NEON)
    369         // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
    370         static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
    371         float32x4_t *vm = (float32x4_t *)m;
    372         float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
    373         float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
    374         float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
    375         float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
    376         float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
    377         float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
    378 
    379         vm[0] = v0;
    380         vm[1] = v1;
    381         vm[2] = v2;
    382 #else
    383 		m[0]  = btScalar(m_el[0].x());
    384 		m[1]  = btScalar(m_el[1].x());
    385 		m[2]  = btScalar(m_el[2].x());
    386 		m[3]  = btScalar(0.0);
    387 		m[4]  = btScalar(m_el[0].y());
    388 		m[5]  = btScalar(m_el[1].y());
    389 		m[6]  = btScalar(m_el[2].y());
    390 		m[7]  = btScalar(0.0);
    391 		m[8]  = btScalar(m_el[0].z());
    392 		m[9]  = btScalar(m_el[1].z());
    393 		m[10] = btScalar(m_el[2].z());
    394 		m[11] = btScalar(0.0);
    395 #endif
    396 	}
    397 
    398 	/**@brief Get the matrix represented as a quaternion
    399 	* @param q The quaternion which will be set */
    400 	void getRotation(btQuaternion& q) const
    401 	{
    402 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    403         btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
    404         btScalar s, x;
    405 
    406         union {
    407             btSimdFloat4 vec;
    408             btScalar f[4];
    409         } temp;
    410 
    411         if (trace > btScalar(0.0))
    412         {
    413             x = trace + btScalar(1.0);
    414 
    415             temp.f[0]=m_el[2].y() - m_el[1].z();
    416             temp.f[1]=m_el[0].z() - m_el[2].x();
    417             temp.f[2]=m_el[1].x() - m_el[0].y();
    418             temp.f[3]=x;
    419             //temp.f[3]= s * btScalar(0.5);
    420         }
    421         else
    422         {
    423             int i, j, k;
    424             if(m_el[0].x() < m_el[1].y())
    425             {
    426                 if( m_el[1].y() < m_el[2].z() )
    427                     { i = 2; j = 0; k = 1; }
    428                 else
    429                     { i = 1; j = 2; k = 0; }
    430             }
    431             else
    432             {
    433                 if( m_el[0].x() < m_el[2].z())
    434                     { i = 2; j = 0; k = 1; }
    435                 else
    436                     { i = 0; j = 1; k = 2; }
    437             }
    438 
    439             x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
    440 
    441             temp.f[3] = (m_el[k][j] - m_el[j][k]);
    442             temp.f[j] = (m_el[j][i] + m_el[i][j]);
    443             temp.f[k] = (m_el[k][i] + m_el[i][k]);
    444             temp.f[i] = x;
    445             //temp.f[i] = s * btScalar(0.5);
    446         }
    447 
    448         s = btSqrt(x);
    449         q.set128(temp.vec);
    450         s = btScalar(0.5) / s;
    451 
    452         q *= s;
    453 #else
    454 		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
    455 
    456 		btScalar temp[4];
    457 
    458 		if (trace > btScalar(0.0))
    459 		{
    460 			btScalar s = btSqrt(trace + btScalar(1.0));
    461 			temp[3]=(s * btScalar(0.5));
    462 			s = btScalar(0.5) / s;
    463 
    464 			temp[0]=((m_el[2].y() - m_el[1].z()) * s);
    465 			temp[1]=((m_el[0].z() - m_el[2].x()) * s);
    466 			temp[2]=((m_el[1].x() - m_el[0].y()) * s);
    467 		}
    468 		else
    469 		{
    470 			int i = m_el[0].x() < m_el[1].y() ?
    471 				(m_el[1].y() < m_el[2].z() ? 2 : 1) :
    472 				(m_el[0].x() < m_el[2].z() ? 2 : 0);
    473 			int j = (i + 1) % 3;
    474 			int k = (i + 2) % 3;
    475 
    476 			btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
    477 			temp[i] = s * btScalar(0.5);
    478 			s = btScalar(0.5) / s;
    479 
    480 			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
    481 			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
    482 			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
    483 		}
    484 		q.setValue(temp[0],temp[1],temp[2],temp[3]);
    485 #endif
    486 	}
    487 
    488 	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
    489 	* @param yaw Yaw around Y axis
    490 	* @param pitch Pitch around X axis
    491 	* @param roll around Z axis */
    492 	void getEulerYPR(btScalar& yaw, btScalar& pitch, btScalar& roll) const
    493 	{
    494 
    495 		// first use the normal calculus
    496 		yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
    497 		pitch = btScalar(btAsin(-m_el[2].x()));
    498 		roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));
    499 
    500 		// on pitch = +/-HalfPI
    501 		if (btFabs(pitch)==SIMD_HALF_PI)
    502 		{
    503 			if (yaw>0)
    504 				yaw-=SIMD_PI;
    505 			else
    506 				yaw+=SIMD_PI;
    507 
    508 			if (roll>0)
    509 				roll-=SIMD_PI;
    510 			else
    511 				roll+=SIMD_PI;
    512 		}
    513 	};
    514 
    515 
    516 	/**@brief Get the matrix represented as euler angles around ZYX
    517 	* @param yaw Yaw around X axis
    518 	* @param pitch Pitch around Y axis
    519 	* @param roll around X axis
    520 	* @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
    521 	void getEulerZYX(btScalar& yaw, btScalar& pitch, btScalar& roll, unsigned int solution_number = 1) const
    522 	{
    523 		struct Euler
    524 		{
    525 			btScalar yaw;
    526 			btScalar pitch;
    527 			btScalar roll;
    528 		};
    529 
    530 		Euler euler_out;
    531 		Euler euler_out2; //second solution
    532 		//get the pointer to the raw data
    533 
    534 		// Check that pitch is not at a singularity
    535 		if (btFabs(m_el[2].x()) >= 1)
    536 		{
    537 			euler_out.yaw = 0;
    538 			euler_out2.yaw = 0;
    539 
    540 			// From difference of angles formula
    541 			btScalar delta = btAtan2(m_el[0].x(),m_el[0].z());
    542 			if (m_el[2].x() > 0)  //gimbal locked up
    543 			{
    544 				euler_out.pitch = SIMD_PI / btScalar(2.0);
    545 				euler_out2.pitch = SIMD_PI / btScalar(2.0);
    546 				euler_out.roll = euler_out.pitch + delta;
    547 				euler_out2.roll = euler_out.pitch + delta;
    548 			}
    549 			else // gimbal locked down
    550 			{
    551 				euler_out.pitch = -SIMD_PI / btScalar(2.0);
    552 				euler_out2.pitch = -SIMD_PI / btScalar(2.0);
    553 				euler_out.roll = -euler_out.pitch + delta;
    554 				euler_out2.roll = -euler_out.pitch + delta;
    555 			}
    556 		}
    557 		else
    558 		{
    559 			euler_out.pitch = - btAsin(m_el[2].x());
    560 			euler_out2.pitch = SIMD_PI - euler_out.pitch;
    561 
    562 			euler_out.roll = btAtan2(m_el[2].y()/btCos(euler_out.pitch),
    563 				m_el[2].z()/btCos(euler_out.pitch));
    564 			euler_out2.roll = btAtan2(m_el[2].y()/btCos(euler_out2.pitch),
    565 				m_el[2].z()/btCos(euler_out2.pitch));
    566 
    567 			euler_out.yaw = btAtan2(m_el[1].x()/btCos(euler_out.pitch),
    568 				m_el[0].x()/btCos(euler_out.pitch));
    569 			euler_out2.yaw = btAtan2(m_el[1].x()/btCos(euler_out2.pitch),
    570 				m_el[0].x()/btCos(euler_out2.pitch));
    571 		}
    572 
    573 		if (solution_number == 1)
    574 		{
    575 			yaw = euler_out.yaw;
    576 			pitch = euler_out.pitch;
    577 			roll = euler_out.roll;
    578 		}
    579 		else
    580 		{
    581 			yaw = euler_out2.yaw;
    582 			pitch = euler_out2.pitch;
    583 			roll = euler_out2.roll;
    584 		}
    585 	}
    586 
    587 	/**@brief Create a scaled copy of the matrix
    588 	* @param s Scaling vector The elements of the vector will scale each column */
    589 
    590 	btMatrix3x3 scaled(const btVector3& s) const
    591 	{
    592 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    593 		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
    594 #else
    595 		return btMatrix3x3(
    596             m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
    597 			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
    598 			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
    599 #endif
    600 	}
    601 
    602 	/**@brief Return the determinant of the matrix */
    603 	btScalar            determinant() const;
    604 	/**@brief Return the adjoint of the matrix */
    605 	btMatrix3x3 adjoint() const;
    606 	/**@brief Return the matrix with all values non negative */
    607 	btMatrix3x3 absolute() const;
    608 	/**@brief Return the transpose of the matrix */
    609 	btMatrix3x3 transpose() const;
    610 	/**@brief Return the inverse of the matrix */
    611 	btMatrix3x3 inverse() const;
    612 
    613 	/// Solve A * x = b, where b is a column vector. This is more efficient
    614 	/// than computing the inverse in one-shot cases.
    615 	///Solve33 is from Box2d, thanks to Erin Catto,
    616 	btVector3 solve33(const btVector3& b) const
    617 	{
    618 		btVector3 col1 = getColumn(0);
    619 		btVector3 col2 = getColumn(1);
    620 		btVector3 col3 = getColumn(2);
    621 
    622 		btScalar det = btDot(col1, btCross(col2, col3));
    623 		if (btFabs(det)>SIMD_EPSILON)
    624 		{
    625 			det = 1.0f / det;
    626 		}
    627 		btVector3 x;
    628 		x[0] = det * btDot(b, btCross(col2, col3));
    629 		x[1] = det * btDot(col1, btCross(b, col3));
    630 		x[2] = det * btDot(col1, btCross(col2, b));
    631 		return x;
    632 	}
    633 
    634 	btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
    635 	btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;
    636 
    637 	SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const
    638 	{
    639 		return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
    640 	}
    641 	SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const
    642 	{
    643 		return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
    644 	}
    645 	SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const
    646 	{
    647 		return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
    648 	}
    649 
    650 
    651 	/**@brief diagonalizes this matrix by the Jacobi method.
    652 	* @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
    653 	* coordinate system, i.e., old_this = rot * new_this * rot^T.
    654 	* @param threshold See iteration
    655 	* @param iteration The iteration stops when all off-diagonal elements are less than the threshold multiplied
    656 	* by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
    657 	*
    658 	* Note that this matrix is assumed to be symmetric.
    659 	*/
    660 	void diagonalize(btMatrix3x3& rot, btScalar threshold, int maxSteps)
    661 	{
    662 		rot.setIdentity();
    663 		for (int step = maxSteps; step > 0; step--)
    664 		{
    665 			// find off-diagonal element [p][q] with largest magnitude
    666 			int p = 0;
    667 			int q = 1;
    668 			int r = 2;
    669 			btScalar max = btFabs(m_el[0][1]);
    670 			btScalar v = btFabs(m_el[0][2]);
    671 			if (v > max)
    672 			{
    673 				q = 2;
    674 				r = 1;
    675 				max = v;
    676 			}
    677 			v = btFabs(m_el[1][2]);
    678 			if (v > max)
    679 			{
    680 				p = 1;
    681 				q = 2;
    682 				r = 0;
    683 				max = v;
    684 			}
    685 
    686 			btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
    687 			if (max <= t)
    688 			{
    689 				if (max <= SIMD_EPSILON * t)
    690 				{
    691 					return;
    692 				}
    693 				step = 1;
    694 			}
    695 
    696 			// compute Jacobi rotation J which leads to a zero for element [p][q]
    697 			btScalar mpq = m_el[p][q];
    698 			btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
    699 			btScalar theta2 = theta * theta;
    700 			btScalar cos;
    701 			btScalar sin;
    702 			if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
    703 			{
    704 				t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
    705 					: 1 / (theta - btSqrt(1 + theta2));
    706 				cos = 1 / btSqrt(1 + t * t);
    707 				sin = cos * t;
    708 			}
    709 			else
    710 			{
    711 				// approximation for large theta-value, i.e., a nearly diagonal matrix
    712 				t = 1 / (theta * (2 + btScalar(0.5) / theta2));
    713 				cos = 1 - btScalar(0.5) * t * t;
    714 				sin = cos * t;
    715 			}
    716 
    717 			// apply rotation to matrix (this = J^T * this * J)
    718 			m_el[p][q] = m_el[q][p] = 0;
    719 			m_el[p][p] -= t * mpq;
    720 			m_el[q][q] += t * mpq;
    721 			btScalar mrp = m_el[r][p];
    722 			btScalar mrq = m_el[r][q];
    723 			m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
    724 			m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
    725 
    726 			// apply rotation to rot (rot = rot * J)
    727 			for (int i = 0; i < 3; i++)
    728 			{
    729 				btVector3& row = rot[i];
    730 				mrp = row[p];
    731 				mrq = row[q];
    732 				row[p] = cos * mrp - sin * mrq;
    733 				row[q] = cos * mrq + sin * mrp;
    734 			}
    735 		}
    736 	}
    737 
    738 
    739 
    740 
    741 	/**@brief Calculate the matrix cofactor
    742 	* @param r1 The first row to use for calculating the cofactor
    743 	* @param c1 The first column to use for calculating the cofactor
    744 	* @param r1 The second row to use for calculating the cofactor
    745 	* @param c1 The second column to use for calculating the cofactor
    746 	* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
    747 	*/
    748 	btScalar cofac(int r1, int c1, int r2, int c2) const
    749 	{
    750 		return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
    751 	}
    752 
    753 	void	serialize(struct	btMatrix3x3Data& dataOut) const;
    754 
    755 	void	serializeFloat(struct	btMatrix3x3FloatData& dataOut) const;
    756 
    757 	void	deSerialize(const struct	btMatrix3x3Data& dataIn);
    758 
    759 	void	deSerializeFloat(const struct	btMatrix3x3FloatData& dataIn);
    760 
    761 	void	deSerializeDouble(const struct	btMatrix3x3DoubleData& dataIn);
    762 
    763 };
    764 
    765 
    766 SIMD_FORCE_INLINE btMatrix3x3&
    767 btMatrix3x3::operator*=(const btMatrix3x3& m)
    768 {
    769 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    770     __m128 rv00, rv01, rv02;
    771     __m128 rv10, rv11, rv12;
    772     __m128 rv20, rv21, rv22;
    773     __m128 mv0, mv1, mv2;
    774 
    775     rv02 = m_el[0].mVec128;
    776     rv12 = m_el[1].mVec128;
    777     rv22 = m_el[2].mVec128;
    778 
    779     mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
    780     mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
    781     mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);
    782 
    783     // rv0
    784     rv00 = bt_splat_ps(rv02, 0);
    785     rv01 = bt_splat_ps(rv02, 1);
    786     rv02 = bt_splat_ps(rv02, 2);
    787 
    788     rv00 = _mm_mul_ps(rv00, mv0);
    789     rv01 = _mm_mul_ps(rv01, mv1);
    790     rv02 = _mm_mul_ps(rv02, mv2);
    791 
    792     // rv1
    793     rv10 = bt_splat_ps(rv12, 0);
    794     rv11 = bt_splat_ps(rv12, 1);
    795     rv12 = bt_splat_ps(rv12, 2);
    796 
    797     rv10 = _mm_mul_ps(rv10, mv0);
    798     rv11 = _mm_mul_ps(rv11, mv1);
    799     rv12 = _mm_mul_ps(rv12, mv2);
    800 
    801     // rv2
    802     rv20 = bt_splat_ps(rv22, 0);
    803     rv21 = bt_splat_ps(rv22, 1);
    804     rv22 = bt_splat_ps(rv22, 2);
    805 
    806     rv20 = _mm_mul_ps(rv20, mv0);
    807     rv21 = _mm_mul_ps(rv21, mv1);
    808     rv22 = _mm_mul_ps(rv22, mv2);
    809 
    810     rv00 = _mm_add_ps(rv00, rv01);
    811     rv10 = _mm_add_ps(rv10, rv11);
    812     rv20 = _mm_add_ps(rv20, rv21);
    813 
    814     m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
    815     m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
    816     m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
    817 
    818 #elif defined(BT_USE_NEON)
    819 
    820     float32x4_t rv0, rv1, rv2;
    821     float32x4_t v0, v1, v2;
    822     float32x4_t mv0, mv1, mv2;
    823 
    824     v0 = m_el[0].mVec128;
    825     v1 = m_el[1].mVec128;
    826     v2 = m_el[2].mVec128;
    827 
    828     mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
    829     mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
    830     mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
    831 
    832     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
    833     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
    834     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
    835 
    836     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
    837     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
    838     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
    839 
    840     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
    841     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
    842     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
    843 
    844     m_el[0].mVec128 = rv0;
    845     m_el[1].mVec128 = rv1;
    846     m_el[2].mVec128 = rv2;
    847 #else
    848 	setValue(
    849         m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
    850 		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
    851 		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
    852 #endif
    853 	return *this;
    854 }
    855 
    856 SIMD_FORCE_INLINE btMatrix3x3&
    857 btMatrix3x3::operator+=(const btMatrix3x3& m)
    858 {
    859 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    860     m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
    861     m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
    862     m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
    863 #else
    864 	setValue(
    865 		m_el[0][0]+m.m_el[0][0],
    866 		m_el[0][1]+m.m_el[0][1],
    867 		m_el[0][2]+m.m_el[0][2],
    868 		m_el[1][0]+m.m_el[1][0],
    869 		m_el[1][1]+m.m_el[1][1],
    870 		m_el[1][2]+m.m_el[1][2],
    871 		m_el[2][0]+m.m_el[2][0],
    872 		m_el[2][1]+m.m_el[2][1],
    873 		m_el[2][2]+m.m_el[2][2]);
    874 #endif
    875 	return *this;
    876 }
    877 
    878 SIMD_FORCE_INLINE btMatrix3x3
    879 operator*(const btMatrix3x3& m, const btScalar & k)
    880 {
    881 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    882     __m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
    883     return btMatrix3x3(
    884                 _mm_mul_ps(m[0].mVec128, vk),
    885                 _mm_mul_ps(m[1].mVec128, vk),
    886                 _mm_mul_ps(m[2].mVec128, vk));
    887 #elif defined(BT_USE_NEON)
    888     return btMatrix3x3(
    889                 vmulq_n_f32(m[0].mVec128, k),
    890                 vmulq_n_f32(m[1].mVec128, k),
    891                 vmulq_n_f32(m[2].mVec128, k));
    892 #else
    893 	return btMatrix3x3(
    894 		m[0].x()*k,m[0].y()*k,m[0].z()*k,
    895 		m[1].x()*k,m[1].y()*k,m[1].z()*k,
    896 		m[2].x()*k,m[2].y()*k,m[2].z()*k);
    897 #endif
    898 }
    899 
    900 SIMD_FORCE_INLINE btMatrix3x3
    901 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
    902 {
    903 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    904 	return btMatrix3x3(
    905         m1[0].mVec128 + m2[0].mVec128,
    906         m1[1].mVec128 + m2[1].mVec128,
    907         m1[2].mVec128 + m2[2].mVec128);
    908 #else
    909 	return btMatrix3x3(
    910         m1[0][0]+m2[0][0],
    911         m1[0][1]+m2[0][1],
    912         m1[0][2]+m2[0][2],
    913 
    914         m1[1][0]+m2[1][0],
    915         m1[1][1]+m2[1][1],
    916         m1[1][2]+m2[1][2],
    917 
    918         m1[2][0]+m2[2][0],
    919         m1[2][1]+m2[2][1],
    920         m1[2][2]+m2[2][2]);
    921 #endif
    922 }
    923 
    924 SIMD_FORCE_INLINE btMatrix3x3
    925 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
    926 {
    927 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    928 	return btMatrix3x3(
    929         m1[0].mVec128 - m2[0].mVec128,
    930         m1[1].mVec128 - m2[1].mVec128,
    931         m1[2].mVec128 - m2[2].mVec128);
    932 #else
    933 	return btMatrix3x3(
    934         m1[0][0]-m2[0][0],
    935         m1[0][1]-m2[0][1],
    936         m1[0][2]-m2[0][2],
    937 
    938         m1[1][0]-m2[1][0],
    939         m1[1][1]-m2[1][1],
    940         m1[1][2]-m2[1][2],
    941 
    942         m1[2][0]-m2[2][0],
    943         m1[2][1]-m2[2][1],
    944         m1[2][2]-m2[2][2]);
    945 #endif
    946 }
    947 
    948 
    949 SIMD_FORCE_INLINE btMatrix3x3&
    950 btMatrix3x3::operator-=(const btMatrix3x3& m)
    951 {
    952 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
    953     m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
    954     m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
    955     m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
    956 #else
    957 	setValue(
    958 	m_el[0][0]-m.m_el[0][0],
    959 	m_el[0][1]-m.m_el[0][1],
    960 	m_el[0][2]-m.m_el[0][2],
    961 	m_el[1][0]-m.m_el[1][0],
    962 	m_el[1][1]-m.m_el[1][1],
    963 	m_el[1][2]-m.m_el[1][2],
    964 	m_el[2][0]-m.m_el[2][0],
    965 	m_el[2][1]-m.m_el[2][1],
    966 	m_el[2][2]-m.m_el[2][2]);
    967 #endif
    968 	return *this;
    969 }
    970 
    971 
    972 SIMD_FORCE_INLINE btScalar
    973 btMatrix3x3::determinant() const
    974 {
    975 	return btTriple((*this)[0], (*this)[1], (*this)[2]);
    976 }
    977 
    978 
    979 SIMD_FORCE_INLINE btMatrix3x3
    980 btMatrix3x3::absolute() const
    981 {
    982 #if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    983     return btMatrix3x3(
    984             _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
    985             _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
    986             _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
    987 #elif defined(BT_USE_NEON)
    988     return btMatrix3x3(
    989             (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
    990             (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
    991             (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
    992 #else
    993 	return btMatrix3x3(
    994             btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
    995             btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
    996             btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
    997 #endif
    998 }
    999 
   1000 SIMD_FORCE_INLINE btMatrix3x3
   1001 btMatrix3x3::transpose() const
   1002 {
   1003 #if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1004     __m128 v0 = m_el[0].mVec128;
   1005     __m128 v1 = m_el[1].mVec128;
   1006     __m128 v2 = m_el[2].mVec128;    //  x2 y2 z2 w2
   1007     __m128 vT;
   1008 
   1009     v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0
   1010 
   1011     vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
   1012     v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1
   1013 
   1014     v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
   1015     v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
   1016     v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0
   1017 
   1018 
   1019     return btMatrix3x3( v0, v1, v2 );
   1020 #elif defined(BT_USE_NEON)
   1021     // note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
   1022     static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
   1023     float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );  // {x0 x1 z0 z1}, {y0 y1 w0 w1}
   1024     float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );       // {x2  0 }, {y2 0}
   1025     float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
   1026     float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
   1027     float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
   1028     float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );       // z0 z1 z2  0
   1029     return btMatrix3x3( v0, v1, v2 );
   1030 #else
   1031 	return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
   1032                         m_el[0].y(), m_el[1].y(), m_el[2].y(),
   1033                         m_el[0].z(), m_el[1].z(), m_el[2].z());
   1034 #endif
   1035 }
   1036 
   1037 SIMD_FORCE_INLINE btMatrix3x3
   1038 btMatrix3x3::adjoint() const
   1039 {
   1040 	return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
   1041 		cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
   1042 		cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
   1043 }
   1044 
   1045 SIMD_FORCE_INLINE btMatrix3x3
   1046 btMatrix3x3::inverse() const
   1047 {
   1048 	btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
   1049 	btScalar det = (*this)[0].dot(co);
   1050 	btFullAssert(det != btScalar(0.0));
   1051 	btScalar s = btScalar(1.0) / det;
   1052 	return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
   1053 		co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
   1054 		co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
   1055 }
   1056 
   1057 SIMD_FORCE_INLINE btMatrix3x3
   1058 btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
   1059 {
   1060 #if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1061     // zeros w
   1062 //    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
   1063     __m128 row = m_el[0].mVec128;
   1064     __m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
   1065     __m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
   1066     __m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
   1067     __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
   1068     __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
   1069     __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
   1070     row = m_el[1].mVec128;
   1071     r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
   1072     r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
   1073     r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
   1074     row = m_el[2].mVec128;
   1075     r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
   1076     r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
   1077     r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
   1078     return btMatrix3x3( r0, r1, r2 );
   1079 
   1080 #elif defined BT_USE_NEON
   1081     // zeros w
   1082     static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
   1083     float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
   1084     float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
   1085     float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
   1086     float32x4_t row = m_el[0].mVec128;
   1087     float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
   1088     float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
   1089     float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
   1090     row = m_el[1].mVec128;
   1091     r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
   1092     r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
   1093     r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
   1094     row = m_el[2].mVec128;
   1095     r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
   1096     r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
   1097     r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
   1098     return btMatrix3x3( r0, r1, r2 );
   1099 #else
   1100     return btMatrix3x3(
   1101 		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
   1102 		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
   1103 		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
   1104 		m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
   1105 		m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
   1106 		m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
   1107 		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
   1108 		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
   1109 		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
   1110 #endif
   1111 }
   1112 
   1113 SIMD_FORCE_INLINE btMatrix3x3
   1114 btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
   1115 {
   1116 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1117     __m128 a0 = m_el[0].mVec128;
   1118     __m128 a1 = m_el[1].mVec128;
   1119     __m128 a2 = m_el[2].mVec128;
   1120 
   1121     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
   1122     __m128 mx = mT[0].mVec128;
   1123     __m128 my = mT[1].mVec128;
   1124     __m128 mz = mT[2].mVec128;
   1125 
   1126     __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
   1127     __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
   1128     __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
   1129     r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
   1130     r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
   1131     r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
   1132     r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
   1133     r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
   1134     r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
   1135     return btMatrix3x3( r0, r1, r2);
   1136 
   1137 #elif defined BT_USE_NEON
   1138     float32x4_t a0 = m_el[0].mVec128;
   1139     float32x4_t a1 = m_el[1].mVec128;
   1140     float32x4_t a2 = m_el[2].mVec128;
   1141 
   1142     btMatrix3x3 mT = m.transpose(); // we rely on transpose() zeroing w channel so that we don't have to do it here
   1143     float32x4_t mx = mT[0].mVec128;
   1144     float32x4_t my = mT[1].mVec128;
   1145     float32x4_t mz = mT[2].mVec128;
   1146 
   1147     float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
   1148     float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
   1149     float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
   1150     r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
   1151     r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
   1152     r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
   1153     r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
   1154     r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
   1155     r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
   1156     return btMatrix3x3( r0, r1, r2 );
   1157 
   1158 #else
   1159 	return btMatrix3x3(
   1160 		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
   1161 		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
   1162 		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
   1163 #endif
   1164 }
   1165 
   1166 SIMD_FORCE_INLINE btVector3
   1167 operator*(const btMatrix3x3& m, const btVector3& v)
   1168 {
   1169 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
   1170     return v.dot3(m[0], m[1], m[2]);
   1171 #else
   1172 	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
   1173 #endif
   1174 }
   1175 
   1176 
   1177 SIMD_FORCE_INLINE btVector3
   1178 operator*(const btVector3& v, const btMatrix3x3& m)
   1179 {
   1180 #if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1181 
   1182     const __m128 vv = v.mVec128;
   1183 
   1184     __m128 c0 = bt_splat_ps( vv, 0);
   1185     __m128 c1 = bt_splat_ps( vv, 1);
   1186     __m128 c2 = bt_splat_ps( vv, 2);
   1187 
   1188     c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
   1189     c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
   1190     c0 = _mm_add_ps(c0, c1);
   1191     c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );
   1192 
   1193     return btVector3(_mm_add_ps(c0, c2));
   1194 #elif defined(BT_USE_NEON)
   1195     const float32x4_t vv = v.mVec128;
   1196     const float32x2_t vlo = vget_low_f32(vv);
   1197     const float32x2_t vhi = vget_high_f32(vv);
   1198 
   1199     float32x4_t c0, c1, c2;
   1200 
   1201     c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
   1202     c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
   1203     c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
   1204 
   1205     c0 = vmulq_lane_f32(c0, vlo, 0);
   1206     c1 = vmulq_lane_f32(c1, vlo, 1);
   1207     c2 = vmulq_lane_f32(c2, vhi, 0);
   1208     c0 = vaddq_f32(c0, c1);
   1209     c0 = vaddq_f32(c0, c2);
   1210 
   1211     return btVector3(c0);
   1212 #else
   1213 	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
   1214 #endif
   1215 }
   1216 
   1217 SIMD_FORCE_INLINE btMatrix3x3
   1218 operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
   1219 {
   1220 #if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1221 
   1222     __m128 m10 = m1[0].mVec128;
   1223     __m128 m11 = m1[1].mVec128;
   1224     __m128 m12 = m1[2].mVec128;
   1225 
   1226     __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);
   1227 
   1228     __m128 c0 = bt_splat_ps( m10, 0);
   1229     __m128 c1 = bt_splat_ps( m11, 0);
   1230     __m128 c2 = bt_splat_ps( m12, 0);
   1231 
   1232     c0 = _mm_mul_ps(c0, m2v);
   1233     c1 = _mm_mul_ps(c1, m2v);
   1234     c2 = _mm_mul_ps(c2, m2v);
   1235 
   1236     m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);
   1237 
   1238     __m128 c0_1 = bt_splat_ps( m10, 1);
   1239     __m128 c1_1 = bt_splat_ps( m11, 1);
   1240     __m128 c2_1 = bt_splat_ps( m12, 1);
   1241 
   1242     c0_1 = _mm_mul_ps(c0_1, m2v);
   1243     c1_1 = _mm_mul_ps(c1_1, m2v);
   1244     c2_1 = _mm_mul_ps(c2_1, m2v);
   1245 
   1246     m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);
   1247 
   1248     c0 = _mm_add_ps(c0, c0_1);
   1249     c1 = _mm_add_ps(c1, c1_1);
   1250     c2 = _mm_add_ps(c2, c2_1);
   1251 
   1252     m10 = bt_splat_ps( m10, 2);
   1253     m11 = bt_splat_ps( m11, 2);
   1254     m12 = bt_splat_ps( m12, 2);
   1255 
   1256     m10 = _mm_mul_ps(m10, m2v);
   1257     m11 = _mm_mul_ps(m11, m2v);
   1258     m12 = _mm_mul_ps(m12, m2v);
   1259 
   1260     c0 = _mm_add_ps(c0, m10);
   1261     c1 = _mm_add_ps(c1, m11);
   1262     c2 = _mm_add_ps(c2, m12);
   1263 
   1264     return btMatrix3x3(c0, c1, c2);
   1265 
   1266 #elif defined(BT_USE_NEON)
   1267 
   1268     float32x4_t rv0, rv1, rv2;
   1269     float32x4_t v0, v1, v2;
   1270     float32x4_t mv0, mv1, mv2;
   1271 
   1272     v0 = m1[0].mVec128;
   1273     v1 = m1[1].mVec128;
   1274     v2 = m1[2].mVec128;
   1275 
   1276     mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
   1277     mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
   1278     mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);
   1279 
   1280     rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
   1281     rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
   1282     rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
   1283 
   1284     rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
   1285     rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
   1286     rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
   1287 
   1288     rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
   1289     rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
   1290     rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
   1291 
   1292 	return btMatrix3x3(rv0, rv1, rv2);
   1293 
   1294 #else
   1295 	return btMatrix3x3(
   1296 		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
   1297 		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
   1298 		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
   1299 #endif
   1300 }
   1301 
   1302 /*
   1303 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
   1304 return btMatrix3x3(
   1305 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
   1306 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
   1307 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
   1308 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
   1309 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
   1310 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
   1311 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
   1312 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
   1313 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
   1314 }
   1315 */
   1316 
   1317 /**@brief Equality operator between two matrices
   1318 * It will test all elements are equal.  */
   1319 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
   1320 {
   1321 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
   1322 
   1323     __m128 c0, c1, c2;
   1324 
   1325     c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
   1326     c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
   1327     c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
   1328 
   1329     c0 = _mm_and_ps(c0, c1);
   1330     c0 = _mm_and_ps(c0, c2);
   1331 
   1332     return (0x7 == _mm_movemask_ps((__m128)c0));
   1333 #else
   1334 	return
   1335     (   m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
   1336 		m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
   1337 		m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
   1338 #endif
   1339 }
   1340 
   1341 ///for serialization
   1342 struct	btMatrix3x3FloatData
   1343 {
   1344 	btVector3FloatData m_el[3];
   1345 };
   1346 
   1347 ///for serialization
   1348 struct	btMatrix3x3DoubleData
   1349 {
   1350 	btVector3DoubleData m_el[3];
   1351 };
   1352 
   1353 
   1354 
   1355 
   1356 SIMD_FORCE_INLINE	void	btMatrix3x3::serialize(struct	btMatrix3x3Data& dataOut) const
   1357 {
   1358 	for (int i=0;i<3;i++)
   1359 		m_el[i].serialize(dataOut.m_el[i]);
   1360 }
   1361 
   1362 SIMD_FORCE_INLINE	void	btMatrix3x3::serializeFloat(struct	btMatrix3x3FloatData& dataOut) const
   1363 {
   1364 	for (int i=0;i<3;i++)
   1365 		m_el[i].serializeFloat(dataOut.m_el[i]);
   1366 }
   1367 
   1368 
   1369 SIMD_FORCE_INLINE	void	btMatrix3x3::deSerialize(const struct	btMatrix3x3Data& dataIn)
   1370 {
   1371 	for (int i=0;i<3;i++)
   1372 		m_el[i].deSerialize(dataIn.m_el[i]);
   1373 }
   1374 
   1375 SIMD_FORCE_INLINE	void	btMatrix3x3::deSerializeFloat(const struct	btMatrix3x3FloatData& dataIn)
   1376 {
   1377 	for (int i=0;i<3;i++)
   1378 		m_el[i].deSerializeFloat(dataIn.m_el[i]);
   1379 }
   1380 
   1381 SIMD_FORCE_INLINE	void	btMatrix3x3::deSerializeDouble(const struct	btMatrix3x3DoubleData& dataIn)
   1382 {
   1383 	for (int i=0;i<3;i++)
   1384 		m_el[i].deSerializeDouble(dataIn.m_el[i]);
   1385 }
   1386 
   1387 #endif //BT_MATRIX3x3_H
   1388 
   1389