Home | History | Annotate | Download | only in LinearMath
      1 /*
      2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
      3 
      4 This software is provided 'as-is', without any express or implied warranty.
      5 In no event will the authors be held liable for any damages arising from the use of this software.
      6 Permission is granted to anyone to use this software for any purpose,
      7 including commercial applications, and to alter it and redistribute it freely,
      8 subject to the following restrictions:
      9 
     10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
     11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
     12 3. This notice may not be removed or altered from any source distribution.
     13 */
     14 
     15 
     16 
     17 #ifndef BT_VECTOR3_H
     18 #define BT_VECTOR3_H
     19 
     20 //#include <stdint.h>
     21 #include "btScalar.h"
     22 #include "btMinMax.h"
     23 #include "btAlignedAllocator.h"
     24 
     25 #ifdef BT_USE_DOUBLE_PRECISION
     26 #define btVector3Data btVector3DoubleData
     27 #define btVector3DataName "btVector3DoubleData"
     28 #else
     29 #define btVector3Data btVector3FloatData
     30 #define btVector3DataName "btVector3FloatData"
     31 #endif //BT_USE_DOUBLE_PRECISION
     32 
     33 #if defined BT_USE_SSE
     34 
     35 //typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
     36 
     37 #ifdef _MSC_VER
     38 #pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
     39 #endif
     40 
     41 
     42 #define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
     43 //#define bt_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
     44 #define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
     45 #define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) )
     46 #define bt_splat_ps( _a, _i )  bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) )
     47 
     48 #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
     49 #define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
     50 #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
     51 #define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
     52 #define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
     53 #define btvxyzMaskf btvFFF0fMask
     54 #define btvAbsfMask btCastiTo128f(btvAbsMask)
     55 
     56 //there is an issue with XCode 3.2 (LCx errors)
     57 #define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f))
     58 #define v1110		 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f))
     59 #define vHalf		 (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
     60 #define v1_5		 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))
     61 
     62 //const __m128 ATTRIBUTE_ALIGNED16(btvMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
     63 //const __m128 ATTRIBUTE_ALIGNED16(v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
     64 //const __m128 ATTRIBUTE_ALIGNED16(vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
     65 //const __m128 ATTRIBUTE_ALIGNED16(v1_5)  = {1.5f, 1.5f, 1.5f, 1.5f};
     66 
     67 #endif
     68 
     69 #ifdef BT_USE_NEON
     70 
     71 const float32x4_t ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
     72 const int32x4_t ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){static_cast<int32_t>(0xFFFFFFFF),
     73 	static_cast<int32_t>(0xFFFFFFFF), static_cast<int32_t>(0xFFFFFFFF), 0x0};
     74 const int32x4_t ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
     75 const int32x4_t ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
     76 
     77 #endif
     78 
     79 /**@brief btVector3 can be used to represent 3D points and vectors.
     80  * It has an un-used w component to suit 16-byte alignment when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
     81  * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
     82  */
     83 ATTRIBUTE_ALIGNED16(class) btVector3
     84 {
     85 public:
     86 
     87 	BT_DECLARE_ALIGNED_ALLOCATOR();
     88 
     89 #if defined (__SPU__) && defined (__CELLOS_LV2__)
     90 		btScalar	m_floats[4];
     91 public:
     92 	SIMD_FORCE_INLINE const vec_float4&	get128() const
     93 	{
     94 		return *((const vec_float4*)&m_floats[0]);
     95 	}
     96 public:
     97 #else //__CELLOS_LV2__ __SPU__
     98     #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
     99         union {
    100             btSimdFloat4      mVec128;
    101             btScalar	m_floats[4];
    102         };
    103         SIMD_FORCE_INLINE	btSimdFloat4	get128() const
    104         {
    105             return mVec128;
    106         }
    107         SIMD_FORCE_INLINE	void	set128(btSimdFloat4 v128)
    108         {
    109             mVec128 = v128;
    110         }
    111     #else
    112         btScalar	m_floats[4];
    113     #endif
    114 #endif //__CELLOS_LV2__ __SPU__
    115 
    116 	public:
    117 
    118   /**@brief No initialization constructor */
    119 	SIMD_FORCE_INLINE btVector3()
    120 	{
    121 
    122 	}
    123 
    124 
    125 
    126   /**@brief Constructor from scalars
    127    * @param x X value
    128    * @param y Y value
    129    * @param z Z value
    130    */
    131 	SIMD_FORCE_INLINE btVector3(const btScalar& _x, const btScalar& _y, const btScalar& _z)
    132 	{
    133 		m_floats[0] = _x;
    134 		m_floats[1] = _y;
    135 		m_floats[2] = _z;
    136 		m_floats[3] = btScalar(0.f);
    137 	}
    138 
    139 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
    140 	// Set Vector
    141 	SIMD_FORCE_INLINE btVector3( btSimdFloat4 v)
    142 	{
    143 		mVec128 = v;
    144 	}
    145 
    146 	// Copy constructor
    147 	SIMD_FORCE_INLINE btVector3(const btVector3& rhs)
    148 	{
    149 		mVec128 = rhs.mVec128;
    150 	}
    151 
    152 	// Assignment Operator
    153 	SIMD_FORCE_INLINE btVector3&
    154 	operator=(const btVector3& v)
    155 	{
    156 		mVec128 = v.mVec128;
    157 
    158 		return *this;
    159 	}
    160 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
    161 
    162 /**@brief Add a vector to this one
    163  * @param The vector to add to this one */
    164 	SIMD_FORCE_INLINE btVector3& operator+=(const btVector3& v)
    165 	{
    166 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    167 		mVec128 = _mm_add_ps(mVec128, v.mVec128);
    168 #elif defined(BT_USE_NEON)
    169 		mVec128 = vaddq_f32(mVec128, v.mVec128);
    170 #else
    171 		m_floats[0] += v.m_floats[0];
    172 		m_floats[1] += v.m_floats[1];
    173 		m_floats[2] += v.m_floats[2];
    174 #endif
    175 		return *this;
    176 	}
    177 
    178 
    179   /**@brief Subtract a vector from this one
    180    * @param The vector to subtract */
    181 	SIMD_FORCE_INLINE btVector3& operator-=(const btVector3& v)
    182 	{
    183 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    184 		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
    185 #elif defined(BT_USE_NEON)
    186 		mVec128 = vsubq_f32(mVec128, v.mVec128);
    187 #else
    188 		m_floats[0] -= v.m_floats[0];
    189 		m_floats[1] -= v.m_floats[1];
    190 		m_floats[2] -= v.m_floats[2];
    191 #endif
    192 		return *this;
    193 	}
    194 
    195   /**@brief Scale the vector
    196    * @param s Scale factor */
    197 	SIMD_FORCE_INLINE btVector3& operator*=(const btScalar& s)
    198 	{
    199 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    200 		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
    201 		vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
    202 		mVec128 = _mm_mul_ps(mVec128, vs);
    203 #elif defined(BT_USE_NEON)
    204 		mVec128 = vmulq_n_f32(mVec128, s);
    205 #else
    206 		m_floats[0] *= s;
    207 		m_floats[1] *= s;
    208 		m_floats[2] *= s;
    209 #endif
    210 		return *this;
    211 	}
    212 
    213   /**@brief Inversely scale the vector
    214    * @param s Scale factor to divide by */
    215 	SIMD_FORCE_INLINE btVector3& operator/=(const btScalar& s)
    216 	{
    217 		btFullAssert(s != btScalar(0.0));
    218 
    219 #if 0 //defined(BT_USE_SSE_IN_API)
    220 // this code is not faster !
    221 		__m128 vs = _mm_load_ss(&s);
    222 		vs = _mm_div_ss(v1110, vs);
    223 		vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
    224 
    225 		mVec128 = _mm_mul_ps(mVec128, vs);
    226 
    227 		return *this;
    228 #else
    229 		return *this *= btScalar(1.0) / s;
    230 #endif
    231 	}
    232 
    233   /**@brief Return the dot product
    234    * @param v The other vector in the dot product */
    235 	SIMD_FORCE_INLINE btScalar dot(const btVector3& v) const
    236 	{
    237 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    238 		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
    239 		__m128 z = _mm_movehl_ps(vd, vd);
    240 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
    241 		vd = _mm_add_ss(vd, y);
    242 		vd = _mm_add_ss(vd, z);
    243 		return _mm_cvtss_f32(vd);
    244 #elif defined(BT_USE_NEON)
    245 		float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
    246 		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
    247 		x = vadd_f32(x, vget_high_f32(vd));
    248 		return vget_lane_f32(x, 0);
    249 #else
    250 		return	m_floats[0] * v.m_floats[0] +
    251 				m_floats[1] * v.m_floats[1] +
    252 				m_floats[2] * v.m_floats[2];
    253 #endif
    254 	}
    255 
    256   /**@brief Return the length of the vector squared */
    257 	SIMD_FORCE_INLINE btScalar length2() const
    258 	{
    259 		return dot(*this);
    260 	}
    261 
    262   /**@brief Return the length of the vector */
    263 	SIMD_FORCE_INLINE btScalar length() const
    264 	{
    265 		return btSqrt(length2());
    266 	}
    267 
    268 	/**@brief Return the norm (length) of the vector */
    269 	SIMD_FORCE_INLINE btScalar norm() const
    270 	{
    271 		return length();
    272 	}
    273 
    274   /**@brief Return the distance squared between the ends of this and another vector
    275    * This is symantically treating the vector like a point */
    276 	SIMD_FORCE_INLINE btScalar distance2(const btVector3& v) const;
    277 
    278   /**@brief Return the distance between the ends of this and another vector
    279    * This is symantically treating the vector like a point */
    280 	SIMD_FORCE_INLINE btScalar distance(const btVector3& v) const;
    281 
    282 	SIMD_FORCE_INLINE btVector3& safeNormalize()
    283 	{
    284 		btVector3 absVec = this->absolute();
    285 		int maxIndex = absVec.maxAxis();
    286 		if (absVec[maxIndex]>0)
    287 		{
    288 			*this /= absVec[maxIndex];
    289 			return *this /= length();
    290 		}
    291 		setValue(1,0,0);
    292 		return *this;
    293 	}
    294 
    295   /**@brief Normalize this vector
    296    * x^2 + y^2 + z^2 = 1 */
    297 	SIMD_FORCE_INLINE btVector3& normalize()
    298 	{
    299 
    300 		btAssert(!fuzzyZero());
    301 
    302 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    303         // dot product first
    304 		__m128 vd = _mm_mul_ps(mVec128, mVec128);
    305 		__m128 z = _mm_movehl_ps(vd, vd);
    306 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
    307 		vd = _mm_add_ss(vd, y);
    308 		vd = _mm_add_ss(vd, z);
    309 
    310         #if 0
    311         vd = _mm_sqrt_ss(vd);
    312 		vd = _mm_div_ss(v1110, vd);
    313 		vd = bt_splat_ps(vd, 0x80);
    314 		mVec128 = _mm_mul_ps(mVec128, vd);
    315         #else
    316 
    317         // NR step 1/sqrt(x) - vd is x, y is output
    318         y = _mm_rsqrt_ss(vd); // estimate
    319 
    320         //  one step NR
    321         z = v1_5;
    322         vd = _mm_mul_ss(vd, vHalf); // vd * 0.5
    323         //x2 = vd;
    324         vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0
    325         vd = _mm_mul_ss(vd, y); // vd * 0.5 * y0 * y0
    326         z = _mm_sub_ss(z, vd);  // 1.5 - vd * 0.5 * y0 * y0
    327 
    328         y = _mm_mul_ss(y, z);   // y0 * (1.5 - vd * 0.5 * y0 * y0)
    329 
    330 		y = bt_splat_ps(y, 0x80);
    331 		mVec128 = _mm_mul_ps(mVec128, y);
    332 
    333         #endif
    334 
    335 
    336 		return *this;
    337 #else
    338 		return *this /= length();
    339 #endif
    340 	}
    341 
    342   /**@brief Return a normalized version of this vector */
    343 	SIMD_FORCE_INLINE btVector3 normalized() const;
    344 
    345   /**@brief Return a rotated version of this vector
    346    * @param wAxis The axis to rotate about
    347    * @param angle The angle to rotate by */
    348 	SIMD_FORCE_INLINE btVector3 rotate( const btVector3& wAxis, const btScalar angle ) const;
    349 
    350   /**@brief Return the angle between this and another vector
    351    * @param v The other vector */
    352 	SIMD_FORCE_INLINE btScalar angle(const btVector3& v) const
    353 	{
    354 		btScalar s = btSqrt(length2() * v.length2());
    355 		btFullAssert(s != btScalar(0.0));
    356 		return btAcos(dot(v) / s);
    357 	}
    358 
    359   /**@brief Return a vector will the absolute values of each element */
    360 	SIMD_FORCE_INLINE btVector3 absolute() const
    361 	{
    362 
    363 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    364 		return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
    365 #elif defined(BT_USE_NEON)
    366 		return btVector3(vabsq_f32(mVec128));
    367 #else
    368 		return btVector3(
    369 			btFabs(m_floats[0]),
    370 			btFabs(m_floats[1]),
    371 			btFabs(m_floats[2]));
    372 #endif
    373 	}
    374 
    375   /**@brief Return the cross product between this and another vector
    376    * @param v The other vector */
    377 	SIMD_FORCE_INLINE btVector3 cross(const btVector3& v) const
    378 	{
    379 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    380 		__m128	T, V;
    381 
    382 		T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
    383 		V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
    384 
    385 		V = _mm_mul_ps(V, mVec128);
    386 		T = _mm_mul_ps(T, v.mVec128);
    387 		V = _mm_sub_ps(V, T);
    388 
    389 		V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
    390 		return btVector3(V);
    391 #elif defined(BT_USE_NEON)
    392 		float32x4_t T, V;
    393 		// form (Y, Z, X, _) of mVec128 and v.mVec128
    394 		float32x2_t Tlow = vget_low_f32(mVec128);
    395 		float32x2_t Vlow = vget_low_f32(v.mVec128);
    396 		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
    397 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
    398 
    399 		V = vmulq_f32(V, mVec128);
    400 		T = vmulq_f32(T, v.mVec128);
    401 		V = vsubq_f32(V, T);
    402 		Vlow = vget_low_f32(V);
    403 		// form (Y, Z, X, _);
    404 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
    405 		V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);
    406 
    407 		return btVector3(V);
    408 #else
    409 		return btVector3(
    410 			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
    411 			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
    412 			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
    413 #endif
    414 	}
    415 
    416 	SIMD_FORCE_INLINE btScalar triple(const btVector3& v1, const btVector3& v2) const
    417 	{
    418 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    419 		// cross:
    420 		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
    421 		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));	//	(Y Z X 0)
    422 
    423 		V = _mm_mul_ps(V, v1.mVec128);
    424 		T = _mm_mul_ps(T, v2.mVec128);
    425 		V = _mm_sub_ps(V, T);
    426 
    427 		V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
    428 
    429 		// dot:
    430 		V = _mm_mul_ps(V, mVec128);
    431 		__m128 z = _mm_movehl_ps(V, V);
    432 		__m128 y = _mm_shuffle_ps(V, V, 0x55);
    433 		V = _mm_add_ss(V, y);
    434 		V = _mm_add_ss(V, z);
    435 		return _mm_cvtss_f32(V);
    436 
    437 #elif defined(BT_USE_NEON)
    438 		// cross:
    439 		float32x4_t T, V;
    440 		// form (Y, Z, X, _) of mVec128 and v.mVec128
    441 		float32x2_t Tlow = vget_low_f32(v1.mVec128);
    442 		float32x2_t Vlow = vget_low_f32(v2.mVec128);
    443 		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
    444 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
    445 
    446 		V = vmulq_f32(V, v1.mVec128);
    447 		T = vmulq_f32(T, v2.mVec128);
    448 		V = vsubq_f32(V, T);
    449 		Vlow = vget_low_f32(V);
    450 		// form (Y, Z, X, _);
    451 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
    452 
    453 		// dot:
    454 		V = vmulq_f32(mVec128, V);
    455 		float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
    456 		x = vadd_f32(x, vget_high_f32(V));
    457 		return vget_lane_f32(x, 0);
    458 #else
    459 		return
    460 			m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
    461 			m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
    462 			m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
    463 #endif
    464 	}
    465 
    466   /**@brief Return the axis with the smallest value
    467    * Note return values are 0,1,2 for x, y, or z */
    468 	SIMD_FORCE_INLINE int minAxis() const
    469 	{
    470 		return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2);
    471 	}
    472 
    473   /**@brief Return the axis with the largest value
    474    * Note return values are 0,1,2 for x, y, or z */
    475 	SIMD_FORCE_INLINE int maxAxis() const
    476 	{
    477 		return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0);
    478 	}
    479 
    480 	SIMD_FORCE_INLINE int furthestAxis() const
    481 	{
    482 		return absolute().minAxis();
    483 	}
    484 
    485 	SIMD_FORCE_INLINE int closestAxis() const
    486 	{
    487 		return absolute().maxAxis();
    488 	}
    489 
    490 
    491 	SIMD_FORCE_INLINE void setInterpolate3(const btVector3& v0, const btVector3& v1, btScalar rt)
    492 	{
    493 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    494 		__m128	vrt = _mm_load_ss(&rt);	//	(rt 0 0 0)
    495 		btScalar s = btScalar(1.0) - rt;
    496 		__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
    497 		vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
    498 		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
    499 		vrt = bt_pshufd_ps(vrt, 0x80);	//	(rt rt rt 0.0)
    500 		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
    501 		__m128 tmp3 = _mm_add_ps(r0,r1);
    502 		mVec128 = tmp3;
    503 #elif defined(BT_USE_NEON)
    504 		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
    505 		vl = vmulq_n_f32(vl, rt);
    506 		mVec128 = vaddq_f32(vl, v0.mVec128);
    507 #else
    508 		btScalar s = btScalar(1.0) - rt;
    509 		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
    510 		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
    511 		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
    512 		//don't do the unused w component
    513 		//		m_co[3] = s * v0[3] + rt * v1[3];
    514 #endif
    515 	}
    516 
    517   /**@brief Return the linear interpolation between this and another vector
    518    * @param v The other vector
    519    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
    520 	SIMD_FORCE_INLINE btVector3 lerp(const btVector3& v, const btScalar& t) const
    521 	{
    522 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    523 		__m128	vt = _mm_load_ss(&t);	//	(t 0 0 0)
    524 		vt = bt_pshufd_ps(vt, 0x80);	//	(rt rt rt 0.0)
    525 		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
    526 		vl = _mm_mul_ps(vl, vt);
    527 		vl = _mm_add_ps(vl, mVec128);
    528 
    529 		return btVector3(vl);
    530 #elif defined(BT_USE_NEON)
    531 		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
    532 		vl = vmulq_n_f32(vl, t);
    533 		vl = vaddq_f32(vl, mVec128);
    534 
    535 		return btVector3(vl);
    536 #else
    537 		return
    538 			btVector3(	m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
    539 						m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
    540 						m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
    541 #endif
    542 	}
    543 
    544   /**@brief Elementwise multiply this vector by the other
    545    * @param v The other vector */
    546 	SIMD_FORCE_INLINE btVector3& operator*=(const btVector3& v)
    547 	{
    548 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    549 		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
    550 #elif defined(BT_USE_NEON)
    551 		mVec128 = vmulq_f32(mVec128, v.mVec128);
    552 #else
    553 		m_floats[0] *= v.m_floats[0];
    554 		m_floats[1] *= v.m_floats[1];
    555 		m_floats[2] *= v.m_floats[2];
    556 #endif
    557 		return *this;
    558 	}
    559 
    560 	 /**@brief Return the x value */
    561 		SIMD_FORCE_INLINE const btScalar& getX() const { return m_floats[0]; }
    562   /**@brief Return the y value */
    563 		SIMD_FORCE_INLINE const btScalar& getY() const { return m_floats[1]; }
    564   /**@brief Return the z value */
    565 		SIMD_FORCE_INLINE const btScalar& getZ() const { return m_floats[2]; }
    566   /**@brief Set the x value */
    567 		SIMD_FORCE_INLINE void	setX(btScalar _x) { m_floats[0] = _x;};
    568   /**@brief Set the y value */
    569 		SIMD_FORCE_INLINE void	setY(btScalar _y) { m_floats[1] = _y;};
    570   /**@brief Set the z value */
    571 		SIMD_FORCE_INLINE void	setZ(btScalar _z) { m_floats[2] = _z;};
    572   /**@brief Set the w value */
    573 		SIMD_FORCE_INLINE void	setW(btScalar _w) { m_floats[3] = _w;};
    574   /**@brief Return the x value */
    575 		SIMD_FORCE_INLINE const btScalar& x() const { return m_floats[0]; }
    576   /**@brief Return the y value */
    577 		SIMD_FORCE_INLINE const btScalar& y() const { return m_floats[1]; }
    578   /**@brief Return the z value */
    579 		SIMD_FORCE_INLINE const btScalar& z() const { return m_floats[2]; }
    580   /**@brief Return the w value */
    581 		SIMD_FORCE_INLINE const btScalar& w() const { return m_floats[3]; }
    582 
    583 	//SIMD_FORCE_INLINE btScalar&       operator[](int i)       { return (&m_floats[0])[i];	}
    584 	//SIMD_FORCE_INLINE const btScalar& operator[](int i) const { return (&m_floats[0])[i]; }
    585 	///operator btScalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
    586 	SIMD_FORCE_INLINE	operator       btScalar *()       { return &m_floats[0]; }
    587 	SIMD_FORCE_INLINE	operator const btScalar *() const { return &m_floats[0]; }
    588 
    589 	SIMD_FORCE_INLINE	bool	operator==(const btVector3& other) const
    590 	{
    591 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    592         return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
    593 #else
    594 		return ((m_floats[3]==other.m_floats[3]) &&
    595                 (m_floats[2]==other.m_floats[2]) &&
    596                 (m_floats[1]==other.m_floats[1]) &&
    597                 (m_floats[0]==other.m_floats[0]));
    598 #endif
    599 	}
    600 
    601 	SIMD_FORCE_INLINE	bool	operator!=(const btVector3& other) const
    602 	{
    603 		return !(*this == other);
    604 	}
    605 
    606   /**@brief Set each element to the max of the current values and the values of another btVector3
    607    * @param other The other btVector3 to compare with
    608    */
    609 	SIMD_FORCE_INLINE void	setMax(const btVector3& other)
    610 	{
    611 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    612 		mVec128 = _mm_max_ps(mVec128, other.mVec128);
    613 #elif defined(BT_USE_NEON)
    614 		mVec128 = vmaxq_f32(mVec128, other.mVec128);
    615 #else
    616 		btSetMax(m_floats[0], other.m_floats[0]);
    617 		btSetMax(m_floats[1], other.m_floats[1]);
    618 		btSetMax(m_floats[2], other.m_floats[2]);
    619 		btSetMax(m_floats[3], other.w());
    620 #endif
    621 	}
    622 
    623   /**@brief Set each element to the min of the current values and the values of another btVector3
    624    * @param other The other btVector3 to compare with
    625    */
    626 	SIMD_FORCE_INLINE void	setMin(const btVector3& other)
    627 	{
    628 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    629 		mVec128 = _mm_min_ps(mVec128, other.mVec128);
    630 #elif defined(BT_USE_NEON)
    631 		mVec128 = vminq_f32(mVec128, other.mVec128);
    632 #else
    633 		btSetMin(m_floats[0], other.m_floats[0]);
    634 		btSetMin(m_floats[1], other.m_floats[1]);
    635 		btSetMin(m_floats[2], other.m_floats[2]);
    636 		btSetMin(m_floats[3], other.w());
    637 #endif
    638 	}
    639 
    640 	SIMD_FORCE_INLINE void 	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z)
    641 	{
    642 		m_floats[0]=_x;
    643 		m_floats[1]=_y;
    644 		m_floats[2]=_z;
    645 		m_floats[3] = btScalar(0.f);
    646 	}
    647 
    648 	void	getSkewSymmetricMatrix(btVector3* v0,btVector3* v1,btVector3* v2) const
    649 	{
    650 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    651 
    652 		__m128 V  = _mm_and_ps(mVec128, btvFFF0fMask);
    653 		__m128 V0 = _mm_xor_ps(btvMzeroMask, V);
    654 		__m128 V2 = _mm_movelh_ps(V0, V);
    655 
    656 		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
    657 
    658         V0 = _mm_shuffle_ps(V0, V, 0xDB);
    659 		V2 = _mm_shuffle_ps(V2, V, 0xF9);
    660 
    661 		v0->mVec128 = V0;
    662 		v1->mVec128 = V1;
    663 		v2->mVec128 = V2;
    664 #else
    665 		v0->setValue(0.		,-z()		,y());
    666 		v1->setValue(z()	,0.			,-x());
    667 		v2->setValue(-y()	,x()	,0.);
    668 #endif
    669 	}
    670 
    671 	void setZero()
    672 	{
    673 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    674 		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
    675 #elif defined(BT_USE_NEON)
    676 		int32x4_t vi = vdupq_n_s32(0);
    677 		mVec128 = vreinterpretq_f32_s32(vi);
    678 #else
    679 		setValue(btScalar(0.),btScalar(0.),btScalar(0.));
    680 #endif
    681 	}
    682 
    683 	SIMD_FORCE_INLINE bool isZero() const
    684 	{
    685 		return m_floats[0] == btScalar(0) && m_floats[1] == btScalar(0) && m_floats[2] == btScalar(0);
    686 	}
    687 
    688 
    689 	SIMD_FORCE_INLINE bool fuzzyZero() const
    690 	{
    691 		return length2() < SIMD_EPSILON*SIMD_EPSILON;
    692 	}
    693 
    694 	SIMD_FORCE_INLINE	void	serialize(struct	btVector3Data& dataOut) const;
    695 
    696 	SIMD_FORCE_INLINE	void	deSerialize(const struct	btVector3Data& dataIn);
    697 
    698 	SIMD_FORCE_INLINE	void	serializeFloat(struct	btVector3FloatData& dataOut) const;
    699 
    700 	SIMD_FORCE_INLINE	void	deSerializeFloat(const struct	btVector3FloatData& dataIn);
    701 
    702 	SIMD_FORCE_INLINE	void	serializeDouble(struct	btVector3DoubleData& dataOut) const;
    703 
    704 	SIMD_FORCE_INLINE	void	deSerializeDouble(const struct	btVector3DoubleData& dataIn);
    705 
    706         /**@brief returns index of maximum dot product between this and vectors in array[]
    707          * @param array The other vectors
    708          * @param array_count The number of other vectors
    709          * @param dotOut The maximum dot product */
    710         SIMD_FORCE_INLINE   long    maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const;
    711 
    712         /**@brief returns index of minimum dot product between this and vectors in array[]
    713          * @param array The other vectors
    714          * @param array_count The number of other vectors
    715          * @param dotOut The minimum dot product */
    716         SIMD_FORCE_INLINE   long    minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const;
    717 
    718     /* create a vector as  btVector3( this->dot( btVector3 v0 ), this->dot( btVector3 v1), this->dot( btVector3 v2 ))  */
    719     SIMD_FORCE_INLINE btVector3  dot3( const btVector3 &v0, const btVector3 &v1, const btVector3 &v2 ) const
    720     {
    721 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    722 
    723         __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
    724         __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
    725         __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
    726         __m128 b0 = _mm_unpacklo_ps( a0, a1 );
    727         __m128 b1 = _mm_unpackhi_ps( a0, a1 );
    728         __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
    729         __m128 r = _mm_movelh_ps( b0, b2 );
    730         r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
    731         a2 = _mm_and_ps( a2, btvxyzMaskf);
    732         r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
    733         return btVector3(r);
    734 
    735 #elif defined(BT_USE_NEON)
    736         static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
    737         float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
    738         float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
    739         float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
    740         float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
    741         a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
    742         float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
    743         float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
    744         return btVector3( vcombine_f32(b0, b1) );
    745 #else
    746 		return btVector3( dot(v0), dot(v1), dot(v2));
    747 #endif
    748     }
    749 };
    750 
    751 /**@brief Return the sum of two vectors (Point symantics)*/
    752 SIMD_FORCE_INLINE btVector3
    753 operator+(const btVector3& v1, const btVector3& v2)
    754 {
    755 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    756 	return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
    757 #elif defined(BT_USE_NEON)
    758 	return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
    759 #else
    760 	return btVector3(
    761 			v1.m_floats[0] + v2.m_floats[0],
    762 			v1.m_floats[1] + v2.m_floats[1],
    763 			v1.m_floats[2] + v2.m_floats[2]);
    764 #endif
    765 }
    766 
    767 /**@brief Return the elementwise product of two vectors */
    768 SIMD_FORCE_INLINE btVector3
    769 operator*(const btVector3& v1, const btVector3& v2)
    770 {
    771 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    772 	return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
    773 #elif defined(BT_USE_NEON)
    774 	return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
    775 #else
    776 	return btVector3(
    777 			v1.m_floats[0] * v2.m_floats[0],
    778 			v1.m_floats[1] * v2.m_floats[1],
    779 			v1.m_floats[2] * v2.m_floats[2]);
    780 #endif
    781 }
    782 
    783 /**@brief Return the difference between two vectors */
    784 SIMD_FORCE_INLINE btVector3
    785 operator-(const btVector3& v1, const btVector3& v2)
    786 {
    787 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)  && defined(BT_USE_SSE))
    788 
    789 	//	without _mm_and_ps this code causes slowdown in Concave moving
    790 	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
    791 	return btVector3(_mm_and_ps(r, btvFFF0fMask));
    792 #elif defined(BT_USE_NEON)
    793 	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
    794 	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
    795 #else
    796 	return btVector3(
    797 			v1.m_floats[0] - v2.m_floats[0],
    798 			v1.m_floats[1] - v2.m_floats[1],
    799 			v1.m_floats[2] - v2.m_floats[2]);
    800 #endif
    801 }
    802 
    803 /**@brief Return the negative of the vector */
    804 SIMD_FORCE_INLINE btVector3
    805 operator-(const btVector3& v)
    806 {
    807 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
    808 	__m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
    809 	return btVector3(_mm_and_ps(r, btvFFF0fMask));
    810 #elif defined(BT_USE_NEON)
    811 	return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
    812 #else
    813 	return btVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
    814 #endif
    815 }
    816 
    817 /**@brief Return the vector scaled by s */
    818 SIMD_FORCE_INLINE btVector3
    819 operator*(const btVector3& v, const btScalar& s)
    820 {
    821 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    822 	__m128	vs = _mm_load_ss(&s);	//	(S 0 0 0)
    823 	vs = bt_pshufd_ps(vs, 0x80);	//	(S S S 0.0)
    824 	return btVector3(_mm_mul_ps(v.mVec128, vs));
    825 #elif defined(BT_USE_NEON)
    826 	float32x4_t r = vmulq_n_f32(v.mVec128, s);
    827 	return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
    828 #else
    829 	return btVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
    830 #endif
    831 }
    832 
    833 /**@brief Return the vector scaled by s */
    834 SIMD_FORCE_INLINE btVector3
    835 operator*(const btScalar& s, const btVector3& v)
    836 {
    837 	return v * s;
    838 }
    839 
    840 /**@brief Return the vector inversely scaled by s */
    841 SIMD_FORCE_INLINE btVector3
    842 operator/(const btVector3& v, const btScalar& s)
    843 {
    844 	btFullAssert(s != btScalar(0.0));
    845 #if 0 //defined(BT_USE_SSE_IN_API)
    846 // this code is not faster !
    847 	__m128 vs = _mm_load_ss(&s);
    848     vs = _mm_div_ss(v1110, vs);
    849 	vs = bt_pshufd_ps(vs, 0x00);	//	(S S S S)
    850 
    851 	return btVector3(_mm_mul_ps(v.mVec128, vs));
    852 #else
    853 	return v * (btScalar(1.0) / s);
    854 #endif
    855 }
    856 
    857 /**@brief Return the vector inversely scaled by s */
    858 SIMD_FORCE_INLINE btVector3
    859 operator/(const btVector3& v1, const btVector3& v2)
    860 {
    861 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
    862 	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
    863 	vec = _mm_and_ps(vec, btvFFF0fMask);
    864 	return btVector3(vec);
    865 #elif defined(BT_USE_NEON)
    866 	float32x4_t x, y, v, m;
    867 
    868 	x = v1.mVec128;
    869 	y = v2.mVec128;
    870 
    871 	v = vrecpeq_f32(y);			// v ~ 1/y
    872 	m = vrecpsq_f32(y, v);		// m = (2-v*y)
    873 	v = vmulq_f32(v, m);		// vv = v*m ~~ 1/y
    874 	m = vrecpsq_f32(y, v);		// mm = (2-vv*y)
    875 	v = vmulq_f32(v, x);		// x*vv
    876 	v = vmulq_f32(v, m);		// (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
    877 
    878 	return btVector3(v);
    879 #else
    880 	return btVector3(
    881 			v1.m_floats[0] / v2.m_floats[0],
    882 			v1.m_floats[1] / v2.m_floats[1],
    883 			v1.m_floats[2] / v2.m_floats[2]);
    884 #endif
    885 }
    886 
    887 /**@brief Return the dot product between two vectors */
    888 SIMD_FORCE_INLINE btScalar
    889 btDot(const btVector3& v1, const btVector3& v2)
    890 {
    891 	return v1.dot(v2);
    892 }
    893 
    894 
    895 /**@brief Return the distance squared between two vectors */
    896 SIMD_FORCE_INLINE btScalar
    897 btDistance2(const btVector3& v1, const btVector3& v2)
    898 {
    899 	return v1.distance2(v2);
    900 }
    901 
    902 
    903 /**@brief Return the distance between two vectors */
    904 SIMD_FORCE_INLINE btScalar
    905 btDistance(const btVector3& v1, const btVector3& v2)
    906 {
    907 	return v1.distance(v2);
    908 }
    909 
    910 /**@brief Return the angle between two vectors */
    911 SIMD_FORCE_INLINE btScalar
    912 btAngle(const btVector3& v1, const btVector3& v2)
    913 {
    914 	return v1.angle(v2);
    915 }
    916 
    917 /**@brief Return the cross product of two vectors */
    918 SIMD_FORCE_INLINE btVector3
    919 btCross(const btVector3& v1, const btVector3& v2)
    920 {
    921 	return v1.cross(v2);
    922 }
    923 
    924 SIMD_FORCE_INLINE btScalar
    925 btTriple(const btVector3& v1, const btVector3& v2, const btVector3& v3)
    926 {
    927 	return v1.triple(v2, v3);
    928 }
    929 
    930 /**@brief Return the linear interpolation between two vectors
    931  * @param v1 One vector
    932  * @param v2 The other vector
    933  * @param t The ration of this to v (t = 0 => return v1, t=1 => return v2) */
    934 SIMD_FORCE_INLINE btVector3
    935 lerp(const btVector3& v1, const btVector3& v2, const btScalar& t)
    936 {
    937 	return v1.lerp(v2, t);
    938 }
    939 
    940 
    941 
    942 SIMD_FORCE_INLINE btScalar btVector3::distance2(const btVector3& v) const
    943 {
    944 	return (v - *this).length2();
    945 }
    946 
    947 SIMD_FORCE_INLINE btScalar btVector3::distance(const btVector3& v) const
    948 {
    949 	return (v - *this).length();
    950 }
    951 
    952 SIMD_FORCE_INLINE btVector3 btVector3::normalized() const
    953 {
    954 	btVector3 nrm = *this;
    955 
    956 	return nrm.normalize();
    957 }
    958 
    959 SIMD_FORCE_INLINE btVector3 btVector3::rotate( const btVector3& wAxis, const btScalar _angle ) const
    960 {
    961 	// wAxis must be a unit lenght vector
    962 
    963 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
    964 
    965     __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
    966 	btScalar ssin = btSin( _angle );
    967     __m128 C = wAxis.cross( mVec128 ).mVec128;
    968 	O = _mm_and_ps(O, btvFFF0fMask);
    969     btScalar scos = btCos( _angle );
    970 
    971 	__m128 vsin = _mm_load_ss(&ssin);	//	(S 0 0 0)
    972     __m128 vcos = _mm_load_ss(&scos);	//	(S 0 0 0)
    973 
    974 	__m128 Y = bt_pshufd_ps(O, 0xC9);	//	(Y Z X 0)
    975 	__m128 Z = bt_pshufd_ps(O, 0xD2);	//	(Z X Y 0)
    976 	O = _mm_add_ps(O, Y);
    977 	vsin = bt_pshufd_ps(vsin, 0x80);	//	(S S S 0)
    978 	O = _mm_add_ps(O, Z);
    979     vcos = bt_pshufd_ps(vcos, 0x80);	//	(S S S 0)
    980 
    981     vsin = vsin * C;
    982 	O = O * wAxis.mVec128;
    983 	__m128 X = mVec128 - O;
    984 
    985     O = O + vsin;
    986 	vcos = vcos * X;
    987 	O = O + vcos;
    988 
    989 	return btVector3(O);
    990 #else
    991 	btVector3 o = wAxis * wAxis.dot( *this );
    992 	btVector3 _x = *this - o;
    993 	btVector3 _y;
    994 
    995 	_y = wAxis.cross( *this );
    996 
    997 	return ( o + _x * btCos( _angle ) + _y * btSin( _angle ) );
    998 #endif
    999 }
   1000 
   1001 SIMD_FORCE_INLINE   long    btVector3::maxDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
   1002 {
   1003 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
   1004     #if defined _WIN32 || defined (BT_USE_SSE)
   1005         const long scalar_cutoff = 10;
   1006         long _maxdot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
   1007     #elif defined BT_USE_NEON
   1008         const long scalar_cutoff = 4;
   1009         extern long (*_maxdot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
   1010     #endif
   1011     if( array_count < scalar_cutoff )
   1012 #endif
   1013     {
   1014         btScalar maxDot1 = -SIMD_INFINITY;
   1015         int i = 0;
   1016         int ptIndex = -1;
   1017         for( i = 0; i < array_count; i++ )
   1018         {
   1019             btScalar dot = array[i].dot(*this);
   1020 
   1021             if( dot > maxDot1 )
   1022             {
   1023                 maxDot1 = dot;
   1024                 ptIndex = i;
   1025             }
   1026         }
   1027 
   1028         dotOut = maxDot1;
   1029         return ptIndex;
   1030     }
   1031 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
   1032     return _maxdot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
   1033 #endif
   1034 }
   1035 
   1036 SIMD_FORCE_INLINE   long    btVector3::minDot( const btVector3 *array, long array_count, btScalar &dotOut ) const
   1037 {
   1038 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
   1039     #if defined BT_USE_SSE
   1040         const long scalar_cutoff = 10;
   1041         long _mindot_large( const float *array, const float *vec, unsigned long array_count, float *dotOut );
   1042     #elif defined BT_USE_NEON
   1043         const long scalar_cutoff = 4;
   1044         extern long (*_mindot_large)( const float *array, const float *vec, unsigned long array_count, float *dotOut );
   1045     #else
   1046         #error unhandled arch!
   1047     #endif
   1048 
   1049     if( array_count < scalar_cutoff )
   1050 #endif
   1051     {
   1052         btScalar  minDot = SIMD_INFINITY;
   1053         int i = 0;
   1054         int ptIndex = -1;
   1055 
   1056         for( i = 0; i < array_count; i++ )
   1057         {
   1058             btScalar dot = array[i].dot(*this);
   1059 
   1060             if( dot < minDot )
   1061             {
   1062                 minDot = dot;
   1063                 ptIndex = i;
   1064             }
   1065         }
   1066 
   1067         dotOut = minDot;
   1068 
   1069         return ptIndex;
   1070     }
   1071 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
   1072     return _mindot_large( (float*) array, (float*) &m_floats[0], array_count, &dotOut );
   1073 #endif//BT_USE_SIMD_VECTOR3
   1074 }
   1075 
   1076 
   1077 class btVector4 : public btVector3
   1078 {
   1079 public:
   1080 
   1081 	SIMD_FORCE_INLINE btVector4() {}
   1082 
   1083 
   1084 	SIMD_FORCE_INLINE btVector4(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
   1085 		: btVector3(_x,_y,_z)
   1086 	{
   1087 		m_floats[3] = _w;
   1088 	}
   1089 
   1090 #if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON)
   1091 	SIMD_FORCE_INLINE btVector4(const btSimdFloat4 vec)
   1092 	{
   1093 		mVec128 = vec;
   1094 	}
   1095 
   1096 	SIMD_FORCE_INLINE btVector4(const btVector3& rhs)
   1097 	{
   1098 		mVec128 = rhs.mVec128;
   1099 	}
   1100 
   1101 	SIMD_FORCE_INLINE btVector4&
   1102 	operator=(const btVector4& v)
   1103 	{
   1104 		mVec128 = v.mVec128;
   1105 		return *this;
   1106 	}
   1107 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
   1108 
   1109 	SIMD_FORCE_INLINE btVector4 absolute4() const
   1110 	{
   1111 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
   1112 		return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
   1113 #elif defined(BT_USE_NEON)
   1114 		return btVector4(vabsq_f32(mVec128));
   1115 #else
   1116 		return btVector4(
   1117 			btFabs(m_floats[0]),
   1118 			btFabs(m_floats[1]),
   1119 			btFabs(m_floats[2]),
   1120 			btFabs(m_floats[3]));
   1121 #endif
   1122 	}
   1123 
   1124 
   1125 	btScalar	getW() const { return m_floats[3];}
   1126 
   1127 
   1128 		SIMD_FORCE_INLINE int maxAxis4() const
   1129 	{
   1130 		int maxIndex = -1;
   1131 		btScalar maxVal = btScalar(-BT_LARGE_FLOAT);
   1132 		if (m_floats[0] > maxVal)
   1133 		{
   1134 			maxIndex = 0;
   1135 			maxVal = m_floats[0];
   1136 		}
   1137 		if (m_floats[1] > maxVal)
   1138 		{
   1139 			maxIndex = 1;
   1140 			maxVal = m_floats[1];
   1141 		}
   1142 		if (m_floats[2] > maxVal)
   1143 		{
   1144 			maxIndex = 2;
   1145 			maxVal =m_floats[2];
   1146 		}
   1147 		if (m_floats[3] > maxVal)
   1148 		{
   1149 			maxIndex = 3;
   1150 			maxVal = m_floats[3];
   1151 		}
   1152 
   1153 		return maxIndex;
   1154 	}
   1155 
   1156 
   1157 	SIMD_FORCE_INLINE int minAxis4() const
   1158 	{
   1159 		int minIndex = -1;
   1160 		btScalar minVal = btScalar(BT_LARGE_FLOAT);
   1161 		if (m_floats[0] < minVal)
   1162 		{
   1163 			minIndex = 0;
   1164 			minVal = m_floats[0];
   1165 		}
   1166 		if (m_floats[1] < minVal)
   1167 		{
   1168 			minIndex = 1;
   1169 			minVal = m_floats[1];
   1170 		}
   1171 		if (m_floats[2] < minVal)
   1172 		{
   1173 			minIndex = 2;
   1174 			minVal =m_floats[2];
   1175 		}
   1176 		if (m_floats[3] < minVal)
   1177 		{
   1178 			minIndex = 3;
   1179 			minVal = m_floats[3];
   1180 		}
   1181 
   1182 		return minIndex;
   1183 	}
   1184 
   1185 
   1186 	SIMD_FORCE_INLINE int closestAxis4() const
   1187 	{
   1188 		return absolute4().maxAxis4();
   1189 	}
   1190 
   1191 
   1192 
   1193 
   1194   /**@brief Set x,y,z and zero w
   1195    * @param x Value of x
   1196    * @param y Value of y
   1197    * @param z Value of z
   1198    */
   1199 
   1200 
   1201 /*		void getValue(btScalar *m) const
   1202 		{
   1203 			m[0] = m_floats[0];
   1204 			m[1] = m_floats[1];
   1205 			m[2] =m_floats[2];
   1206 		}
   1207 */
   1208 /**@brief Set the values
   1209    * @param x Value of x
   1210    * @param y Value of y
   1211    * @param z Value of z
   1212    * @param w Value of w
   1213    */
   1214 		SIMD_FORCE_INLINE void	setValue(const btScalar& _x, const btScalar& _y, const btScalar& _z,const btScalar& _w)
   1215 		{
   1216 			m_floats[0]=_x;
   1217 			m_floats[1]=_y;
   1218 			m_floats[2]=_z;
   1219 			m_floats[3]=_w;
   1220 		}
   1221 
   1222 
   1223 };
   1224 
   1225 
   1226 ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
   1227 SIMD_FORCE_INLINE void	btSwapScalarEndian(const btScalar& sourceVal, btScalar& destVal)
   1228 {
   1229 	#ifdef BT_USE_DOUBLE_PRECISION
   1230 	unsigned char* dest = (unsigned char*) &destVal;
   1231 	unsigned char* src  = (unsigned char*) &sourceVal;
   1232 	dest[0] = src[7];
   1233     dest[1] = src[6];
   1234     dest[2] = src[5];
   1235     dest[3] = src[4];
   1236     dest[4] = src[3];
   1237     dest[5] = src[2];
   1238     dest[6] = src[1];
   1239     dest[7] = src[0];
   1240 #else
   1241 	unsigned char* dest = (unsigned char*) &destVal;
   1242 	unsigned char* src  = (unsigned char*) &sourceVal;
   1243 	dest[0] = src[3];
   1244     dest[1] = src[2];
   1245     dest[2] = src[1];
   1246     dest[3] = src[0];
   1247 #endif //BT_USE_DOUBLE_PRECISION
   1248 }
   1249 ///btSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
   1250 SIMD_FORCE_INLINE void	btSwapVector3Endian(const btVector3& sourceVec, btVector3& destVec)
   1251 {
   1252 	for (int i=0;i<4;i++)
   1253 	{
   1254 		btSwapScalarEndian(sourceVec[i],destVec[i]);
   1255 	}
   1256 
   1257 }
   1258 
   1259 ///btUnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
   1260 SIMD_FORCE_INLINE void	btUnSwapVector3Endian(btVector3& vector)
   1261 {
   1262 
   1263 	btVector3	swappedVec;
   1264 	for (int i=0;i<4;i++)
   1265 	{
   1266 		btSwapScalarEndian(vector[i],swappedVec[i]);
   1267 	}
   1268 	vector = swappedVec;
   1269 }
   1270 
   1271 template <class T>
   1272 SIMD_FORCE_INLINE void btPlaneSpace1 (const T& n, T& p, T& q)
   1273 {
   1274   if (btFabs(n[2]) > SIMDSQRT12) {
   1275     // choose p in y-z plane
   1276     btScalar a = n[1]*n[1] + n[2]*n[2];
   1277     btScalar k = btRecipSqrt (a);
   1278     p[0] = 0;
   1279 	p[1] = -n[2]*k;
   1280 	p[2] = n[1]*k;
   1281     // set q = n x p
   1282     q[0] = a*k;
   1283 	q[1] = -n[0]*p[2];
   1284 	q[2] = n[0]*p[1];
   1285   }
   1286   else {
   1287     // choose p in x-y plane
   1288     btScalar a = n[0]*n[0] + n[1]*n[1];
   1289     btScalar k = btRecipSqrt (a);
   1290     p[0] = -n[1]*k;
   1291 	p[1] = n[0]*k;
   1292 	p[2] = 0;
   1293     // set q = n x p
   1294     q[0] = -n[2]*p[1];
   1295 	q[1] = n[2]*p[0];
   1296 	q[2] = a*k;
   1297   }
   1298 }
   1299 
   1300 
   1301 struct	btVector3FloatData
   1302 {
   1303 	float	m_floats[4];
   1304 };
   1305 
   1306 struct	btVector3DoubleData
   1307 {
   1308 	double	m_floats[4];
   1309 
   1310 };
   1311 
   1312 SIMD_FORCE_INLINE	void	btVector3::serializeFloat(struct	btVector3FloatData& dataOut) const
   1313 {
   1314 	///could also do a memcpy, check if it is worth it
   1315 	for (int i=0;i<4;i++)
   1316 		dataOut.m_floats[i] = float(m_floats[i]);
   1317 }
   1318 
   1319 SIMD_FORCE_INLINE void	btVector3::deSerializeFloat(const struct	btVector3FloatData& dataIn)
   1320 {
   1321 	for (int i=0;i<4;i++)
   1322 		m_floats[i] = btScalar(dataIn.m_floats[i]);
   1323 }
   1324 
   1325 
   1326 SIMD_FORCE_INLINE	void	btVector3::serializeDouble(struct	btVector3DoubleData& dataOut) const
   1327 {
   1328 	///could also do a memcpy, check if it is worth it
   1329 	for (int i=0;i<4;i++)
   1330 		dataOut.m_floats[i] = double(m_floats[i]);
   1331 }
   1332 
   1333 SIMD_FORCE_INLINE void	btVector3::deSerializeDouble(const struct	btVector3DoubleData& dataIn)
   1334 {
   1335 	for (int i=0;i<4;i++)
   1336 		m_floats[i] = btScalar(dataIn.m_floats[i]);
   1337 }
   1338 
   1339 
   1340 SIMD_FORCE_INLINE	void	btVector3::serialize(struct	btVector3Data& dataOut) const
   1341 {
   1342 	///could also do a memcpy, check if it is worth it
   1343 	for (int i=0;i<4;i++)
   1344 		dataOut.m_floats[i] = m_floats[i];
   1345 }
   1346 
   1347 SIMD_FORCE_INLINE void	btVector3::deSerialize(const struct	btVector3Data& dataIn)
   1348 {
   1349 	for (int i=0;i<4;i++)
   1350 		m_floats[i] = dataIn.m_floats[i];
   1351 }
   1352 
   1353 #endif //BT_VECTOR3_H
   1354