I have some progress with converting SSE->VFP.
I can share the source code, but I still not verified it.
btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD:
{SSE}
Code: Select all
__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
__m128 deltaVel1Dotn = _mm_add_ps(_vmathVfDot3(c.m_contactNormal.mVec128,body1.m_deltaLinearVelocity.mVec128), _vmathVfDot3(c.m_relpos1CrossNormal.mVec128,body1.m_deltaAngularVelocity.mVec128));
__m128 deltaVel2Dotn = _mm_add_ps(_vmathVfDot3((-c.m_contactNormal).mVec128,body2.m_deltaLinearVelocity.mVec128) ,_vmathVfDot3(c.m_relpos2CrossNormal.mVec128,body2.m_deltaAngularVelocity.mVec128));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
btSimdScalar resultLowerLess,resultUpperLess;
resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
__m128 upperMinApplied = _mm_sub_ps(upperLimit1,cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied) );
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1) );
__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,_mm_set1_ps(body1.m_invMass));
__m128 linearComponentB = _mm_mul_ps((-c.m_contactNormal).mVec128,_mm_set1_ps(body2.m_invMass));
__m128 impulseMagnitude = deltaImpulse;
body1.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body1.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
body1.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body1.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
body2.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body2.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
body2.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body2.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
{VFP}
Code: Select all
float32x4_t cpAppliedImp=vdupq_n_f32(c.m_appliedImpulse);
float32x4_t lowerLimit1=vdupq_n_f32(c.m_lowerLimit);
float32x4_t upperLimit1=vdupq_n_f32(c.m_upperLimit);
float32x4_t deltaImpulse=vsubq_f32(vdupq_n_f32(c.m_rhs),vmulq_f32(vdupq_n_f32(c.m_appliedImpulse),vdupq_n_f32(c.m_cfm)));
float32x4_t deltaVel1Dotn=vaddq_f32(vmathVfDot3(c.m_contactNormal.getf32x4(),body1.m_deltaLinearVelocity.getf32x4()),
vmathVfDot3(c.m_relpos1CrossNormal.getf32x4(),body1.m_deltaAngularVelocity.getf32x4()));
float32x4_t deltaVel2Dotn=vaddq_f32(vmathVfDot3((-c.m_contactNormal).getf32x4(),body2.m_deltaLinearVelocity.getf32x4()),
vmathVfDot3(c.m_relpos2CrossNormal.getf32x4(),body2.m_deltaAngularVelocity.getf32x4()));
deltaImpulse=vsubq_f32(deltaImpulse,vmulq_f32(deltaVel1Dotn,vdupq_n_f32(c.m_jacDiagABInv)));
deltaImpulse=vsubq_f32(deltaImpulse,vmulq_f32(deltaVel2Dotn,vdupq_n_f32(c.m_jacDiagABInv)));
float32x4_t sum=vaddq_f32(cpAppliedImp,deltaImpulse);
uint32x4_t resultLowerLess=vcltq_f32(sum,lowerLimit1);
uint32x4_t resultUpperLess=vcltq_f32(sum,upperLimit1);
float32x4_t lowMinApplied=vsubq_f32(lowerLimit1,cpAppliedImp);
deltaImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultLowerLess,vreinterpretq_u32_f32(lowMinApplied)),
vbicq_u32(vreinterpretq_u32_f32(deltaImpulse),resultLowerLess)));
c.m_appliedImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultLowerLess,vreinterpretq_u32_f32(lowerLimit1)),
vbicq_u32(vreinterpretq_u32_f32(sum),resultLowerLess)));
float32x4_t upperMinApplied=vsubq_f32(upperLimit1,cpAppliedImp);
deltaImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultUpperLess,vreinterpretq_u32_f32(deltaImpulse)),
vbicq_u32(vreinterpretq_u32_f32(upperMinApplied),resultUpperLess)));
c.m_appliedImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultUpperLess,vreinterpretq_u32_f32(c.m_appliedImpulse)),
vbicq_u32(vreinterpretq_u32_f32(upperLimit1),resultUpperLess)));
float32x4_t linearComponentA=vmulq_f32(c.m_contactNormal.getf32x4(),vdupq_n_f32(body1.m_invMass));
float32x4_t linearComponentB=vmulq_f32((-c.m_contactNormal).getf32x4(),vdupq_n_f32(body2.m_invMass));
float32x4_t impulseMagnitude=deltaImpulse;
body1.m_deltaLinearVelocity.setVector(vaddq_f32(body1.m_deltaLinearVelocity.getf32x4(),vmulq_f32(linearComponentA,impulseMagnitude)));
body1.m_deltaAngularVelocity.setVector(vaddq_f32(body1.m_deltaAngularVelocity.getf32x4(),vmulq_f32(c.m_angularComponentA.getf32x4(),impulseMagnitude)));
body2.m_deltaLinearVelocity.setVector(vaddq_f32(body2.m_deltaLinearVelocity.getf32x4(),vmulq_f32(linearComponentB,impulseMagnitude)));
body2.m_deltaAngularVelocity.setVector(vaddq_f32(body2.m_deltaAngularVelocity.getf32x4(),vmulq_f32(c.m_angularComponentB.getf32x4(),impulseMagnitude)));
btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD:
{SSE}
Code: Select all
__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
__m128 deltaVel1Dotn = _mm_add_ps(_vmathVfDot3(c.m_contactNormal.mVec128,body1.m_deltaLinearVelocity.mVec128), _vmathVfDot3(c.m_relpos1CrossNormal.mVec128,body1.m_deltaAngularVelocity.mVec128));
__m128 deltaVel2Dotn = _mm_add_ps(_vmathVfDot3((-c.m_contactNormal).mVec128,body2.m_deltaLinearVelocity.mVec128) ,_vmathVfDot3(c.m_relpos2CrossNormal.mVec128,body2.m_deltaAngularVelocity.mVec128));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
btSimdScalar resultLowerLess;
resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,_mm_set1_ps(body1.m_invMass));
__m128 linearComponentB = _mm_mul_ps((-c.m_contactNormal).mVec128,_mm_set1_ps(body2.m_invMass));
__m128 impulseMagnitude = deltaImpulse;
body1.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body1.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
body1.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body1.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
body2.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body2.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
body2.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body2.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
{VFP}
Code: Select all
float32x4_t cpAppliedImp=vdupq_n_f32(c.m_appliedImpulse);
float32x4_t lowerLimit1=vdupq_n_f32(c.m_lowerLimit);
float32x4_t upperLimit1=vdupq_n_f32(c.m_upperLimit);
float32x4_t deltaImpulse=vsubq_f32(vdupq_n_f32(c.m_rhs),vmulq_f32(vdupq_n_f32(c.m_appliedImpulse),vdupq_n_f32(c.m_cfm)));
float32x4_t deltaVel1Dotn=vaddq_f32(vmathVfDot3(c.m_contactNormal.getf32x4(),body1.m_deltaLinearVelocity.getf32x4()),
vmathVfDot3(c.m_relpos1CrossNormal.getf32x4(),body1.m_deltaAngularVelocity.getf32x4()));
float32x4_t deltaVel2Dotn=vaddq_f32(vmathVfDot3((-c.m_contactNormal).getf32x4(),body2.m_deltaLinearVelocity.getf32x4()),
vmathVfDot3(c.m_relpos2CrossNormal.getf32x4(),body2.m_deltaAngularVelocity.getf32x4()));
deltaImpulse=vsubq_f32(deltaImpulse,vmulq_f32(deltaVel1Dotn,vdupq_n_f32(c.m_jacDiagABInv)));
deltaImpulse=vsubq_f32(deltaImpulse,vmulq_f32(deltaVel2Dotn,vdupq_n_f32(c.m_jacDiagABInv)));
float32x4_t sum=vaddq_f32(cpAppliedImp,deltaImpulse);
uint32x4_t resultLowerLess=vcltq_f32(sum,lowerLimit1);
float32x4_t lowMinApplied=vsubq_f32(lowerLimit1,cpAppliedImp);
deltaImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultLowerLess,vreinterpretq_u32_f32(lowMinApplied)),
vbicq_u32(vreinterpretq_u32_f32(deltaImpulse),resultLowerLess)));
c.m_appliedImpulse=vreinterpretq_f32_u32(vorrq_u32(vandq_u32(resultLowerLess,vreinterpretq_u32_f32(lowerLimit1)),
vbicq_u32(vreinterpretq_u32_f32(sum),resultLowerLess)));
float32x4_t linearComponentA=vmulq_f32(c.m_contactNormal.getf32x4(),vdupq_n_f32(body1.m_invMass));
float32x4_t linearComponentB=vmulq_f32((-c.m_contactNormal).getf32x4(),vdupq_n_f32(body2.m_invMass));
float32x4_t impulseMagnitude=deltaImpulse;
body1.m_deltaLinearVelocity.setVector(vaddq_f32(body1.m_deltaLinearVelocity.getf32x4(),vmulq_f32(linearComponentA,impulseMagnitude)));
body1.m_deltaAngularVelocity.setVector(vaddq_f32(body1.m_deltaAngularVelocity.getf32x4(),vmulq_f32(c.m_angularComponentA.getf32x4(),impulseMagnitude)));
body2.m_deltaLinearVelocity.setVector(vaddq_f32(body2.m_deltaLinearVelocity.getf32x4(),vmulq_f32(linearComponentB,impulseMagnitude)));
body2.m_deltaAngularVelocity.setVector(vaddq_f32(body2.m_deltaAngularVelocity.getf32x4(),vmulq_f32(c.m_angularComponentB.getf32x4(),impulseMagnitude)));
btVector3.h:
{VFP}
Code: Select all
SIMD_FORCE_INLINE float32x4_t getf32x4(void) const
{
return vld1q_f32(m_floats);
}
SIMD_FORCE_INLINE void setVector(float32x4_t v)
{
vst1q_f32(m_floats,v);
}
btSolverBody.h:
{SSE}
Code: Select all
struct btSimdScalar
{
SIMD_FORCE_INLINE btSimdScalar()
{
}
SIMD_FORCE_INLINE btSimdScalar(float fl)
:m_vec128 (_mm_set1_ps(fl))
{
}
SIMD_FORCE_INLINE btSimdScalar(__m128 v128)
:m_vec128(v128)
{
}
union
{
__m128 m_vec128;
float m_floats[4];
int m_ints[4];
};
SIMD_FORCE_INLINE __m128 get128()
{
return m_vec128;
}
SIMD_FORCE_INLINE const __m128 get128() const
{
return m_vec128;
}
SIMD_FORCE_INLINE void set128(__m128 v128)
{
m_vec128 = v128;
}
SIMD_FORCE_INLINE operator __m128()
{
return m_vec128;
}
SIMD_FORCE_INLINE operator const __m128() const
{
return m_vec128;
}
SIMD_FORCE_INLINE operator float() const
{
return m_floats[0];
}
};
///@brief Return the elementwise product of two btSimdScalar
SIMD_FORCE_INLINE btSimdScalar
operator*(const btSimdScalar& v1, const btSimdScalar& v2)
{
return btSimdScalar(_mm_mul_ps(v1.get128(),v2.get128()));
}
///@brief Return the elementwise product of two btSimdScalar
SIMD_FORCE_INLINE btSimdScalar
operator+(const btSimdScalar& v1, const btSimdScalar& v2)
{
return btSimdScalar(_mm_add_ps(v1.get128(),v2.get128()));
}
{VFP}
Code: Select all
struct btSimdScalar
{
SIMD_FORCE_INLINE btSimdScalar()
{
}
SIMD_FORCE_INLINE btSimdScalar(float fl)
:m_vec128 (vdupq_n_f32(fl))
{
}
SIMD_FORCE_INLINE btSimdScalar(float32x4_t v128)
:m_vec128(v128)
{
}
union
{
float32x4_t m_vec128;
float m_floats[4];
int m_ints[4];
};
SIMD_FORCE_INLINE float32x4_t get128()
{
return m_vec128;
}
SIMD_FORCE_INLINE const float32x4_t get128() const
{
return m_vec128;
}
SIMD_FORCE_INLINE void set128(float32x4_t v128)
{
m_vec128 = v128;
}
SIMD_FORCE_INLINE operator float32x4_t()
{
return m_vec128;
}
SIMD_FORCE_INLINE operator const float32x4_t() const
{
return m_vec128;
}
SIMD_FORCE_INLINE operator float() const
{
return m_floats[0];
}
};
///@brief Return the elementwise product of two btSimdScalar
SIMD_FORCE_INLINE btSimdScalar
operator*(const btSimdScalar& v1, const btSimdScalar& v2)
{
return btSimdScalar(vmulq_f32(v1.get128(),v2.get128()));
}
///@brief Return the elementwise product of two btSimdScalar
SIMD_FORCE_INLINE btSimdScalar
operator+(const btSimdScalar& v1, const btSimdScalar& v2)
{
return btSimdScalar(vaddq_f32(v1.get128(),v2.get128()));
}