• R/O
  • SSH
  • HTTPS

mikumikudroid: Commit


Commit MetaInfo

Revisión110 (tree)
Tiempo2011-07-20 21:29:52
Autorohsawa

Log Message

some part of btSequentialImpulseConstraintSolver includes NEON code.

Cambiar Resumen

Diferencia incremental

--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet-jni.cpp (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet-jni.cpp (revision 110)
@@ -193,13 +193,11 @@
193193 rbi->m_angularDamping = r_dim;
194194 rbi->m_restitution = recoil;
195195 rbi->m_friction = friction;
196- rbi->m_linearSleepingThreshold = 0;
197- rbi->m_angularSleepingThreshold = 0;
198196
199197 g_rb[g_ptr] = new btRigidBody(*rbi);
198+ g_rb[g_ptr]->setActivationState(DISABLE_DEACTIVATION);
200199
201200 if(type == 0) {
202- g_rb[g_ptr]->setActivationState(DISABLE_DEACTIVATION);
203201 g_rb[g_ptr]->setCollisionFlags(g_rb[g_ptr]->getCollisionFlags() | btCollisionObject :: CF_KINEMATIC_OBJECT);
204202 }
205203
@@ -225,7 +223,7 @@
225223 float *sp_n = env->GetFloatArrayElements(sp, 0);
226224 float *sr_n = env->GetFloatArrayElements(sr, 0);
227225 for(int i = 0; i < 3; i++) {
228- if(sp_n[i] > 0) {
226+ if(sp_n[i] != 0) {
229227 dof->enableSpring(i, true);
230228 dof->setStiffness(i, sp_n[i]);
231229 } else {
@@ -232,7 +230,7 @@
232230 dof->enableSpring(i, false);
233231 }
234232
235- if(sr_n[i] > 0) {
233+ if(sr_n[i] != 0) {
236234 dof->enableSpring(i + 3, true);
237235 dof->setStiffness(i + 3, sr_n[i]);
238236 } else {
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp (revision 110)
@@ -47,6 +47,7 @@
4747 }
4848
4949 #ifdef USE_SIMD
50+#ifdef BT_USE_SSE
5051 #include <emmintrin.h>
5152 #define btVecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
5253 static inline __m128 btSimdDot3( __m128 vec0, __m128 vec1 )
@@ -54,12 +55,14 @@
5455 __m128 result = _mm_mul_ps( vec0, vec1);
5556 return _mm_add_ps( btVecSplat( result, 0 ), _mm_add_ps( btVecSplat( result, 1 ), btVecSplat( result, 2 ) ) );
5657 }
58+#endif//BT_USE_SSE
5759 #endif//USE_SIMD
5860
5961 // Project Gauss Seidel or the equivalent Sequential Impulse
60-void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
62+SIMD_FORCE_INLINE void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
6163 {
6264 #ifdef USE_SIMD
65+#ifdef BT_USE_SSE
6366 __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
6467 __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
6568 __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
@@ -85,6 +88,112 @@
8588 body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
8689 body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
8790 body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
91+#endif//BT_USE_SSE
92+#ifdef BT_USE_NEON
93+ //////////// LOAD
94+ __asm__ __volatile__("vldr.32 d8, %0" : : "Uv"(c.m_contactNormal.m_floats[0]) : "d8", "q4", "s16", "s17");
95+ __asm__ __volatile__("vldr.32 d9, %0" : : "Uv"(c.m_contactNormal.m_floats[2]) : "d9", "q4", "s18", "s19");
96+
97+ __asm__ __volatile__("vldr.32 d4, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[0]) : "d4", "q2", "s8", "s9");
98+ __asm__ __volatile__("vldr.32 d5, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[2]) : "d5", "q2", "s10", "s11");
99+ __asm__ __volatile__("vldr.32 d6, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[0]) : "d6", "q3", "s12", "s13");
100+ __asm__ __volatile__("vldr.32 d7, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[2]) : "d7", "q3", "s14", "s15");
101+
102+
103+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : "d16", "q8");
104+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : "d17", "q8");
105+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : "d18", "q9");
106+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : "d19", "q9");
107+
108+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : "d22", "q11");
109+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : "d23", "q11");
110+ __asm__ __volatile__("vldr.32 d24, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : "d24", "q12");
111+ __asm__ __volatile__("vldr.32 d25, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : "d25", "q12");
112+
113+ __asm__ __volatile__("vldr.32 d0, %0" : : "Uv"(c.m_rhs) : "d0", "q0", "s0", "s1");
114+ __asm__ __volatile__("vldr.32 d1, %0" : : "Uv"(c.m_cfm) : "d1", "q0", "s2", "s3");
115+ __asm__ __volatile__("vldr.32 d2, %0" : : "Uv"(c.m_appliedImpulse) : "d2", "q1", "s4", "s5");
116+ __asm__ __volatile__("vldr.32 d3, %0" : : "Uv"(c.m_jacDiagABInv) : "d3", "q1", "s6", "s7");
117+ __asm__ __volatile__("vldr.32 d10, %0" : : "Uv"(c.m_lowerLimit) : "d10", "q5", "s20", "s21");
118+ __asm__ __volatile__("vldr.32 d11, %0" : : "Uv"(c.m_upperLimit) : "d11", "q5", "s22", "s23");
119+
120+ __asm__ __volatile__("vldr.32 d20, %0" : : "Uv"(body1.internalGetInvMass().m_floats[0]) : "d20", "q10");
121+ __asm__ __volatile__("vldr.32 d21, %0" : : "Uv"(body1.internalGetInvMass().m_floats[2]) : "d21", "q10");
122+ __asm__ __volatile__("vldr.32 d26, %0" : : "Uv"(body2.internalGetInvMass().m_floats[0]) : "d26", "q13");
123+ __asm__ __volatile__("vldr.32 d27, %0" : : "Uv"(body2.internalGetInvMass().m_floats[2]) : "d27", "q13");
124+
125+ __asm__ __volatile__("vldr.32 d12, %0" : : "Uv"(c.m_angularComponentA.m_floats[0]) : "d12", "q6", "s24", "s25");
126+ __asm__ __volatile__("vldr.32 d13, %0" : : "Uv"(c.m_angularComponentA.m_floats[2]) : "d13", "q6", "s26", "s27");
127+ __asm__ __volatile__("vldr.32 d14, %0" : : "Uv"(c.m_angularComponentB.m_floats[0]) : "d14", "q7", "s28", "s29");
128+ __asm__ __volatile__("vldr.32 d15, %0" : : "Uv"(c.m_angularComponentB.m_floats[2]) : "d15", "q7", "s30", "s31");
129+
130+
131+
132+ // d0[0]: c.m_rhs
133+ // d1[0]: c.m_cfm
134+ // d2[0]: c.m_appliedImpulse
135+ // d3[0]: c.m_jacDiagABInv
136+
137+ // q2: c.m_relpos1CrossNormal
138+ // q3: c.m_relpos2CrossNormal
139+ // q4: c.m_contactNormal
140+
141+ // d10[0]: c.m_lowerLimit
142+ // d11[0]: c.m_upperLimit
143+
144+ // q6: c.m_angularComponentA
145+ // q7: c.m_angularComponentB
146+
147+ // q8: body1.internalGetDeltaLinearVelocity()
148+ // q9: body1.internalGetDeltaAngularVelocity()
149+ // q10: body1.internalGetInvMass()
150+ // q11: body2.internalGetDeltaLinearVelocity()
151+ // q12: body2.internalGetDeltaAngularVelocity()
152+ // q13: body2.internalGetInvMass()
153+
154+
155+ //////////// CALCULATE
156+ __asm__ __volatile__("vmul.f32 q14, q4, q8" : : : "q14", "d28", "d29"); // deltaVel1Dotn
157+ __asm__ __volatile__("vmla.f32 q14, q2, q9" : : : "q14", "d28", "d29");
158+ __asm__ __volatile__("vmls.f32 q14, q4, q11" : : : "q14", "d28", "d29"); // deltaVel2Dotn
159+ __asm__ __volatile__("vmla.f32 q14, q3, q12" : : : "q14", "d28", "d29");
160+ __asm__ __volatile__("vmul.f32 q14, q14, d3[0]" : : : "q14", "d28", "d29"); // * c.m_jacDiagABInv
161+
162+ __asm__ __volatile__("vpadd.f32 d31, d28, d29" : : : "d31"); // tree add1
163+ __asm__ __volatile__("vpadd.f32 d31, d31, d31" : : : "d31"); // tree add2
164+
165+ __asm__ __volatile__("vmls.f32 d0, d2, d1" : : : "d0"); // c.m_rhs - c.m_appliedImpulse * c.m_cfm
166+
167+ __asm__ __volatile__("vsub.f32 d0, d0, d31" : : : "q0", "d0", "s0", "s1"); // deltaImpulse d0[0]
168+
169+ __asm__ __volatile__("vadd.f32 d1, d0, d2" : : : "q0", "d1", "s2", "s3"); // add applied impulse(sum)
170+
171+ __asm__ __volatile__("vmax.f32 d1, d1, d10" : : : "q0", "d1", "s2", "s3"); // saturate lower
172+ __asm__ __volatile__("vmin.f32 d1, d1, d11" : : : "q0", "d1", "s2", "s3"); // saturate upper
173+ __asm__ __volatile__("vsub.f32 d0, d1, d2" : : : "q0", "d0", "s0", "s1"); // saturate deltaImpulse
174+
175+
176+ __asm__ __volatile__("vmul.f32 q10, q4, q10" : : : "q10", "d20", "d21"); // linearComponentA
177+ __asm__ __volatile__("vmul.f32 q13, q4, q13" : : : "q13", "d26", "d27"); // linearComponentB
178+
179+ __asm__ __volatile__("vmla.f32 q8, q10, d0[0]" : : : "q8", "d16", "d17"); // update r1.DeltaLinearVelocity
180+ __asm__ __volatile__("vmla.f32 q9, q6, d0[0]" : : : "q9", "d18", "d19"); // update r1.DeltaAngularVelocity
181+ __asm__ __volatile__("vmls.f32 q11, q13, d0[0]" : : : "q11", "d22", "d23"); // update r2.DeltaLinearVelocity
182+ __asm__ __volatile__("vmla.f32 q12, q7, d0[0]" : : : "q12", "d24", "d25"); // update r2.DeltaAngularVelocity
183+
184+
185+ //////////// STORE
186+ __asm__ __volatile__("vstr.32 s2, %0" : "=Uv"(c.m_appliedImpulse) : :);
187+ __asm__ __volatile__("vstr.32 d16, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : :);
188+ __asm__ __volatile__("vstr.32 d17, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : :);
189+ __asm__ __volatile__("vstr.32 d18, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : :);
190+ __asm__ __volatile__("vstr.32 d19, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : :);
191+
192+ __asm__ __volatile__("vstr.32 d22, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : :);
193+ __asm__ __volatile__("vstr.32 d23, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : :);
194+ __asm__ __volatile__("vstr.32 d24, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : :);
195+ __asm__ __volatile__("vstr.32 d25, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : :);
196+#endif//BT_USE_NEON
88197 #else
89198 resolveSingleConstraintRowGeneric(body1,body2,c);
90199 #endif
@@ -91,7 +200,7 @@
91200 }
92201
93202 // Project Gauss Seidel or the equivalent Sequential Impulse
94- void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGeneric(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
203+void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGeneric(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
95204 {
96205 btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
97206 const btScalar deltaVel1Dotn = c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
@@ -120,9 +229,10 @@
120229 body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
121230 }
122231
123- void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
232+SIMD_FORCE_INLINE void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
124233 {
125234 #ifdef USE_SIMD
235+#ifdef BT_USE_SSE
126236 __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
127237 __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
128238 __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
@@ -145,6 +255,106 @@
145255 body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
146256 body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
147257 body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
258+#endif//BT_USE_SSE
259+#ifdef BT_USE_NEON
260+ //////////// LOAD
261+ __asm__ __volatile__("vldr.32 d0, %0" : : "Uv"(c.m_rhs) : "d0", "q0", "s0", "s1");
262+ __asm__ __volatile__("vldr.32 d1, %0" : : "Uv"(c.m_cfm) : "d1", "q0", "s2", "s3");
263+ __asm__ __volatile__("vldr.32 d2, %0" : : "Uv"(c.m_appliedImpulse) : "d2", "q1", "s4", "s5");
264+ __asm__ __volatile__("vldr.32 d3, %0" : : "Uv"(c.m_jacDiagABInv) : "d3", "q1", "s6", "s7");
265+ __asm__ __volatile__("vldr.32 d10, %0" : : "Uv"(c.m_lowerLimit) : "d10", "q5", "s20", "s21");
266+
267+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : "d16", "q8");
268+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : "d17", "q8");
269+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : "d18", "q9");
270+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : "d19", "q9");
271+ __asm__ __volatile__("vldr.32 d20, %0" : : "Uv"(body1.internalGetInvMass().m_floats[0]) : "d20", "q10");
272+ __asm__ __volatile__("vldr.32 d21, %0" : : "Uv"(body1.internalGetInvMass().m_floats[2]) : "d21", "q10");
273+
274+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : "d22", "q11");
275+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : "d23", "q11");
276+ __asm__ __volatile__("vldr.32 d24, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : "d24", "q12");
277+ __asm__ __volatile__("vldr.32 d25, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : "d25", "q12");
278+ __asm__ __volatile__("vldr.32 d26, %0" : : "Uv"(body2.internalGetInvMass().m_floats[0]) : "d26", "q13");
279+ __asm__ __volatile__("vldr.32 d27, %0" : : "Uv"(body2.internalGetInvMass().m_floats[2]) : "d27", "q13");
280+
281+ __asm__ __volatile__("vldr.32 d4, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[0]) : "d4", "q2", "s8", "s9");
282+ __asm__ __volatile__("vldr.32 d5, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[2]) : "d5", "q2", "s10", "s11");
283+ __asm__ __volatile__("vldr.32 d6, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[0]) : "d6", "q3", "s12", "s13");
284+ __asm__ __volatile__("vldr.32 d7, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[2]) : "d7", "q3", "s14", "s15");
285+
286+ __asm__ __volatile__("vldr.32 d8, %0" : : "Uv"(c.m_contactNormal.m_floats[0]) : "d8", "q4", "s16", "s17");
287+ __asm__ __volatile__("vldr.32 d9, %0" : : "Uv"(c.m_contactNormal.m_floats[2]) : "d9", "q4", "s18", "s19");
288+
289+ __asm__ __volatile__("vldr.32 d12, %0" : : "Uv"(c.m_angularComponentA.m_floats[0]) : "d12", "q6", "s24", "s25");
290+ __asm__ __volatile__("vldr.32 d13, %0" : : "Uv"(c.m_angularComponentA.m_floats[2]) : "d13", "q6", "s26", "s27");
291+ __asm__ __volatile__("vldr.32 d14, %0" : : "Uv"(c.m_angularComponentB.m_floats[0]) : "d14", "q7", "s28", "s29");
292+ __asm__ __volatile__("vldr.32 d15, %0" : : "Uv"(c.m_angularComponentB.m_floats[2]) : "d15", "q7", "s30", "s31");
293+
294+
295+ // d0[0]: c.m_rhs
296+ // d1[0]: c.m_cfm
297+ // d2[0]: c.m_appliedImpulse
298+ // d3[0]: c.m_jacDiagABInv
299+
300+ // q2: c.m_relpos1CrossNormal
301+ // q3: c.m_relpos2CrossNormal
302+ // q4: c.m_contactNormal
303+
304+ // d10[0]: c.m_lowerLimit
305+
306+ // q6: c.m_angularComponentA
307+ // q7: c.m_angularComponentB
308+
309+ // q8: body1.internalGetDeltaLinearVelocity()
310+ // q9: body1.internalGetDeltaAngularVelocity()
311+ // q10: body1.internalGetInvMass()
312+ // q11: body2.internalGetDeltaLinearVelocity()
313+ // q12: body2.internalGetDeltaAngularVelocity()
314+ // q13: body2.internalGetInvMass()
315+
316+
317+ //////////// CALCULATE
318+ __asm__ __volatile__("vmls.f32 d0, d2, d1" : : : "d0"); // c.m_rhs - c.m_appliedImpulse * c.m_cfm
319+
320+ __asm__ __volatile__("vmul.f32 q14, q4, q8" : : : "q14", "d28", "d29"); // deltaVel1Dotn
321+ __asm__ __volatile__("vmla.f32 q14, q2, q9" : : : "q14", "d28", "d29");
322+ __asm__ __volatile__("vmls.f32 q14, q4, q11" : : : "q14", "d28", "d29"); // deltaVel2Dotn
323+ __asm__ __volatile__("vmla.f32 q14, q3, q12" : : : "q14", "d28", "d29");
324+ __asm__ __volatile__("vmul.f32 q14, q14, d3[0]" : : : "q14", "d28", "d29"); // * c.m_jacDiagABInv
325+
326+ __asm__ __volatile__("vpadd.f32 d31, d28, d29" : : : "d31"); // tree add1
327+ __asm__ __volatile__("vpadd.f32 d31, d31, d31" : : : "d31"); // tree add2
328+
329+ __asm__ __volatile__("vsub.f32 d0, d0, d31" : : : "q0", "d0", "s0", "s1"); // deltaImpulse d0[0]
330+
331+ __asm__ __volatile__("vadd.f32 d1, d0, d2" : : : "q0", "d1", "s2", "s3"); // add aplied impulse(sum)
332+
333+ __asm__ __volatile__("vmax.f32 d1, d1, d10" : : : "q0", "d1", "s2", "s3"); // satulate low
334+ __asm__ __volatile__("vsub.f32 d0, d1, d2" : : : "q0", "d0", "s0", "s1"); // satulate deltaImpulse
335+
336+
337+ __asm__ __volatile__("vmul.f32 q10, q4, q10" : : : "q10", "d20", "d21"); // linearComponentA
338+ __asm__ __volatile__("vmul.f32 q13, q4, q13" : : : "q13", "d26", "d27"); // linearComponentB
339+
340+ __asm__ __volatile__("vmla.f32 q8, q10, d0[0]" : : : "q8", "d16", "d17"); // update r1.DeltaLinearVelocity
341+ __asm__ __volatile__("vmla.f32 q9, q6, d0[0]" : : : "q9", "d18", "d19"); // update r1.DeltaAngularVelocity
342+ __asm__ __volatile__("vmls.f32 q11, q13, d0[0]" : : : "q11", "d22", "d23"); // update r2.DeltaLinearVelocity
343+ __asm__ __volatile__("vmla.f32 q12, q7, d0[0]" : : : "q12", "d24", "d25"); // update r2.DeltaAngularVelocity
344+
345+
346+ //////////// STORE
347+ __asm__ __volatile__("vstr.32 s2, %0" : "=Uv"(c.m_appliedImpulse) : :);
348+ __asm__ __volatile__("vstr.32 d16, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : :);
349+ __asm__ __volatile__("vstr.32 d17, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : :);
350+ __asm__ __volatile__("vstr.32 d18, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : :);
351+ __asm__ __volatile__("vstr.32 d19, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : :);
352+
353+ __asm__ __volatile__("vstr.32 d22, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : :);
354+ __asm__ __volatile__("vstr.32 d23, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : :);
355+ __asm__ __volatile__("vstr.32 d24, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : :);
356+ __asm__ __volatile__("vstr.32 d25, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : :);
357+#endif//BT_USE_NEON
148358 #else
149359 resolveSingleConstraintRowLowerLimit(body1,body2,c);
150360 #endif
@@ -203,9 +413,10 @@
203413 }
204414 }
205415
206- void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
416+SIMD_FORCE_INLINE void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
207417 {
208418 #ifdef USE_SIMD
419+#ifdef BT_USE_SSE
209420 if (!c.m_rhsPenetration)
210421 return;
211422
@@ -233,6 +444,150 @@
233444 body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
234445 body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
235446 body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
447+#endif//BT_USE_SSE
448+#ifdef BT_USE_NEON
449+ if (!c.m_rhsPenetration)
450+ return;
451+
452+ gNumSplitImpulseRecoveries++;
453+
454+
455+
456+ //////////// LOAD
457+ __asm__ __volatile__("vldr.32 d0, %0" : : "Uv"(c.m_rhsPenetration) : "d0", "q0", "s0", "s1");
458+ __asm__ __volatile__("vldr.32 d1, %0" : : "Uv"(c.m_cfm) : "d1", "q0", "s2", "s3");
459+ __asm__ __volatile__("vldr.32 d2, %0" : : "Uv"(c.m_appliedPushImpulse) : "d2", "q1", "s4", "s5");
460+ __asm__ __volatile__("vldr.32 d3, %0" : : "Uv"(c.m_jacDiagABInv) : "d3", "q1", "s6", "s7");
461+ __asm__ __volatile__("vldr.32 d10, %0" : : "Uv"(c.m_lowerLimit) : "d10", "q5", "s20", "s21");
462+
463+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(body1.internalGetPushVelocity().m_floats[0]) : "d16", "q8");
464+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(body1.internalGetPushVelocity().m_floats[2]) : "d17", "q8");
465+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(body1.internalGetTurnVelocity().m_floats[0]) : "d18", "q9");
466+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(body1.internalGetTurnVelocity().m_floats[2]) : "d19", "q9");
467+ __asm__ __volatile__("vldr.32 d20, %0" : : "Uv"(body1.internalGetInvMass().m_floats[0]) : "d20", "q10");
468+ __asm__ __volatile__("vldr.32 d21, %0" : : "Uv"(body1.internalGetInvMass().m_floats[2]) : "d21", "q10");
469+
470+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(body2.internalGetPushVelocity().m_floats[0]) : "d22", "q11");
471+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(body2.internalGetPushVelocity().m_floats[2]) : "d23", "q11");
472+ __asm__ __volatile__("vldr.32 d24, %0" : : "Uv"(body2.internalGetTurnVelocity().m_floats[0]) : "d24", "q12");
473+ __asm__ __volatile__("vldr.32 d25, %0" : : "Uv"(body2.internalGetTurnVelocity().m_floats[2]) : "d25", "q12");
474+ __asm__ __volatile__("vldr.32 d26, %0" : : "Uv"(body2.internalGetInvMass().m_floats[0]) : "d26", "q13");
475+ __asm__ __volatile__("vldr.32 d27, %0" : : "Uv"(body2.internalGetInvMass().m_floats[2]) : "d27", "q13");
476+
477+ __asm__ __volatile__("vldr.32 d4, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[0]) : "d4", "q2", "s8", "s9");
478+ __asm__ __volatile__("vldr.32 d5, %0" : : "Uv"(c.m_relpos1CrossNormal.m_floats[2]) : "d5", "q2", "s10", "s11");
479+ __asm__ __volatile__("vldr.32 d6, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[0]) : "d6", "q3", "s12", "s13");
480+ __asm__ __volatile__("vldr.32 d7, %0" : : "Uv"(c.m_relpos2CrossNormal.m_floats[2]) : "d7", "q3", "s14", "s15");
481+
482+ __asm__ __volatile__("vldr.32 d8, %0" : : "Uv"(c.m_contactNormal.m_floats[0]) : "d8", "q4", "s16", "s17");
483+ __asm__ __volatile__("vldr.32 d9, %0" : : "Uv"(c.m_contactNormal.m_floats[2]) : "d9", "q4", "s18", "s19");
484+
485+ __asm__ __volatile__("vldr.32 d12, %0" : : "Uv"(c.m_angularComponentA.m_floats[0]) : "d12", "q6", "s24", "s25");
486+ __asm__ __volatile__("vldr.32 d13, %0" : : "Uv"(c.m_angularComponentA.m_floats[2]) : "d13", "q6", "s26", "s27");
487+ __asm__ __volatile__("vldr.32 d14, %0" : : "Uv"(c.m_angularComponentB.m_floats[0]) : "d14", "q7", "s28", "s29");
488+ __asm__ __volatile__("vldr.32 d15, %0" : : "Uv"(c.m_angularComponentB.m_floats[2]) : "d15", "q7", "s30", "s31");
489+
490+
491+ // d0[0]: c.m_rhsPenetration
492+ // d1[0]: c.m_cfm
493+ // d2[0]: c.m_appliedPushImpulse
494+ // d3[0]: c.m_jacDiagABInv
495+
496+ // q2: c.m_relpos1CrossNormal
497+ // q3: c.m_relpos2CrossNormal
498+ // q4: c.m_contactNormal
499+
500+ // d10[0]: c.m_lowerLimit
501+
502+ // q6: c.m_angularComponentA
503+ // q7: c.m_angularComponentB
504+
505+ // q8: body1.internalGetPushVelocity()
506+ // q9: body1.internalGetTurnVelocity()
507+ // q10: body1.internalGetInvMass()
508+ // q11: body2.internalGetPushVelocity()
509+ // q12: body2.internalGetTurnVelocity()
510+ // q13: body2.internalGetInvMass()
511+
512+
513+ //////////// CALCULATE
514+
515+ __asm__ __volatile__("vmul.f32 q14, q4, q8" : : : "q14", "d28", "d29"); // deltaVel1Dotn
516+ __asm__ __volatile__("vmla.f32 q14, q2, q9" : : : "q14", "d28", "d29");
517+ __asm__ __volatile__("vmls.f32 q14, q4, q11" : : : "q14", "d28", "d29"); // deltaVel2Dotn
518+ __asm__ __volatile__("vmla.f32 q14, q3, q12" : : : "q14", "d28", "d29");
519+
520+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : "d16", "q8");
521+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : "d17", "q8");
522+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : "d18", "q9");
523+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : "d19", "q9");
524+
525+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : "d22", "q11");
526+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : "d23", "q11");
527+ __asm__ __volatile__("vldr.32 d24, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : "d24", "q12");
528+ __asm__ __volatile__("vldr.32 d25, %0" : : "Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : "d25", "q12");
529+
530+ __asm__ __volatile__("vmul.f32 q14, q14, d3[0]" : : : "q14", "d28", "d29"); // * c.m_jacDiagABInv
531+
532+ __asm__ __volatile__("vmls.f32 d0, d2, d1" : : : "d0"); // c.m_rhsPenetration - c.m_appliedPushImpulse * c.m_cfm
533+ __asm__ __volatile__("vpadd.f32 d31, d28, d29" : : : "d31"); // tree add1
534+ __asm__ __volatile__("vpadd.f32 d31, d31, d31" : : : "d31"); // tree add2
535+
536+ __asm__ __volatile__("vsub.f32 d0, d0, d31" : : : "q0", "d0", "s0", "s1"); // deltaImpulse d0[0]
537+
538+ __asm__ __volatile__("vadd.f32 d1, d0, d2" : : : "q0", "d1", "s2", "s3"); // add aplied impulse(sum)
539+
540+ __asm__ __volatile__("vmax.f32 d1, d1, d10" : : : "q0", "d1", "s2", "s3"); // satulate low
541+ __asm__ __volatile__("vsub.f32 d0, d1, d2" : : : "q0", "d0", "s0", "s1"); // satulate deltaImpulse
542+
543+
544+ __asm__ __volatile__("vmul.f32 q10, q4, q10" : : : "q10", "d20", "d21"); // linearComponentA
545+ __asm__ __volatile__("vmul.f32 q13, q4, q13" : : : "q13", "d26", "d27"); // linearComponentB
546+
547+ __asm__ __volatile__("vmla.f32 q8, q10, d0[0]" : : : "q8", "d16", "d17"); // update r1.DeltaLinearVelocity
548+ __asm__ __volatile__("vmla.f32 q9, q6, d0[0]" : : : "q9", "d18", "d19"); // update r1.DeltaAngularVelocity
549+ __asm__ __volatile__("vmls.f32 q11, q13, d0[0]" : : : "q11", "d22", "d23"); // update r2.DeltaLinearVelocity
550+ __asm__ __volatile__("vmla.f32 q12, q7, d0[0]" : : : "q12", "d24", "d25"); // update r2.DeltaAngularVelocity
551+
552+
553+ //////////// STORE
554+ __asm__ __volatile__("vstr.32 s2, %0" : "=Uv"(c.m_appliedPushImpulse) : :);
555+ __asm__ __volatile__("vstr.32 d16, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[0]) : :);
556+ __asm__ __volatile__("vstr.32 d17, %0" : "=Uv"(body1.internalGetDeltaLinearVelocity().m_floats[2]) : :);
557+ __asm__ __volatile__("vstr.32 d18, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[0]) : :);
558+ __asm__ __volatile__("vstr.32 d19, %0" : "=Uv"(body1.internalGetDeltaAngularVelocity().m_floats[2]) : :);
559+
560+ __asm__ __volatile__("vstr.32 d22, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[0]) : :);
561+ __asm__ __volatile__("vstr.32 d23, %0" : "=Uv"(body2.internalGetDeltaLinearVelocity().m_floats[2]) : :);
562+ __asm__ __volatile__("vstr.32 d24, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[0]) : :);
563+ __asm__ __volatile__("vstr.32 d25, %0" : "=Uv"(body2.internalGetDeltaAngularVelocity().m_floats[2]) : :);
564+
565+/*
566+ btScalar deltaImpulse = c.m_rhsPenetration-btScalar(c.m_appliedPushImpulse)*c.m_cfm;
567+ const btScalar deltaVel1Dotn = c.m_contactNormal.dot(body1.internalGetPushVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetTurnVelocity());
568+ const btScalar deltaVel2Dotn = -c.m_contactNormal.dot(body2.internalGetPushVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetTurnVelocity());
569+
570+ deltaImpulse -= deltaVel1Dotn*c.m_jacDiagABInv;
571+ deltaImpulse -= deltaVel2Dotn*c.m_jacDiagABInv;
572+ const btScalar sum = btScalar(c.m_appliedPushImpulse) + deltaImpulse;
573+ if (sum < c.m_lowerLimit)
574+ {
575+ deltaImpulse = c.m_lowerLimit-c.m_appliedPushImpulse;
576+ c.m_appliedPushImpulse = c.m_lowerLimit;
577+ }
578+ else
579+ {
580+ c.m_appliedPushImpulse = sum;
581+ }
582+
583+ __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.internalGetInvMass().mVec128);
584+ __m128 linearComponentB = _mm_mul_ps(c.m_contactNormal.mVec128, body2.internalGetInvMass().mVec128);
585+ body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, vmulq_n_f32(linearComponentA, deltaImpulse));
586+ body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, vmulq_n_f32(c.m_angularComponentA.mVec128, deltaImpulse));
587+ body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128, vmulq_n_f32(linearComponentB, deltaImpulse));
588+ body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, vmulq_n_f32(c.m_angularComponentB.mVec128, deltaImpulse));
589+*/
590+#endif//BT_USE_NEON
236591 #else
237592 resolveSplitPenetrationImpulseCacheFriendly(body1,body2,c);
238593 #endif
@@ -646,6 +1001,8 @@
6461001
6471002 void btSequentialImpulseConstraintSolver::convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal)
6481003 {
1004+ BT_PROFILE("convertContact");
1005+
6491006 btCollisionObject* colObj0=0,*colObj1=0;
6501007
6511008 colObj0 = (btCollisionObject*)manifold->getBody0();
@@ -836,7 +1193,6 @@
8361193 btRigidBody& rbA = constraint->getRigidBodyA();
8371194 btRigidBody& rbB = constraint->getRigidBodyB();
8381195
839-
8401196 int j;
8411197 for ( j=0;j<info1.m_numConstraintRows;j++)
8421198 {
@@ -849,13 +1205,15 @@
8491205 currentConstraintRow[j].m_solverBodyB = &rbB;
8501206 }
8511207
852- rbA.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
853- rbA.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
854- rbB.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
855- rbB.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
1208+// rbA.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
1209+// rbA.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
1210+// rbB.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
1211+// rbB.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
1212+ rbA.internalGetDeltaLinearVelocity().setZero();
1213+ rbA.internalGetDeltaAngularVelocity().setZero();
1214+ rbB.internalGetDeltaLinearVelocity().setZero();
1215+ rbB.internalGetDeltaAngularVelocity().setZero();
8561216
857-
858-
8591217 btTypedConstraint::btConstraintInfo2 info2;
8601218 info2.fps = 1.f/infoGlobal.m_timeStep;
8611219 info2.erp = infoGlobal.m_erp;
@@ -874,7 +1232,30 @@
8741232 info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
8751233 info2.m_numIterations = infoGlobal.m_numIterations;
8761234 constraints[i]->getInfo2(&info2);
1235+
1236+#ifdef BT_USE_NEON
1237+ btScalar ima = rbA.getInvMass();
1238+ btScalar imb = rbB.getInvMass();
1239+ __asm__ __volatile__("vldr.32 d28, %0" : : "Uv"(ima) : "d0" );
1240+ __asm__ __volatile__("vldr.32 d29, %0" : : "Uv"(imb) : "d1" );
1241+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(0).m_floats[0]) : "d16", "q8");
1242+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(0).m_floats[2]) : "d17", "q8");
1243+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(1).m_floats[0]) : "d18", "q9");
1244+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(1).m_floats[2]) : "d19", "q9");
1245+ __asm__ __volatile__("vldr.32 d20, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(2).m_floats[0]) : "d20", "q10");
1246+ __asm__ __volatile__("vldr.32 d21, %0" : : "Uv"(rbA.getInvInertiaTensorWorld().getRow(2).m_floats[2]) : "d21", "q10");
1247+
1248+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(0).m_floats[0]) : "d22", "q11");
1249+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(0).m_floats[2]) : "d23", "q11");
1250+ __asm__ __volatile__("vldr.32 d24, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(1).m_floats[0]) : "d24", "q12");
1251+ __asm__ __volatile__("vldr.32 d25, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(1).m_floats[2]) : "d25", "q12");
1252+ __asm__ __volatile__("vldr.32 d26, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(2).m_floats[0]) : "d26", "q13");
1253+ __asm__ __volatile__("vldr.32 d27, %0" : : "Uv"(rbB.getInvInertiaTensorWorld().getRow(2).m_floats[2]) : "d27", "q13");
8771254
1255+ __asm__ __volatile__("vtrn.32 d28, d29" : : : "d0", "d1" );
1256+// __asm__ __volatile__("vmov.32 d28, d0" : : : "d28" );
1257+#endif//BT_USE_NEON
1258+
8781259 if (currentConstraintRow->m_upperLimit>constraints[i]->getBreakingImpulseThreshold())
8791260 {
8801261 currentConstraintRow->m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
@@ -891,6 +1272,8 @@
8911272 for ( j=0;j<info1.m_numConstraintRows;j++)
8921273 {
8931274 btSolverConstraint& solverConstraint = currentConstraintRow[j];
1275+#ifdef BT_USE_NEON
1276+#endif
8941277 solverConstraint.m_originalContactPoint = constraint;
8951278
8961279 {
@@ -903,6 +1286,44 @@
9031286 }
9041287
9051288 {
1289+#ifdef BT_USE_NEON
1290+ __asm__ __volatile__("vmov.32 d0, d28" : : : "d0" );
1291+
1292+ __asm__ __volatile__("vldr.32 d4, %0" : : "Uv"(solverConstraint.m_relpos1CrossNormal.m_floats[0]) : "d4", "q2", "s7", "s8");
1293+ __asm__ __volatile__("vldr.32 d5, %0" : : "Uv"(solverConstraint.m_relpos1CrossNormal.m_floats[2]) : "d5", "q2", "s9", "s10");
1294+ __asm__ __volatile__("vldr.32 d6, %0" : : "Uv"(solverConstraint.m_relpos2CrossNormal.m_floats[0]) : "d6", "q3", "s11", "s12");
1295+ __asm__ __volatile__("vldr.32 d7, %0" : : "Uv"(solverConstraint.m_relpos2CrossNormal.m_floats[2]) : "d7", "q3", "s13", "s14");
1296+
1297+ __asm__ __volatile__("vldr.32 d2, %0" : : "Uv"(solverConstraint.m_contactNormal.m_floats[0]) : "d2", "q1", "s3", "s4");
1298+ __asm__ __volatile__("vldr.32 d3, %0" : : "Uv"(solverConstraint.m_contactNormal.m_floats[2]) : "d3", "q1", "s5", "s6");
1299+
1300+
1301+ __asm__ __volatile__("vmul.f32 q14, q8, d4[0]" : : : "q14"); // iMJaA
1302+ __asm__ __volatile__("vmla.f32 q14, q9, d4[1]" : : : "q14");
1303+ __asm__ __volatile__("vmla.f32 q14, q10, d5[0]" : : : "q14");
1304+ __asm__ __volatile__("vmul.f32 q15, q14, q2" : : : "q15"); // dot
1305+
1306+ __asm__ __volatile__("vmul.f32 q14, q11, d6[0]" : : : "q14"); // iMJaB
1307+ __asm__ __volatile__("vmla.f32 q14, q12, d6[1]" : : : "q14");
1308+ __asm__ __volatile__("vmla.f32 q14, q13, d7[0]" : : : "q14");
1309+ __asm__ __volatile__("vmla.f32 q15, q14, q3" : : : "q15"); // dot
1310+
1311+
1312+ __asm__ __volatile__("vmul.f32 q14, q1, d0[0]" : : : "q14"); // iMJlA
1313+ __asm__ __volatile__("vmla.f32 q15, q14, q1" : : : "q15"); // dot
1314+
1315+ __asm__ __volatile__("vmul.f32 q14, q1, d0[1]" : : : "q14"); // iMJlB
1316+ __asm__ __volatile__("vmla.f32 q15, q14, q1" : : : "q15"); // dot
1317+
1318+ __asm__ __volatile__("vpadd.f32 d31, d31, d30" : : : "d31"); // tree add1
1319+ __asm__ __volatile__("vpadd.f32 d1, d31, d31" : : : "d1"); // tree add2
1320+
1321+ __asm__ __volatile__("vrecpe.f32 d1, d1" : : : "d1"); // 1/d1[0]
1322+
1323+ __asm__ __volatile__("vmov.f32 %0, d1[0]" : "=r"(solverConstraint.m_jacDiagABInv) : : "memory"); // 1/sum
1324+
1325+ __asm__ __volatile__("vmov.32 d28, d0" : : : "d28" );
1326+#else//BT_USE_NEON
9061327 btVector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass();
9071328 btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
9081329 btVector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal?
@@ -914,6 +1335,7 @@
9141335 sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
9151336
9161337 solverConstraint.m_jacDiagABInv = btScalar(1.)/sum;
1338+#endif//BT_USE_NEON
9171339 }
9181340
9191341
@@ -963,6 +1385,7 @@
9631385 int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
9641386
9651387 ///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
1388+/* disable randomize
9661389 m_orderTmpConstraintPool.resize(numConstraintPool);
9671390 m_orderFrictionConstraintPool.resize(numFrictionPool);
9681391 {
@@ -976,6 +1399,7 @@
9761399 m_orderFrictionConstraintPool[i] = i;
9771400 }
9781401 }
1402+*/
9791403
9801404 return 0.f;
9811405
@@ -989,6 +1413,7 @@
9891413
9901414 int j;
9911415
1416+ /* disable randomize
9921417 if (infoGlobal.m_solverMode & SOLVER_RANDMIZE_ORDER)
9931418 {
9941419 if ((iteration & 7) == 0) {
@@ -1007,6 +1432,7 @@
10071432 }
10081433 }
10091434 }
1435+ */
10101436
10111437 if (infoGlobal.m_solverMode & SOLVER_SIMD)
10121438 {
@@ -1026,7 +1452,8 @@
10261452 int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
10271453 for (j=0;j<numPoolConstraints;j++)
10281454 {
1029- const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1455+// const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1456+ const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
10301457 resolveSingleConstraintRowLowerLimitSIMD(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
10311458
10321459 }
@@ -1034,7 +1461,8 @@
10341461 int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
10351462 for (j=0;j<numFrictionPoolConstraints;j++)
10361463 {
1037- btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
1464+// btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
1465+ btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[j];
10381466 btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
10391467
10401468 if (totalImpulse>btScalar(0))
@@ -1063,7 +1491,8 @@
10631491 int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
10641492 for (j=0;j<numPoolConstraints;j++)
10651493 {
1066- const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1494+// const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1495+ const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
10671496 resolveSingleConstraintRowLowerLimit(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
10681497 }
10691498 ///solve all friction constraints
@@ -1070,7 +1499,8 @@
10701499 int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
10711500 for (j=0;j<numFrictionPoolConstraints;j++)
10721501 {
1073- btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
1502+// btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
1503+ btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[j];
10741504 btScalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
10751505
10761506 if (totalImpulse>btScalar(0))
@@ -1100,7 +1530,8 @@
11001530 int j;
11011531 for (j=0;j<numPoolConstraints;j++)
11021532 {
1103- const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1533+// const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1534+ const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
11041535
11051536 resolveSplitPenetrationSIMD(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
11061537 }
@@ -1116,7 +1547,8 @@
11161547 int j;
11171548 for (j=0;j<numPoolConstraints;j++)
11181549 {
1119- const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1550+// const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
1551+ const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
11201552
11211553 resolveSplitPenetrationImpulseCacheFriendly(*solveManifold.m_solverBodyA,*solveManifold.m_solverBodyB,solveManifold);
11221554 }
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h (revision 110)
@@ -32,8 +32,8 @@
3232 btConstraintArray m_tmpSolverContactConstraintPool;
3333 btConstraintArray m_tmpSolverNonContactConstraintPool;
3434 btConstraintArray m_tmpSolverContactFrictionConstraintPool;
35- btAlignedObjectArray<int> m_orderTmpConstraintPool;
36- btAlignedObjectArray<int> m_orderFrictionConstraintPool;
35+// btAlignedObjectArray<int> m_orderTmpConstraintPool;
36+// btAlignedObjectArray<int> m_orderFrictionConstraintPool;
3737 btAlignedObjectArray<btTypedConstraint::btConstraintInfo1> m_tmpConstraintSizesPool;
3838
3939 void setupFrictionConstraint( btSolverConstraint& solverConstraint, const btVector3& normalAxis,btRigidBody* solverBodyA,btRigidBody* solverBodyIdB,
@@ -59,7 +59,7 @@
5959 void convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal);
6060
6161
62- void resolveSplitPenetrationSIMD(
62+ SIMD_FORCE_INLINE void resolveSplitPenetrationSIMD(
6363 btRigidBody& body1,
6464 btRigidBody& body2,
6565 const btSolverConstraint& contactConstraint);
@@ -74,11 +74,11 @@
7474
7575 void resolveSingleConstraintRowGeneric(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
7676
77- void resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
77+ SIMD_FORCE_INLINE void resolveSingleConstraintRowGenericSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
7878
7979 void resolveSingleConstraintRowLowerLimit(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
8080
81- void resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
81+ SIMD_FORCE_INLINE void resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& contactConstraint);
8282
8383 protected:
8484 static btRigidBody& getFixedBody();
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btSolverBody.h (revision 110)
@@ -28,8 +28,27 @@
2828 #define USE_SIMD 1
2929 #endif //
3030
31+#ifdef BT_USE_NEON
32+#define USE_SIMD 1
3133
34+typedef float32x4_t __m128;
35+
36+#define _mm_set1_ps vdupq_n_f32
37+#define _mm_mul_ps vmulq_f32
38+#define _mm_add_ps vaddq_f32
39+#define _mm_sub_ps vsubq_f32
40+#define _mm_and_ps(X, Y) vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(X), vreinterpretq_s32_f32(Y)))
41+#define _mm_or_ps(X, Y) vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(X), vreinterpretq_s32_f32(Y)))
42+#define _mm_andnot_ps(X, Y) vreinterpretq_f32_s32(vbicq_u32(vreinterpretq_s32_f32(X), vreinterpretq_s32_f32(Y)))
43+#define _mm_cmplt_ps(X, Y) vreinterpretq_f32_s32(vcltq_f32((X).get128(), (Y)))
44+#define _mm_shuffle_ps(x, y, z) vdupq_n_f32(*((float*)&(x) + z))
45+#define _MM_SHUFFLE(a, b, c, d) (a)
46+
47+#endif
48+
49+
3250 #ifdef USE_SIMD
51+#ifdef BT_USE_SSE
3352
3453 struct btSimdScalar
3554 {
@@ -99,9 +118,11 @@
99118 return btSimdScalar(_mm_add_ps(v1.get128(),v2.get128()));
100119 }
101120
102-
103121 #else
104122 #define btSimdScalar btScalar
123+#endif//BT_USE_SSE
124+#else
125+#define btSimdScalar btScalar
105126 #endif
106127
107128 ///The btSolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/BulletDynamics/ConstraintSolver/btGeneric6DofConstraint.cpp (revision 110)
@@ -23,6 +23,7 @@
2323 #include "BulletDynamics/Dynamics/btRigidBody.h"
2424 #include "LinearMath/btTransformUtil.h"
2525 #include "LinearMath/btTransformUtil.h"
26+#include "LinearMath/btQuickprof.h"
2627 #include <new>
2728
2829
@@ -525,6 +526,7 @@
525526
526527 void btGeneric6DofConstraint::getInfo1 (btConstraintInfo1* info)
527528 {
529+ BT_PROFILE("getInfo1");
528530 if (m_useSolveConstraintObsolete)
529531 {
530532 info->m_numConstraintRows = 0;
@@ -574,6 +576,8 @@
574576
575577 void btGeneric6DofConstraint::getInfo2 (btConstraintInfo2* info)
576578 {
579+ BT_PROFILE("getInfo2");
580+
577581 btAssert(!m_useSolveConstraintObsolete);
578582
579583 const btTransform& transA = m_rbA.getCenterOfMassTransform();
@@ -781,6 +785,7 @@
781785 const btTransform& transA,const btTransform& transB,const btVector3& linVelA,const btVector3& linVelB,const btVector3& angVelA,const btVector3& angVelB,
782786 btConstraintInfo2 *info, int row, btVector3& ax1, int rotational,int rotAllowed)
783787 {
788+ BT_PROFILE("get_limit_motor_info2");
784789 int srow = row * info->rowskip;
785790 int powered = limot->m_enableMotor;
786791 int limit = limot->m_currentLimit;
@@ -801,6 +806,129 @@
801806 {
802807 if (m_useOffsetForConstraintFrame)
803808 {
809+#undef BT_USE_NEON
810+#ifdef BT_USE_NEON
811+/*
812+ float32x4_t relB = vsubq_f32(m_calculatedTransformB.getOrigin().get128(), transB.getOrigin().get128());
813+ float32x4_t dotb_tmp = vmulq_f32(relB, ax1.get128());
814+ float32x2_t dotb_tmp2 = vpadd_f32(vget_high_f32(dotb_tmp), vget_low_f32(dotb_tmp));
815+ float32x2_t dotb_tmp3 = vpadd_f32(dotb_tmp2, dotb_tmp2);
816+
817+ float32x4_t projB = vmulq_lane_f32(ax1.get128(), dotb_tmp3, 0);
818+ float32x4_t orthoB = vsubq_f32(relB, projB);
819+
820+ float32x4_t relA = vsubq_f32(m_calculatedTransformA.getOrigin().get128(), transA.getOrigin().get128());
821+ float32x4_t dota_tmp = vmulq_f32(relA, ax1.get128());
822+ float32x2_t dota_tmp2 = vpadd_f32(vget_high_f32(dota_tmp), vget_low_f32(dota_tmp));
823+ float32x2_t dota_tmp3 = vpadd_f32(dota_tmp2, dota_tmp2);
824+
825+ float32x4_t projA = vmulq_lane_f32(ax1.get128(), dota_tmp3, 0);
826+ float32x4_t orthoA = vsubq_f32(relA, projA);
827+
828+ float32x2_t desiredOffs = vdup_n_f32(limot->m_currentPosition - limot->m_currentLimitError);
829+
830+ float32x4_t totalDist = vsubq_f32(projA, projB);
831+ totalDist = vmlaq_lane_f32(totalDist, ax1.get128(), desiredOffs, 0);
832+
833+ orthoA = vmlaq_lane_f32(orthoA, totalDist, vdup_n_f32(m_factA), 0);
834+ orthoB = vmlsq_lane_f32(orthoB, totalDist, vdup_n_f32(m_factB), 0);
835+
836+ btVector3 tmpA = btVector3(orthoA).cross(ax1);
837+ btVector3 tmpB = btVector3(orthoB).cross(ax1);
838+
839+*/
840+ __asm__ __volatile__("vldr.32 d14, %0" : : "Uv"(ax1.m_floats[0]) : "d14", "q7", "s28", "s29");
841+ __asm__ __volatile__("vldr.32 d15, %0" : : "Uv"(ax1.m_floats[2]) : "d15", "q7", "s30", "s31");
842+ __asm__ __volatile__("vldr.32 d16, %0" : : "Uv"(m_calculatedTransformA.getOrigin().m_floats[0]) : "d16", "q8");
843+ __asm__ __volatile__("vldr.32 d17, %0" : : "Uv"(m_calculatedTransformA.getOrigin().m_floats[2]) : "d17", "q8");
844+ __asm__ __volatile__("vldr.32 d18, %0" : : "Uv"(transA.getOrigin().m_floats[0]) : "d18", "q9");
845+ __asm__ __volatile__("vldr.32 d19, %0" : : "Uv"(transA.getOrigin().m_floats[2]) : "d19", "q9");
846+ __asm__ __volatile__("vldr.32 d20, %0" : : "Uv"(m_calculatedTransformB.getOrigin().m_floats[0]) : "d20", "q10");
847+ __asm__ __volatile__("vldr.32 d21, %0" : : "Uv"(m_calculatedTransformB.getOrigin().m_floats[2]) : "d21", "q10");
848+ __asm__ __volatile__("vldr.32 d22, %0" : : "Uv"(transB.getOrigin().m_floats[0]) : "d22", "q11");
849+ __asm__ __volatile__("vldr.32 d23, %0" : : "Uv"(transB.getOrigin().m_floats[2]) : "d23", "q11");
850+
851+ __asm__ __volatile__("vldr.32 d2, %0" : : "Uv"(limot->m_currentPosition) : "d2", "q1", "s4", "s5");
852+ __asm__ __volatile__("vldr.32 d3, %0" : : "Uv"(limot->m_currentLimitError) : "d3", "q1", "s6", "s7");
853+ __asm__ __volatile__("vldr.32 d4, %0" : : "Uv"(m_factA) : "d4", "q2", "s8", "s9");
854+ __asm__ __volatile__("vldr.32 d5, %0" : : "Uv"(m_factB) : "d5", "q2", "s10", "s11");
855+
856+
857+ __asm__ __volatile__("vsub.f32 q9, q8, q9" : : : "d18", "d19", "q9"); // relA
858+ __asm__ __volatile__("vmul.f32 q0, q9, q7" : : : "d0", "d1", "q0", "s0", "s1", "s2", "s3"); // dotA
859+ __asm__ __volatile__("vpadd.f32 d1, d1, d0" : : : "d1", "q0", "s2", "s3");
860+ __asm__ __volatile__("vpadd.f32 d1, d1, d1" : : : "d1", "q0", "s2", "s3");
861+
862+ __asm__ __volatile__("vmul.f32 q12, q7, d1[0]" : : : "d24", "d25", "q12"); // projA
863+ __asm__ __volatile__("vsub.f32 q4, q9, q12" : : : "d8", "d9", "q4", "s16", "s17", "s18", "s19"); // orthoA
864+
865+ __asm__ __volatile__("vsub.f32 q11, q10, q11" : : : "d22", "d23", "q11"); // relB
866+ __asm__ __volatile__("vmul.f32 q0, q11, q7" : : : "d0", "d1", "q0", "s0", "s1", "s2", "s3"); // dotB
867+ __asm__ __volatile__("vpadd.f32 d1, d1, d0" : : : "d1", "q0", "s2", "s3");
868+ __asm__ __volatile__("vpadd.f32 d1, d1, d1" : : : "d1", "q0", "s2", "s3");
869+
870+ __asm__ __volatile__("vmul.f32 q14, q7, d1[0]" : : : "d28", "d29", "q14"); // projB
871+ __asm__ __volatile__("vsub.f32 q5, q11, q14" : : : "d10", "d11", "q5", "s20", "s21", "s22", "s23"); // orthoB
872+// __asm__ __volatile__("vsub.f32 q15, q11, q14" : : : "d30", "d31", "q15"); // orthoB
873+
874+
875+ __asm__ __volatile__("vsub.f32 d0, d2, d3" : : : "d0", "q0", "s0", "s1"); // desiredOffs
876+
877+ __asm__ __volatile__("vsub.f32 q3, q12, q14" : : : "d6", "d7", "q3", "s12", "s13", "s14", "s15"); // totalDist
878+ __asm__ __volatile__("vmla.f32 q3, q7, d0[0]" : : : "d6", "d7", "q3", "s12", "s13", "s14", "s15");
879+ __asm__ __volatile__("vmla.f32 q4, q3, d4[0]" : : : "d8", "d9", "q4", "s16", "s17", "s18", "s19"); // relA (is orthoA
880+ __asm__ __volatile__("vmls.f32 q5, q3, d5[0]" : : : "d10", "d11", "q5", "s20", "s21", "s22", "s23"); // relB (is orthoB
881+// __asm__ __volatile__("vmls.f32 q15, q3, d5[0]" : : : "d30", "d31", "q15"); // relB (is orthoB
882+
883+ __asm__ __volatile__("veor.32 q0, q0, q0" : : : "d0", "d1", "q0", "s0", "s1", "s2", "s3"); // clear
884+ __asm__ __volatile__("veor.32 q1, q1, q1" : : : "d2", "d3", "q1", "s4", "s5", "s6", "s7"); // clear
885+
886+ __asm__ __volatile__("vmul.f32 s0, s17, s30" : : : "d0", "q0", "s0"); // tmpA.x
887+ __asm__ __volatile__("vmls.f32 s0, s18, s29" : : : "d0", "q0", "s0");
888+ __asm__ __volatile__("vmul.f32 s1, s18, s28" : : : "d0", "q0", "s1"); // tmpA.y
889+ __asm__ __volatile__("vmls.f32 s1, s16, s30" : : : "d0", "q0", "s1");
890+ __asm__ __volatile__("vmul.f32 s2, s16, s29" : : : "d1", "q0", "s2"); // tmpA.z
891+ __asm__ __volatile__("vmls.f32 s2, s17, s28" : : : "d1", "q0", "s2");
892+
893+ __asm__ __volatile__("vmul.f32 s4, s21, s30" : : : "d2", "q1", "s4"); // tmpB.x
894+ __asm__ __volatile__("vmls.f32 s4, s22, s29" : : : "d2", "q1", "s4");
895+ __asm__ __volatile__("vmul.f32 s5, s22, s28" : : : "d2", "q1", "s5"); // tmpB.y
896+ __asm__ __volatile__("vmls.f32 s5, s20, s30" : : : "d2", "q1", "s5");
897+ __asm__ __volatile__("vmul.f32 s6, s20, s29" : : : "d3", "q1", "s6"); // tmpB.z
898+ __asm__ __volatile__("vmls.f32 s6, s21, s28" : : : "d3", "q1", "s6");
899+
900+ // relA: s16, s17, s18
901+ // relB: s20, s21, s22
902+ // ax: s28, s29, s30
903+
904+ btVector3 tmpA, tmpB;
905+// __asm__ __volatile__("vstr.32 d0, %0" : : "Uv"(tmpA.m_floats[0]) : "memory");
906+// __asm__ __volatile__("vstr.32 d1, %0" : : "Uv"(tmpA.m_floats[2]) : "memory");
907+// __asm__ __volatile__("vstr.32 d2, %0" : : "Uv"(tmpB.m_floats[0]) : "memory");
908+// __asm__ __volatile__("vstr.32 d3, %0" : : "Uv"(tmpB.m_floats[2]) : "memory");
909+
910+ btVector3 orthoA, orthoB;
911+ __asm__ __volatile__("vstr.32 d8, %0" : : "Uv"(orthoA.m_floats[0]) : "memory");
912+ __asm__ __volatile__("vstr.32 d9, %0" : : "Uv"(orthoA.m_floats[2]) : "memory");
913+ __asm__ __volatile__("vstr.32 d10, %0" : : "Uv"(orthoB.m_floats[0]) : "memory");
914+ __asm__ __volatile__("vstr.32 d11, %0" : : "Uv"(orthoB.m_floats[2]) : "memory");
915+// __asm__ __volatile__("vstr.32 d30, %0" : : "Uv"(orthoB.m_floats[0]) : "memory");
916+// __asm__ __volatile__("vstr.32 d31, %0" : : "Uv"(orthoB.m_floats[2]) : "memory");
917+
918+ tmpA = orthoA.cross(ax1);
919+ tmpB = orthoB.cross(ax1);
920+
921+
922+
923+ if(m_hasStaticBody && (!rotAllowed))
924+ {
925+ tmpA *= m_factA;
926+ tmpB *= m_factB;
927+ }
928+ int i;
929+ for (i=0; i<3; i++) info->m_J1angularAxis[srow+i] = tmpA[i];
930+ for (i=0; i<3; i++) info->m_J2angularAxis[srow+i] = -tmpB[i];
931+#else//BT_USE_NEON
804932 btVector3 tmpA, tmpB, relA, relB;
805933 // get vector from bodyB to frameB in WCS
806934 relB = m_calculatedTransformB.getOrigin() - transB.getOrigin();
@@ -829,6 +957,7 @@
829957 int i;
830958 for (i=0; i<3; i++) info->m_J1angularAxis[srow+i] = tmpA[i];
831959 for (i=0; i<3; i++) info->m_J2angularAxis[srow+i] = -tmpB[i];
960+#endif//BT_USE_NEON
832961 } else
833962 {
834963 btVector3 ltd; // Linear Torque Decoupling vector
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/LinearMath/btScalar.h (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/LinearMath/btScalar.h (revision 110)
@@ -166,9 +166,33 @@
166166 #define btFullAssert(x)
167167 #define btLikely(_c) _c
168168 #define btUnlikely(_c) _c
169+
170+#else
169171
172+#ifdef BT_USE_NEON
173+
174+ #include <arm_neon.h>
175+ #define SIMD_FORCE_INLINE inline
176+ #define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
177+ #define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
178+ #define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
179+ #ifndef assert
180+ #include <assert.h>
181+ #endif
182+
183+#if defined(DEBUG) || defined (_DEBUG)
184+ #define btAssert assert
170185 #else
186+ #define btAssert(x)
187+#endif
171188
189+ //btFullAssert is optional, slows down a lot
190+ #define btFullAssert(x)
191+ #define btLikely(_c) _c
192+ #define btUnlikely(_c) _c
193+
194+#else
195+
172196 #define SIMD_FORCE_INLINE inline
173197 ///@todo: check out alignment methods for other platforms/compilers
174198 ///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
@@ -191,6 +215,9 @@
191215 #define btFullAssert(x)
192216 #define btLikely(_c) _c
193217 #define btUnlikely(_c) _c
218+
219+#endif // BT_USE_NEON
220+
194221 #endif //__APPLE__
195222
196223 #endif // LIBSPE2
--- branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/LinearMath/btVector3.h (revision 109)
+++ branches/bullet-neon-release-0.25/MikuMikuDroid/jni/bullet/LinearMath/btVector3.h (revision 110)
@@ -30,8 +30,6 @@
3030 #endif //BT_USE_DOUBLE_PRECISION
3131
3232
33-
34-
3533 /**@brief btVector3 can be used to represent 3D points and vectors.
3634 * It has an un-used w component to suit 16-byte alignment when btVector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
3735 * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
@@ -62,9 +60,28 @@
6260 {
6361 mVec128 = v128;
6462 }
63+#else//BT_USE_SSE
64+#ifdef BT_USE_NEON
65+ union {
66+ float32x4_t mVec128;
67+ btScalar m_floats[4];
68+ };
69+ SIMD_FORCE_INLINE float32x4_t get128() const
70+ {
71+ return mVec128;
72+ }
73+ SIMD_FORCE_INLINE void set128(float32x4_t v128)
74+ {
75+ mVec128 = v128;
76+ }
77+ SIMD_FORCE_INLINE btVector3(const float32x4_t& v128)
78+ {
79+ mVec128 = v128;
80+ }
6581 #else
6682 btScalar m_floats[4];
6783 #endif
84+#endif
6885 #endif //__CELLOS_LV2__ __SPU__
6986
7087 public:
@@ -336,10 +353,23 @@
336353 v2->setValue(-y() ,x() ,0.);
337354 }
338355
356+#ifdef BT_USE_NEON
357+/*
339358 void setZero()
340359 {
360+ mVec128 = vdupq_n_f32(0.0);
361+ }
362+*/
363+ void setZero()
364+ {
341365 setValue(btScalar(0.),btScalar(0.),btScalar(0.));
342366 }
367+#else
368+ void setZero()
369+ {
370+ setValue(btScalar(0.),btScalar(0.),btScalar(0.));
371+ }
372+#endif
343373
344374 SIMD_FORCE_INLINE bool isZero() const
345375 {
Show on old repository browser