#include "QuaternionExponentX87.h" #ifdef _CPU_X86 static const float fEpsilon = 1e-4f; ////////////////////////////////////////////////////////////////////////// // x87 asm optimized quaternion exponent // PARAMETERS: // pSrcVector[IN] - the vector to calculate the exponent for // pDstQuat [OUT]- the quaternion (exponent of the input) // NOTE: // The input vector mimics a quaternion with 0 real component (W) // This version uses FSINCOS, which takes ~70% of execution time ////////////////////////////////////////////////////////////////////////// void quaternionExponent_x87(const float* pSrc, float* pDst) { _asm { mov ESI, pSrc mov EDI, pDst // double d = sqrt( double(pSrc[0])*pSrc[1] + double(pSrc[1])*pSrc[1] + double(pSrc[2])*pSrc[2]); fld [ESI+8] fld [ESI+4] fld [ESI ] fld ST(0) fmul ST(0),ST(0) fld ST(2) fmul ST(0),ST(0) faddp ST(1),ST(0) fld ST(3) fmul ST(0),ST(0) faddp ST(1),ST(0) // ST(0): x^2+y^2+z^2 == d^2 // ST(1): x // ST(2): y // ST(3): z fld fEpsilon fcomip ST, ST(1) jnc small_rotation // this path is almost never taken fsqrt fld ST(0) // now we need cos, sin to replace the current value fsincos // STACK: cos, sin, sqrt, x, y, z fstp dword ptr [EDI] fdivrp ST(1),ST(0) // STACK: sin(d)/d, x,y,z fmul ST(1),ST(0) fmul ST(2),ST(0) fmulp ST(3),ST(0) fstp dword ptr [EDI+ 4] fstp dword ptr [EDI+ 8] fstp dword ptr [EDI+12] } return; _asm { small_rotation: fld1 fsubrp ST(1),ST(0) fstp dword ptr [EDI ] fstp dword ptr [EDI+4] fstp dword ptr [EDI+8] fstp dword ptr [EDI+12] } } static const float fDivBy2 = 1/2.0f, fDivBy3 = 1/3.0f, fDivBy4 = 1/4.0f, fDivBy5 = 1/5.0f, fDivBy6 = 1/6.0f, fDivBy7 = 1/7.0f, fDivBy8 = 1/8.0f, fDivBy9 = 1/9.0f; // Takes pSrc: the x,y,z of the imaginary part of the quaternion 0+xi+yj+zk to calculate the exponent // into pDst: the x,y,z,w of the resulting quaternion IN THAT ORDER void quaternionExponent_x87approx(const float* pSrc, float* pDst) { _asm { mov ESI, pSrc mov EDI, pDst // double d = sqrt( double(pSrc[0])*pSrc[1] + double(pSrc[1])*pSrc[1] + double(pSrc[2])*pSrc[2]); fld [ESI+8] fld [ESI+4] fld [ESI ] fld1 fld1 // STACK: 1 1 x y z fld ST(2) fmul ST(0),ST(0) fld ST(4) fmul ST(0),ST(0) faddp ST(1),ST(0) fld ST(5) fmul ST(0),ST(0) faddp ST(1),ST(0) // ST(0): x^2+y^2+z^2 == d^2 // ST(1): 1 // ST(2): 1 // ST(3): x // ST(4): y // ST(5): z // now we need cos, sin to replace the current value fld ST(0) // STACK: D^2(temp), D^2(const), 1(will be cos), 1(will be sin/D), x, y, z fmul fDivBy2 fsub ST(2),ST(0) fmul fDivBy3 fsub ST(3),ST(0) // STACK: D^2/3!, D^2, 1-D^2/2!(will be cos), 1-D^2/3!(will be sin/D), x, y, z fmul ST(0),ST(1) fmul fDivBy4 fadd ST(2),ST(0) fmul fDivBy5 fadd ST(3),ST(0) // STACK: D^4/5!, D^2, 1-D^2/2!+D^4/4!(will be cos), 1-D^2/3!+D^4/5!(will be sin/D), x, y, z fmul ST(0),ST(1) fmul fDivBy6 fsub ST(2),ST(0) fmul fDivBy7 fsub ST(3),ST(0) // STACK: D^6/7!, D^2, 1-D^2/2!+D^4/4!-D^6/6!, 1-D^2/3!+D^4/5!-D^6/7!, x, y, z // the last step fmulp ST(1),ST(0) // STACK: D^8/7!, 1-D^2/2!+D^4/4!-D^6/6!, 1-D^2/3!+D^4/5!-D^6/7!, x, y, z fmul fDivBy8 fadd ST(1),ST(0) fmul fDivBy9 faddp ST(2),ST(0) // STACK: 1-D^2/2!+D^4/4!-D^6/6!+D^8/8!, 1-D^2/3!+D^4/5!-D^6/7!+D^8/9!, x, y, z // cos(D), sin(D)/D,x,y,z fstp dword ptr [EDI] fmul ST(1),ST(0) fmul ST(2),ST(0) fmulp ST(3),ST(0) fstp dword ptr [EDI+ 4] fstp dword ptr [EDI+ 8] fstp dword ptr [EDI+12] } } #endif