Files
FC1/CryPhysics/matrixnm.cpp
romkazvo 34d6c5d489 123
2023-08-07 19:29:24 +08:00

765 lines
21 KiB
C++

//////////////////////////////////////////////////////////////////////
//
// NxM Matrix
//
// File: matrixnm.cpp
// Description : matrixnm template class implementation for float and real
//
// History:
// -:Created by Anton Knyazev
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "utils.h"
#ifdef USE_MATRIX_POOLS
DECLARE_MTXNxM_POOL(float,512)
DECLARE_VECTORN_POOL(float,256)
#ifndef REAL_IS_FLOAT
DECLARE_MTXNxM_POOL(real,512)
DECLARE_VECTORN_POOL(real,256)
#endif
#endif
int g_bHasSSE;
inline float getlothresh(float) { return 1E-10f; }
inline float gethithresh(float) { return 1E10f; }
inline double getlothresh(double) { return 1E-20; }
inline double gethithresh(double) { return 1E20; }
template<class ftype>
int matrix_tpl<ftype>::jacobi_transformation(matrix_tpl<ftype> &evec, ftype *eval, ftype prec) const
{
if (!(flags & mtx_symmetric) || nCols!=nRows) return 0;
matrix_tpl a(*this);
int n = nRows, p,q,r,iter,pmax,qmax,sz=nRows*nCols;
ftype theta,t,s,c,apr,aqr,arp,arq,thresh=prec,amax;
evec.identity();
evec.flags = (evec.flags & mtx_foreign_data) | mtx_orthogonal | mtx_normal;
for(iter=0; iter<nRows*nCols*10; iter++) {
for(p=0,amax=thresh,pmax=-1;p<n-1;p++) for(q=p+1;q<n;q++) if (sqr(a[p][q])>amax)
{ amax=sqr(a[p][q]); pmax=p; qmax=q; }
if (pmax==-1)
goto exitjacobi;
p=pmax; q=qmax;
theta = (ftype)0.5*(a[q][q]-a[p][p])/a[p][q];
if (fabs_tpl(theta)<gethithresh(theta)) {
t = sqrt_tpl(theta*theta+1);
if (theta>0) t = -theta-t;
else t -= theta;
c = 1/sqrt_tpl(1+t*t); s = t*c;
for(r=0;r<n;r++) { arp=a[r][p];arq=a[r][q]; a[r][p]=c*arp-s*arq; a[r][q]=c*arq+s*arp; }
for(r=0;r<n;r++) { apr=a[p][r];aqr=a[q][r]; a[p][r]=c*apr-s*aqr; a[q][r]=c*aqr+s*apr; }
for(r=0;r<n;r++) { apr=evec[p][r];aqr=evec[q][r]; evec[p][r]=c*apr-s*aqr; evec[q][r]=c*aqr+s*apr; }
}
a[p][q] = 0;
if (iter==sz+1) thresh += getlothresh(thresh);
}
iter=0;
exitjacobi:
for(p=0;p<n*n;p++) {
t = fabs_tpl(evec.data[p]);
if (t<(ftype)1E-6) evec.data[p]=0;
else if (fabs_tpl(t-1)<getlothresh(t))
evec.data[p]=sgnnz(evec.data[p]);
}
for(p=0;p<n;p++) eval[p] = a[p][p];
return iter; // not converged during iterations limit
}
template<class ftype>
int matrix_tpl<ftype>::conjugate_gradient(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
ftype a,b,r2,r2new,denom,maxel;
int i,iter=nRows*3;
minlen*=minlen;
ftype buf[24],*pbuf = nRows>8 ? new ftype[nRows*3]:buf;
vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),p(nRows,pbuf+nRows),Ap(nRows,pbuf+nRows*2);
(r=rh)-=*this*x; p=r;
r2=r.len2();
do {
Ap = *this*p; denom = p*Ap;
if (sqr(denom)<1E-30) break;
a = r2/denom;
#if defined(LINUX)
r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
#else
r -= Ap*a;
#endif
r2new=r.len2();
if (r2new>r2*500)
break;
#if defined(LINUX)
x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
x += p*a;
#endif
b = r2new/r2; r2=r2new;
(p*=b)+=r;
for(i=0,maxel=0;i<nRows;i++) maxel = max(maxel,fabs_tpl(r[i]));
} while (--iter && (r2new>minlen || maxel>minel));
if (pbuf!=buf) delete pbuf;
return nRows-iter;
}
template<class ftype>
int matrix_tpl<ftype>::biconjugate_gradient(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
ftype a,b,r2,r2new,err,denom,maxel;
int i,iter=nRows*3;
minlen*=minlen;
ftype buf[40],*pbuf = nRows>8 ? new ftype[nRows*5]:buf;
vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),rc(nRows,pbuf+nRows),p(nRows,pbuf+nRows*2),
pc(nRows,pbuf+nRows*3),Ap(nRows,pbuf+nRows*4);
(r=rh)-=*this*x; rc=r; p=r; pc=rc; r2 = r.len2();
do {
Ap = *this*p; denom = pc*Ap;
if (sqr(denom)<1E-30) break;
a = r2/denom;
#if defined(LINUX)
r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
rc = ::operator-=(rc, (const vectorn_tpl<ftype>&)((pc**this)*a));
x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
r -= Ap*a;
rc -= (pc**this)*a;
x += p*a;
#endif
r2new = rc*r;
b = r2new/r2; r2 = r2new;
(p*=b)+=r; (pc*=b)+=rc;
for(err=maxel=0,i=0;i<nRows;i++) {
err += sqr(r.data[i]);
maxel = max(maxel,fabs_tpl(r[i]));
}
} while(--iter && (err>minlen || maxel>minel) && sqr(r2)>1E-30);
if (pbuf!=buf) delete pbuf;
return nRows-iter;
}
template<class ftype>
int matrix_tpl<ftype>::minimum_residual(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
ftype a,b,r2,r2new,err,denom,maxel;
int i,iter=nRows*3;
minlen*=minlen;
ftype buf[40],*pbuf = nRows>8 ? new ftype[nRows*5]:buf;
vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),rc(nRows,pbuf+nRows),p(nRows,pbuf+nRows*2),
Ap(nRows,pbuf+nRows*4);
(r=rh)-=*this*x;
rc=*this*r;
p=r;
r2 = rc*r;
do {
Ap = *this*p; denom = Ap.len2();
if (sqr(denom)<1E-30) break;
a = r2/denom;
#if defined(LINUX)
r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
#else
r -= Ap*a;
#endif
rc = *this*r;
#if defined(LINUX)
x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
x += p*a;
#endif
r2new = rc*r;
b = r2new/r2; r2 = r2new;
(p*=b)+=r;
for(err=maxel=0,i=0;i<nRows;i++) {
err += sqr(r.data[i]);
maxel = max(maxel,fabs_tpl(r[i]));
}
} while(--iter && (err>minlen || maxel>minel) && sqr(r2)>1E-30);
if (pbuf!=buf) delete pbuf;
return nRows-iter;
}
template<class ftype>
int matrix_tpl<ftype>::LUdecomposition(ftype *&LUdata,int *&LUidx) const
{
if (nRows!=nCols) return 0;
int i,j,k,imax,alloc=(LUdata==0 | (LUidx==0)<<1);
ftype aij,bij,maxaij,t;
if (alloc & 1) LUdata = new ftype[nRows*nRows];
if (alloc & 2) LUidx = new int[nRows];
matrix_tpl<ftype> LU(nRows,nRows,0,LUdata);
LU = *this;
for(j=0;j<nRows;j++) {
for(i=0;i<=j;i++) {
for(k=0,bij=LU[i][j]; k<i; k++) bij -= LU[i][k]*LU[k][j];
LU[i][j] = bij;
}
for(maxaij=0,imax=j ;i<nRows;i++) {
for(k=0,aij=LU[i][j]; k<j; k++) aij -= LU[i][k]*LU[k][j];
LU[i][j] = aij;
if (aij*aij > maxaij*maxaij)
{ maxaij=aij; imax=i; }
}
LUidx[j] = imax;
if (j==nRows-1 && LU[j][j]!=0) break; // no aij in this case
if (maxaij==0) { // the matrix is singular
if (alloc & 1) delete[] LUdata;
if (alloc & 2) delete[] LUidx;
return 0;
}
if (imax!=j) for(k=0;k<nRows;k++)
{ t=LU[imax][k]; LU[imax][k]=LU[j][k]; LU[j][k]=t; }
maxaij = (ftype)1.0/maxaij;
for(i=j+1;i<nRows;i++)
LU[i][j] *= maxaij;
}
return 1;
}
template<class ftype>
int matrix_tpl<ftype>::solveAx_b(ftype *x,ftype *b, ftype *LUdata,int *LUidx) const
{
int LUidx_buf[16],alloc=0;
if (!LUdata) {
if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
mtx_pool_pos = 0;
LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
}
if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
alloc = LUdata==0 | (LUidx==0)<<1;
if (!LUdecomposition(LUdata,LUidx)) return 0;
}
int i,j; ftype xi;
matrix_tpl<ftype> LU(nRows,nRows,0,LUdata);
for(i=0;i<nRows;i++) x[i]=b[i];
for(i=0;i<nRows;i++) {
xi=x[i]; x[i]=x[LUidx[i]]; x[LUidx[i]]=xi;
for(j=0;j<i;j++) x[i]-=LU[i][j]*x[j];
}
for(i=nRows-1;i>=0;i--) {
for(j=i+1;j<nRows;j++) x[i]-=LU[i][j]*x[j];
x[i] /= LU[i][i];
}
if (alloc & 1) delete[] LUdata;
if (alloc & 2) delete[] LUidx;
return 1;
}
template<class ftype>
matrix_tpl<ftype>& matrix_tpl<ftype>::invert()
{
if (flags & mtx_orthogonal)
return transpose();
if (nRows!=nCols)
return *this;
int i,j; ftype det=0;
if (nRows==1)
data[0]=(ftype)1.0/data[0];
else if (nRows==2) {
det = data[0]*data[3]-data[1]*data[2];
if (det==0) return *this; det = (ftype)1.0/det;
ftype oldata[4]; for(i=0;i<4;i++) oldata[i]=data[i];
data[0]=oldata[3]*det; data[1]=-oldata[1]*det;
data[2]=-oldata[2]*det; data[3]=oldata[0]*det;
} else if (nRows==3) {
for(i=0;i<3;i++) det+=data[i]*data[inc_mod3[i]+3]*data[dec_mod3[i]+6];
for(i=0;i<3;i++) det-=data[dec_mod3[i]]*data[inc_mod3[i]+3]*data[i+6];
if (det==0) return *this; det = (ftype)1.0/det;
ftype oldata[9]; for(i=0;i<9;i++) oldata[i]=data[i];
for(i=0;i<3;i++) for(j=0;j<3;j++)
data[i+j*3] = (oldata[dec_mod3[i]*3+dec_mod3[j]]*oldata[inc_mod3[i]*3+inc_mod3[j]]-
oldata[dec_mod3[i]*3+inc_mod3[j]]*oldata[inc_mod3[i]*3+dec_mod3[j]])*det;
} else {
ftype *LUdata=0,*colmark; int *LUidx=0,LUidx_buf[32], alloc=0;
if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
mtx_pool_pos = 0;
LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
} else alloc = 1;
if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
else alloc |= 2;
if (!LUdecomposition(LUdata,LUidx)) return *this;
if (nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
if (mtx_pool_pos+nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
mtx_pool_pos=0;
colmark=mtx_pool+mtx_pool_pos; mtx_pool_pos+=nRows;
} else {
colmark = new ftype[nRows]; alloc |= 4;
}
for(i=0;i<nRows;i++) colmark[i]=0;
for(i=0;i<nRows;i++) {
colmark[i]=1; solveAx_b(data+i*nRows,colmark, LUdata,LUidx); colmark[i]=0;
}
transpose();
if (alloc & 1) delete[] LUdata;
if (alloc & 2) delete[] LUidx;
if (alloc & 4) delete[] colmark;
}
flags = flags & (mtx_normal | mtx_orthogonal | mtx_symmetric | mtx_PD | mtx_PSD | mtx_diagonal | mtx_identity | mtx_foreign_data);
return *this;
}
template<class ftype>
ftype matrix_tpl<ftype>::determinant(ftype *LUdata,int *LUidx) const
{
if (nRows!=nCols) return 0;
int i,j; ftype det=0;
if (nRows==1)
det = data[0];
else if (nRows==2)
det = data[0]*data[3]-data[1]*data[2];
else if (nRows==3) {
for(i=0;i<3;i++) det+=data[i]*data[inc_mod3[i]+3]*data[dec_mod3[i]+6];
for(i=0;i<3;i++) det-=data[dec_mod3[i]]*data[inc_mod3[i]+3]*data[i+6];
} else if (LUdecomposition(LUdata,LUidx)) {
ftype *LUdata=0; int *LUidx=0,LUidx_buf[32], alloc=0;
if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
mtx_pool_pos = 0;
LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
} else alloc = 1;
if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
else alloc |= 2;
if (!LUdecomposition(LUdata,LUidx)) return 0;
for(i=j=0,det=1;i<nRows;i++,j+=nRows+1) det*=LUdata[j];
for(i=j=0;i<nRows;i++) j+=LUidx[i]!=i;
if (j&2) det*=-1;
if (alloc & 1) delete[] LUdata;
if (alloc & 2) delete[] LUidx;
}
return det;
}
// Linear programming with simplex method
template<class ftype>
int matrix_tpl<ftype>::LPsimplex(int m1,int m2, ftype &objfunout,ftype *xout, int nvars, ftype e) const
{
int M=nRows-2,N=nCols-1,i,j,imax,jmax,iobjfun=M, res=0,iter=(M+N)*8;
ftype rpivot,t;
const int imask=0x7FFFFFFF;
int *irow=new int[M],*icol=new int[N];
#define a (*this)
if (e<0) {
for(i=nRows*nCols-1,e=0;i>=0;i--) e+=data[i];
e *= (ftype)1E-6/(nRows*nCols);
}
if (nvars<0) nvars = N+m1+m2;
if (m1<M) {
iobjfun++;
for(j=0;j<N;j++) {
for(t=0,i=0;i<M;i++) t-=a[i][j];
a[iobjfun][j] = t;
}
}
for(i=0;i<N;i++) icol[i]=i;
for(i=0;i<M;i++) irow[i]=N+i|~imask;
do {
for(t=0,j=0,jmax=-1;j<N;j++) if(icol[j]<nvars && a[iobjfun][j]>t)
{ jmax=j; t=a[iobjfun][j]; }
if (jmax<0)
{ res=1; break; }
for(imax=0; imax<M && a[imax][jmax]>0; imax++);
if (imax==M)
{ res=2; break; }
for(i=imax+1;i<M;i++) if (a[i][jmax]<0 && a[i][N]*a[imax][jmax] < a[imax][N]*a[i][jmax])
imax=i;
varexchange:
rpivot = (ftype)1.0/a[imax][jmax];
for(j=0;j<nCols;j++) a[imax][j]*=-rpivot;
a[imax][jmax] = rpivot;
for(i=0;i<nRows;i++) if(i!=imax) {
a[i][jmax] *= rpivot;
for(j=0;j<nCols;j++) if(j!=jmax)
a[i][j] -= a[imax][j]*a[i][jmax];
}
if ((irow[imax]&imask)>=N+m1 && (irow[imax]&imask)<N+m1+m2 && (irow[imax]&~imask)!=0) {
a[iobjfun][jmax] += 1; // remove slack variable from objective function, nonnegativity constraint will enforce feasibility
for(i=0;i<=M;i++) a[i][jmax]*=-1;
}
i=irow[imax]&imask; irow[imax]=icol[jmax]; icol[jmax]=i;
} while(--iter);
if (m1<M) {
if (res==2 || sqr(a[M][N])<e*e)
res = 0;
else {
for(i=0;i<M;i++) {
if ((irow[i]&imask)>M+m1+m2) {
for(jmax=0;jmax<N && icol[jmax]>=nvars;jmax++);
imax=i; goto varexchange;
} else if ((irow[i]&imask)>N+m1 && (irow[i]&~imask)!=0) {
for(j=0;j<N;j++) a[i][j]*=-1;
irow[i]&=imask;
}
}
res = LPsimplex(M,0, objfunout,xout, nvars, e);
}
} else {
if (xout) {
for(i=0;i<N;i++) xout[i]=0;
for(i=0;i<M;i++) if((irow[i]&imask)<N)
xout[irow[i]&imask] = a[i][N];
}
objfunout = a[iobjfun][N];
}
delete[] irow; delete[] icol;
return res;
#undef a
}
#ifdef PIII_SSE
// TODO: AMD64: put into a separate file
// SSE-optimized code from "Streaming SIMD Extensions - Matrix Multiplication" document by Intel Corp.
__declspec(naked) void PIII_Mult00_6x6_6x6(float *src1, float *src2, float *dst)
{
__asm {
mov ecx, dword ptr [esp+8] // src2
movlps xmm3, qword ptr [ecx+72]
mov edx, dword ptr [esp+4] // src1
// Loading first 4 columns (upper 4 rows) of src2.
movaps xmm0, xmmword ptr [ecx]
movlps xmm1, qword ptr [ecx+24]
movhps xmm1, qword ptr [ecx+32]
movaps xmm2, xmmword ptr [ecx+48]
movhps xmm3, qword ptr [ecx+80]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm4, dword ptr [edx]
movss xmm5, dword ptr [edx+4]
mov eax, dword ptr [esp+0Ch] // dst
shufps xmm4, xmm4, 0
movss xmm6, dword ptr [edx+8]
shufps xmm5, xmm5, 0
movss xmm7, dword ptr [edx+12]
mulps xmm4, xmm0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm5, xmm4
mulps xmm7, xmm3
addps xmm6, xmm5
addps xmm7, xmm6
movaps xmmword ptr [eax], xmm7
// Calculating first 4 elements in the second row of the destination matrix.
movss xmm4, dword ptr [edx+24]
shufps xmm4, xmm4, 0
mulps xmm4, xmm0
movss xmm5, dword ptr [edx+28]
shufps xmm5, xmm5, 0
mulps xmm5, xmm1
movss xmm6, dword ptr [edx+32]
shufps xmm6, xmm6, 0
movss xmm7, dword ptr [edx+36]
shufps xmm7, xmm7, 0
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm7, xmm6
addps xmm5, xmm4
addps xmm7, xmm5
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm4, dword ptr [edx+48]
movss xmm5, dword ptr [edx+52]
movlps qword ptr [eax+24], xmm7 // save 2nd
movhps qword ptr [eax+32], xmm7 // row
movss xmm6, dword ptr [edx+56]
movss xmm7, dword ptr [edx+60]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+48], xmm7
// Calculating first 4 elements in the fourth row of the destination matrix.
movss xmm4, dword ptr [edx+72]
movss xmm5, dword ptr [edx+76]
movss xmm6, dword ptr [edx+80]
movss xmm7, dword ptr [edx+84]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movlps qword ptr [eax+72], xmm7
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm4, dword ptr [edx+96]
movss xmm5, dword ptr [edx+100]
movss xmm6, dword ptr [edx+104]
movss xmm7, dword ptr [edx+108]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+96], xmm7
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+120]
movss xmm5, dword ptr [edx+124]
movss xmm6, dword ptr [edx+128]
movss xmm7, dword ptr [edx+132]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
// Loading first 4 columns (lower 2 rows) of src2.
movlps xmm0, qword ptr [ecx+96]
movhps xmm0, qword ptr [ecx+104]
movlps xmm1, qword ptr [ecx+120]
movhps xmm1, qword ptr [ecx+128]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm2, dword ptr [edx+16]
shufps xmm2, xmm2, 0
movss xmm4, dword ptr [edx+40]
movss xmm3, dword ptr [edx+20]
movss xmm5, dword ptr [edx+44]
movaps xmm6, xmmword ptr [eax]
movlps xmm7, qword ptr [eax+24]
shufps xmm3, xmm3, 0
shufps xmm5, xmm5, 0
movhps xmm7, qword ptr [eax+32]
shufps xmm4, xmm4, 0
mulps xmm5, xmm1
mulps xmm2, xmm0
mulps xmm3, xmm1
mulps xmm4, xmm0
addps xmm6, xmm2
addps xmm7, xmm4
addps xmm7, xmm5
addps xmm6, xmm3
movlps qword ptr [eax+24], xmm7
movaps xmmword ptr [eax], xmm6
movhps qword ptr [eax+32], xmm7
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm2, dword ptr [edx+64]
movss xmm4, dword ptr [edx+88]
movss xmm5, dword ptr [edx+92]
movss xmm3, dword ptr [edx+68]
movaps xmm6, xmmword ptr [eax+48]
movlps xmm7, qword ptr [eax+72]
movhps xmm7, qword ptr [eax+80]
shufps xmm2, xmm2, 0
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
addps xmm7, xmm4
addps xmm7, xmm5
movlps qword ptr [eax+72], xmm7
movaps xmmword ptr [eax+48], xmm6
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm2, dword ptr [edx+112]
movss xmm3, dword ptr [edx+116]
movaps xmm6, xmmword ptr [eax+96]
shufps xmm2, xmm2, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
movaps xmmword ptr [eax+96], xmm6
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+136]
movss xmm5, dword ptr [edx+140]
movhps xmm7, qword ptr [eax+128]
movlps xmm7, qword ptr [eax+120]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
addps xmm7, xmm4
addps xmm7, xmm5
// Calculating last 2 columns of the destination matrix.
movlps xmm0, qword ptr [ecx+16]
movhps xmm0, qword ptr [ecx+40]
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
movlps xmm2, qword ptr [ecx+64]
movhps xmm2, qword ptr [ecx+88]
movaps xmm3, xmm2
shufps xmm3, xmm3, 4Eh
movlps xmm4, qword ptr [ecx+112]
movhps xmm4, qword ptr [ecx+136]
movaps xmm5, xmm4
shufps xmm5, xmm5, 4Eh
movlps xmm6, qword ptr [edx]
movhps xmm6, qword ptr [edx+24]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+8]
movhps xmm6, qword ptr [edx+32]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movhps xmm6, qword ptr [edx+40]
movlps xmm6, qword ptr [edx+16]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+16], xmm7
movhps qword ptr [eax+40], xmm7
movlps xmm6, qword ptr [edx+48]
movhps xmm6, qword ptr [edx+72]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movhps xmm6, qword ptr [edx+80]
movlps xmm6, qword ptr [edx+56]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+64]
movhps xmm6, qword ptr [edx+88]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+64], xmm7
movhps qword ptr [eax+88], xmm7
movlps xmm6, qword ptr [edx+96]
movhps xmm6, qword ptr [edx+120]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+104]
movhps xmm6, qword ptr [edx+128]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+112]
movhps xmm6, qword ptr [edx+136]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+112], xmm7
movhps qword ptr [eax+136], xmm7
ret }
}
#endif
void ForceCompilerToGenerateCodeForMatrixTemplates()
{
matrixf mtxf; float f,*pf; int *pi;
mtxf.jacobi_transformation(mtxf,0);
mtxf.conjugate_gradient(0,0);
mtxf.biconjugate_gradient(0,0);
mtxf.minimum_residual(0,0);
mtxf.LPsimplex(0,0,f);
mtxf.LUdecomposition(pf,pi);
mtxf.solveAx_b(pf,pf);
mtxf.determinant();
mtxf.invert();
matrix mtxr; real r,*pr;
mtxr.jacobi_transformation(mtxr,0);
mtxr.conjugate_gradient(0,0);
mtxr.biconjugate_gradient(0,0);
mtxr.minimum_residual(0,0);
mtxr.LPsimplex(0,0,r);
mtxr.LUdecomposition(pr,pi);
mtxr.solveAx_b(pr,pr);
mtxr.determinant();
mtxr.invert();
}