FC1/CryPhysics/matrixnm.cpp

//////////////////////////////////////////////////////////////////////
//
//	NxM Matrix
//
//	File: matrixnm.cpp
//	Description : matrixnm template class implementation for float and real
//
//	History:
//	-:Created by Anton Knyazev
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"

#include "utils.h"

#ifdef USE_MATRIX_POOLS
	DECLARE_MTXNxM_POOL(float,512)
	DECLARE_VECTORN_POOL(float,256)
#ifndef REAL_IS_FLOAT
	DECLARE_MTXNxM_POOL(real,512)
	DECLARE_VECTORN_POOL(real,256)
#endif
#endif

int g_bHasSSE;
inline float getlothresh(float) { return 1E-10f; }
inline float gethithresh(float) { return 1E10f; }
inline double getlothresh(double) { return 1E-20; }
inline double gethithresh(double) { return 1E20; }

template<class ftype>
int matrix_tpl<ftype>::jacobi_transformation(matrix_tpl<ftype> &evec, ftype *eval, ftype prec) const
{
	if (!(flags & mtx_symmetric) || nCols!=nRows) return 0;

	matrix_tpl a(*this);
	int n = nRows, p,q,r,iter,pmax,qmax,sz=nRows*nCols;
	ftype theta,t,s,c,apr,aqr,arp,arq,thresh=prec,amax;
	evec.identity();
	evec.flags = (evec.flags & mtx_foreign_data) | mtx_orthogonal | mtx_normal;

	for(iter=0; iter<nRows*nCols*10; iter++) {
		for(p=0,amax=thresh,pmax=-1;p<n-1;p++) for(q=p+1;q<n;q++)	if (sqr(a[p][q])>amax)
		{	amax=sqr(a[p][q]); pmax=p; qmax=q; }
		if (pmax==-1)
			goto exitjacobi;
		p=pmax; q=qmax;
		theta = (ftype)0.5*(a[q][q]-a[p][p])/a[p][q];
		if (fabs_tpl(theta)<gethithresh(theta)) {
			t = sqrt_tpl(theta*theta+1);
			if (theta>0) t = -theta-t;
			else t -= theta;
			c = 1/sqrt_tpl(1+t*t); s = t*c;
			for(r=0;r<n;r++) { arp=a[r][p];arq=a[r][q]; a[r][p]=c*arp-s*arq; a[r][q]=c*arq+s*arp; }
			for(r=0;r<n;r++) { apr=a[p][r];aqr=a[q][r]; a[p][r]=c*apr-s*aqr; a[q][r]=c*aqr+s*apr; }
			for(r=0;r<n;r++) { apr=evec[p][r];aqr=evec[q][r]; evec[p][r]=c*apr-s*aqr; evec[q][r]=c*aqr+s*apr; }
		}
		a[p][q] = 0;
		if (iter==sz+1) thresh += getlothresh(thresh);
	}
	iter=0;
	exitjacobi:
	for(p=0;p<n*n;p++) {
		t = fabs_tpl(evec.data[p]);
		if (t<(ftype)1E-6) evec.data[p]=0;
		else if (fabs_tpl(t-1)<getlothresh(t))
			evec.data[p]=sgnnz(evec.data[p]);
	}
	for(p=0;p<n;p++) eval[p] = a[p][p];
	return iter; // not converged during iterations limit
}

template<class ftype>
int matrix_tpl<ftype>::conjugate_gradient(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
	ftype a,b,r2,r2new,denom,maxel;
	int i,iter=nRows*3;
	minlen*=minlen;

	ftype buf[24],*pbuf = nRows>8 ? new ftype[nRows*3]:buf;
	vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),p(nRows,pbuf+nRows),Ap(nRows,pbuf+nRows*2);
	(r=rh)-=*this*x; p=r;
	r2=r.len2();

	do {
		Ap = *this*p; denom = p*Ap;
		if (sqr(denom)<1E-30) break;
		a = r2/denom;
#if defined(LINUX)
		r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
#else
		r -= Ap*a;
#endif
		r2new=r.len2();
		if (r2new>r2*500)
			break;
#if defined(LINUX)
		x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
		x += p*a;
#endif
		b = r2new/r2; r2=r2new;
		(p*=b)+=r;
		for(i=0,maxel=0;i<nRows;i++) maxel = max(maxel,fabs_tpl(r[i]));
	} while (--iter && (r2new>minlen || maxel>minel));

	if (pbuf!=buf) delete pbuf;
	return nRows-iter;
}

template<class ftype>
int matrix_tpl<ftype>::biconjugate_gradient(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
	ftype a,b,r2,r2new,err,denom,maxel;
	int i,iter=nRows*3;
	minlen*=minlen;

	ftype buf[40],*pbuf = nRows>8 ? new ftype[nRows*5]:buf;
	vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),rc(nRows,pbuf+nRows),p(nRows,pbuf+nRows*2),
		pc(nRows,pbuf+nRows*3),Ap(nRows,pbuf+nRows*4);
	(r=rh)-=*this*x; rc=r; p=r; pc=rc; r2 = r.len2();

	do {
		Ap = *this*p;	denom = pc*Ap;
		if (sqr(denom)<1E-30) break;
		a = r2/denom;

#if defined(LINUX)
		r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
		rc = ::operator-=(rc, (const vectorn_tpl<ftype>&)((pc**this)*a));
		x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
		r -= Ap*a;
		rc -= (pc**this)*a;
		x += p*a;
#endif
		r2new = rc*r;
		b = r2new/r2;	r2 = r2new;
		(p*=b)+=r; (pc*=b)+=rc;
		for(err=maxel=0,i=0;i<nRows;i++) {
			err += sqr(r.data[i]);
			maxel = max(maxel,fabs_tpl(r[i]));
		}
	} while(--iter && (err>minlen || maxel>minel) && sqr(r2)>1E-30);

	if (pbuf!=buf) delete pbuf;
	return nRows-iter;
}

template<class ftype>
int matrix_tpl<ftype>::minimum_residual(ftype *startx,ftype *rightside, ftype minlen,ftype minel) const
{
	ftype a,b,r2,r2new,err,denom,maxel;
	int i,iter=nRows*3;
	minlen*=minlen;

	ftype buf[40],*pbuf = nRows>8 ? new ftype[nRows*5]:buf;
	vectorn_tpl<ftype> x(nRows,startx),rh(nRows,rightside),r(nRows,pbuf),rc(nRows,pbuf+nRows),p(nRows,pbuf+nRows*2),
		Ap(nRows,pbuf+nRows*4);
	(r=rh)-=*this*x;
	rc=*this*r;
	p=r;
	r2 = rc*r;
	do {
		Ap = *this*p;	denom = Ap.len2();
		if (sqr(denom)<1E-30) break;
		a = r2/denom;
#if defined(LINUX)
		r = ::operator-=(r, (const vectorn_tpl<ftype>&)(Ap*a));
#else
		r -= Ap*a;
#endif
		rc = *this*r;
#if defined(LINUX)
		x = ::operator+=(x, (const vectorn_tpl<ftype>&)(p*a));
#else
		x += p*a;
#endif
		r2new = rc*r;
		b = r2new/r2;	r2 = r2new;
		(p*=b)+=r;
		for(err=maxel=0,i=0;i<nRows;i++) {
			err += sqr(r.data[i]);
			maxel = max(maxel,fabs_tpl(r[i]));
		}
	} while(--iter && (err>minlen || maxel>minel) && sqr(r2)>1E-30);

	if (pbuf!=buf) delete pbuf;
	return nRows-iter;
}

template<class ftype>
int matrix_tpl<ftype>::LUdecomposition(ftype *&LUdata,int *&LUidx) const
{
	if (nRows!=nCols) return 0;
	int i,j,k,imax,alloc=(LUdata==0 | (LUidx==0)<<1);
	ftype aij,bij,maxaij,t;
	if (alloc & 1) LUdata = new ftype[nRows*nRows];
	if (alloc & 2) LUidx = new int[nRows];
	matrix_tpl<ftype> LU(nRows,nRows,0,LUdata);
	LU = *this;

	for(j=0;j<nRows;j++) {
		for(i=0;i<=j;i++) {
			for(k=0,bij=LU[i][j]; k<i; k++) bij -= LU[i][k]*LU[k][j];
			LU[i][j] = bij;
		}
		for(maxaij=0,imax=j ;i<nRows;i++) {
			for(k=0,aij=LU[i][j]; k<j; k++) aij -= LU[i][k]*LU[k][j];
			LU[i][j] = aij;
			if (aij*aij > maxaij*maxaij)
			{ maxaij=aij; imax=i; }
		}
		LUidx[j] = imax;
		if (j==nRows-1 && LU[j][j]!=0) break; // no aij in this case
		if (maxaij==0) { // the matrix is singular
			if (alloc & 1) delete[] LUdata;
			if (alloc & 2) delete[] LUidx;
			return 0;
		}
		if (imax!=j) for(k=0;k<nRows;k++)
		{ t=LU[imax][k]; LU[imax][k]=LU[j][k]; LU[j][k]=t; }
		maxaij = (ftype)1.0/maxaij;
		for(i=j+1;i<nRows;i++)
			LU[i][j] *= maxaij;
	}
	return 1;
}

template<class ftype>
int matrix_tpl<ftype>::solveAx_b(ftype *x,ftype *b, ftype *LUdata,int *LUidx) const
{
	int LUidx_buf[16],alloc=0;
	if (!LUdata) {
		if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
			if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
				mtx_pool_pos = 0;
			LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
		}
		if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
		alloc = LUdata==0 | (LUidx==0)<<1;
		if (!LUdecomposition(LUdata,LUidx)) return 0;
	}

	int i,j; ftype xi;
	matrix_tpl<ftype> LU(nRows,nRows,0,LUdata);
	for(i=0;i<nRows;i++) x[i]=b[i];
	for(i=0;i<nRows;i++) {
		xi=x[i]; x[i]=x[LUidx[i]]; x[LUidx[i]]=xi;
		for(j=0;j<i;j++) x[i]-=LU[i][j]*x[j];
	}
	for(i=nRows-1;i>=0;i--) {
		for(j=i+1;j<nRows;j++) x[i]-=LU[i][j]*x[j];
		x[i] /= LU[i][i];
	}

	if (alloc & 1) delete[] LUdata;
	if (alloc & 2) delete[] LUidx;
	return 1;
}

template<class ftype>
matrix_tpl<ftype>& matrix_tpl<ftype>::invert()
{
	if (flags & mtx_orthogonal)
		return transpose();
	if (nRows!=nCols)
		return *this;

	int i,j; ftype det=0;
	if (nRows==1)
		data[0]=(ftype)1.0/data[0];
	else if (nRows==2) {
		det = data[0]*data[3]-data[1]*data[2];
		if (det==0) return *this;	det = (ftype)1.0/det;
		ftype oldata[4]; for(i=0;i<4;i++) oldata[i]=data[i];
		data[0]=oldata[3]*det; data[1]=-oldata[1]*det;
		data[2]=-oldata[2]*det; data[3]=oldata[0]*det;
	} else if (nRows==3) {
		for(i=0;i<3;i++) det+=data[i]*data[inc_mod3[i]+3]*data[dec_mod3[i]+6];
		for(i=0;i<3;i++) det-=data[dec_mod3[i]]*data[inc_mod3[i]+3]*data[i+6];
		if (det==0) return *this;	det = (ftype)1.0/det;
		ftype oldata[9]; for(i=0;i<9;i++) oldata[i]=data[i];
		for(i=0;i<3;i++) for(j=0;j<3;j++)
			data[i+j*3] = (oldata[dec_mod3[i]*3+dec_mod3[j]]*oldata[inc_mod3[i]*3+inc_mod3[j]]-
										 oldata[dec_mod3[i]*3+inc_mod3[j]]*oldata[inc_mod3[i]*3+dec_mod3[j]])*det;
	} else {
		ftype *LUdata=0,*colmark; int *LUidx=0,LUidx_buf[32], alloc=0;
		if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
			if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
				mtx_pool_pos = 0;
			LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
		} else alloc = 1;
		if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
		else alloc |= 2;
		if (!LUdecomposition(LUdata,LUidx)) return *this;

		if (nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
			if (mtx_pool_pos+nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
				mtx_pool_pos=0;
			colmark=mtx_pool+mtx_pool_pos; mtx_pool_pos+=nRows;
		} else {
			colmark = new ftype[nRows]; alloc |= 4;
		}

		for(i=0;i<nRows;i++) colmark[i]=0;
		for(i=0;i<nRows;i++) {
			colmark[i]=1;	solveAx_b(data+i*nRows,colmark, LUdata,LUidx); colmark[i]=0;
		}
		transpose();

		if (alloc & 1) delete[] LUdata;
		if (alloc & 2) delete[] LUidx;
		if (alloc & 4) delete[] colmark;
	}
	flags = flags & (mtx_normal | mtx_orthogonal | mtx_symmetric | mtx_PD | mtx_PSD | mtx_diagonal | mtx_identity | mtx_foreign_data);
	return *this;
}

template<class ftype>
ftype matrix_tpl<ftype>::determinant(ftype *LUdata,int *LUidx) const
{
	if (nRows!=nCols) return 0;
	int i,j; ftype det=0;
	if (nRows==1)
		det = data[0];
	else if (nRows==2)
		det = data[0]*data[3]-data[1]*data[2];
	else if (nRows==3) {
		for(i=0;i<3;i++) det+=data[i]*data[inc_mod3[i]+3]*data[dec_mod3[i]+6];
		for(i=0;i<3;i++) det-=data[dec_mod3[i]]*data[inc_mod3[i]+3]*data[i+6];
	} else if (LUdecomposition(LUdata,LUidx)) {
		ftype *LUdata=0; int *LUidx=0,LUidx_buf[32], alloc=0;
		if (nRows*nRows*2<sizeof(mtx_pool)/sizeof(mtx_pool[0])) {
			if (mtx_pool_pos+nRows*nRows > sizeof(mtx_pool)/sizeof(mtx_pool[0]))
				mtx_pool_pos = 0;
			LUdata = mtx_pool+mtx_pool_pos; mtx_pool_pos += nRows*nRows;
		} else alloc = 1;
		if (nRows<=sizeof(LUidx_buf)/sizeof(LUidx_buf[0])) LUidx = LUidx_buf;
		else alloc |= 2;
		if (!LUdecomposition(LUdata,LUidx)) return 0;

		for(i=j=0,det=1;i<nRows;i++,j+=nRows+1) det*=LUdata[j];
		for(i=j=0;i<nRows;i++) j+=LUidx[i]!=i;
		if (j&2) det*=-1;

		if (alloc & 1) delete[] LUdata;
		if (alloc & 2) delete[] LUidx;
	}
	return det;
}

// Linear programming with simplex method

template<class ftype>
int matrix_tpl<ftype>::LPsimplex(int m1,int m2, ftype &objfunout,ftype *xout, int nvars, ftype e) const
{
	int M=nRows-2,N=nCols-1,i,j,imax,jmax,iobjfun=M, res=0,iter=(M+N)*8;
	ftype rpivot,t;
	const int imask=0x7FFFFFFF;
	int *irow=new int[M],*icol=new int[N];
	#define a (*this)

	if (e<0) {
		for(i=nRows*nCols-1,e=0;i>=0;i--)	e+=data[i];
		e *= (ftype)1E-6/(nRows*nCols);
	}

	if (nvars<0) nvars = N+m1+m2;
	if (m1<M) {
		iobjfun++;
		for(j=0;j<N;j++) {
			for(t=0,i=0;i<M;i++) t-=a[i][j];
			a[iobjfun][j] = t;
		}
	}
	for(i=0;i<N;i++) icol[i]=i;
	for(i=0;i<M;i++) irow[i]=N+i|~imask;

	do {
		for(t=0,j=0,jmax=-1;j<N;j++) if(icol[j]<nvars && a[iobjfun][j]>t)
		{	jmax=j; t=a[iobjfun][j]; }
		if (jmax<0)
		{ res=1; break;	}
		for(imax=0; imax<M && a[imax][jmax]>0; imax++);
		if (imax==M)
		{ res=2; break; }
		for(i=imax+1;i<M;i++) if (a[i][jmax]<0 && a[i][N]*a[imax][jmax] < a[imax][N]*a[i][jmax])
			imax=i;
		varexchange:
		rpivot = (ftype)1.0/a[imax][jmax];
		for(j=0;j<nCols;j++) a[imax][j]*=-rpivot;
		a[imax][jmax] = rpivot;
		for(i=0;i<nRows;i++) if(i!=imax) {
			a[i][jmax] *= rpivot;
			for(j=0;j<nCols;j++) if(j!=jmax)
				a[i][j] -= a[imax][j]*a[i][jmax];
		}

		if ((irow[imax]&imask)>=N+m1 && (irow[imax]&imask)<N+m1+m2 && (irow[imax]&~imask)!=0) {
			a[iobjfun][jmax] += 1; // remove slack variable from objective function, nonnegativity constraint will enforce feasibility
			for(i=0;i<=M;i++) a[i][jmax]*=-1;
		}

		i=irow[imax]&imask; irow[imax]=icol[jmax]; icol[jmax]=i;
	} while(--iter);

	if (m1<M) {
		if (res==2 || sqr(a[M][N])<e*e)
			res = 0;
		else {
			for(i=0;i<M;i++) {
				if ((irow[i]&imask)>M+m1+m2) {
					for(jmax=0;jmax<N && icol[jmax]>=nvars;jmax++);
					imax=i; goto varexchange;
				} else if ((irow[i]&imask)>N+m1 && (irow[i]&~imask)!=0) {
					for(j=0;j<N;j++) a[i][j]*=-1;
					irow[i]&=imask;
				}
			}
			res = LPsimplex(M,0, objfunout,xout, nvars, e);
		}
	} else {
		if (xout) {
			for(i=0;i<N;i++) xout[i]=0;
			for(i=0;i<M;i++) if((irow[i]&imask)<N)
				xout[irow[i]&imask] = a[i][N];
		}
		objfunout = a[iobjfun][N];
	}
	delete[] irow; delete[] icol;
	return res;

	#undef a
}

#ifdef PIII_SSE
// TODO: AMD64: put into a separate file

// SSE-optimized code from "Streaming SIMD Extensions - Matrix Multiplication" document by Intel Corp.
__declspec(naked) void PIII_Mult00_6x6_6x6(float *src1, float *src2, float *dst)
{
	__asm {
	mov ecx, dword ptr [esp+8] // src2
	movlps xmm3, qword ptr [ecx+72]
	mov edx, dword ptr [esp+4] // src1
	// Loading first 4 columns (upper 4 rows) of src2.
	movaps xmm0, xmmword ptr [ecx]
	movlps xmm1, qword ptr [ecx+24]
	movhps xmm1, qword ptr [ecx+32]
	movaps xmm2, xmmword ptr [ecx+48]
	movhps xmm3, qword ptr [ecx+80]
	// Calculating first 4 elements in the first row of the destination matrix.
	movss xmm4, dword ptr [edx]
	movss xmm5, dword ptr [edx+4]
	mov eax, dword ptr [esp+0Ch] // dst
	shufps xmm4, xmm4, 0
	movss xmm6, dword ptr [edx+8]
	shufps xmm5, xmm5, 0
	movss xmm7, dword ptr [edx+12]
	mulps xmm4, xmm0
	shufps xmm6, xmm6, 0
	shufps xmm7, xmm7, 0
	mulps xmm5, xmm1
	mulps xmm6, xmm2
	addps xmm5, xmm4
	mulps xmm7, xmm3
	addps xmm6, xmm5
	addps xmm7, xmm6
	movaps xmmword ptr [eax], xmm7
	// Calculating first 4 elements in the second row of the destination matrix.
	movss xmm4, dword ptr [edx+24]
	shufps xmm4, xmm4, 0
	mulps xmm4, xmm0
	movss xmm5, dword ptr [edx+28]
	shufps xmm5, xmm5, 0
	mulps xmm5, xmm1
	movss xmm6, dword ptr [edx+32]
	shufps xmm6, xmm6, 0
	movss xmm7, dword ptr [edx+36]
	shufps xmm7, xmm7, 0
	mulps xmm6, xmm2
	mulps xmm7, xmm3
	addps xmm7, xmm6
	addps xmm5, xmm4
	addps xmm7, xmm5
	// Calculating first 4 elements in the third row of the destination matrix.
	movss xmm4, dword ptr [edx+48]
	movss xmm5, dword ptr [edx+52]
	movlps qword ptr [eax+24], xmm7 // save 2nd
	movhps qword ptr [eax+32], xmm7 // row
	movss xmm6, dword ptr [edx+56]
	movss xmm7, dword ptr [edx+60]
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	shufps xmm6, xmm6, 0
	shufps xmm7, xmm7, 0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	mulps xmm6, xmm2
	mulps xmm7, xmm3
	addps xmm5, xmm4

	addps xmm7, xmm6
	addps xmm7, xmm5
	movaps xmmword ptr [eax+48], xmm7
	// Calculating first 4 elements in the fourth row of the destination matrix.
	movss xmm4, dword ptr [edx+72]
	movss xmm5, dword ptr [edx+76]
	movss xmm6, dword ptr [edx+80]
	movss xmm7, dword ptr [edx+84]
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	shufps xmm6, xmm6, 0
	shufps xmm7, xmm7, 0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	mulps xmm6, xmm2
	mulps xmm7, xmm3
	addps xmm4, xmm5
	addps xmm6, xmm4
	addps xmm7, xmm6
	movlps qword ptr [eax+72], xmm7
	movhps qword ptr [eax+80], xmm7
	// Calculating first 4 elements in the fifth row of the destination matrix.
	movss xmm4, dword ptr [edx+96]
	movss xmm5, dword ptr [edx+100]
	movss xmm6, dword ptr [edx+104]
	movss xmm7, dword ptr [edx+108]
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	shufps xmm6, xmm6, 0
	shufps xmm7, xmm7, 0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	mulps xmm6, xmm2
	mulps xmm7, xmm3
	addps xmm5, xmm4
	addps xmm7, xmm6
	addps xmm7, xmm5
	movaps xmmword ptr [eax+96], xmm7
	// Calculating first 4 elements in the sixth row of the destination matrix.
	movss xmm4, dword ptr [edx+120]
	movss xmm5, dword ptr [edx+124]
	movss xmm6, dword ptr [edx+128]
	movss xmm7, dword ptr [edx+132]
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	shufps xmm6, xmm6, 0
	shufps xmm7, xmm7, 0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	mulps xmm6, xmm2
	mulps xmm7, xmm3
	addps xmm4, xmm5
	addps xmm6, xmm4
	addps xmm7, xmm6
	movhps qword ptr [eax+128], xmm7
	movlps qword ptr [eax+120], xmm7
	// Loading first 4 columns (lower 2 rows) of src2.
	movlps xmm0, qword ptr [ecx+96]
	movhps xmm0, qword ptr [ecx+104]
	movlps xmm1, qword ptr [ecx+120]
	movhps xmm1, qword ptr [ecx+128]
	// Calculating first 4 elements in the first row of the destination matrix.
	movss xmm2, dword ptr [edx+16]
	shufps xmm2, xmm2, 0
	movss xmm4, dword ptr [edx+40]
	movss xmm3, dword ptr [edx+20]
	movss xmm5, dword ptr [edx+44]
	movaps xmm6, xmmword ptr [eax]
	movlps xmm7, qword ptr [eax+24]
	shufps xmm3, xmm3, 0
	shufps xmm5, xmm5, 0
	movhps xmm7, qword ptr [eax+32]
	shufps xmm4, xmm4, 0
	mulps xmm5, xmm1
	mulps xmm2, xmm0
	mulps xmm3, xmm1
	mulps xmm4, xmm0
	addps xmm6, xmm2
	addps xmm7, xmm4
	addps xmm7, xmm5
	addps xmm6, xmm3
	movlps qword ptr [eax+24], xmm7
	movaps xmmword ptr [eax], xmm6
	movhps qword ptr [eax+32], xmm7
	// Calculating first 4 elements in the third row of the destination matrix.
	movss xmm2, dword ptr [edx+64]
	movss xmm4, dword ptr [edx+88]
	movss xmm5, dword ptr [edx+92]
	movss xmm3, dword ptr [edx+68]
	movaps xmm6, xmmword ptr [eax+48]
	movlps xmm7, qword ptr [eax+72]
	movhps xmm7, qword ptr [eax+80]
	shufps xmm2, xmm2, 0
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	shufps xmm3, xmm3, 0
	mulps xmm2, xmm0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	mulps xmm3, xmm1
	addps xmm6, xmm2
	addps xmm6, xmm3
	addps xmm7, xmm4
	addps xmm7, xmm5
	movlps qword ptr [eax+72], xmm7
	movaps xmmword ptr [eax+48], xmm6
	movhps qword ptr [eax+80], xmm7
	// Calculating first 4 elements in the fifth row of the destination matrix.
	movss xmm2, dword ptr [edx+112]
	movss xmm3, dword ptr [edx+116]
	movaps xmm6, xmmword ptr [eax+96]
	shufps xmm2, xmm2, 0
	shufps xmm3, xmm3, 0
	mulps xmm2, xmm0
	mulps xmm3, xmm1
	addps xmm6, xmm2
	addps xmm6, xmm3
	movaps xmmword ptr [eax+96], xmm6
	// Calculating first 4 elements in the sixth row of the destination matrix.
	movss xmm4, dword ptr [edx+136]
	movss xmm5, dword ptr [edx+140]
	movhps xmm7, qword ptr [eax+128]
	movlps xmm7, qword ptr [eax+120]
	shufps xmm4, xmm4, 0
	shufps xmm5, xmm5, 0
	mulps xmm4, xmm0
	mulps xmm5, xmm1
	addps xmm7, xmm4
	addps xmm7, xmm5
	// Calculating last 2 columns of the destination matrix.
	movlps xmm0, qword ptr [ecx+16]
	movhps xmm0, qword ptr [ecx+40]
	movhps qword ptr [eax+128], xmm7
	movlps qword ptr [eax+120], xmm7
	movlps xmm2, qword ptr [ecx+64]
	movhps xmm2, qword ptr [ecx+88]
	movaps xmm3, xmm2
	shufps xmm3, xmm3, 4Eh
	movlps xmm4, qword ptr [ecx+112]
	movhps xmm4, qword ptr [ecx+136]
	movaps xmm5, xmm4
	shufps xmm5, xmm5, 4Eh
	movlps xmm6, qword ptr [edx]
	movhps xmm6, qword ptr [edx+24]
	movaps xmm7, xmm6
	shufps xmm7, xmm7, 0F0h
	mulps xmm7, xmm0
	shufps xmm6, xmm6, 0A5h
	movaps xmm1, xmm0
	shufps xmm1, xmm1, 4Eh
	mulps xmm1, xmm6
	addps xmm7, xmm1
	movlps xmm6, qword ptr [edx+8]
	movhps xmm6, qword ptr [edx+32]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm2
	mulps xmm6, xmm3
	addps xmm7, xmm1
	addps xmm7, xmm6
	movhps xmm6, qword ptr [edx+40]
	movlps xmm6, qword ptr [edx+16]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm4
	mulps xmm6, xmm5
	addps xmm7, xmm1
	addps xmm7, xmm6
	movlps qword ptr [eax+16], xmm7
	movhps qword ptr [eax+40], xmm7
	movlps xmm6, qword ptr [edx+48]
	movhps xmm6, qword ptr [edx+72]
	movaps xmm7, xmm6
	shufps xmm7, xmm7, 0F0h
	mulps xmm7, xmm0
	shufps xmm6, xmm6, 0A5h
	movaps xmm1, xmm0
	shufps xmm1, xmm1, 4Eh
	mulps xmm1, xmm6
	addps xmm7, xmm1
	movhps xmm6, qword ptr [edx+80]
	movlps xmm6, qword ptr [edx+56]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm2
	mulps xmm6, xmm3
	addps xmm7, xmm1
	addps xmm7, xmm6
	movlps xmm6, qword ptr [edx+64]
	movhps xmm6, qword ptr [edx+88]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm4
	mulps xmm6, xmm5
	addps xmm7, xmm1
	addps xmm7, xmm6
	movlps qword ptr [eax+64], xmm7
	movhps qword ptr [eax+88], xmm7
	movlps xmm6, qword ptr [edx+96]
	movhps xmm6, qword ptr [edx+120]
	movaps xmm7, xmm6
	shufps xmm7, xmm7, 0F0h
	mulps xmm7, xmm0
	shufps xmm6, xmm6, 0A5h
	movaps xmm1, xmm0
	shufps xmm1, xmm1, 4Eh
	mulps xmm1, xmm6
	addps xmm7, xmm1
	movlps xmm6, qword ptr [edx+104]
	movhps xmm6, qword ptr [edx+128]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm2
	mulps xmm6, xmm3
	addps xmm7, xmm1
	addps xmm7, xmm6
	movlps xmm6, qword ptr [edx+112]
	movhps xmm6, qword ptr [edx+136]
	movaps xmm1, xmm6
	shufps xmm1, xmm1, 0F0h
	shufps xmm6, xmm6, 0A5h
	mulps xmm1, xmm4
	mulps xmm6, xmm5
	addps xmm7, xmm1
	addps xmm7, xmm6
	movlps qword ptr [eax+112], xmm7
	movhps qword ptr [eax+136], xmm7
	ret }
}
#endif

void ForceCompilerToGenerateCodeForMatrixTemplates()
{
	matrixf mtxf; float f,*pf; int *pi;
	mtxf.jacobi_transformation(mtxf,0);
	mtxf.conjugate_gradient(0,0);
	mtxf.biconjugate_gradient(0,0);
	mtxf.minimum_residual(0,0);
	mtxf.LPsimplex(0,0,f);
	mtxf.LUdecomposition(pf,pi);
	mtxf.solveAx_b(pf,pf);
	mtxf.determinant();
	mtxf.invert();

	matrix mtxr; real r,*pr;
	mtxr.jacobi_transformation(mtxr,0);
	mtxr.conjugate_gradient(0,0);
	mtxr.biconjugate_gradient(0,0);
	mtxr.minimum_residual(0,0);
	mtxr.LPsimplex(0,0,r);
	mtxr.LUdecomposition(pr,pi);
	mtxr.solveAx_b(pr,pr);
	mtxr.determinant();
	mtxr.invert();
}