//////////////////////////////////////////////////////////////////////
//
//	Crytek Common Source code
//
//	File: Cry_Math.h
//	Description: Misc mathematical functions
//
//	History:
//	- Jan 31, 2001: Created by Marco Corbetta
//	- Jan 04, 2003: SSE and 3DNow! optimizations by Andrey Honich
//
//////////////////////////////////////////////////////////////////////

#ifndef CRY_SIMD_H
#define CRY_SIMD_H

#if _MSC_VER > 1000
# pragma once
#endif

#include "TArray.h"
#include "platform.h"
//========================================================================================

// CPU feature bits (CPUF_SSE, CPUF_MMX, ...), detected at startup.
extern int g_CpuFlags;

inline float AngleMod(float a)
{
	// Wrap into [0..360) by snapping to the 65536-step fixed-point circle.
	a = (float)((360.0/65536) * ((int)(a*(65536/360.0)) & 65535));
	return a;
}

// Pack an angle in degrees into an unsigned 16-bit value (full turn = 65536).
inline unsigned short Degr2Word(float f)
{
	return (unsigned short)(AngleMod(f)/360.0f*65536.0f);
}

// Unpack a 16-bit angle back into degrees.
inline float Word2Degr(unsigned short s)
{
	return (float)s / 65536.0f * 360.0f;
}

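// A quick usage sketch (not part of the original header): packing survives
// wrap-around, and the round-trip error is at most 360/65536 of a degree.
inline float ExampleAngleRoundTrip()
{
	unsigned short w = Degr2Word(450.0f); // wraps to 90 degrees, packs as 16384
	return Word2Degr(w);                  // ~90.0f again
}
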
/*****************************************************
MISC FUNCTIONS
*****************************************************/

//////////////////////////////////////
#if defined(_CPU_X86)
// Clear the IEEE-754 sign bit in place instead of calling fabsf().
ILINE float __fastcall Ffabs(float f) {
	*((unsigned *) & f) &= ~0x80000000;
	return (f);
}
#else
inline float Ffabs(float x) { return fabsf(x); }
#endif

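// For reference, a strict-aliasing-safe variant of the same sign-bit trick,
// written as a sketch (the engine code above relies on the pointer cast):
inline float FfabsPortable(float x)
{
	union { float f; unsigned u; } v;
	v.f = x;
	v.u &= ~0x80000000; // clear the IEEE-754 sign bit
	return v.f;
}
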
//////////////////////////////////////////////////////////////////////////
#if defined(_CPU_X86) && !defined(LINUX)
// Fast float->int for non-negative values: fistp uses the FPU's
// round-to-nearest mode, so 0.5 is subtracted first to approximate truncation.
inline int fastftol_positive(float f)
{
	int i;
	f -= 0.5f;
	__asm fld [f]
	__asm fistp [i]
	return i;
}
#else
inline int fastftol_positive (float x) { return (int)x; }
#endif

#if defined(_CPU_X86) && !defined(LINUX)
// Float->int conversion using the FPU's current rounding mode
// (round-to-nearest by default), avoiding the cost of a C-style cast.
ILINE int __fastcall FtoI(float x)
{
	int t;
	__asm
	{
		fld x
		fistp t
	}
	return t;
}
#else
inline int FtoI(float x) { return (int)x; }
#endif

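// For reference: on compilers with C99 <math.h>, lrintf() performs the same
// convert-with-current-rounding-mode operation as the fistp sequence above:
//
//   int FtoI_c99(float x) { return (int)lrintf(x); }
//
// (Sketch only; lrintf is not assumed to exist on this codebase's compilers.)
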
//////////////////////////////////////
#define rnd() (((float)rand())/RAND_MAX) // Floating point random number generator (0 -> 1)

//////////////////////////////////////
// 4x4 matrix multiply for flat arrays: dst = a * b (column-major storage).
inline void multMatrices(double dst[16], const double a[16], const double b[16])
{
	int i, j;

	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++) {
			dst[i * 4 + j] =
				b[i * 4 + 0] * a[0 * 4 + j] +
				b[i * 4 + 1] * a[1 * 4 + j] +
				b[i * 4 + 2] * a[2 * 4 + j] +
				b[i * 4 + 3] * a[3 * 4 + j];
		}
	}
}

//////////////////////////////////////
// Single-precision version of the multiply above.
inline void multMatrices(float dst[16], const float a[16], const float b[16])
{
	int i, j;

	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++) {
			dst[i * 4 + j] =
				b[i * 4 + 0] * a[0 * 4 + j] +
				b[i * 4 + 1] * a[1 * 4 + j] +
				b[i * 4 + 2] * a[2 * 4 + j] +
				b[i * 4 + 3] * a[3 * 4 + j];
		}
	}
}

//////////////////////////////////////
// Transform vector: rotate by the upper 3x3 of b and add b's translation row.
inline void matmult_trans_only(float a[3], float b[4][4], float result[3])
{
	result[0] = a[0] * b[0][0] + a[1] * b[1][0] + a[2] * b[2][0] + b[3][0];
	result[1] = a[0] * b[0][1] + a[1] * b[1][1] + a[2] * b[2][1] + b[3][1];
	result[2] = a[0] * b[0][2] + a[1] * b[1][2] + a[2] * b[2][2] + b[3][2];
}

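// Usage sketch for matmult_trans_only (illustrative values): b is row-major
// with the translation in its fourth row, so a pure translation gives
//
//   float pt[3] = { 1, 2, 3 }, out[3];
//   float m[4][4] = { {1,0,0,0}, {0,1,0,0}, {0,0,1,0}, {10,20,30,1} };
//   matmult_trans_only(pt, m, out); // out = { 11, 22, 33 }
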
|
|
/*
|
|
======================================
|
|
Matrix 4x4
|
|
======================================
|
|
*/
|
|
|
|
|
|
inline void multMatrixf(float *product, const float *m1, const float *m2)
|
|
{
|
|
#define A(row,col) m1[(col<<2)+row]
|
|
#define B(row,col) m2[(col<<2)+row]
|
|
#define P(row,col) product[(col<<2)+row]
|
|
|
|
int i;
|
|
for (i=0; i<4; i++)
|
|
{
|
|
float ai0=A(i,0), ai1=A(i,1), ai2=A(i,2), ai3=A(i,3);
|
|
P(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
|
|
P(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
|
|
P(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
|
|
P(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
|
|
}
|
|
|
|
#undef A
|
|
#undef B
|
|
#undef P
|
|
}
|
|
|
|
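// Note: the A/B/P macros above address matrices column-major (element
// (row,col) at index col*4+row, OpenGL-style); expanding the indices shows
// multMatrixf computes the same array as multMatrices() above. A small
// verification sketch under that reading (helper name is illustrative):
inline bool ExampleMultMatrixAgree(const float a[16], const float b[16])
{
	float p1[16], p2[16];
	multMatrices(p1, a, b);
	multMatrixf(p2, a, b);
	for (int i=0; i<16; i++)
		if (Ffabs(p1[i]-p2[i]) > 1e-5f * (1.0f + Ffabs(p1[i])))
			return false;
	return true;
}
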
#if !defined(GAMECUBE) && !defined(PS2)

//==========================================================================================
// 3DNow! optimizations

#pragma warning(push)
#pragma warning(disable:4731) // frame pointer register 'ebp' modified by inline assembly code

// Two packed floats, used as 64-bit constant operands by the 3DNow! code below.
struct SConst3DN
{
	float m_fVal0;
	float m_fVal1;
};
const SConst3DN const3DN_1_N1 = {1.0f, -1.0f};
const SConst3DN const3DN_0_1  = {0.0f, 1.0f};

/*!
	Compute inverse of 4x4 transformation SINGLE-PRECISION matrix.
	Code lifted from Brian Paul's Mesa freeware OpenGL implementation.
	Return true for success, false for failure (singular matrix)
*/
inline bool QQinvertMatrixf(float *out, const float *m)
{
#define SWAP_ROWS(a, b) { float *_tmp = a; (a)=(b); (b)=_tmp; }
#define MAT(m,r,c) (m)[(c)*4+(r)]

	float wtmp[4][8];
	float m0, m1, m2, m3, s;
	float *r0, *r1, *r2, *r3;

	r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3];

	/* build the augmented rows [M | I] for Gauss-Jordan elimination */
	r0[0] = MAT(m,0,0), r0[1] = MAT(m,0,1),
	r0[2] = MAT(m,0,2), r0[3] = MAT(m,0,3),
	r0[4] = 1.0f, r0[5] = r0[6] = r0[7] = 0.0f,

	r1[0] = MAT(m,1,0), r1[1] = MAT(m,1,1),
	r1[2] = MAT(m,1,2), r1[3] = MAT(m,1,3),
	r1[5] = 1.0f, r1[4] = r1[6] = r1[7] = 0.0f,

	r2[0] = MAT(m,2,0), r2[1] = MAT(m,2,1),
	r2[2] = MAT(m,2,2), r2[3] = MAT(m,2,3),
	r2[6] = 1.0f, r2[4] = r2[5] = r2[7] = 0.0f,

	r3[0] = MAT(m,3,0), r3[1] = MAT(m,3,1),
	r3[2] = MAT(m,3,2), r3[3] = MAT(m,3,3),
	r3[7] = 1.0f, r3[4] = r3[5] = r3[6] = 0.0f;

	/* choose pivot - or die */
	if (fabs(r3[0])>fabs(r2[0])) SWAP_ROWS(r3, r2);
	if (fabs(r2[0])>fabs(r1[0])) SWAP_ROWS(r2, r1);
	if (fabs(r1[0])>fabs(r0[0])) SWAP_ROWS(r1, r0);
	if (0.0 == r0[0])
		return false;

	/* eliminate first variable */
	m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0];
	s = r0[1]; r1[1] -= m1 * s; r2[1] -= m2 * s; r3[1] -= m3 * s;
	s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s;
	s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s;
	s = r0[4];
	if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
	s = r0[5];
	if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
	s = r0[6];
	if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
	s = r0[7];
	if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }

	/* choose pivot - or die */
	if (fabs(r3[1])>fabs(r2[1])) SWAP_ROWS(r3, r2);
	if (fabs(r2[1])>fabs(r1[1])) SWAP_ROWS(r2, r1);
	if (0.0 == r1[1])
		return false;

	/* eliminate second variable */
	m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1];
	r2[2] -= m2 * r1[2]; r3[2] -= m3 * r1[2];
	r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3];
	s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
	s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
	s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
	s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }

	/* choose pivot - or die */
	if (fabs(r3[2])>fabs(r2[2])) SWAP_ROWS(r3, r2);
	if (0.0 == r2[2])
		return false;

	/* eliminate third variable */
	m3 = r3[2]/r2[2];
	r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4],
	r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6],
	r3[7] -= m3 * r2[7];

	/* last check */
	if (0.0 == r3[3])
		return false;

	s = 1.0f/r3[3]; /* now back substitute row 3 */
	r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s;

	m2 = r2[3]; /* now back substitute row 2 */
	s = 1.0f/r2[2];
	r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2),
	r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2);
	m1 = r1[3];
	r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1,
	r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1;
	m0 = r0[3];
	r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0,
	r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0;

	m1 = r1[2]; /* now back substitute row 1 */
	s = 1.0f/r1[1];
	r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1),
	r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1);
	m0 = r0[2];
	r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0,
	r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0;

	m0 = r0[1]; /* now back substitute row 0 */
	s = 1.0f/r0[0];
	r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0),
	r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0);

	MAT(out,0,0) = r0[4]; MAT(out,0,1) = r0[5],
	MAT(out,0,2) = r0[6]; MAT(out,0,3) = r0[7],
	MAT(out,1,0) = r1[4]; MAT(out,1,1) = r1[5],
	MAT(out,1,2) = r1[6]; MAT(out,1,3) = r1[7],
	MAT(out,2,0) = r2[4]; MAT(out,2,1) = r2[5],
	MAT(out,2,2) = r2[6]; MAT(out,2,3) = r2[7],
	MAT(out,3,0) = r3[4]; MAT(out,3,1) = r3[5],
	MAT(out,3,2) = r3[6]; MAT(out,3,3) = r3[7];

	return true;

#undef MAT
#undef SWAP_ROWS
}

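// Usage sketch: invert a world transform and verify by multiplying back
// (illustrative only; any near-identity test would do):
//
//   float inv[16], chk[16];
//   if (QQinvertMatrixf(inv, world))
//       multMatrixf(chk, world, inv); // chk is ~identity on success
//
// A false return means the matrix was singular and `inv` is untouched.
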
// Fast approximate 1/sqrt(x): the 0x5f3759df magic-constant bit hack
// followed by one Newton-Raphson refinement step.
inline float cryISqrtf(float fVal)
{
	unsigned int *n1 = (unsigned int *)&fVal;
	unsigned int n = 0x5f3759df - (*n1 >> 1);
	float *n2 = (float *)&n;
	fVal = (1.5f - (fVal * 0.5f) * *n2 * *n2) * *n2;
	return fVal;
}

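// The single refinement step above is accurate to roughly 0.2%. Where more
// precision is needed, a second step can be added; a minimal sketch (the
// name cryISqrtf2 is illustrative, not part of the original API):
inline float cryISqrtf2(float fVal)
{
	float y = cryISqrtf(fVal);
	y = y * (1.5f - (fVal * 0.5f) * y * y); // one extra Newton-Raphson iteration
	return y;
}
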
#if defined _CPU_X86 && !defined(LINUX)
// ***************************************************************************
// Warm the caches by streaming nbytes from src into the MMX registers,
// 64 bytes per pass, with non-temporal prefetch running ahead of the reads.
inline void cryPrecacheSSE(const void *src, int nbytes)
{
	_asm
	{
		mov esi, src
		mov ecx, nbytes
		// 64 bytes per pass
		shr ecx, 6
		jz endLabel

loopMemToL1:
		prefetchnta 64[ESI] // Prefetch next loop, non-temporal
		prefetchnta 96[ESI]

		movq mm1, 0[ESI] // Read in source data
		movq mm2, 8[ESI]
		movq mm3, 16[ESI]
		movq mm4, 24[ESI]
		movq mm5, 32[ESI]
		movq mm6, 40[ESI]
		movq mm7, 48[ESI]
		movq mm0, 56[ESI]

		add esi, 64
		dec ecx
		jnz loopMemToL1

		emms

endLabel:
	}
}
// ***************************************************************************
// Same cache warm-up without the prefetch instructions (for MMX-only CPUs).
inline void cryPrecacheMMX(const void *src, int nbytes)
{
	_asm
	{
		mov esi, src
		mov ecx, nbytes
		// 64 bytes per pass
		shr ecx, 6
		jz endLabel

loopMemToL1:
		movq mm1, 0[ESI] // Read in source data
		movq mm2, 8[ESI]
		movq mm3, 16[ESI]
		movq mm4, 24[ESI]
		movq mm5, 32[ESI]
		movq mm6, 40[ESI]
		movq mm7, 48[ESI]
		movq mm0, 56[ESI]

		add esi, 64
		dec ecx
		jnz loopMemToL1

		emms

endLabel:
	}
}

#endif

// Prefetch one cache line at src with the non-temporal hint.
inline void cryPrefetchNTSSE(const void *src)
{
#if defined(WIN32) && !defined(WIN64) && !defined(LINUX)
	_asm
	{
		mov esi, src
		prefetchnta [ESI] // Prefetch non-temporal
	}
#endif
}

// Prefetch one cache line at src into all cache levels (T0 hint).
ILINE void cryPrefetchT0SSE(const void *src)
{
#if defined(WIN32) && !defined(WIN64)
	_asm
	{
		mov esi, src
		prefetcht0 [ESI] // Prefetch
	}
#endif

#if defined(WIN64) && defined(_CPU_AMD64) && !defined(LINUX)
	_mm_prefetch( (char*)src, _MM_HINT_T0 );
#endif
}

//=================================================================================

// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// (These two macros expand inside __asm blocks, so the MASM-style "80h" is
// intentional; BLOCK_PREFETCH_COPY is documentation only and never referenced.)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

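// A schematic C model of the dispatch the assembly below implements; a sketch
// for readers, not the shipped path (each plain memcpy stands in for the
// movsd / movq+prefetchnta / movntq / block-prefetch loop named beside it,
// and the function name is illustrative):
inline void cryMemcpySketch(void* dst, const void* src, int count)
{
	if (count < TINY_BLOCK_COPY)
		memcpy(dst, src, count);   // tier 1: unrolled movsd copy
	else if (count < IN_CACHE_COPY)
		memcpy(dst, src, count);   // tier 2: MMX movq with software prefetch
	else if (count < UNCACHED_COPY)
		memcpy(dst, src, count);   // tier 3: movq reads + movntq streaming stores
	else
		memcpy(dst, src, count);   // tier 4: block prefetch, then movntq copy
}
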
#if defined _CPU_X86 && !defined(LINUX)
// Inline assembly syntax for use with Visual C++
inline void cryMemcpy( void* Dst, const void* Src, int Count )
{
	if (g_CpuFlags & CPUF_SSE)
	{
		__asm
		{
			mov ecx, [Count]     ; number of bytes to copy
			mov edi, [Dst]       ; destination
			mov esi, [Src]       ; source
			mov ebx, ecx         ; keep a copy of count

			cld
			cmp ecx, TINY_BLOCK_COPY
			jb $memcpy_ic_3      ; tiny? skip mmx copy

			cmp ecx, 32*1024     ; don't align between 32k and 64k because
			jbe $memcpy_do_align ; it appears to be slower
			cmp ecx, 64*1024
			jbe $memcpy_align_done
		$memcpy_do_align:
			mov ecx, 8           ; a trick that's faster than rep movsb...
			sub ecx, edi         ; align destination to qword
			and ecx, 111b        ; get the low bits
			sub ebx, ecx         ; update copy count
			neg ecx              ; set up to jump into the array
			add ecx, offset $memcpy_align_done
			jmp ecx              ; jump to array of movsbs

			align 4
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb

		$memcpy_align_done:      ; destination is dword aligned
			mov ecx, ebx         ; number of bytes left to copy
			shr ecx, 6           ; get 64-byte block count
			jz $memcpy_ic_2      ; finish the last few bytes

			cmp ecx, IN_CACHE_COPY/64 ; too big for cache? use uncached copy
			jae $memcpy_uc_test

			// This is small block copy that uses the MMX registers to copy 8 bytes
			// at a time. It uses the "unrolled loop" optimization, and also uses
			// the software prefetch instruction to get the data into the cache.
			align 16
		$memcpy_ic_1:            ; 64-byte block copies, in-cache copy

			prefetchnta [esi + (200*64/34+192)] ; start reading ahead

			movq mm0, [esi+0]    ; read 64 bits
			movq mm1, [esi+8]
			movq [edi+0], mm0    ; write 64 bits
			movq [edi+8], mm1    ; note: the normal movq writes the
			movq mm2, [esi+16]   ; data to cache; a cache line will be
			movq mm3, [esi+24]   ; allocated as needed, to store the data
			movq [edi+16], mm2
			movq [edi+24], mm3
			movq mm0, [esi+32]
			movq mm1, [esi+40]
			movq [edi+32], mm0
			movq [edi+40], mm1
			movq mm2, [esi+48]
			movq mm3, [esi+56]
			movq [edi+48], mm2
			movq [edi+56], mm3

			add esi, 64          ; update source pointer
			add edi, 64          ; update destination pointer
			dec ecx              ; count down
			jnz $memcpy_ic_1     ; last 64-byte block?

		$memcpy_ic_2:
			mov ecx, ebx         ; has valid low 6 bits of the byte count
		$memcpy_ic_3:
			shr ecx, 2           ; dword count
			and ecx, 1111b       ; only look at the "remainder" bits
			neg ecx              ; set up to jump into the array
			add ecx, offset $memcpy_last_few
			jmp ecx              ; jump to array of movsds

		$memcpy_uc_test:
			cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
			jae $memcpy_bp_1

		$memcpy_64_test:
			or ecx, ecx          ; tail end of block prefetch will jump here
			jz $memcpy_ic_2      ; no more 64-byte blocks left

			// For larger blocks, which will spill beyond the cache, it's faster to
			// use the Streaming Store instruction MOVNTQ. This write instruction
			// bypasses the cache and writes straight to main memory. This code also
			// uses the software prefetch instruction to pre-read the data.
			align 16
		$memcpy_uc_1:            ; 64-byte blocks, uncached copy

			prefetchnta [esi + (200*64/34+192)] ; start reading ahead

			movq mm0,[esi+0]     ; read 64 bits
			add edi,64           ; update destination pointer
			movq mm1,[esi+8]
			add esi,64           ; update source pointer
			movq mm2,[esi-48]
			movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
			movq mm0,[esi-40]    ; note: movntq also prevents the CPU
			movntq [edi-56], mm1 ; from READING the destination address
			movq mm1,[esi-32]    ; into the cache, only to be over-written
			movntq [edi-48], mm2 ; so that also helps performance
			movq mm2,[esi-24]
			movntq [edi-40], mm0
			movq mm0,[esi-16]
			movntq [edi-32], mm1
			movq mm1,[esi-8]
			movntq [edi-24], mm2
			movntq [edi-16], mm0
			dec ecx
			movntq [edi-8], mm1
			jnz $memcpy_uc_1     ; last 64-byte block?

			jmp $memcpy_ic_2     ; almost done

			// For the largest size blocks, a special technique called Block Prefetch
			// can be used to accelerate the read operations. Block Prefetch reads
			// one address per cache line, for a series of cache lines, in a short loop.
			// This is faster than using software prefetch. The technique is great for
			// getting maximum read bandwidth, especially in DDR memory systems.
		$memcpy_bp_1:            ; large blocks, block prefetch copy

			cmp ecx, CACHEBLOCK  ; big enough to run another prefetch loop?
			jl $memcpy_64_test   ; no, back to regular uncached copy

			mov eax, CACHEBLOCK / 2  ; block prefetch loop, unrolled 2X
			add esi, CACHEBLOCK * 64 ; move to the top of the block
			align 16
		$memcpy_bp_2:
			mov edx, [esi-64]    ; grab one address per cache line
			mov edx, [esi-128]   ; grab one address per cache line
			sub esi, 128         ; go reverse order to suppress HW prefetcher
			dec eax              ; count down the cache lines
			jnz $memcpy_bp_2     ; keep grabbing more lines into cache

			mov eax, CACHEBLOCK  ; now that it's in cache, do the copy
			align 16
		$memcpy_bp_3:
			movq mm0, [esi   ]   ; read 64 bits
			movq mm1, [esi+ 8]
			movq mm2, [esi+16]
			movq mm3, [esi+24]
			movq mm4, [esi+32]
			movq mm5, [esi+40]
			movq mm6, [esi+48]
			movq mm7, [esi+56]
			add esi, 64          ; update source pointer
			movntq [edi   ], mm0 ; write 64 bits, bypassing cache
			movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
			movntq [edi+16], mm2 ; from READING the destination address
			movntq [edi+24], mm3 ; into the cache, only to be over-written,
			movntq [edi+32], mm4 ; so that also helps performance
			movntq [edi+40], mm5
			movntq [edi+48], mm6
			movntq [edi+56], mm7
			add edi, 64          ; update dest pointer

			dec eax              ; count down

			jnz $memcpy_bp_3     ; keep copying
			sub ecx, CACHEBLOCK  ; update the 64-byte block count
			jmp $memcpy_bp_1     ; keep processing chunks

			// The smallest copy uses the X86 "movsd" instruction, in an optimized
			// form which is an "unrolled loop". Then it handles the last few bytes.
			align 4
			movsd
			movsd                ; perform last 1-15 dword copies
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd                ; perform last 1-7 dword copies
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd

		$memcpy_last_few:        ; dword aligned from before movsds
			mov ecx, ebx         ; has valid low 2 bits of the byte count
			and ecx, 11b         ; the last few cows must come home
			jz $memcpy_final     ; no more, let's leave
			rep movsb            ; the last 1, 2, or 3 bytes

		$memcpy_final:
			emms                 ; clean up the MMX state
			sfence               ; flush the write buffer
			// mov eax, [dest]   ; ret value = destination pointer
		}
	}
	else
	{
		memcpy(Dst, Src, Count);
	}
}

// Touch one address per 64-byte cache line over nCount bytes, pulling the
// range into cache; a negative count walks the range backwards.
inline void cryPrefetch(const void* Src, int nCount)
{
	nCount >>= 6;
	if (nCount > 0)
	{
		_asm
		{
			mov esi, Src;
			mov ecx, nCount;
		mPr0:
			align 16
			dec ecx;
			mov eax, [esi];
			mov eax,0;
			lea esi, [esi+40h];
			jne mPr0;
		}
	}
	else if (nCount < 0) // a zero count must not enter the loop: ecx would wrap
	{
		_asm
		{
			mov esi, Src;
			mov ecx, nCount;
		mPr1:
			align 16
			inc ecx;
			mov eax, [esi];
			mov eax,0;
			lea esi, [esi-40h];
			jne mPr1;
		}
	}
}

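// An intrinsics-based sketch of the same line-touching prefetch, assuming
// _mm_prefetch from <xmmintrin.h> is available here (the name
// cryPrefetchSketch is illustrative, not part of the original API):
inline void cryPrefetchSketch(const void* src, int nBytes)
{
	const char* p = (const char*)src;
	for (int i = 0; i < nBytes; i += 64)            // one touch per cache line
		_mm_prefetch((char*)(p + i), _MM_HINT_NTA); // non-temporal hint
}
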
inline void cryMemcpy (void* inDst, const void* inSrc, int nCount, int nFlags)
{
	int i;
	int cnt;
	INT_PTR nAlign16;
	unsigned char *Src = (unsigned char *)inSrc;
	unsigned char *Dst = (unsigned char *)inDst;
	// Small copies are not worth the alignment set-up below.
	if (nCount < 32 || (nCount < 128 && !(nFlags & 4)))
	{
		memcpy(Dst, Src, nCount);
		return;
	}
	// nFlags & 1: align relative to the source, otherwise to the destination.
	if (nFlags & 1)
		nAlign16 = (INT_PTR)Src & 0xf;
	else
		nAlign16 = (INT_PTR)Dst & 0xf;
	if (nCount >= 4)
	{
		// Copy single bytes up to the next 4-byte boundary...
		if (nAlign16 & 3)
		{
			cnt = (int)(4 - (nAlign16 & 3));
			for (i=0; i<cnt; i++)
			{
				*Dst++ = *Src++;
			}
			nCount -= cnt;
			nAlign16 += cnt; // keep the offset consistent for the next step
		}
		// ...then dwords up to the next 16-byte boundary.
		if (nCount >= 16 && (nAlign16 & 0xc))
		{
			cnt = (int)(16 - nAlign16);
			for (i=0; i<(cnt>>2); i++)
			{
				*(UINT_PTR *)Dst = *(UINT_PTR *)Src;
				Dst += 4;
				Src += 4;
			}
			nCount -= cnt;
		}
	}
	// Split the remaining count into 64-byte, 16-byte, dword and byte parts.
	int Count_64 = 0xffffffc0 & nCount;
	int Count_16 = 0x30 & nCount;
	int Count_12 = 0xc & nCount;
	int Count_4 = 0x3 & nCount;
if (Count_64)
|
|
{
|
|
if (nFlags & 4)
|
|
{
|
|
while (Count_64 > 2048)
|
|
{
|
|
cryPrefetch(Src, 2048);
|
|
if (g_CpuFlags & CPUF_SSE)
|
|
{
|
|
if ((INT_PTR)Src & 0xf)
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
St0:
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St0
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
St1:
|
|
align 16
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St1
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
St2:
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St2
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
St3:
|
|
align 16
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St3
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if (g_CpuFlags & CPUF_MMX)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
St4:
|
|
align 16
|
|
movq mm0,xmmword ptr [ebx]
|
|
movq mm1,xmmword ptr [ebx+08h]
|
|
movq mm2,xmmword ptr [ebx+10h]
|
|
movq mm3,xmmword ptr [ebx+18h]
|
|
movq mm4,xmmword ptr [ebx+20h]
|
|
movq mm5,xmmword ptr [ebx+28h]
|
|
movq mm6,xmmword ptr [ebx+30h]
|
|
movq mm7,xmmword ptr [ebx+38h]
|
|
add ebx,40h
|
|
movq mmword ptr [edx],mm0
|
|
movq mmword ptr [edx+08h],mm1
|
|
movq mmword ptr [edx+10h],mm2
|
|
movq mmword ptr [edx+18h],mm3
|
|
movq mmword ptr [edx+20h],mm4 ; second half of the 64-byte line
movq mmword ptr [edx+28h],mm5
movq mmword ptr [edx+30h],mm6
movq mmword ptr [edx+38h],mm7
|
|
add edx,40h
|
|
dec ecx
|
|
jne St4
|
|
emms
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, 32;
|
|
shl ecx, 4
|
|
St5:
|
|
align 16
|
|
mov eax,dword ptr [ebx]
|
|
add ebx,4h
|
|
mov dword ptr [edx],eax
|
|
add edx,4h
|
|
dec ecx
|
|
jne St5
|
|
}
|
|
}
|
|
Src += 2048;
|
|
Dst += 2048;
|
|
Count_64 -= 2048;
|
|
}
|
|
if (Count_64 > 128)
|
|
{
|
|
cryPrefetch(Src, Count_64);
|
|
cnt = Count_64>>6;
|
|
|
|
if (g_CpuFlags & CPUF_SSE)
|
|
{
|
|
if ((INT_PTR)Src & 0xf)
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St6:
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St6
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St7:
|
|
align 16
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St7
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St8:
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St8
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St9:
|
|
align 16
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St9
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if (g_CpuFlags & CPUF_MMX)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St10:
|
|
align 16
|
|
movq mm0,xmmword ptr [ebx]
|
|
movq mm1,xmmword ptr [ebx+08h]
|
|
movq mm2,xmmword ptr [ebx+10h]
|
|
movq mm3,xmmword ptr [ebx+18h]
|
|
movq mm4,xmmword ptr [ebx+20h]
|
|
movq mm5,xmmword ptr [ebx+28h]
|
|
movq mm6,xmmword ptr [ebx+30h]
|
|
movq mm7,xmmword ptr [ebx+38h]
|
|
add ebx,40h
|
|
movq mmword ptr [edx],mm0
|
|
movq mmword ptr [edx+08h],mm1
|
|
movq mmword ptr [edx+10h],mm2
|
|
movq mmword ptr [edx+18h],mm3
|
|
movq mmword ptr [edx+20h],mm4 ; second half of the 64-byte line
movq mmword ptr [edx+28h],mm5
movq mmword ptr [edx+30h],mm6
movq mmword ptr [edx+38h],mm7
|
|
add edx,40h
|
|
dec ecx
|
|
jne St10
|
|
emms
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
shl ecx, 4
|
|
St11:
|
|
align 16
|
|
mov eax,dword ptr [ebx]
|
|
add ebx,4h
|
|
mov dword ptr [edx],eax
|
|
add edx,4h
|
|
dec ecx
|
|
jne St11
|
|
}
|
|
}
|
|
Src += Count_64;
|
|
Dst += Count_64;
|
|
Count_64 = 0;
|
|
}
|
|
}
|
|
if (Count_64)
|
|
{
|
|
cnt = Count_64>>6;
|
|
|
|
if (g_CpuFlags & CPUF_SSE)
|
|
{
|
|
if ((INT_PTR)Src & 0xf)
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St12:
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St12
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St13:
|
|
align 16
|
|
movups xmm0,xmmword ptr [ebx]
|
|
movups xmm1,xmmword ptr [ebx+10h]
|
|
movups xmm2,xmmword ptr [ebx+20h]
|
|
movups xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St13
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St14:
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movups xmmword ptr [edx],xmm0
|
|
movups xmmword ptr [edx+10h],xmm1
|
|
movups xmmword ptr [edx+20h],xmm2
|
|
movups xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St14
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St15:
|
|
align 16
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
movaps xmm1,xmmword ptr [ebx+10h]
|
|
movaps xmm2,xmmword ptr [ebx+20h]
|
|
movaps xmm3,xmmword ptr [ebx+30h]
|
|
add ebx,40h
|
|
movaps xmmword ptr [edx],xmm0
|
|
movaps xmmword ptr [edx+10h],xmm1
|
|
movaps xmmword ptr [edx+20h],xmm2
|
|
movaps xmmword ptr [edx+30h],xmm3
|
|
add edx,40h
|
|
dec ecx
|
|
jne St15
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if (g_CpuFlags & CPUF_MMX)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St16:
|
|
align 16
|
|
movq mm0,xmmword ptr [ebx]
|
|
movq mm1,xmmword ptr [ebx+08h]
|
|
movq mm2,xmmword ptr [ebx+10h]
|
|
movq mm3,xmmword ptr [ebx+18h]
|
|
movq mm4,xmmword ptr [ebx+20h]
|
|
movq mm5,xmmword ptr [ebx+28h]
|
|
movq mm6,xmmword ptr [ebx+30h]
|
|
movq mm7,xmmword ptr [ebx+38h]
|
|
add ebx,40h
|
|
movq mmword ptr [edx],mm0
|
|
movq mmword ptr [edx+08h],mm1
|
|
movq mmword ptr [edx+10h],mm2
|
|
movq mmword ptr [edx+18h],mm3
|
|
movq mmword ptr [edx+20h],mm4 ; second half of the 64-byte line
movq mmword ptr [edx+28h],mm5
movq mmword ptr [edx+30h],mm6
movq mmword ptr [edx+38h],mm7
|
|
add edx,40h
|
|
dec ecx
|
|
jne St16
|
|
emms
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
shl ecx, 4
|
|
St17:
|
|
align 16
|
|
mov eax,dword ptr [ebx]
|
|
add ebx,4h
|
|
mov dword ptr [edx],eax
|
|
add edx,4h
|
|
dec ecx
|
|
jne St17
|
|
}
|
|
}
|
|
Src += Count_64;
|
|
Dst += Count_64;
|
|
}
|
|
}
|
|
if (Count_16)
|
|
{
|
|
cnt = Count_16>>4;
|
|
if (g_CpuFlags & CPUF_SSE)
|
|
{
|
|
if ((INT_PTR)Src & 0xf)
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St18:
|
|
movups xmm0,xmmword ptr [ebx]
|
|
add ebx,10h
|
|
movups xmmword ptr [edx],xmm0
|
|
add edx,10h
|
|
dec ecx
|
|
jne St18
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St19:
|
|
align 16
|
|
movups xmm0,xmmword ptr [ebx]
|
|
add ebx,10h
|
|
movaps xmmword ptr [edx],xmm0
|
|
add edx,10h
|
|
dec ecx
|
|
jne St19
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ((INT_PTR)Dst & 0xf)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St20:
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
add ebx,10h
|
|
movups xmmword ptr [edx],xmm0
|
|
add edx,10h
|
|
dec ecx
|
|
jne St20
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St21:
|
|
align 16
|
|
movaps xmm0,xmmword ptr [ebx]
|
|
add ebx,10h
|
|
movaps xmmword ptr [edx],xmm0
|
|
add edx,10h
|
|
dec ecx
|
|
jne St21
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if (g_CpuFlags & CPUF_MMX)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
St22:
|
|
align 16
|
|
movq mm0,xmmword ptr [ebx]
|
|
movq mm1,xmmword ptr [ebx+08h]
|
|
add ebx,10h
|
|
movq mmword ptr [edx],mm0
|
|
movq mmword ptr [edx+08h],mm1
|
|
add edx,10h
|
|
dec ecx
|
|
jne St22
|
|
emms
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, cnt;
|
|
shl ecx, 2
|
|
St23:
|
|
align 16
|
|
mov eax,dword ptr [ebx]
|
|
add ebx,4h
|
|
mov dword ptr [edx],eax
|
|
add edx,4h
|
|
dec ecx
|
|
jne St23
|
|
}
|
|
}
|
|
Src += Count_16;
|
|
Dst += Count_16;
|
|
}
|
|
if (Count_12)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, Count_12;
|
|
shr ecx, 2
|
|
St24:
|
|
align 16
|
|
mov eax,dword ptr [ebx]
|
|
add ebx,4h
|
|
mov dword ptr [edx],eax
|
|
add edx,4h
|
|
dec ecx
|
|
jne St24
|
|
}
|
|
Src += Count_12;
|
|
Dst += Count_12;
|
|
}
|
|
if (Count_4)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, Src;
|
|
mov edx, Dst;
|
|
mov ecx, Count_4;
|
|
St25:
|
|
align 16
|
|
mov al,byte ptr [ebx]
add ebx,1h ; byte-sized tail (1-3 bytes), so advance one byte per pass
mov byte ptr [edx],al
add edx,1h
|
|
dec ecx
|
|
jne St25
|
|
}
|
|
}
|
|
}
|
|
|
|
//==============================================================================
|
|
|
|
|
|
// Matrix inversion using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline bool invertMatrixf_3DNow(float *pOut, const float *pIn)
|
|
{
|
|
_asm
|
|
{
|
|
mov edx, pIn
|
|
sub esp,40h
|
|
movq mm1,mmword ptr [edx+10h]
|
|
movq mm2,mmword ptr [edx+20h]
|
|
pswapd mm5,mm1
|
|
movq mm3,mmword ptr [edx+30h]
|
|
pswapd mm4,mm2
|
|
movq mm0,mmword ptr [edx]
|
|
pswapd mm6,mm3
|
|
pswapd mm7,mm2
|
|
pfmul mm5,mm0
|
|
pfmul mm4,mm0
|
|
pfmul mm6,mm0
|
|
pswapd mm0,mm3
|
|
pfmul mm7,mm1
|
|
pfnacc mm5,mm4
|
|
pfmul mm1,mm0
|
|
pfmul mm2,mm0
|
|
pfnacc mm6,mm7
|
|
pfnacc mm1,mm2
|
|
movq mm0,mmword ptr [edx+38h]
|
|
movq mm2,mm6
|
|
movq mm4,mm1
|
|
movq mm7,mmword ptr [edx+28h]
|
|
punpckhdq mm2,mm2
|
|
movq mm3,mm1
|
|
punpckhdq mm4,mm4
|
|
pfmul mm2,mm0
|
|
punpckldq mm3,mm3
|
|
pfmul mm4,mmword ptr [edx+18h]
|
|
pfmul mm3,mm7
|
|
pfadd mm2,mm4
|
|
movq mm4,mm6
|
|
pfsub mm2,mm3
|
|
movq mm3,mm1
|
|
punpckldq mm4,mm4
|
|
movq mmword ptr [esp+8],mm2
|
|
movq mm2,mm5
|
|
punpckhdq mm3,mm3
|
|
punpckhdq mm2,mm2
|
|
pfmul mm3,mmword ptr [edx+8]
|
|
pfmul mm2,mm0
|
|
pfmul mm4,mm7
|
|
pfsub mm2,mm4
|
|
pfadd mm2,mm3
|
|
movq mmword ptr [esp+18h],mm2
|
|
movq mm2,mm5
|
|
movq mm3,mm6
|
|
punpckldq mm1,mm1
|
|
punpckldq mm2,mm2
|
|
punpckldq mm3,mm3
|
|
pfmul mm1,mmword ptr [edx+8]
|
|
pfmul mm0,mm2
|
|
pfmul mm3,mmword ptr [edx+18h]
|
|
pfadd mm0,mm1
|
|
pfsub mm0,mm3
|
|
movq mmword ptr [esp+28h],mm0
|
|
punpckhdq mm5,mm5
|
|
punpckhdq mm6,mm6
|
|
pfmul mm2,mm7
|
|
pfmul mm5,mmword ptr [edx+18h]
|
|
movq mm0,mmword ptr [edx+8]
|
|
pfmul mm6,mmword ptr [edx+8]
|
|
pfsub mm2,mm5
|
|
pfadd mm2,mm6
|
|
movq mmword ptr [esp+38h],mm2
|
|
movq mm1,mmword ptr [edx+18h]
|
|
movq mm2,mmword ptr [edx+28h]
|
|
movq mm3,mmword ptr [edx+38h]
|
|
pswapd mm5,mm1
|
|
pswapd mm4,mm2
|
|
pfmul mm5,mm0
|
|
pswapd mm6,mm3
|
|
pfmul mm4,mm0
|
|
pswapd mm7,mm2
|
|
pfmul mm6,mm0
|
|
pswapd mm0,mm3
|
|
pfnacc mm5,mm4
|
|
pfmul mm7,mm1
|
|
pfmul mm1,mm0
|
|
pfmul mm2,mm0
|
|
pfnacc mm6,mm7
|
|
pfnacc mm1,mm2
|
|
movq mm0,mmword ptr [edx+30h]
|
|
movq mm4,mm1
|
|
movq mm3,mm1
|
|
movq mm7,mmword ptr [edx+20h]
|
|
movq mm2,mm6
|
|
punpckhdq mm4,mm4
|
|
punpckldq mm3,mm3
|
|
punpckhdq mm2,mm2
|
|
pfmul mm4,mmword ptr [edx+10h]
|
|
pfmul mm2,mm0
|
|
pfmul mm3,mm7
|
|
pfadd mm2,mm4
|
|
pfsub mm2,mm3
|
|
movq mmword ptr [esp],mm2
|
|
movq mm3,mm1
|
|
movq mm4,mm6
|
|
movq mm2,mm5
|
|
punpckhdq mm3,mm3
|
|
punpckldq mm4,mm4
|
|
punpckhdq mm2,mm2
|
|
pfmul mm3,mmword ptr [edx]
|
|
pfmul mm2,mm0
|
|
pfmul mm4,mm7
|
|
pfsub mm2,mm4
|
|
pfadd mm2,mm3
|
|
movq mmword ptr [esp+10h],mm2
|
|
movq mm4,mm0
|
|
movq mm2,mm5
|
|
movq mm3,mm6
|
|
punpckldq mm1,mm1
|
|
punpckldq mm2,mm2
|
|
punpckldq mm3,mm3
|
|
pfmul mm1,mmword ptr [edx]
|
|
pfmul mm0,mm2
|
|
pfmul mm3,mmword ptr [edx+10h]
|
|
pfadd mm0,mm1
|
|
pfsub mm0,mm3
|
|
movq mmword ptr [esp+20h],mm0
|
|
punpckhdq mm5,mm5
|
|
punpckhdq mm6,mm6
|
|
pfmul mm2,mm7
|
|
pfmul mm5,mmword ptr [edx+10h]
|
|
pfmul mm6,mmword ptr [edx]
|
|
pfsub mm2,mm5
|
|
pfadd mm2,mm6
|
|
movq mmword ptr [esp+30h],mm2
|
|
punpckhdq mm0,mm2
|
|
punpckldq mm7,mm4
|
|
movq mm3,mmword ptr [edx]
|
|
movq mm1,mmword ptr [esp]
|
|
punpckldq mm3,dword ptr [edx+10h]
|
|
punpckhdq mm1,mmword ptr [esp+10h]
|
|
pfmul mm0,mm7
|
|
pfmul mm3,mm1
|
|
pfnacc mm0,mm3
|
|
pfacc mm0,mm0
|
|
movq mm4,mm0
|
|
pfrcp mm1,mm0
|
|
pxor mm3,mm3
|
|
punpckldq mm0,mm0
|
|
pfcmpeq mm3,mm4
|
|
pfrcpit1 mm0,mm1
|
|
pfrcpit2 mm0,mm1
|
|
movd eax,mm3
|
|
pfmul mm0,mmword ptr const3DN_1_N1
|
|
test eax,eax
|
|
pswapd mm1,mm0
|
|
jne m1
|
|
mov eax, pOut
|
|
movq mm2,mmword ptr [esp]
|
|
movq mm5,mmword ptr [esp+8]
|
|
movq mm3,mmword ptr [esp+10h]
|
|
movq mm6,mmword ptr [esp+18h]
|
|
pfmul mm2,mm1
|
|
pfmul mm3,mm0
|
|
pfmul mm5,mm1
|
|
pfmul mm6,mm0
|
|
movq mm4,mm2
|
|
movq mm7,mm5
|
|
punpckhdq mm2,mm3
|
|
punpckldq mm4,mm3
|
|
punpckhdq mm5,mm6
|
|
punpckldq mm7,mm6
|
|
movq mmword ptr [eax],mm2
|
|
movq mmword ptr [eax+10h],mm4
|
|
movq mmword ptr [eax+20h],mm5
|
|
movq mmword ptr [eax+30h],mm7
|
|
movq mm2,mmword ptr [esp+20h]
|
|
movq mm5,mmword ptr [esp+28h]
|
|
movq mm3,mmword ptr [esp+30h]
|
|
movq mm6,mmword ptr [esp+38h]
|
|
pfmul mm2,mm1
|
|
pfmul mm3,mm0
|
|
pfmul mm5,mm1
|
|
pfmul mm6,mm0
|
|
movq mm4,mm2
|
|
movq mm7,mm5
|
|
punpckhdq mm2,mm3
|
|
punpckldq mm4,mm3
|
|
punpckhdq mm5,mm6
|
|
punpckldq mm7,mm6
|
|
movq mmword ptr [eax+8],mm2
|
|
movq mmword ptr [eax+18h],mm4
|
|
movq mmword ptr [eax+28h],mm5
|
|
movq mmword ptr [eax+38h],mm7
|
|
m1:
|
|
femms
|
|
add esp,40h
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Matrix multiplication using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void multMatrixf_3DNow(float *product, const float *m1, const float *m2)
|
|
{
|
|
__asm
|
|
{
|
|
femms
|
|
sub esp,44h
|
|
mov edx, m1
|
|
mov ecx, m2
|
|
mov eax, product
|
|
mov dword ptr [esp+40h],ebp
|
|
mov ebp,esp
|
|
and esp,0FFFFFFF8h
|
|
movq mm0,mmword ptr [edx]
|
|
movq mm1,mmword ptr [edx+10h]
|
|
movq mm3,mmword ptr [edx+28h]
|
|
movq mm4,mmword ptr [edx+38h]
|
|
movq mm2,mm0
|
|
punpckldq mm0,mm1
|
|
punpckhdq mm2,mm1
|
|
movq mm5,mm3
|
|
movq mmword ptr [esp],mm0
|
|
movq mmword ptr [esp+10h],mm2
|
|
movq mm6,mmword ptr [edx+8]
|
|
punpckldq mm3,mm4
|
|
movq mm7,mmword ptr [edx+18h]
|
|
movq mm0,mmword ptr [edx+20h]
|
|
punpckhdq mm5,mm4
|
|
movq mm1,mm6
|
|
movq mmword ptr [esp+28h],mm3
|
|
movq mmword ptr [esp+38h],mm5
|
|
punpckldq mm6,mm7
|
|
movq mm4,mmword ptr [edx+30h]
|
|
movq mm2,mm0
|
|
punpckhdq mm1,mm7
|
|
movq mmword ptr [esp+20h],mm6
|
|
punpckhdq mm0,mm4
|
|
punpckldq mm2,mm4
|
|
movq mmword ptr [esp+30h],mm1
|
|
movq mmword ptr [esp+18h],mm0
|
|
movq mmword ptr [esp+8],mm2
|
|
movq mm0,mmword ptr [ecx]
|
|
movq mm1,mmword ptr [ecx+8]
|
|
movq mm2,mm0
|
|
movq mm3,mm1
|
|
movq mm4,mm0
|
|
movq mm5,mm1
|
|
pfmul mm0,mmword ptr [esp]
|
|
pfmul mm1,mmword ptr [esp+8]
|
|
movq mm6,mm2
|
|
movq mm7,mm3
|
|
pfmul mm2,mmword ptr [esp+10h]
|
|
pfmul mm3,mmword ptr [esp+18h]
|
|
pfmul mm4,mmword ptr [esp+20h]
|
|
pfmul mm5,mmword ptr [esp+28h]
|
|
pfmul mm6,mmword ptr [esp+30h]
|
|
pfmul mm7,mmword ptr [esp+38h]
|
|
pfacc mm0,mm2
|
|
pfacc mm1,mm3
|
|
pfacc mm4,mm6
|
|
pfacc mm5,mm7
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
movq mmword ptr [eax],mm0
|
|
movq mmword ptr [eax+8],mm4
|
|
movq mm0,mmword ptr [ecx+10h]
|
|
movq mm1,mmword ptr [ecx+18h]
|
|
movq mm2,mm0
|
|
movq mm3,mm1
|
|
movq mm4,mm0
|
|
movq mm5,mm1
|
|
pfmul mm0,mmword ptr [esp]
|
|
pfmul mm1,mmword ptr [esp+8]
|
|
movq mm6,mm2
|
|
movq mm7,mm3
|
|
pfmul mm2,mmword ptr [esp+10h]
|
|
pfmul mm3,mmword ptr [esp+18h]
|
|
pfmul mm4,mmword ptr [esp+20h]
|
|
pfmul mm5,mmword ptr [esp+28h]
|
|
pfmul mm6,mmword ptr [esp+30h]
|
|
pfmul mm7,mmword ptr [esp+38h]
|
|
pfacc mm0,mm2
|
|
pfacc mm1,mm3
|
|
pfacc mm4,mm6
|
|
pfacc mm5,mm7
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
movq mmword ptr [eax+10h],mm0
|
|
movq mmword ptr [eax+18h],mm4
|
|
movq mm0,mmword ptr [ecx+20h]
|
|
movq mm1,mmword ptr [ecx+28h]
|
|
movq mm2,mm0
|
|
movq mm3,mm1
|
|
movq mm4,mm0
|
|
movq mm5,mm1
|
|
pfmul mm0,mmword ptr [esp]
|
|
pfmul mm1,mmword ptr [esp+8]
|
|
movq mm6,mm2
|
|
movq mm7,mm3
|
|
pfmul mm2,mmword ptr [esp+10h]
|
|
pfmul mm3,mmword ptr [esp+18h]
|
|
pfmul mm4,mmword ptr [esp+20h]
|
|
pfmul mm5,mmword ptr [esp+28h]
|
|
pfmul mm6,mmword ptr [esp+30h]
|
|
pfmul mm7,mmword ptr [esp+38h]
|
|
pfacc mm0,mm2
|
|
pfacc mm1,mm3
|
|
pfacc mm4,mm6
|
|
pfacc mm5,mm7
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
movq mmword ptr [eax+20h],mm0
|
|
movq mmword ptr [eax+28h],mm4
|
|
movq mm0,mmword ptr [ecx+30h]
|
|
movq mm1,mmword ptr [ecx+38h]
|
|
movq mm2,mm0
|
|
movq mm3,mm1
|
|
movq mm4,mm0
|
|
movq mm5,mm1
|
|
pfmul mm0,mmword ptr [esp]
|
|
pfmul mm1,mmword ptr [esp+8]
|
|
movq mm6,mm2
|
|
movq mm7,mm3
|
|
pfmul mm2,mmword ptr [esp+10h]
|
|
pfmul mm3,mmword ptr [esp+18h]
|
|
pfmul mm4,mmword ptr [esp+20h]
|
|
pfmul mm5,mmword ptr [esp+28h]
|
|
pfmul mm6,mmword ptr [esp+30h]
|
|
pfmul mm7,mmword ptr [esp+38h]
|
|
pfacc mm0,mm2
|
|
pfacc mm1,mm3
|
|
pfacc mm4,mm6
|
|
pfacc mm5,mm7
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
movq mmword ptr [eax+30h],mm0
|
|
movq mmword ptr [eax+38h],mm4
|
|
mov esp,ebp
|
|
mov ebp,dword ptr [esp+40h]
|
|
add esp,44h
|
|
femms
|
|
}
|
|
}
|
|
|
|
// Matrix transposing using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void transposeMatrixf_3DNow(float *product, const float *m)
|
|
{
|
|
__asm
|
|
{
|
|
mov edx,m
|
|
movq mm0,mmword ptr [edx]
|
|
movq mm1,mmword ptr [edx+10h]
|
|
movq mm3,mmword ptr [edx+28h]
|
|
movq mm4,mmword ptr [edx+38h]
|
|
movq mm2,mm0
|
|
mov eax,product
|
|
punpckldq mm0,mm1
|
|
punpckhdq mm2,mm1
|
|
movq mm5,mm3
|
|
movq mmword ptr [eax],mm0
|
|
movq mmword ptr [eax+10h],mm2
|
|
movq mm6,mmword ptr [edx+8]
|
|
punpckldq mm3,mm4
|
|
movq mm7,mmword ptr [edx+18h]
|
|
movq mm0,mmword ptr [edx+20h]
|
|
punpckhdq mm5,mm4
|
|
movq mm1,mm6
|
|
movq mmword ptr [eax+28h],mm3
|
|
movq mmword ptr [eax+38h],mm5
|
|
punpckldq mm6,mm7
|
|
movq mm4,mmword ptr [edx+30h]
|
|
movq mm2,mm0
|
|
punpckhdq mm1,mm7
|
|
movq mmword ptr [eax+20h],mm6
|
|
punpckhdq mm0,mm4
|
|
punpckldq mm2,mm4
|
|
movq mmword ptr [eax+30h],mm1
|
|
movq mmword ptr [eax+18h],mm0
|
|
movq mmword ptr [eax+8],mm2
|
|
femms
|
|
}
|
|
}
|
|
|
|
// Matrix identity using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void indentityMatrixf_3DNow(float *product)
|
|
{
|
|
__asm
|
|
{
|
|
movd mm1,dword ptr const3DN_1_N1
|
|
mov eax,dword ptr product
|
|
pxor mm0,mm0
|
|
movq mm2,mm1
|
|
movq mmword ptr [eax],mm1
|
|
movq mmword ptr [eax+8],mm0
|
|
movq mmword ptr [eax+18h],mm0
|
|
punpckhdq mm2,mm2
|
|
movq mmword ptr [eax+20h],mm0
|
|
movq mmword ptr [eax+28h],mm1
|
|
punpckldq mm2,mm1
|
|
movq mmword ptr [eax+30h],mm0
|
|
movq mmword ptr [eax+10h],mm2
|
|
movq mmword ptr [eax+38h],mm2
|
|
femms
|
|
}
|
|
}
|
|
|
|
// Scale Matrix using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void scaleMatrixf_3DNow(float *product, float x, float y, float z)
|
|
{
|
|
__asm
|
|
{
|
|
femms
|
|
mov eax, product
|
|
movd mm3, y
|
|
movq mm1,mmword ptr const3DN_0_1
|
|
movd mm2, x
|
|
pxor mm0,mm0
|
|
psllq mm3,20h
|
|
movd mm4, z
|
|
movq mmword ptr [eax],mm2
|
|
movq mmword ptr [eax+8],mm0
|
|
movq mmword ptr [eax+18h],mm0
|
|
movq mmword ptr [eax+20h],mm0
|
|
movq mmword ptr [eax+30h],mm0
|
|
movq mmword ptr [eax+10h],mm3
|
|
movq mmword ptr [eax+28h],mm4
|
|
movq mmword ptr [eax+38h],mm1
|
|
femms
|
|
}
|
|
}
|
|
|
|
// Transform position using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void transformVec3f_3DNow(float *result, float *inV, float *matrix)
|
|
{
|
|
_asm
|
|
{
|
|
femms
|
|
mov eax,inV
|
|
mov edx,matrix
|
|
mov ecx,result
|
|
movq mm0,mmword ptr [eax]
|
|
movq mm1,mm0
|
|
punpckldq mm0,mm0
|
|
punpckhdq mm1,mm1
|
|
movd mm2,dword ptr [eax+8]
|
|
punpckldq mm2,mm2
|
|
movq mm3,mm0
|
|
pfmul mm0,mmword ptr [edx]
|
|
movq mm4,mm1
|
|
pfmul mm1,mmword ptr [edx+10h]
|
|
movq mm5,mm2
|
|
pfmul mm2,mmword ptr [edx+20h]
|
|
pfadd mm0,mmword ptr [edx+30h]
|
|
pfmul mm3,mmword ptr [edx+8]
|
|
pfadd mm1,mm2
|
|
pfmul mm4,mmword ptr [edx+18h]
|
|
pfmul mm5,mmword ptr [edx+28h]
|
|
pfadd mm3,mmword ptr [edx+38h]
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
pfadd mm3,mm4
|
|
movq mmword ptr [ecx],mm0
|
|
movq mmword ptr [ecx+8],mm3
|
|
femms
|
|
}
|
|
}
|
|
|
|
// Transform normal using the 3DNow! instruction set
// NOTE: On AMD Athlon - much faster than the SSE version
|
|
inline void transformVec3Nf_3DNow(float *result, float *inV, float *matrix)
|
|
{
|
|
_asm
|
|
{
|
|
femms
|
|
mov eax,inV
|
|
mov edx,matrix
|
|
mov ecx,result
|
|
movq mm0,mmword ptr [eax]
|
|
movq mm1,mm0
|
|
punpckldq mm0,mm0
|
|
punpckhdq mm1,mm1
|
|
movd mm2,dword ptr [eax+8]
|
|
punpckldq mm2,mm2
|
|
movq mm3,mm0
|
|
pfmul mm0,mmword ptr [edx]
|
|
movq mm4,mm1
|
|
pfmul mm1,mmword ptr [edx+10h]
|
|
movq mm5,mm2
|
|
pfmul mm2,mmword ptr [edx+20h]
|
|
pfmul mm3,mmword ptr [edx+8]
|
|
pfadd mm1,mm2
|
|
pfmul mm4,mmword ptr [edx+18h]
|
|
pfmul mm5,mmword ptr [edx+28h]
|
|
pfadd mm0,mm1
|
|
pfadd mm4,mm5
|
|
pfadd mm3,mm4
|
|
movq mmword ptr [ecx],mm0
|
|
movd dword ptr [ecx+8],mm3
|
|
femms
|
|
}
|
|
}
|
|
|
|
//==========================================================================================
|
|
// SSE optimizations
|
|
|
|
// Matrix inversion using the SSE instruction set
|
|
inline bool invertMatrixf_SSE(float *pOut, const float *pIn)
|
|
{
|
|
_asm
|
|
{
|
|
mov ebx, esp
|
|
and esp,0FFFFFFF0h
|
|
sub esp,90h
|
|
mov eax, pIn
|
|
movaps xmm2,xmmword ptr [esp+30h]
|
|
movlps xmm2,qword ptr [eax]
|
|
movaps xmm1,xmmword ptr [esp+70h]
|
|
lea ecx,[eax+10h]
|
|
movhps xmm2,qword ptr [ecx]
|
|
lea edx,[eax+20h]
|
|
movlps xmm1,qword ptr [edx]
|
|
movaps xmm0,xmm2
|
|
lea ecx,[eax+30h]
|
|
movhps xmm1,qword ptr [ecx]
|
|
shufps xmm0,xmm1,88h
|
|
shufps xmm1,xmm2,0DDh
|
|
lea edx,[eax+8]
|
|
movlps xmm2,qword ptr [edx]
|
|
movaps xmm3,xmm2
|
|
movaps xmm2,xmmword ptr [esp+70h]
|
|
lea ecx,[eax+18h]
|
|
movhps xmm3,qword ptr [ecx]
|
|
movaps xmm5,xmm3
|
|
lea ecx,[eax+38h]
|
|
add eax,28h
|
|
movlps xmm2,qword ptr [eax]
|
|
movhps xmm2,qword ptr [ecx]
|
|
shufps xmm5,xmm2,88h
|
|
shufps xmm2,xmm3,0DDh
|
|
movaps xmm3,xmm5
|
|
mulps xmm3,xmm2
|
|
shufps xmm3,xmm3,0B1h
|
|
movaps xmm4,xmm1
|
|
mulps xmm4,xmm3
|
|
movaps xmmword ptr [esp+50h],xmm4
|
|
movaps xmm4,xmm3
|
|
shufps xmm4,xmm3,4Eh
|
|
movaps xmm6,xmm0
|
|
mulps xmm6,xmm3
|
|
movaps xmm3,xmm1
|
|
mulps xmm3,xmm4
|
|
subps xmm3,xmmword ptr [esp+50h]
|
|
movaps xmm7,xmm0
|
|
mulps xmm7,xmm4
|
|
movaps xmm4,xmm7
|
|
subps xmm4,xmm6
|
|
movaps xmmword ptr [esp+10h],xmm4
|
|
movaps xmm6,xmm1
|
|
mulps xmm6,xmm5
|
|
shufps xmm6,xmm6,0B1h
|
|
movaps xmm7,xmm0
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+40h],xmm7
|
|
movaps xmm4,xmm2
|
|
mulps xmm4,xmm6
|
|
shufps xmm6,xmm6,4Eh
|
|
movaps xmm7,xmm2
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+60h],xmm7
|
|
movaps xmm7,xmm0
|
|
mulps xmm7,xmm6
|
|
movaps xmm6,xmm7
|
|
subps xmm6,xmmword ptr [esp+40h]
|
|
movaps xmmword ptr [esp+40h],xmm6
|
|
movaps xmm6,xmm1
|
|
shufps xmm6,xmm1,4Eh
|
|
mulps xmm6,xmm2
|
|
shufps xmm6,xmm6,0B1h
|
|
shufps xmm5,xmm5,4Eh
|
|
movaps xmm7,xmm5
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+20h],xmm7
|
|
addps xmm4,xmm3
|
|
subps xmm4,xmmword ptr [esp+60h]
|
|
movaps xmm3,xmmword ptr [esp+20h]
|
|
movaps xmm7,xmm0
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp],xmm7
|
|
shufps xmm6,xmm6,4Eh
|
|
movaps xmm7,xmm5
|
|
mulps xmm7,xmm6
|
|
addps xmm3,xmm4
|
|
movaps xmmword ptr [esp+30h],xmm6
|
|
subps xmm3,xmm7
|
|
movaps xmmword ptr [esp+50h],xmm3
|
|
movaps xmm7,xmm2
|
|
movaps xmm3,xmm0
|
|
mulps xmm3,xmmword ptr [esp+30h]
|
|
subps xmm3,xmmword ptr [esp]
|
|
movaps xmmword ptr [esp],xmm3
|
|
movaps xmm3,xmm2
|
|
movaps xmm6,xmm0
|
|
mulps xmm6,xmm1
|
|
shufps xmm6,xmm6,0B1h
|
|
mulps xmm3,xmm6
|
|
movaps xmm4,xmm5
|
|
mulps xmm4,xmm6
|
|
shufps xmm6,xmm6,4Eh
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+20h],xmm7
|
|
movaps xmm7,xmm5
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+70h],xmm7
|
|
movaps xmm6,xmm0
|
|
mulps xmm6,xmm2
|
|
shufps xmm6,xmm6,0B1h
|
|
movaps xmm7,xmm5
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+30h],xmm7
|
|
movaps xmm7,xmm1
|
|
mulps xmm7,xmm6
|
|
shufps xmm6,xmm6,4Eh
|
|
movaps xmmword ptr [esp+60h],xmm7
|
|
movaps xmm7,xmm5
|
|
mulps xmm7,xmm6
|
|
movaps xmmword ptr [esp+80h],xmm7
|
|
movaps xmm7,xmm1
|
|
mulps xmm7,xmm6
|
|
movaps xmm6,xmmword ptr [esp]
|
|
shufps xmm6,xmmword ptr [esp],4Eh
|
|
addps xmm3,xmm6
|
|
movaps xmm6,xmm3
|
|
movaps xmm3,xmmword ptr [esp+20h]
|
|
subps xmm3,xmm6
|
|
movaps xmm6,xmm3
|
|
movaps xmm3,xmmword ptr [esp+60h]
|
|
addps xmm3,xmm6
|
|
subps xmm3,xmm7
|
|
movaps xmmword ptr [esp],xmm3
|
|
movaps xmm3,xmm0
|
|
mulps xmm0,xmmword ptr [esp+50h]
|
|
mulps xmm3,xmm5
|
|
movaps xmm5,xmm3
|
|
shufps xmm5,xmm3,0B1h
|
|
movaps xmm3,xmm2
|
|
movaps xmm6,xmm1
|
|
mulps xmm3,xmm5
|
|
mulps xmm6,xmm5
|
|
shufps xmm5,xmm5,4Eh
|
|
mulps xmm2,xmm5
|
|
movaps xmmword ptr [esp+20h],xmm6
|
|
movaps xmm6,xmm2
|
|
movaps xmm2,xmmword ptr [esp+10h]
|
|
shufps xmm2,xmmword ptr [esp+10h],4Eh
|
|
subps xmm2,xmmword ptr [esp+30h]
|
|
movaps xmm7,xmm2
|
|
movaps xmm2,xmmword ptr [esp+80h]
|
|
addps xmm2,xmm7
|
|
addps xmm3,xmm2
|
|
movaps xmm2,xmmword ptr [esp+40h]
|
|
shufps xmm2,xmmword ptr [esp+40h],4Eh
|
|
subps xmm4,xmm2
|
|
subps xmm4,xmmword ptr [esp+70h]
|
|
subps xmm4,xmmword ptr [esp+20h]
|
|
movaps xmm2,xmm0
|
|
shufps xmm2,xmm0,4Eh
|
|
addps xmm2,xmm0
|
|
movaps xmm0,xmm2
|
|
shufps xmm0,xmm2,0B1h
|
|
mulps xmm1,xmm5
|
|
addss xmm0,xmm2
|
|
subps xmm3,xmm6
|
|
addps xmm1,xmm4
|
|
xorps xmm2,xmm2
|
|
xor eax,eax
|
|
inc eax
|
|
xor ecx,ecx
|
|
comiss xmm0,xmm2
|
|
cmove ecx,eax
|
|
test ecx,ecx
|
|
je m1
|
|
xor eax,eax
|
|
jmp mEnd
|
|
m1:
|
|
mov eax, pOut
|
|
movaps xmmword ptr [esp+10h],xmm0
|
|
rcpss xmm2,xmm0
|
|
movss dword ptr [esp+10h],xmm2
|
|
movaps xmm2,xmmword ptr [esp+10h]
|
|
movaps xmm4,xmm2
|
|
mulss xmm4,xmm2
|
|
mulss xmm0,xmm4
|
|
movaps xmm4,xmm0
|
|
movaps xmm0,xmm2
|
|
addss xmm0,xmm2
|
|
subss xmm0,xmm4
|
|
shufps xmm0,xmm0,0
|
|
movaps xmm2,xmm0
|
|
mulps xmm2,xmmword ptr [esp+50h]
|
|
movlps qword ptr [eax],xmm2
|
|
lea ecx,[eax+8]
|
|
movhps qword ptr [ecx],xmm2
|
|
movaps xmm2,xmm0
|
|
mulps xmm2,xmm3
|
|
lea ecx,[eax+10h]
|
|
movlps qword ptr [ecx],xmm2
|
|
lea ecx,[eax+18h]
|
|
movhps qword ptr [ecx],xmm2
|
|
movaps xmm2,xmm0
|
|
mulps xmm2,xmmword ptr [esp]
|
|
lea ecx,[eax+20h]
|
|
movlps qword ptr [ecx],xmm2
|
|
lea ecx,[eax+28h]
|
|
movhps qword ptr [ecx],xmm2
|
|
lea ecx,[eax+30h]
|
|
mulps xmm0,xmm1
|
|
movlps qword ptr [ecx],xmm0
|
|
lea ecx,[eax+38h]
|
|
movhps qword ptr [ecx],xmm0
|
|
mEnd:
|
|
mov esp, ebx
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Matrix multiplication using the SSE instruction set
// IMPORTANT NOTE: much faster if matrices m1 and product are 16-byte aligned
|
|
inline void multMatrixf_SSE(float *product, const float *m1, const float *m2)
|
|
{
|
|
__asm
|
|
{
|
|
mov eax, m2;
|
|
mov ecx, m1;
|
|
mov edx, product;
|
|
test dl,0Fh
|
|
jne lNonAligned
|
|
test cl,0Fh
|
|
jne lNonAligned
|
|
movss xmm0,dword ptr [eax]
|
|
movaps xmm1,xmmword ptr [ecx]
|
|
shufps xmm0,xmm0,0
|
|
movss xmm2,dword ptr [eax+4]
|
|
mulps xmm0,xmm1
|
|
shufps xmm2,xmm2,0
|
|
movaps xmm3,xmmword ptr [ecx+10h]
|
|
movss xmm4,dword ptr [eax+8]
|
|
mulps xmm2,xmm3
|
|
shufps xmm4,xmm4,0
|
|
addps xmm0,xmm2
|
|
movaps xmm2,xmmword ptr [ecx+20h]
|
|
movss xmm5,dword ptr [eax+0Ch]
|
|
mulps xmm4,xmm2
|
|
shufps xmm5,xmm5,0
|
|
movaps xmm6,xmmword ptr [ecx+30h]
|
|
mulps xmm5,xmm6
|
|
addps xmm4,xmm5
|
|
addps xmm0,xmm4
|
|
movaps xmmword ptr [edx],xmm0
|
|
movss xmm0,dword ptr [eax+10h]
|
|
movss xmm4,dword ptr [eax+14h]
|
|
shufps xmm0,xmm0,0
|
|
shufps xmm4,xmm4,0
|
|
mulps xmm0,xmm1
|
|
mulps xmm4,xmm3
|
|
movss xmm5,dword ptr [eax+18h]
|
|
addps xmm0,xmm4
|
|
shufps xmm5,xmm5,0
|
|
movss xmm4,dword ptr [eax+1Ch]
|
|
mulps xmm5,xmm2
|
|
shufps xmm4,xmm4,0
|
|
mulps xmm4,xmm6
|
|
addps xmm5,xmm4
|
|
addps xmm0,xmm5
|
|
movaps xmmword ptr [edx+10h],xmm0
|
|
movss xmm0,dword ptr [eax+20h]
|
|
movss xmm4,dword ptr [eax+24h]
|
|
shufps xmm0,xmm0,0
|
|
shufps xmm4,xmm4,0
|
|
mulps xmm0,xmm1
|
|
mulps xmm4,xmm3
|
|
movss xmm5,dword ptr [eax+28h]
|
|
addps xmm0,xmm4
|
|
shufps xmm5,xmm5,0
|
|
movss xmm4,dword ptr [eax+2Ch]
|
|
mulps xmm5,xmm2
|
|
shufps xmm4,xmm4,0
|
|
mulps xmm4,xmm6
|
|
addps xmm5,xmm4
|
|
addps xmm0,xmm5
|
|
movaps xmmword ptr [edx+20h],xmm0
|
|
movss xmm0,dword ptr [eax+30h]
|
|
movss xmm4,dword ptr [eax+34h]
|
|
shufps xmm0,xmm0,0
|
|
shufps xmm4,xmm4,0
|
|
mulps xmm0,xmm1
|
|
mulps xmm4,xmm3
|
|
movss xmm1,dword ptr [eax+38h]
|
|
addps xmm0,xmm4
|
|
shufps xmm1,xmm1,0
|
|
movss xmm3,dword ptr [eax+3Ch]
|
|
mulps xmm1,xmm2
|
|
shufps xmm3,xmm3,0
|
|
mulps xmm3,xmm6
|
|
addps xmm1,xmm3
|
|
addps xmm0,xmm1
|
|
movaps xmmword ptr [edx+30h],xmm0
|
|
jmp lEnd
|
|
lNonAligned:
|
|
movlps xmm0,qword ptr [ecx]
|
|
movhps xmm0,qword ptr [ecx+8]
|
|
movlps xmm1,qword ptr [ecx+10h]
|
|
movhps xmm1,qword ptr [ecx+18h]
|
|
movlps xmm2,qword ptr [ecx+20h]
|
|
movhps xmm2,qword ptr [ecx+28h]
|
|
movlps xmm3,qword ptr [ecx+30h]
|
|
movhps xmm3,qword ptr [ecx+38h]
|
|
movss xmm4,dword ptr [eax]
|
|
movss xmm5,dword ptr [eax+4]
|
|
movss xmm6,dword ptr [eax+8]
|
|
movss xmm7,dword ptr [eax+0Ch]
|
|
shufps xmm4,xmm4,0
|
|
shufps xmm5,xmm5,0
|
|
shufps xmm6,xmm6,0
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm4,xmm0
|
|
mulps xmm5,xmm1
|
|
mulps xmm6,xmm2
|
|
mulps xmm7,xmm3
|
|
addps xmm4,xmm5
|
|
addps xmm6,xmm7
|
|
addps xmm4,xmm6
|
|
movss xmm5,dword ptr [eax+10h]
|
|
movss xmm6,dword ptr [eax+14h]
|
|
movss xmm7,dword ptr [eax+18h]
|
|
shufps xmm5,xmm5,0
|
|
shufps xmm6,xmm6,0
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm5,xmm0
|
|
mulps xmm6,xmm1
|
|
mulps xmm7,xmm2
|
|
addps xmm5,xmm6
|
|
addps xmm5,xmm7
|
|
movss xmm6,dword ptr [eax+1Ch]
|
|
shufps xmm6,xmm6,0
|
|
mulps xmm6,xmm3
|
|
addps xmm5,xmm6
|
|
movss xmm6,dword ptr [eax+20h]
|
|
movss xmm7,dword ptr [eax+24h]
|
|
shufps xmm6,xmm6,0
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm6,xmm0
|
|
mulps xmm7,xmm1
|
|
addps xmm6,xmm7
|
|
movss xmm7,dword ptr [eax+28h]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm7,xmm2
|
|
addps xmm6,xmm7
|
|
movss xmm7,dword ptr [eax+2Ch]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm7,xmm3
|
|
addps xmm6,xmm7
|
|
movss xmm7,dword ptr [eax+30h]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm0,xmm7
|
|
movss xmm7,dword ptr [eax+34h]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm1,xmm7
|
|
movss xmm7,dword ptr [eax+38h]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm2,xmm7
|
|
movss xmm7,dword ptr [eax+3Ch]
|
|
shufps xmm7,xmm7,0
|
|
mulps xmm3,xmm7
|
|
movlps qword ptr [edx],xmm4
|
|
movhps qword ptr [edx+8],xmm4
|
|
addps xmm0,xmm1
|
|
movlps qword ptr [edx+10h],xmm5
|
|
movhps qword ptr [edx+18h],xmm5
|
|
addps xmm2,xmm3
|
|
movlps qword ptr [edx+20h],xmm6
|
|
movhps qword ptr [edx+28h],xmm6
|
|
addps xmm0,xmm2
|
|
movlps qword ptr [edx+30h],xmm0
|
|
movhps qword ptr [edx+38h],xmm0
|
|
lEnd:
|
|
}
|
|
}
|
|
|
|
// Transform position using the SSE instruction set
// IMPORTANT NOTE: much faster if the matrix is 16-byte aligned
|
|
inline void transformVec3f_SSE(float *result, float *inV, float *matrix)
{
	_asm
	{
		// NOTE: esp is clobbered (and esi pushed) without an explicit
		// restore; this appears to rely on the compiler's standard
		// mov esp,ebp epilogue to undo it.
		and	esp,0FFFFFFF0h
		sub	esp,1Ch
		mov	ecx,inV
		mov	eax,matrix
		test	al,0Fh
		movaps	xmm0,xmmword ptr [esp+0Ch]
		movlps	xmm0,qword ptr [ecx]
		mov	ecx,dword ptr [ecx+8]
		push	esi
		jne	lNonAligned
		// fast path: 16-byte aligned matrix, rows loaded with movaps
		movaps	xmm2,xmmword ptr [eax+30h]
		movaps	xmm3,xmmword ptr [eax+20h]
		mov	dword ptr [esp+0Ch],ecx
		movss	xmm1,dword ptr [esp+0Ch]
		shufps	xmm1,xmm1,0
		mulps	xmm3,xmm1
		addps	xmm3,xmm2
		movaps	xmm2,xmmword ptr [eax+10h]
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,55h
		mulps	xmm2,xmm1
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,0
		movaps	xmm0,xmmword ptr [eax]
		jmp	lEnd
	lNonAligned:
		// slow path: unaligned matrix, rows assembled with movlps/movhps
		movaps	xmm1,xmmword ptr [esp+10h]
		movaps	xmm3,xmmword ptr [esp+10h]
		lea	edx,[eax+38h]
		lea	esi,[eax+30h]
		movlps	xmm1,qword ptr [esi]
		movhps	xmm1,qword ptr [edx]
		movaps	xmm2,xmm1
		mov	dword ptr [esp+0Ch],ecx
		movss	xmm1,dword ptr [esp+0Ch]
		shufps	xmm1,xmm1,0
		lea	edx,[eax+20h]
		movlps	xmm3,qword ptr [edx]
		lea	ecx,[eax+28h]
		movhps	xmm3,qword ptr [ecx]
		mulps	xmm3,xmm1
		addps	xmm3,xmm2
		movaps	xmm2,xmmword ptr [esp+10h]
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,55h
		lea	ecx,[eax+18h]
		lea	edx,[eax+10h]
		movlps	xmm2,qword ptr [edx]
		movhps	xmm2,qword ptr [ecx]
		mulps	xmm2,xmm1
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,0
		movaps	xmm0,xmmword ptr [esp+10h]
		movlps	xmm0,qword ptr [eax]
		lea	ecx,[eax+8]
		movhps	xmm0,qword ptr [ecx]
	lEnd:
		mov	eax,result
		mulps	xmm0,xmm1
		addps	xmm0,xmm2
		addps	xmm0,xmm3
		movups	xmmword ptr [eax],xmm0
	}
}

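// Because the "test al,0Fh" check above falls back to the slower
// lNonAligned path for any matrix not on a 16-byte boundary, callers
// should align their matrix storage. A minimal usage sketch (illustrative
// names; _MM_ALIGN16 is MSVC's 16-byte alignment macro from xmmintrin.h):
//
//   _MM_ALIGN16 float mat[16];            // aligned: takes the movaps fast path
//   float pos[3] = { 1.0f, 2.0f, 3.0f };
//   float out[3];
//   transformVec3f_SSE(out, pos, mat);
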
// Transform a normal (direction vector, no translation) by a 4x4 matrix
// using the SSE instruction set.
// IMPORTANT NOTE: significantly faster if the matrix is 16-byte aligned.
inline void transformVec3Nf_SSE(float *result, float *inV, float *matrix)
{
	_asm
	{
		and	esp,0FFFFFFF0h
		sub	esp,20h
		mov	eax,matrix
		test	al,0Fh
		mov	ecx,inV
		jne	lNonAligned
		// fast path: 16-byte aligned matrix
		movaps	xmm1,xmmword ptr [esp+10h]
		movlps	xmm1,qword ptr [ecx]
		mov	ecx,dword ptr [ecx+8]
		movaps	xmm2,xmmword ptr [eax+20h]
		mov	dword ptr [esp+0Ch],ecx
		movss	xmm0,dword ptr [esp+0Ch]
		shufps	xmm0,xmm0,0
		mulps	xmm0,xmm2
		movaps	xmm2,xmm0
		movaps	xmm0,xmmword ptr [eax+10h]
		movaps	xmm3,xmm1
		shufps	xmm3,xmm1,55h
		mulps	xmm3,xmm0
		movaps	xmm0,xmmword ptr [eax]
		movaps	xmm4,xmm1
		shufps	xmm4,xmm1,0
		mulps	xmm4,xmm0
		movaps	xmm0,xmm4
		jmp	lEnd
	lNonAligned:
		movaps	xmm0,xmmword ptr [esp+10h]
		movlps	xmm0,qword ptr [ecx]
		mov	ecx,dword ptr [ecx+8]
		movaps	xmm2,xmmword ptr [esp+10h]
		movaps	xmm3,xmmword ptr [esp+10h]
		mov	dword ptr [esp+0Ch],ecx
		movss	xmm1,dword ptr [esp+0Ch]
		shufps	xmm1,xmm1,0
		lea	edx,[eax+20h]
		movlps	xmm2,qword ptr [edx]
		lea	ecx,[eax+28h]
		movhps	xmm2,qword ptr [ecx]
		mulps	xmm2,xmm1
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,55h
		lea	ecx,[eax+18h]
		lea	edx,[eax+10h]
		movlps	xmm3,qword ptr [edx]
		movhps	xmm3,qword ptr [ecx]
		mulps	xmm3,xmm1
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,0
		movaps	xmm0,xmmword ptr [esp+10h]
		movlps	xmm0,qword ptr [eax]
		lea	ecx,[eax+8]
		movhps	xmm0,qword ptr [ecx]
		mulps	xmm0,xmm1
	lEnd:
		mov	eax,result
		addps	xmm0,xmm3
		addps	xmm0,xmm2
		movaps	xmm1,xmm0
		lea	ecx,[eax+8]
		movhlps	xmm1,xmm0
		movlps	qword ptr [eax],xmm0
		movss	dword ptr [ecx],xmm1
		// NOTE: manual epilogue, apparently lifted from a compiled build;
		// it assumes the compiler's standard push ebp / mov ebp,esp prologue.
		mov	esp,ebp
		pop	ebp
		ret	0Ch
	}
}

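// For reference, a scalar sketch of what the routine above computes,
// assuming the column-major float[16] layout used throughout this file.
// Unlike transformVec3f_SSE, the translation elements (matrix[12..14])
// are not added, since normals are direction vectors:
//
//   result[0] = inV[0]*matrix[0] + inV[1]*matrix[4] + inV[2]*matrix[8];
//   result[1] = inV[0]*matrix[1] + inV[1]*matrix[5] + inV[2]*matrix[9];
//   result[2] = inV[0]*matrix[2] + inV[1]*matrix[6] + inV[2]*matrix[10];
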
#else

const int PREFNTA_BLOCK = 0x4000;

ILINE void cryMemcpy(void* Dst, const void* Src, int n)
{
	char* dst = (char*)Dst;
	const char* src = (const char*)Src;
	// Copy in 16 KB blocks, prefetching each block one cache line
	// (64 bytes) at a time with the non-temporal hint so that large
	// copies do not evict the working set.
	while (n > PREFNTA_BLOCK)
	{
#if !defined(LINUX)
		for (int p = 0; p < PREFNTA_BLOCK; p += 64) { _mm_prefetch(src + p, _MM_HINT_NTA); }
#endif
		memcpy(dst, src, PREFNTA_BLOCK);
		src += PREFNTA_BLOCK;
		dst += PREFNTA_BLOCK;
		n -= PREFNTA_BLOCK;
	}
#if !defined(LINUX)
	for (int p = 0; p < n; p += 64) { _mm_prefetch(src + p, _MM_HINT_NTA); }
#endif
	memcpy(dst, src, n);
}

// Overload with an extra flags argument; nFlags is accepted for interface
// compatibility but is currently ignored.
ILINE void cryMemcpy(void* Dst, const void* Src, INT n, int nFlags)
{
	char* dst = (char*)Dst;
	const char* src = (const char*)Src;
	while (n > PREFNTA_BLOCK)
	{
#if !defined(LINUX)
		for (int p = 0; p < PREFNTA_BLOCK; p += 64) { _mm_prefetch(src + p, _MM_HINT_NTA); }
#endif
		memcpy(dst, src, PREFNTA_BLOCK);
		src += PREFNTA_BLOCK;
		dst += PREFNTA_BLOCK;
		n -= PREFNTA_BLOCK;
	}
#if !defined(LINUX)
	for (int p = 0; p < n; p += 64) { _mm_prefetch(src + p, _MM_HINT_NTA); }
#endif
	memcpy(dst, src, n);
}
#endif

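// Usage is identical to memcpy; the prefetch loops above walk each 16 KB
// block one 64-byte cache line at a time with the NTA hint. A minimal
// sketch (buffer names illustrative):
//
//   cryMemcpy(pDstVerts, pSrcVerts, nVerts * sizeof(Vec3d));
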
inline void mathTransformVec3fN(float *pOut, float *pIn, float *matrix, int nV, int OptFlags)
{
	int i;
#if defined(_CPU_X86) && !defined(LINUX)
	// TODO: AMD64 port: NEED TO IMPLEMENT!!!
	if (OptFlags & CPUF_3DNOW)
	{
		for (i=0; i<nV; i++)
		{
			transformVec3Nf_3DNow(pOut, pIn, matrix);
			pIn += 3;
			pOut += 3;
		}
		return;
	}
	if (OptFlags & CPUF_SSE)
	{
		for (i=0; i<nV; i++)
		{
			transformVec3Nf_SSE(pOut, pIn, matrix);
			pIn += 3;
			pOut += 3;
		}
		return;
	}
#endif
	// Portable scalar fallback for CPUs without 3DNow!/SSE (and for
	// non-x86 builds): rotate by the upper-left 3x3 only, since normals
	// ignore translation.
	for (i=0; i<nV; i++)
	{
		const float x = pIn[0], y = pIn[1], z = pIn[2];
		pOut[0] = x*matrix[0] + y*matrix[4] + z*matrix[8];
		pOut[1] = x*matrix[1] + y*matrix[5] + z*matrix[9];
		pOut[2] = x*matrix[2] + y*matrix[6] + z*matrix[10];
		pIn += 3;
		pOut += 3;
	}
}

inline void mathTransformVec3f(float *pOut, float *pIn, float *matrix, int nV, int OptFlags)
{
	int i;
#if defined(_CPU_X86) && !defined(LINUX)
	// TODO: AMD64 port: NEED TO IMPLEMENT!!!
	if (OptFlags & CPUF_3DNOW)
	{
		for (i=0; i<nV; i++)
		{
			transformVec3f_3DNow(pOut, pIn, matrix);
			pIn += 3;
			pOut += 3;
		}
		return;
	}
	if (OptFlags & CPUF_SSE)
	{
		for (i=0; i<nV; i++)
		{
			transformVec3f_SSE(pOut, pIn, matrix);
			pIn += 3;
			pOut += 3;
		}
		return;
	}
#endif
	// Portable scalar fallback for CPUs without 3DNow!/SSE (and for
	// non-x86 builds): full position transform, translation included.
	for (i=0; i<nV; i++)
	{
		const float x = pIn[0], y = pIn[1], z = pIn[2];
		pOut[0] = x*matrix[0] + y*matrix[4] + z*matrix[8]  + matrix[12];
		pOut[1] = x*matrix[1] + y*matrix[5] + z*matrix[9]  + matrix[13];
		pOut[2] = x*matrix[2] + y*matrix[6] + z*matrix[10] + matrix[14];
		pIn += 3;
		pOut += 3;
	}
}

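// A minimal usage sketch for the batch transforms above (illustrative
// names; g_CpuFlags is the global CPU-capability mask declared at the top
// of this file):
//
//   Matrix44 world;                            // filled in elsewhere
//   float localVerts[256 * 3], worldVerts[256 * 3];
//   mathTransformVec3f(worldVerts, localVerts, world.GetData(), 256, g_CpuFlags);
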
inline void mathMatrixInverse(float *pOut, float *pIn, int OptFlags)
{
#if defined(_CPU_X86) && !defined(LINUX)
	// TODO: AMD64 port: NEED TO IMPLEMENT!!!
	if (OptFlags & CPUF_3DNOW)
		invertMatrixf_3DNow(pOut, pIn);
	else if (OptFlags & CPUF_SSE)
		invertMatrixf_SSE(pOut, pIn);
	else
#endif
		QQinvertMatrixf(pOut, pIn);
}

inline void mathMatrixMultiply(float *pOut, float *pM1, float *pM2, int OptFlags)
{
#if defined(_CPU_X86) && !defined(LINUX)
	if (OptFlags & CPUF_3DNOW)
		multMatrixf_3DNow(pOut, pM1, pM2);
	else if (OptFlags & CPUF_SSE)
		multMatrixf_SSE(pOut, pM1, pM2);
	else
#endif
		multMatrixf(pOut, pM1, pM2);
}

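// Example of concatenating two transforms with runtime dispatch (matA and
// matB are illustrative names; note that the rotation helpers below call
// this with pOut == pM1, so aliasing the output with the first operand is
// supported):
//
//   float matA[16], matB[16], combined[16];
//   mathMatrixMultiply(combined, matA, matB, g_CpuFlags);
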
inline void mathMatrixTranspose(float *pOut, float *pIn, int OptFlags)
{
#if defined(_CPU_X86) && !defined(LINUX)
	// TODO: AMD64 port: NEED TO IMPLEMENT!!!
	if (OptFlags & CPUF_3DNOW)
		transposeMatrixf_3DNow(pOut, pIn);
	else
#endif
	{
		if (pOut == pIn)
		{
			// in-place: swap the six off-diagonal element pairs
			Exchange(pOut[1], pOut[4]);
			Exchange(pOut[2], pOut[8]);
			Exchange(pOut[3], pOut[12]);

			Exchange(pOut[6], pOut[9]);
			Exchange(pOut[7], pOut[13]);

			Exchange(pOut[11], pOut[14]);
		}
		else
		{
			pOut[0] = pIn[0];   pOut[4] = pIn[1];   pOut[8]  = pIn[2];   pOut[12] = pIn[3];
			pOut[1] = pIn[4];   pOut[5] = pIn[5];   pOut[9]  = pIn[6];   pOut[13] = pIn[7];
			pOut[2] = pIn[8];   pOut[6] = pIn[9];   pOut[10] = pIn[10];  pOut[14] = pIn[11];
			pOut[3] = pIn[12];  pOut[7] = pIn[13];  pOut[11] = pIn[14];  pOut[15] = pIn[15];
		}
	}
}

inline void mathRotateX(float *pMatr, float fDegr, int OptFlags)
{
	Matrix44 rm;
	float cossin[2]; // cossin[0] = cos(angle), cossin[1] = sin(angle)
	sincos_tpl(fDegr*gf_DEGTORAD, cossin);
	rm(0,0) = 1;           rm(0,1) = 0;           rm(0,2) = 0;          rm(0,3) = 0;
	rm(1,0) = 0;           rm(1,1) = cossin[0];   rm(1,2) = cossin[1];  rm(1,3) = 0;
	rm(2,0) = 0;           rm(2,1) = -cossin[1];  rm(2,2) = cossin[0];  rm(2,3) = 0;
	rm(3,0) = 0;           rm(3,1) = 0;           rm(3,2) = 0;          rm(3,3) = 1;
	mathMatrixMultiply(pMatr, pMatr, rm.GetData(), OptFlags);
}

inline void mathRotateY(float *pMatr, float fDegr, int OptFlags)
{
	Matrix44 rm;
	float cossin[2];
	sincos_tpl(fDegr*gf_DEGTORAD, cossin);
	rm(0,0) = cossin[0];   rm(0,1) = 0;           rm(0,2) = -cossin[1]; rm(0,3) = 0;
	rm(1,0) = 0;           rm(1,1) = 1;           rm(1,2) = 0;          rm(1,3) = 0;
	rm(2,0) = cossin[1];   rm(2,1) = 0;           rm(2,2) = cossin[0];  rm(2,3) = 0;
	rm(3,0) = 0;           rm(3,1) = 0;           rm(3,2) = 0;          rm(3,3) = 1;
	mathMatrixMultiply(pMatr, pMatr, rm.GetData(), OptFlags);
}

inline void mathRotateZ(float *pMatr, float fDegr, int OptFlags)
{
	Matrix44 rm;
	float cossin[2];
	sincos_tpl(fDegr*gf_DEGTORAD, cossin);
	rm(0,0) = cossin[0];   rm(0,1) = cossin[1];   rm(0,2) = 0;          rm(0,3) = 0;
	rm(1,0) = -cossin[1];  rm(1,1) = cossin[0];   rm(1,2) = 0;          rm(1,3) = 0;
	rm(2,0) = 0;           rm(2,1) = 0;           rm(2,2) = 1;          rm(2,3) = 0;
	rm(3,0) = 0;           rm(3,1) = 0;           rm(3,2) = 0;          rm(3,3) = 1;
	mathMatrixMultiply(pMatr, pMatr, rm.GetData(), OptFlags);
}

// Scales the matrix in place by multiplying the three basis-vector columns;
// the translation elements (12..14) are left untouched. The unnamed int
// parameter keeps the signature consistent with the other OptFlags helpers
// but is unused here.
inline void mathScale(float *pMatr, Vec3d vScale, int)
{
	pMatr[0] *= vScale.x;  pMatr[4] *= vScale.y;  pMatr[8]  *= vScale.z;
	pMatr[1] *= vScale.x;  pMatr[5] *= vScale.y;  pMatr[9]  *= vScale.z;
	pMatr[2] *= vScale.x;  pMatr[6] *= vScale.y;  pMatr[10] *= vScale.z;
	pMatr[3] *= vScale.x;  pMatr[7] *= vScale.y;  pMatr[11] *= vScale.z;
}

inline void mathCalcMatrix(Matrix44& Matrix, Vec3d vPos, Vec3d vAngs, Vec3d vScale, int OptFlags)
{
	Matrix.SetTranslationMat(vPos);

	if (vAngs.z)
		mathRotateZ(Matrix.GetData(), vAngs.z, OptFlags);
	if (vAngs.y)
		mathRotateY(Matrix.GetData(), vAngs.y, OptFlags);
	if (vAngs.x)
		mathRotateX(Matrix.GetData(), vAngs.x, OptFlags);
	if (!IsEquivalent(vScale, Vec3d(1.0f, 1.0f, 1.0f)))
		mathScale(Matrix.GetData(), vScale, OptFlags);
}

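// A minimal sketch of building an object-to-world matrix with the helper
// above (illustrative values; angles are in degrees and are applied
// Z, then Y, then X, followed by the scale):
//
//   Matrix44 objMat;
//   mathCalcMatrix(objMat, Vec3d(10, 0, 5), Vec3d(0, 0, 90), Vec3d(1, 1, 1), g_CpuFlags);
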
#pragma warning(pop)

#endif

#endif // CRY_SIMD_H