// Listing metadata: 457 lines, 12 KiB, C++
#include "stdafx.h"
|
||
#include "SSEUtils.h"
|
||
|
||
namespace cpu
|
||
{
|
||
DWORD g_dwFeatures = 0;
|
||
DWORD g_dwFeaturesEx = 0;
|
||
|
||
const char* g_arrCPUCaps[] = {
|
||
"FPU", "VME", "DE", "PSE", "TSC", "MSR", "PAE", "MCE", "CX8", "APIC",
|
||
"Unknown(10)", "SEP", "MTRR", "PGE", "MCA", "CMOV", "PAT", "PSE-36", "PSN", "CLFSH",
|
||
"Unknown(20)", "DS", "ACPI", "MMX", "FXSR", "SSE", "SSE2", "SS", "HTT", "TM",
|
||
"Unknown(30)", "PBE"
|
||
};
|
||
|
||
const char* g_arrCPUCapsLong[] = {
|
||
"Floating Point Unit On-Chip", "Virtual 8086 Mode Enhancements", "Debugging Extensions", "Page Size Extension", "Time Stamp Counter", "Model Specific Registers RDMSR and WRMSR Instructions", "Physical Address Extension", "Machine Check Exception", "CMPXCHG8B Instruction", "Advanced Programmable Interrupt Controller On-Chip",
|
||
"Unknown(10)", "SYSENTER and SYSEXIT Instructions", "Memory Type Range Registers", "Page Directory Entries Global Bit", "Machine Check Architecture", "Conditional Move Instructions", "Page Attribute Table", "32-Bit Page Size Extension", "Processor Serial Number", "CLFLUSH Instruction",
|
||
"Unknown(20)", "Debug Store", "Thermal Monitor and Software Controlled Clock Facilities", "MMX Technology", "FXSAVE and FXRSTOR Instructions", "SSE", "SSE2", "Self Snoop", "Hyper-Threading Technology", "Thermal Monitor",
|
||
"Unknown(30)", "Pending Break Enable"
|
||
};
|
||
|
||
void logCaps()
|
||
{
|
||
#ifdef _CRY_ANIMATION_BASE_HEADER_
|
||
#ifdef _CPU_X86
|
||
g_GetLog()->LogToFile ("CPU capabilities: ");
|
||
for (unsigned nCap = 0; nCap < 32; ++nCap)
|
||
if (g_dwFeatures&(1<<nCap))
|
||
{
|
||
#ifdef _DEBUG
|
||
g_GetLog()->LogToFilePlus(" %s.", g_arrCPUCapsLong[nCap]);
|
||
#else
|
||
g_GetLog()->LogToFilePlus(" %s", g_arrCPUCaps[nCap]);
|
||
#endif
|
||
}
|
||
if (has3DNow())
|
||
g_GetLog()->LogToFilePlus(" 3DNow!");
|
||
#endif
|
||
#endif
|
||
}
|
||
|
||
// detects CPU features (SSE) and perhaps sets up some
|
||
// pointers to functions
|
||
void detect ()
|
||
{
|
||
#if !defined(LINUX) && defined(_CPU_X86)
|
||
__try
|
||
{
|
||
_asm
|
||
{
|
||
// 386 processor check
|
||
// The AC bit, bit #18, is a new bit introduced in the EFLAGS
|
||
// register on the 486 processor to generate alignment
|
||
// faults.
|
||
// This bit cannot be set on the 386 processor.
|
||
|
||
pushfd // push original EFLAGS
|
||
pop eax // get original EFLAGS
|
||
mov ebx, eax // save original EFLAGS
|
||
xor eax, 040000h // flip AC bit in EFLAGS
|
||
push eax // save new EFLAGS value on stack
|
||
popfd // replace current EFLAGS value
|
||
pushfd // get new EFLAGS
|
||
pop eax // store new EFLAGS in EAX
|
||
cmp eax, ebx // can<61>t toggle AC bit, processor=80386
|
||
jz label386 // jump if 80386 processor
|
||
push ebx
|
||
popfd // restore AC bit in EFLAGS
|
||
|
||
// Checking for ability to set/clear ID flag (Bit 21) in EFLAGS
|
||
// which indicates the presence of a processor with the CPUID
|
||
// instruction.
|
||
pushfd // save EFLAGS to stack
|
||
pop eax // store EFLAGS in EAX
|
||
mov ebx, eax // save in EBX for testing later
|
||
xor eax, 0200000h // flip bit 21 in EFLAGS
|
||
push eax // save new EFLAGS value on stack
|
||
popfd // replace current EFLAGS value
|
||
pushfd // get new EFLAGS
|
||
pop eax // store new EFLAGS in EAX
|
||
cmp eax, ebx // see if bit 21 has changed
|
||
jz labelNoCPUID // CPUID is not present
|
||
|
||
mov EAX, 1
|
||
cpuid
|
||
mov g_dwFeatures, EDX
|
||
|
||
// check for 3DNow!
|
||
mov eax, 080000000h // query for extended functions
|
||
cpuid // get extended function limit
|
||
cmp eax, 080000001h /* functions up to 80000001h must be present */
|
||
jb labelNoExtended /* 80000001h is not available */
|
||
mov eax, 080000001h /* setup extended function 1 */
|
||
cpuid /* call the function */
|
||
mov g_dwFeaturesEx, edx /* bit 31 will be set for 3D Now! support*/
|
||
|
||
labelNoExtended:
|
||
test g_dwFeatures, g_featureSSE
|
||
jz labelNoSSE
|
||
// SSE is present. to check for OS support...
|
||
xorps xmm0, xmm0
|
||
// if we got here safely after xorps, it only can mean we have SSE
|
||
//or g_dwFeatures, g_featureSSE
|
||
jmp labelEndDetect
|
||
label386:
|
||
labelNoCPUID:
|
||
labelNoSSE:
|
||
labelEndDetect:
|
||
}
|
||
}
|
||
__except(EXCEPTION_EXECUTE_HANDLER)
|
||
{
|
||
// OS doesn't support some of the instructions we executed..
|
||
|
||
if(_exception_code () == STATUS_ILLEGAL_INSTRUCTION)
|
||
g_dwFeatures &= ~g_featureSSE;
|
||
}
|
||
|
||
if ((g_dwFeatures & (g_featureFXSR|g_featureSSE)) == (g_featureFXSR|g_featureSSE))
|
||
{
|
||
unsigned nMXCSR;
|
||
__try
|
||
{
|
||
_asm
|
||
{
|
||
stmxcsr nMXCSR
|
||
or nMXCSR, 0x8000
|
||
ldmxcsr nMXCSR
|
||
or nMXCSR, 0x40
|
||
ldmxcsr nMXCSR
|
||
}
|
||
}
|
||
__except(EXCEPTION_EXECUTE_HANDLER)
|
||
{
|
||
}
|
||
}
|
||
#endif
|
||
}
|
||
|
||
|
||
|
||
}
|
||
|
||
#if !defined(LINUX) && defined(_CPU_X86)
|
||
// given the array of matrices, calculates the min/max
|
||
// of their positions, and puts them into the min and max Vec3d
|
||
// NOTE: the matrix array must be aligned on 16-byte boundary
|
||
void getBBoxSSE (const Matrix44* pBones, unsigned numBones, CryAABB* pBBox)
|
||
{
|
||
assert (numBones > 0);
|
||
_asm
|
||
{
|
||
mov EBX, pBones
|
||
movaps xmm0, [EBX+0x30]
|
||
movaps xmm1, xmm0
|
||
mov EDX, pBBox
|
||
add EBX, 0x70 // now EBX points to the next bone matrix
|
||
mov ECX, numBones
|
||
dec ECX
|
||
jz label_End
|
||
|
||
label_Start:
|
||
movaps xmm2, [EBX]
|
||
minps xmm0, xmm2
|
||
maxps xmm1, xmm2
|
||
add EBX, 0x40
|
||
loop label_Start
|
||
|
||
label_End:
|
||
//SSE_MOVSS(EDX,xmm0)
|
||
movss [EDX], xmm0
|
||
shufps xmm0,xmm0, 0xE5
|
||
movss [EDX+4], xmm0
|
||
shufps xmm0,xmm0, 0xE6
|
||
movss [EDX+8], xmm0
|
||
//SSE_MOVSS(EDX+0x0C,xmm1)
|
||
movss [EDX+0x0C], xmm1
|
||
shufps xmm1,xmm1, 0xE5
|
||
movss [EDX+0x0C+4], xmm1
|
||
shufps xmm1,xmm1, 0xE6
|
||
movss [EDX+0x0C+8], xmm1
|
||
}
|
||
}
|
||
|
||
// given the array of matrices, calculates the min/max
|
||
// of their positions, and puts them into the min and max Vec3d
|
||
// NOTE: the matrix array must be aligned on 16-byte boundary
|
||
void getBBoxSSE (const Matrix44* pBones, const CryBBoxA16* pBoneBBox, unsigned numBones, CryAABB* pBBox)
|
||
{
|
||
assert (numBones > 0);
|
||
_asm
|
||
{
|
||
mov EBX, pBones
|
||
movaps xmm0, [EBX+0x30]
|
||
movaps xmm1, xmm0
|
||
mov EDX, pBBox
|
||
add EBX, 0x70 // now EBX points to the next bone matrix
|
||
mov ECX, numBones
|
||
dec ECX
|
||
jz label_End
|
||
|
||
label_Start:
|
||
movaps xmm2, [EBX]
|
||
minps xmm0, xmm2
|
||
maxps xmm1, xmm2
|
||
add EBX, 0x40
|
||
loop label_Start
|
||
|
||
label_End:
|
||
//SSE_MOVSS(EDX,xmm0)
|
||
movss [EDX], xmm0
|
||
shufps xmm0,xmm0, 0xE5
|
||
movss [EDX+4], xmm0
|
||
shufps xmm0,xmm0, 0xE6
|
||
movss [EDX+8], xmm0
|
||
//SSE_MOVSS(EDX+0x0C,xmm1)
|
||
movss [EDX+0x0C], xmm1
|
||
shufps xmm1,xmm1, 0xE5
|
||
movss [EDX+0x0C+4], xmm1
|
||
shufps xmm1,xmm1, 0xE6
|
||
movss [EDX+0x0C+8], xmm1
|
||
}
|
||
}
|
||
#endif
|
||
|
||
// packs the array of Vec3dA16 into Vec3d's
// nCount - number of vertices
// pData - [IN] Vec3dA16, [OUT] Vec3d
//
// The packing is done in place: element i is copied as three 32-bit words
// from byte offset i*16 to byte offset i*12. Element 0 is already where it
// belongs, so copying starts with element 1. For nCount of 0 or 1 the buffer
// is left untouched on every platform.
void packVec3d16 (void* pData, unsigned nCount)
{
	// Nothing to move; this also keeps the x86 loop counter from wrapping
	// around when nCount is 0.
	if (nCount < 2)
		return;

#if !defined(LINUX) && defined(_CPU_X86)
	_asm
	{
		mov ESI, pData
		mov EDI, ESI

		add ESI, 0x10 // source: element 1 in the 16-byte layout
		add EDI, 0xC // destination: element 1 in the 12-byte layout

		mov ECX, nCount
		dec ECX // ECX = elements left to copy, at least 1 here

	startLoop:
		mov EAX, [ESI]
		mov [EDI], EAX
		mov EBX, [ESI+4]
		mov [EDI+4], EBX
		mov EDX, [ESI+8]
		mov [EDI+8], EDX

		add ESI, 0x10
		add EDI, 0xC
		dec ECX
		jnz startLoop
	}
#else
	float* pTo = (float*)pData + 3;
	float* pFrom = (float*)pData + 4;
	for (unsigned i = 1; i < nCount; ++i)
	{
		pTo[0] = pFrom[0];
		pTo[1] = pFrom[1];
		pTo[2] = pFrom[2];
		pTo += 3;
		pFrom += 4;
	}
#endif
}
|
||
|
||
|
||
#if !defined(LINUX) && defined(_CPU_X86)
|
||
// Multiplies two 4x4 float matrices with SSE:
//   dst[4*i + j] = sum over k of src1[4*i + k] * src2[4*k + j]
//
// src1 - [IN]  16 floats, read one scalar at a time (movss)
// src2 - [IN]  16 floats, read as four 16-byte rows with movaps, so it must
//              be 16-byte aligned
// dst  - [OUT] 16 floats; the last row is stored with movaps, so dst+0x30
//              must be 16-byte aligned
//
// Naked function: there is no compiler-generated prologue/epilogue. The
// arguments are read straight from the stack at [esp+4], [esp+8], [esp+0Ch]
// and the plain "ret" leaves them there, i.e. the caller is expected to clean
// up the stack (assumes the default calling convention is __cdecl - TODO
// confirm against the project settings).
// Registers written: eax, ecx, edx, xmm0-xmm7.
// The instructions are interleaved by hand; keep their order when editing.
__declspec(naked) void PIII_Mult00_4x4_4x4( float *src1, float *src2, float *dst)
{
	__asm
	{
		mov edx, dword ptr [esp+4] ; src1
		mov eax, dword ptr [esp+0Ch] ; dst
		mov ecx, dword ptr [esp+8] ; src2
		; rows of src2 end up in xmm1, xmm3, xmm4, xmm5 and stay there
		; dst row 0 is accumulated in xmm0
		movss xmm0, dword ptr [edx]
		movaps xmm1, xmmword ptr [ecx]
		shufps xmm0, xmm0, 0
		movss xmm2, dword ptr [edx+4]
		mulps xmm0, xmm1
		shufps xmm2, xmm2, 0
		movaps xmm3, xmmword ptr [ecx+10h]
		movss xmm7, dword ptr [edx+8]
		mulps xmm2, xmm3
		shufps xmm7, xmm7, 0
		addps xmm0, xmm2
		movaps xmm4, xmmword ptr [ecx+20h]
		movss xmm2, dword ptr [edx+0Ch]
		mulps xmm7, xmm4
		shufps xmm2, xmm2, 0
		addps xmm0, xmm7
		movaps xmm5, xmmword ptr [ecx+30h]
		movss xmm6, dword ptr [edx+10h]
		mulps xmm2, xmm5
		movss xmm7, dword ptr [edx+14h]
		shufps xmm6, xmm6, 0
		addps xmm0, xmm2
		shufps xmm7, xmm7, 0
		; store dst row 0; dst row 1 is accumulated in xmm6
		movlps qword ptr [eax], xmm0
		movhps qword ptr [eax+8], xmm0
		mulps xmm7, xmm3
		movss xmm0, dword ptr [edx+18h]
		mulps xmm6, xmm1
		shufps xmm0, xmm0, 0
		addps xmm6, xmm7
		mulps xmm0, xmm4
		movss xmm2, dword ptr [edx+24h]
		addps xmm6, xmm0
		movss xmm0, dword ptr [edx+1Ch]
		movss xmm7, dword ptr [edx+20h]
		shufps xmm0, xmm0, 0
		shufps xmm7, xmm7, 0
		mulps xmm0, xmm5
		mulps xmm7, xmm1
		addps xmm6, xmm0
		shufps xmm2, xmm2, 0
		; store dst row 1; dst row 2 is accumulated in xmm7
		movlps qword ptr [eax+10h], xmm6
		movhps qword ptr [eax+18h], xmm6
		mulps xmm2, xmm3
		movss xmm6, dword ptr [edx+28h]
		addps xmm7, xmm2
		shufps xmm6, xmm6, 0
		movss xmm2, dword ptr [edx+2Ch]
		mulps xmm6, xmm4
		shufps xmm2, xmm2, 0
		addps xmm7, xmm6
		mulps xmm2, xmm5
		movss xmm0, dword ptr [edx+34h]
		addps xmm7, xmm2
		shufps xmm0, xmm0, 0
		; store dst row 2; dst row 3 is accumulated in xmm2
		movlps qword ptr [eax+20h], xmm7
		movss xmm2, dword ptr [edx+30h]
		movhps qword ptr [eax+28h], xmm7
		mulps xmm0, xmm3
		shufps xmm2, xmm2, 0
		movss xmm6, dword ptr [edx+38h]
		mulps xmm2, xmm1
		shufps xmm6, xmm6, 0
		addps xmm2, xmm0
		mulps xmm6, xmm4
		movss xmm7, dword ptr [edx+3Ch]
		shufps xmm7, xmm7, 0
		addps xmm2, xmm6
		mulps xmm7, xmm5
		addps xmm2, xmm7
		; store dst row 3 (aligned store)
		movaps xmmword ptr [eax+30h], xmm2
		ret
	}
}
|
||
#endif
|
||
|
||
// Multiplies two 4x4 float matrices: product = m1 * m2, with element
// (row, col) of every matrix stored at index col*4 + row.
//
// product - [OUT] 16 floats
// m1, m2  - [IN]  16 floats each
//
// product may be the same array as m1 or m2 on both code paths: the portable
// path computes into a temporary, the x86 path has read every input value it
// needs before the store that would overwrite it.
// On the x86 path m1 and product are accessed with movaps and therefore must
// be 16-byte aligned; m2 is read one float at a time.
void multMatrix(float *product, const float *m1, const float *m2)
{
#if defined(LINUX) || !defined(_CPU_X86)
	// Accumulate into a local copy so that writing the result cannot clobber
	// inputs that are still needed when product aliases m1 or m2.
	float result[16];
	for (int col = 0; col < 4; ++col)
	{
		const float* b = m2 + (col<<2); // column 'col' of m2
		for (int row = 0; row < 4; ++row)
			result[(col<<2)+row] = m1[row] * b[0] + m1[4+row] * b[1] + m1[8+row] * b[2] + m1[12+row] * b[3];
	}
	for (int i = 0; i < 16; ++i)
		product[i] = result[i];

#else
	__asm
	{
		mov eax, m2;
		mov ecx, m1;
		mov edx, product;
		// the four 16-byte groups of m1 are loaded into xmm1, xmm3, xmm2, xmm6
		// while the first output group is computed; each output group is the
		// sum of those four registers scaled by four consecutive floats of m2
		movss xmm0,dword ptr [eax]
		movaps xmm1,xmmword ptr [ecx]
		shufps xmm0,xmm0,0
		movss xmm2,dword ptr [eax+4]
		mulps xmm0,xmm1
		shufps xmm2,xmm2,0
		movaps xmm3,xmmword ptr [ecx+10h]
		movss xmm4,dword ptr [eax+8]
		mulps xmm2,xmm3
		shufps xmm4,xmm4,0
		addps xmm0,xmm2
		movaps xmm2,xmmword ptr [ecx+20h]
		movss xmm5,dword ptr [eax+0Ch]
		mulps xmm4,xmm2
		shufps xmm5,xmm5,0
		movaps xmm6,xmmword ptr [ecx+30h]
		mulps xmm5,xmm6
		addps xmm4,xmm5
		addps xmm0,xmm4
		movaps xmmword ptr [edx],xmm0
		// second output group: m2[4..7]
		movss xmm0,dword ptr [eax+10h]
		movss xmm4,dword ptr [eax+14h]
		shufps xmm0,xmm0,0
		shufps xmm4,xmm4,0
		mulps xmm0,xmm1
		mulps xmm4,xmm3
		movss xmm5,dword ptr [eax+18h]
		addps xmm0,xmm4
		shufps xmm5,xmm5,0
		movss xmm4,dword ptr [eax+1Ch]
		mulps xmm5,xmm2
		shufps xmm4,xmm4,0
		mulps xmm4,xmm6
		addps xmm5,xmm4
		addps xmm0,xmm5
		movaps xmmword ptr [edx+10h],xmm0
		// third output group: m2[8..11]
		movss xmm0,dword ptr [eax+20h]
		movss xmm4,dword ptr [eax+24h]
		shufps xmm0,xmm0,0
		shufps xmm4,xmm4,0
		mulps xmm0,xmm1
		mulps xmm4,xmm3
		movss xmm5,dword ptr [eax+28h]
		addps xmm0,xmm4
		shufps xmm5,xmm5,0
		movss xmm4,dword ptr [eax+2Ch]
		mulps xmm5,xmm2
		shufps xmm4,xmm4,0
		mulps xmm4,xmm6
		addps xmm5,xmm4
		addps xmm0,xmm5
		movaps xmmword ptr [edx+20h],xmm0
		// fourth output group: m2[12..15]; xmm1 and xmm3 are reused as
		// scratch once their m1 data has been consumed
		movss xmm0,dword ptr [eax+30h]
		movss xmm4,dword ptr [eax+34h]
		shufps xmm0,xmm0,0
		shufps xmm4,xmm4,0
		mulps xmm0,xmm1
		mulps xmm4,xmm3
		movss xmm1,dword ptr [eax+38h]
		addps xmm0,xmm4
		shufps xmm1,xmm1,0
		movss xmm3,dword ptr [eax+3Ch]
		mulps xmm1,xmm2
		shufps xmm3,xmm3,0
		mulps xmm3,xmm6
		addps xmm1,xmm3
		addps xmm0,xmm1
		movaps xmmword ptr [edx+30h],xmm0
	}
#endif
}
|
||
|