/*=============================================================================
stdafx.h: Cry Render support precompiled header generator.
Copyright 2001 Crytek Studios. All Rights Reserved.
Revision history:
* Created by Honitch Andrey
=============================================================================*/
#define CRY_API
#ifdef _DEBUG
#define CRTDBG_MAP_ALLOC
#endif //_DEBUG
//! Include standard headers.
#include <assert.h>
//#define PS2
//#define OPENGL
#ifdef _XBOX
//! Include standard headers.
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <memory.h>
#include <io.h>
#include <time.h>
#include <direct.h>
#include <search.h>
#include <stdarg.h>
typedef unsigned long DWORD;
typedef unsigned short WORD;
typedef unsigned char BYTE;
#include <xtl.h>
#else
#include <windows.h>
#endif
#include <platform.h>
// enable memory pool usage
#define USE_NEWPOOL
#include <CryMemoryManager.h>
#include "CrtOverrides.h"
#if defined _DEBUG && defined OPENGL
#define DEBUGALLOC
#endif
/////////////////////////////////////////////////////////////////////////////
// STL //////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
#include <vector>
#include <list>
#include <map>
#include <hash_map>
#include <set>
#include <string>
#include <algorithm>
typedef const char* cstr;
#define SIZEOF_ARRAY(arr) (sizeof(arr)/sizeof((arr)[0]))
// Include common headers.
//#include "Common\CryHelpers.h"
//typedef string String;
#ifdef DEBUGALLOC
#include <crtdbg.h>
#define DEBUG_CLIENTBLOCK new( _NORMAL_BLOCK, __FILE__, __LINE__)
#define new DEBUG_CLIENTBLOCK
// memman
#define calloc(s,t) _calloc_dbg(s, t, _NORMAL_BLOCK, __FILE__, __LINE__)
#define malloc(s) _malloc_dbg(s, _NORMAL_BLOCK, __FILE__, __LINE__)
#define realloc(p, s) _realloc_dbg(p, s, _NORMAL_BLOCK, __FILE__, __LINE__)
#endif
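// The block above routes allocations through the CRT debug heap so leaks are
// reported with the file and line of the call site. A minimal, illustrative
// sketch of how a client could enable automatic leak checking at process exit
// with the standard crtdbg.h flags (not part of this header):
#if 0
#include <crtdbg.h>
int main()
{
// Track allocations and dump any leaks when the process exits.
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
char *p = new char[16]; // reported with file/line via DEBUG_CLIENTBLOCK
(void)p;                // deliberately leaked for the demonstration
return 0;
}
#endif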
#include <list2.h>
#include <Names.h>
#define MAX_TMU 8
//! Include main interfaces.
#include <ICryPak.h>
#include <IEntitySystem.h>
#include <IProcess.h>
#include <ITimer.h>
#include <ISystem.h>
#include <ILog.h>
#include <IPhysics.h>
#include <IConsole.h>
#include <IRenderer.h>
#include <IStreamEngine.h>
#include <CrySizer.h>
#include "Font.h"
#include "Except.h"
#include <Cry_Math.h>
#include "Cry_Camera.h"
//#include "_Malloc.h"
#include "math.h"
#include "Common/Mkl/Mkl.h"
#include <VertexFormats.h>
#include <CREPolyMesh.h>
#include "Common/Shaders/Shader.h"
//#include "Common/XFile/File.h"
//#include "Common/Image.h"
#include "Common/Shaders/CShader.h"
#include "Common/EvalFuncs.h"
#include "Common/RenderPipeline.h"
#include "Common/Renderer.h"
#include "Common/CPUDetect.h"
#include "Common/Textures/TexMan.h"
#include "Common/Shaders/Parser.h"
#include "Common/SimpleFrameProfiler.h"
// Per-frame profilers: collect the information for each frame for
// displaying statistics at the beginning of each frame.
#define PROFILER(ID,NAME) DECLARE_FRAME_PROFILER(ID,NAME)
#include "Common/FrameProfilers-list.h"
#undef PROFILER
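// The PROFILER define/include/undef trio above is a classic X-macro: the list
// file is expanded once per definition of PROFILER. A hypothetical entry in
// "Common/FrameProfilers-list.h" (the real entries live in that file) would
// read:
//
//   PROFILER(PROFILE_EXAMPLE_ID, "ExampleStage")
//   // ...which expands here to:
//   DECLARE_FRAME_PROFILER(PROFILE_EXAMPLE_ID, "ExampleStage")
//
// Re-including the same list under a different PROFILER definition can then
// emit matching definitions, counters, or report lines without duplicating
// the list itself.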
// All handled render elements (except common ones included in "RendElement.h")
#include "Common/RendElements/CREBeam.h"
#include "Common/RendElements/CREPrefabGeom.h"
#include "Common/RendElements/CREClientPoly.h"
#include "Common/RendElements/CREClientPoly2D.h"
#include "Common/RendElements/CREParticleSpray.h"
#include "Common/RendElements/CREFlares.h"
#include "Common/RendElements/CREPolyBlend.h"
#include "Common/RendElements/CRESkyZone.h"
#include "Common/RendElements/CREOcean.h"
#include "Common/RendElements/CREGlare.h"
#include "Common/RendElements/CRETempMesh.h"
#define max(a,b) (((a) > (b)) ? (a) : (b))
#define min(a,b) (((a) < (b)) ? (a) : (b))
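// Note: as function-like macros, min/max evaluate each argument twice, so any
// side effects are duplicated. Illustrative pitfall:
//
//   int i = 0;
//   int m = max(i++, 5); // i++ expands twice; i may be incremented twice
//
// Only side-effect-free expressions should be passed to these macros.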
/*-----------------------------------------------------------------------------
Vector transformations.
-----------------------------------------------------------------------------*/
//
// Transformations in optimized assembler format.
// An adaptation of Michael Abrash's optimal transformation code.
//
#if DO_ASM
_inline void ASMTransformPoint(const SCoord &Coords, const Vec3d& InVector, Vec3d& OutVector)
{
// SCoord is a structure of 4 vectors: Origin, X, Y, Z
// x y z
// Vector Origin; 0 4 8
// Vector XAxis; 12 16 20
// Vector YAxis; 24 28 32
// Vector ZAxis; 36 40 44
//
// task: VectorSubtract(InVector, Coords.org, Temp);
// OutVector[0] = DotProduct(Temp, Coords.rot[0]);
// OutVector[1] = DotProduct(Temp, Coords.rot[1]);
// OutVector[2] = DotProduct(Temp, Coords.rot[2]);
//
// About 33 cycles on a Pentium.
//
__asm
{
mov esi,[InVector]
mov edx,[Coords]
mov edi,[OutVector]
// get source
fld dword ptr [esi+0]
fld dword ptr [esi+4]
fld dword ptr [esi+8] // z y x
fxch st(2) // xyz
// subtract origin
fsub dword ptr [edx + 0] // xyz
fxch st(1)
fsub dword ptr [edx + 4] // yxz
fxch st(2)
fsub dword ptr [edx + 8] // zxy
fxch st(1) // X Z Y
// triplicate X for transforming
fld st(0) // X X Z Y
fmul dword ptr [edx+12] // Xx X Z Y
fld st(1) // X Xx X Z Y
fmul dword ptr [edx+24] // Xy Xx X Z Y
fxch st(2)
fmul dword ptr [edx+36] // Xz Xx Xy Z Y
fxch st(4) // Y Xx Xy Z Xz
fld st(0) // Y Y Xx Xy Z Xz
fmul dword ptr [edx+16]
fld st(1) // Y Yx Y Xx Xy Z Xz
fmul dword ptr [edx+28]
fxch st(2) // Y Yx Yy Xx Xy Z Xz
fmul dword ptr [edx+40] // Yz Yx Yy Xx Xy Z Xz
fxch st(1) // Yx Yz Yy Xx Xy Z Xz
faddp st(3),st(0) // Yz Yy XxYx Xy Z Xz
faddp st(5),st(0) // Yy XxYx Xy Z XzYz
faddp st(2),st(0) // XxYx XyYy Z XzYz
fxch st(2) // Z XyYy XxYx XzYz
fld st(0) // Z Z XyYy XxYx XzYz
fmul dword ptr [edx+20]
fld st(1) // Z Zx Z XyYy XxYx XzYz
fmul dword ptr [edx+32]
fxch st(2) // Z Zx Zy
fmul dword ptr [edx+44] // Zz Zx Zy XyYy XxYx XzYz
fxch st(1) // Zx Zz Zy XyYy XxYx XzYz
faddp st(4),st(0) // Zz Zy XyYy XxYxZx XzYz
faddp st(4),st(0) // Zy XyYy XxYxZx XzYzZz
faddp st(1),st(0) // XyYyZy XxYxZx XzYzZz
fstp dword ptr [edi+4]
fstp dword ptr [edi+0]
fstp dword ptr [edi+8]
}
}
#endif
#if DO_ASM
_inline void ASMTransformVector(const SCoord &Coords, const Vec3d& InVector, Vec3d& OutVector)
{
__asm
{
mov esi,[InVector]
mov edx,[Coords]
mov edi,[OutVector]
// get source
fld dword ptr [esi+0]
fld dword ptr [esi+4]
fxch st(1)
fld dword ptr [esi+8] // z x y
fxch st(1) // x z y
// triplicate X for transforming
fld st(0) // X X Z Y
fmul dword ptr [edx+12] // Xx X Z Y
fld st(1) // X Xx X Z Y
fmul dword ptr [edx+24] // Xy Xx X Z Y
fxch st(2)
fmul dword ptr [edx+36] // Xz Xx Xy Z Y
fxch st(4) // Y Xx Xy Z Xz
fld st(0) // Y Y Xx Xy Z Xz
fmul dword ptr [edx+16]
fld st(1) // Y Yx Y Xx Xy Z Xz
fmul dword ptr [edx+28]
fxch st(2) // Y Yx Yy Xx Xy Z Xz
fmul dword ptr [edx+40] // Yz Yx Yy Xx Xy Z Xz
fxch st(1) // Yx Yz Yy Xx Xy Z Xz
faddp st(3),st(0) // Yz Yy XxYx Xy Z Xz
faddp st(5),st(0) // Yy XxYx Xy Z XzYz
faddp st(2),st(0) // XxYx XyYy Z XzYz
fxch st(2) // Z XyYy XxYx XzYz
fld st(0) // Z Z XyYy XxYx XzYz
fmul dword ptr [edx+20]
fld st(1) // Z Zx Z XyYy XxYx XzYz
fmul dword ptr [edx+32]
fxch st(2) // Z Zx Zy
fmul dword ptr [edx+44] // Zz Zx Zy XyYy XxYx XzYz
fxch st(1) // Zx Zz Zy XyYy XxYx XzYz
faddp st(4),st(0) // Zz Zy XyYy XxYxZx XzYz
faddp st(4),st(0) // Zy XyYy XxYxZx XzYzZz
faddp st(1),st(0) // XyYyZy XxYxZx XzYzZz
fstp dword ptr [edi+4]
fstp dword ptr [edi+0]
fstp dword ptr [edi+8]
}
}
#endif
//
// Transform a point by a coordinate system, first subtracting
// the coordinate system's origin if it is nonzero.
//
_inline void TransformPoint( const SCoord &Coords, Vec3d& in, Vec3d& out)
{
#if !DO_ASM
Vec3d Temp;
Temp = in - Coords.m_Org;
out[0] = Temp | Coords.m_Vecs[0];
out[1] = Temp | Coords.m_Vecs[1];
out[2] = Temp | Coords.m_Vecs[2];
#else
ASMTransformPoint( Coords, in, out);
#endif
}
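// Usage sketch (illustrative; assumes an initialized SCoord named Frame):
// bring a world-space point into the frame's local space.
#if 0
Vec3d worldPos(1.0f, 2.0f, 3.0f);
Vec3d localPos;
TransformPoint(Frame, worldPos, localPos); // (worldPos - org) projected onto the three axes
#endif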
//we need a better function-name for this exotic operation
//there already is a "TransformPoint" in Cry_Matrix.h
_inline void TransformPoint( const Matrix44 &Matr, Vec3d& inp, Vec3d& outp)
{
//T_CHANGED_BY_IVO
//Vec3d Temp = inp - *(Vec3d *)&Matr.m_values[3][0];
Vec3d Temp = inp - Matr.GetTranslation();
//T_CHANGED_BY_IVO
//outp.x = Temp | *(Vec3d *)&Matr.m_values[0][0];
//outp.y = Temp | *(Vec3d *)&Matr.m_values[1][0];
//outp.z = Temp | *(Vec3d *)&Matr.m_values[2][0];
outp.x = Temp | Matr.GetOrtX();
outp.y = Temp | Matr.GetOrtY();
outp.z = Temp | Matr.GetOrtZ();
}
//
// Transform a directional vector by a coordinate system.
// Ignores the coordinate system's origin.
//
_inline void TransformVector( const SCoord &Coords, Vec3d& in, Vec3d& out )
{
#if !DO_ASM
Vec3d Temp = in; // local copy guards against in/out aliasing
out[0] = Temp | Coords.m_Vecs[0];
out[1] = Temp | Coords.m_Vecs[1];
out[2] = Temp | Coords.m_Vecs[2];
#else
ASMTransformVector( Coords, in, out);
#endif
}
_inline void TransformVec_ViewProj(Vec3d& v, Matrix44 viewmatr, Matrix44 projmatr, vec4_t vv, vec4_t pv)
{
int i;
for (i=0; i<4; i++)
{
vv[i] = viewmatr[0][i]*v[0] + viewmatr[1][i]*v[1] + viewmatr[2][i]*v[2] + viewmatr[3][i];
}
for (i=0; i<4; i++)
{
pv[i] = projmatr[0][i]*vv[0] + projmatr[1][i]*vv[1] + projmatr[2][i]*vv[2] + projmatr[3][i]*vv[3];
}
}
_inline void ProjectPoint(vec4_t pv, Vec3d& v3d, vec2_t v2d)
{
v3d[0] = pv[0] / pv[3];
v3d[1] = pv[1] / pv[3];
v3d[2] = (pv[2] + pv[3]) / (pv[2] + pv[3] + pv[3]);
v2d[0] = (float)QRound((v3d[0] + 1) * gRenDev->GetWidth() * 0.5f);
v2d[1] = (float)QRound((v3d[1] + 1) * gRenDev->GetHeight() * 0.5f);
}
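// Together the two helpers above form the usual view/projection chain:
// transform into clip space, then divide by w and map to window coordinates.
// A sketch, assuming valid view/projection matrices from the renderer:
#if 0
Vec3d point(0.0f, 0.0f, 10.0f);
vec4_t viewPos, clipPos;
Vec3d ndc;
vec2_t screen;
TransformVec_ViewProj(point, viewMatrix, projMatrix, viewPos, clipPos);
ProjectPoint(clipPos, ndc, screen); // screen now holds pixel coordinates
#endif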
_inline void TransformVector(Vec3d& out, Vec3d& in, Matrix44& m)
{
//T_CHANGED_BY_IVO
//out.x = in.x * m.m_values[0][0] + in.y * m.m_values[1][0] + in.z * m.m_values[2][0];
//out.y = in.x * m.m_values[0][1] + in.y * m.m_values[1][1] + in.z * m.m_values[2][1];
//out.z = in.x * m.m_values[0][2] + in.y * m.m_values[1][2] + in.z * m.m_values[2][2];
out.x = in.x * m(0,0) + in.y * m(1,0) + in.z * m(2,0);
out.y = in.x * m(0,1) + in.y * m(1,1) + in.z * m(2,1);
out.z = in.x * m(0,2) + in.y * m(1,2) + in.z * m(2,2);
}
_inline void TransformPosition(Vec3d& out, Vec3d& in, Matrix44& m)
{
//T_CHANGED_BY_IVO
//out.x = in.x * m.m_values[0][0] + in.y * m.m_values[1][0] + in.z * m.m_values[2][0] + m.m_values[3][0];
//out.y = in.x * m.m_values[0][1] + in.y * m.m_values[1][1] + in.z * m.m_values[2][1] + m.m_values[3][1];
//out.z = in.x * m.m_values[0][2] + in.y * m.m_values[1][2] + in.z * m.m_values[2][2] + m.m_values[3][2];
TransformVector (out, in, m);
out += m.GetTranslation();
}
inline Plane TransformPlaneByUsingAdjointT( const Matrix44& M, const Matrix44& TA, const Plane plSrc)
{
//CHANGED_BY_IVO
//Vec3d newNorm = TA.TransformVector(plSrc.n);
Vec3d newNorm = GetTransposed44(TA)*(plSrc.n);
newNorm.Normalize();
if(M.Determinant() < 0.f) newNorm *= -1;
Plane plane;
plane.Set(newNorm, M.TransformPointOLD(plSrc.n * plSrc.d) | newNorm);
return plane;
}
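// Why an adjoint-based transform: plane normals must transform with the
// inverse-transpose of the point transform, (M^-1)^T, so that transformed
// normals stay perpendicular to transformed surfaces. Since
// adj(M) = det(M) * M^-1, using the adjoint gives the correct direction up to
// scale, which the Normalize() above removes; the Determinant() test flips
// the normal when M mirrors handedness.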
inline Matrix44 TransposeAdjoint(const Matrix44& M)
{
Matrix44 ta;
ta(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
ta(0,1) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
ta(0,2) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
ta(0,3) = 0.f;
ta(1,0) = M(2,1) * M(0,2) - M(2,2) * M(0,1);
ta(1,1) = M(2,2) * M(0,0) - M(2,0) * M(0,2);
ta(1,2) = M(2,0) * M(0,1) - M(2,1) * M(0,0);
ta(1,3) = 0.f;
ta(2,0) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
ta(2,1) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
ta(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
ta(2,3) = 0.f;
ta(3,0) = 0.f;
ta(3,1) = 0.f;
ta(3,2) = 0.f;
ta(3,3) = 1.f;
return ta;
}
inline Plane TransformPlane( const Matrix44& M, const Plane& plSrc)
{
Matrix44 tmpTA = TransposeAdjoint(M);
return TransformPlaneByUsingAdjointT(M, tmpTA, plSrc);
}
// Homogeneous plane transform.
inline Plane TransformPlane2(const Matrix44& m, const Plane& src )
{
Plane plDst;
float v0=src.n.x, v1=src.n.y, v2=src.n.z, v3=src.d;
plDst.n.x = v0 * m[0][0] + v1 * m[0][1] + v2 * m[0][2] + v3 * m[0][3];
plDst.n.y = v0 * m[1][0] + v1 * m[1][1] + v2 * m[1][2] + v3 * m[1][3];
plDst.n.z = v0 * m[2][0] + v1 * m[2][1] + v2 * m[2][2] + v3 * m[2][3];
plDst.d = v0 * m[3][0] + v1 * m[3][1] + v2 * m[3][2] + v3 * m[3][3];
return plDst;
}
inline Plane TransformPlane2_NoTrans(const Matrix44& m, const Plane& src )
{
Plane plDst;
float v0=src.n.x, v1=src.n.y, v2=src.n.z;
plDst.n.x = v0 * m[0][0] + v1 * m[0][1] + v2 * m[0][2];
plDst.n.y = v0 * m[1][0] + v1 * m[1][1] + v2 * m[1][2];
plDst.n.z = v0 * m[2][0] + v1 * m[2][1] + v2 * m[2][2];
plDst.d = src.d;
return plDst;
}
inline Plane TransformPlane2Transposed(const Matrix44& m, const Plane& src )
{
Plane plDst;
float v0=src.n.x, v1=src.n.y, v2=src.n.z, v3=src.d;
plDst.n.x = v0 * m[0][0] + v1 * m[1][0] + v2 * m[2][0] + v3 * m[3][0];
plDst.n.y = v0 * m[0][1] + v1 * m[1][1] + v2 * m[2][1] + v3 * m[3][1];
plDst.n.z = v0 * m[0][2] + v1 * m[1][2] + v2 * m[2][2] + v3 * m[3][2];
plDst.d = v0 * m[0][3] + v1 * m[1][3] + v2 * m[2][3] + v3 * m[3][3];
return plDst;
}
//===============================================================================================
_inline int CullBoxByPlane (float *Mins, float *Maxs, SPlane *p)
{
float dist1, dist2;
int sides;
// fast axial cases
if (p->m_Type < 3)
{
return (p->m_Dist <= Mins[p->m_Type]) ? 1 : (p->m_Dist >= Maxs[p->m_Type]) ? 2 : 3;
}
// general case
switch (p->m_SignBits)
{
case 0:
dist1 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Maxs[2];
dist2 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Mins[2];
break;
case 1:
dist1 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Maxs[2];
dist2 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Mins[2];
break;
case 2:
dist1 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Maxs[2];
dist2 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Mins[2];
break;
case 3:
dist1 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Maxs[2];
dist2 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Mins[2];
break;
case 4:
dist1 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Mins[2];
dist2 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Maxs[2];
break;
case 5:
dist1 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Mins[2];
dist2 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Maxs[2];
break;
case 6:
dist1 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Mins[2];
dist2 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Maxs[2];
break;
case 7:
dist1 = p->m_Normal[0]*Mins[0] + p->m_Normal[1]*Mins[1] + p->m_Normal[2]*Mins[2];
dist2 = p->m_Normal[0]*Maxs[0] + p->m_Normal[1]*Maxs[1] + p->m_Normal[2]*Maxs[2];
break;
default:
dist1 = dist2 = 0; // silence "possibly uninitialized" warning
ASSERT( 0 ); // unreachable: m_SignBits is always in [0..7]
break;
}
sides = 0;
if (dist1 >= p->m_Dist)
sides = 1;
if (dist2 < p->m_Dist)
sides |= 2;
//ASSERT( sides != 0 );
return sides;
}
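// Return value of CullBoxByPlane (from the sign tests above): 1 = box entirely
// in front of the plane, 2 = entirely behind, 3 = spanning it. Illustrative
// culling test against a hypothetical frustum plane:
#if 0
float mins[3] = {-1.f, -1.f, -1.f}, maxs[3] = {1.f, 1.f, 1.f};
if (CullBoxByPlane(mins, maxs, &frustumPlane) == 2)
return; // fully behind the plane: cull
#endif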
//===============================================================================================
// Interfaces from the Game
extern ILog *iLog;
extern IConsole *iConsole;
extern ITimer *iTimer;
extern ISystem *iSystem;
extern int *pTest_int;
extern IPhysicalWorld *pIPhysicalWorld;
#define MAX_PATH_LENGTH 512
inline void _text_to_log(const char * format, ...)
{
char buffer[MAX_PATH_LENGTH];
va_list args;
va_start(args, format);
vsprintf(buffer, format, args);
va_end(args);
iLog->Log(buffer);
if (gRenDev->CV_r_log == 3)
gRenDev->Logv(SRendItem::m_RecurseLevel, buffer);
}
inline void _text_to_logPlus(const char * format, ...)
{
char buffer[MAX_PATH_LENGTH];
va_list args;
va_start(args, format);
vsprintf(buffer, format, args);
va_end(args);
iLog->LogPlus(buffer);
if (gRenDev->CV_r_log == 3)
gRenDev->Logv(SRendItem::m_RecurseLevel, buffer);
}
inline void _UpdateLoadingScreen(const char * format, ...)
{
if(format)
{
char buffer[MAX_PATH_LENGTH];
va_list args;
va_start(args, format);
vsprintf(buffer, format, args);
va_end(args);
iLog->Log(buffer);
if (gRenDev->CV_r_log == 3)
gRenDev->Logv(SRendItem::m_RecurseLevel, buffer);
}
//iConsole->Update();
//gRenDev->BeginFrame();
//iConsole->Draw();
//gRenDev->Update();
}
inline void _UpdateLoadingScreenPlus(const char * format, ...)
{
if(format)
{
char buffer[MAX_PATH_LENGTH];
va_list args;
va_start(args, format);
vsprintf(buffer, format, args);
va_end(args);
iLog->Log(buffer);
if (gRenDev->CV_r_log == 3)
gRenDev->Logv(SRendItem::m_RecurseLevel, buffer);
}
iConsole->Update();
gRenDev->BeginFrame();
iConsole->Draw();
gRenDev->Update();
}
_inline char * Cry_strdup(const char * str)
{
char *memory;
if (!str)
return(NULL);
memory = (char *)malloc(strlen(str) + 1);
if (memory)
return(strcpy(memory,str));
return(NULL);
}
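// Cry_strdup allocates with malloc, so the caller owns the copy and must
// release it with free(), not delete. Minimal sketch:
#if 0
char *copy = Cry_strdup("example");
if (copy)
{
// ... use copy ...
free(copy);
}
#endif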
const char* GetExtension (const char *in);
void StripExtension (const char *in, char *out);
void AddExtension (char *path, char *extension);
void ConvertDOSToUnixName( char *dst, const char *src );
void ConvertUnixToDosName( char *dst, const char *src );
void UsePath (char *name, char *path, char *dst);
#define Vector2Copy(a,b) {b[0]=a[0];b[1]=a[1];}
//==================================================================
// Profiling
inline DWORD sCycles()
{
uint L;
#ifndef PS2
__asm
{
xor eax,eax // Required so that VC++ realizes EAX is modified.
_emit 0x0F // RDTSC - Pentium+ time stamp register to EDX:EAX.
_emit 0x31 // Use only 32 bits in EAX - even a Ghz cpu would have a 4+ sec period.
mov [L],eax // Save low value.
xor edx,edx // Required so that VC++ realizes EDX is modified.
}
#else
L = 0;
#endif
return L;
}
inline double sCycles2()
{
uint L,H;
#ifndef PS2
__asm
{
xor eax,eax // Required so that VC++ realizes EAX is modified.
xor edx,edx // Required so that VC++ realizes EDX is modified.
_emit 0x0F // RDTSC - Pentium+ time stamp register to EDX:EAX.
_emit 0x31 // Use only 32 bits in EAX - even a Ghz cpu would have a 4+ sec period.
mov [L],eax // Save low value.
mov [H],edx // Save high value.
}
#else
L = H = 0;
#endif
return ((DOUBLE)L + 4294967296.0 * (DOUBLE)H);
}
#define FP_BITS(fp) (*(DWORD *)&(fp))
_inline float C_sqrt_tab(float n)
{
if (FP_BITS(n) == 0)
return 0.0f; // check for square root of 0
FP_BITS(n) = gRenDev->fast_sqrt_table[(FP_BITS(n) >> 8) & 0xFFFF] | ((((FP_BITS(n) - 0x3F800000) >> 1) + 0x3F800000) & 0x7F800000);
return n;
}
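// How the table sqrt above works: an IEEE-754 float stores a biased 8-bit
// exponent and a 23-bit mantissa. Halving the value's biased exponent, via
// (((bits - 0x3F800000) >> 1) + 0x3F800000) & 0x7F800000, approximates the
// exponent of the square root, while bits 8..23 (the top mantissa bits plus
// the exponent's low bit) index a precomputed 64K table that supplies the
// result's mantissa. Accuracy is traded for speed; the routine assumes n >= 0
// and an initialized fast_sqrt_table.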
//=========================================================================================
//
// Memory copy.
//
#if DO_ASM
#define DEFINED_cryMemcpy
/******************************************************************************
Copyright (c) 2001 Advanced Micro Devices, Inc.
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
NOT APPLY TO YOU.
AMD does not assume any responsibility for any errors which may appear in the
Materials nor any responsibility to support or update the Materials. AMD retains
the right to make changes to its test specifications at any time, without notice.
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
further information, software, technical information, know-how, or show-how
available to you.
So that all may benefit from your experience, please report any problems
or suggestions about this software to 3dsdk.support@amd.com
AMD Developer Technologies, M/S 585
Advanced Micro Devices, Inc.
5900 E. Ben White Blvd.
Austin, TX 78741
3dsdk.support@amd.com
******************************************************************************/
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Inline assembly syntax for use with Visual C++
inline void cryMemcpy( void* Dst, const void* Src, INT Count )
{
if( gRenDev->m_Cpu->mCpu[0].mFeatures & CFI_MMX )
{
__asm
{
mov ecx, [Count] ; number of bytes to copy
mov edi, [Dst] ; destination
mov esi, [Src] ; source
mov ebx, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big for cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
// mov eax, [dest] ; ret value = destination pointer
}
}
else
{
__asm
{
mov ecx, Count
mov esi, Src
mov edi, Dst
mov ebx, ecx
shr ecx, 2
and ebx, 3
rep movsd
mov ecx, ebx
rep movsb
}
}
}
#else
inline void cryMemcpy( void* Dst, const void* Src, INT Count )
{
memcpy(Dst, Src, Count);
}
#endif
//=========================================================================================
//
// Normal timing.
//
#define ticks(Timer) {Timer -= sCycles2();}
#define unticks(Timer) {Timer += sCycles2()+34;}
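// Usage sketch for the timing macros: ticks() subtracts the current cycle
// count and unticks() adds it back (the +34 presumably compensates for the
// RDTSC overhead itself), leaving the elapsed cycles in the accumulator.
#if 0
double renderCycles = 0;
ticks(renderCycles);
// ... code being measured ...
unticks(renderCycles); // renderCycles now holds the measured cycle count
#endif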
//=============================================================================
/*-----------------------------------------------------------------------------
The End.
-----------------------------------------------------------------------------*/