//////////////////////////////////////////////////////////////////////
//
//  Crytek Common Source code
//
//  File: Cry_Math.h
//  Description: Misc mathematical functions
//
//  History:
//  - Jan 31, 2001: Created by Marco Corbetta
//  - Jan 04, 2003: SSE and 3DNow optimizations by Andrey Honich
//
//////////////////////////////////////////////////////////////////////

#ifndef CRY_SIMD_H
#define CRY_SIMD_H

#if _MSC_VER > 1000
# pragma once
#endif

#include "TArray.h"
#include "platform.h"

//========================================================================================

extern int g_CpuFlags;

// Wrap an angle (in degrees) into the [0..360) range, quantized to 1/65536 of a turn.
inline float AngleMod(float a)
{
  a = (float)((360.0/65536) * ((int)(a*(65536/360.0)) & 65535));
  return a;
}

// Convert an angle in degrees to a 16-bit fixed-point word, and back.
inline unsigned short Degr2Word(float f)
{
  return (unsigned short)(AngleMod(f)/360.0f*65536.0f);
}

inline float Word2Degr(unsigned short s)
{
  return (float)s / 65536.0f * 360.0f;
}

/*****************************************************
MISC FUNCTIONS
*****************************************************/

//////////////////////////////////////
#if defined(_CPU_X86)
// Fast fabs(): clear the IEEE-754 sign bit directly.
ILINE float __fastcall Ffabs(float f)
{
  *((unsigned *) & f) &= ~0x80000000;
  return (f);
}
#else
inline float Ffabs(float x)
{
  return fabsf(x);
}
#endif

//////////////////////////////////////////////////////////////////////////
#if defined(_CPU_X86) && !defined(LINUX)
// Fast float->int for non-negative values: bias by -0.5 so that fistp's
// round-to-nearest behaves like truncation.
inline int fastftol_positive(float f)
{
  int i;
  f -= 0.5f;
  __asm fld   [f]
  __asm fistp [i]
  return i;
}
#else
inline int fastftol_positive(float x)
{
  return (int)x;
}
#endif

#if defined(_CPU_X86) && !defined(LINUX)
// Float->int conversion using the current FPU rounding mode.
ILINE int __fastcall FtoI(float x)
{
  int t;
  __asm
  {
    fld   x
    fistp t
  }
  return t;
}
#else
inline int FtoI(float x)
{
  return (int)x;
}
#endif

//////////////////////////////////////
#define rnd() (((float)rand())/RAND_MAX) // Floating point random number generator (0 -> 1)

//////////////////////////////////////
inline void multMatrices(double dst[16], const double a[16], const double b[16])
{
  int i, j;

  for (i = 0; i < 4; i++)
  {
    for (j = 0; j < 4; j++)
    {
      dst[i * 4 + j] =
        b[i * 4 + 0] * a[0 * 4 + j] +
        b[i * 4 + 1] * a[1 * 4 + j] +
        b[i * 4 + 2] * a[2 * 4 + j] +
        b[i * 4 + 3] * a[3 * 4 + j];
    }
  }
}

//////////////////////////////////////
inline void multMatrices(float dst[16], const float a[16], const float b[16])
{
  int i, j;

  for (i = 0; i < 4; i++)
  {
    for (j = 0; j < 4; j++)
    {
      dst[i * 4 + j] =
        b[i * 4 + 0] * a[0 * 4 + j] +
        b[i * 4 + 1] * a[1 * 4 + j] +
        b[i * 4 + 2] * a[2 * 4 + j] +
        b[i * 4 + 3] * a[3 * 4 + j];
    }
  }
}

//////////////////////////////////////
// transform vector
inline void matmult_trans_only(float a[3], float b[4][4], float result[3])
{
  result[0] = a[0] * b[0][0] + a[1] * b[1][0] + a[2] * b[2][0] + b[3][0];
  result[1] = a[0] * b[0][1] + a[1] * b[1][1] + a[2] * b[2][1] + b[3][1];
  result[2] = a[0] * b[0][2] + a[1] * b[1][2] + a[2] * b[2][2] + b[3][2];
}

/*
======================================
    Matrix 4x4
======================================
*/
// product = m1 * m2, all matrices stored column-major (element (row,col) at index (col<<2)+row).
inline void multMatrixf(float *product, const float *m1, const float *m2)
{
#define A(row,col)  m1[(col<<2)+row]
#define B(row,col)  m2[(col<<2)+row]
#define P(row,col)  product[(col<<2)+row]

  int i;
  for (i=0; i<4; i++)
  {
    float ai0=A(i,0), ai1=A(i,1), ai2=A(i,2), ai3=A(i,3);
    P(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
    P(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
    P(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
    P(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
  }

#undef A
#undef B
#undef P
}

#if !defined (GAMECUBE) && !defined PS2
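// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): the matrix
// helpers above store 4x4 matrices column-major, i.e. element (row,col) lives
// at index (col<<2)+row, so a translation sits in elements 12..14. The
// function below is a hypothetical example, kept only to document the
// convention expected by multMatrixf().
inline void exampleMultMatrixfUsage(float out[16])
{
  float t1[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  1,2,3,1 }; // translate by (1,2,3)
  float t2[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  4,5,6,1 }; // translate by (4,5,6)
  multMatrixf(out, t1, t2); // out = t1 * t2: a translation by (5,7,9) in out[12..14]
}
// ---------------------------------------------------------------------------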
//========================================================================================== // 3DNow! optimizations #pragma warning(push) #pragma warning(disable:4731) // frame pointer register 'ebp' modified by inline assembly code struct SConst3DN { float m_fVal0; float m_fVal1; }; const SConst3DN const3DN_1_N1 = {1.0f, -1.0f}; const SConst3DN const3DN_0_1 = {0.0f, 1.0f}; /*! Compute inverse of 4x4 transformation SINGLE-PRECISION matrix. Code lifted from Brian Paul's Mesa freeware OpenGL implementation. Return true for success, false for failure (singular matrix) */ inline bool QQinvertMatrixf(float *out, const float *m) { #define SWAP_ROWS(a, b) { float *_tmp = a; (a)=(b); (b)=_tmp; } #define MAT(m,r,c) (m)[(c)*4+(r)] float wtmp[4][8]; float m0, m1, m2, m3, s; float *r0, *r1, *r2, *r3; r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3]; r0[0] = MAT(m,0,0), r0[1] = MAT(m,0,1), r0[2] = MAT(m,0,2), r0[3] = MAT(m,0,3), r0[4] = 1.0f, r0[5] = r0[6] = r0[7] = 0.0f, r1[0] = MAT(m,1,0), r1[1] = MAT(m,1,1), r1[2] = MAT(m,1,2), r1[3] = MAT(m,1,3), r1[5] = 1.0f, r1[4] = r1[6] = r1[7] = 0.0f, r2[0] = MAT(m,2,0), r2[1] = MAT(m,2,1), r2[2] = MAT(m,2,2), r2[3] = MAT(m,2,3), r2[6] = 1.0f, r2[4] = r2[5] = r2[7] = 0.0f, r3[0] = MAT(m,3,0), r3[1] = MAT(m,3,1), r3[2] = MAT(m,3,2), r3[3] = MAT(m,3,3), r3[7] = 1.0f, r3[4] = r3[5] = r3[6] = 0.0f; /* choose pivot - or die */ if (fabs(r3[0])>fabs(r2[0])) SWAP_ROWS(r3, r2); if (fabs(r2[0])>fabs(r1[0])) SWAP_ROWS(r2, r1); if (fabs(r1[0])>fabs(r0[0])) SWAP_ROWS(r1, r0); if (0.0 == r0[0]) { return 0; } /* eliminate first variable */ m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0]; s = r0[1]; r1[1] -= m1 * s; r2[1] -= m2 * s; r3[1] -= m3 * s; s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s; s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s; s = r0[4]; if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; } s = r0[5]; if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; } s = r0[6]; if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; } s = r0[7]; if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; } /* choose pivot - or die */ if (fabs(r3[1])>fabs(r2[1])) SWAP_ROWS(r3, r2); if (fabs(r2[1])>fabs(r1[1])) SWAP_ROWS(r2, r1); if (0.0 == r1[1]) { return 0; } /* eliminate second variable */ m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1]; r2[2] -= m2 * r1[2]; r3[2] -= m3 * r1[2]; r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3]; s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; } s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; } s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; } s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; } /* choose pivot - or die */ if (fabs(r3[2])>fabs(r2[2])) SWAP_ROWS(r3, r2); if (0.0 == r2[2]) { return 0; } /* eliminate third variable */ m3 = r3[2]/r2[2]; r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6], r3[7] -= m3 * r2[7]; /* last check */ if (0.0 == r3[3]) { return 0; } s = 1.0f/r3[3]; /* now back substitute row 3 */ r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s; m2 = r2[3]; /* now back substitute row 2 */ s = 1.0f/r2[2]; r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2), r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2); m1 = r1[3]; r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1; m0 = r0[3]; r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0; m1 = r1[2]; /* now back substitute row 1 */ s = 
1.0f/r1[1]; r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1), r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1); m0 = r0[2]; r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0; m0 = r0[1]; /* now back substitute row 0 */ s = 1.0f/r0[0]; r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0), r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0); MAT(out,0,0) = r0[4]; MAT(out,0,1) = r0[5], MAT(out,0,2) = r0[6]; MAT(out,0,3) = r0[7], MAT(out,1,0) = r1[4]; MAT(out,1,1) = r1[5], MAT(out,1,2) = r1[6]; MAT(out,1,3) = r1[7], MAT(out,2,0) = r2[4]; MAT(out,2,1) = r2[5], MAT(out,2,2) = r2[6]; MAT(out,2,3) = r2[7], MAT(out,3,0) = r3[4]; MAT(out,3,1) = r3[5], MAT(out,3,2) = r3[6]; MAT(out,3,3) = r3[7]; return 1; #undef MAT #undef SWAP_ROWS } inline float cryISqrtf(float fVal) { unsigned int *n1 = (unsigned int *)&fVal; unsigned int n = 0x5f3759df - (*n1 >> 1); float *n2 = (float *)&n; fVal = (1.5f - (fVal * 0.5f) * *n2 * *n2) * *n2; return fVal; } #if defined _CPU_X86 && !defined(LINUX) // *************************************************************************** inline void cryPrecacheSSE(const void *src, int nbytes) { _asm { mov esi, src mov ecx, nbytes // 64 bytes per pass shr ecx, 6 jz endLabel loopMemToL1: prefetchnta 64[ESI] // Prefetch next loop, non-temporal prefetchnta 96[ESI] movq mm1, 0[ESI] // Read in source data movq mm2, 8[ESI] movq mm3, 16[ESI] movq mm4, 24[ESI] movq mm5, 32[ESI] movq mm6, 40[ESI] movq mm7, 48[ESI] movq mm0, 56[ESI] add esi, 64 dec ecx jnz loopMemToL1 emms endLabel: } } // *************************************************************************** inline void cryPrecacheMMX(const void *src, int nbytes) { _asm { mov esi, src mov ecx, nbytes // 64 bytes per pass shr ecx, 6 jz endLabel loopMemToL1: movq mm1, 0[ESI] // Read in source data movq mm2, 8[ESI] movq mm3, 16[ESI] movq mm4, 24[ESI] movq mm5, 32[ESI] movq mm6, 40[ESI] movq mm7, 48[ESI] movq mm0, 56[ESI] add esi, 64 dec ecx jnz loopMemToL1 emms endLabel: } } #endif inline void cryPrefetchNTSSE(const void *src) { #if defined(WIN32) && !defined(WIN64) && !defined(LINUX) _asm { mov esi, src prefetchnta [ESI] // Prefetch non-temporal } #endif } ILINE void cryPrefetchT0SSE(const void *src) { #if defined(WIN32) && !defined(WIN64) _asm { mov esi, src prefetchT0 [ESI] // Prefetch } #endif #if defined(WIN64) && defined(_CPU_AMD64) && !defined(LINUX) _mm_prefetch( (char*)src, _MM_HINT_T0 ); #endif } //================================================================================= // Very optimized memcpy() routine for AMD Athlon and Duron family. // This code uses any of FOUR different basic copy methods, depending // on the transfer size. // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or // "Streaming Store"), and also uses the software prefetch instructions, // be sure you're running on Athlon/Duron or other recent CPU before calling! #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". #define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch // Next is a copy that uses the MMX registers to copy 8 bytes at a time, // also using the "unrolled loop" optimization. This code uses // the software prefetch instruction to get the data into the cache. 
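// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original header): the size-based
// dispatch that the assembly cryMemcpy() below implements, written with the
// portable prefetch intrinsic already used elsewhere in this file. It assumes
// _mm_prefetch is available (e.g. via <xmmintrin.h>); the streaming-store and
// block-prefetch paths for blocks larger than the cache are only hinted at
// here and live in the real routine. The helper name is hypothetical.
inline void exampleMemcpyDispatch(void* dst, const void* src, int count)
{
  if (count < TINY_BLOCK_COPY)
  {
    memcpy(dst, src, count);                    // tiny block: plain copy wins
  }
  else if (count < IN_CACHE_COPY)
  {
    for (int p = 0; p < count; p += 64)         // software-prefetch the source
      _mm_prefetch((char*)src + p, _MM_HINT_NTA);
    memcpy(dst, src, count);                    // then copy through the cache
  }
  else
  {
    // beyond the cache the optimized path switches to movntq streaming stores
    // (and block prefetch for the largest sizes); plain memcpy stands in here
    memcpy(dst, src, count);
  }
}
// ---------------------------------------------------------------------------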
#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. // USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE" #define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch #define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. Block Prefetch reads // one address per cache line, for a series of cache lines, in a short loop. // This is faster than using software prefetch. The technique is great for // getting maximum read bandwidth, especially in DDR memory systems. #if defined _CPU_X86 && !defined(LINUX) // Inline assembly syntax for use with Visual C++ inline void cryMemcpy( void* Dst, const void* Src, int Count ) { if (g_CpuFlags & CPUF_SSE) { __asm { mov ecx, [Count] ; number of bytes to copy mov edi, [Dst] ; destination mov esi, [Src] ; source mov ebx, ecx ; keep a copy of count cld cmp ecx, TINY_BLOCK_COPY jb $memcpy_ic_3 ; tiny? skip mmx copy cmp ecx, 32*1024 ; dont align between 32k-64k because jbe $memcpy_do_align ; it appears to be slower cmp ecx, 64*1024 jbe $memcpy_align_done $memcpy_do_align: mov ecx, 8 ; a trick thats faster than rep movsb... sub ecx, edi ; align destination to qword and ecx, 111b ; get the low bits sub ebx, ecx ; update copy count neg ecx ; set up to jump into the array add ecx, offset $memcpy_align_done jmp ecx ; jump to array of movsbs align 4 movsb movsb movsb movsb movsb movsb movsb movsb $memcpy_align_done: ; destination is dword aligned mov ecx, ebx ; number of bytes left to copy shr ecx, 6 ; get 64-byte block count jz $memcpy_ic_2 ; finish the last few bytes cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy jae $memcpy_uc_test // This is small block copy that uses the MMX registers to copy 8 bytes // at a time. It uses the "unrolled loop" optimization, and also uses // the software prefetch instruction to get the data into the cache. align 16 $memcpy_ic_1: ; 64-byte block copies, in-cache copy prefetchnta [esi + (200*64/34+192)] ; start reading ahead movq mm0, [esi+0] ; read 64 bits movq mm1, [esi+8] movq [edi+0], mm0 ; write 64 bits movq [edi+8], mm1 ; note: the normal movq writes the movq mm2, [esi+16] ; data to cache; a cache line will be movq mm3, [esi+24] ; allocated as needed, to store the data movq [edi+16], mm2 movq [edi+24], mm3 movq mm0, [esi+32] movq mm1, [esi+40] movq [edi+32], mm0 movq [edi+40], mm1 movq mm2, [esi+48] movq mm3, [esi+56] movq [edi+48], mm2 movq [edi+56], mm3 add esi, 64 ; update source pointer add edi, 64 ; update destination pointer dec ecx ; count down jnz $memcpy_ic_1 ; last 64-byte block? $memcpy_ic_2: mov ecx, ebx ; has valid low 6 bits of the byte count $memcpy_ic_3: shr ecx, 2 ; dword count and ecx, 1111b ; only look at the "remainder" bits neg ecx ; set up to jump into the array add ecx, offset $memcpy_last_few jmp ecx ; jump to array of movsds $memcpy_uc_test: cmp ecx, UNCACHED_COPY/64 ; big enough? 
use block prefetch copy jae $memcpy_bp_1 $memcpy_64_test: or ecx, ecx ; tail end of block prefetch will jump here jz $memcpy_ic_2 ; no more 64-byte blocks left // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. align 16 $memcpy_uc_1: ; 64-byte blocks, uncached copy prefetchnta [esi + (200*64/34+192)] ; start reading ahead movq mm0,[esi+0] ; read 64 bits add edi,64 ; update destination pointer movq mm1,[esi+8] add esi,64 ; update source pointer movq mm2,[esi-48] movntq [edi-64], mm0 ; write 64 bits, bypassing the cache movq mm0,[esi-40] ; note: movntq also prevents the CPU movntq [edi-56], mm1 ; from READING the destination address movq mm1,[esi-32] ; into the cache, only to be over-written movntq [edi-48], mm2 ; so that also helps performance movq mm2,[esi-24] movntq [edi-40], mm0 movq mm0,[esi-16] movntq [edi-32], mm1 movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 dec ecx movntq [edi-8], mm1 jnz $memcpy_uc_1 ; last 64-byte block? jmp $memcpy_ic_2 ; almost done // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. Block Prefetch reads // one address per cache line, for a series of cache lines, in a short loop. // This is faster than using software prefetch. The technique is great for // getting maximum read bandwidth, especially in DDR memory systems. $memcpy_bp_1: ; large blocks, block prefetch copy cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? jl $memcpy_64_test ; no, back to regular uncached copy mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X add esi, CACHEBLOCK * 64 ; move to the top of the block align 16 $memcpy_bp_2: mov edx, [esi-64] ; grab one address per cache line mov edx, [esi-128] ; grab one address per cache line sub esi, 128 ; go reverse order to suppress HW prefetcher dec eax ; count down the cache lines jnz $memcpy_bp_2 ; keep grabbing more lines into cache mov eax, CACHEBLOCK ; now that its in cache, do the copy align 16 $memcpy_bp_3: movq mm0, [esi ] ; read 64 bits movq mm1, [esi+ 8] movq mm2, [esi+16] movq mm3, [esi+24] movq mm4, [esi+32] movq mm5, [esi+40] movq mm6, [esi+48] movq mm7, [esi+56] add esi, 64 ; update source pointer movntq [edi ], mm0 ; write 64 bits, bypassing cache movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU movntq [edi+16], mm2 ; from READING the destination address movntq [edi+24], mm3 ; into the cache, only to be over-written, movntq [edi+32], mm4 ; so that also helps performance movntq [edi+40], mm5 movntq [edi+48], mm6 movntq [edi+56], mm7 add edi, 64 ; update dest pointer dec eax ; count down jnz $memcpy_bp_3 ; keep copying sub ecx, CACHEBLOCK ; update the 64-byte block count jmp $memcpy_bp_1 ; keep processing chunks // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. 
align 4 movsd movsd ; perform last 1-15 dword copies movsd movsd movsd movsd movsd movsd movsd movsd ; perform last 1-7 dword copies movsd movsd movsd movsd movsd movsd $memcpy_last_few: ; dword aligned from before movsds mov ecx, ebx ; has valid low 2 bits of the byte count and ecx, 11b ; the last few cows must come home jz $memcpy_final ; no more, lets leave rep movsb ; the last 1, 2, or 3 bytes $memcpy_final: emms ; clean up the MMX state sfence ; flush the write buffer // mov eax, [dest] ; ret value = destination pointer } } else { memcpy(Dst, Src, Count); } } inline void cryPrefetch(const void* Src, int nCount) { nCount >>= 6; if (nCount > 0) { _asm { mov esi, Src; mov ecx, nCount; mPr0: align 16 dec ecx; mov eax, [esi]; mov eax,0; lea esi, [esi+40h]; jne mPr0; } } else { _asm { mov esi, Src; mov ecx, nCount; mPr1: align 16 inc ecx; mov eax, [esi]; mov eax,0; lea esi, [esi-40h]; jne mPr1; } } } inline void cryMemcpy (void* inDst, const void* inSrc, int nCount, int nFlags) { int i; int cnt; INT_PTR nAlign16; unsigned char *Src = (unsigned char *)inSrc; unsigned char *Dst = (unsigned char *)inDst; if (nCount < 32 || (nCount < 128 && !(nFlags & 4))) { memcpy(Dst, Src, nCount); return; } if (nFlags & 1) nAlign16 = (INT_PTR)Src & 0xf; else nAlign16 = (INT_PTR)Dst & 0xf; if (nCount >= 4) { if (nAlign16 & 3) { cnt = (int)(4 - nAlign16); for (i=0; i= 16 && (nAlign16 & 0xc)) { cnt = (int)(16 - nAlign16); for (i=0; i<(cnt>>2); i++) { *(UINT_PTR *)Dst = *(UINT_PTR *)Src; Dst += 4; Src += 4; } nCount -= cnt; } } int Count_64 = 0xffffffc0 & nCount; int Count_16 = 0x30 & nCount; int Count_12 = 0xc & nCount; int Count_4 = 0x3 & nCount; if (Count_64) { if (nFlags & 4) { while (Count_64 > 2048) { cryPrefetch(Src, 2048); if (g_CpuFlags & CPUF_SSE) { if ((INT_PTR)Src & 0xf) { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; St0: movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St0 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; St1: align 16 movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St1 } } } else { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; St2: movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St2 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; St3: align 16 movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St3 } } } } else if (g_CpuFlags & CPUF_MMX) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; St4: align 16 movq mm0,xmmword ptr [ebx] movq mm1,xmmword ptr [ebx+08h] movq mm2,xmmword ptr [ebx+10h] movq mm3,xmmword ptr [ebx+18h] movq 
mm4,xmmword ptr [ebx+20h] movq mm5,xmmword ptr [ebx+28h] movq mm6,xmmword ptr [ebx+30h] movq mm7,xmmword ptr [ebx+38h] add ebx,40h movq mmword ptr [edx],mm0 movq mmword ptr [edx+08h],mm1 movq mmword ptr [edx+10h],mm2 movq mmword ptr [edx+18h],mm3 movq mmword ptr [edx+20h],mm0 movq mmword ptr [edx+28h],mm1 movq mmword ptr [edx+30h],mm2 movq mmword ptr [edx+38h],mm3 add edx,40h dec ecx jne St4 emms } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, 32; shl ecx, 4 St5: align 16 mov eax,dword ptr [ebx] add ebx,4h mov dword ptr [edx],eax add edx,4h dec ecx jne St5 } } Src += 2048; Dst += 2048; Count_64 -= 2048; } if (Count_64 > 128) { cryPrefetch(Src, Count_64); cnt = Count_64>>6; if (g_CpuFlags & CPUF_SSE) { if ((INT_PTR)Src & 0xf) { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St6: movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St6 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St7: align 16 movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St7 } } } else { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St8: movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St8 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St9: align 16 movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St9 } } } } else if (g_CpuFlags & CPUF_MMX) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St10: align 16 movq mm0,xmmword ptr [ebx] movq mm1,xmmword ptr [ebx+08h] movq mm2,xmmword ptr [ebx+10h] movq mm3,xmmword ptr [ebx+18h] movq mm4,xmmword ptr [ebx+20h] movq mm5,xmmword ptr [ebx+28h] movq mm6,xmmword ptr [ebx+30h] movq mm7,xmmword ptr [ebx+38h] add ebx,40h movq mmword ptr [edx],mm0 movq mmword ptr [edx+08h],mm1 movq mmword ptr [edx+10h],mm2 movq mmword ptr [edx+18h],mm3 movq mmword ptr [edx+20h],mm0 movq mmword ptr [edx+28h],mm1 movq mmword ptr [edx+30h],mm2 movq mmword ptr [edx+38h],mm3 add edx,40h dec ecx jne St10 emms } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; shl ecx, 4 St11: align 16 mov eax,dword ptr [ebx] add ebx,4h mov dword ptr [edx],eax add edx,4h dec ecx jne St11 } } Src += Count_64; Dst += Count_64; Count_64 = 0; } } if (Count_64) { cnt = Count_64>>6; if (g_CpuFlags & CPUF_SSE) { if ((INT_PTR)Src & 0xf) { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St12: movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 
movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St12 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St13: align 16 movups xmm0,xmmword ptr [ebx] movups xmm1,xmmword ptr [ebx+10h] movups xmm2,xmmword ptr [ebx+20h] movups xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St13 } } } else { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St14: movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movups xmmword ptr [edx],xmm0 movups xmmword ptr [edx+10h],xmm1 movups xmmword ptr [edx+20h],xmm2 movups xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St14 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St15: align 16 movaps xmm0,xmmword ptr [ebx] movaps xmm1,xmmword ptr [ebx+10h] movaps xmm2,xmmword ptr [ebx+20h] movaps xmm3,xmmword ptr [ebx+30h] add ebx,40h movaps xmmword ptr [edx],xmm0 movaps xmmword ptr [edx+10h],xmm1 movaps xmmword ptr [edx+20h],xmm2 movaps xmmword ptr [edx+30h],xmm3 add edx,40h dec ecx jne St15 } } } } else if (g_CpuFlags & CPUF_MMX) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St16: align 16 movq mm0,xmmword ptr [ebx] movq mm1,xmmword ptr [ebx+08h] movq mm2,xmmword ptr [ebx+10h] movq mm3,xmmword ptr [ebx+18h] movq mm4,xmmword ptr [ebx+20h] movq mm5,xmmword ptr [ebx+28h] movq mm6,xmmword ptr [ebx+30h] movq mm7,xmmword ptr [ebx+38h] add ebx,40h movq mmword ptr [edx],mm0 movq mmword ptr [edx+08h],mm1 movq mmword ptr [edx+10h],mm2 movq mmword ptr [edx+18h],mm3 movq mmword ptr [edx+20h],mm0 movq mmword ptr [edx+28h],mm1 movq mmword ptr [edx+30h],mm2 movq mmword ptr [edx+38h],mm3 add edx,40h dec ecx jne St16 emms } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; shl ecx, 4 St17: align 16 mov eax,dword ptr [ebx] add ebx,4h mov dword ptr [edx],eax add edx,4h dec ecx jne St17 } } Src += Count_64; Dst += Count_64; } } if (Count_16) { cnt = Count_16>>4; if (g_CpuFlags & CPUF_SSE) { if ((INT_PTR)Src & 0xf) { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St18: movups xmm0,xmmword ptr [ebx] add ebx,10h movups xmmword ptr [edx],xmm0 add edx,10h dec ecx jne St18 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St19: align 16 movups xmm0,xmmword ptr [ebx] add ebx,10h movaps xmmword ptr [edx],xmm0 add edx,10h dec ecx jne St19 } } } else { if ((INT_PTR)Dst & 0xf) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St20: movaps xmm0,xmmword ptr [ebx] add ebx,10h movups xmmword ptr [edx],xmm0 add edx,10h dec ecx jne St20 } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St21: align 16 movaps xmm0,xmmword ptr [ebx] add ebx,10h movaps xmmword ptr [edx],xmm0 add edx,10h dec ecx jne St21 } } } } else if (g_CpuFlags & CPUF_MMX) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; St22: align 16 movq mm0,xmmword ptr [ebx] movq mm1,xmmword ptr [ebx+08h] add ebx,10h movq mmword ptr [edx],mm0 movq mmword ptr [edx+08h],mm1 add edx,10h dec ecx jne St22 emms } } else { _asm { mov ebx, Src; mov edx, Dst; mov ecx, cnt; shl ecx, 2 St23: align 16 mov eax,dword ptr [ebx] add ebx,4h mov dword ptr [edx],eax add edx,4h dec ecx jne St23 } } Src += Count_16; Dst += Count_16; } if (Count_12) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, Count_12; shr ecx, 2 St24: align 16 mov eax,dword ptr [ebx] add ebx,4h mov dword ptr [edx],eax add edx,4h dec ecx jne St24 } 
Src += Count_12; Dst += Count_12; } if (Count_4) { _asm { mov ebx, Src; mov edx, Dst; mov ecx, Count_4; St25: align 16 mov al,byte ptr [ebx] add ebx,4h mov byte ptr [edx],al add edx,4h dec ecx jne St25 } } } //============================================================================== // Matrix inversion using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline bool invertMatrixf_3DNow(float *pOut, const float *pIn) { _asm { mov edx, pIn sub esp,40h movq mm1,mmword ptr [edx+10h] movq mm2,mmword ptr [edx+20h] pswapd mm5,mm1 movq mm3,mmword ptr [edx+30h] pswapd mm4,mm2 movq mm0,mmword ptr [edx] pswapd mm6,mm3 pswapd mm7,mm2 pfmul mm5,mm0 pfmul mm4,mm0 pfmul mm6,mm0 pswapd mm0,mm3 pfmul mm7,mm1 pfnacc mm5,mm4 pfmul mm1,mm0 pfmul mm2,mm0 pfnacc mm6,mm7 pfnacc mm1,mm2 movq mm0,mmword ptr [edx+38h] movq mm2,mm6 movq mm4,mm1 movq mm7,mmword ptr [edx+28h] punpckhdq mm2,mm2 movq mm3,mm1 punpckhdq mm4,mm4 pfmul mm2,mm0 punpckldq mm3,mm3 pfmul mm4,mmword ptr [edx+18h] pfmul mm3,mm7 pfadd mm2,mm4 movq mm4,mm6 pfsub mm2,mm3 movq mm3,mm1 punpckldq mm4,mm4 movq mmword ptr [esp+8],mm2 movq mm2,mm5 punpckhdq mm3,mm3 punpckhdq mm2,mm2 pfmul mm3,mmword ptr [edx+8] pfmul mm2,mm0 pfmul mm4,mm7 pfsub mm2,mm4 pfadd mm2,mm3 movq mmword ptr [esp+18h],mm2 movq mm2,mm5 movq mm3,mm6 punpckldq mm1,mm1 punpckldq mm2,mm2 punpckldq mm3,mm3 pfmul mm1,mmword ptr [edx+8] pfmul mm0,mm2 pfmul mm3,mmword ptr [edx+18h] pfadd mm0,mm1 pfsub mm0,mm3 movq mmword ptr [esp+28h],mm0 punpckhdq mm5,mm5 punpckhdq mm6,mm6 pfmul mm2,mm7 pfmul mm5,mmword ptr [edx+18h] movq mm0,mmword ptr [edx+8] pfmul mm6,mmword ptr [edx+8] pfsub mm2,mm5 pfadd mm2,mm6 movq mmword ptr [esp+38h],mm2 movq mm1,mmword ptr [edx+18h] movq mm2,mmword ptr [edx+28h] movq mm3,mmword ptr [edx+38h] pswapd mm5,mm1 pswapd mm4,mm2 pfmul mm5,mm0 pswapd mm6,mm3 pfmul mm4,mm0 pswapd mm7,mm2 pfmul mm6,mm0 pswapd mm0,mm3 pfnacc mm5,mm4 pfmul mm7,mm1 pfmul mm1,mm0 pfmul mm2,mm0 pfnacc mm6,mm7 pfnacc mm1,mm2 movq mm0,mmword ptr [edx+30h] movq mm4,mm1 movq mm3,mm1 movq mm7,mmword ptr [edx+20h] movq mm2,mm6 punpckhdq mm4,mm4 punpckldq mm3,mm3 punpckhdq mm2,mm2 pfmul mm4,mmword ptr [edx+10h] pfmul mm2,mm0 pfmul mm3,mm7 pfadd mm2,mm4 pfsub mm2,mm3 movq mmword ptr [esp],mm2 movq mm3,mm1 movq mm4,mm6 movq mm2,mm5 punpckhdq mm3,mm3 punpckldq mm4,mm4 punpckhdq mm2,mm2 pfmul mm3,mmword ptr [edx] pfmul mm2,mm0 pfmul mm4,mm7 pfsub mm2,mm4 pfadd mm2,mm3 movq mmword ptr [esp+10h],mm2 movq mm4,mm0 movq mm2,mm5 movq mm3,mm6 punpckldq mm1,mm1 punpckldq mm2,mm2 punpckldq mm3,mm3 pfmul mm1,mmword ptr [edx] pfmul mm0,mm2 pfmul mm3,mmword ptr [edx+10h] pfadd mm0,mm1 pfsub mm0,mm3 movq mmword ptr [esp+20h],mm0 punpckhdq mm5,mm5 punpckhdq mm6,mm6 pfmul mm2,mm7 pfmul mm5,mmword ptr [edx+10h] pfmul mm6,mmword ptr [edx] pfsub mm2,mm5 pfadd mm2,mm6 movq mmword ptr [esp+30h],mm2 punpckhdq mm0,mm2 punpckldq mm7,mm4 movq mm3,mmword ptr [edx] movq mm1,mmword ptr [esp] punpckldq mm3,dword ptr [edx+10h] punpckhdq mm1,mmword ptr [esp+10h] pfmul mm0,mm7 pfmul mm3,mm1 pfnacc mm0,mm3 pfacc mm0,mm0 movq mm4,mm0 pfrcp mm1,mm0 pxor mm3,mm3 punpckldq mm0,mm0 pfcmpeq mm3,mm4 pfrcpit1 mm0,mm1 pfrcpit2 mm0,mm1 movd eax,mm3 pfmul mm0,mmword ptr const3DN_1_N1 test eax,eax pswapd mm1,mm0 jne m1 mov eax, pOut movq mm2,mmword ptr [esp] movq mm5,mmword ptr [esp+8] movq mm3,mmword ptr [esp+10h] movq mm6,mmword ptr [esp+18h] pfmul mm2,mm1 pfmul mm3,mm0 pfmul mm5,mm1 pfmul mm6,mm0 movq mm4,mm2 movq mm7,mm5 punpckhdq mm2,mm3 punpckldq mm4,mm3 punpckhdq mm5,mm6 punpckldq mm7,mm6 movq mmword ptr 
[eax],mm2 movq mmword ptr [eax+10h],mm4 movq mmword ptr [eax+20h],mm5 movq mmword ptr [eax+30h],mm7 movq mm2,mmword ptr [esp+20h] movq mm5,mmword ptr [esp+28h] movq mm3,mmword ptr [esp+30h] movq mm6,mmword ptr [esp+38h] pfmul mm2,mm1 pfmul mm3,mm0 pfmul mm5,mm1 pfmul mm6,mm0 movq mm4,mm2 movq mm7,mm5 punpckhdq mm2,mm3 punpckldq mm4,mm3 punpckhdq mm5,mm6 punpckldq mm7,mm6 movq mmword ptr [eax+8],mm2 movq mmword ptr [eax+18h],mm4 movq mmword ptr [eax+28h],mm5 movq mmword ptr [eax+38h],mm7 m1: femms add esp,40h } return true; } // Matrix multiplication using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void multMatrixf_3DNow(float *product, const float *m1, const float *m2) { __asm { femms sub esp,44h mov edx, m1 mov ecx, m2 mov eax, product mov dword ptr [esp+40h],ebp mov ebp,esp and esp,0FFFFFFF8h movq mm0,mmword ptr [edx] movq mm1,mmword ptr [edx+10h] movq mm3,mmword ptr [edx+28h] movq mm4,mmword ptr [edx+38h] movq mm2,mm0 punpckldq mm0,mm1 punpckhdq mm2,mm1 movq mm5,mm3 movq mmword ptr [esp],mm0 movq mmword ptr [esp+10h],mm2 movq mm6,mmword ptr [edx+8] punpckldq mm3,mm4 movq mm7,mmword ptr [edx+18h] movq mm0,mmword ptr [edx+20h] punpckhdq mm5,mm4 movq mm1,mm6 movq mmword ptr [esp+28h],mm3 movq mmword ptr [esp+38h],mm5 punpckldq mm6,mm7 movq mm4,mmword ptr [edx+30h] movq mm2,mm0 punpckhdq mm1,mm7 movq mmword ptr [esp+20h],mm6 punpckhdq mm0,mm4 punpckldq mm2,mm4 movq mmword ptr [esp+30h],mm1 movq mmword ptr [esp+18h],mm0 movq mmword ptr [esp+8],mm2 movq mm0,mmword ptr [ecx] movq mm1,mmword ptr [ecx+8] movq mm2,mm0 movq mm3,mm1 movq mm4,mm0 movq mm5,mm1 pfmul mm0,mmword ptr [esp] pfmul mm1,mmword ptr [esp+8] movq mm6,mm2 movq mm7,mm3 pfmul mm2,mmword ptr [esp+10h] pfmul mm3,mmword ptr [esp+18h] pfmul mm4,mmword ptr [esp+20h] pfmul mm5,mmword ptr [esp+28h] pfmul mm6,mmword ptr [esp+30h] pfmul mm7,mmword ptr [esp+38h] pfacc mm0,mm2 pfacc mm1,mm3 pfacc mm4,mm6 pfacc mm5,mm7 pfadd mm0,mm1 pfadd mm4,mm5 movq mmword ptr [eax],mm0 movq mmword ptr [eax+8],mm4 movq mm0,mmword ptr [ecx+10h] movq mm1,mmword ptr [ecx+18h] movq mm2,mm0 movq mm3,mm1 movq mm4,mm0 movq mm5,mm1 pfmul mm0,mmword ptr [esp] pfmul mm1,mmword ptr [esp+8] movq mm6,mm2 movq mm7,mm3 pfmul mm2,mmword ptr [esp+10h] pfmul mm3,mmword ptr [esp+18h] pfmul mm4,mmword ptr [esp+20h] pfmul mm5,mmword ptr [esp+28h] pfmul mm6,mmword ptr [esp+30h] pfmul mm7,mmword ptr [esp+38h] pfacc mm0,mm2 pfacc mm1,mm3 pfacc mm4,mm6 pfacc mm5,mm7 pfadd mm0,mm1 pfadd mm4,mm5 movq mmword ptr [eax+10h],mm0 movq mmword ptr [eax+18h],mm4 movq mm0,mmword ptr [ecx+20h] movq mm1,mmword ptr [ecx+28h] movq mm2,mm0 movq mm3,mm1 movq mm4,mm0 movq mm5,mm1 pfmul mm0,mmword ptr [esp] pfmul mm1,mmword ptr [esp+8] movq mm6,mm2 movq mm7,mm3 pfmul mm2,mmword ptr [esp+10h] pfmul mm3,mmword ptr [esp+18h] pfmul mm4,mmword ptr [esp+20h] pfmul mm5,mmword ptr [esp+28h] pfmul mm6,mmword ptr [esp+30h] pfmul mm7,mmword ptr [esp+38h] pfacc mm0,mm2 pfacc mm1,mm3 pfacc mm4,mm6 pfacc mm5,mm7 pfadd mm0,mm1 pfadd mm4,mm5 movq mmword ptr [eax+20h],mm0 movq mmword ptr [eax+28h],mm4 movq mm0,mmword ptr [ecx+30h] movq mm1,mmword ptr [ecx+38h] movq mm2,mm0 movq mm3,mm1 movq mm4,mm0 movq mm5,mm1 pfmul mm0,mmword ptr [esp] pfmul mm1,mmword ptr [esp+8] movq mm6,mm2 movq mm7,mm3 pfmul mm2,mmword ptr [esp+10h] pfmul mm3,mmword ptr [esp+18h] pfmul mm4,mmword ptr [esp+20h] pfmul mm5,mmword ptr [esp+28h] pfmul mm6,mmword ptr [esp+30h] pfmul mm7,mmword ptr [esp+38h] pfacc mm0,mm2 pfacc mm1,mm3 pfacc mm4,mm6 pfacc mm5,mm7 pfadd mm0,mm1 pfadd mm4,mm5 movq mmword ptr 
[eax+30h],mm0 movq mmword ptr [eax+38h],mm4 mov esp,ebp mov ebp,dword ptr [esp+40h] add esp,44h femms } } // Matrix transposing using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void transposeMatrixf_3DNow(float *product, const float *m) { __asm { mov edx,m movq mm0,mmword ptr [edx] movq mm1,mmword ptr [edx+10h] movq mm3,mmword ptr [edx+28h] movq mm4,mmword ptr [edx+38h] movq mm2,mm0 mov eax,product punpckldq mm0,mm1 punpckhdq mm2,mm1 movq mm5,mm3 movq mmword ptr [eax],mm0 movq mmword ptr [eax+10h],mm2 movq mm6,mmword ptr [edx+8] punpckldq mm3,mm4 movq mm7,mmword ptr [edx+18h] movq mm0,mmword ptr [edx+20h] punpckhdq mm5,mm4 movq mm1,mm6 movq mmword ptr [eax+28h],mm3 movq mmword ptr [eax+38h],mm5 punpckldq mm6,mm7 movq mm4,mmword ptr [edx+30h] movq mm2,mm0 punpckhdq mm1,mm7 movq mmword ptr [eax+20h],mm6 punpckhdq mm0,mm4 punpckldq mm2,mm4 movq mmword ptr [eax+30h],mm1 movq mmword ptr [eax+18h],mm0 movq mmword ptr [eax+8],mm2 femms } } // Matrix identity using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void indentityMatrixf_3DNow(float *product) { __asm { movd mm1,dword ptr const3DN_1_N1 mov eax,dword ptr product pxor mm0,mm0 movq mm2,mm1 movq mmword ptr [eax],mm1 movq mmword ptr [eax+8],mm0 movq mmword ptr [eax+18h],mm0 punpckhdq mm2,mm2 movq mmword ptr [eax+20h],mm0 movq mmword ptr [eax+28h],mm1 punpckldq mm2,mm1 movq mmword ptr [eax+30h],mm0 movq mmword ptr [eax+10h],mm2 movq mmword ptr [eax+38h],mm2 femms } } // Scale Matrix using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void scaleMatrixf_3DNow(float *product, float x, float y, float z) { __asm { femms mov eax, product movd mm3, y movq mm1,mmword ptr const3DN_0_1 movd mm2, x pxor mm0,mm0 psllq mm3,20h movd mm4, z movq mmword ptr [eax],mm2 movq mmword ptr [eax+8],mm0 movq mmword ptr [eax+18h],mm0 movq mmword ptr [eax+20h],mm0 movq mmword ptr [eax+30h],mm0 movq mmword ptr [eax+10h],mm3 movq mmword ptr [eax+28h],mm4 movq mmword ptr [eax+38h],mm1 femms } } // Transform position using 3DNow! instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void transformVec3f_3DNow(float *result, float *inV, float *matrix) { _asm { femms mov eax,inV mov edx,matrix mov ecx,result movq mm0,mmword ptr [eax] movq mm1,mm0 punpckldq mm0,mm0 punpckhdq mm1,mm1 movd mm2,dword ptr [eax+8] punpckldq mm2,mm2 movq mm3,mm0 pfmul mm0,mmword ptr [edx] movq mm4,mm1 pfmul mm1,mmword ptr [edx+10h] movq mm5,mm2 pfmul mm2,mmword ptr [edx+20h] pfadd mm0,mmword ptr [edx+30h] pfmul mm3,mmword ptr [edx+8] pfadd mm1,mm2 pfmul mm4,mmword ptr [edx+18h] pfmul mm5,mmword ptr [edx+28h] pfadd mm3,mmword ptr [edx+38h] pfadd mm0,mm1 pfadd mm4,mm5 pfadd mm3,mm4 movq mmword ptr [ecx],mm0 movq mmword ptr [ecx+8],mm3 femms } } // Transform normal using 3DNow! 
instructions set // NOTE: On AMD Athlon - much faster then SSE version inline void transformVec3Nf_3DNow(float *result, float *inV, float *matrix) { _asm { femms mov eax,inV mov edx,matrix mov ecx,result movq mm0,mmword ptr [eax] movq mm1,mm0 punpckldq mm0,mm0 punpckhdq mm1,mm1 movd mm2,dword ptr [eax+8] punpckldq mm2,mm2 movq mm3,mm0 pfmul mm0,mmword ptr [edx] movq mm4,mm1 pfmul mm1,mmword ptr [edx+10h] movq mm5,mm2 pfmul mm2,mmword ptr [edx+20h] pfmul mm3,mmword ptr [edx+8] pfadd mm1,mm2 pfmul mm4,mmword ptr [edx+18h] pfmul mm5,mmword ptr [edx+28h] pfadd mm0,mm1 pfadd mm4,mm5 pfadd mm3,mm4 movq mmword ptr [ecx],mm0 movd dword ptr [ecx+8],mm3 femms } } //========================================================================================== // SSE optimizations // Matrix inversion using SSE instructions set inline bool invertMatrixf_SSE(float *pOut, const float *pIn) { _asm { mov ebx, esp and esp,0FFFFFFF0h sub esp,90h mov eax, pIn movaps xmm2,xmmword ptr [esp+30h] movlps xmm2,qword ptr [eax] movaps xmm1,xmmword ptr [esp+70h] lea ecx,[eax+10h] movhps xmm2,qword ptr [ecx] lea edx,[eax+20h] movlps xmm1,qword ptr [edx] movaps xmm0,xmm2 lea ecx,[eax+30h] movhps xmm1,qword ptr [ecx] shufps xmm0,xmm1,88h shufps xmm1,xmm2,0DDh lea edx,[eax+8] movlps xmm2,qword ptr [edx] movaps xmm3,xmm2 movaps xmm2,xmmword ptr [esp+70h] lea ecx,[eax+18h] movhps xmm3,qword ptr [ecx] movaps xmm5,xmm3 lea ecx,[eax+38h] add eax,28h movlps xmm2,qword ptr [eax] movhps xmm2,qword ptr [ecx] shufps xmm5,xmm2,88h shufps xmm2,xmm3,0DDh movaps xmm3,xmm5 mulps xmm3,xmm2 shufps xmm3,xmm3,0B1h movaps xmm4,xmm1 mulps xmm4,xmm3 movaps xmmword ptr [esp+50h],xmm4 movaps xmm4,xmm3 shufps xmm4,xmm3,4Eh movaps xmm6,xmm0 mulps xmm6,xmm3 movaps xmm3,xmm1 mulps xmm3,xmm4 subps xmm3,xmmword ptr [esp+50h] movaps xmm7,xmm0 mulps xmm7,xmm4 movaps xmm4,xmm7 subps xmm4,xmm6 movaps xmmword ptr [esp+10h],xmm4 movaps xmm6,xmm1 mulps xmm6,xmm5 shufps xmm6,xmm6,0B1h movaps xmm7,xmm0 mulps xmm7,xmm6 movaps xmmword ptr [esp+40h],xmm7 movaps xmm4,xmm2 mulps xmm4,xmm6 shufps xmm6,xmm6,4Eh movaps xmm7,xmm2 mulps xmm7,xmm6 movaps xmmword ptr [esp+60h],xmm7 movaps xmm7,xmm0 mulps xmm7,xmm6 movaps xmm6,xmm7 subps xmm6,xmmword ptr [esp+40h] movaps xmmword ptr [esp+40h],xmm6 movaps xmm6,xmm1 shufps xmm6,xmm1,4Eh mulps xmm6,xmm2 shufps xmm6,xmm6,0B1h shufps xmm5,xmm5,4Eh movaps xmm7,xmm5 mulps xmm7,xmm6 movaps xmmword ptr [esp+20h],xmm7 addps xmm4,xmm3 subps xmm4,xmmword ptr [esp+60h] movaps xmm3,xmmword ptr [esp+20h] movaps xmm7,xmm0 mulps xmm7,xmm6 movaps xmmword ptr [esp],xmm7 shufps xmm6,xmm6,4Eh movaps xmm7,xmm5 mulps xmm7,xmm6 addps xmm3,xmm4 movaps xmmword ptr [esp+30h],xmm6 subps xmm3,xmm7 movaps xmmword ptr [esp+50h],xmm3 movaps xmm7,xmm2 movaps xmm3,xmm0 mulps xmm3,xmmword ptr [esp+30h] subps xmm3,xmmword ptr [esp] movaps xmmword ptr [esp],xmm3 movaps xmm3,xmm2 movaps xmm6,xmm0 mulps xmm6,xmm1 shufps xmm6,xmm6,0B1h mulps xmm3,xmm6 movaps xmm4,xmm5 mulps xmm4,xmm6 shufps xmm6,xmm6,4Eh mulps xmm7,xmm6 movaps xmmword ptr [esp+20h],xmm7 movaps xmm7,xmm5 mulps xmm7,xmm6 movaps xmmword ptr [esp+70h],xmm7 movaps xmm6,xmm0 mulps xmm6,xmm2 shufps xmm6,xmm6,0B1h movaps xmm7,xmm5 mulps xmm7,xmm6 movaps xmmword ptr [esp+30h],xmm7 movaps xmm7,xmm1 mulps xmm7,xmm6 shufps xmm6,xmm6,4Eh movaps xmmword ptr [esp+60h],xmm7 movaps xmm7,xmm5 mulps xmm7,xmm6 movaps xmmword ptr [esp+80h],xmm7 movaps xmm7,xmm1 mulps xmm7,xmm6 movaps xmm6,xmmword ptr [esp] shufps xmm6,xmmword ptr [esp],4Eh addps xmm3,xmm6 movaps xmm6,xmm3 movaps xmm3,xmmword ptr [esp+20h] subps 
xmm3,xmm6 movaps xmm6,xmm3 movaps xmm3,xmmword ptr [esp+60h] addps xmm3,xmm6 subps xmm3,xmm7 movaps xmmword ptr [esp],xmm3 movaps xmm3,xmm0 mulps xmm0,xmmword ptr [esp+50h] mulps xmm3,xmm5 movaps xmm5,xmm3 shufps xmm5,xmm3,0B1h movaps xmm3,xmm2 movaps xmm6,xmm1 mulps xmm3,xmm5 mulps xmm6,xmm5 shufps xmm5,xmm5,4Eh mulps xmm2,xmm5 movaps xmmword ptr [esp+20h],xmm6 movaps xmm6,xmm2 movaps xmm2,xmmword ptr [esp+10h] shufps xmm2,xmmword ptr [esp+10h],4Eh subps xmm2,xmmword ptr [esp+30h] movaps xmm7,xmm2 movaps xmm2,xmmword ptr [esp+80h] addps xmm2,xmm7 addps xmm3,xmm2 movaps xmm2,xmmword ptr [esp+40h] shufps xmm2,xmmword ptr [esp+40h],4Eh subps xmm4,xmm2 subps xmm4,xmmword ptr [esp+70h] subps xmm4,xmmword ptr [esp+20h] movaps xmm2,xmm0 shufps xmm2,xmm0,4Eh addps xmm2,xmm0 movaps xmm0,xmm2 shufps xmm0,xmm2,0B1h mulps xmm1,xmm5 addss xmm0,xmm2 subps xmm3,xmm6 addps xmm1,xmm4 xorps xmm2,xmm2 xor eax,eax inc eax xor ecx,ecx comiss xmm0,xmm2 cmove ecx,eax test ecx,ecx je m1 xor eax,eax jmp mEnd m1: mov eax, pOut movaps xmmword ptr [esp+10h],xmm0 rcpss xmm2,xmm0 movss dword ptr [esp+10h],xmm2 movaps xmm2,xmmword ptr [esp+10h] movaps xmm4,xmm2 mulss xmm4,xmm2 mulss xmm0,xmm4 movaps xmm4,xmm0 movaps xmm0,xmm2 addss xmm0,xmm2 subss xmm0,xmm4 shufps xmm0,xmm0,0 movaps xmm2,xmm0 mulps xmm2,xmmword ptr [esp+50h] movlps qword ptr [eax],xmm2 lea ecx,[eax+8] movhps qword ptr [ecx],xmm2 movaps xmm2,xmm0 mulps xmm2,xmm3 lea ecx,[eax+10h] movlps qword ptr [ecx],xmm2 lea ecx,[eax+18h] movhps qword ptr [ecx],xmm2 movaps xmm2,xmm0 mulps xmm2,xmmword ptr [esp] lea ecx,[eax+20h] movlps qword ptr [ecx],xmm2 lea ecx,[eax+28h] movhps qword ptr [ecx],xmm2 lea ecx,[eax+30h] mulps xmm0,xmm1 movlps qword ptr [ecx],xmm0 lea ecx,[eax+38h] movhps qword ptr [ecx],xmm0 mEnd: mov esp, ebx } return true; } // Matrix multiplication using SSE instructions set // IMPORTANT NOTE: much faster if matrices m1 and product is 16 bytes aligned inline void multMatrixf_SSE(float *product, const float *m1, const float *m2) { __asm { mov eax, m2; mov ecx, m1; mov edx, product; test dl,0Fh jne lNonAligned test cl,0Fh jne lNonAligned movss xmm0,dword ptr [eax] movaps xmm1,xmmword ptr [ecx] shufps xmm0,xmm0,0 movss xmm2,dword ptr [eax+4] mulps xmm0,xmm1 shufps xmm2,xmm2,0 movaps xmm3,xmmword ptr [ecx+10h] movss xmm4,dword ptr [eax+8] mulps xmm2,xmm3 shufps xmm4,xmm4,0 addps xmm0,xmm2 movaps xmm2,xmmword ptr [ecx+20h] movss xmm5,dword ptr [eax+0Ch] mulps xmm4,xmm2 shufps xmm5,xmm5,0 movaps xmm6,xmmword ptr [ecx+30h] mulps xmm5,xmm6 addps xmm4,xmm5 addps xmm0,xmm4 movaps xmmword ptr [edx],xmm0 movss xmm0,dword ptr [eax+10h] movss xmm4,dword ptr [eax+14h] shufps xmm0,xmm0,0 shufps xmm4,xmm4,0 mulps xmm0,xmm1 mulps xmm4,xmm3 movss xmm5,dword ptr [eax+18h] addps xmm0,xmm4 shufps xmm5,xmm5,0 movss xmm4,dword ptr [eax+1Ch] mulps xmm5,xmm2 shufps xmm4,xmm4,0 mulps xmm4,xmm6 addps xmm5,xmm4 addps xmm0,xmm5 movaps xmmword ptr [edx+10h],xmm0 movss xmm0,dword ptr [eax+20h] movss xmm4,dword ptr [eax+24h] shufps xmm0,xmm0,0 shufps xmm4,xmm4,0 mulps xmm0,xmm1 mulps xmm4,xmm3 movss xmm5,dword ptr [eax+28h] addps xmm0,xmm4 shufps xmm5,xmm5,0 movss xmm4,dword ptr [eax+2Ch] mulps xmm5,xmm2 shufps xmm4,xmm4,0 mulps xmm4,xmm6 addps xmm5,xmm4 addps xmm0,xmm5 movaps xmmword ptr [edx+20h],xmm0 movss xmm0,dword ptr [eax+30h] movss xmm4,dword ptr [eax+34h] shufps xmm0,xmm0,0 shufps xmm4,xmm4,0 mulps xmm0,xmm1 mulps xmm4,xmm3 movss xmm1,dword ptr [eax+38h] addps xmm0,xmm4 shufps xmm1,xmm1,0 movss xmm3,dword ptr [eax+3Ch] mulps xmm1,xmm2 shufps xmm3,xmm3,0 mulps xmm3,xmm6 
addps xmm1,xmm3 addps xmm0,xmm1 movaps xmmword ptr [edx+30h],xmm0 jmp lEnd lNonAligned: movlps xmm0,qword ptr [ecx] movhps xmm0,qword ptr [ecx+8] movlps xmm1,qword ptr [ecx+10h] movhps xmm1,qword ptr [ecx+18h] movlps xmm2,qword ptr [ecx+20h] movhps xmm2,qword ptr [ecx+28h] movlps xmm3,qword ptr [ecx+30h] movhps xmm3,qword ptr [ecx+38h] movss xmm4,dword ptr [eax] movss xmm5,dword ptr [eax+4] movss xmm6,dword ptr [eax+8] movss xmm7,dword ptr [eax+0Ch] shufps xmm4,xmm4,0 shufps xmm5,xmm5,0 shufps xmm6,xmm6,0 shufps xmm7,xmm7,0 mulps xmm4,xmm0 mulps xmm5,xmm1 mulps xmm6,xmm2 mulps xmm7,xmm3 addps xmm4,xmm5 addps xmm6,xmm7 addps xmm4,xmm6 movss xmm5,dword ptr [eax+10h] movss xmm6,dword ptr [eax+14h] movss xmm7,dword ptr [eax+18h] shufps xmm5,xmm5,0 shufps xmm6,xmm6,0 shufps xmm7,xmm7,0 mulps xmm5,xmm0 mulps xmm6,xmm1 mulps xmm7,xmm2 addps xmm5,xmm6 addps xmm5,xmm7 movss xmm6,dword ptr [eax+1Ch] shufps xmm6,xmm6,0 mulps xmm6,xmm3 addps xmm5,xmm6 movss xmm6,dword ptr [eax+20h] movss xmm7,dword ptr [eax+24h] shufps xmm6,xmm6,0 shufps xmm7,xmm7,0 mulps xmm6,xmm0 mulps xmm7,xmm1 addps xmm6,xmm7 movss xmm7,dword ptr [eax+28h] shufps xmm7,xmm7,0 mulps xmm7,xmm2 addps xmm6,xmm7 movss xmm7,dword ptr [eax+2Ch] shufps xmm7,xmm7,0 mulps xmm7,xmm3 addps xmm6,xmm7 movss xmm7,dword ptr [eax+30h] shufps xmm7,xmm7,0 mulps xmm0,xmm7 movss xmm7,dword ptr [eax+34h] shufps xmm7,xmm7,0 mulps xmm1,xmm7 movss xmm7,dword ptr [eax+38h] shufps xmm7,xmm7,0 mulps xmm2,xmm7 movss xmm7,dword ptr [eax+3Ch] shufps xmm7,xmm7,0 mulps xmm3,xmm7 movlps qword ptr [edx],xmm4 movhps qword ptr [edx+8],xmm4 addps xmm0,xmm1 movlps qword ptr [edx+10h],xmm5 movhps qword ptr [edx+18h],xmm5 addps xmm2,xmm3 movlps qword ptr [edx+20h],xmm6 movhps qword ptr [edx+28h],xmm6 addps xmm0,xmm2 movlps qword ptr [edx+30h],xmm0 movhps qword ptr [edx+38h],xmm0 lEnd: } } // Transform position using SSE instructions set // IMPORTANT NOTE: much faster if matrix is 16 bytes aligned inline void transformVec3f_SSE(float *result, float *inV, float *matrix) { _asm { and esp,0FFFFFFF0h sub esp,1Ch mov ecx,inV mov eax,matrix test al,0Fh movaps xmm0,xmmword ptr [esp+0Ch] movlps xmm0,qword ptr [ecx] mov ecx,dword ptr [ecx+8] push esi jne lNonAligned movaps xmm2,xmmword ptr [eax+30h] movaps xmm3,xmmword ptr [eax+20h] mov dword ptr [esp+0Ch],ecx movss xmm1,dword ptr [esp+0Ch] shufps xmm1,xmm1,0 mulps xmm3,xmm1 addps xmm3,xmm2 movaps xmm2,xmmword ptr [eax+10h] movaps xmm1,xmm0 shufps xmm1,xmm0,55h mulps xmm2,xmm1 movaps xmm1,xmm0 shufps xmm1,xmm0,0 movaps xmm0,xmmword ptr [eax] jmp lEnd lNonAligned: movaps xmm1,xmmword ptr [esp+10h] movaps xmm3,xmmword ptr [esp+10h] lea edx,[eax+38h] lea esi,[eax+30h] movlps xmm1,qword ptr [esi] movhps xmm1,qword ptr [edx] movaps xmm2,xmm1 mov dword ptr [esp+0Ch],ecx movss xmm1,dword ptr [esp+0Ch] shufps xmm1,xmm1,0 lea edx,[eax+20h] movlps xmm3,qword ptr [edx] lea ecx,[eax+28h] movhps xmm3,qword ptr [ecx] mulps xmm3,xmm1 addps xmm3,xmm2 movaps xmm2,xmmword ptr [esp+10h] movaps xmm1,xmm0 shufps xmm1,xmm0,55h lea ecx,[eax+18h] lea edx,[eax+10h] movlps xmm2,qword ptr [edx] movhps xmm2,qword ptr [ecx] mulps xmm2,xmm1 movaps xmm1,xmm0 shufps xmm1,xmm0,0 movaps xmm0,xmmword ptr [esp+10h] movlps xmm0,qword ptr [eax] lea ecx,[eax+8] movhps xmm0,qword ptr [ecx] lEnd: mov eax, result mulps xmm0,xmm1 addps xmm0,xmm2 addps xmm0,xmm3 movups xmmword ptr [eax],xmm0 } } // Transform normal using SSE instructions set // IMPORTANT NOTE: much faster if matrix is 16 bytes aligned inline void transformVec3Nf_SSE(float *result, float *inV, float *matrix) 
{ _asm { and esp,0FFFFFFF0h sub esp,20h mov eax, matrix test al,0Fh mov ecx, inV jne lNonAligned movaps xmm1,xmmword ptr [esp+10h] movlps xmm1,qword ptr [ecx] mov ecx,dword ptr [ecx+8] movaps xmm2,xmmword ptr [eax+20h] mov dword ptr [esp+0Ch],ecx movss xmm0,dword ptr [esp+0Ch] shufps xmm0,xmm0,0 mulps xmm0,xmm2 movaps xmm2,xmm0 movaps xmm0,xmmword ptr [eax+10h] movaps xmm3,xmm1 shufps xmm3,xmm1,55h mulps xmm3,xmm0 movaps xmm0,xmmword ptr [eax] movaps xmm4,xmm1 shufps xmm4,xmm1,0 mulps xmm4,xmm0 movaps xmm0,xmm4 jmp lEnd lNonAligned: movaps xmm0,xmmword ptr [esp+10h] movlps xmm0,qword ptr [ecx] mov ecx,dword ptr [ecx+8] movaps xmm2,xmmword ptr [esp+10h] movaps xmm3,xmmword ptr [esp+10h] mov dword ptr [esp+0Ch],ecx movss xmm1,dword ptr [esp+0Ch] shufps xmm1,xmm1,0 lea edx,[eax+20h] movlps xmm2,qword ptr [edx] lea ecx,[eax+28h] movhps xmm2,qword ptr [ecx] mulps xmm2,xmm1 movaps xmm1,xmm0 shufps xmm1,xmm0,55h lea ecx,[eax+18h] lea edx,[eax+10h] movlps xmm3,qword ptr [edx] movhps xmm3,qword ptr [ecx] mulps xmm3,xmm1 movaps xmm1,xmm0 shufps xmm1,xmm0,0 movaps xmm0,xmmword ptr [esp+10h] movlps xmm0,qword ptr [eax] lea ecx,[eax+8] movhps xmm0,qword ptr [ecx] mulps xmm0,xmm1 lEnd: mov eax, result addps xmm0,xmm3 addps xmm0,xmm2 movaps xmm1,xmm0 lea ecx,[eax+8] movhlps xmm1,xmm0 movlps qword ptr [eax],xmm0 movss dword ptr [ecx],xmm1 mov esp,ebp pop ebp ret 0Ch } } #else const int PREFNTA_BLOCK = 0x4000; ILINE void cryMemcpy(void* Dst, const void* Src, int n) { char* dst=(char*)Dst; char* src=(char*)Src; while (n > PREFNTA_BLOCK) { #if !defined(LINUX) for (int p = 0; p < PREFNTA_BLOCK; p+=64) { _mm_prefetch((char *) src + p, _MM_HINT_NTA); } #endif memcpy(dst, src, PREFNTA_BLOCK); src += PREFNTA_BLOCK; dst += PREFNTA_BLOCK; n -= PREFNTA_BLOCK; } #if !defined(LINUX) for (int p = 0; p < n; p+=64) { _mm_prefetch((char *) src + p, _MM_HINT_NTA); } #endif memcpy(dst, src, n); } ILINE void cryMemcpy( void* Dst, const void* Src, INT n, int nFlags ) { char* dst=(char*)Dst; char* src=(char*)Src; while (n > PREFNTA_BLOCK) { #if !defined(LINUX) for (int p = 0; p < PREFNTA_BLOCK; p+=64) { _mm_prefetch((char *) src + p, _MM_HINT_NTA); } #endif memcpy(dst, src, PREFNTA_BLOCK); src += PREFNTA_BLOCK; dst += PREFNTA_BLOCK; n -= PREFNTA_BLOCK; } #if !defined(LINUX) for (int p = 0; p < n; p+=64) { _mm_prefetch((char *) src + p, _MM_HINT_NTA); } #endif memcpy(dst, src, n); } #endif inline void mathTransformVec3fN(float *pOut, float *pIn, float *matrix, int nV, int OptFlags) { #if defined _CPU_X86 && !defined(LINUX) // TODO: AMD64 port: NEED TO IMPLEMENT!!! int i; if (OptFlags & CPUF_3DNOW) { for (i=0; i