123
This commit is contained in:
366
ResourceCompilerPC/CrySkinAMD64.asm
Normal file
366
ResourceCompilerPC/CrySkinAMD64.asm
Normal file
@@ -0,0 +1,366 @@
|
||||
|
||||
;****************************************************************************
|
||||
;*
|
||||
;* This is the 64bit SSE-version of CrySkinFull.cpp.
|
||||
;* Rewritten by Ivo Herzeg
|
||||
;*
|
||||
;* Entry parameters:
|
||||
;*
|
||||
;* rcx = pAux
|
||||
;* rdx = pVertex
|
||||
;* r8 = pDest
|
||||
;* r9 = pBone
|
||||
;* [rSP+60] = pvMin ; After the call but before the push rbp
|
||||
;* [rSP+68] = pBoneEnd
|
||||
;*
|
||||
;****************************************************************************
|
||||
|
||||
|
||||
CRY_SKIN_AUX_INT_SIZE equ 2 ; This must match the define in CrySkinTypes.h
|
||||
|
||||
pvMin = 060h ; After the push rbp
|
||||
pBoneEnd = 068h ; "
|
||||
|
||||
|
||||
|
||||
X00 = 000h
|
||||
X10 = 004h
|
||||
X20 = 008h
|
||||
|
||||
Y01 = 010h
|
||||
Y11 = 014h
|
||||
Y21 = 018h
|
||||
|
||||
Z02 = 020h
|
||||
Z12 = 024h
|
||||
Z22 = 028h
|
||||
|
||||
TransX = 030h ; After the push rbp
|
||||
TransY = 034h ; After the push rbp
|
||||
TransZ = 038h ; After the push rbp
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
_data SEGMENT
|
||||
align 16
|
||||
var6 qword 0,0
|
||||
var7 qword 0,0
|
||||
var8 qword 0,0
|
||||
var9 qword 0,0
|
||||
var10 qword 0,0
|
||||
var11 qword 0,0
|
||||
var12 qword 0,0
|
||||
var13 qword 0,0
|
||||
var14 qword 0,0
|
||||
var15 qword 0,0
|
||||
|
||||
_text SEGMENT
|
||||
PUBLIC Amd64Skinner
|
||||
|
||||
|
||||
Amd64Skinner PROC FRAME
|
||||
|
||||
push rBP
|
||||
push rSI
|
||||
push rDI
|
||||
|
||||
push rAX
|
||||
push rBX
|
||||
push rCX
|
||||
push rDX
|
||||
|
||||
movdqa var6,xmm6
|
||||
movdqa var7,xmm7
|
||||
movdqa var8,xmm8
|
||||
movdqa var9,xmm9
|
||||
movdqa var10,xmm10
|
||||
movdqa var11,xmm11
|
||||
movdqa var12,xmm12
|
||||
movdqa var13,xmm13
|
||||
movdqa var14,xmm14
|
||||
movdqa var15,xmm15
|
||||
|
||||
|
||||
; For debug, I will copy the parameters into the same registers which Crytek used in the inline assembler.
|
||||
|
||||
mov rSI, rdx ;parameter 1
|
||||
mov rDX, rcx ;parameter 2
|
||||
mov rDI, r8 ;parameter 3
|
||||
; mov qqqqq, r9 ;parameter 4 //pointer to the matrix stack
|
||||
|
||||
|
||||
startLoop:
|
||||
cmp r9, pBoneEnd[rSP]
|
||||
jz endLoop
|
||||
|
||||
; load the current matrix; we don't need the move component
|
||||
movss xmm9, [r9+X00]
|
||||
movss xmm10, [r9+Y01]
|
||||
movss xmm11, [r9+X10]
|
||||
movss xmm12, [r9+Y11]
|
||||
movss xmm13, [r9+X20]
|
||||
movss xmm14, [r9+Y21]
|
||||
|
||||
; load the counter for the number of non-flipped tangets for this bone
|
||||
xor rCX,rCX
|
||||
mov CX, word ptr [rdx] ; Was mov CX, word ptr [EDX]
|
||||
add rDX, 2 ; add EDX, 2
|
||||
test eCX, eCX
|
||||
jz endLoopRigid
|
||||
|
||||
startLoopRigid:
|
||||
|
||||
; calculate the destination pointer
|
||||
mov rax, [rSI+0Ch]
|
||||
and rax, 0FFFFFFh
|
||||
add rax, rax
|
||||
|
||||
prefetch [rSI+140h]
|
||||
|
||||
movss xmm0, [rSI+00h] ;x
|
||||
movss xmm3, xmm0
|
||||
movss xmm1, [rSI+04h] ;y
|
||||
movss xmm4, xmm1
|
||||
movss xmm2, [rSI+08h] ;z
|
||||
movss xmm5, xmm2
|
||||
|
||||
prefetchw [rDI+rax*8+40h]
|
||||
|
||||
movss xmm6, xmm0
|
||||
movss xmm7, xmm1
|
||||
movss xmm8, xmm2
|
||||
|
||||
mulss xmm0, xmm13 ;x*M20
|
||||
mulss xmm1, xmm14 ;y*M21
|
||||
mulss xmm2, [r9+Z22] ;z*M22
|
||||
addss xmm0, [r9+TransZ]
|
||||
addss xmm1, xmm2
|
||||
addss xmm0, xmm1
|
||||
|
||||
|
||||
mulss xmm3, xmm9 ;x*M00
|
||||
mulss xmm4, xmm10 ;y*M01
|
||||
mulss xmm5, [r9+Z02] ;z*M02
|
||||
addss xmm3, [r9+TransX]
|
||||
addss xmm4, xmm5
|
||||
addss xmm3, xmm4
|
||||
|
||||
mulss xmm6, xmm11 ;x*M10
|
||||
mulss xmm7, xmm12 ;y*M11
|
||||
mulss xmm8, [r9+Z12] ;z*M12
|
||||
addss xmm6, [r9+TransY]
|
||||
addss xmm7, xmm8
|
||||
addss xmm6, xmm7
|
||||
|
||||
movss [rDI+rAX*8+08h], xmm0
|
||||
movss [rDI+rAX*8+00h], xmm3
|
||||
movss [rDI+rAX*8+04h], xmm6
|
||||
|
||||
add rSI, 010h ; rdi+rax*8 (EDI+EAX*8) points to the destination vector now
|
||||
dec eCX
|
||||
jnz startLoopRigid
|
||||
endLoopRigid:
|
||||
|
||||
|
||||
|
||||
;/////////////////////////////////////////////////////////
|
||||
;// Smooth-1 loop
|
||||
;/////////////////////////////////////////////////////////
|
||||
|
||||
movss xmm9, [r9+X00]
|
||||
movss xmm10, [r9+Y01]
|
||||
movss xmm11, [r9+X10]
|
||||
movss xmm12, [r9+Y11]
|
||||
movss xmm13, [r9+X20]
|
||||
movss xmm14, [r9+Y21]
|
||||
|
||||
|
||||
; load the counter for the number of smooth vertices met for the first time
|
||||
xor ECX,ECX
|
||||
mov CX, word ptr [rdx]
|
||||
add rDX, 2
|
||||
test ECX, ECX
|
||||
jz endLoopSmooth1
|
||||
|
||||
|
||||
startLoopSmooth1:
|
||||
|
||||
; calculate the destination pointer
|
||||
xor EAX,EAX
|
||||
mov AX, word ptr [rdx]
|
||||
add rDX, 2
|
||||
shl rAX,1
|
||||
|
||||
|
||||
prefetch [rSI+140h]
|
||||
|
||||
movss xmm0, [rSI+00h] ;x
|
||||
movss xmm3, xmm0
|
||||
movss xmm1, [rSI+04h] ;y
|
||||
movss xmm4, xmm1
|
||||
movss xmm2, [rSI+08h] ;z
|
||||
movss xmm5, xmm2
|
||||
movss xmm15, [rSI+0ch] ;w
|
||||
|
||||
prefetchw [rDI+rax*8+40h]
|
||||
|
||||
movss xmm6, xmm0
|
||||
movss xmm7, xmm1
|
||||
movss xmm8, xmm2
|
||||
|
||||
mulss xmm0, xmm13 ;x*M20
|
||||
mulss xmm1, xmm14 ;y*M21
|
||||
mulss xmm2, [r9+Z22] ;z*M22
|
||||
addss xmm0, [r9+TransZ]
|
||||
addss xmm1, xmm2
|
||||
addss xmm0, xmm1
|
||||
|
||||
mulss xmm3, xmm9 ;x*M00
|
||||
mulss xmm4, xmm10 ;y*M01
|
||||
mulss xmm5, [r9+Z02] ;z*M02
|
||||
addss xmm3, [r9+TransX]
|
||||
addss xmm4, xmm5
|
||||
addss xmm3, xmm4
|
||||
|
||||
mulss xmm6, xmm11 ;x*M10
|
||||
mulss xmm7, xmm12 ;y*M11
|
||||
mulss xmm8, [r9+Z12] ;z*M12
|
||||
addss xmm6, [r9+TransY]
|
||||
addss xmm7, xmm8
|
||||
addss xmm6, xmm7
|
||||
|
||||
mulss xmm0, xmm15 ;Z*weight
|
||||
mulss xmm3, xmm15 ;X*weight
|
||||
mulss xmm6, xmm15 ;Y*weight
|
||||
|
||||
add rSI, 010h
|
||||
dec eCX
|
||||
|
||||
movss [rDI+rAX*8+08h], xmm0
|
||||
movss [rDI+rAX*8+00h], xmm3
|
||||
movss [rDI+rAX*8+04h], xmm6
|
||||
|
||||
jnz startLoopSmooth1
|
||||
endLoopSmooth1:
|
||||
|
||||
|
||||
|
||||
|
||||
;//////////////////////////////////////////////////////////////////
|
||||
;// Smooth-2 loop
|
||||
;//////////////////////////////////////////////////////////////////
|
||||
|
||||
movss xmm9, [r9+X00]
|
||||
movss xmm10, [r9+Y01]
|
||||
movss xmm11, [r9+X10]
|
||||
movss xmm12, [r9+Y11]
|
||||
movss xmm13, [r9+X20]
|
||||
movss xmm14, [r9+Y21]
|
||||
|
||||
|
||||
;// load the counter for the number of smooth vertices met for the second time
|
||||
xor ECX,ECX
|
||||
mov CX, word ptr [rdx]
|
||||
add rdx, 2
|
||||
test ECX, ECX
|
||||
jz endLoopSmooth2
|
||||
|
||||
startLoopSmooth2:
|
||||
|
||||
; calculate the destination pointer
|
||||
xor EAX,EAX
|
||||
mov ax, word ptr [rdx]
|
||||
add rdx, 2
|
||||
shl rax, 4
|
||||
|
||||
prefetch [rSI+140h]
|
||||
|
||||
movss xmm0, [rSI+00h] ;x
|
||||
movss xmm3, xmm0
|
||||
movss xmm1, [rSI+04h] ;y
|
||||
movss xmm4, xmm1
|
||||
movss xmm2, [rSI+08h] ;z
|
||||
movss xmm5, xmm2
|
||||
movss xmm15, [rSI+0ch] ;w
|
||||
|
||||
prefetchw [rDI+rAX+40h]
|
||||
|
||||
movss xmm6, xmm0
|
||||
movss xmm7, xmm1
|
||||
movss xmm8, xmm2
|
||||
|
||||
mulss xmm0, xmm13 ;x*M20
|
||||
mulss xmm1, xmm14 ;y*M21
|
||||
mulss xmm2, [r9+Z22] ;z*M22
|
||||
addss xmm0, [r9+TransZ]
|
||||
addss xmm1, xmm2
|
||||
addss xmm0, xmm1
|
||||
|
||||
mulss xmm3, xmm9 ;x*M00
|
||||
mulss xmm4, xmm10 ;y*M01
|
||||
mulss xmm5, [r9+Z02] ;z*M02
|
||||
addss xmm3, [r9+TransX]
|
||||
addss xmm4, xmm5
|
||||
addss xmm3, xmm4
|
||||
|
||||
mulss xmm6, xmm11 ;x*M10
|
||||
mulss xmm7, xmm12 ;y*M11
|
||||
mulss xmm8, [r9+Z12] ;z*M12
|
||||
addss xmm6, [r9+TransY]
|
||||
addss xmm7, xmm8
|
||||
addss xmm6, xmm7
|
||||
|
||||
mulss xmm0, xmm15 ;Z*weight
|
||||
mulss xmm3, xmm15 ;X*weight
|
||||
mulss xmm6, xmm15 ;Y*weight
|
||||
|
||||
add rSI, 010h
|
||||
dec eCX
|
||||
|
||||
addss xmm0,[rDI+rAX+08h]
|
||||
addss xmm3,[rDI+rAX+00h]
|
||||
addss xmm6,[rDI+rAX+04h]
|
||||
|
||||
movss [rDI+rAX+08h], xmm0
|
||||
movss [rDI+rAX+00h], xmm3
|
||||
movss [rDI+rAX+04h], xmm6
|
||||
|
||||
jnz startLoopSmooth2
|
||||
|
||||
endLoopSmooth2:
|
||||
|
||||
add r9, 040h
|
||||
jmp startLoop
|
||||
endLoop:
|
||||
|
||||
|
||||
movdqa xmm6, var6
|
||||
movdqa xmm7, var7
|
||||
movdqa xmm8, var8
|
||||
movdqa xmm9, var9
|
||||
movdqa xmm10, var10
|
||||
movdqa xmm11, var11
|
||||
movdqa xmm12, var12
|
||||
movdqa xmm13, var13
|
||||
movdqa xmm14, var14
|
||||
movdqa xmm15, var15
|
||||
|
||||
pop rDX
|
||||
pop rCX
|
||||
pop rBX
|
||||
pop rAX
|
||||
pop rDI
|
||||
pop rSI
|
||||
pop rBP
|
||||
ret
|
||||
|
||||
Amd64Skinner ENDP
|
||||
|
||||
_text ENDS
|
||||
|
||||
.endprolog
|
||||
|
||||
END
|
||||
Reference in New Issue
Block a user