Add support for all x86-64 registers.
Prefer caller-saved registers over callee-saved ones on WIN64.
Support up to 15 function arguments.

Also (by Ronald S. Bultje): fix up our asm to work with the new x86inc.asm.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
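As a quick illustration of the new interface (a hypothetical function sketched by hand, not code from this commit): a cglobal declaration may now request up to 15 general-purpose registers, and PROLOGUE/RET automatically push and pop whichever of the extra registers map to callee-saved GPRs on the target ABI, preferring caller-saved registers where possible.

    cglobal example_copy3, 3, 9, 0
        mov   r6, [r1]          ; r6-r8 are scratch registers beyond the 3 arguments
        mov   r7, [r1+r2]
        mov   r8, [r1+r2*2]
        mov   [r0],      r6
        mov   [r0+r2],   r7
        mov   [r0+r2*2], r8
        RET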
@@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
-%define w_reg r10
-cglobal emu_edge_core, 6, 7, 1
-    mov r11, r5 ; save block_h
+%define w_reg r7
+cglobal emu_edge_core, 6, 9, 1
+    mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
| @@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0 | |||||
| sub r0, w_reg | sub r0, w_reg | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r3, r0 ; backup of buf+block_h*linesize | mov r3, r0 ; backup of buf+block_h*linesize | ||||
| mov r5, r11 | |||||
| mov r5, r8 | |||||
| %else | %else | ||||
| mov r0m, r0 ; backup of buf+block_h*linesize | mov r0m, r0 ; backup of buf+block_h*linesize | ||||
| mov r5, r5m | mov r5, r5m | ||||
| @@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0 | |||||
| ; FIXME we can do a if size == 1 here if that makes any speed difference, test me | ; FIXME we can do a if size == 1 here if that makes any speed difference, test me | ||||
| sar w_reg, 1 | sar w_reg, 1 | ||||
| sal w_reg, 6 | sal w_reg, 6 | ||||
| ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs | |||||
| ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs | |||||
| ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h | ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea rax, [.emuedge_extend_left_2] | lea rax, [.emuedge_extend_left_2] | ||||
| @@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0 | |||||
| %endif | %endif | ||||
| call w_reg | call w_reg | ||||
| ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w | |||||
| ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w | |||||
| .right_extend: | .right_extend: | ||||
| %if ARCH_X86_32 | %if ARCH_X86_32 | ||||
| mov r0, r0m | mov r0, r0m | ||||
| @@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0 | |||||
| %define vall al | %define vall al | ||||
| %define valh ah | %define valh ah | ||||
| %define valw ax | %define valw ax | ||||
| %define valw2 r10w | |||||
| %define valw2 r7w | |||||
| %define valw3 r3w | %define valw3 r3w | ||||
| %if WIN64 | %if WIN64 | ||||
| %define valw4 r4w | %define valw4 r4w | ||||
| @@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0 | |||||
| ; - else if (%2 & 8) fills 8 bytes into mm0 | ; - else if (%2 & 8) fills 8 bytes into mm0 | ||||
| ; - if (%2 & 7 == 4) fills the last 4 bytes into rax | ; - if (%2 & 7 == 4) fills the last 4 bytes into rax | ||||
| ; - else if (%2 & 4) fills 4 bytes into mm0-1 | ; - else if (%2 & 4) fills 4 bytes into mm0-1 | ||||
| ; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax | |||||
| ; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax | |||||
| ; (note that we're using r3 for body/bottom because it's a shorter | ; (note that we're using r3 for body/bottom because it's a shorter | ||||
| ; opcode, and then the loop fits in 128 bytes) | ; opcode, and then the loop fits in 128 bytes) | ||||
| ; - else fills remaining bytes into rax | ; - else fills remaining bytes into rax | ||||
| @@ -848,7 +848,7 @@ ALIGN 64 | |||||
| %endrep | %endrep | ||||
| %endmacro ; LEFT_EXTEND | %endmacro ; LEFT_EXTEND | ||||
| ; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val | |||||
| ; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val | |||||
| %macro RIGHT_EXTEND 0 | %macro RIGHT_EXTEND 0 | ||||
| %assign %%n 2 | %assign %%n 2 | ||||
| %rep 11 | %rep 11 | ||||
| @@ -858,7 +858,7 @@ ALIGN 64 | |||||
| sub r3, r2 ; dst -= linesize | sub r3, r2 ; dst -= linesize | ||||
| READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels | READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels | ||||
| WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels | WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels | ||||
| dec r11 | |||||
| dec r8 | |||||
| %else ; ARCH_X86_32 | %else ; ARCH_X86_32 | ||||
| sub r0, r2 ; dst -= linesize | sub r0, r2 ; dst -= linesize | ||||
| READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels | READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels | ||||
| @@ -937,11 +937,11 @@ ALIGN 64 | |||||
| %macro SLOW_V_EXTEND 0 | %macro SLOW_V_EXTEND 0 | ||||
| .slow_v_extend_loop: | .slow_v_extend_loop: | ||||
| ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h | ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h | ||||
| ; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x | |||||
| ; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| push r11 ; save old value of block_h | |||||
| push r8 ; save old value of block_h | |||||
| test r3, r3 | test r3, r3 | ||||
| %define cnt_reg r11 | |||||
| %define cnt_reg r8 | |||||
| jz .do_body_copy ; if (!start_y) goto do_body_copy | jz .do_body_copy ; if (!start_y) goto do_body_copy | ||||
| V_COPY_ROW top, r3 | V_COPY_ROW top, r3 | ||||
| %else | %else | ||||
| @@ -955,7 +955,7 @@ ALIGN 64 | |||||
| V_COPY_ROW body, r4 | V_COPY_ROW body, r4 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| pop r11 ; restore old value of block_h | |||||
| pop r8 ; restore old value of block_h | |||||
| %define cnt_reg r3 | %define cnt_reg r3 | ||||
| %endif | %endif | ||||
| test r5, r5 | test r5, r5 | ||||
| @@ -974,7 +974,7 @@ ALIGN 64 | |||||
| %macro SLOW_LEFT_EXTEND 0 | %macro SLOW_LEFT_EXTEND 0 | ||||
| .slow_left_extend_loop: | .slow_left_extend_loop: | ||||
| ; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x | |||||
| ; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x | |||||
| mov r4, 8 | mov r4, 8 | ||||
| sub r0, linesize | sub r0, linesize | ||||
| READ_V_PIXEL 8, [r0+w_reg] | READ_V_PIXEL 8, [r0+w_reg] | ||||
| @@ -1002,11 +1002,11 @@ ALIGN 64 | |||||
| %macro SLOW_RIGHT_EXTEND 0 | %macro SLOW_RIGHT_EXTEND 0 | ||||
| .slow_right_extend_loop: | .slow_right_extend_loop: | ||||
| ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h, | |||||
| ; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr | |||||
| ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, | |||||
| ; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define buf_reg r3 | %define buf_reg r3 | ||||
| %define bh_reg r11 | |||||
| %define bh_reg r8 | |||||
| %else | %else | ||||
| %define buf_reg r0 | %define buf_reg r0 | ||||
| %define bh_reg r5 | %define bh_reg r5 | ||||
| @@ -749,14 +749,11 @@ INIT_XMM | |||||
| %endmacro | %endmacro | ||||
| %macro DECL_IMDCT 2 | %macro DECL_IMDCT 2 | ||||
| cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input | |||||
| cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define rrevtab r10 | |||||
| %define rtcos r11 | |||||
| %define rtsin r12 | |||||
| push r12 | |||||
| push r13 | |||||
| push r14 | |||||
| %define rrevtab r7 | |||||
| %define rtcos r8 | |||||
| %define rtsin r9 | |||||
| %else | %else | ||||
| %define rrevtab r6 | %define rrevtab r6 | ||||
| %define rtsin r6 | %define rtsin r6 | ||||
| @@ -798,12 +795,12 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample * | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movzx r5, word [rrevtab+r4-4] | movzx r5, word [rrevtab+r4-4] | ||||
| movzx r6, word [rrevtab+r4-2] | movzx r6, word [rrevtab+r4-2] | ||||
| movzx r13, word [rrevtab+r3] | |||||
| movzx r14, word [rrevtab+r3+2] | |||||
| movzx r10, word [rrevtab+r3] | |||||
| movzx r11, word [rrevtab+r3+2] | |||||
| movlps [r1+r5 *8], xmm0 | movlps [r1+r5 *8], xmm0 | ||||
| movhps [r1+r6 *8], xmm0 | movhps [r1+r6 *8], xmm0 | ||||
| movlps [r1+r13*8], xmm1 | |||||
| movhps [r1+r14*8], xmm1 | |||||
| movlps [r1+r10*8], xmm1 | |||||
| movhps [r1+r11*8], xmm1 | |||||
| add r4, 4 | add r4, 4 | ||||
| %else | %else | ||||
| mov r6, [esp] | mov r6, [esp] | ||||
| @@ -839,11 +836,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample * | |||||
| mov r1, -mmsize | mov r1, -mmsize | ||||
| sub r1, r0 | sub r1, r0 | ||||
| %2 r0, r1, r6, rtcos, rtsin | %2 r0, r1, r6, rtcos, rtsin | ||||
| %if ARCH_X86_64 | |||||
| pop r14 | |||||
| pop r13 | |||||
| pop r12 | |||||
| %else | |||||
| %if ARCH_X86_64 == 0 | |||||
| add esp, 12 | add esp, 12 | ||||
| %endif | %endif | ||||
| %ifidn avx_enabled, 1 | %ifidn avx_enabled, 1 | ||||
| @@ -179,9 +179,8 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2 | |||||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | %macro FLOAT_TO_INT16_INTERLEAVE6 1 | ||||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | ||||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | |||||
| cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define lend r10d | |||||
| mov lend, r2d | mov lend, r2d | ||||
| %else | %else | ||||
| %define lend dword r2m | %define lend dword r2m | ||||
| @@ -240,9 +239,8 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro FLOAT_INTERLEAVE6 2 | %macro FLOAT_INTERLEAVE6 2 | ||||
| cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 | |||||
| cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define lend r10d | |||||
| mov lend, r2d | mov lend, r2d | ||||
| %else | %else | ||||
| %define lend dword r2m | %define lend dword r2m | ||||
| @@ -91,9 +91,22 @@ SECTION .text | |||||
| %endmacro | %endmacro | ||||
| %macro chroma_mc8_mmx_func 3 | %macro chroma_mc8_mmx_func 3 | ||||
| %ifidn %2, rv40 | |||||
| %ifdef PIC | |||||
| %define rnd_1d_rv40 r8 | |||||
| %define rnd_2d_rv40 r8 | |||||
| %define extra_regs 2 | |||||
| %else ; no-PIC | |||||
| %define rnd_1d_rv40 rnd_rv40_1d_tbl | |||||
| %define rnd_2d_rv40 rnd_rv40_2d_tbl | |||||
| %define extra_regs 1 | |||||
| %endif ; PIC | |||||
| %else | |||||
| %define extra_regs 0 | |||||
| %endif ; rv40 | |||||
| ; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, | ; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, | ||||
| ; int stride, int h, int mx, int my) | ; int stride, int h, int mx, int my) | ||||
| cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 | |||||
| cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movsxd r2, r2d | movsxd r2, r2d | ||||
| %endif | %endif | ||||
| @@ -106,19 +119,12 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 | |||||
| .at_least_one_non_zero | .at_least_one_non_zero | ||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| %ifdef PIC | |||||
| %define rnd_1d_rv40 r11 | |||||
| %define rnd_2d_rv40 r11 | |||||
| %else ; no-PIC | |||||
| %define rnd_1d_rv40 rnd_rv40_1d_tbl | |||||
| %define rnd_2d_rv40 rnd_rv40_2d_tbl | |||||
| %endif | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r5 | |||||
| and r10, 6 ; &~1 for mx/my=[0,7] | |||||
| lea r10, [r10*4+r4] | |||||
| sar r10d, 1 | |||||
| %define rnd_bias r10 | |||||
| mov r7, r5 | |||||
| and r7, 6 ; &~1 for mx/my=[0,7] | |||||
| lea r7, [r7*4+r4] | |||||
| sar r7d, 1 | |||||
| %define rnd_bias r7 | |||||
| %define dest_reg r0 | %define dest_reg r0 | ||||
| %else ; x86-32 | %else ; x86-32 | ||||
| mov r0, r5 | mov r0, r5 | ||||
| @@ -145,7 +151,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 | |||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [rnd_rv40_1d_tbl] | |||||
| lea r8, [rnd_rv40_1d_tbl] | |||||
| %endif | %endif | ||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r5, r0m | mov r5, r0m | ||||
| @@ -196,7 +202,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 | |||||
| movd m6, r5d ; y | movd m6, r5d ; y | ||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [rnd_rv40_2d_tbl] | |||||
| lea r8, [rnd_rv40_2d_tbl] | |||||
| %endif | %endif | ||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r5, r0m | mov r5, r0m | ||||
| @@ -278,7 +284,13 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 | |||||
| %endmacro | %endmacro | ||||
| %macro chroma_mc4_mmx_func 3 | %macro chroma_mc4_mmx_func 3 | ||||
| cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 | |||||
| %define extra_regs 0 | |||||
| %ifidn %2, rv40 | |||||
| %ifdef PIC | |||||
| %define extra_regs 1 | |||||
| %endif ; PIC | |||||
| %endif ; rv40 | |||||
| cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movsxd r2, r2d | movsxd r2, r2d | ||||
| %endif | %endif | ||||
| @@ -296,8 +308,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 | |||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [rnd_rv40_2d_tbl] | |||||
| %define rnd_2d_rv40 r11 | |||||
| lea r6, [rnd_rv40_2d_tbl] | |||||
| %define rnd_2d_rv40 r6 | |||||
| %else | %else | ||||
| %define rnd_2d_rv40 rnd_rv40_2d_tbl | %define rnd_2d_rv40 rnd_rv40_2d_tbl | ||||
| %endif | %endif | ||||
| @@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10 | |||||
| ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| INIT_MMX | INIT_MMX | ||||
| cglobal deblock_h_luma_8_%1, 5,7 | |||||
| movsxd r10, r1d | |||||
| lea r11, [r10+r10*2] | |||||
| cglobal deblock_h_luma_8_%1, 5,9 | |||||
| movsxd r7, r1d | |||||
| lea r8, [r7+r7*2] | |||||
| lea r6, [r0-4] | lea r6, [r0-4] | ||||
| lea r5, [r0-4+r11] | |||||
| lea r5, [r0-4+r8] | |||||
| %if WIN64 | %if WIN64 | ||||
| sub rsp, 0x98 | sub rsp, 0x98 | ||||
| %define pix_tmp rsp+0x30 | %define pix_tmp rsp+0x30 | ||||
| @@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7 | |||||
| %endif | %endif | ||||
| ; transpose 6x16 -> tmp space | ; transpose 6x16 -> tmp space | ||||
| TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp | |||||
| lea r6, [r6+r10*8] | |||||
| lea r5, [r5+r10*8] | |||||
| TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 | |||||
| TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp | |||||
| lea r6, [r6+r7*8] | |||||
| lea r5, [r5+r7*8] | |||||
| TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 | |||||
| ; vertical filter | ; vertical filter | ||||
| ; alpha, beta, tc0 are still in r2d, r3d, r4 | ; alpha, beta, tc0 are still in r2d, r3d, r4 | ||||
| ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them | |||||
| ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them | |||||
| lea r0, [pix_tmp+0x30] | lea r0, [pix_tmp+0x30] | ||||
| mov r1d, 0x10 | mov r1d, 0x10 | ||||
| %if WIN64 | %if WIN64 | ||||
| @@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7 | |||||
| movq m1, [pix_tmp+0x28] | movq m1, [pix_tmp+0x28] | ||||
| movq m2, [pix_tmp+0x38] | movq m2, [pix_tmp+0x38] | ||||
| movq m3, [pix_tmp+0x48] | movq m3, [pix_tmp+0x48] | ||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) | |||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) | |||||
| shl r10, 3 | |||||
| sub r6, r10 | |||||
| sub r5, r10 | |||||
| shr r10, 3 | |||||
| shl r7, 3 | |||||
| sub r6, r7 | |||||
| sub r5, r7 | |||||
| shr r7, 3 | |||||
| movq m0, [pix_tmp+0x10] | movq m0, [pix_tmp+0x10] | ||||
| movq m1, [pix_tmp+0x20] | movq m1, [pix_tmp+0x20] | ||||
| movq m2, [pix_tmp+0x30] | movq m2, [pix_tmp+0x30] | ||||
| movq m3, [pix_tmp+0x40] | movq m3, [pix_tmp+0x40] | ||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) | |||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) | |||||
| %if WIN64 | %if WIN64 | ||||
| add rsp, 0x98 | add rsp, 0x98 | ||||
| @@ -705,32 +705,32 @@ INIT_MMX | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) | ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| cglobal deblock_h_luma_intra_8_%1, 4,7 | |||||
| movsxd r10, r1d | |||||
| lea r11, [r10*3] | |||||
| cglobal deblock_h_luma_intra_8_%1, 4,9 | |||||
| movsxd r7, r1d | |||||
| lea r8, [r7*3] | |||||
| lea r6, [r0-4] | lea r6, [r0-4] | ||||
| lea r5, [r0-4+r11] | |||||
| lea r5, [r0-4+r8] | |||||
| sub rsp, 0x88 | sub rsp, 0x88 | ||||
| %define pix_tmp rsp | %define pix_tmp rsp | ||||
| ; transpose 8x16 -> tmp space | ; transpose 8x16 -> tmp space | ||||
| TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) | |||||
| lea r6, [r6+r10*8] | |||||
| lea r5, [r5+r10*8] | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) | |||||
| lea r6, [r6+r7*8] | |||||
| lea r5, [r5+r7*8] | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) | |||||
| lea r0, [pix_tmp+0x40] | lea r0, [pix_tmp+0x40] | ||||
| mov r1, 0x10 | mov r1, 0x10 | ||||
| call deblock_v_luma_intra_8_%1 | call deblock_v_luma_intra_8_%1 | ||||
| ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) | ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) | ||||
| lea r5, [r6+r11] | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) | |||||
| shl r10, 3 | |||||
| sub r6, r10 | |||||
| sub r5, r10 | |||||
| shr r10, 3 | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) | |||||
| lea r5, [r6+r8] | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) | |||||
| shl r7, 3 | |||||
| sub r6, r7 | |||||
| sub r5, r7 | |||||
| shr r7, 3 | |||||
| TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) | |||||
| add rsp, 0x88 | add rsp, 0x88 | ||||
| RET | RET | ||||
| %else | %else | ||||
| @@ -45,8 +45,10 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 | |||||
| db 4+13*8, 5+13*8, 4+14*8, 5+14*8 | db 4+13*8, 5+13*8, 4+14*8, 5+14*8 | ||||
| db 6+13*8, 7+13*8, 6+14*8, 7+14*8 | db 6+13*8, 7+13*8, 6+14*8, 7+14*8 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| %define scan8 r11 | |||||
| %define npicregs 1 | |||||
| %define scan8 picregq | |||||
| %else | %else | ||||
| %define npicregs 0 | |||||
| %define scan8 scan8_mem | %define scan8 scan8_mem | ||||
| %endif | %endif | ||||
| @@ -301,10 +303,10 @@ cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0 | |||||
| ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16_8_mmx, 5, 7, 0 | |||||
| cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg | |||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -323,13 +325,13 @@ cglobal h264_idct_add16_8_mmx, 5, 7, 0 | |||||
| ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, | ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct8_add4_8_mmx, 5, 7, 0 | |||||
| cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg | |||||
| %assign pad 128+4-(stack_offset&7) | %assign pad 128+4-(stack_offset&7) | ||||
| SUB rsp, pad | SUB rsp, pad | ||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -355,10 +357,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0 | |||||
| ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16_8_mmx2, 5, 7, 0 | |||||
| cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -371,16 +373,13 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .no_dc | jz .no_dc | ||||
| DC_ADD_MMX2_INIT r2, r3, r6 | DC_ADD_MMX2_INIT r2, r3, r6 | ||||
| %if ARCH_X86_64 | |||||
| %define dst_reg r10 | |||||
| %define dst_regd r10d | |||||
| %else | |||||
| %define dst_reg r1 | |||||
| %define dst_regd r1d | |||||
| %if ARCH_X86_64 == 0 | |||||
| %define dst2q r1 | |||||
| %define dst2d r1d | |||||
| %endif | %endif | ||||
| mov dst_regd, dword [r1+r5*4] | |||||
| lea dst_reg, [r0+dst_reg] | |||||
| DC_ADD_MMX2_OP movh, dst_reg, r3, r6 | |||||
| mov dst2d, dword [r1+r5*4] | |||||
| lea dst2q, [r0+dst2q] | |||||
| DC_ADD_MMX2_OP movh, dst2q, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| @@ -402,10 +401,10 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 | |||||
| ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 | |||||
| cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg | |||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -425,10 +424,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 | |||||
| ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 | |||||
| cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -448,16 +447,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| DC_ADD_MMX2_INIT r2, r3, r6 | DC_ADD_MMX2_INIT r2, r3, r6 | ||||
| %if ARCH_X86_64 | |||||
| %define dst_reg r10 | |||||
| %define dst_regd r10d | |||||
| %else | |||||
| %define dst_reg r1 | |||||
| %define dst_regd r1d | |||||
| %if ARCH_X86_64 == 0 | |||||
| %define dst2q r1 | |||||
| %define dst2d r1d | |||||
| %endif | %endif | ||||
| mov dst_regd, dword [r1+r5*4] | |||||
| add dst_reg, r0 | |||||
| DC_ADD_MMX2_OP movh, dst_reg, r3, r6 | |||||
| mov dst2d, dword [r1+r5*4] | |||||
| add dst2q, r0 | |||||
| DC_ADD_MMX2_OP movh, dst2q, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| @@ -470,13 +466,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 | |||||
| ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 | |||||
| cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| %assign pad 128+4-(stack_offset&7) | %assign pad 128+4-(stack_offset&7) | ||||
| SUB rsp, pad | SUB rsp, pad | ||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -489,18 +485,15 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .no_dc | jz .no_dc | ||||
| DC_ADD_MMX2_INIT r2, r3, r6 | DC_ADD_MMX2_INIT r2, r3, r6 | ||||
| %if ARCH_X86_64 | |||||
| %define dst_reg r10 | |||||
| %define dst_regd r10d | |||||
| %else | |||||
| %define dst_reg r1 | |||||
| %define dst_regd r1d | |||||
| %endif | |||||
| mov dst_regd, dword [r1+r5*4] | |||||
| lea dst_reg, [r0+dst_reg] | |||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||||
| lea dst_reg, [dst_reg+r3*4] | |||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | |||||
| %define dst2q r1 | |||||
| %define dst2d r1d | |||||
| %endif | |||||
| mov dst2d, dword [r1+r5*4] | |||||
| lea dst2q, [r0+dst2q] | |||||
| DC_ADD_MMX2_OP mova, dst2q, r3, r6 | |||||
| lea dst2q, [dst2q+r3*4] | |||||
| DC_ADD_MMX2_OP mova, dst2q, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| @@ -533,10 +526,10 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 | |||||
| INIT_XMM | INIT_XMM | ||||
| ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct8_add4_8_sse2, 5, 7, 10 | |||||
| cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| xor r5, r5 | xor r5, r5 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| .nextblock | .nextblock | ||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| @@ -550,18 +543,15 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10 | |||||
| jz .no_dc | jz .no_dc | ||||
| INIT_MMX | INIT_MMX | ||||
| DC_ADD_MMX2_INIT r2, r3, r6 | DC_ADD_MMX2_INIT r2, r3, r6 | ||||
| %if ARCH_X86_64 | |||||
| %define dst_reg r10 | |||||
| %define dst_regd r10d | |||||
| %else | |||||
| %define dst_reg r1 | |||||
| %define dst_regd r1d | |||||
| %endif | |||||
| mov dst_regd, dword [r1+r5*4] | |||||
| add dst_reg, r0 | |||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||||
| lea dst_reg, [dst_reg+r3*4] | |||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | |||||
| %define dst2q r1 | |||||
| %define dst2d r1d | |||||
| %endif | |||||
| mov dst2d, dword [r1+r5*4] | |||||
| add dst2q, r0 | |||||
| DC_ADD_MMX2_OP mova, dst2q, r3, r6 | |||||
| lea dst2q, [dst2q+r3*4] | |||||
| DC_ADD_MMX2_OP mova, dst2q, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| @@ -572,9 +562,9 @@ INIT_MMX | |||||
| REP_RET | REP_RET | ||||
| .no_dc | .no_dc | ||||
| INIT_XMM | INIT_XMM | ||||
| mov dst_regd, dword [r1+r5*4] | |||||
| add dst_reg, r0 | |||||
| IDCT8_ADD_SSE dst_reg, r2, r3, r6 | |||||
| mov dst2d, dword [r1+r5*4] | |||||
| add dst2q, r0 | |||||
| IDCT8_ADD_SSE dst2q, r2, r3, r6 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| @@ -595,7 +585,7 @@ h264_idct_add8_mmx_plane: | |||||
| jz .skipblock | jz .skipblock | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+r5*4] | mov r0d, dword [r1+r5*4] | ||||
| add r0, [r10] | |||||
| add r0, [dst2q] | |||||
| %else | %else | ||||
| mov r0, r1m ; XXX r1m here is actually r0m of the calling func | mov r0, r1m ; XXX r1m here is actually r0m of the calling func | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -611,20 +601,20 @@ h264_idct_add8_mmx_plane: | |||||
| ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, | ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add8_8_mmx, 5, 7, 0 | |||||
| cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| mov r5, 16 | mov r5, 16 | ||||
| add r2, 512 | add r2, 512 | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov dst2q, r0 | |||||
| %endif | %endif | ||||
| call h264_idct_add8_mmx_plane | call h264_idct_add8_mmx_plane | ||||
| mov r5, 32 | mov r5, 32 | ||||
| add r2, 384 | add r2, 384 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r10, gprsize | |||||
| add dst2q, gprsize | |||||
| %else | %else | ||||
| add r0mp, gprsize | add r0mp, gprsize | ||||
| %endif | %endif | ||||
| @@ -639,7 +629,7 @@ h264_idct_add8_mmx2_plane | |||||
| jz .try_dc | jz .try_dc | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+r5*4] | mov r0d, dword [r1+r5*4] | ||||
| add r0, [r10] | |||||
| add r0, [dst2q] | |||||
| %else | %else | ||||
| mov r0, r1m ; XXX r1m here is actually r0m of the calling func | mov r0, r1m ; XXX r1m here is actually r0m of the calling func | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -658,7 +648,7 @@ h264_idct_add8_mmx2_plane | |||||
| DC_ADD_MMX2_INIT r2, r3, r6 | DC_ADD_MMX2_INIT r2, r3, r6 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+r5*4] | mov r0d, dword [r1+r5*4] | ||||
| add r0, [r10] | |||||
| add r0, [dst2q] | |||||
| %else | %else | ||||
| mov r0, r1m ; XXX r1m here is actually r0m of the calling func | mov r0, r1m ; XXX r1m here is actually r0m of the calling func | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -674,20 +664,20 @@ h264_idct_add8_mmx2_plane | |||||
| ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, | ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add8_8_mmx2, 5, 7, 0 | |||||
| cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg | |||||
| mov r5, 16 | mov r5, 16 | ||||
| add r2, 512 | add r2, 512 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov dst2q, r0 | |||||
| %endif | %endif | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r11, [scan8_mem] | |||||
| lea picregq, [scan8_mem] | |||||
| %endif | %endif | ||||
| call h264_idct_add8_mmx2_plane | call h264_idct_add8_mmx2_plane | ||||
| mov r5, 32 | mov r5, 32 | ||||
| add r2, 384 | add r2, 384 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r10, gprsize | |||||
| add dst2q, gprsize | |||||
| %else | %else | ||||
| add r0mp, gprsize | add r0mp, gprsize | ||||
| %endif | %endif | ||||
| @@ -739,7 +729,7 @@ x264_add8x4_idct_sse2: | |||||
| jz .cycle%1end | jz .cycle%1end | ||||
| mov r0d, dword [r1+%1*8] | mov r0d, dword [r1+%1*8] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r0, r10 | |||||
| add r0, r5 | |||||
| %else | %else | ||||
| add r0, r0m | add r0, r0m | ||||
| %endif | %endif | ||||
| @@ -752,9 +742,9 @@ x264_add8x4_idct_sse2: | |||||
| ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16_8_sse2, 5, 5, 8 | |||||
| cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov r5, r0 | |||||
| %endif | %endif | ||||
| ; unrolling of the loop leads to an average performance gain of | ; unrolling of the loop leads to an average performance gain of | ||||
| ; 20-25% | ; 20-25% | ||||
| @@ -774,7 +764,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 | |||||
| jz .try%1dc | jz .try%1dc | ||||
| mov r0d, dword [r1+%1*8] | mov r0d, dword [r1+%1*8] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r0, r10 | |||||
| add r0, r7 | |||||
| %else | %else | ||||
| add r0, r0m | add r0, r0m | ||||
| %endif | %endif | ||||
| @@ -786,7 +776,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 | |||||
| jz .cycle%1end | jz .cycle%1end | ||||
| mov r0d, dword [r1+%1*8] | mov r0d, dword [r1+%1*8] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r0, r10 | |||||
| add r0, r7 | |||||
| %else | %else | ||||
| add r0, r0m | add r0, r0m | ||||
| %endif | %endif | ||||
| @@ -799,9 +789,9 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8 | |||||
| ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 | |||||
| cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov r7, r0 | |||||
| %endif | %endif | ||||
| add16intra_sse2_cycle 0, 0xc | add16intra_sse2_cycle 0, 0xc | ||||
| add16intra_sse2_cycle 1, 0x14 | add16intra_sse2_cycle 1, 0x14 | ||||
| @@ -819,7 +809,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 | |||||
| jz .try%1dc | jz .try%1dc | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | ||||
| add r0, [r10] | |||||
| add r0, [r7] | |||||
| %else | %else | ||||
| mov r0, r0m | mov r0, r0m | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -833,7 +823,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 | |||||
| jz .cycle%1end | jz .cycle%1end | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | ||||
| add r0, [r10] | |||||
| add r0, [r7] | |||||
| %else | %else | ||||
| mov r0, r0m | mov r0, r0m | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -850,15 +840,15 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 | |||||
| ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, | ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, | ||||
| ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) | ||||
| cglobal h264_idct_add8_8_sse2, 5, 7, 8 | |||||
| cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8 | |||||
| add r2, 512 | add r2, 512 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov r7, r0 | |||||
| %endif | %endif | ||||
| add8_sse2_cycle 0, 0x34 | add8_sse2_cycle 0, 0x34 | ||||
| add8_sse2_cycle 1, 0x3c | add8_sse2_cycle 1, 0x3c | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r10, gprsize | |||||
| add r7, gprsize | |||||
| %else | %else | ||||
| add r0mp, gprsize | add r0mp, gprsize | ||||
| %endif | %endif | ||||
| @@ -29,24 +29,6 @@ SECTION_RODATA | |||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | pw_pixel_max: times 8 dw ((1 << 10)-1) | ||||
| pd_32: times 4 dd 32 | pd_32: times 4 dd 32 | ||||
| scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 | |||||
| db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 | |||||
| db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 | |||||
| db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 | |||||
| db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 | |||||
| db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 | |||||
| db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 | |||||
| db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 | |||||
| db 4+11*8, 5+11*8, 4+12*8, 5+12*8 | |||||
| db 6+11*8, 7+11*8, 6+12*8, 7+12*8 | |||||
| db 4+13*8, 5+13*8, 4+14*8, 5+14*8 | |||||
| db 6+13*8, 7+13*8, 6+14*8, 7+14*8 | |||||
| %ifdef PIC | |||||
| %define scan8 r11 | |||||
| %else | |||||
| %define scan8 scan8_mem | |||||
| %endif | |||||
| SECTION .text | SECTION .text | ||||
| @@ -315,9 +297,9 @@ IDCT_ADD16INTRA_10 avx | |||||
| ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) | ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro IDCT_ADD8 1 | %macro IDCT_ADD8 1 | ||||
| cglobal h264_idct_add8_10_%1,5,7,7 | |||||
| cglobal h264_idct_add8_10_%1,5,8,7 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov r7, r0 | |||||
| %endif | %endif | ||||
| add r2, 1024 | add r2, 1024 | ||||
| mov r0, [r0] | mov r0, [r0] | ||||
| @@ -325,7 +307,7 @@ cglobal h264_idct_add8_10_%1,5,7,7 | |||||
| ADD16_OP_INTRA %1, 18, 4+ 7*8 | ADD16_OP_INTRA %1, 18, 4+ 7*8 | ||||
| add r2, 1024-128*2 | add r2, 1024-128*2 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0, [r10+gprsize] | |||||
| mov r0, [r7+gprsize] | |||||
| %else | %else | ||||
| mov r0, r0m | mov r0, r0m | ||||
| mov r0, [r0+gprsize] | mov r0, [r0+gprsize] | ||||
| @@ -289,7 +289,7 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro H264_PRED16x16_PLANE 3 | %macro H264_PRED16x16_PLANE 3 | ||||
| cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| cglobal pred16x16_plane_%3_%1, 2, 9, %2 | |||||
| mov r2, r1 ; +stride | mov r2, r1 ; +stride | ||||
| neg r1 ; -stride | neg r1 ; -stride | ||||
| @@ -349,7 +349,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| add r4, r2 | add r4, r2 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define e_reg r11 | |||||
| %define e_reg r8 | |||||
| %else | %else | ||||
| %define e_reg r0 | %define e_reg r0 | ||||
| %endif | %endif | ||||
| @@ -370,8 +370,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| movzx e_reg, byte [r3 ] | movzx e_reg, byte [r3 ] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movzx r10, byte [r4+r2 ] | |||||
| sub r10, e_reg | |||||
| movzx r7, byte [r4+r2 ] | |||||
| sub r7, e_reg | |||||
| %else | %else | ||||
| movzx r6, byte [r4+r2 ] | movzx r6, byte [r4+r2 ] | ||||
| sub r6, e_reg | sub r6, e_reg | ||||
| @@ -386,7 +386,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| movzx r6, byte [r3 ] | movzx r6, byte [r3 ] | ||||
| sub r6, r4 | sub r6, r4 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| lea r6, [r10+r6*2] | |||||
| lea r6, [r7+r6*2] | |||||
| lea r5, [r5+r6*2] | lea r5, [r5+r6*2] | ||||
| add r5, r6 | add r5, r6 | ||||
| %else | %else | ||||
| @@ -396,9 +396,9 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| movzx r4, byte [e_reg ] | movzx r4, byte [e_reg ] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movzx r10, byte [r3 +r2 ] | |||||
| sub r10, r4 | |||||
| sub r5, r10 | |||||
| movzx r7, byte [r3 +r2 ] | |||||
| sub r7, r4 | |||||
| sub r5, r7 | |||||
| %else | %else | ||||
| movzx r6, byte [r3 +r2 ] | movzx r6, byte [r3 +r2 ] | ||||
| sub r6, r4 | sub r6, r4 | ||||
| @@ -410,7 +410,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2 | |||||
| movzx r6, byte [r3 +r2*2] | movzx r6, byte [r3 +r2*2] | ||||
| sub r6, r4 | sub r6, r4 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r6, r10 | |||||
| add r6, r7 | |||||
| %endif | %endif | ||||
| lea r5, [r5+r6*8] | lea r5, [r5+r6*8] | ||||
| @@ -588,7 +588,7 @@ H264_PRED16x16_PLANE ssse3, 8, svq3 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro H264_PRED8x8_PLANE 2 | %macro H264_PRED8x8_PLANE 2 | ||||
| cglobal pred8x8_plane_%1, 2, 7, %2 | |||||
| cglobal pred8x8_plane_%1, 2, 9, %2 | |||||
| mov r2, r1 ; +stride | mov r2, r1 ; +stride | ||||
| neg r1 ; -stride | neg r1 ; -stride | ||||
| @@ -642,7 +642,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2 | |||||
| add r4, r2 | add r4, r2 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define e_reg r11 | |||||
| %define e_reg r8 | |||||
| %else | %else | ||||
| %define e_reg r0 | %define e_reg r0 | ||||
| %endif | %endif | ||||
| @@ -653,9 +653,9 @@ cglobal pred8x8_plane_%1, 2, 7, %2 | |||||
| movzx e_reg, byte [r3 ] | movzx e_reg, byte [r3 ] | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movzx r10, byte [r4+r2 ] | |||||
| sub r10, e_reg | |||||
| sub r5, r10 | |||||
| movzx r7, byte [r4+r2 ] | |||||
| sub r7, e_reg | |||||
| sub r5, r7 | |||||
| %else | %else | ||||
| movzx r6, byte [r4+r2 ] | movzx r6, byte [r4+r2 ] | ||||
| sub r6, e_reg | sub r6, e_reg | ||||
| @@ -667,7 +667,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2 | |||||
| movzx r6, byte [r4+r2*2 ] | movzx r6, byte [r4+r2*2 ] | ||||
| sub r6, e_reg | sub r6, e_reg | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| add r6, r10 | |||||
| add r6, r7 | |||||
| %endif | %endif | ||||
| lea r5, [r5+r6*4] | lea r5, [r5+r6*4] | ||||
| @@ -121,8 +121,8 @@ MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 | |||||
| %endmacro | %endmacro | ||||
| %macro MCAxA_OP 8 | %macro MCAxA_OP 8 | ||||
| cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 | |||||
| %if ARCH_X86_32 | %if ARCH_X86_32 | ||||
| cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 | |||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| mov r0, r0m | mov r0, r0m | ||||
| mov r1, r1m | mov r1, r1m | ||||
| @@ -141,17 +141,19 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 | |||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| RET | RET | ||||
| %else ; ARCH_X86_64 | %else ; ARCH_X86_64 | ||||
| mov r10, r0 | |||||
| mov r11, r1 | |||||
| cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8 | |||||
| mov r%7, r0 | |||||
| %assign p1 %7+1 | |||||
| mov r %+ p1, r1 | |||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| lea r0, [r10+%4*2] | |||||
| lea r1, [r11+%4*2] | |||||
| lea r0, [r%7+%4*2] | |||||
| lea r1, [r %+ p1+%4*2] | |||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| lea r0, [r10+r2*%4] | |||||
| lea r1, [r11+r2*%4] | |||||
| lea r0, [r%7+r2*%4] | |||||
| lea r1, [r %+ p1+r2*%4] | |||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| lea r0, [r10+r2*%4+%4*2] | |||||
| lea r1, [r11+r2*%4+%4*2] | |||||
| lea r0, [r%7+r2*%4+%4*2] | |||||
| lea r1, [r %+ p1+r2*%4+%4*2] | |||||
| %if UNIX64 == 0 ; fall through to function | %if UNIX64 == 0 ; fall through to function | ||||
| call stub_%2_h264_qpel%4_%3_10_%1 | call stub_%2_h264_qpel%4_%3_10_%1 | ||||
| RET | RET | ||||
| @@ -127,7 +127,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %macro BIWEIGHT_SETUP 0 | %macro BIWEIGHT_SETUP 0 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define off_regd r11d | |||||
| %define off_regd r7d | |||||
| %else | %else | ||||
| %define off_regd r3d | %define off_regd r3d | ||||
| %endif | %endif | ||||
| @@ -175,7 +175,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| cglobal h264_biweight_16_mmx2, 7, 7, 0 | |||||
| cglobal h264_biweight_16_mmx2, 7, 8, 0 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| .nextrow | .nextrow | ||||
| @@ -194,7 +194,7 @@ cglobal h264_biweight_16_mmx2, 7, 7, 0 | |||||
| REP_RET | REP_RET | ||||
| %macro BIWEIGHT_FUNC_MM 3 | %macro BIWEIGHT_FUNC_MM 3 | ||||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||||
| cglobal h264_biweight_%1_%3, 7, 8, %2 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| .nextrow | .nextrow | ||||
| @@ -215,7 +215,7 @@ INIT_XMM | |||||
| BIWEIGHT_FUNC_MM 16, 8, sse2 | BIWEIGHT_FUNC_MM 16, 8, sse2 | ||||
| %macro BIWEIGHT_FUNC_HALF_MM 3 | %macro BIWEIGHT_FUNC_HALF_MM 3 | ||||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||||
| cglobal h264_biweight_%1_%3, 7, 8, %2 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| sar r3, 1 | sar r3, 1 | ||||
| @@ -245,7 +245,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %macro BIWEIGHT_SSSE3_SETUP 0 | %macro BIWEIGHT_SSSE3_SETUP 0 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| %define off_regd r11d | |||||
| %define off_regd r7d | |||||
| %else | %else | ||||
| %define off_regd r3d | %define off_regd r3d | ||||
| %endif | %endif | ||||
| @@ -277,7 +277,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| cglobal h264_biweight_16_ssse3, 7, 7, 8 | |||||
| cglobal h264_biweight_16_ssse3, 7, 8, 8 | |||||
| BIWEIGHT_SSSE3_SETUP | BIWEIGHT_SSSE3_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| @@ -296,7 +296,7 @@ cglobal h264_biweight_16_ssse3, 7, 7, 8 | |||||
| REP_RET | REP_RET | ||||
| INIT_XMM | INIT_XMM | ||||
| cglobal h264_biweight_8_ssse3, 7, 7, 8 | |||||
| cglobal h264_biweight_8_ssse3, 7, 8, 8 | |||||
| BIWEIGHT_SSSE3_SETUP | BIWEIGHT_SSSE3_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| sar r3, 1 | sar r3, 1 | ||||
| @@ -1,11 +1,12 @@ | |||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* x86inc.asm: x264asm abstraction layer | ;* x86inc.asm: x264asm abstraction layer | ||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* Copyright (C) 2005-2011 x264 project | |||||
| ;* Copyright (C) 2005-2012 x264 project | |||||
| ;* | ;* | ||||
| ;* Authors: Loren Merritt <lorenm@u.washington.edu> | ;* Authors: Loren Merritt <lorenm@u.washington.edu> | ||||
| ;* Anton Mitrofanov <BugMaster@narod.ru> | ;* Anton Mitrofanov <BugMaster@narod.ru> | ||||
| ;* Jason Garrett-Glaser <darkshikari@gmail.com> | ;* Jason Garrett-Glaser <darkshikari@gmail.com> | ||||
| ;* Henrik Gramner <hengar-6@student.ltu.se> | |||||
| ;* | ;* | ||||
| ;* Permission to use, copy, modify, and/or distribute this software for any | ;* Permission to use, copy, modify, and/or distribute this software for any | ||||
| ;* purpose with or without fee is hereby granted, provided that the above | ;* purpose with or without fee is hereby granted, provided that the above | ||||
@@ -95,6 +96,9 @@
    default rel
%endif
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
@@ -128,18 +132,20 @@
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
-%macro DECLARE_REG 6
+%macro DECLARE_REG 5-6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
-    %define r%1m %6
-    %ifid %6 ; i.e. it's a register
+    %if %0 == 5
+        %define r%1m %3
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
-        %define r%1mp qword %6
+        %define r%1m [rsp + stack_offset + %6]
+        %define r%1mp qword r %+ %1m
    %else
-        %define r%1mp dword %6
+        %define r%1m [esp + stack_offset + %6]
+        %define r%1mp dword r %+ %1m
    %endif
    %define r%1 %2
%endmacro
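For reference, the new forms expand roughly as follows (a hand-worked sketch, not text from the patch): the optional sixth argument is now just a stack offset from which r%1m/r%1mp are built, while the five-argument form marks the argument as register-resident.

    ; DECLARE_REG 4, R10, R10D, R10W, R10B, 40   ; 6-arg form (WIN64 arg 4)
    ;   r4   -> R10,  r4d -> R10D
    ;   r4m  -> [rsp + stack_offset + 40]        ; the argument's home stack slot
    ;   r4mp -> qword r4m
    ; DECLARE_REG 0, rcx, ecx, cx, cl            ; 5-arg form: register-resident
    ;   r0m  -> ecx,  r0mp -> rcx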
@@ -187,7 +193,7 @@ DECLARE_REG_SIZE bp, bpl
    %endrep
%endmacro

-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
@@ -205,6 +211,33 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
    %assign stack_offset stack_offset-gprsize
%endmacro

+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
@@ -272,39 +305,34 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%if WIN64 ; Windows x64 ;=================================================

-DECLARE_REG 0, rcx, ecx, cx, cl, ecx
-DECLARE_REG 1, rdx, edx, dx, dl, edx
-DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
-%define r7m [rsp + stack_offset + 64]
-%define r8m [rsp + stack_offset + 72]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
-    %if %1 < %2
-        mov r%1, [rsp + stack_offset + 8 + %1*8]
-    %endif
-%endmacro
+DECLARE_REG 0, rcx, ecx, cx, cl
+DECLARE_REG 1, rdx, edx, dx, dl
+DECLARE_REG 2, R8, R8D, R8W, R8B
+DECLARE_REG 3, R9, R9D, R9W, R9B
+DECLARE_REG 4, R10, R10D, R10W, R10B, 40
+DECLARE_REG 5, R11, R11D, R11W, R11B, 48
+DECLARE_REG 6, rax, eax, ax, al, 56
+DECLARE_REG 7, rdi, edi, di, dil, 64
+DECLARE_REG 8, rsi, esi, si, sil, 72
+DECLARE_REG 9, rbx, ebx, bx, bl, 80
+DECLARE_REG 10, rbp, ebp, bp, bpl, 88
+DECLARE_REG 11, R12, R12D, R12W, R12B, 96
+DECLARE_REG 12, R13, R13D, R13W, R13B, 104
+DECLARE_REG 13, R14, R14D, R14W, R14B, 112
+DECLARE_REG 14, R15, R15D, R15W, R15B, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
-    ASSERT %2 >= %1
+    %assign num_args %1
    %assign regs_used %2
-    ASSERT regs_used <= 7
-    %if regs_used > 4
-        push r4
-        push r5
-        %assign stack_offset stack_offset+16
-    %endif
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
-    LOAD_IF_USED 4, %1
-    LOAD_IF_USED 5, %1
-    LOAD_IF_USED 6, %1
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro
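To make the effect concrete (an approximate expansion worked out by hand, not output of the patch), a WIN64 declaration such as cglobal foo, 6, 9, 0 now yields roughly:

    ;   push rdi              ; r7 is used (7 < regs_used) and callee-saved on WIN64
    ;   push rsi              ; r8
    ;   mov  r10, [rsp+56]    ; arg 4: stack slot 40 plus 16 bytes of pushes
    ;   mov  r11, [rsp+64]    ; arg 5
    ; ...and the matching RET pops rsi/rdi again before returning.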
@@ -312,12 +340,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
-        sub rsp, (xmm_regs_used-6)*16+16
-        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
-            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro
@@ -327,7 +354,7 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
    %assign %%i xmm_regs_used
    %rep (xmm_regs_used-6)
        %assign %%i %%i-1
-        movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+        movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
    %endrep
    add %1, (xmm_regs_used-6)*16+16
%endif
@@ -341,15 +368,12 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
-    %if regs_used > 4
-        pop r5
-        pop r4
-    %endif
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    ret
%endmacro

%macro REP_RET 0
-    %if regs_used > 4 || xmm_regs_used > 6
+    %if regs_used > 7 || xmm_regs_used > 6
        RET
    %else
        rep ret
| @@ -358,92 +382,80 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] | |||||
| %elif ARCH_X86_64 ; *nix x64 ;============================================= | %elif ARCH_X86_64 ; *nix x64 ;============================================= | ||||
| DECLARE_REG 0, rdi, edi, di, dil, edi | |||||
| DECLARE_REG 1, rsi, esi, si, sil, esi | |||||
| DECLARE_REG 2, rdx, edx, dx, dl, edx | |||||
| DECLARE_REG 3, rcx, ecx, cx, cl, ecx | |||||
| DECLARE_REG 4, r8, r8d, r8w, r8b, r8d | |||||
| DECLARE_REG 5, r9, r9d, r9w, r9b, r9d | |||||
| DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] | |||||
| %define r7m [rsp + stack_offset + 16] | |||||
| %define r8m [rsp + stack_offset + 24] | |||||
| %macro LOAD_IF_USED 2 ; reg_id, number_of_args | |||||
| %if %1 < %2 | |||||
| mov r%1, [rsp - 40 + %1*8] | |||||
| %endif | |||||
| %endmacro | |||||
| DECLARE_REG 0, rdi, edi, di, dil | |||||
| DECLARE_REG 1, rsi, esi, si, sil | |||||
| DECLARE_REG 2, rdx, edx, dx, dl | |||||
| DECLARE_REG 3, rcx, ecx, cx, cl | |||||
| DECLARE_REG 4, R8, R8D, R8W, R8B | |||||
| DECLARE_REG 5, R9, R9D, R9W, R9B | |||||
| DECLARE_REG 6, rax, eax, ax, al, 8 | |||||
| DECLARE_REG 7, R10, R10D, R10W, R10B, 16 | |||||
| DECLARE_REG 8, R11, R11D, R11W, R11B, 24 | |||||
| DECLARE_REG 9, rbx, ebx, bx, bl, 32 | |||||
| DECLARE_REG 10, rbp, ebp, bp, bpl, 40 | |||||
| DECLARE_REG 11, R12, R12D, R12W, R12B, 48 | |||||
| DECLARE_REG 12, R13, R13D, R13W, R13B, 56 | |||||
| DECLARE_REG 13, R14, R14D, R14W, R14B, 64 | |||||
| DECLARE_REG 14, R15, R15D, R15W, R15B, 72 | |||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | ||||
| ASSERT %2 >= %1 | |||||
| ASSERT %2 <= 7 | |||||
| LOAD_IF_USED 6, %1 | |||||
| %assign num_args %1 | |||||
| %assign regs_used %2 | |||||
| ASSERT regs_used >= num_args | |||||
| ASSERT regs_used <= 15 | |||||
| PUSH_IF_USED 9, 10, 11, 12, 13, 14 | |||||
| LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 | |||||
| DEFINE_ARGS %4 | DEFINE_ARGS %4 | ||||
| %endmacro | %endmacro | ||||
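On the UNIX64 side the practical consequence of numbering the caller-saved registers first (r0-r8 map to rdi, rsi, rdx, rcx, R8, R9, rax, R10, R11) is that a function may ask for up to 9 GPRs without a single push. A minimal hypothetical sketch (function name, prototype and alignment assumptions are illustrative; x86inc.asm assumed included):

INIT_XMM
cglobal add_const, 3, 8, 2, dst, src, w
    movsxd wq, wd               ; w passed as a 32-bit int
    mov    r7d, 42              ; r7 (R10 here) is plain scratch: no push was emitted
    movd   m0, r7d
    pshufd m0, m0, 0            ; broadcast the constant (SSE2)
    lea    srcq, [srcq+wq*4]
    lea    dstq, [dstq+wq*4]
    neg    wq
.loop:
    mova   m1, [srcq+wq*4]      ; buffers assumed 16-byte aligned,
    paddd  m1, m0               ; w assumed a positive multiple of 4
    mova   [dstq+wq*4], m1
    add    wq, mmsize/4
    jl     .loop
    REP_RET                     ; regs_used <= 9, so this stays a bare rep ret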
| %macro RET 0 | %macro RET 0 | ||||
| POP_IF_USED 14, 13, 12, 11, 10, 9 | |||||
| ret | ret | ||||
| %endmacro | %endmacro | ||||
| %macro REP_RET 0 | %macro REP_RET 0 | ||||
| rep ret | |||||
| %if regs_used > 9 | |||||
| RET | |||||
| %else | |||||
| rep ret | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %else ; X86_32 ;============================================================== | %else ; X86_32 ;============================================================== | ||||
| DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] | |||||
| DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] | |||||
| DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] | |||||
| DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] | |||||
| DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] | |||||
| DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] | |||||
| DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] | |||||
| %define r7m [esp + stack_offset + 32] | |||||
| %define r8m [esp + stack_offset + 36] | |||||
| DECLARE_REG 0, eax, eax, ax, al, 4 | |||||
| DECLARE_REG 1, ecx, ecx, cx, cl, 8 | |||||
| DECLARE_REG 2, edx, edx, dx, dl, 12 | |||||
| DECLARE_REG 3, ebx, ebx, bx, bl, 16 | |||||
| DECLARE_REG 4, esi, esi, si, null, 20 | |||||
| DECLARE_REG 5, edi, edi, di, null, 24 | |||||
| DECLARE_REG 6, ebp, ebp, bp, null, 28 | |||||
| %define rsp esp | %define rsp esp | ||||
| %macro PUSH_IF_USED 1 ; reg_id | |||||
| %if %1 < regs_used | |||||
| push r%1 | |||||
| %assign stack_offset stack_offset+4 | |||||
| %endif | |||||
| %endmacro | |||||
| %macro POP_IF_USED 1 ; reg_id | |||||
| %if %1 < regs_used | |||||
| pop r%1 | |||||
| %endif | |||||
| %macro DECLARE_ARG 1-* | |||||
| %rep %0 | |||||
| %define r%1m [esp + stack_offset + 4*%1 + 4] | |||||
| %define r%1mp dword r%1m | |||||
| %rotate 1 | |||||
| %endrep | |||||
| %endmacro | %endmacro | ||||
| %macro LOAD_IF_USED 2 ; reg_id, number_of_args | |||||
| %if %1 < %2 | |||||
| mov r%1, [esp + stack_offset + 4 + %1*4] | |||||
| %endif | |||||
| %endmacro | |||||
| DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | |||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | ||||
| ASSERT %2 >= %1 | |||||
| %assign num_args %1 | |||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| ASSERT regs_used <= 7 | |||||
| PUSH_IF_USED 3 | |||||
| PUSH_IF_USED 4 | |||||
| PUSH_IF_USED 5 | |||||
| PUSH_IF_USED 6 | |||||
| LOAD_IF_USED 0, %1 | |||||
| LOAD_IF_USED 1, %1 | |||||
| LOAD_IF_USED 2, %1 | |||||
| LOAD_IF_USED 3, %1 | |||||
| LOAD_IF_USED 4, %1 | |||||
| LOAD_IF_USED 5, %1 | |||||
| LOAD_IF_USED 6, %1 | |||||
| %if regs_used > 7 | |||||
| %assign regs_used 7 | |||||
| %endif | |||||
| ASSERT regs_used >= num_args | |||||
| PUSH_IF_USED 3, 4, 5, 6 | |||||
| LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 | |||||
| DEFINE_ARGS %4 | DEFINE_ARGS %4 | ||||
| %endmacro | %endmacro | ||||
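On x86-32 only seven GPRs are addressable, so the PROLOGUE above clamps regs_used to 7 and anything beyond the named arguments is reached through the r7m..r14m stack defines created by DECLARE_ARG. A hypothetical sketch, assuming a C prototype whose 8th parameter is a rounding constant (names and prototype are illustrative; x86inc.asm assumed included):

cglobal scale_row, 3, 5, 0, dst, src, w
    mov    r4d, r7m             ; 8th argument: [esp + stack_offset + 4*7 + 4]
.loop:
    movzx  r3d, byte [srcq]
    add    r3d, r4d             ; src[i] + rnd
    shr    r3d, 1
    mov    [dstq], r3b
    inc    srcq
    inc    dstq
    dec    wd
    jg     .loop
    REP_RET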
| %macro RET 0 | %macro RET 0 | ||||
| POP_IF_USED 6 | |||||
| POP_IF_USED 5 | |||||
| POP_IF_USED 4 | |||||
| POP_IF_USED 3 | |||||
| POP_IF_USED 6, 5, 4, 3 | |||||
| ret | ret | ||||
| %endmacro | %endmacro | ||||
| @@ -464,8 +476,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] | |||||
| %endmacro | %endmacro | ||||
| %endif | %endif | ||||
| ;============================================================================= | ;============================================================================= | ||||
| ; arch-independent part | ; arch-independent part | ||||
| ;============================================================================= | ;============================================================================= | ||||
| @@ -62,11 +62,11 @@ SECTION .text | |||||
| %define cntr_reg fltsizeq | %define cntr_reg fltsizeq | ||||
| %define movsx mov | %define movsx mov | ||||
| %else | %else | ||||
| %define cntr_reg r11 | |||||
| %define cntr_reg r7 | |||||
| %define movsx movsxd | %define movsx movsxd | ||||
| %endif | %endif | ||||
| cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset | |||||
| cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset | |||||
| %if %1 == 8 || %1 == 9 || %1 == 10 | %if %1 == 8 || %1 == 9 || %1 == 10 | ||||
| pxor m6, m6 | pxor m6, m6 | ||||
| %endif ; %1 == 8/9/10 | %endif ; %1 == 8/9/10 | ||||
| @@ -53,7 +53,7 @@ SECTION .text | |||||
| %ifnidn %3, X | %ifnidn %3, X | ||||
| cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 | cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 | ||||
| %else | %else | ||||
| cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize | |||||
| cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize | |||||
| %endif | %endif | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| movsxd wq, wd | movsxd wq, wd | ||||
| @@ -245,10 +245,9 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz | |||||
| %define dlt 0 | %define dlt 0 | ||||
| %endif ; %4 ==/!= X4 | %endif ; %4 ==/!= X4 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| push r12 | |||||
| %define srcq r11 | |||||
| %define pos1q r10 | |||||
| %define srcendq r12 | |||||
| %define srcq r8 | |||||
| %define pos1q r7 | |||||
| %define srcendq r9 | |||||
| movsxd fltsizeq, fltsized ; filterSize | movsxd fltsizeq, fltsized ; filterSize | ||||
| lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] | lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] | ||||
| %else ; x86-32 | %else ; x86-32 | ||||
| @@ -388,16 +387,7 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz | |||||
| add wq, 2 | add wq, 2 | ||||
| %endif ; %3 ==/!= X | %endif ; %3 ==/!= X | ||||
| jl .loop | jl .loop | ||||
| %ifnidn %3, X | |||||
| REP_RET | REP_RET | ||||
| %else ; %3 == X | |||||
| %if ARCH_X86_64 | |||||
| pop r12 | |||||
| RET | |||||
| %else ; x86-32 | |||||
| REP_RET | |||||
| %endif ; x86-32/64 | |||||
| %endif ; %3 ==/!= X | |||||
| %endmacro | %endmacro | ||||
| ; SCALE_FUNCS source_width, intermediate_nbits, n_xmm | ; SCALE_FUNCS source_width, intermediate_nbits, n_xmm | ||||