@@ -99,35 +99,33 @@ SECTION .text
    paddusw m0, m1
%endmacro
; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
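; (rough bound, for scale: the unnormalized 2D 8x8 Hadamard multiplies the L2
;  norm by 8, so with per-pixel differences of at most 255 the sum of absolute
;  coefficients is bounded by 8*8*8*255 = 130560; extreme inputs reach roughly
;  100k, well past the 64k point where paddusw saturates.)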
%macro HSUM_MMX 3
    mova %2, %1
    psrlq %1, 32
%macro HSUM 3
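; %1 = packed words to reduce (clobbered), %2 = scratch register,
; %3 = GPR destination; adds the words horizontally with unsigned saturation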
%if cpuflag(sse2)
    movhlps %2, %1
    paddusw %1, %2
    mova %2, %1
    psrlq %1, 16
    pshuflw %2, %1, 0xE
    paddusw %1, %2
    pshuflw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%endmacro
%macro HSUM_MMXEXT 3
%elif cpuflag(mmxext)
    pshufw %2, %1, 0xE
    paddusw %1, %2
    pshufw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%endmacro
%macro HSUM_SSE2 3
    movhlps %2, %1
    paddusw %1, %2
    pshuflw %2, %1, 0xE
%elif cpuflag(mmx)
    mova %2, %1
    psrlq %1, 32
    paddusw %1, %2
    pshuflw %2, %1, 0x1
    mova %2, %1
    psrlq %1, 16
    paddusw %1, %2
    movd %3, %1
%endif
%endmacro
%macro STORE4 5
@@ -144,30 +142,30 @@ SECTION .text
    mova %5, [%1+mmsize*3]
%endmacro
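; (hadamard8_16_wrapper emits the hadamard8_diff and hadamard8_diff16 entry
;  points around the hadamard8x8_diff core; its numeric arguments are the XMM
;  register count declared to cglobal and the stack scratch space to reserve,
;  in units of mmsize)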
%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif
    call hadamard8x8_diff_%1
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD rsp, pad
%endif
    RET
cglobal hadamard8_diff16_%1, 5, 6, %2
cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif
    call hadamard8x8_diff_%1
    call hadamard8x8_diff %+ SUFFIX
    mov r5d, eax
    add r1, 8
    add r2, 8
    call hadamard8x8_diff_%1
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax
    cmp r4d, 16
@@ -175,12 +173,12 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
    lea r1, [r1+r3*8-8]
    lea r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax
    add r1, 8
    add r2, 8
    call hadamard8x8_diff_%1
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax
.done:
@@ -191,7 +189,25 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
    RET
%endmacro
%macro HADAMARD8_DIFF_MMX 1
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8 rsp+gprsize
    HSUM m0, m1, eax
    and eax, 0xFFFF
    ret
hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
@@ -199,7 +215,7 @@ ALIGN 16
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
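; (in effect, the hadamard8_diff16 wrapper above sums the return value of this
;  routine over the 8x8 sub-blocks at offsets 0 and 8 of src1/src2, plus the
;  ones at 8*stride and 8*stride+8 when h is 16)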
hadamard8x8_diff_%1:
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]
    ; first 4x8 pixels
@@ -236,53 +252,35 @@ hadamard8x8_diff_%1:
    and rax, 0xFFFF
    ret
hadamard8_16_wrapper %1, 0, 14
%endmacro
%macro HADAMARD8_DIFF_SSE2 2
hadamard8x8_diff_%1:
    lea r0, [r3*3]
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
hadamard8_16_wrapper 0, 14
%endif
    HADAMARD8
    ABS_SUM_8x8 rsp+gprsize
    HSUM_SSE2 m0, m1, eax
    and eax, 0xFFFF
    ret
hadamard8_16_wrapper %1, %2, 3
%endmacro
INIT_MMX
INIT_MMX mmx
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx
HADAMARD8_DIFF
INIT_MMX mmxext
%define ABS1 ABS1_MMXEXT
%define HSUM HSUM_MMXEXT
HADAMARD8_DIFF_MMX mmxext
HADAMARD8_DIFF
INIT_XMM
INIT_XMM sse2
%define ABS2 ABS2_MMXEXT
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10
HADAMARD8_DIFF 10
INIT_XMM ssse3
%define ABS2 ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9
HADAMARD8_DIFF 9
INIT_XMM
INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
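; (sum of squared differences between pix1 and pix2 over a 16-pixel-wide block
;  of h rows; h is halved up front since each iteration of the loop below
;  handles two rows)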
cglobal sse16_sse2, 5, 5, 8
cglobal sse16, 5, 5, 8
    shr r4d, 1
    pxor m0, m0 ; mm0 = 0
    pxor m7, m7 ; mm7 holds the sum