tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv
MMX: ~30% faster decoding overall
SSE2: ~40% faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.4
@@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, | |||
return ret; | |||
} | |||
static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) | |||
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) | |||
{ | |||
int x; | |||
int ret=0; | |||
@@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref) | |||
if (!idet->csp) | |||
idet->csp = av_pix_fmt_desc_get(link->format); | |||
if (idet->csp->comp[0].depth_minus1 / 8 == 1) | |||
idet->filter_line = (void*)filter_line_c_16bit; | |||
if (idet->csp->comp[0].depth_minus1 / 8 == 1){ | |||
idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit; | |||
if (ARCH_X86) | |||
ff_idet_init_x86(idet, 1); | |||
} | |||
filter(ctx); | |||
@@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx) | |||
idet->filter_line = ff_idet_filter_line_c; | |||
if (ARCH_X86) | |||
ff_idet_init_x86(idet); | |||
ff_idet_init_x86(idet, 0); | |||
return 0; | |||
} | |||
@@ -24,6 +24,8 @@ | |||
#define HIST_SIZE 4 | |||
/*
 * Signature of a line filter: takes three line pointers (a, b, c) and a
 * width w and returns an int. This is the type of IDETContext.filter_line.
 * The 16-bit implementations take const uint16_t * arguments and are cast
 * to this type when they are stored in that field.
 */
typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); | |||
typedef enum { | |||
TFF, | |||
BFF, | |||
@@ -45,14 +47,15 @@ typedef struct { | |||
AVFrame *cur; | |||
AVFrame *next; | |||
AVFrame *prev; | |||
int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); | |||
ff_idet_filter_func filter_line; | |||
const AVPixFmtDescriptor *csp; | |||
} IDETContext; | |||
void ff_idet_init_x86(IDETContext *idet); | |||
void ff_idet_init_x86(IDETContext *idet, int for_16b); | |||
/* main fall-back for left-over */ | |||
int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); | |||
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w); | |||
#endif |
@@ -25,8 +25,6 @@ | |||
SECTION_TEXT | |||
%if ARCH_X86_32 | |||
; Implementation that does 8-bytes at a time using single-word operations. | |||
%macro IDET_FILTER_LINE 1 | |||
INIT_MMX %1 | |||
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index | |||
RET | |||
%endmacro | |||
%if ARCH_X86_32 | |||
IDET_FILTER_LINE mmxext | |||
IDET_FILTER_LINE mmx | |||
%endif | |||
;****************************************************************************** | |||
; 16bit implementation that does 4/8-pixels at a time | |||
; Absolute difference of two vectors of unsigned 16-bit words, widened and
; folded into 32-bit lanes.
;   %1 = a   (input, receives the result)
;   %2 = b   (input, overwritten)
;   %3 = scratch register (overwritten)
; The caller must have m_zero defined as a register holding all zeros.
; On exit every dword lane of %1 holds the sum of two |a - b| values: one
; taken from the low half of the vector and one from the high half.
%macro PABS_DIFF_WD 3 ; a, b, junk , output=a | |||
; %3 = b - a and %1 = a - b, both with unsigned saturation, so for every
; word at least one of the two results is zero.
psubusw %3, %2, %1 | |||
psubusw %1, %2 | |||
; OR-ing the two saturated differences therefore yields |a - b| per word.
por %1, %3 | |||
; Keep a copy so the low and the high words can be widened separately.
mova %2, %1 | |||
; Interleave with zero: words become zero-extended dwords.
punpcklwd %1, m_zero | |||
punpckhwd %2, m_zero | |||
; Add the widened low and high halves together.
paddd %1, %2 | |||
%endmacro | |||
; Emits the function idet_filter_line_16bit(a, b, c, width) for whatever
; register set the preceding INIT_* selected.
;   %1 = number of 16-bit samples consumed per loop iteration
;        (instantiated with 8 after INIT_XMM and with 4 after INIT_MMX).
; For every sample the loop accumulates |ab - bc| + |ba - cb|, where
; xy denotes the unsigned saturating difference x - y; this is equal to
; |a + c - 2*b|. The 32-bit total over the line is returned in eax.
; The loop body runs before the bound is tested and the index advances by %1,
; so width is expected to be a positive multiple of %1.
%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words) | |||
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index | |||
; index counts samples; the loads below scale it by 2 to get a byte offset.
xor indexq, indexq | |||
%define m_zero m1 | |||
%define m_sum m0 | |||
; Clear the dword accumulator and the zero register used by PABS_DIFF_WD.
pxor m_sum, m_sum | |||
pxor m_zero, m_zero | |||
.loop_16bit: | |||
movu m2, [bq + indexq * 2] ; B | |||
movu m3, [aq + indexq * 2] ; A | |||
mova m6, m2 | |||
psubusw m5, m2, m3 ; ba | |||
movu m4, [cq + indexq * 2] ; C | |||
add indexq, %1 | |||
psubusw m3, m2 ; ab | |||
; The flags produced by this compare are consumed by the jl at the bottom of
; the loop; only vector instructions are placed in between.
CMP indexd, widthd | |||
psubusw m6, m4 ; bc | |||
psubusw m4, m2 ; cb | |||
; Both macro invocations clobber m7 (scratch) and their second operand.
PABS_DIFF_WD m3, m6, m7 ; |ab - bc| | |||
PABS_DIFF_WD m5, m4, m7 ; |ba - cb| | |||
paddd m_sum, m3 | |||
paddd m_sum, m5 | |||
; Continue while index < width (signed comparison on the 32-bit values).
jl .loop_16bit | |||
; Horizontal reduction: fold all dword lanes of m_sum into its lowest lane.
mova m2, m_sum | |||
%if mmsize == 16 | |||
; Four dword lanes: shift the copy down one lane at a time and add, three
; times, so that lane 0 ends up holding the sum of all four lanes.
psrldq m2, 4 | |||
paddd m_sum, m2 | |||
psrldq m2, 4 | |||
paddd m_sum, m2 | |||
psrldq m2, 4 | |||
paddd m_sum, m2 | |||
%else | |||
; Two dword lanes: add the upper lane onto the lower one.
psrlq m2, 32 | |||
paddd m_sum, m2 | |||
%endif | |||
; Return the lowest dword of the accumulator.
movd eax, m_sum | |||
RET | |||
%endmacro | |||
INIT_XMM sse2 | |||
IDET_FILTER_LINE_16BIT 8 | |||
%if ARCH_X86_32 | |||
INIT_MMX mmx | |||
IDET_FILTER_LINE_16BIT 4 | |||
%endif | |||
;****************************************************************************** | |||
; SSE2 8-bit implementation that does 16-bytes at a time: | |||
INIT_XMM sse2 | |||
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total | |||
xor indexq, indexq | |||
@@ -23,6 +23,8 @@ | |||
#include "libavutil/x86/cpu.h" | |||
#include "libavfilter/vf_idet.h" | |||
#if HAVE_YASM | |||
/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ | |||
#define FUNC_MAIN_DECL(KIND, SPAN) \ | |||
int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ | |||
@@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ | |||
return sum; \ | |||
} | |||
#if HAVE_YASM | |||
/*
 * Declares the external routine ff_idet_filter_line_16bit_KIND() and defines
 * a static wrapper idet_filter_line_16bit_KIND() around it.
 *
 * SPAN is the number of 16-bit samples the external routine consumes per
 * step; the "w & (SPAN - 1)" split assumes SPAN is a power of two.
 *
 * The wrapper hands the largest multiple of SPAN not exceeding w to the
 * external routine (skipped when that count is zero), hands the remaining
 * left_over samples to ff_idet_filter_line_c_16bit(), and returns the sum of
 * both results.
 */
#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \ | |||
int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ | |||
const uint16_t *c, int w); \ | |||
static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ | |||
const uint16_t *c, int w) { \ | |||
int sum = 0; \ | |||
const int left_over = w & (SPAN - 1); \ | |||
w -= left_over; \ | |||
if (w > 0) \ | |||
sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \ | |||
if (left_over > 0) \ | |||
sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \ | |||
return sum; \ | |||
} | |||
FUNC_MAIN_DECL(sse2, 16) | |||
FUNC_MAIN_DECL_16bit(sse2, 8) | |||
#if ARCH_X86_32 | |||
FUNC_MAIN_DECL(mmx, 8) | |||
FUNC_MAIN_DECL(mmxext, 8) | |||
FUNC_MAIN_DECL_16bit(mmx, 4) | |||
#endif | |||
#endif | |||
av_cold void ff_idet_init_x86(IDETContext *idet) | |||
av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b) | |||
{ | |||
#if HAVE_YASM | |||
const int cpu_flags = av_get_cpu_flags(); | |||
#if ARCH_X86_32 | |||
if (EXTERNAL_MMX(cpu_flags)) { | |||
idet->filter_line = idet_filter_line_mmx; | |||
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx; | |||
} | |||
if (EXTERNAL_MMXEXT(cpu_flags)) { | |||
idet->filter_line = idet_filter_line_mmxext; | |||
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext; | |||
} | |||
#endif // ARCH_x86_32 | |||
if (EXTERNAL_SSE2(cpu_flags)) { | |||
idet->filter_line = idet_filter_line_sse2; | |||
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2; | |||
} | |||
#endif // HAVE_YASM | |||
} |