Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv. MMX: ~30% faster decoding overall; SSE2: ~40% faster. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> (tags/n2.4)
| @@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) | |||||
| int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) | |||||
| { | { | ||||
| int x; | int x; | ||||
| int ret=0; | int ret=0; | ||||
| @@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref) | |||||
| if (!idet->csp) | if (!idet->csp) | ||||
| idet->csp = av_pix_fmt_desc_get(link->format); | idet->csp = av_pix_fmt_desc_get(link->format); | ||||
| if (idet->csp->comp[0].depth_minus1 / 8 == 1) | |||||
| idet->filter_line = (void*)filter_line_c_16bit; | |||||
| if (idet->csp->comp[0].depth_minus1 / 8 == 1){ | |||||
| idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit; | |||||
| if (ARCH_X86) | |||||
| ff_idet_init_x86(idet, 1); | |||||
| } | |||||
| filter(ctx); | filter(ctx); | ||||
| @@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx) | |||||
| idet->filter_line = ff_idet_filter_line_c; | idet->filter_line = ff_idet_filter_line_c; | ||||
| if (ARCH_X86) | if (ARCH_X86) | ||||
| ff_idet_init_x86(idet); | |||||
| ff_idet_init_x86(idet, 0); | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -24,6 +24,8 @@ | |||||
| #define HIST_SIZE 4 | #define HIST_SIZE 4 | ||||
| typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); | |||||
| typedef enum { | typedef enum { | ||||
| TFF, | TFF, | ||||
| BFF, | BFF, | ||||
| @@ -45,14 +47,15 @@ typedef struct { | |||||
| AVFrame *cur; | AVFrame *cur; | ||||
| AVFrame *next; | AVFrame *next; | ||||
| AVFrame *prev; | AVFrame *prev; | ||||
| int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); | |||||
| ff_idet_filter_func filter_line; | |||||
| const AVPixFmtDescriptor *csp; | const AVPixFmtDescriptor *csp; | ||||
| } IDETContext; | } IDETContext; | ||||
| void ff_idet_init_x86(IDETContext *idet); | |||||
| void ff_idet_init_x86(IDETContext *idet, int for_16b); | |||||
| /* main fall-back for left-over */ | /* main fall-back for left-over */ | ||||
| int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); | int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); | ||||
| int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w); | |||||
| #endif | #endif | ||||
| @@ -25,8 +25,6 @@ | |||||
| SECTION_TEXT | SECTION_TEXT | ||||
| %if ARCH_X86_32 | |||||
| ; Implementation that does 8-bytes at a time using single-word operations. | ; Implementation that does 8-bytes at a time using single-word operations. | ||||
| %macro IDET_FILTER_LINE 1 | %macro IDET_FILTER_LINE 1 | ||||
| INIT_MMX %1 | INIT_MMX %1 | ||||
| @@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index | |||||
| RET | RET | ||||
| %endmacro | %endmacro | ||||
| %if ARCH_X86_32 | |||||
| IDET_FILTER_LINE mmxext | IDET_FILTER_LINE mmxext | ||||
| IDET_FILTER_LINE mmx | IDET_FILTER_LINE mmx | ||||
| %endif | %endif | ||||
| ;****************************************************************************** | |||||
| ; 16bit implementation that does 4/8-pixels at a time | |||||
| %macro PABS_DIFF_WD 3 ; %1=a, %2=b, %3=junk; output=%1: word-wise unsigned |a - b|, zero-extended to dwords with the low-half and high-half words added lane by lane; %2 and %3 are clobbered; m_zero must be defined by the caller and hold zero | |||||
| psubusw %3, %2, %1 | |||||
| psubusw %1, %2 | |||||
| por %1, %3 | |||||
| mova %2, %1 | |||||
| punpcklwd %1, m_zero | |||||
| punpckhwd %2, m_zero | |||||
| paddd %1, %2 | |||||
| %endmacro | |||||
| %macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words per iteration); returns in eax the sum over the line of |ab - bc| + |ba - cb|; the loop test is at the bottom, so one iteration always runs and a width that is not a multiple of %1 is effectively rounded up | |||||
| cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index | |||||
| xor indexq, indexq | |||||
| %define m_zero m1 | |||||
| %define m_sum m0 | |||||
| pxor m_sum, m_sum | |||||
| pxor m_zero, m_zero | |||||
| .loop_16bit: | |||||
| movu m2, [bq + indexq * 2] ; B = words of line b at the current index (index counts words, hence * 2) | |||||
| movu m3, [aq + indexq * 2] ; A = words of line a at the current index | |||||
| mova m6, m2 | |||||
| psubusw m5, m2, m3 ; ba = B - A with unsigned saturation (0 where A >= B) | |||||
| movu m4, [cq + indexq * 2] ; C = words of line c, loaded before index is advanced | |||||
| add indexq, %1 | |||||
| psubusw m3, m2 ; ab = A - B with unsigned saturation (0 where B >= A) | |||||
| CMP indexd, widthd | |||||
| psubusw m6, m4 ; bc = B - C with unsigned saturation (m6 is the copy of B); flags from the CMP above stay live until jl because the SIMD ops in between do not write EFLAGS | |||||
| psubusw m4, m2 ; cb = C - B with unsigned saturation | |||||
| PABS_DIFF_WD m3, m6, m7 ; m3 = |ab - bc| widened to dwords; m6 and m7 are clobbered | |||||
| PABS_DIFF_WD m5, m4, m7 ; m5 = |ba - cb| widened to dwords; m4 and m7 are clobbered | |||||
| paddd m_sum, m3 | |||||
| paddd m_sum, m5 | |||||
| jl .loop_16bit | |||||
| mova m2, m_sum | |||||
| %if mmsize == 16 | |||||
| psrldq m2, 4 | |||||
| paddd m_sum, m2 | |||||
| psrldq m2, 4 | |||||
| paddd m_sum, m2 | |||||
| psrldq m2, 4 | |||||
| paddd m_sum, m2 | |||||
| %else | |||||
| psrlq m2, 32 | |||||
| paddd m_sum, m2 | |||||
| %endif | |||||
| movd eax, m_sum | |||||
| RET | |||||
| %endmacro | |||||
| INIT_XMM sse2 | |||||
| IDET_FILTER_LINE_16BIT 8 | |||||
| %if ARCH_X86_32 | |||||
| INIT_MMX mmx | |||||
| IDET_FILTER_LINE_16BIT 4 | |||||
| %endif | |||||
| ;****************************************************************************** | |||||
| ; SSE2 8-bit implementation that does 16-bytes at a time: | ; SSE2 8-bit implementation that does 16-bytes at a time: | ||||
| INIT_XMM sse2 | INIT_XMM sse2 | ||||
| cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total | cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total | ||||
| xor indexq, indexq | xor indexq, indexq | ||||
| @@ -23,6 +23,8 @@ | |||||
| #include "libavutil/x86/cpu.h" | #include "libavutil/x86/cpu.h" | ||||
| #include "libavfilter/vf_idet.h" | #include "libavfilter/vf_idet.h" | ||||
| #if HAVE_YASM | |||||
| /* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ | /* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ | ||||
| #define FUNC_MAIN_DECL(KIND, SPAN) \ | #define FUNC_MAIN_DECL(KIND, SPAN) \ | ||||
| int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ | int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ | ||||
| @@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ | |||||
| return sum; \ | return sum; \ | ||||
| } | } | ||||
| #if HAVE_YASM | |||||
| #define FUNC_MAIN_DECL_16bit(KIND, SPAN) /* declares ff_idet_filter_line_16bit_KIND() and defines a static wrapper around it; SPAN is the pixel count per call granule and must be a power of two for the mask below */ \ | |||||
| int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ | |||||
| const uint16_t *c, int w); \ | |||||
| static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ | |||||
| const uint16_t *c, int w) { \ | |||||
| int sum = 0; \ | |||||
| const int left_over = w & (SPAN - 1); /* trailing pixels that do not fill a whole SPAN */ \ | |||||
| w -= left_over; /* w is now a multiple of SPAN */ \ | |||||
| if (w > 0) /* skip the call entirely when no full SPAN is available */ \ | |||||
| sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \ | |||||
| if (left_over > 0) \ | |||||
| sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); /* remainder handled by the C implementation, starting w pixels in */ \ | |||||
| return sum; \ | |||||
| } | |||||
| FUNC_MAIN_DECL(sse2, 16) | FUNC_MAIN_DECL(sse2, 16) | ||||
| FUNC_MAIN_DECL_16bit(sse2, 8) | |||||
| #if ARCH_X86_32 | #if ARCH_X86_32 | ||||
| FUNC_MAIN_DECL(mmx, 8) | FUNC_MAIN_DECL(mmx, 8) | ||||
| FUNC_MAIN_DECL(mmxext, 8) | FUNC_MAIN_DECL(mmxext, 8) | ||||
| FUNC_MAIN_DECL_16bit(mmx, 4) | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| av_cold void ff_idet_init_x86(IDETContext *idet) | |||||
| av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b) | |||||
| { | { | ||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| const int cpu_flags = av_get_cpu_flags(); | const int cpu_flags = av_get_cpu_flags(); | ||||
| #if ARCH_X86_32 | #if ARCH_X86_32 | ||||
| if (EXTERNAL_MMX(cpu_flags)) { | if (EXTERNAL_MMX(cpu_flags)) { | ||||
| idet->filter_line = idet_filter_line_mmx; | |||||
| idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx; | |||||
| } | } | ||||
| if (EXTERNAL_MMXEXT(cpu_flags)) { | if (EXTERNAL_MMXEXT(cpu_flags)) { | ||||
| idet->filter_line = idet_filter_line_mmxext; | |||||
| idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext; | |||||
| } | } | ||||
| #endif // ARCH_x86_32 | #endif // ARCH_x86_32 | ||||
| if (EXTERNAL_SSE2(cpu_flags)) { | if (EXTERNAL_SSE2(cpu_flags)) { | ||||
| idet->filter_line = idet_filter_line_sse2; | |||||
| idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2; | |||||
| } | } | ||||
| #endif // HAVE_YASM | #endif // HAVE_YASM | ||||
| } | } | ||||