Browse Source

av_filter/x86/idet: MMX/SSE2 implementation of 16-bit filter_line()

tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv
MMX: ~30% faster decoding overall
SSE2: ~40% faster

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.4
Pascal Massimino Michael Niedermayer 11 years ago
parent
commit
e3fd6a3a4e
4 changed files with 103 additions and 14 deletions
  1. +7
    -4
      libavfilter/vf_idet.c
  2. +5
    -2
      libavfilter/vf_idet.h
  3. +68
    -2
      libavfilter/x86/vf_idet.asm
  4. +23
    -6
      libavfilter/x86/vf_idet_init.c

+ 7
- 4
libavfilter/vf_idet.c View File

@@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c,
return ret; return ret;
} }


static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
{ {
int x; int x;
int ret=0; int ret=0;
@@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref)


if (!idet->csp) if (!idet->csp)
idet->csp = av_pix_fmt_desc_get(link->format); idet->csp = av_pix_fmt_desc_get(link->format);
if (idet->csp->comp[0].depth_minus1 / 8 == 1)
idet->filter_line = (void*)filter_line_c_16bit;
if (idet->csp->comp[0].depth_minus1 / 8 == 1){
idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit;
if (ARCH_X86)
ff_idet_init_x86(idet, 1);
}


filter(ctx); filter(ctx);


@@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx)
idet->filter_line = ff_idet_filter_line_c; idet->filter_line = ff_idet_filter_line_c;


if (ARCH_X86) if (ARCH_X86)
ff_idet_init_x86(idet);
ff_idet_init_x86(idet, 0);


return 0; return 0;
} }


+ 5
- 2
libavfilter/vf_idet.h View File

@@ -24,6 +24,8 @@


#define HIST_SIZE 4 #define HIST_SIZE 4


/* Signature of the per-line filter callback stored in IDETContext.filter_line.
 * It is declared with uint8_t pointers; the 16-bit implementations take
 * uint16_t pointers and are cast to this type when they are assigned. */
typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);

typedef enum { typedef enum {
TFF, TFF,
BFF, BFF,
@@ -45,14 +47,15 @@ typedef struct {
AVFrame *cur; AVFrame *cur;
AVFrame *next; AVFrame *next;
AVFrame *prev; AVFrame *prev;
int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w);
ff_idet_filter_func filter_line;


const AVPixFmtDescriptor *csp; const AVPixFmtDescriptor *csp;
} IDETContext; } IDETContext;


void ff_idet_init_x86(IDETContext *idet);
void ff_idet_init_x86(IDETContext *idet, int for_16b);


/* main fall-back for left-over */ /* main fall-back for left-over */
int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w);


#endif #endif

+ 68
- 2
libavfilter/x86/vf_idet.asm View File

@@ -25,8 +25,6 @@


SECTION_TEXT SECTION_TEXT


%if ARCH_X86_32

; Implementation that does 8-bytes at a time using single-word operations. ; Implementation that does 8-bytes at a time using single-word operations.
%macro IDET_FILTER_LINE 1 %macro IDET_FILTER_LINE 1
INIT_MMX %1 INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
RET RET
%endmacro %endmacro


%if ARCH_X86_32
IDET_FILTER_LINE mmxext IDET_FILTER_LINE mmxext
IDET_FILTER_LINE mmx IDET_FILTER_LINE mmx
%endif %endif


;******************************************************************************
; 16-bit implementation that processes 4 (MMX) or 8 (SSE2) pixels per iteration

; PABS_DIFF_WD a, b, junk
; Per-word absolute difference of %1 and %2, widened to dwords and folded.
;   %1  in/out  first operand; on exit holds dword sums of the |%1 - %2| words
;   %2  in      second operand; clobbered
;   %3  scratch clobbered
; Expects m_zero to name an all-zero register at the point of expansion.
%macro PABS_DIFF_WD 3 ; a, b, junk , output=a
; %3 = %2 - %1 and %1 = %1 - %2, both with unsigned saturation: in every lane
; at least one of the two results is zero, so OR-ing them yields |%1 - %2|.
; NOTE(review): the first line uses a three-operand form; presumably the
; x86inc layer expands it to a move plus a two-operand op on non-AVX targets.
psubusw %3, %2, %1
psubusw %1, %2
por %1, %3

; Interleave the words with zero to widen them to dwords (low half in %1,
; high half in %2), then add the halves so the running sum is kept in dwords.
mova %2, %1
punpcklwd %1, m_zero
punpckhwd %2, m_zero
paddd %1, %2
%endmacro

; IDET_FILTER_LINE_16BIT step
; Emits idet_filter_line_16bit(a, b, c, width) for the current INIT_* target.
;   %1  number of 16-bit words consumed per loop iteration (4 or 8)
; For every word position the routine accumulates
;   |sat(a-b) - sat(b-c)| + |sat(b-a) - sat(c-b)|
; into dword lanes and returns the horizontal sum of those lanes in eax.
; The loop is bottom-tested, so it always runs at least once; the C wrapper
; only calls it with width > 0 and with width a multiple of the step.
%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
; cglobal: 4 arguments, 5 general-purpose registers, 8 vector registers.
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
xor indexq, indexq
; m_zero is the zero operand PABS_DIFF_WD unpacks against; m_sum is the
; dword accumulator.
%define m_zero m1
%define m_sum m0
pxor m_sum, m_sum
pxor m_zero, m_zero

.loop_16bit:
; index counts words, hence the *2 scaling to a byte offset; loads are
; unaligned (movu).
movu m2, [bq + indexq * 2] ; B
movu m3, [aq + indexq * 2] ; A
mova m6, m2
psubusw m5, m2, m3 ; ba

movu m4, [cq + indexq * 2] ; C
add indexq, %1
psubusw m3, m2 ; ab
; Flags set here are consumed by the jl at the bottom of the loop; the vector
; instructions in between do not modify EFLAGS.
CMP indexd, widthd

psubusw m6, m4 ; bc
psubusw m4, m2 ; cb

PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
paddd m_sum, m3
paddd m_sum, m5
jl .loop_16bit

; Horizontal reduction: fold every dword lane of m_sum into its lowest dword.
mova m2, m_sum
%if mmsize == 16
; 16-byte registers carry four dwords: shift the copy down one dword at a
; time and add, three times.
psrldq m2, 4
paddd m_sum, m2
psrldq m2, 4
paddd m_sum, m2
psrldq m2, 4
paddd m_sum, m2
%else
; 8-byte registers carry two dwords: add the high dword onto the low one.
psrlq m2, 32
paddd m_sum, m2
%endif
; Only the low dword is returned; the upper lanes hold partial sums.
; NOTE(review): no emms is visible in this macro for the MMX instantiation --
; confirm that the caller clears the MMX state.
movd eax, m_sum
RET
%endmacro

; SSE2 variant: 16-byte registers hold 8 words, so the loop steps by 8.
INIT_XMM sse2
IDET_FILTER_LINE_16BIT 8
; MMX variant is assembled for 32-bit x86 only: 8-byte registers hold 4 words.
%if ARCH_X86_32
INIT_MMX mmx
IDET_FILTER_LINE_16BIT 4
%endif

;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time: ; SSE2 8-bit implementation that does 16-bytes at a time:

INIT_XMM sse2 INIT_XMM sse2
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
xor indexq, indexq xor indexq, indexq


+ 23
- 6
libavfilter/x86/vf_idet_init.c View File

@@ -23,6 +23,8 @@
#include "libavutil/x86/cpu.h" #include "libavutil/x86/cpu.h"
#include "libavfilter/vf_idet.h" #include "libavfilter/vf_idet.h"


#if HAVE_YASM

/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ /* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
#define FUNC_MAIN_DECL(KIND, SPAN) \ #define FUNC_MAIN_DECL(KIND, SPAN) \
int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
@@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
return sum; \ return sum; \
} }


#if HAVE_YASM

/* Declares the external ff_idet_filter_line_16bit_KIND() routine and defines
 * the static wrapper idet_filter_line_16bit_KIND() around it.
 * The wrapper hands the leading part of the line whose length is a multiple of
 * SPAN to the external routine, runs ff_idet_filter_line_c_16bit() on the
 * remaining w & (SPAN - 1) pixels, and returns the sum of both results.
 * SPAN is expected to be a power of two, since the remainder is taken with a
 * bit mask. */
#define FUNC_MAIN_DECL_16bit(KIND, SPAN)                                        \
int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b,     \
                                     const uint16_t *c, int w);                \
static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
                                         const uint16_t *c, int w) {           \
    const int tail = w & (SPAN - 1);                                           \
    const int bulk = w - tail;                                                 \
    int total = 0;                                                             \
    if (bulk > 0)                                                              \
        total = ff_idet_filter_line_16bit_##KIND(a, b, c, bulk);               \
    if (tail > 0)                                                              \
        total += ff_idet_filter_line_c_16bit(a + bulk, b + bulk, c + bulk,     \
                                             tail);                            \
    return total;                                                              \
}


FUNC_MAIN_DECL(sse2, 16) FUNC_MAIN_DECL(sse2, 16)
FUNC_MAIN_DECL_16bit(sse2, 8)
#if ARCH_X86_32 #if ARCH_X86_32
FUNC_MAIN_DECL(mmx, 8) FUNC_MAIN_DECL(mmx, 8)
FUNC_MAIN_DECL(mmxext, 8) FUNC_MAIN_DECL(mmxext, 8)
FUNC_MAIN_DECL_16bit(mmx, 4)
#endif #endif


#endif #endif

av_cold void ff_idet_init_x86(IDETContext *idet)
av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b)
{ {
#if HAVE_YASM #if HAVE_YASM
const int cpu_flags = av_get_cpu_flags(); const int cpu_flags = av_get_cpu_flags();


#if ARCH_X86_32 #if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) { if (EXTERNAL_MMX(cpu_flags)) {
idet->filter_line = idet_filter_line_mmx;
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx;
} }
if (EXTERNAL_MMXEXT(cpu_flags)) { if (EXTERNAL_MMXEXT(cpu_flags)) {
idet->filter_line = idet_filter_line_mmxext;
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext;
} }
#endif // ARCH_x86_32 #endif // ARCH_x86_32


if (EXTERNAL_SSE2(cpu_flags)) { if (EXTERNAL_SSE2(cpu_flags)) {
idet->filter_line = idet_filter_line_sse2;
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
} }
#endif // HAVE_YASM #endif // HAVE_YASM
} }

Loading…
Cancel
Save