Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Tag: n2.3
@@ -625,3 +625,47 @@ INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3
;------------------------------------------------------------------------------
; void ff_vector_clipf(float *dst, const float *src,
;                      float min, float max, int len)
;
; Clamp each of the len floats in src[] to [min, max] and store them in dst[].
; Constraints (from the code below): dst and src must be 16-byte aligned
; (mova is used) and len must be a positive multiple of 16 -- the do-while
; loop always runs at least once and consumes mmsize*4 = 64 bytes per pass.
;------------------------------------------------------------------------------
INIT_XMM sse
%if UNIX64
; SysV AMD64: min/max arrive in xmm0/xmm1 (already m0/m1), so len is the
; 3rd GP argument and only three GP registers are needed.
cglobal vector_clipf, 3,3,6, dst, src, len
%else
; x86_32: all five arguments are on the stack.
; WIN64: argument slots are positional -- min/max occupy slots 3/4 (xmm2/xmm3)
; and len is the 5th slot, on the stack.  Declaring all five args makes the
; prologue load len from its stack slot; a 3-arg declaration would leave
; lenq aliased to r8, which is undefined here.
cglobal vector_clipf, 5,5,6, dst, src, min, max, len
%endif
%if WIN64
    SWAP 0, 2                   ; min: xmm2 -> m0
    SWAP 1, 3                   ; max: xmm3 -> m1
%elif ARCH_X86_32
    movss   m0, minm            ; load min/max from their stack slots
    movss   m1, maxm
%endif
    SPLATD  m0                  ; broadcast min to all four lanes
    SPLATD  m1                  ; broadcast max to all four lanes
    movsxdifnidn lenq, lend     ; len is an int: validate the upper half on x86_64
    shl     lenq, 2             ; element count -> byte count
    add     srcq, lenq          ; point both pointers past the end and walk a
    add     dstq, lenq          ; negative offset up towards zero
    neg     lenq
.loop:
    mova    m2, [srcq+lenq+mmsize*0]
    mova    m3, [srcq+lenq+mmsize*1]
    mova    m4, [srcq+lenq+mmsize*2]
    mova    m5, [srcq+lenq+mmsize*3]
    maxps   m2, m0              ; clamp from below ...
    maxps   m3, m0
    maxps   m4, m0
    maxps   m5, m0
    minps   m2, m1              ; ... and from above
    minps   m3, m1
    minps   m4, m1
    minps   m5, m1
    mova    [dstq+lenq+mmsize*0], m2
    mova    [dstq+lenq+mmsize*1], m3
    mova    [dstq+lenq+mmsize*2], m4
    mova    [dstq+lenq+mmsize*3], m5
    add     lenq, mmsize*4
    jl      .loop
    REP_RET
@@ -585,12 +585,10 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_SSE_INLINE
    c->vector_clipf = ff_vector_clipf_sse;
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#if HAVE_SSE_EXTERNAL
    c->vector_clipf = ff_vector_clipf_sse;
    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;
@@ -506,37 +506,4 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride,
#endif
#endif
void ff_vector_clipf_sse(float *dst, const float *src, | |||||
float min, float max, int len) | |||||
{ | |||||
x86_reg i = (len - 16) * 4; | |||||
__asm__ volatile ( | |||||
"movss %3, %%xmm4 \n\t" | |||||
"movss %4, %%xmm5 \n\t" | |||||
"shufps $0, %%xmm4, %%xmm4 \n\t" | |||||
"shufps $0, %%xmm5, %%xmm5 \n\t" | |||||
"1: \n\t" | |||||
"movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel | |||||
"movaps 16(%2, %0), %%xmm1 \n\t" | |||||
"movaps 32(%2, %0), %%xmm2 \n\t" | |||||
"movaps 48(%2, %0), %%xmm3 \n\t" | |||||
"maxps %%xmm4, %%xmm0 \n\t" | |||||
"maxps %%xmm4, %%xmm1 \n\t" | |||||
"maxps %%xmm4, %%xmm2 \n\t" | |||||
"maxps %%xmm4, %%xmm3 \n\t" | |||||
"minps %%xmm5, %%xmm0 \n\t" | |||||
"minps %%xmm5, %%xmm1 \n\t" | |||||
"minps %%xmm5, %%xmm2 \n\t" | |||||
"minps %%xmm5, %%xmm3 \n\t" | |||||
"movaps %%xmm0, (%1, %0) \n\t" | |||||
"movaps %%xmm1, 16(%1, %0) \n\t" | |||||
"movaps %%xmm2, 32(%1, %0) \n\t" | |||||
"movaps %%xmm3, 48(%1, %0) \n\t" | |||||
"sub $64, %0 \n\t" | |||||
"jge 1b \n\t" | |||||
: "+&r" (i) | |||||
: "r" (dst), "r" (src), "m" (min), "m" (max) | |||||
: "memory"); | |||||
} | |||||
#endif /* HAVE_INLINE_ASM */