Also add sse2 versions for both. put_pixels_clamped port and sse2 version originally written by Timothy Gu. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> (tags/n2.5)
| @@ -66,8 +66,7 @@ OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o | |||||
| # subsystems | # subsystems | ||||
| MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o | MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o | ||||
| MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o | MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o | ||||
| MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \ | |||||
| x86/simple_idct.o | |||||
| MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o | |||||
| # decoders/encoders | # decoders/encoders | ||||
| MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \ | MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \ | ||||
| @@ -78,3 +78,106 @@ INIT_MMX mmx | |||||
| PUT_SIGNED_PIXELS_CLAMPED 0 | PUT_SIGNED_PIXELS_CLAMPED 0 | ||||
| INIT_XMM sse2 | INIT_XMM sse2 | ||||
| PUT_SIGNED_PIXELS_CLAMPED 3 | PUT_SIGNED_PIXELS_CLAMPED 3 | ||||
| ;-------------------------------------------------------------------------- | |||||
| ; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels, | |||||
| ; ptrdiff_t line_size); | |||||
| ;-------------------------------------------------------------------------- | |||||
| ; %1 = byte offset into block (each invocation stores four rows) | |||||
| %macro PUT_PIXELS_CLAMPED_HALF 1 | |||||
| mova m0, [blockq+mmsize*0+%1] | |||||
| mova m1, [blockq+mmsize*2+%1] | |||||
| %if mmsize == 8 | |||||
| mova m2, [blockq+mmsize*4+%1] | |||||
| mova m3, [blockq+mmsize*6+%1] | |||||
| %endif | |||||
| packuswb m0, [blockq+mmsize*1+%1] | |||||
| packuswb m1, [blockq+mmsize*3+%1] | |||||
| %if mmsize == 8 | |||||
| packuswb m2, [blockq+mmsize*5+%1] | |||||
| packuswb m3, [blockq+mmsize*7+%1] | |||||
| movq [pixelsq], m0 | |||||
| movq [lsizeq+pixelsq], m1 | |||||
| movq [2*lsizeq+pixelsq], m2 | |||||
| movq [lsize3q+pixelsq], m3 | |||||
| %else | |||||
| movq [pixelsq], m0 | |||||
| movhps [lsizeq+pixelsq], m0 | |||||
| movq [2*lsizeq+pixelsq], m1 | |||||
| movhps [lsize3q+pixelsq], m1 | |||||
| %endif | |||||
| %endmacro | |||||
| %macro PUT_PIXELS_CLAMPED 0 | |||||
| cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3 | |||||
| lea lsize3q, [lsizeq*3] | |||||
| PUT_PIXELS_CLAMPED_HALF 0 | |||||
| lea pixelsq, [pixelsq+lsizeq*4] | |||||
| PUT_PIXELS_CLAMPED_HALF 64 | |||||
| RET | |||||
| %endmacro | |||||
| INIT_MMX mmx | |||||
| PUT_PIXELS_CLAMPED | |||||
| INIT_XMM sse2 | |||||
| PUT_PIXELS_CLAMPED | |||||
| ;-------------------------------------------------------------------------- | |||||
| ; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels, | |||||
| ; ptrdiff_t line_size); | |||||
| ;-------------------------------------------------------------------------- | |||||
| ; %1 = byte offset into block (each invocation handles two rows) | |||||
| %macro ADD_PIXELS_CLAMPED 1 | |||||
| mova m0, [blockq+mmsize*0+%1] | |||||
| mova m1, [blockq+mmsize*1+%1] | |||||
| %if mmsize == 8 | |||||
| mova m5, [blockq+mmsize*2+%1] | |||||
| mova m6, [blockq+mmsize*3+%1] | |||||
| %endif | |||||
| movq m2, [pixelsq] | |||||
| movq m3, [pixelsq+lsizeq] | |||||
| %if mmsize == 8 | |||||
| mova m7, m2 | |||||
| punpcklbw m2, m4 | |||||
| punpckhbw m7, m4 | |||||
| paddsw m0, m2 | |||||
| paddsw m1, m7 | |||||
| mova m7, m3 | |||||
| punpcklbw m3, m4 | |||||
| punpckhbw m7, m4 | |||||
| paddsw m5, m3 | |||||
| paddsw m6, m7 | |||||
| %else | |||||
| punpcklbw m2, m4 | |||||
| punpcklbw m3, m4 | |||||
| paddsw m0, m2 | |||||
| paddsw m1, m3 | |||||
| %endif | |||||
| packuswb m0, m1 | |||||
| %if mmsize == 8 | |||||
| packuswb m5, m6 | |||||
| movq [pixelsq], m0 | |||||
| movq [pixelsq+lsizeq], m5 | |||||
| %else | |||||
| movq [pixelsq], m0 | |||||
| movhps [pixelsq+lsizeq], m0 | |||||
| %endif | |||||
| %endmacro | |||||
| %macro ADD_PIXELS_CLAMPED 0 | |||||
| cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize | |||||
| pxor m4, m4 | |||||
| ADD_PIXELS_CLAMPED 0 | |||||
| lea pixelsq, [pixelsq+lsizeq*2] | |||||
| ADD_PIXELS_CLAMPED 32 | |||||
| lea pixelsq, [pixelsq+lsizeq*2] | |||||
| ADD_PIXELS_CLAMPED 64 | |||||
| lea pixelsq, [pixelsq+lsizeq*2] | |||||
| ADD_PIXELS_CLAMPED 96 | |||||
| RET | |||||
| %endmacro | |||||
| INIT_MMX mmx | |||||
| ADD_PIXELS_CLAMPED | |||||
| INIT_XMM sse2 | |||||
| ADD_PIXELS_CLAMPED | |||||
| @@ -23,8 +23,12 @@ | |||||
| void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | ||||
| ptrdiff_t line_size); | ptrdiff_t line_size); | ||||
| void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, | |||||
| ptrdiff_t line_size); | |||||
| void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | ||||
| ptrdiff_t line_size); | ptrdiff_t line_size); | ||||
| void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, | |||||
| ptrdiff_t line_size); | |||||
| void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | ||||
| ptrdiff_t line_size); | ptrdiff_t line_size); | ||||
| void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, | void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, | ||||
| @@ -64,9 +64,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, | |||||
| int cpu_flags = av_get_cpu_flags(); | int cpu_flags = av_get_cpu_flags(); | ||||
| if (INLINE_MMX(cpu_flags)) { | if (INLINE_MMX(cpu_flags)) { | ||||
| c->put_pixels_clamped = ff_put_pixels_clamped_mmx; | |||||
| c->add_pixels_clamped = ff_add_pixels_clamped_mmx; | |||||
| if (!high_bit_depth && | if (!high_bit_depth && | ||||
| avctx->lowres == 0 && | avctx->lowres == 0 && | ||||
| (avctx->idct_algo == FF_IDCT_AUTO || | (avctx->idct_algo == FF_IDCT_AUTO || | ||||
| @@ -80,8 +77,12 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, | |||||
| } | } | ||||
| if (EXTERNAL_MMX(cpu_flags)) { | if (EXTERNAL_MMX(cpu_flags)) { | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; | c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; | ||||
| c->put_pixels_clamped = ff_put_pixels_clamped_mmx; | |||||
| c->add_pixels_clamped = ff_add_pixels_clamped_mmx; | |||||
| } | } | ||||
| if (EXTERNAL_SSE2(cpu_flags)) { | if (EXTERNAL_SSE2(cpu_flags)) { | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; | c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; | ||||
| c->put_pixels_clamped = ff_put_pixels_clamped_sse2; | |||||
| c->add_pixels_clamped = ff_add_pixels_clamped_sse2; | |||||
| } | } | ||||
| } | } | ||||
| @@ -1,134 +0,0 @@ | |||||
| /* | |||||
| * SIMD-optimized IDCT-related routines | |||||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||||
| * | |||||
| * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "config.h" | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavutil/x86/asm.h" | |||||
| #include "libavcodec/idctdsp.h" | |||||
| #include "idctdsp.h" | |||||
| #include "inline_asm.h" | |||||
| #if HAVE_INLINE_ASM | |||||
| void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | |||||
| ptrdiff_t line_size) | |||||
| { | |||||
| const int16_t *p; | |||||
| uint8_t *pix; | |||||
| /* set up the source and destination pointers */ | |||||
| p = block; | |||||
| pix = pixels; | |||||
| /* unrolled loop */ | |||||
| __asm__ volatile ( | |||||
| "movq (%3), %%mm0 \n\t" | |||||
| "movq 8(%3), %%mm1 \n\t" | |||||
| "movq 16(%3), %%mm2 \n\t" | |||||
| "movq 24(%3), %%mm3 \n\t" | |||||
| "movq 32(%3), %%mm4 \n\t" | |||||
| "movq 40(%3), %%mm5 \n\t" | |||||
| "movq 48(%3), %%mm6 \n\t" | |||||
| "movq 56(%3), %%mm7 \n\t" | |||||
| "packuswb %%mm1, %%mm0 \n\t" | |||||
| "packuswb %%mm3, %%mm2 \n\t" | |||||
| "packuswb %%mm5, %%mm4 \n\t" | |||||
| "packuswb %%mm7, %%mm6 \n\t" | |||||
| "movq %%mm0, (%0) \n\t" | |||||
| "movq %%mm2, (%0, %1) \n\t" | |||||
| "movq %%mm4, (%0, %1, 2) \n\t" | |||||
| "movq %%mm6, (%0, %2) \n\t" | |||||
| :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), | |||||
| "r" (p) | |||||
| : "memory"); | |||||
| pix += line_size * 4; | |||||
| p += 32; | |||||
| // if this were an exact copy of the code above, the | |||||
| // compiler would generate some very strange code, | |||||
| // hence the use of "r" | |||||
| __asm__ volatile ( | |||||
| "movq (%3), %%mm0 \n\t" | |||||
| "movq 8(%3), %%mm1 \n\t" | |||||
| "movq 16(%3), %%mm2 \n\t" | |||||
| "movq 24(%3), %%mm3 \n\t" | |||||
| "movq 32(%3), %%mm4 \n\t" | |||||
| "movq 40(%3), %%mm5 \n\t" | |||||
| "movq 48(%3), %%mm6 \n\t" | |||||
| "movq 56(%3), %%mm7 \n\t" | |||||
| "packuswb %%mm1, %%mm0 \n\t" | |||||
| "packuswb %%mm3, %%mm2 \n\t" | |||||
| "packuswb %%mm5, %%mm4 \n\t" | |||||
| "packuswb %%mm7, %%mm6 \n\t" | |||||
| "movq %%mm0, (%0) \n\t" | |||||
| "movq %%mm2, (%0, %1) \n\t" | |||||
| "movq %%mm4, (%0, %1, 2) \n\t" | |||||
| "movq %%mm6, (%0, %2) \n\t" | |||||
| :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), | |||||
| "r" (p) | |||||
| : "memory"); | |||||
| } | |||||
| void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | |||||
| ptrdiff_t line_size) | |||||
| { | |||||
| const int16_t *p; | |||||
| uint8_t *pix; | |||||
| int i; | |||||
| /* set up the source and destination pointers */ | |||||
| p = block; | |||||
| pix = pixels; | |||||
| MOVQ_ZERO(mm7); | |||||
| i = 4; | |||||
| do { | |||||
| __asm__ volatile ( | |||||
| "movq (%2), %%mm0 \n\t" | |||||
| "movq 8(%2), %%mm1 \n\t" | |||||
| "movq 16(%2), %%mm2 \n\t" | |||||
| "movq 24(%2), %%mm3 \n\t" | |||||
| "movq %0, %%mm4 \n\t" | |||||
| "movq %1, %%mm6 \n\t" | |||||
| "movq %%mm4, %%mm5 \n\t" | |||||
| "punpcklbw %%mm7, %%mm4 \n\t" | |||||
| "punpckhbw %%mm7, %%mm5 \n\t" | |||||
| "paddsw %%mm4, %%mm0 \n\t" | |||||
| "paddsw %%mm5, %%mm1 \n\t" | |||||
| "movq %%mm6, %%mm5 \n\t" | |||||
| "punpcklbw %%mm7, %%mm6 \n\t" | |||||
| "punpckhbw %%mm7, %%mm5 \n\t" | |||||
| "paddsw %%mm6, %%mm2 \n\t" | |||||
| "paddsw %%mm5, %%mm3 \n\t" | |||||
| "packuswb %%mm1, %%mm0 \n\t" | |||||
| "packuswb %%mm3, %%mm2 \n\t" | |||||
| "movq %%mm0, %0 \n\t" | |||||
| "movq %%mm2, %1 \n\t" | |||||
| : "+m" (*pix), "+m" (*(pix + line_size)) | |||||
| : "r" (p) | |||||
| : "memory"); | |||||
| pix += line_size * 2; | |||||
| p += 16; | |||||
| } while (--i); | |||||
| } | |||||
| #endif /* HAVE_INLINE_ASM */ | |||||