| @@ -42,6 +42,7 @@ MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o | |||
| YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o | |||
| MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o | |||
| MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o | |||
| YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o | |||
| MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o | |||
| YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | |||
| MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o | |||
| @@ -19,117 +19,27 @@ | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavutil/common.h" | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/pngdsp.h" | |||
| #include "dsputil_mmx.h" | |||
/*
 * PAETH(cpu, abs3): expands to a static function
 *     add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp)
 * written as MMX inline assembly. abs3 is an asm fragment that must replace
 * mm3, mm4 and mm5 by their per-word absolute values and leave mm7 zero.
 *
 * Register roles inside the loop (bytes are zero-extended to 16-bit words
 * by unpacking against mm7, which is kept at zero):
 *     mm0 = previous output at dst[i - bpp]      (a, "left")
 *     mm1 = top[i]                               (b, "above")
 *     mm2 = top[i - bpp]                         (c, "upper-left")
 *     mm3 = c - b, mm4 = c - a, mm5 = 2c - a - b, then abs3 is applied.
 *
 * The compare/mask sequence selects, per word:
 *     a  where mm3 <= min(mm4, mm5),
 *     b  otherwise, where mm4 <= mm5,
 *     c  in the remaining case.
 * The selected value is added to src[i], masked with ff_pw_255 (defined
 * elsewhere; presumably 0x00FF in every word - confirm) and packed back to
 * bytes. The result stays in mm0 and serves as "left" for the next pass.
 *
 * Each iteration loads and stores 4 bytes (movd) but advances i by bpp; the
 * loop continues while i <= w - 3. The loads before the loop read at offset
 * -bpp from both dst and top.
 */
| #define PAETH(cpu, abs3)\ | |||
| static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ | |||
| {\ | |||
| x86_reg i = -bpp;\ | |||
| x86_reg end = w-3;\ | |||
| __asm__ volatile(\ | |||
| "pxor %%mm7, %%mm7 \n"\ | |||
| "movd (%1,%0), %%mm0 \n"\ | |||
| "movd (%2,%0), %%mm1 \n"\ | |||
| "punpcklbw %%mm7, %%mm0 \n"\ | |||
| "punpcklbw %%mm7, %%mm1 \n"\ | |||
| "add %4, %0 \n"\ | |||
| "1: \n"\ | |||
| "movq %%mm1, %%mm2 \n"\ | |||
| "movd (%2,%0), %%mm1 \n"\ | |||
| "movq %%mm2, %%mm3 \n"\ | |||
| "punpcklbw %%mm7, %%mm1 \n"\ | |||
| "movq %%mm2, %%mm4 \n"\ | |||
| "psubw %%mm1, %%mm3 \n"\ | |||
| "psubw %%mm0, %%mm4 \n"\ | |||
| "movq %%mm3, %%mm5 \n"\ | |||
| "paddw %%mm4, %%mm5 \n"\ | |||
| abs3\ | |||
| "movq %%mm4, %%mm6 \n"\ | |||
| "pminsw %%mm5, %%mm6 \n"\ | |||
| "pcmpgtw %%mm6, %%mm3 \n"\ | |||
| "pcmpgtw %%mm5, %%mm4 \n"\ | |||
| "movq %%mm4, %%mm6 \n"\ | |||
| "pand %%mm3, %%mm4 \n"\ | |||
| "pandn %%mm3, %%mm6 \n"\ | |||
| "pandn %%mm0, %%mm3 \n"\ | |||
| "movd (%3,%0), %%mm0 \n"\ | |||
| "pand %%mm1, %%mm6 \n"\ | |||
| "pand %%mm4, %%mm2 \n"\ | |||
| "punpcklbw %%mm7, %%mm0 \n"\ | |||
| "movq %6, %%mm5 \n"\ | |||
| "paddw %%mm6, %%mm0 \n"\ | |||
| "paddw %%mm2, %%mm3 \n"\ | |||
| "paddw %%mm3, %%mm0 \n"\ | |||
| "pand %%mm5, %%mm0 \n"\ | |||
| "movq %%mm0, %%mm3 \n"\ | |||
| "packuswb %%mm3, %%mm3 \n"\ | |||
| "movd %%mm3, (%1,%0) \n"\ | |||
| "add %4, %0 \n"\ | |||
| "cmp %5, %0 \n"\ | |||
| "jle 1b \n"\ | |||
| :"+r"(i)\ | |||
| :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ | |||
| "m"(ff_pw_255)\ | |||
| :"memory"\ | |||
| );\ | |||
| } | |||
/*
 * ABS3_MMX2: per-word absolute value of mm3, mm4 and mm5 without pabsw.
 * Each value x is replaced by max(x, 0 - x) using psubw + pmaxsw.
 * Relies on mm7 being zero on entry (it is used as the "0" for mm5),
 * uses mm6 and mm7 as scratch, and re-zeroes mm7 before returning control
 * to the surrounding asm.
 */
| #define ABS3_MMX2\ | |||
| "psubw %%mm5, %%mm7 \n"\ | |||
| "pmaxsw %%mm7, %%mm5 \n"\ | |||
| "pxor %%mm6, %%mm6 \n"\ | |||
| "pxor %%mm7, %%mm7 \n"\ | |||
| "psubw %%mm3, %%mm6 \n"\ | |||
| "psubw %%mm4, %%mm7 \n"\ | |||
| "pmaxsw %%mm6, %%mm3 \n"\ | |||
| "pmaxsw %%mm7, %%mm4 \n"\ | |||
| "pxor %%mm7, %%mm7 \n" | |||
/*
 * ABS3_SSSE3: per-word absolute value of mm3, mm4 and mm5 using pabsw.
 * No scratch registers are touched, so mm7 is left unchanged.
 */
| #define ABS3_SSSE3\ | |||
| "pabsw %%mm3, %%mm3 \n"\ | |||
| "pabsw %%mm4, %%mm4 \n"\ | |||
| "pabsw %%mm5, %%mm5 \n" | |||
/*
 * Instantiate the inline-asm Paeth functions:
 * add_png_paeth_prediction_mmx2 always, add_png_paeth_prediction_ssse3 only
 * when the build defines HAVE_SSSE3 to a non-zero value.
 */
| PAETH(mmx2, ABS3_MMX2) | |||
| #if HAVE_SSSE3 | |||
| PAETH(ssse3, ABS3_SSSE3) | |||
| #endif | |||
| static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w) | |||
| { | |||
| x86_reg i=0; | |||
| __asm__ volatile( | |||
| "jmp 2f \n\t" | |||
| "1: \n\t" | |||
| "movq (%2, %0), %%mm0 \n\t" | |||
| "movq 8(%2, %0), %%mm1 \n\t" | |||
| "paddb (%3, %0), %%mm0 \n\t" | |||
| "paddb 8(%3, %0), %%mm1 \n\t" | |||
| "movq %%mm0, (%1, %0) \n\t" | |||
| "movq %%mm1, 8(%1, %0) \n\t" | |||
| "add $16, %0 \n\t" | |||
| "2: \n\t" | |||
| "cmp %4, %0 \n\t" | |||
| " js 1b \n\t" | |||
| : "+r" (i) | |||
| : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg) w - 15) | |||
| ); | |||
| for (; i < w; i++) | |||
| dst[i] = src1[i] + src2[i]; | |||
| } | |||
/*
 * Prototypes of routines with external linkage that are not defined in this
 * C file; presumably they are the versions assembled from x86/pngdsp.asm
 * (confirm against the build). Their argument lists mirror the static
 * inline-asm functions defined above.
 */
| void ff_add_png_paeth_prediction_mmx2 (uint8_t *dst, uint8_t *src, | |||
| uint8_t *top, int w, int bpp); | |||
| void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src, | |||
| uint8_t *top, int w, int bpp); | |||
| void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1, | |||
| uint8_t *src2, int w); | |||
| void ff_pngdsp_init_x86(PNGDSPContext *dsp) | |||
| { | |||
| #if HAVE_YASM | |||
| int flags = av_get_cpu_flags(); | |||
| if (flags & AV_CPU_FLAG_MMX) | |||
| dsp->add_bytes_l2 = add_bytes_l2_mmx; | |||
| dsp->add_bytes_l2 = ff_add_bytes_l2_mmx; | |||
| if (flags & AV_CPU_FLAG_MMX2) | |||
| dsp->add_paeth_prediction = add_png_paeth_prediction_mmx2; | |||
| if (HAVE_SSSE3 && flags & AV_CPU_FLAG_SSSE3) | |||
| dsp->add_paeth_prediction = add_png_paeth_prediction_ssse3; | |||
| dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmx2; | |||
| if (flags & AV_CPU_FLAG_SSSE3) | |||
| dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3; | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,142 @@ | |||
| ;****************************************************************************** | |||
| ;* x86 optimizations for PNG decoding | |||
| ;* | |||
| ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> | |||
| ;* | |||
| ;* This file is part of Libav. | |||
| ;* | |||
| ;* Libav is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* Libav is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with Libav; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %include "x86inc.asm" | |||
| %include "x86util.asm" | |||
| SECTION_RODATA | |||
| cextern pw_255 | |||
| section .text align=16 | |||
; ADD_BYTES_FN: emits add_bytes_l2(dst, src1, src2, w), which stores
; src1[k] + src2[k] (byte-wise, paddb) into dst[k] for 0 <= k < w.
; Named registers: dst, src1, src2, wa (incoming width), w, i.
;   wa - width rounded down to a multiple of 2*mmsize: end of the vector loop
;   w  - copy of the full width: end of the scalar loop
;   i  - running byte offset, starts at 0
| ; %1 = nr. of xmm registers used | |||
| %macro ADD_BYTES_FN 1 | |||
| cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i | |||
; the width argument is a 32-bit int: sign-extend it on 64-bit targets
| %if ARCH_X86_64 | |||
| movsxd waq, wad | |||
| %endif | |||
| xor iq, iq | |||
| ; vector loop | |||
| mov wq, waq | |||
| and waq, ~(mmsize*2-1) | |||
| jmp .end_v | |||
; two registers (2*mmsize bytes) per iteration
| .loop_v: | |||
| mova m0, [src1q+iq] | |||
| mova m1, [src1q+iq+mmsize] | |||
| paddb m0, [src2q+iq] | |||
| paddb m1, [src2q+iq+mmsize] | |||
| mova [dstq+iq ], m0 | |||
| mova [dstq+iq+mmsize], m1 | |||
| add iq, mmsize*2 | |||
| .end_v: | |||
| cmp iq, waq | |||
| jl .loop_v | |||
| ; scalar loop for leftover | |||
| jmp .end_s | |||
; wa is no longer needed as a bound here, so its low byte is the scratch
| .loop_s: | |||
| mov wab, [src1q+iq] | |||
| add wab, [src2q+iq] | |||
| mov [dstq+iq], wab | |||
| inc iq | |||
| .end_s: | |||
| cmp iq, wq | |||
| jl .loop_s | |||
| REP_RET | |||
| %endmacro | |||
; Instantiate add_bytes_l2 for MMX, passing 0 as the xmm register count.
; The C side declares it as ff_add_bytes_l2_mmx; the prefix/suffix are
; presumably applied by x86inc's cglobal - confirm.
| INIT_MMX mmx | |||
| ADD_BYTES_FN 0 | |||
; ADD_PAETH_PRED_FN: emits add_png_paeth_prediction(dst, src, top, w, bpp).
; %1 is forwarded to cglobal as its fourth argument.
; Named registers: dst, src, top, w, bpp, end, cntr. cntr is declared but
; not referenced anywhere in this macro body.
;
; Register roles inside the loop (bytes zero-extended to words against m7,
; which is kept at zero):
;   m0 = previous output at dst - bpp      (a, "left")
;   m1 = top at the current position       (b, "above")
;   m2 = top at the previous position      (c, "upper-left")
;   m3 = |c - b|, m4 = |c - a|, m5 = |2c - a - b|
; Per word the code selects a where m3 <= min(m4, m5), otherwise b where
; m4 <= m5, otherwise c, adds it to the src value and masks with pw_255.
| %macro ADD_PAETH_PRED_FN 1 | |||
| cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr | |||
; bpp and w are 32-bit ints: sign-extend them on 64-bit targets
| %if ARCH_X86_64 | |||
| movsxd bppq, bppd | |||
| movsxd wq, wd | |||
| %endif | |||
; end = dst + w - (mmsize/2 - 1); top and src become offsets relative to
; dst, so dst alone serves as the running pointer
| lea endq, [dstq+wq-(mmsize/2-1)] | |||
| sub topq, dstq | |||
| sub srcq, dstq | |||
| sub dstq, bppq | |||
| pxor m7, m7 | |||
; prime m0/m1 from one pixel (bpp bytes) before the start of dst and top
| movh m0, [dstq] | |||
| movh m1, [topq+dstq] | |||
| punpcklbw m0, m7 | |||
| punpcklbw m1, m7 | |||
| add dstq, bppq | |||
| .loop: | |||
| mova m2, m1 | |||
| movh m1, [topq+dstq] | |||
| mova m3, m2 | |||
| punpcklbw m1, m7 | |||
| mova m4, m2 | |||
| psubw m3, m1 | |||
| psubw m4, m0 | |||
| mova m5, m3 | |||
| paddw m5, m4 | |||
; absolute values of m3, m4, m5
| %if cpuflag(ssse3) | |||
| pabsw m3, m3 | |||
| pabsw m4, m4 | |||
| pabsw m5, m5 | |||
| %else ; !cpuflag(ssse3) | |||
; |x| = max(x, 0 - x); m6/m7 are scratch and m7 is re-zeroed at the end
| psubw m7, m5 | |||
| pmaxsw m5, m7 | |||
| pxor m6, m6 | |||
| pxor m7, m7 | |||
| psubw m6, m3 | |||
| psubw m7, m4 | |||
| pmaxsw m3, m6 | |||
| pmaxsw m4, m7 | |||
| pxor m7, m7 | |||
| %endif ; cpuflag(ssse3) | |||
; build the selection masks and combine the chosen predictor with src
| mova m6, m4 | |||
| pminsw m6, m5 | |||
| pcmpgtw m3, m6 | |||
| pcmpgtw m4, m5 | |||
| mova m6, m4 | |||
| pand m4, m3 | |||
| pandn m6, m3 | |||
| pandn m3, m0 | |||
| movh m0, [srcq+dstq] | |||
| pand m6, m1 | |||
| pand m2, m4 | |||
| punpcklbw m0, m7 | |||
| paddw m0, m6 | |||
| paddw m3, m2 | |||
| paddw m0, m3 | |||
| pand m0, [pw_255] | |||
| mova m3, m0 | |||
| packuswb m3, m3 | |||
| movh [dstq], m3 | |||
; advance by bpp and continue while dst <= end (signed comparison)
| add dstq, bppq | |||
| cmp dstq, endq | |||
| jle .loop | |||
| REP_RET | |||
| %endmacro | |||
; Instantiate the Paeth predictor twice on MMX registers: once with the
; mmx2 flag set (psubw/pmaxsw absolute value) and once with ssse3 (pabsw).
; The C side declares them as ff_add_png_paeth_prediction_mmx2 and
; ff_add_png_paeth_prediction_ssse3.
| INIT_MMX mmx2 | |||
| ADD_PAETH_PRED_FN 0 | |||
| INIT_MMX ssse3 | |||
| ADD_PAETH_PRED_FN 0 | |||