vf_psnr: sse2 optimizations for sum-squared-error.

The internal line accumulator for 16bit can overflow, so I changed that from int to uint64_t in the C code. The matching assembly looks a little weird but output looks correct. (avx2 should be trivial to add later.) Reviewed-by: Paul B Mahol <onemda@gmail.com> Reviewed-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
10 years ago · ae4c9ddebc
--- a/libavfilter/psnr.h
+++ b/libavfilter/psnr.h
@@ -0,0 +1,33 @@
 /*
 * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #ifndef LIBAVFILTER_PSNR_H
 #define LIBAVFILTER_PSNR_H
 #include <stddef.h>
 #include <stdint.h>
 /**
  * Table of line-level kernels used by the PSNR filter.
  * The filter installs a C implementation and may then let
  * ff_psnr_init_x86() override it with a SIMD version.
  */
 typedef struct PSNRDSPContext {
    /**
     * Sum of squared differences between one line of two images.
     *
     * @param buf start of the line in the first image
     * @param ref start of the line in the second image
     * @param w   number of samples (not bytes) in the line; for bit depths
     *            above 8 the buffers are read as 16-bit samples
     * @return the sum over the line of (buf[i] - ref[i])^2
     */
    uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w);
 } PSNRDSPContext;
 /**
  * Replace entries of dsp with x86-optimized versions where the running
  * CPU and the bit depth allow it; entries are left untouched otherwise.
  *
  * @param dsp function table, already filled with the C implementations
  * @param bpp bit depth of one sample
  */
 void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp);
 #endif /* LIBAVFILTER_PSNR_H */
--- a/libavfilter/vf_psnr.c
+++ b/libavfilter/vf_psnr.c
@@ -33,6 +33,7 @@
 #include "drawutils.h"
 #include "formats.h"
 #include "internal.h"
 #include "psnr.h"
 #include "video.h"
 typedef struct PSNRContext {
@@ -50,11 +51,7 @@ typedef struct PSNRContext {
    int planewidth[4];
    int planeheight[4];
    double planeweight[4];
    void (*compute_mse)(struct PSNRContext *s,
                        const uint8_t *m[4], const int ml[4],
                        const uint8_t *r[4], const int rl[4],
                        int w, int h, double mse[4]);
    PSNRDSPContext dsp;
 } PSNRContext;
 #define OFFSET(x) offsetof(PSNRContext, x)
@@ -78,55 +75,48 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max)
    return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0);
 }
 static inline
 void compute_images_mse(PSNRContext *s,
                        const uint8_t *main_data[4], const int main_linesizes[4],
                        const uint8_t *ref_data[4], const int ref_linesizes[4],
                        int w, int h, double mse[4])
 static uint64_t sse_line_8bit(const uint8_t *main_line,  const uint8_t *ref_line, int outw)
 {
    int i, c, j;
    int j;
    unsigned m2 = 0;
    for (c = 0; c < s->nb_components; c++) {
        const int outw = s->planewidth[c];
        const int outh = s->planeheight[c];
        const uint8_t *main_line = main_data[c];
        const uint8_t *ref_line = ref_data[c];
        const int ref_linesize = ref_linesizes[c];
        const int main_linesize = main_linesizes[c];
        uint64_t m = 0;
    for (j = 0; j < outw; j++)
        m2 += pow2(main_line[j] - ref_line[j]);
        for (i = 0; i < outh; i++) {
            int m2 = 0;
            for (j = 0; j < outw; j++)
                m2 += pow2(main_line[j] - ref_line[j]);
            m += m2;
            ref_line += ref_linesize;
            main_line += main_linesize;
        }
        mse[c] = m / (double)(outw * outh);
    }
    return m2;
 }
 /**
  * Sum of squared differences over one line of 16-bit samples.
  *
  * Both byte pointers are reinterpreted as arrays of uint16_t. The running
  * total is kept in 64 bits, since a narrower per-line accumulator can
  * overflow for 16-bit input.
  *
  * @param _main_line start of the line in the main image
  * @param _ref_line  start of the line in the reference image
  * @param outw       number of 16-bit samples in the line
  * @return sum of pow2(main - ref) over the line
  */
 static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw)
 {
    const uint16_t *src = (const uint16_t *) _main_line;
    const uint16_t *cmp = (const uint16_t *) _ref_line;
    uint64_t sum = 0;
    int x = 0;
    while (x < outw) {
        sum += pow2(src[x] - cmp[x]);
        x++;
    }
    return sum;
 }
 static inline
 void compute_images_mse_16bit(PSNRContext *s,
 void compute_images_mse(PSNRContext *s,
                        const uint8_t *main_data[4], const int main_linesizes[4],
                        const uint8_t *ref_data[4], const int ref_linesizes[4],
                        int w, int h, double mse[4])
 {
    int i, c, j;
    int i, c;
    for (c = 0; c < s->nb_components; c++) {
        const int outw = s->planewidth[c];
        const int outh = s->planeheight[c];
        const uint16_t *main_line = (uint16_t *)main_data[c];
        const uint16_t *ref_line = (uint16_t *)ref_data[c];
        const int ref_linesize = ref_linesizes[c] / 2;
        const int main_linesize = main_linesizes[c] / 2;
        const uint8_t *main_line = main_data[c];
        const uint8_t *ref_line = ref_data[c];
        const int ref_linesize = ref_linesizes[c];
        const int main_linesize = main_linesizes[c];
        uint64_t m = 0;
        for (i = 0; i < outh; i++) {
            for (j = 0; j < outw; j++)
                m += pow2(main_line[j] - ref_line[j]);
            m += s->dsp.sse_line(main_line, ref_line, outw);
            ref_line += ref_linesize;
            main_line += main_linesize;
        }
@@ -155,9 +145,9 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main,
    int j, c;
    AVDictionary **metadata = avpriv_frame_get_metadatap(main);
    s->compute_mse(s, (const uint8_t **)main->data, main->linesize,
                      (const uint8_t **)ref->data, ref->linesize,
                       main->width, main->height, comp_mse);
    compute_images_mse(s, (const uint8_t **)main->data, main->linesize,
                          (const uint8_t **)ref->data, ref->linesize,
                          main->width, main->height, comp_mse);
    for (j = 0; j < s->nb_components; j++)
        mse += comp_mse[j] * s->planeweight[j];
@@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink)
        s->average_max += s->max[j] * s->planeweight[j];
    }
    s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse;
    s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit;
    if (ARCH_X86)
        ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1);
    return 0;
 }
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_interlace_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
 OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
@@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o
 YASM-OBJS-$(CONFIG_INTERLACE_FILTER)         += x86/vf_interlace.o
 YASM-OBJS-$(CONFIG_PP7_FILTER)               += x86/vf_pp7.o
 YASM-OBJS-$(CONFIG_PSNR_FILTER)              += x86/vf_psnr.o
 YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
 YASM-OBJS-$(CONFIG_TINTERLACE_FILTER)        += x86/vf_interlace.o
--- a/libavfilter/x86/vf_psnr.asm
+++ b/libavfilter/x86/vf_psnr.asm
@@ -0,0 +1,139 @@
 ;*****************************************************************************
 ;* x86-optimized functions for psnr filter
 ;*
 ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION .text
 ; SSE_LINE_FN <bits>, <size keyword>
 ; Emits sse_line_<bits>bit: returns the sum of squared differences between
 ; w samples at buf and w samples at ref.
 ;   %1 = sample width in bits (8 or 16)
 ;   %2 = matching memory operand size for the scalar tail (byte or word)
 ; Register roles inside the body: m6 = all-zero constant, m7 = running sum
 ; (32-bit lanes for %1 == 8, 64-bit lanes for %1 == 16).
 ; NOTE(review): the 16-bit path squares differences with pmaddwd, which works
 ; on signed words, so it presumably requires the differences to fit in a
 ; signed 16-bit value - confirm against the bit-depth check in the C init code.
 %macro SSE_LINE_FN 2 ; 8 or 16, byte or word
 INIT_XMM sse2
 %if ARCH_X86_32
 ; On x86-32 no arguments are auto-loaded (first count is 0); they are fetched
 ; from the stack below. "res"/"reshigh" are named first, presumably so that
 ; they occupy the registers later written directly as eax/edx - TODO confirm
 ; against the x86inc register order.
 %if %1 == 8
 cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
 %else
 cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
 %endif
    mov       bufq, r0mp
    mov       refq, r1mp
    mov         wd, r2m
 %else
 cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
 %endif
    ; m6 = 0 (used for zero-extension), m7 = accumulator
    pxor        m6, m6
    pxor        m7, m7
    ; each vector iteration consumes mmsize*2 samples; skip the loop entirely
    ; when fewer than that are available
    sub         wd, mmsize*2
    jl .end
 .loop:
    movu        m0, [bufq+mmsize*0]
    movu        m1, [bufq+mmsize*1]
    movu        m2, [refq+mmsize*0]
    movu        m3, [refq+mmsize*1]
 %if %1 == 8
    add       bufq, mmsize*2
    add       refq, mmsize*2
    ; |buf - ref| per byte: saturating subtract in both directions, then OR
    psubusb     m4, m0, m2
    psubusb     m5, m1, m3
    psubusb     m2, m0
    psubusb     m3, m1
    por         m2, m4
    por         m3, m5
    ; widen the absolute differences from bytes to words using the zero in m6
    punpcklbw   m0, m2, m6
    punpcklbw   m1, m3, m6
    punpckhbw   m2, m6
    punpckhbw   m3, m6
 %else
    ; word differences for the first two vectors, then load and subtract two
    ; more so that four vectors of words are processed per iteration
    psubw       m0, m2
    psubw       m1, m3
    movu        m2, [bufq+mmsize*2]
    movu        m3, [bufq+mmsize*3]
    movu        m4, [refq+mmsize*2]
    movu        m5, [refq+mmsize*3]
    psubw       m2, m4
    psubw       m3, m5
    add       bufq, mmsize*4
    add       refq, mmsize*4
 %endif
    ; square each word and add adjacent pairs into dwords
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m0, m1
    paddd       m2, m3
 %if %1 == 8
    ; 8-bit: accumulate in 32-bit lanes
    paddd       m7, m0
    paddd       m7, m2
 %else
    ; 16-bit: zero-extend the dword partial sums to qwords before adding
    ; them to the 64-bit accumulator lanes
    paddd       m0, m2
    punpckldq   m2, m0, m6
    punpckhdq   m0, m6
    paddq       m7, m0
    paddq       m7, m2
 %endif
    sub         wd, mmsize*2
    jge .loop
 .end:
    ; undo the last subtraction: wd = number of samples left for the scalar tail
    add         wd, mmsize*2
    ; horizontal reduction of m7 into the return register(s)
    movhlps     m0, m7
 %if %1 == 8
    paddd       m7, m0
    pshufd      m0, m7, 1
    paddd       m7, m0
    movd       eax, m7
 %else
    paddq       m7, m0
 %if ARCH_X86_32
    ; split the 64-bit total into edx:eax
    movd       eax, m7
    psrldq      m7, 4
    movd       edx, m7
 %else
    movq       rax, m7
 %endif
 %endif
    ; deal with cases where w % 32 != 0
    test        wd, wd
    jz .end_scalar
 .loop_scalar:
    ; handle the remaining samples one at a time, from index w-1 down to 0
    ; relative to the current buf/ref pointers
    movzx     px1d, %2 [bufq+wq*(%1/8)-(%1/8)]
    movzx     px2d, %2 [refq+wq*(%1/8)-(%1/8)]
    sub       px1d, px2d
    imul      px1d, px1d
 %if %1 == 8
    add        eax, px1d
 %elif ARCH_X86_64
    add        rax, px1q
 %else
    ; 64-bit add on x86-32: propagate the carry into the high half
    add        eax, px1d
    adc        edx, 0
 %endif
    dec         wd
    jg .loop_scalar
 .end_scalar:
    ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
    RET
 %endmacro
 ; Instantiate the SSE2 kernels: one for 8-bit (byte) samples and one for
 ; 16-bit (word) samples.
 INIT_XMM sse2
 SSE_LINE_FN  8, byte
 SSE_LINE_FN 16, word
--- a/libavfilter/x86/vf_psnr_init.c
+++ b/libavfilter/x86/vf_psnr_init.c
@@ -0,0 +1,39 @@
 /*
 * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/psnr.h"
 /* SSE2 line kernels with the same signature as PSNRDSPContext.sse_line;
  * presumably the functions emitted by SSE_LINE_FN in x86/vf_psnr.asm. */
 uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
 uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
 /**
  * Select x86 SIMD implementations for the PSNR DSP table.
  *
  * Nothing is changed unless the CPU reports external SSE2 support.
  * With SSE2, depths up to 8 bits get the 8-bit kernel and depths from
  * 9 to 15 bits get the 16-bit kernel; any deeper format keeps whatever
  * dsp->sse_line already holds.
  * NOTE(review): the 15-bit ceiling presumably exists because the 16-bit
  * kernel multiplies signed words - confirm before raising it.
  *
  * @param dsp function table to update in place
  * @param bpp bit depth of one sample
  */
 void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp)
 {
    const int flags = av_get_cpu_flags();
    if (!EXTERNAL_SSE2(flags))
        return;
    if (bpp <= 8)
        dsp->sse_line = ff_sse_line_8bit_sse2;
    else if (bpp <= 15)
        dsp->sse_line = ff_sse_line_16bit_sse2;
 }