The internal line accumulator for 16-bit input can overflow, so I changed it from int to uint64_t in the C code. The matching assembly looks a little weird, but its output looks correct. (AVX2 should be trivial to add later.) Reviewed-by: Paul B Mahol <onemda@gmail.com> Reviewed-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> tags/n2.8
| @@ -0,0 +1,33 @@ | |||||
| /* | |||||
| * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #ifndef LIBAVFILTER_PSNR_H | |||||
| #define LIBAVFILTER_PSNR_H | |||||
| #include <stddef.h> | |||||
| #include <stdint.h> | |||||
| typedef struct PSNRDSPContext { /* table of per-line kernels used by the PSNR filter */ | |||||
| uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w); /* sum of squared differences between w samples of buf and ref; the sample width depends on which kernel is installed */ | |||||
| } PSNRDSPContext; | |||||
| void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp); /* replaces dsp->sse_line with an x86 SIMD kernel when the CPU supports one; bpp is the component bit depth */ | |||||
| #endif /* LIBAVFILTER_PSNR_H */ | |||||
| @@ -33,6 +33,7 @@ | |||||
| #include "drawutils.h" | #include "drawutils.h" | ||||
| #include "formats.h" | #include "formats.h" | ||||
| #include "internal.h" | #include "internal.h" | ||||
| #include "psnr.h" | |||||
| #include "video.h" | #include "video.h" | ||||
| typedef struct PSNRContext { | typedef struct PSNRContext { | ||||
| @@ -50,11 +51,7 @@ typedef struct PSNRContext { | |||||
| int planewidth[4]; | int planewidth[4]; | ||||
| int planeheight[4]; | int planeheight[4]; | ||||
| double planeweight[4]; | double planeweight[4]; | ||||
| void (*compute_mse)(struct PSNRContext *s, | |||||
| const uint8_t *m[4], const int ml[4], | |||||
| const uint8_t *r[4], const int rl[4], | |||||
| int w, int h, double mse[4]); | |||||
| PSNRDSPContext dsp; | |||||
| } PSNRContext; | } PSNRContext; | ||||
| #define OFFSET(x) offsetof(PSNRContext, x) | #define OFFSET(x) offsetof(PSNRContext, x) | ||||
| @@ -78,55 +75,48 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max) | |||||
| return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0); | return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0); | ||||
| } | } | ||||
| static inline | |||||
| void compute_images_mse(PSNRContext *s, | |||||
| const uint8_t *main_data[4], const int main_linesizes[4], | |||||
| const uint8_t *ref_data[4], const int ref_linesizes[4], | |||||
| int w, int h, double mse[4]) | |||||
| static uint64_t sse_line_8bit(const uint8_t *main_line, const uint8_t *ref_line, int outw) | |||||
| { | { | ||||
| int i, c, j; | |||||
| int j; | |||||
| unsigned m2 = 0; | |||||
| for (c = 0; c < s->nb_components; c++) { | |||||
| const int outw = s->planewidth[c]; | |||||
| const int outh = s->planeheight[c]; | |||||
| const uint8_t *main_line = main_data[c]; | |||||
| const uint8_t *ref_line = ref_data[c]; | |||||
| const int ref_linesize = ref_linesizes[c]; | |||||
| const int main_linesize = main_linesizes[c]; | |||||
| uint64_t m = 0; | |||||
| for (j = 0; j < outw; j++) | |||||
| m2 += pow2(main_line[j] - ref_line[j]); | |||||
| for (i = 0; i < outh; i++) { | |||||
| int m2 = 0; | |||||
| for (j = 0; j < outw; j++) | |||||
| m2 += pow2(main_line[j] - ref_line[j]); | |||||
| m += m2; | |||||
| ref_line += ref_linesize; | |||||
| main_line += main_linesize; | |||||
| } | |||||
| mse[c] = m / (double)(outw * outh); | |||||
| } | |||||
| return m2; | |||||
| } | |||||
| static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw) /* C kernel: squared-difference total over one line of outw 16-bit samples */ | |||||
| { | |||||
| int j; | |||||
| uint64_t m2 = 0; /* 64-bit line total; a narrower accumulator could overflow for 16-bit input */ | |||||
| const uint16_t *main_line = (const uint16_t *) _main_line; /* the byte pointers are read as 16-bit samples */ | |||||
| const uint16_t *ref_line = (const uint16_t *) _ref_line; | |||||
| for (j = 0; j < outw; j++) | |||||
| m2 += pow2(main_line[j] - ref_line[j]); /* NOTE(review): pow2() is defined outside this hunk; presumably it squares its argument -- confirm */ | |||||
| return m2; | |||||
| } | } | ||||
| static inline | static inline | ||||
| void compute_images_mse_16bit(PSNRContext *s, | |||||
| void compute_images_mse(PSNRContext *s, | |||||
| const uint8_t *main_data[4], const int main_linesizes[4], | const uint8_t *main_data[4], const int main_linesizes[4], | ||||
| const uint8_t *ref_data[4], const int ref_linesizes[4], | const uint8_t *ref_data[4], const int ref_linesizes[4], | ||||
| int w, int h, double mse[4]) | int w, int h, double mse[4]) | ||||
| { | { | ||||
| int i, c, j; | |||||
| int i, c; | |||||
| for (c = 0; c < s->nb_components; c++) { | for (c = 0; c < s->nb_components; c++) { | ||||
| const int outw = s->planewidth[c]; | const int outw = s->planewidth[c]; | ||||
| const int outh = s->planeheight[c]; | const int outh = s->planeheight[c]; | ||||
| const uint16_t *main_line = (uint16_t *)main_data[c]; | |||||
| const uint16_t *ref_line = (uint16_t *)ref_data[c]; | |||||
| const int ref_linesize = ref_linesizes[c] / 2; | |||||
| const int main_linesize = main_linesizes[c] / 2; | |||||
| const uint8_t *main_line = main_data[c]; | |||||
| const uint8_t *ref_line = ref_data[c]; | |||||
| const int ref_linesize = ref_linesizes[c]; | |||||
| const int main_linesize = main_linesizes[c]; | |||||
| uint64_t m = 0; | uint64_t m = 0; | ||||
| for (i = 0; i < outh; i++) { | for (i = 0; i < outh; i++) { | ||||
| for (j = 0; j < outw; j++) | |||||
| m += pow2(main_line[j] - ref_line[j]); | |||||
| m += s->dsp.sse_line(main_line, ref_line, outw); | |||||
| ref_line += ref_linesize; | ref_line += ref_linesize; | ||||
| main_line += main_linesize; | main_line += main_linesize; | ||||
| } | } | ||||
| @@ -155,9 +145,9 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main, | |||||
| int j, c; | int j, c; | ||||
| AVDictionary **metadata = avpriv_frame_get_metadatap(main); | AVDictionary **metadata = avpriv_frame_get_metadatap(main); | ||||
| s->compute_mse(s, (const uint8_t **)main->data, main->linesize, | |||||
| (const uint8_t **)ref->data, ref->linesize, | |||||
| main->width, main->height, comp_mse); | |||||
| compute_images_mse(s, (const uint8_t **)main->data, main->linesize, | |||||
| (const uint8_t **)ref->data, ref->linesize, | |||||
| main->width, main->height, comp_mse); | |||||
| for (j = 0; j < s->nb_components; j++) | for (j = 0; j < s->nb_components; j++) | ||||
| mse += comp_mse[j] * s->planeweight[j]; | mse += comp_mse[j] * s->planeweight[j]; | ||||
| @@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink) | |||||
| s->average_max += s->max[j] * s->planeweight[j]; | s->average_max += s->max[j] * s->planeweight[j]; | ||||
| } | } | ||||
| s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse; | |||||
| s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit; | |||||
| if (ARCH_X86) | |||||
| ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1); | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o | |||||
| OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o | OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o | ||||
| OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o | OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o | ||||
| OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o | OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o | ||||
| OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o | |||||
| OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o | OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o | ||||
| OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o | OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o | ||||
| OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o | OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o | ||||
| @@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o | |||||
| YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o | YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o | ||||
| YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o | YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o | ||||
| YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o | YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o | ||||
| YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o | |||||
| YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o | YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o | ||||
| YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o | YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o | ||||
| YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o | YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o | ||||
| @@ -0,0 +1,139 @@ | |||||
| ;***************************************************************************** | |||||
| ;* x86-optimized functions for psnr filter | |||||
| ;* | |||||
| ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> | |||||
| ;* | |||||
| ;* This file is part of FFmpeg. | |||||
| ;* | |||||
| ;* FFmpeg is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with FFmpeg; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "libavutil/x86/x86util.asm" | |||||
| SECTION .text | |||||
| ; SSE_LINE_FN <bits>, <operand size>: emits sse_line_<bits>bit, which returns the sum of | |||||
| ; squared differences between w samples of buf and ref. A SIMD loop handles mmsize*2 | |||||
| ; samples per iteration; a scalar loop handles whatever is left over. | |||||
| %macro SSE_LINE_FN 2 ; 8 or 16, byte or word | |||||
| INIT_XMM sse2 | |||||
| %if ARCH_X86_32 | |||||
| ; x86-32: the arguments are fetched by hand from r0mp/r1mp/r2m below. | |||||
| ; NOTE(review): the register names look ordered so that the result ends up in eax | |||||
| ; (and edx for the 64-bit case) -- confirm against the x86inc register order. | |||||
| %if %1 == 8 | |||||
| cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref | |||||
| %else | |||||
| cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref | |||||
| %endif | |||||
| mov bufq, r0mp | |||||
| mov refq, r1mp | |||||
| mov wd, r2m | |||||
| %else | |||||
| cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2 | |||||
| %endif | |||||
| ; m6 = 0 (zero source for the unpack instructions), m7 = running sum | |||||
| pxor m6, m6 | |||||
| pxor m7, m7 | |||||
| ; bias w by one SIMD step; skip the loop when fewer than mmsize*2 samples are available | |||||
| sub wd, mmsize*2 | |||||
| jl .end | |||||
| .loop: | |||||
| movu m0, [bufq+mmsize*0] | |||||
| movu m1, [bufq+mmsize*1] | |||||
| movu m2, [refq+mmsize*0] | |||||
| movu m3, [refq+mmsize*1] | |||||
| %if %1 == 8 | |||||
| add bufq, mmsize*2 | |||||
| add refq, mmsize*2 | |||||
| ; |buf - ref| per byte: unsigned saturating subtraction in both directions, OR-ed together | |||||
| psubusb m4, m0, m2 | |||||
| psubusb m5, m1, m3 | |||||
| psubusb m2, m0 | |||||
| psubusb m3, m1 | |||||
| por m2, m4 | |||||
| por m3, m5 | |||||
| ; zero-extend the byte differences to words (m6 is zero) | |||||
| punpcklbw m0, m2, m6 | |||||
| punpcklbw m1, m3, m6 | |||||
| punpckhbw m2, m6 | |||||
| punpckhbw m3, m6 | |||||
| %else | |||||
| ; word differences; mmsize*2 16-bit samples span four vectors, so two more are loaded | |||||
| psubw m0, m2 | |||||
| psubw m1, m3 | |||||
| movu m2, [bufq+mmsize*2] | |||||
| movu m3, [bufq+mmsize*3] | |||||
| movu m4, [refq+mmsize*2] | |||||
| movu m5, [refq+mmsize*3] | |||||
| psubw m2, m4 | |||||
| psubw m3, m5 | |||||
| add bufq, mmsize*4 | |||||
| add refq, mmsize*4 | |||||
| %endif | |||||
| ; square every word difference and add adjacent pairs into dwords | |||||
| pmaddwd m0, m0 | |||||
| pmaddwd m1, m1 | |||||
| pmaddwd m2, m2 | |||||
| pmaddwd m3, m3 | |||||
| paddd m0, m1 | |||||
| paddd m2, m3 | |||||
| %if %1 == 8 | |||||
| ; 8-bit: accumulate in 32-bit lanes | |||||
| paddd m7, m0 | |||||
| paddd m7, m2 | |||||
| %else | |||||
| ; 16-bit: widen the dword sums to qwords (interleave with zero) and accumulate in 64-bit lanes | |||||
| paddd m0, m2 | |||||
| punpckldq m2, m0, m6 | |||||
| punpckhdq m0, m6 | |||||
| paddq m7, m0 | |||||
| paddq m7, m2 | |||||
| %endif | |||||
| sub wd, mmsize*2 | |||||
| jge .loop | |||||
| .end: | |||||
| ; undo the bias so w holds the number of leftover samples, then sum the accumulator lanes | |||||
| add wd, mmsize*2 | |||||
| movhlps m0, m7 | |||||
| %if %1 == 8 | |||||
| paddd m7, m0 | |||||
| pshufd m0, m7, 1 | |||||
| paddd m7, m0 | |||||
| movd eax, m7 | |||||
| %else | |||||
| paddq m7, m0 | |||||
| %if ARCH_X86_32 | |||||
| ; x86-32: split the 64-bit total into eax (low half) and edx (high half) | |||||
| movd eax, m7 | |||||
| psrldq m7, 4 | |||||
| movd edx, m7 | |||||
| %else | |||||
| movq rax, m7 | |||||
| %endif | |||||
| %endif | |||||
| ; deal with cases where w % 32 != 0 | |||||
| test wd, wd | |||||
| jz .end_scalar | |||||
| .loop_scalar: | |||||
| ; leftover samples are visited from last to first, relative to the advanced buf/ref pointers | |||||
| movzx px1d, %2 [bufq+wq*(%1/8)-(%1/8)] | |||||
| movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)] | |||||
| sub px1d, px2d | |||||
| imul px1d, px1d | |||||
| %if %1 == 8 | |||||
| add eax, px1d | |||||
| %elif ARCH_X86_64 | |||||
| add rax, px1q | |||||
| %else | |||||
| ; x86-32, 16-bit: 64-bit addition through the carry flag | |||||
| add eax, px1d | |||||
| adc edx, 0 | |||||
| %endif | |||||
| dec wd | |||||
| jg .loop_scalar | |||||
| .end_scalar: | |||||
| ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero | |||||
| RET | |||||
| %endmacro | |||||
| INIT_XMM sse2 | |||||
| SSE_LINE_FN 8, byte | |||||
| SSE_LINE_FN 16, word | |||||
| @@ -0,0 +1,39 @@ | |||||
| /* | |||||
| * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "libavutil/x86/cpu.h" | |||||
| #include "libavfilter/psnr.h" | |||||
| uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); /* external SSE2 kernel, 8-bit samples */ | |||||
| uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); /* external SSE2 kernel, 16-bit sample storage */ | |||||
| void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp) /* installs an SSE2 sse_line kernel matching bpp, if the CPU supports SSE2 */ | |||||
| { | |||||
| int cpu_flags = av_get_cpu_flags(); | |||||
| if (EXTERNAL_SSE2(cpu_flags)) { | |||||
| if (bpp <= 8) { | |||||
| dsp->sse_line = ff_sse_line_8bit_sse2; | |||||
| } else if (bpp <= 15) { /* NOTE(review): bpp > 15 keeps whatever kernel the caller set; presumably the SSE2 word arithmetic needs differences that fit a signed 16-bit lane -- confirm */ | |||||
| dsp->sse_line = ff_sse_line_16bit_sse2; | |||||
| } | |||||
| } | |||||
| } | |||||