on x86_64:
time PSNR
plain 3.303 inf
SSE 1.649 107.087535
SSE3 1.632 107.087535
AVX 1.409 106.986771
FMA3 1.265 107.108437
on x86_32 (PSNR compared to x86_64 plain):
time PSNR
plain 7.225 103.951979
SSE 1.827 105.859282
SSE3 1.819 105.859282
AVX 1.533 105.997661
FMA3 1.384 105.885377
FMA4 results are not available (not tested)
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
tags/n3.1
| @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s) | |||||
| w *= sign * (1.0 / s->fft_len); | w *= sign * (1.0 / s->fft_len); | ||||
| s->coeffs[m].val[x - s->coeffs[m].start] = w; | s->coeffs[m].val[x - s->coeffs[m].start] = w; | ||||
| } | } | ||||
| if (s->permute_coeffs) | |||||
| s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len); | |||||
| } | } | ||||
| av_expr_free(expr); | av_expr_free(expr); | ||||
| @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink) | |||||
| s->cqt_align = 1; | s->cqt_align = 1; | ||||
| s->cqt_calc = cqt_calc; | s->cqt_calc = cqt_calc; | ||||
| s->permute_coeffs = NULL; | |||||
| s->draw_sono = draw_sono; | s->draw_sono = draw_sono; | ||||
| if (s->format == AV_PIX_FMT_RGB24) { | if (s->format == AV_PIX_FMT_RGB24) { | ||||
| s->draw_bar = draw_bar_rgb; | s->draw_bar = draw_bar_rgb; | ||||
| @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink) | |||||
| s->update_sono = update_sono_yuv; | s->update_sono = update_sono_yuv; | ||||
| } | } | ||||
| if (ARCH_X86) | |||||
| ff_showcqt_init_x86(s); | |||||
| if ((ret = init_cqt(s)) < 0) | if ((ret = init_cqt(s)) < 0) | ||||
| return ret; | return ret; | ||||
| @@ -74,6 +74,7 @@ typedef struct { | |||||
| /* callback */ | /* callback */ | ||||
| void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs, | void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs, | ||||
| int len, int fft_len); | int len, int fft_len); | ||||
| void (*permute_coeffs)(float *v, int len); | |||||
| void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h, | void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h, | ||||
| const ColorFloat *c, int bar_h); | const ColorFloat *c, int bar_h); | ||||
| void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off); | void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off); | ||||
| @@ -112,4 +113,6 @@ typedef struct { | |||||
| int axis; | int axis; | ||||
| } ShowCQTContext; | } ShowCQTContext; | ||||
| void ff_showcqt_init_x86(ShowCQTContext *s); | |||||
| #endif | #endif | ||||
| @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o | |||||
| OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o | OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o | ||||
| OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o | OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o | ||||
| OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o | OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o | ||||
| OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o | |||||
| OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o | OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o | ||||
| OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o | OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o | ||||
| OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o | OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o | ||||
| @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o | |||||
| ifdef CONFIG_GPL | ifdef CONFIG_GPL | ||||
| YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o | YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o | ||||
| endif | endif | ||||
| YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o | |||||
| YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o | YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o | ||||
| YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o | YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o | ||||
| YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o | YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o | ||||
| @@ -0,0 +1,206 @@ | |||||
| ;***************************************************************************** | |||||
| ;* x86-optimized functions for showcqt filter | |||||
| ;* | |||||
| ;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com> | |||||
| ;* | |||||
| ;* This file is part of FFmpeg. | |||||
| ;* | |||||
| ;* FFmpeg is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with FFmpeg; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "libavutil/x86/x86util.asm" | |||||
; `pointer` reserves one pointer-sized struc field: a qword (resq) on x86_64,
; a dword (resd) otherwise.
| %if ARCH_X86_64 | |||||
| %define pointer resq | |||||
| %else | |||||
| %define pointer resd | |||||
| %endif | |||||
; Field offsets of one Coeffs entry as addressed by the kernels below:
;   .val    pointer - coefficient vectors are loaded through it
;   .start  dword   - added to the running index before src is indexed
;   .len    dword   - upper bound of the per-entry coefficient loop
;   .sizeof         - size of one entry, used to step to the next entry
; NOTE(review): presumably mirrors the C `Coeffs` struct used by the
; cqt_calc callback - confirm the two layouts stay in sync.
| struc Coeffs | |||||
| .val: pointer 1 | |||||
| .start: resd 1 | |||||
| .len: resd 1 | |||||
| .sizeof: | |||||
| endstruc | |||||
; Horizontal pairwise add: %1 = [ %1[0]+%1[1], %1[2]+%1[3], %2[0]+%2[1], %2[2]+%2[3] ].
; With SSE3 this is a single haddps and %3 is left untouched.
; Without SSE3 the even-indexed elements (q2020) and the odd-indexed elements
; (q3131) of %1/%2 are gathered with two shufps and added; %3 is clobbered.
| %macro EMULATE_HADDPS 3 ; dst, src, tmp | |||||
| %if cpuflag(sse3) | |||||
| haddps %1, %2 | |||||
| %else | |||||
| movaps %3, %1 | |||||
| shufps %1, %2, q2020 | |||||
| shufps %3, %2, q3131 | |||||
| addps %1, %3 | |||||
| %endif | |||||
| %endmacro ; EMULATE_HADDPS | |||||
; Multiply-add: %1 = %2 * %3 + %4.
; Emits fmaddps when the fma3 or fma4 cpuflag is set; otherwise a mulps into
; %5 followed by an addps. %5 is written only on the fallback path.
; Callers in this file pass the same register for src1 (%2) and tmp (%5), so
; src1 must be treated as destroyed after the macro on non-FMA builds.
| %macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp | |||||
| %if cpuflag(fma3) || cpuflag(fma4) | |||||
| fmaddps %1, %2, %3, %4 | |||||
| %else | |||||
| mulps %5, %2, %3 | |||||
| addps %1, %4, %5 | |||||
| %endif | |||||
| %endmacro ; EMULATE_FMADDPS | |||||
; Multiply one vector of coefficients (m%8) with two slices of src and
; accumulate the products:
;   i = x + [coeffs + %9].start
;   a_re/a_im += coeffval * even/odd floats of the src elements starting at i
;   i = fft_len - i
;   b_re/b_im += coeffval * even/odd floats of the src elements ending at i,
;                taken in descending index order
; src is addressed with 8 bytes per element (two floats; named re/im below).
; Clobbers id/iq, m%5, m%6, m%7 and the flags.
| %macro CQT_CALC 9 | |||||
| ; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im | |||||
| ; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset | |||||
| mov id, xd | |||||
| add id, [coeffsq + Coeffs.start + %9] | |||||
; Forward slice: aligned loads, so src + 8*i must be mmsize-aligned.
| movaps m%5, [srcq + 8 * iq] | |||||
| movaps m%7, [srcq + 8 * iq + mmsize] | |||||
; De-interleave: q3131 collects the odd floats (im), q2020 the even ones (re).
; With ymm registers shufps works per 128-bit half, so the values come out in
; element order 0,1,4,5,2,3,6,7 rather than 0..7.
| shufps m%6, m%5, m%7, q3131 | |||||
| shufps m%5, m%5, m%7, q2020 | |||||
; The sub/neg pair computes i = fft_len - i; it is interleaved with the
; multiply-adds, which do not depend on it.
| sub id, fft_lend | |||||
| EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 | |||||
| neg id | |||||
| EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 | |||||
; Mirrored slice: unaligned loads of the two vectors whose last element is
; src[i].
| movups m%5, [srcq + 8 * iq - mmsize + 8] | |||||
| movups m%7, [srcq + 8 * iq - 2*mmsize + 8] | |||||
; For ymm, swap the 128-bit halves first so that the in-lane shuffles below
; yield the same element order as the forward slice.
| %if mmsize == 32 | |||||
| vperm2f128 m%5, m%5, m%5, 1 | |||||
| vperm2f128 m%7, m%7, m%7, 1 | |||||
| %endif | |||||
; q1313 / q0202 de-interleave im / re and reverse the order at the same time.
| shufps m%6, m%5, m%7, q1313 | |||||
| shufps m%5, m%5, m%7, q0202 | |||||
| EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 | |||||
| EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 | |||||
| %endmacro ; CQT_CALC | |||||
; Combine the four accumulators of one coefficient row and reduce them
; horizontally. On exit xmm%1 holds, in lane order:
;   sum(a_re + b_re), sum(a_im - b_im), sum(a_im + b_im), sum(b_re - a_re)
; m%2, m%5 and m%6 are overwritten; m%3 is additionally clobbered when haddps
; has to be emulated (no SSE3).
; NOTE(review): presumably this separates two real input channels that were
; packed into one complex FFT - confirm against the C reference cqt_calc.
| %macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2 | |||||
| addps m%5, m%4, m%2 | |||||
| subps m%6, m%3, m%1 | |||||
| addps m%1, m%3 | |||||
| subps m%2, m%4 | |||||
; Three pairwise adds fold every accumulator down to a single lane each.
| EMULATE_HADDPS m%5, m%6, m%3 | |||||
| EMULATE_HADDPS m%1, m%2, m%3 | |||||
| EMULATE_HADDPS m%1, m%5, m%2 | |||||
; haddps works per 128-bit half on ymm, so add the upper half onto the lower.
| %if mmsize == 32 | |||||
| vextractf128 xmm%2, m%1, 1 | |||||
| addps xmm%1, xmm%2 | |||||
| %endif | |||||
| %endmacro ; CQT_SEPARATE | |||||
; Emit one showcqt_cqt_calc kernel for the instruction set / register width
; selected by the preceding INIT_XMM / INIT_YMM.
;
; For every Coeffs entry the kernel accumulates coefficient * src products
; with CQT_CALC, reduces them with CQT_SEPARATE, squares the four results and
; adds them pairwise, storing two floats (8 bytes) per entry to dst.
;
; x86_64: two consecutive entries ("a" = entry k, "b" = entry k+1) are handled
;         per outer iteration; .loop_ab covers their common length and
;         .loop_a / .loop_b the remainder of the longer one.
; x86_32: one entry per outer iteration (only 8 vector registers are used).
;
; Coefficients are read with aligned loads in steps of mmsize/4 floats, so the
; .val arrays must be mmsize-aligned.
; NOTE(review): the code assumes .len is a multiple of mmsize/4 and, on
; x86_64, that len is a positive even number (the counter is decremented by 2
; and tested for zero only) - confirm with the caller.
| %macro DECLARE_CQT_CALC 0 | |||||
| ; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len) | |||||
| %if ARCH_X86_64 | |||||
| cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len | |||||
| align 16 | |||||
| .loop_k: | |||||
; Clear the accumulators of entry a (m0-m3) and entry b (m8-m11) while
; computing coeffs_len = min(a.len, b.len) (unsigned compare + cmova).
| mov xd, [coeffsq + Coeffs.len] | |||||
| xorps m0, m0 | |||||
| movaps m1, m0 | |||||
| movaps m2, m0 | |||||
| mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] | |||||
| movaps m3, m0 | |||||
| movaps m8, m0 | |||||
| cmp coeffs_lend, xd | |||||
| movaps m9, m0 | |||||
| movaps m10, m0 | |||||
| movaps m11, m0 | |||||
| cmova coeffs_lend, xd | |||||
| xor xd, xd | |||||
; Load both coefficient pointers BEFORE the zero-length branch: .loop_a and
; .loop_b are reachable through .check_loop_b and dereference these registers,
; so they must be valid even when the common part is empty. (Previously the
; loads sat after the jz and were skipped on that path.)
| mov coeffs_valq, [coeffsq + Coeffs.val] | |||||
| mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] | |||||
| test coeffs_lend, coeffs_lend | |||||
| jz .check_loop_b | |||||
| align 16 | |||||
; Common part: both entries still have coefficients at index x.
| .loop_ab: | |||||
| movaps m7, [coeffs_valq + 4 * xq] | |||||
| CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 | |||||
| movaps m7, [coeffs_val2q + 4 * xq] | |||||
| CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof | |||||
| add xd, mmsize/4 | |||||
| cmp xd, coeffs_lend | |||||
| jb .loop_ab | |||||
; Remainder of entry b, if it is the longer one.
| .check_loop_b: | |||||
| cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] | |||||
| jae .check_loop_a | |||||
| align 16 | |||||
| .loop_b: | |||||
| movaps m7, [coeffs_val2q + 4 * xq] | |||||
| CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof | |||||
| add xd, mmsize/4 | |||||
| cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] | |||||
| jb .loop_b | |||||
; Reduce both entries, square, add pairwise and store 4 floats (2 per entry).
| .loop_end: | |||||
| CQT_SEPARATE 0, 1, 2, 3, 4, 5 | |||||
| CQT_SEPARATE 8, 9, 10, 11, 4, 5 | |||||
| mulps xmm0, xmm0 | |||||
| mulps xmm8, xmm8 | |||||
| EMULATE_HADDPS xmm0, xmm8, xmm1 | |||||
| movaps [dstq], xmm0 | |||||
; The flags consumed by jnz come from this sub; the two lea in between leave
; them untouched.
| sub lend, 2 | |||||
| lea dstq, [dstq + 16] | |||||
| lea coeffsq, [coeffsq + 2*Coeffs.sizeof] | |||||
| jnz .loop_k | |||||
| REP_RET | |||||
| align 16 | |||||
; Remainder of entry a, if it is the longer one (kept out of line).
| .check_loop_a: | |||||
| cmp xd, [coeffsq + Coeffs.len] | |||||
| jae .loop_end | |||||
| align 16 | |||||
| .loop_a: | |||||
| movaps m7, [coeffs_valq + 4 * xq] | |||||
| CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 | |||||
| add xd, mmsize/4 | |||||
| cmp xd, [coeffsq + Coeffs.len] | |||||
| jb .loop_a | |||||
| jmp .loop_end | |||||
| %else | |||||
; Only four arguments are loaded into registers here; fft_len is read from
; its argument slot (r4m) each time CQT_CALC needs it.
| cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i | |||||
| %define fft_lend r4m | |||||
| align 16 | |||||
| .loop_k: | |||||
| mov xd, [coeffsq + Coeffs.len] | |||||
| xorps m0, m0 | |||||
| movaps m1, m0 | |||||
| movaps m2, m0 | |||||
| movaps m3, m0 | |||||
; An empty entry skips straight to the store; xmm0 is still zero there.
| test xd, xd | |||||
| jz .store | |||||
| mov coeffs_valq, [coeffsq + Coeffs.val] | |||||
| xor xd, xd | |||||
| align 16 | |||||
| .loop_x: | |||||
| movaps m7, [coeffs_valq + 4 * xq] | |||||
| CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 | |||||
| add xd, mmsize/4 | |||||
| cmp xd, [coeffsq + Coeffs.len] | |||||
| jb .loop_x | |||||
| CQT_SEPARATE 0, 1, 2, 3, 4, 5 | |||||
| mulps xmm0, xmm0 | |||||
| EMULATE_HADDPS xmm0, xmm0, xmm1 | |||||
; Store the low two floats (8 bytes) for this entry.
| .store: | |||||
| movlps [dstq], xmm0 | |||||
| sub lend, 1 | |||||
| lea dstq, [dstq + 8] | |||||
| lea coeffsq, [coeffsq + Coeffs.sizeof] | |||||
| jnz .loop_k | |||||
| REP_RET | |||||
| %endif ; ARCH_X86_64 | |||||
| %endmacro ; DECLARE_CQT_CALC | |||||
; Instantiate the kernel once per supported instruction set. Each INIT_* line
; selects the register width (XMM = 16 bytes, YMM = 32 bytes) and the cpu
; flags tested by cpuflag() inside the macros above. The C side declares the
; resulting symbols as ff_showcqt_cqt_calc_{sse,sse3,avx,fma3,fma4}.
| INIT_XMM sse | |||||
| DECLARE_CQT_CALC | |||||
| INIT_XMM sse3 | |||||
| DECLARE_CQT_CALC | |||||
| INIT_YMM avx | |||||
| DECLARE_CQT_CALC | |||||
| INIT_YMM fma3 | |||||
| DECLARE_CQT_CALC | |||||
; The FMA4 variant is built with XMM registers only.
| INIT_XMM fma4 | |||||
| DECLARE_CQT_CALC | |||||
| @@ -0,0 +1,63 @@ | |||||
| /* | |||||
| * Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "libavutil/attributes.h" | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavutil/x86/cpu.h" | |||||
| #include "libavfilter/avf_showcqt.h" | |||||
| #define DECLARE_CQT_CALC(type) \ | |||||
| void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \ | |||||
| const Coeffs *coeffs, int len, int fft_len) | |||||
| DECLARE_CQT_CALC(sse); | |||||
| DECLARE_CQT_CALC(sse3); | |||||
| DECLARE_CQT_CALC(avx); | |||||
| DECLARE_CQT_CALC(fma3); | |||||
| DECLARE_CQT_CALC(fma4); | |||||
| #define permute_coeffs_0 NULL | |||||
/**
 * Reorder coefficients in place for the 8-float-wide kernels.
 *
 * Within every complete group of 8 floats, elements 2,3 are exchanged with
 * elements 4,5, giving the order 0,1,4,5,2,3,6,7.
 *
 * @param v   coefficient array, modified in place
 * @param len number of floats in v; a trailing partial group (len not a
 *            multiple of 8) is left untouched so that nothing outside
 *            v[0..len-1] is accessed
 */
static void permute_coeffs_01452367(float *v, int len)
{
    int k;

    /* "len - k >= 8" cannot overflow and stops before a partial group. */
    for (k = 0; len - k >= 8; k += 8) {
        float *group = v + k;
        float tmp;

        tmp      = group[2];
        group[2] = group[4];
        group[4] = tmp;

        tmp      = group[3];
        group[3] = group[5];
        group[5] = tmp;
    }
}
| av_cold void ff_showcqt_init_x86(ShowCQTContext *s) | |||||
| { | |||||
| int cpuflags = av_get_cpu_flags(); | |||||
| #define SELECT_CQT_CALC(type, TYPE, align, perm) \ | |||||
| if (EXTERNAL_##TYPE(cpuflags)) { \ | |||||
| s->cqt_calc = ff_showcqt_cqt_calc_##type; \ | |||||
| s->cqt_align = align; \ | |||||
| s->permute_coeffs = permute_coeffs_##perm; \ | |||||
| } | |||||
| SELECT_CQT_CALC(sse, SSE, 4, 0); | |||||
| SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0); | |||||
| SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm | |||||
| SELECT_CQT_CALC(avx, AVX_FAST, 8, 01452367); | |||||
| SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367); | |||||
| } | |||||