The 32-bit targets were compiled with -mfpmath=sse so that the C numbers are a fair reference.
sbr_sum_square C /32bits: 82c (unrolled)/102c
C /64bits: 69c (unrolled)/82c
SSE/32bits: 42c
SSE/64bits: 31c
Use of SSE4.1 dpps to perform the final sum is slower.
Without unrolling the main loop to perform 8 operations per iteration, the function takes about 10 more cycles.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
tags/n0.11
| @@ -238,4 +238,6 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s) | |||||
| if (ARCH_ARM) | if (ARCH_ARM) | ||||
| ff_sbrdsp_init_arm(s); | ff_sbrdsp_init_arm(s); | ||||
| if (HAVE_MMX) | |||||
| ff_sbrdsp_init_x86(s); | |||||
| } | } | ||||
| @@ -46,5 +46,6 @@ extern const float ff_sbr_noise_table[][2]; | |||||
| void ff_sbrdsp_init(SBRDSPContext *s); | void ff_sbrdsp_init(SBRDSPContext *s); | ||||
| void ff_sbrdsp_init_arm(SBRDSPContext *s); | void ff_sbrdsp_init_arm(SBRDSPContext *s); | ||||
| void ff_sbrdsp_init_x86(SBRDSPContext *s); | |||||
| #endif | #endif | ||||
| @@ -47,6 +47,8 @@ YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o | |||||
| MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o | MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o | ||||
| YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | ||||
| MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o | MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o | ||||
| MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o | |||||
| YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o | |||||
| MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o | MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o | ||||
| MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o | MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o | ||||
| YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o | YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o | ||||
| @@ -0,0 +1,74 @@ | |||||
| ;****************************************************************************** | |||||
| ;* AAC Spectral Band Replication decoding functions | |||||
| ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |||||
| ;* | |||||
| ;* This file is part of Libav. | |||||
| ;* | |||||
| ;* Libav is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* Libav is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with Libav; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "x86inc.asm" | |||||
| %include "x86util.asm" | |||||
| ;SECTION_RODATA | |||||
| SECTION .text | |||||
| INIT_XMM sse | |||||
;-----------------------------------------------------------------------------
; float ff_sbr_sum_square_sse(float (*x)[2], int n)
;
; Returns the sum of the squares of the 2*n floats stored at x.
;
; With the x86inc cglobal conventions, r0 holds x and r1 holds n on entry;
; r2 is a scratch counter and m0/m1 are two independent accumulators.
;
; The main loop consumes 64 bytes (8 float pairs) per iteration, the tail
; loop 16 bytes (2 float pairs) per iteration. n is expected to be even:
; an odd n would leave the last pair out of the sum.
;
; The result is returned in the low lane of m0 on x86-64 and on the x87
; stack (st0) on x86-32.
;-----------------------------------------------------------------------------
cglobal sbr_sum_square, 2, 3, 6
    ; n is a 32-bit int, so on x86-64 the upper half of r1 is undefined.
    ; A 32-bit move zero-extends and keeps stale bits out of the loop count.
    mov        r2d, r1d
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3               ; r2 = number of 8-pair iterations
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7               ; pairs left over; also clears the upper bits of r1
    sar         r1, 1               ; r1 = number of 2-pair (4-float) iterations
    jz          .end
; n is a multiple of 2, so each remaining iteration has 4 floats to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1                  ; sets ZF for the jnz below; addps leaves flags alone
    addps       m0, m2
    jnz         .endloop
.end:
    ; horizontal sum of the two accumulators into the low lane of m0
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    ; x86-32 returns floats in st0: bounce the value through the stack slot
    ; of the first argument. movss is used because storing an XMM register
    ; with movd is an SSE2 instruction and this function only requires SSE.
    movss       r0m, m0
    fld         dword r0m
%endif
    RET
| @@ -0,0 +1,37 @@ | |||||
| /* | |||||
| * AAC Spectral Band Replication decoding functions | |||||
| * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |||||
| * | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "config.h" | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavcodec/sbrdsp.h" | |||||
| float ff_sbr_sum_square_sse(float (*x)[2], int n); | |||||
| void ff_sbrdsp_init_x86(SBRDSPContext *s) | |||||
| { | |||||
| if (HAVE_YASM) { | |||||
| int mm_flags = av_get_cpu_flags(); | |||||
| if (mm_flags & AV_CPU_FLAG_SSE) { | |||||
| s->sum_square = ff_sbr_sum_square_sse; | |||||
| } | |||||
| } | |||||
| } | |||||