The 32-bit targets have been compiled with -mfpmath=sse for proper reference. sbr_sum_square timings: C/32-bit: 82c (unrolled)/102c; C/64-bit: 69c (unrolled)/82c; SSE/32-bit: 42c; SSE/64-bit: 31c. Using the SSE4.1 dpps instruction to perform the final sum is slower. Not unrolling the loop to perform 8 operations per iteration costs 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> tags/n0.11
@@ -238,4 +238,6 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s) | |||||
if (ARCH_ARM) | if (ARCH_ARM) | ||||
ff_sbrdsp_init_arm(s); | ff_sbrdsp_init_arm(s); | ||||
if (HAVE_MMX) | |||||
ff_sbrdsp_init_x86(s); | |||||
} | } |
@@ -46,5 +46,6 @@ extern const float ff_sbr_noise_table[][2]; | |||||
void ff_sbrdsp_init(SBRDSPContext *s); | void ff_sbrdsp_init(SBRDSPContext *s); | ||||
void ff_sbrdsp_init_arm(SBRDSPContext *s); | void ff_sbrdsp_init_arm(SBRDSPContext *s); | ||||
void ff_sbrdsp_init_x86(SBRDSPContext *s); | |||||
#endif | #endif |
@@ -47,6 +47,8 @@ YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o | |||||
MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o | MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o | ||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | ||||
MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o | MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o | ||||
MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o | |||||
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o | |||||
MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o | MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o | ||||
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o | MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o | ||||
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o | YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o | ||||
@@ -0,0 +1,74 @@ | |||||
;****************************************************************************** | |||||
;* AAC Spectral Band Replication decoding functions | |||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |||||
;* | |||||
;* This file is part of Libav. | |||||
;* | |||||
;* Libav is free software; you can redistribute it and/or | |||||
;* modify it under the terms of the GNU Lesser General Public | |||||
;* License as published by the Free Software Foundation; either | |||||
;* version 2.1 of the License, or (at your option) any later version. | |||||
;* | |||||
;* Libav is distributed in the hope that it will be useful, | |||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
;* Lesser General Public License for more details. | |||||
;* | |||||
;* You should have received a copy of the GNU Lesser General Public | |||||
;* License along with Libav; if not, write to the Free Software | |||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
;****************************************************************************** | |||||
%include "x86inc.asm" | |||||
%include "x86util.asm" | |||||
;SECTION_RODATA | |||||
SECTION .text | |||||
INIT_XMM sse | |||||
;-----------------------------------------------------------------------------
; float ff_sbr_sum_square_sse(float (*x)[2], int n)
;
; Returns the sum of the squares of the 2*n floats starting at x.
;   r0 = x (unaligned loads are used, so no alignment is required)
;   r1 = n (number of float pairs)
; The main loop consumes 8 pairs (16 floats) per iteration; the tail loop
; consumes 2 pairs (4 floats) per iteration, so n is expected to be even
; (a trailing odd pair would not be summed).
; On x86-64 the result is left in the low lane of m0; on x86-32 it is pushed
; onto the x87 stack.
;-----------------------------------------------------------------------------
cglobal sbr_sum_square, 2, 3, 6
    mov        r2d, r1d         ; n is a 32-bit int: a 32-bit move zero-extends
                                ; on x86-64 instead of copying undefined upper bits
    xorps       m0, m0          ; accumulator A
    xorps       m1, m1          ; accumulator B (independent dependency chain)
    sar         r2, 3           ; r2 = number of 8-pair iterations
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7           ; pairs left over after the unrolled loop
    sar         r1, 1           ; two pairs (one XMM register) per tail iteration
    jz          .end
; n is a multiple of 2, so each remaining iteration has a full 4 floats to load
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2          ; addps leaves the flags from dec untouched
    jnz         .endloop
.end:
    addps       m0, m1          ; merge the two accumulators
    movhlps     m2, m0          ; m2[0..1] = m0[2..3]
    addps       m0, m2          ; m0[0] = s0 + s2, m0[1] = s1 + s3
    movss       m1, m0
    shufps      m0, m0, 1       ; bring lane 1 down to lane 0
    addss       m0, m1          ; m0[0] = total
%if ARCH_X86_64 == 0
    movss       r0m, m0         ; movss rather than movd: movd to/from XMM is SSE2,
                                ; and this function must run on SSE1-only CPUs
    fld   dword r0m             ; 32-bit ABI returns float on the x87 stack
%endif
    RET
@@ -0,0 +1,37 @@ | |||||
/* | |||||
* AAC Spectral Band Replication decoding functions | |||||
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |||||
* | |||||
* This file is part of Libav. | |||||
* | |||||
* Libav is free software; you can redistribute it and/or | |||||
* modify it under the terms of the GNU Lesser General Public | |||||
* License as published by the Free Software Foundation; either | |||||
* version 2.1 of the License, or (at your option) any later version. | |||||
* | |||||
* Libav is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
* Lesser General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU Lesser General Public | |||||
* License along with Libav; if not, write to the Free Software | |||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
*/ | |||||
#include "config.h" | |||||
#include "libavutil/cpu.h" | |||||
#include "libavcodec/sbrdsp.h" | |||||
float ff_sbr_sum_square_sse(float (*x)[2], int n); | |||||
void ff_sbrdsp_init_x86(SBRDSPContext *s) | |||||
{ | |||||
if (HAVE_YASM) { | |||||
int mm_flags = av_get_cpu_flags(); | |||||
if (mm_flags & AV_CPU_FLAG_SSE) { | |||||
s->sum_square = ff_sbr_sum_square_sse; | |||||
} | |||||
} | |||||
} |