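~25% faster DTS decoding overall. The checkasm CPU cycle counts are
not that useful on their own, since synth_filter_float() calls FFTContext.imdct_half() and the measured time therefore includes whichever imdct_half implementation is selected.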
cortex-a57 cortex-a53
synth_filter_float_c: 1866.2 3490.9
synth_filter_float_neon: 915.0 1531.5
With fftc.imdct_half forced to imdct_half_neon:
cortex-a57 cortex-a53
synth_filter_float_c: 1718.4 3025.3
synth_filter_float_neon: 926.2 1530.1
tags/n3.0
| @@ -16,7 +16,8 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o | |||||
| ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o | ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o | ||||
| NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o | |||||
| NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \ | |||||
| aarch64/synth_filter_neon.o | |||||
| NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o | NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o | ||||
| NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o | NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o | ||||
| NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ | NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ | ||||
| @@ -27,4 +27,7 @@ | |||||
| #define CELT_TMP 0x10 | #define CELT_TMP 0x10 | ||||
| #define CELT_TWIDDLE (CELT_TMP + 0x8) // loaded as pair | #define CELT_TWIDDLE (CELT_TMP + 0x8) // loaded as pair | ||||
| /* FFTContext */ | |||||
| #define IMDCT_HALF 0x48 | |||||
| #endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */ | #endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */ | ||||
| @@ -22,7 +22,15 @@ | |||||
| #include "libavutil/aarch64/cpu.h" | #include "libavutil/aarch64/cpu.h" | ||||
| #include "libavutil/attributes.h" | #include "libavutil/attributes.h" | ||||
| #include "libavutil/internal.h" | |||||
| #include "libavcodec/dcadsp.h" | #include "libavcodec/dcadsp.h" | ||||
| #include "libavcodec/fft.h" | |||||
| #include "asm-offsets.h" | |||||
| #if HAVE_NEON || HAVE_VFP | |||||
| AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); | |||||
| #endif | |||||
| void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); | void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); | ||||
| void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); | void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); | ||||
| @@ -49,3 +57,11 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) | |||||
| s->decode_hf = ff_decode_hf_neon; | s->decode_hf = ff_decode_hf_neon; | ||||
| } | } | ||||
| } | } | ||||
/*
 * AArch64 init hook for SynthFilterContext: reads the CPU flags via
 * av_get_cpu_flags() and, when have_neon() accepts them, points
 * s->synth_filter_float at ff_synth_filter_float_neon.
 * If NEON is not reported, the context is left unmodified.
 */
| av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) | |||||
| { | |||||
| int cpu_flags = av_get_cpu_flags(); | |||||
| if (have_neon(cpu_flags)) | |||||
| s->synth_filter_float = ff_synth_filter_float_neon; | |||||
| } | |||||
| @@ -0,0 +1,119 @@ | |||||
| /* | |||||
| * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> | |||||
| * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> | |||||
| * | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "asm-offsets.h" | |||||
| #include "libavutil/aarch64/asm.S" | |||||
// inner_loop: one multiply-accumulate step over four 4-float vector pairs.
//   Loads v28..v31 from [x8]..[x11] and v24..v27 from [x4]..[x7]; every
//   pointer is post-incremented by x15 bytes.
//   v28 and v31 get their four lanes fully reversed (rev64 swaps the lanes
//   inside each 64-bit half, ext #8 then swaps the two halves).
//   Accumulators updated:
//     v4 -= v24 * reversed(v28)      v5 += v25 * v29
//     v6 += v26 * v30                v7 += v27 * reversed(v31)
//   The macro contains only loads, permutes and FP multiply-accumulates, so
//   condition flags set by the caller before invoking it are still valid
//   for a branch placed after it.
| .macro inner_loop | |||||
| ld1 {v29.4s}, [x9], x15 | |||||
| ld1 {v28.4s}, [x8], x15 | |||||
| ld1 {v30.4s}, [x10], x15 | |||||
| ld1 {v31.4s}, [x11], x15 | |||||
| rev64 v28.4s, v28.4s | |||||
| ld1 {v24.4s}, [x4], x15 | |||||
| ld1 {v25.4s}, [x5], x15 | |||||
| rev64 v31.4s, v31.4s | |||||
| ld1 {v26.4s}, [x6], x15 | |||||
| fmla v5.4s, v25.4s, v29.4s | |||||
| ld1 {v27.4s}, [x7], x15 | |||||
| ext v28.16b, v28.16b, v28.16b, #8 | |||||
| ext v31.16b, v31.16b, v31.16b, #8 | |||||
| fmla v6.4s, v26.4s, v30.4s | |||||
| fmls v4.4s, v24.4s, v28.4s | |||||
| fmla v7.4s, v27.4s, v31.4s | |||||
| .endm | |||||
// ff_synth_filter_float_neon
// Register use on entry, as visible in the code below:
//   x0 = context holding the imdct_half pointer at offset IMDCT_HALF
//   x1 = synth_buf, x2 = pointer to synth_buf_offset, x3 = synth_buf_2,
//   x4 = window, x5 = out, x6 = in, s0 = factor multiplied into the output.
// Steps:
//   1. Build a 64-byte stack frame: [sp] = x3,x4; [sp+16] = x5 and the
//      offset-adjusted synth_buf; [sp+32] = (offset & ~63) and x30;
//      [sp+48] = s0.
//   2. Store (offset - 32) & 511 back through x2, then call the imdct_half
//      pointer with x1 = synth_buf + offset and x2 = in.
//   3. Run 4 outer passes (x1 counts down from 4). Each pass accumulates
//      a/b/c/d with inner_loop, writes c and d to synth_buf_2 and
//      synth_buf_2 + 16, and writes a * s0 and b * s0 to out and out + 16.
//   The accumulation is split in two loops: label 2 runs while the counter
//   x12 is greater than (offset & ~63); the synth_buf pointers are then
//   moved back 512 floats and label 3 runs the counter down to zero
//   (presumably the wrap-around of a 512-float ring buffer -- confirm
//   against the C reference implementation).
| function ff_synth_filter_float_neon, export=1 | |||||
| ldr w7, [x2] // w7 = *synth_buf_offset | |||||
| ldr x9, [x0, #IMDCT_HALF] // x9 = imdct_half function pointer read from the context in x0 | |||||
| sxtw x7, w7 | |||||
| stp x3, x4, [sp, #-64]! | |||||
| add x1, x1, x7, lsl #2 // x1 = synth_buf + offset (in floats) | |||||
| sub w8, w7, #32 | |||||
| stp x5, x1, [sp, #16] | |||||
| bic x7, x7, #63 | |||||
| and w8, w8, #511 | |||||
| stp x7, x30, [sp, #32] | |||||
| str w8, [x2] | |||||
| str s0, [sp, #48] | |||||
| mov x2, x6 // third call argument = in | |||||
| blr x9 | |||||
| ldp x2, x4, [sp] // synth_buf_2, window | |||||
| ldp x13, x9, [sp, #16] // out, synth_buf (already advanced by offset) | |||||
| ldp x0, x30, [sp, #32] // x0 = offset & ~63, x30 = saved return address | |||||
| ldr s0, [sp, #48] | |||||
| add x3, x2, #16*4 // synth_buf_2 + 16 | |||||
| add x14, x13, #16*4 // out + 16 | |||||
| add x8, x9, #12*4 | |||||
| mov x15, #64*4 | |||||
| mov x1, #4 | |||||
| 1: | |||||
| add x10, x9, #16*4 // x10 = x9 + 16 floats (synth_buf) | |||||
| add x11, x8, #16*4 | |||||
| add x5, x4, #16*4 // x5 = x4 + 16 floats (window) | |||||
| add x6, x4, #32*4 | |||||
| add x7, x4, #48*4 | |||||
| ld1 {v4.4s}, [x2] // a: seeded from 4 floats at x2 (synth_buf_2) | |||||
| ld1 {v5.4s}, [x3] // b: seeded from 4 floats at x3 (synth_buf_2 + 16) | |||||
| movi v6.4s, #0 // c = 0 | |||||
| movi v7.4s, #0 // d = 0 | |||||
| mov x12, #512 | |||||
| 2: | |||||
| sub x12, x12, #64 | |||||
| cmp x12, x0 | |||||
| inner_loop | |||||
| b.gt 2b | |||||
| sub x8, x8, #512*4 | |||||
| sub x9, x9, #512*4 | |||||
| cbz x12, 4f | |||||
| sub x10, x10, #512*4 | |||||
| sub x11, x11, #512*4 | |||||
| 3: | |||||
| subs x12, x12, #64 | |||||
| inner_loop | |||||
| b.gt 3b | |||||
| 4: | |||||
| subs x1, x1, #1 | |||||
| fmul v4.4s, v4.4s, v0.s[0] | |||||
| fmul v5.4s, v5.4s, v0.s[0] | |||||
| st1 {v6.4s}, [x2], #16 | |||||
| st1 {v7.4s}, [x3], #16 | |||||
| st1 {v4.4s}, [x13], #16 | |||||
| st1 {v5.4s}, [x14], #16 | |||||
| b.le 10f | |||||
| sub x4, x4, #508*4 // window: back 508 floats (inner loops post-incremented it 64 floats per iteration) | |||||
| add x9, x9, #4*4 // synth_buf: forward 4 floats | |||||
| sub x8, x8, #4*4 // synth_buf (pointer set up as x9 + 12 floats): back 4 floats | |||||
| b 1b | |||||
| 10: | |||||
| add sp, sp, #64 | |||||
| ret | |||||
| endfunc | |||||
| @@ -60,6 +60,10 @@ av_cold void ff_synth_filter_init(SynthFilterContext *c) | |||||
| { | { | ||||
| c->synth_filter_float = synth_filter_float; | c->synth_filter_float = synth_filter_float; | ||||
| if (ARCH_ARM) ff_synth_filter_init_arm(c); | |||||
| if (ARCH_X86) ff_synth_filter_init_x86(c); | |||||
| if (ARCH_AARCH64) | |||||
| ff_synth_filter_init_aarch64(c); | |||||
| if (ARCH_ARM) | |||||
| ff_synth_filter_init_arm(c); | |||||
| if (ARCH_X86) | |||||
| ff_synth_filter_init_x86(c); | |||||
| } | } | ||||
| @@ -32,6 +32,7 @@ typedef struct SynthFilterContext { | |||||
| } SynthFilterContext; | } SynthFilterContext; | ||||
| void ff_synth_filter_init(SynthFilterContext *c); | void ff_synth_filter_init(SynthFilterContext *c); | ||||
| void ff_synth_filter_init_aarch64(SynthFilterContext *c); | |||||
| void ff_synth_filter_init_arm(SynthFilterContext *c); | void ff_synth_filter_init_arm(SynthFilterContext *c); | ||||
| void ff_synth_filter_init_x86(SynthFilterContext *c); | void ff_synth_filter_init_x86(SynthFilterContext *c); | ||||