int8x8_fmul_int32 is currently declared as a macro that is set to one of several inlinable functions, among which are a Neon implementation and a default C implementation. Add a DSP parameter to each inline function; it is unused except by the default C implementation, which calls a function from the DSP context. On an Arrandale CPU, gain for an inlined SSE2 function vs. a call: - Win32: 29 to 26 cycles - Win64: 25 to 23 cycles Signed-off-by: Janne Grunau <janne-libav@jannau.net> tags/n2.2-rc1
| @@ -83,7 +83,8 @@ static inline int decode_blockcodes(int code1, int code2, int levels, | |||||
| #if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y | #if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y | ||||
| #define int8x8_fmul_int32 int8x8_fmul_int32 | #define int8x8_fmul_int32 int8x8_fmul_int32 | ||||
| static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale) | |||||
| static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, | |||||
| float *dst, const int8_t *src, int scale) | |||||
| { | { | ||||
| __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" | __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" | ||||
| "vld1.8 {d0}, [%1,:64] \n" | "vld1.8 {d0}, [%1,:64] \n" | ||||
| @@ -1086,12 +1086,10 @@ static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 }; | |||||
| static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; | static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; | ||||
| #ifndef int8x8_fmul_int32 | #ifndef int8x8_fmul_int32 | ||||
| static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale) | |||||
| static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst, | |||||
| const int8_t *src, int scale) | |||||
| { | { | ||||
| float fscale = scale / 16.0; | |||||
| int i; | |||||
| for (i = 0; i < 8; i++) | |||||
| dst[i] = src[i] * fscale; | |||||
| dsp->int8x8_fmul_int32(dst, src, scale); | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -1219,7 +1217,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) | |||||
| s->debug_flag |= 0x01; | s->debug_flag |= 0x01; | ||||
| } | } | ||||
| int8x8_fmul_int32(subband_samples[k][l], | |||||
| int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l], | |||||
| &high_freq_vq[hfvq][subsubframe * 8], | &high_freq_vq[hfvq][subsubframe * 8], | ||||
| s->scale_factor[k][l][0]); | s->scale_factor[k][l][0]); | ||||
| } | } | ||||
| @@ -24,6 +24,14 @@ | |||||
| #include "libavutil/intreadwrite.h" | #include "libavutil/intreadwrite.h" | ||||
| #include "dcadsp.h" | #include "dcadsp.h" | ||||
| static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale) | |||||
| { | |||||
| float fscale = scale / 16.0; | |||||
| int i; | |||||
| for (i = 0; i < 8; i++) | |||||
| dst[i] = src[i] * fscale; | |||||
| } | |||||
| static void dca_lfe_fir_c(float *out, const float *in, const float *coefs, | static void dca_lfe_fir_c(float *out, const float *in, const float *coefs, | ||||
| int decifactor, float scale) | int decifactor, float scale) | ||||
| { | { | ||||
| @@ -78,5 +86,6 @@ av_cold void ff_dcadsp_init(DCADSPContext *s) | |||||
| { | { | ||||
| s->lfe_fir = dca_lfe_fir_c; | s->lfe_fir = dca_lfe_fir_c; | ||||
| s->qmf_32_subbands = dca_qmf_32_subbands; | s->qmf_32_subbands = dca_qmf_32_subbands; | ||||
| s->int8x8_fmul_int32 = int8x8_fmul_int32_c; | |||||
| if (ARCH_ARM) ff_dcadsp_init_arm(s); | if (ARCH_ARM) ff_dcadsp_init_arm(s); | ||||
| } | } | ||||
| @@ -31,6 +31,7 @@ typedef struct DCADSPContext { | |||||
| int *synth_buf_offset, float synth_buf2[32], | int *synth_buf_offset, float synth_buf2[32], | ||||
| const float window[512], float *samples_out, | const float window[512], float *samples_out, | ||||
| float raXin[32], float scale); | float raXin[32], float scale); | ||||
| void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale); | |||||
| } DCADSPContext; | } DCADSPContext; | ||||
| void ff_dcadsp_init(DCADSPContext *s); | void ff_dcadsp_init(DCADSPContext *s); | ||||