DSPContext.vector_fmul_window(), DCADSPContext.lfe_fir(), SynthFilterContext.synth_filter_float(). Signed-off-by: Mans Rullgard <mans@mansr.com> (tags/n0.8)
| @@ -1721,19 +1721,19 @@ static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce) | |||||
| */ | */ | ||||
| if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) && | if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) && | ||||
| (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) { | (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) { | ||||
| ac->dsp.vector_fmul_window( out, saved, buf, lwindow_prev, 0, 512); | |||||
| ac->dsp.vector_fmul_window( out, saved, buf, lwindow_prev, 512); | |||||
| } else { | } else { | ||||
| memcpy( out, saved, 448 * sizeof(float)); | memcpy( out, saved, 448 * sizeof(float)); | ||||
| if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { | if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { | ||||
| ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448, buf + 0*128, swindow_prev, 0, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(temp, buf + 3*128 + 64, buf + 4*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448, buf + 0*128, swindow_prev, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow, 64); | |||||
| ac->dsp.vector_fmul_window(temp, buf + 3*128 + 64, buf + 4*128, swindow, 64); | |||||
| memcpy( out + 448 + 4*128, temp, 64 * sizeof(float)); | memcpy( out + 448 + 4*128, temp, 64 * sizeof(float)); | ||||
| } else { | } else { | ||||
| ac->dsp.vector_fmul_window(out + 448, saved + 448, buf, swindow_prev, 0, 64); | |||||
| ac->dsp.vector_fmul_window(out + 448, saved + 448, buf, swindow_prev, 64); | |||||
| memcpy( out + 576, buf + 64, 448 * sizeof(float)); | memcpy( out + 576, buf + 64, 448 * sizeof(float)); | ||||
| } | } | ||||
| } | } | ||||
| @@ -1741,9 +1741,9 @@ static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce) | |||||
| // buffer update | // buffer update | ||||
| if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { | if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) { | ||||
| memcpy( saved, temp + 64, 64 * sizeof(float)); | memcpy( saved, temp + 64, 64 * sizeof(float)); | ||||
| ac->dsp.vector_fmul_window(saved + 64, buf + 4*128 + 64, buf + 5*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 0, 64); | |||||
| ac->dsp.vector_fmul_window(saved + 64, buf + 4*128 + 64, buf + 5*128, swindow, 64); | |||||
| ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64); | |||||
| ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64); | |||||
| memcpy( saved + 448, buf + 7*128 + 64, 64 * sizeof(float)); | memcpy( saved + 448, buf + 7*128 + 64, 64 * sizeof(float)); | ||||
| } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { | } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { | ||||
| memcpy( saved, buf + 512, 448 * sizeof(float)); | memcpy( saved, buf + 512, 448 * sizeof(float)); | ||||
| @@ -628,13 +628,13 @@ static inline void do_imdct(AC3DecodeContext *s, int channels) | |||||
| for(i=0; i<128; i++) | for(i=0; i<128; i++) | ||||
| x[i] = s->transform_coeffs[ch][2*i]; | x[i] = s->transform_coeffs[ch][2*i]; | ||||
| ff_imdct_half(&s->imdct_256, s->tmp_output, x); | ff_imdct_half(&s->imdct_256, s->tmp_output, x); | ||||
| s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128); | |||||
| s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128); | |||||
| for(i=0; i<128; i++) | for(i=0; i<128; i++) | ||||
| x[i] = s->transform_coeffs[ch][2*i+1]; | x[i] = s->transform_coeffs[ch][2*i+1]; | ||||
| ff_imdct_half(&s->imdct_256, s->delay[ch-1], x); | ff_imdct_half(&s->imdct_256, s->delay[ch-1], x); | ||||
| } else { | } else { | ||||
| ff_imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]); | ff_imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]); | ||||
| s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128); | |||||
| s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128); | |||||
| memcpy(s->delay[ch-1], s->tmp_output+128, 128*sizeof(float)); | memcpy(s->delay[ch-1], s->tmp_output+128, 128*sizeof(float)); | ||||
| } | } | ||||
| } | } | ||||
| @@ -23,7 +23,7 @@ | |||||
| #include "libavcodec/dcadsp.h" | #include "libavcodec/dcadsp.h" | ||||
| void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, | void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, | ||||
| int decifactor, float scale, float bias); | |||||
| int decifactor, float scale); | |||||
| void av_cold ff_dcadsp_init_arm(DCADSPContext *s) | void av_cold ff_dcadsp_init_arm(DCADSPContext *s) | ||||
| { | { | ||||
| @@ -29,7 +29,7 @@ function ff_dca_lfe_fir_neon, export=1 | |||||
| cmp r3, #32 | cmp r3, #32 | ||||
| moveq r6, #256/32 | moveq r6, #256/32 | ||||
| movne r6, #256/64 | movne r6, #256/64 | ||||
| NOVFP vldr d0, [sp, #16] @ scale, bias | |||||
| NOVFP vldr s0, [sp, #16] @ scale | |||||
| mov lr, #-16 | mov lr, #-16 | ||||
| 1: | 1: | ||||
| vmov.f32 q2, #0.0 @ v0 | vmov.f32 q2, #0.0 @ v0 | ||||
| @@ -51,8 +51,7 @@ NOVFP vldr d0, [sp, #16] @ scale, bias | |||||
| vadd.f32 d4, d4, d5 | vadd.f32 d4, d4, d5 | ||||
| vadd.f32 d6, d6, d7 | vadd.f32 d6, d6, d7 | ||||
| vpadd.f32 d4, d4, d6 | vpadd.f32 d4, d4, d6 | ||||
| vdup.32 d5, d0[1] | |||||
| vmla.f32 d5, d4, d0[0] | |||||
| vmul.f32 d5, d4, d0[0] | |||||
| vst1.32 {d5[0]}, [r0,:32]! | vst1.32 {d5[0]}, [r0,:32]! | ||||
| vst1.32 {d5[1]}, [r4,:32]! | vst1.32 {d5[1]}, [r4,:32]! | ||||
| bne 1b | bne 1b | ||||
| @@ -140,8 +140,7 @@ void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); | |||||
| void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); | void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); | ||||
| void ff_vector_fmul_window_neon(float *dst, const float *src0, | void ff_vector_fmul_window_neon(float *dst, const float *src0, | ||||
| const float *src1, const float *win, | |||||
| float add_bias, int len); | |||||
| const float *src1, const float *win, int len); | |||||
| void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, | void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, | ||||
| int len); | int len); | ||||
| void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, | void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, | ||||
| @@ -777,11 +777,8 @@ function ff_vector_fmul_neon, export=1 | |||||
| endfunc | endfunc | ||||
| function ff_vector_fmul_window_neon, export=1 | function ff_vector_fmul_window_neon, export=1 | ||||
| VFP vdup.32 q8, d0[0] | |||||
| NOVFP vld1.32 {d16[],d17[]}, [sp,:32] | |||||
| push {r4,r5,lr} | push {r4,r5,lr} | ||||
| VFP ldr lr, [sp, #12] | |||||
| NOVFP ldr lr, [sp, #16] | |||||
| ldr lr, [sp, #12] | |||||
| sub r2, r2, #8 | sub r2, r2, #8 | ||||
| sub r5, lr, #2 | sub r5, lr, #2 | ||||
| add r2, r2, r5, lsl #2 | add r2, r2, r5, lsl #2 | ||||
| @@ -793,14 +790,12 @@ NOVFP ldr lr, [sp, #16] | |||||
| vld1.64 {d4,d5}, [r3,:128]! | vld1.64 {d4,d5}, [r3,:128]! | ||||
| vld1.64 {d6,d7}, [r4,:128], r5 | vld1.64 {d6,d7}, [r4,:128], r5 | ||||
| 1: subs lr, lr, #4 | 1: subs lr, lr, #4 | ||||
| vmov q11, q8 | |||||
| vmla.f32 d22, d0, d4 | |||||
| vmov q10, q8 | |||||
| vmla.f32 d23, d1, d5 | |||||
| vmul.f32 d22, d0, d4 | |||||
| vrev64.32 q3, q3 | vrev64.32 q3, q3 | ||||
| vmla.f32 d20, d0, d7 | |||||
| vmul.f32 d23, d1, d5 | |||||
| vrev64.32 q1, q1 | vrev64.32 q1, q1 | ||||
| vmla.f32 d21, d1, d6 | |||||
| vmul.f32 d20, d0, d7 | |||||
| vmul.f32 d21, d1, d6 | |||||
| beq 2f | beq 2f | ||||
| vmla.f32 d22, d3, d7 | vmla.f32 d22, d3, d7 | ||||
| vld1.64 {d0,d1}, [r1,:128]! | vld1.64 {d0,d1}, [r1,:128]! | ||||
| @@ -34,7 +34,7 @@ void ff_synth_filter_float_neon(FFTContext *imdct, | |||||
| float *synth_buf_ptr, int *synth_buf_offset, | float *synth_buf_ptr, int *synth_buf_offset, | ||||
| float synth_buf2[32], const float window[512], | float synth_buf2[32], const float window[512], | ||||
| float out[32], const float in[32], | float out[32], const float in[32], | ||||
| float scale, float bias); | |||||
| float scale); | |||||
| av_cold void ff_fft_init_arm(FFTContext *s) | av_cold void ff_fft_init_arm(FFTContext *s) | ||||
| { | { | ||||
| @@ -42,7 +42,7 @@ VFP vpop {d0} | |||||
| ldr r5, [sp, #9*4] @ window | ldr r5, [sp, #9*4] @ window | ||||
| ldr r2, [sp, #10*4] @ out | ldr r2, [sp, #10*4] @ out | ||||
| NOVFP vldr d0, [sp, #12*4] @ scale, bias | |||||
| NOVFP vldr s0, [sp, #12*4] @ scale | |||||
| add r8, r9, #12*4 | add r8, r9, #12*4 | ||||
| mov lr, #64*4 | mov lr, #64*4 | ||||
| @@ -90,10 +90,8 @@ NOVFP vldr d0, [sp, #12*4] @ scale, bias | |||||
| sub r11, r11, #512*4 | sub r11, r11, #512*4 | ||||
| b 2b | b 2b | ||||
| 3: | 3: | ||||
| vdup.32 q8, d0[1] | |||||
| vdup.32 q9, d0[1] | |||||
| vmla.f32 q8, q10, d0[0] | |||||
| vmla.f32 q9, q1, d0[0] | |||||
| vmul.f32 q8, q10, d0[0] | |||||
| vmul.f32 q9, q1, d0[0] | |||||
| vst1.32 {q3}, [r3,:128] | vst1.32 {q3}, [r3,:128] | ||||
| sub r3, r3, #16*4 | sub r3, r3, #16*4 | ||||
| vst1.32 {q2}, [r3,:128] | vst1.32 {q2}, [r3,:128] | ||||
| @@ -141,7 +141,7 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q) | |||||
| /* overlap and window */ | /* overlap and window */ | ||||
| q->dsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf, | q->dsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf, | ||||
| &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 0, 16); | |||||
| &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16); | |||||
| prev_buf = &su->spectrum[0][ref_pos+start_pos + 16]; | prev_buf = &su->spectrum[0][ref_pos+start_pos + 16]; | ||||
| start_pos += block_size; | start_pos += block_size; | ||||
| @@ -896,7 +896,7 @@ static void qmf_32_subbands(DCAContext * s, int chans, | |||||
| s->synth.synth_filter_float(&s->imdct, | s->synth.synth_filter_float(&s->imdct, | ||||
| s->subband_fir_hist[chans], &s->hist_index[chans], | s->subband_fir_hist[chans], &s->hist_index[chans], | ||||
| s->subband_fir_noidea[chans], prCoeff, | s->subband_fir_noidea[chans], prCoeff, | ||||
| samples_out, s->raXin, scale, 0); | |||||
| samples_out, s->raXin, scale); | |||||
| samples_out+= 32; | samples_out+= 32; | ||||
| } | } | ||||
| @@ -929,7 +929,7 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select, | |||||
| /* Interpolation */ | /* Interpolation */ | ||||
| for (deciindex = 0; deciindex < num_deci_sample; deciindex++) { | for (deciindex = 0; deciindex < num_deci_sample; deciindex++) { | ||||
| s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, | s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, | ||||
| scale, 0); | |||||
| scale); | |||||
| samples_in++; | samples_in++; | ||||
| samples_out += 2 * decifactor; | samples_out += 2 * decifactor; | ||||
| } | } | ||||
| @@ -23,7 +23,7 @@ | |||||
| #include "dcadsp.h" | #include "dcadsp.h" | ||||
| static void dca_lfe_fir_c(float *out, const float *in, const float *coefs, | static void dca_lfe_fir_c(float *out, const float *in, const float *coefs, | ||||
| int decifactor, float scale, float bias) | |||||
| int decifactor, float scale) | |||||
| { | { | ||||
| float *out2 = out + decifactor; | float *out2 = out + decifactor; | ||||
| const float *cf0 = coefs; | const float *cf0 = coefs; | ||||
| @@ -39,8 +39,8 @@ static void dca_lfe_fir_c(float *out, const float *in, const float *coefs, | |||||
| v0 += s * *cf0++; | v0 += s * *cf0++; | ||||
| v1 += s * *--cf1; | v1 += s * *--cf1; | ||||
| } | } | ||||
| *out++ = (v0 * scale) + bias; | |||||
| *out2++ = (v1 * scale) + bias; | |||||
| *out++ = v0 * scale; | |||||
| *out2++ = v1 * scale; | |||||
| } | } | ||||
| } | } | ||||
| @@ -21,7 +21,7 @@ | |||||
| typedef struct DCADSPContext { | typedef struct DCADSPContext { | ||||
| void (*lfe_fir)(float *out, const float *in, const float *coefs, | void (*lfe_fir)(float *out, const float *in, const float *coefs, | ||||
| int decifactor, float scale, float bias); | |||||
| int decifactor, float scale); | |||||
| } DCADSPContext; | } DCADSPContext; | ||||
| void ff_dcadsp_init(DCADSPContext *s); | void ff_dcadsp_init(DCADSPContext *s); | ||||
| @@ -3776,7 +3776,9 @@ static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, | |||||
| dst[i] = src0[i] * src1[i] + src2[i]; | dst[i] = src0[i] * src1[i] + src2[i]; | ||||
| } | } | ||||
| void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ | |||||
| static void vector_fmul_window_c(float *dst, const float *src0, | |||||
| const float *src1, const float *win, int len) | |||||
| { | |||||
| int i,j; | int i,j; | ||||
| dst += len; | dst += len; | ||||
| win += len; | win += len; | ||||
| @@ -3786,8 +3788,8 @@ void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, c | |||||
| float s1 = src1[j]; | float s1 = src1[j]; | ||||
| float wi = win[i]; | float wi = win[i]; | ||||
| float wj = win[j]; | float wj = win[j]; | ||||
| dst[i] = s0*wj - s1*wi + add_bias; | |||||
| dst[j] = s0*wi + s1*wj + add_bias; | |||||
| dst[i] = s0*wj - s1*wi; | |||||
| dst[j] = s0*wi + s1*wj; | |||||
| } | } | ||||
| } | } | ||||
| @@ -4434,7 +4436,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->vector_fmul = vector_fmul_c; | c->vector_fmul = vector_fmul_c; | ||||
| c->vector_fmul_reverse = vector_fmul_reverse_c; | c->vector_fmul_reverse = vector_fmul_reverse_c; | ||||
| c->vector_fmul_add = vector_fmul_add_c; | c->vector_fmul_add = vector_fmul_add_c; | ||||
| c->vector_fmul_window = ff_vector_fmul_window_c; | |||||
| c->vector_fmul_window = vector_fmul_window_c; | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | ||||
| c->vector_clipf = vector_clipf_c; | c->vector_clipf = vector_clipf_c; | ||||
| c->float_to_int16 = ff_float_to_int16_c; | c->float_to_int16 = ff_float_to_int16_c; | ||||
| @@ -68,9 +68,6 @@ void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | ||||
| void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | ||||
| void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | |||||
| const float *win, float add_bias, int len); | |||||
| /* encoding scans */ | /* encoding scans */ | ||||
| extern const uint8_t ff_alternate_horizontal_scan[64]; | extern const uint8_t ff_alternate_horizontal_scan[64]; | ||||
| extern const uint8_t ff_alternate_vertical_scan[64]; | extern const uint8_t ff_alternate_vertical_scan[64]; | ||||
| @@ -393,7 +390,7 @@ typedef struct DSPContext { | |||||
| /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ | /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ | ||||
| void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); | void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); | ||||
| /* assume len is a multiple of 4, and arrays are 16-byte aligned */ | /* assume len is a multiple of 4, and arrays are 16-byte aligned */ | ||||
| void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); | |||||
| void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); | |||||
| /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | ||||
| void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); | void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); | ||||
| void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); | void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); | ||||
| @@ -90,13 +90,9 @@ static void vector_fmul_add_altivec(float *dst, const float *src0, | |||||
| } | } | ||||
| } | } | ||||
| static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len) | |||||
| static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len) | |||||
| { | { | ||||
| union { | |||||
| vector float v; | |||||
| float s[4]; | |||||
| } vadd; | |||||
| vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj; | |||||
| vector float zero, t0, t1, s0, s1, wi, wj; | |||||
| const vector unsigned char reverse = vcprm(3,2,1,0); | const vector unsigned char reverse = vcprm(3,2,1,0); | ||||
| int i,j; | int i,j; | ||||
| @@ -104,8 +100,6 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa | |||||
| win += len; | win += len; | ||||
| src0+= len; | src0+= len; | ||||
| vadd.s[0] = add_bias; | |||||
| vadd_bias = vec_splat(vadd.v, 0); | |||||
| zero = (vector float)vec_splat_u32(0); | zero = (vector float)vec_splat_u32(0); | ||||
| for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) { | for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) { | ||||
| @@ -117,9 +111,9 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa | |||||
| s1 = vec_perm(s1, s1, reverse); | s1 = vec_perm(s1, s1, reverse); | ||||
| wj = vec_perm(wj, wj, reverse); | wj = vec_perm(wj, wj, reverse); | ||||
| t0 = vec_madd(s0, wj, vadd_bias); | |||||
| t0 = vec_madd(s0, wj, zero); | |||||
| t0 = vec_nmsub(s1, wi, t0); | t0 = vec_nmsub(s1, wi, t0); | ||||
| t1 = vec_madd(s0, wi, vadd_bias); | |||||
| t1 = vec_madd(s0, wi, zero); | |||||
| t1 = vec_madd(s1, wj, t1); | t1 = vec_madd(s1, wj, t1); | ||||
| t1 = vec_perm(t1, t1, reverse); | t1 = vec_perm(t1, t1, reverse); | ||||
| @@ -24,7 +24,7 @@ | |||||
| static void synth_filter_float(FFTContext *imdct, | static void synth_filter_float(FFTContext *imdct, | ||||
| float *synth_buf_ptr, int *synth_buf_offset, | float *synth_buf_ptr, int *synth_buf_offset, | ||||
| float synth_buf2[32], const float window[512], | float synth_buf2[32], const float window[512], | ||||
| float out[32], const float in[32], float scale, float bias) | |||||
| float out[32], const float in[32], float scale) | |||||
| { | { | ||||
| float *synth_buf= synth_buf_ptr + *synth_buf_offset; | float *synth_buf= synth_buf_ptr + *synth_buf_offset; | ||||
| int i, j; | int i, j; | ||||
| @@ -48,8 +48,8 @@ static void synth_filter_float(FFTContext *imdct, | |||||
| c += window[i + j + 32]*( synth_buf[16 + i + j - 512]); | c += window[i + j + 32]*( synth_buf[16 + i + j - 512]); | ||||
| d += window[i + j + 48]*( synth_buf[31 - i + j - 512]); | d += window[i + j + 48]*( synth_buf[31 - i + j - 512]); | ||||
| } | } | ||||
| out[i ] = a*scale + bias; | |||||
| out[i + 16] = b*scale + bias; | |||||
| out[i ] = a*scale; | |||||
| out[i + 16] = b*scale; | |||||
| synth_buf2[i ] = c; | synth_buf2[i ] = c; | ||||
| synth_buf2[i + 16] = d; | synth_buf2[i + 16] = d; | ||||
| } | } | ||||
| @@ -28,7 +28,7 @@ typedef struct SynthFilterContext { | |||||
| float *synth_buf_ptr, int *synth_buf_offset, | float *synth_buf_ptr, int *synth_buf_offset, | ||||
| float synth_buf2[32], const float window[512], | float synth_buf2[32], const float window[512], | ||||
| float out[32], const float in[32], | float out[32], const float in[32], | ||||
| float scale, float bias); | |||||
| float scale); | |||||
| } SynthFilterContext; | } SynthFilterContext; | ||||
| void ff_synth_filter_init(SynthFilterContext *c); | void ff_synth_filter_init(SynthFilterContext *c); | ||||
| @@ -646,7 +646,6 @@ static void imdct_and_window(TwinContext *tctx, enum FrameType ftype, int wtype, | |||||
| prev_buf + (bsize-wsize)/2, | prev_buf + (bsize-wsize)/2, | ||||
| buf1 + bsize*j, | buf1 + bsize*j, | ||||
| ff_sine_windows[av_log2(wsize)], | ff_sine_windows[av_log2(wsize)], | ||||
| 0.0, | |||||
| wsize/2); | wsize/2); | ||||
| out2 += wsize; | out2 += wsize; | ||||
| @@ -1575,13 +1575,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) | |||||
| const float *win = vc->win[blockflag & previous_window]; | const float *win = vc->win[blockflag & previous_window]; | ||||
| if (blockflag == previous_window) { | if (blockflag == previous_window) { | ||||
| vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, blocksize / 4); | |||||
| vc->dsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4); | |||||
| } else if (blockflag > previous_window) { | } else if (blockflag > previous_window) { | ||||
| vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, bs0 / 4); | |||||
| vc->dsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4); | |||||
| memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float)); | memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float)); | ||||
| } else { | } else { | ||||
| memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float)); | memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float)); | ||||
| vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, 0, bs0 / 4); | |||||
| vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4); | |||||
| } | } | ||||
| memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float)); | memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float)); | ||||
| } | } | ||||
| @@ -1031,7 +1031,7 @@ static void wmapro_window(WMAProDecodeCtx *s) | |||||
| winlen >>= 1; | winlen >>= 1; | ||||
| s->dsp.vector_fmul_window(start, start, start + winlen, | s->dsp.vector_fmul_window(start, start, start + winlen, | ||||
| window, 0, winlen); | |||||
| window, winlen); | |||||
| s->channel[c].prev_block_len = s->subframe_len; | s->channel[c].prev_block_len = s->subframe_len; | ||||
| } | } | ||||
| @@ -2190,10 +2190,9 @@ static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1 | |||||
| ); | ); | ||||
| } | } | ||||
| static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | |||||
| const float *win, float add_bias, int len){ | |||||
| #if HAVE_6REGS | #if HAVE_6REGS | ||||
| if(add_bias == 0){ | |||||
| static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | |||||
| const float *win, int len){ | |||||
| x86_reg i = -len*4; | x86_reg i = -len*4; | ||||
| x86_reg j = len*4-8; | x86_reg j = len*4-8; | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| @@ -2220,15 +2219,10 @@ static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float | |||||
| :"+r"(i), "+r"(j) | :"+r"(i), "+r"(j) | ||||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | ||||
| ); | ); | ||||
| }else | |||||
| #endif | |||||
| ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |||||
| } | } | ||||
| static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, | static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, | ||||
| const float *win, float add_bias, int len){ | |||||
| #if HAVE_6REGS | |||||
| if(add_bias == 0){ | |||||
| const float *win, int len){ | |||||
| x86_reg i = -len*4; | x86_reg i = -len*4; | ||||
| x86_reg j = len*4-16; | x86_reg j = len*4-16; | ||||
| __asm__ volatile( | __asm__ volatile( | ||||
| @@ -2256,10 +2250,8 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s | |||||
| :"+r"(i), "+r"(j) | :"+r"(i), "+r"(j) | ||||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | ||||
| ); | ); | ||||
| }else | |||||
| #endif | |||||
| ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |||||
| } | } | ||||
| #endif /* HAVE_6REGS */ | |||||
| static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | ||||
| { | { | ||||
| @@ -2882,7 +2874,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| } | } | ||||
| if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | ||||
| c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | ||||
| #if HAVE_6REGS | |||||
| c->vector_fmul_window = vector_fmul_window_3dnow2; | c->vector_fmul_window = vector_fmul_window_3dnow2; | ||||
| #endif | |||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | ||||
| c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | ||||
| } | } | ||||
| @@ -2899,7 +2893,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| c->vector_fmul = vector_fmul_sse; | c->vector_fmul = vector_fmul_sse; | ||||
| c->vector_fmul_reverse = vector_fmul_reverse_sse; | c->vector_fmul_reverse = vector_fmul_reverse_sse; | ||||
| c->vector_fmul_add = vector_fmul_add_sse; | c->vector_fmul_add = vector_fmul_add_sse; | ||||
| #if HAVE_6REGS | |||||
| c->vector_fmul_window = vector_fmul_window_sse; | c->vector_fmul_window = vector_fmul_window_sse; | ||||
| #endif | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | ||||
| c->vector_clipf = vector_clipf_sse; | c->vector_clipf = vector_clipf_sse; | ||||
| c->float_to_int16 = float_to_int16_sse; | c->float_to_int16 = float_to_int16_sse; | ||||