Signed-off-by: Mans Rullgard <mans@mansr.com>
tags/n0.8
| @@ -256,7 +256,7 @@ static void apply_window_and_mdct(AVCodecContext *avctx, AACEncContext *s, | |||
| s->output[i - 448 - k] = (i < 1024) | |||
| ? sce->saved[i] | |||
| : audio[(i-1024)*chans]; | |||
| s->dsp.vector_fmul (s->output, k ? swindow : pwindow, 128); | |||
| s->dsp.vector_fmul (s->output, s->output, k ? swindow : pwindow, 128); | |||
| s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128); | |||
| ff_mdct_calc(&s->mdct128, sce->coeffs + k, s->output); | |||
| } | |||
| @@ -138,7 +138,7 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); | |||
| void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); | |||
| void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); | |||
| void ff_vector_fmul_neon(float *dst, const float *src, int len); | |||
| void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); | |||
| void ff_vector_fmul_window_neon(float *dst, const float *src0, | |||
| const float *src1, const float *win, | |||
| float add_bias, int len); | |||
| @@ -21,7 +21,8 @@ | |||
| #include "libavcodec/dsputil.h" | |||
| #include "dsputil_arm.h" | |||
| void ff_vector_fmul_vfp(float *dst, const float *src, int len); | |||
| void ff_vector_fmul_vfp(float *dst, const float *src0, | |||
| const float *src1, int len); | |||
| void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, | |||
| const float *src1, int len); | |||
| void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); | |||
| @@ -738,42 +738,41 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
| endfunc | |||
| function ff_vector_fmul_neon, export=1 | |||
| mov r3, r0 | |||
| subs r2, r2, #8 | |||
| vld1.64 {d0-d3}, [r0,:128]! | |||
| vld1.64 {d4-d7}, [r1,:128]! | |||
| subs r3, r3, #8 | |||
| vld1.64 {d0-d3}, [r1,:128]! | |||
| vld1.64 {d4-d7}, [r2,:128]! | |||
| vmul.f32 q8, q0, q2 | |||
| vmul.f32 q9, q1, q3 | |||
| beq 3f | |||
| bics ip, r2, #15 | |||
| bics ip, r3, #15 | |||
| beq 2f | |||
| 1: subs ip, ip, #16 | |||
| vld1.64 {d0-d1}, [r0,:128]! | |||
| vld1.64 {d4-d5}, [r1,:128]! | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vld1.64 {d4-d5}, [r2,:128]! | |||
| vmul.f32 q10, q0, q2 | |||
| vld1.64 {d2-d3}, [r0,:128]! | |||
| vld1.64 {d6-d7}, [r1,:128]! | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vld1.64 {d6-d7}, [r2,:128]! | |||
| vmul.f32 q11, q1, q3 | |||
| vst1.64 {d16-d19},[r3,:128]! | |||
| vld1.64 {d0-d1}, [r0,:128]! | |||
| vld1.64 {d4-d5}, [r1,:128]! | |||
| vst1.64 {d16-d19},[r0,:128]! | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vld1.64 {d4-d5}, [r2,:128]! | |||
| vmul.f32 q8, q0, q2 | |||
| vld1.64 {d2-d3}, [r0,:128]! | |||
| vld1.64 {d6-d7}, [r1,:128]! | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vld1.64 {d6-d7}, [r2,:128]! | |||
| vmul.f32 q9, q1, q3 | |||
| vst1.64 {d20-d23},[r3,:128]! | |||
| vst1.64 {d20-d23},[r0,:128]! | |||
| bne 1b | |||
| ands r2, r2, #15 | |||
| ands r3, r3, #15 | |||
| beq 3f | |||
| 2: vld1.64 {d0-d1}, [r0,:128]! | |||
| vld1.64 {d4-d5}, [r1,:128]! | |||
| vst1.64 {d16-d17},[r3,:128]! | |||
| 2: vld1.64 {d0-d1}, [r1,:128]! | |||
| vld1.64 {d4-d5}, [r2,:128]! | |||
| vst1.64 {d16-d17},[r0,:128]! | |||
| vmul.f32 q8, q0, q2 | |||
| vld1.64 {d2-d3}, [r0,:128]! | |||
| vld1.64 {d6-d7}, [r1,:128]! | |||
| vst1.64 {d18-d19},[r3,:128]! | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vld1.64 {d6-d7}, [r2,:128]! | |||
| vst1.64 {d18-d19},[r0,:128]! | |||
| vmul.f32 q9, q1, q3 | |||
| 3: vst1.64 {d16-d19},[r3,:128]! | |||
| 3: vst1.64 {d16-d19},[r0,:128]! | |||
| bx lr | |||
| endfunc | |||
| @@ -41,34 +41,33 @@ | |||
| * ARM VFP optimized implementation of 'vector_fmul_c' function. | |||
| * Assume that len is a positive number and is multiple of 8 | |||
| */ | |||
| @ void ff_vector_fmul_vfp(float *dst, const float *src, int len) | |||
| @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len) | |||
| function ff_vector_fmul_vfp, export=1 | |||
| vpush {d8-d15} | |||
| mov r3, r0 | |||
| fmrx r12, fpscr | |||
| orr r12, r12, #(3 << 16) /* set vector size to 4 */ | |||
| fmxr fpscr, r12 | |||
| vldmia r3!, {s0-s3} | |||
| vldmia r1!, {s8-s11} | |||
| vldmia r3!, {s4-s7} | |||
| vldmia r1!, {s12-s15} | |||
| vldmia r1!, {s0-s3} | |||
| vldmia r2!, {s8-s11} | |||
| vldmia r1!, {s4-s7} | |||
| vldmia r2!, {s12-s15} | |||
| vmul.f32 s8, s0, s8 | |||
| 1: | |||
| subs r2, r2, #16 | |||
| subs r3, r3, #16 | |||
| vmul.f32 s12, s4, s12 | |||
| vldmiage r3!, {s16-s19} | |||
| vldmiage r1!, {s24-s27} | |||
| vldmiage r3!, {s20-s23} | |||
| vldmiage r1!, {s28-s31} | |||
| vldmiage r1!, {s16-s19} | |||
| vldmiage r2!, {s24-s27} | |||
| vldmiage r1!, {s20-s23} | |||
| vldmiage r2!, {s28-s31} | |||
| vmulge.f32 s24, s16, s24 | |||
| vstmia r0!, {s8-s11} | |||
| vstmia r0!, {s12-s15} | |||
| vmulge.f32 s28, s20, s28 | |||
| vldmiagt r3!, {s0-s3} | |||
| vldmiagt r1!, {s8-s11} | |||
| vldmiagt r3!, {s4-s7} | |||
| vldmiagt r1!, {s12-s15} | |||
| vldmiagt r1!, {s0-s3} | |||
| vldmiagt r2!, {s8-s11} | |||
| vldmiagt r1!, {s4-s7} | |||
| vldmiagt r2!, {s12-s15} | |||
| vmulge.f32 s8, s0, s8 | |||
| vstmiage r0!, {s24-s27} | |||
| vstmiage r0!, {s28-s31} | |||
| @@ -159,7 +159,7 @@ static void IMLT(ATRAC3Context *q, float *pInput, float *pOutput, int odd_band) | |||
| ff_imdct_calc(&q->mdct_ctx,pOutput,pInput); | |||
| /* Perform windowing on the output. */ | |||
| dsp.vector_fmul(pOutput,mdct_window,512); | |||
| dsp.vector_fmul(pOutput, pOutput, mdct_window, 512); | |||
| } | |||
| @@ -3750,10 +3750,10 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) | |||
| WRAPPER8_16_SQ(rd8x8_c, rd16_c) | |||
| WRAPPER8_16_SQ(bit8x8_c, bit16_c) | |||
| static void vector_fmul_c(float *dst, const float *src, int len){ | |||
| static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){ | |||
| int i; | |||
| for(i=0; i<len; i++) | |||
| dst[i] *= src[i]; | |||
| dst[i] = src0[i] * src1[i]; | |||
| } | |||
| static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){ | |||
| @@ -375,7 +375,7 @@ typedef struct DSPContext { | |||
| void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); | |||
| void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len); | |||
| /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | |||
| void (*vector_fmul)(float *dst, const float *src, int len); | |||
| void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len); | |||
| void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); | |||
| /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ | |||
| void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); | |||
| @@ -113,13 +113,13 @@ static const uint8_t quant_lut_offset[8] = { 0, 0, 1, 4, 11, 32, 81, 230 }; | |||
| static void apply_mdct(NellyMoserEncodeContext *s) | |||
| { | |||
| memcpy(s->in_buff, s->buf[s->bufsel], NELLY_BUF_LEN * sizeof(float)); | |||
| s->dsp.vector_fmul(s->in_buff, ff_sine_128, NELLY_BUF_LEN); | |||
| s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN); | |||
| s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, | |||
| NELLY_BUF_LEN); | |||
| ff_mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff); | |||
| s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN); | |||
| s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, | |||
| ff_sine_128, NELLY_BUF_LEN); | |||
| s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128, | |||
| NELLY_BUF_LEN); | |||
| ff_mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN); | |||
| @@ -23,16 +23,16 @@ | |||
| #include "dsputil_altivec.h" | |||
| #include "util_altivec.h" | |||
| static void vector_fmul_altivec(float *dst, const float *src, int len) | |||
| static void vector_fmul_altivec(float *dst, const float *src0, const float *src1, int len) | |||
| { | |||
| int i; | |||
| vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); | |||
| for(i=0; i<len-7; i+=8) { | |||
| d0 = vec_ld(0, dst+i); | |||
| s = vec_ld(0, src+i); | |||
| d1 = vec_ld(16, dst+i); | |||
| d0 = vec_ld(0, src0+i); | |||
| s = vec_ld(0, src1+i); | |||
| d1 = vec_ld(16, src0+i); | |||
| d0 = vec_madd(d0, s, zero); | |||
| d1 = vec_madd(d1, vec_ld(16,src+i), zero); | |||
| d1 = vec_madd(d1, vec_ld(16,src1+i), zero); | |||
| vec_st(d0, 0, dst+i); | |||
| vec_st(d1, 16, dst+i); | |||
| } | |||
| @@ -783,7 +783,7 @@ static void read_and_decode_spectrum(TwinContext *tctx, GetBitContext *gb, | |||
| dec_bark_env(tctx, bark1[i][j], bark_use_hist[i][j], i, | |||
| tctx->tmp_buf, gain[sub*i+j], ftype); | |||
| tctx->dsp.vector_fmul(chunk + block_size*j, tctx->tmp_buf, | |||
| tctx->dsp.vector_fmul(chunk + block_size*j, chunk + block_size*j, tctx->tmp_buf, | |||
| block_size); | |||
| } | |||
| @@ -805,7 +805,7 @@ static void read_and_decode_spectrum(TwinContext *tctx, GetBitContext *gb, | |||
| dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf); | |||
| for (j = 0; j < mtab->fmode[ftype].sub; j++) { | |||
| tctx->dsp.vector_fmul(chunk, tctx->tmp_buf, block_size); | |||
| tctx->dsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size); | |||
| chunk += block_size; | |||
| } | |||
| } | |||
| @@ -1578,7 +1578,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) | |||
| for (j = vc->audio_channels-1;j >= 0; j--) { | |||
| ch_floor_ptr = vc->channel_floors + j * blocksize / 2; | |||
| ch_res_ptr = vc->channel_residues + res_chan[j] * blocksize / 2; | |||
| vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize / 2); | |||
| vc->dsp.vector_fmul(ch_floor_ptr, ch_floor_ptr, ch_res_ptr, blocksize / 2); | |||
| ff_imdct_half(&vc->mdct[blockflag], ch_res_ptr, ch_floor_ptr); | |||
| } | |||
| @@ -2074,38 +2074,38 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c | |||
| } | |||
| } | |||
| static void vector_fmul_3dnow(float *dst, const float *src, int len){ | |||
| static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){ | |||
| x86_reg i = (len-4)*4; | |||
| __asm__ volatile( | |||
| "1: \n\t" | |||
| "movq (%1,%0), %%mm0 \n\t" | |||
| "movq 8(%1,%0), %%mm1 \n\t" | |||
| "pfmul (%2,%0), %%mm0 \n\t" | |||
| "pfmul 8(%2,%0), %%mm1 \n\t" | |||
| "movq (%2,%0), %%mm0 \n\t" | |||
| "movq 8(%2,%0), %%mm1 \n\t" | |||
| "pfmul (%3,%0), %%mm0 \n\t" | |||
| "pfmul 8(%3,%0), %%mm1 \n\t" | |||
| "movq %%mm0, (%1,%0) \n\t" | |||
| "movq %%mm1, 8(%1,%0) \n\t" | |||
| "sub $16, %0 \n\t" | |||
| "jge 1b \n\t" | |||
| "femms \n\t" | |||
| :"+r"(i) | |||
| :"r"(dst), "r"(src) | |||
| :"r"(dst), "r"(src0), "r"(src1) | |||
| :"memory" | |||
| ); | |||
| } | |||
| static void vector_fmul_sse(float *dst, const float *src, int len){ | |||
| static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){ | |||
| x86_reg i = (len-8)*4; | |||
| __asm__ volatile( | |||
| "1: \n\t" | |||
| "movaps (%1,%0), %%xmm0 \n\t" | |||
| "movaps 16(%1,%0), %%xmm1 \n\t" | |||
| "mulps (%2,%0), %%xmm0 \n\t" | |||
| "mulps 16(%2,%0), %%xmm1 \n\t" | |||
| "movaps (%2,%0), %%xmm0 \n\t" | |||
| "movaps 16(%2,%0), %%xmm1 \n\t" | |||
| "mulps (%3,%0), %%xmm0 \n\t" | |||
| "mulps 16(%3,%0), %%xmm1 \n\t" | |||
| "movaps %%xmm0, (%1,%0) \n\t" | |||
| "movaps %%xmm1, 16(%1,%0) \n\t" | |||
| "sub $32, %0 \n\t" | |||
| "jge 1b \n\t" | |||
| :"+r"(i) | |||
| :"r"(dst), "r"(src) | |||
| :"r"(dst), "r"(src0), "r"(src1) | |||
| :"memory" | |||
| ); | |||
| } | |||