NEON versions of the following functions are added: vector_fmul_scalar, vector_fmul_sv_scalar, sv_fmul_scalar, butterflies_float. Originally committed as revision 19957 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.6)
| @@ -157,6 +157,17 @@ void ff_vector_fmul_neon(float *dst, const float *src, int len); | |||
| void ff_vector_fmul_window_neon(float *dst, const float *src0, | |||
| const float *src1, const float *win, | |||
| float add_bias, int len); | |||
| /* dst[i] = src[i] * mul */ | |||
| void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, | |||
| int len); | |||
| /* dst = src * mul * (vectors fetched through vp); the _2 variant reads | |||
| * 2 floats per vp entry, the _4 variant reads 4 floats per vp entry */ | |||
| void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, | |||
| const float **vp, float mul, int len); | |||
| void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, | |||
| const float **vp, float mul, int len); | |||
| /* dst = mul * (vectors fetched through vp); the _2 variant reads | |||
| * 2 floats per vp entry, the _4 variant reads 4 floats per vp entry */ | |||
| void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, | |||
| int len); | |||
| void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, | |||
| int len); | |||
| /* in place: v1[i] becomes v1[i] + v2[i], v2[i] becomes v1[i] - v2[i] */ | |||
| void ff_butterflies_float_neon(float *v1, float *v2, int len); | |||
| void ff_float_to_int16_neon(int16_t *, const float *, long); | |||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | |||
| @@ -269,6 +280,14 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||
| c->vector_fmul = ff_vector_fmul_neon; | |||
| c->vector_fmul_window = ff_vector_fmul_window_neon; | |||
| c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; | |||
| c->butterflies_float = ff_butterflies_float_neon; | |||
| c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; | |||
| c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; | |||
| c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; | |||
| c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; | |||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| c->float_to_int16 = ff_float_to_int16_neon; | |||
| @@ -858,3 +858,155 @@ function ff_vorbis_inverse_coupling_neon, export=1 | |||
| bx lr | |||
| .endfunc | |||
| #endif | |||
| @ void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, int len) | |||
| @ Multiplies the floats at src by the scalar mul and stores the products at dst. | |||
| @ r0 = dst, r1 = src. VFP-prefixed lines read mul from d0[0] and len from r2; | |||
| @ NOVFP-prefixed lines read mul from r2 and len from r3. (VFP/NOVFP are macros | |||
| @ defined outside this view; presumably they select the float calling convention.) | |||
| @ Every load and store carries a :128 alignment hint, i.e. 16-byte aligned dst/src. | |||
| @ len is consumed in steps of 4 floats; presumably callers pass a positive multiple of 4. | |||
| function ff_vector_fmul_scalar_neon, export=1 | |||
| VFP len .req r2 | |||
| NOVFP len .req r3 | |||
| @ q8 = mul replicated into all four lanes | |||
| VFP vdup.32 q8, d0[0] | |||
| NOVFP vdup.32 q8, r2 | |||
| @ r12 = len rounded down to a multiple of 16; if that is 0 go straight to the tail loop | |||
| bics r12, len, #15 | |||
| beq 3f | |||
| @ preload the first 8 floats; later passes reload q0/q1 at the bottom of the loop | |||
| vld1.32 {q0},[r1,:128]! | |||
| vld1.32 {q1},[r1,:128]! | |||
| @ main loop: 16 floats per pass, with loads and stores interleaved between the multiplies | |||
| 1: vmul.f32 q0, q0, q8 | |||
| vld1.32 {q2},[r1,:128]! | |||
| vmul.f32 q1, q1, q8 | |||
| vld1.32 {q3},[r1,:128]! | |||
| vmul.f32 q2, q2, q8 | |||
| vst1.32 {q0},[r0,:128]! | |||
| vmul.f32 q3, q3, q8 | |||
| vst1.32 {q1},[r0,:128]! | |||
| subs r12, r12, #16 | |||
| beq 2f | |||
| vld1.32 {q0},[r1,:128]! | |||
| vst1.32 {q2},[r0,:128]! | |||
| vld1.32 {q1},[r1,:128]! | |||
| vst1.32 {q3},[r0,:128]! | |||
| b 1b | |||
| @ drain: store the last two products of the main loop, then return if len % 16 == 0 | |||
| 2: vst1.32 {q2},[r0,:128]! | |||
| vst1.32 {q3},[r0,:128]! | |||
| ands len, len, #15 | |||
| bxeq lr | |||
| @ tail loop: 4 floats per pass; the body runs once before the count is tested | |||
| 3: vld1.32 {q0},[r1,:128]! | |||
| vmul.f32 q0, q0, q8 | |||
| vst1.32 {q0},[r0,:128]! | |||
| subs len, len, #4 | |||
| bgt 3b | |||
| bx lr | |||
| .unreq len | |||
| .endfunc | |||
| @ void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, | |||
| @                                      const float **vp, float mul, int len) | |||
| @ For each group of 4 output floats: | |||
| @   dst[0..1] = src[0..1] * mul * vp[0][0..1] | |||
| @   dst[2..3] = src[2..3] * mul * vp[1][0..1] | |||
| @ then dst and src advance by 4 floats and vp advances by 2 pointers. | |||
| @ r0 = dst, r1 = src, r2 = vp. VFP-prefixed lines read mul from d0[0] and leave len in r3; | |||
| @ NOVFP-prefixed lines read mul from r3 and then load len from the stack ([sp]) into r3. | |||
| @ All vector accesses carry a :64 alignment hint (8-byte alignment). | |||
| @ The loop leaves only when the remaining count is exactly 0 (beq), so len has to be | |||
| @ a positive multiple of 4. | |||
| function ff_vector_fmul_sv_scalar_2_neon, export=1 | |||
| @ d16 = mul in both lanes | |||
| VFP vdup.32 d16, d0[0] | |||
| NOVFP vdup.32 d16, r3 | |||
| NOVFP ldr r3, [sp] | |||
| @ preload the first 4 src floats | |||
| vld1.32 {d0},[r1,:64]! | |||
| vld1.32 {d1},[r1,:64]! | |||
| @ the flags set by this subs are the ones tested by the beq further down | |||
| 1: subs r3, r3, #4 | |||
| vmul.f32 d4, d0, d16 | |||
| vmul.f32 d5, d1, d16 | |||
| @ r12 = next vector pointer taken from vp (r2 is post-incremented by 4 bytes) | |||
| ldr r12, [r2], #4 | |||
| vld1.32 {d2},[r12,:64] | |||
| ldr r12, [r2], #4 | |||
| vld1.32 {d3},[r12,:64] | |||
| vmul.f32 d4, d4, d2 | |||
| vmul.f32 d5, d5, d3 | |||
| beq 2f | |||
| @ not finished: fetch the next 4 src floats, then store the current results | |||
| vld1.32 {d0},[r1,:64]! | |||
| vld1.32 {d1},[r1,:64]! | |||
| vst1.32 {d4},[r0,:64]! | |||
| vst1.32 {d5},[r0,:64]! | |||
| b 1b | |||
| @ final group: store without reading any further src data | |||
| 2: vst1.32 {d4},[r0,:64]! | |||
| vst1.32 {d5},[r0,:64]! | |||
| bx lr | |||
| .endfunc | |||
| @ void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, | |||
| @                                      const float **vp, float mul, int len) | |||
| @ For each group of 4 output floats: dst[0..3] = src[0..3] * mul * vp[0][0..3], | |||
| @ then dst and src advance by 4 floats and vp advances by 1 pointer. | |||
| @ r0 = dst, r1 = src, r2 = vp. VFP-prefixed lines read mul from d0[0] and leave len in r3; | |||
| @ NOVFP-prefixed lines read mul from r3 and then load len from the stack ([sp]) into r3. | |||
| @ All vector accesses carry a :128 alignment hint (16-byte alignment). | |||
| @ lr serves as the main-loop counter, so it is pushed first and the function returns via pop {pc}. | |||
| function ff_vector_fmul_sv_scalar_4_neon, export=1 | |||
| @ q10 = mul in all four lanes | |||
| VFP vdup.32 q10, d0[0] | |||
| NOVFP vdup.32 q10, r3 | |||
| @ len is read from [sp] before the push below moves sp | |||
| NOVFP ldr r3, [sp] | |||
| push {lr} | |||
| @ lr = len rounded down to a multiple of 8; if that is 0 go straight to the tail loop | |||
| bics lr, r3, #7 | |||
| beq 3f | |||
| @ preload the first 8 src floats | |||
| vld1.32 {q0},[r1,:128]! | |||
| vld1.32 {q2},[r1,:128]! | |||
| @ main loop: 8 floats per pass, using two consecutive vp entries | |||
| 1: ldr r12, [r2], #4 | |||
| vld1.32 {q1},[r12,:128] | |||
| ldr r12, [r2], #4 | |||
| vld1.32 {q3},[r12,:128] | |||
| vmul.f32 q8, q0, q10 | |||
| vmul.f32 q8, q8, q1 | |||
| vmul.f32 q9, q2, q10 | |||
| vmul.f32 q9, q9, q3 | |||
| subs lr, lr, #8 | |||
| beq 2f | |||
| @ not finished: fetch the next 8 src floats, then store the current results | |||
| vld1.32 {q0},[r1,:128]! | |||
| vld1.32 {q2},[r1,:128]! | |||
| vst1.32 {q8},[r0,:128]! | |||
| vst1.32 {q9},[r0,:128]! | |||
| b 1b | |||
| @ drain: store the last results of the main loop, then return if len % 8 == 0 | |||
| 2: vst1.32 {q8},[r0,:128]! | |||
| vst1.32 {q9},[r0,:128]! | |||
| ands r3, r3, #7 | |||
| popeq {pc} | |||
| @ tail loop: 4 floats per pass; the body runs once before the count is tested | |||
| 3: vld1.32 {q0},[r1,:128]! | |||
| ldr r12, [r2], #4 | |||
| vld1.32 {q1},[r12,:128] | |||
| vmul.f32 q0, q0, q10 | |||
| vmul.f32 q0, q0, q1 | |||
| vst1.32 {q0},[r0,:128]! | |||
| subs r3, r3, #4 | |||
| bgt 3b | |||
| pop {pc} | |||
| .endfunc | |||
| @ void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, int len) | |||
| @ For each group of 4 output floats: | |||
| @   dst[0..1] = vp[0][0..1] * mul,  dst[2..3] = vp[1][0..1] * mul | |||
| @ then dst advances by 4 floats and vp advances by 2 pointers. | |||
| @ r0 = dst, r1 = vp. VFP-prefixed lines read mul from d0[0] and len from r2; | |||
| @ NOVFP-prefixed lines read mul from r2 and len from r3. | |||
| @ Loads through vp carry a :64 alignment hint, stores to dst a :128 hint. | |||
| @ The loop leaves only when the remaining count is exactly 0 (beq), so len has to be | |||
| @ a positive multiple of 4. | |||
| function ff_sv_fmul_scalar_2_neon, export=1 | |||
| VFP len .req r2 | |||
| NOVFP len .req r3 | |||
| @ q8 = mul in all four lanes (taken before d0 is overwritten by the loads below) | |||
| VFP vdup.32 q8, d0[0] | |||
| NOVFP vdup.32 q8, r2 | |||
| @ preload the first two 2-float vectors into d0/d1 (together q0) | |||
| ldr r12, [r1], #4 | |||
| vld1.32 {d0},[r12,:64] | |||
| ldr r12, [r1], #4 | |||
| vld1.32 {d1},[r12,:64] | |||
| 1: vmul.f32 q1, q0, q8 | |||
| subs len, len, #4 | |||
| beq 2f | |||
| @ not finished: fetch the next two vectors, then store the current product | |||
| ldr r12, [r1], #4 | |||
| vld1.32 {d0},[r12,:64] | |||
| ldr r12, [r1], #4 | |||
| vld1.32 {d1},[r12,:64] | |||
| vst1.32 {q1},[r0,:128]! | |||
| b 1b | |||
| @ final group: store without reading any further vp entries | |||
| 2: vst1.32 {q1},[r0,:128]! | |||
| bx lr | |||
| .unreq len | |||
| .endfunc | |||
| @ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, int len) | |||
| @ For each group of 4 output floats: dst[0..3] = vp[0][0..3] * mul, | |||
| @ then dst advances by 4 floats and vp advances by 1 pointer. | |||
| @ r0 = dst, r1 = vp. VFP-prefixed lines read mul from d0[0] and len from r2; | |||
| @ NOVFP-prefixed lines read mul from r2 and len from r3. | |||
| @ Loads and stores carry a :128 alignment hint (16-byte alignment). | |||
| function ff_sv_fmul_scalar_4_neon, export=1 | |||
| VFP len .req r2 | |||
| NOVFP len .req r3 | |||
| @ q8 = mul in all four lanes | |||
| VFP vdup.32 q8, d0[0] | |||
| NOVFP vdup.32 q8, r2 | |||
| @ one vp entry (4 floats) per pass; the body runs once before the count is tested | |||
| 1: ldr r12, [r1], #4 | |||
| vld1.32 {q0},[r12,:128] | |||
| vmul.f32 q0, q0, q8 | |||
| vst1.32 {q0},[r0,:128]! | |||
| subs len, len, #4 | |||
| bgt 1b | |||
| bx lr | |||
| .unreq len | |||
| .endfunc | |||
| @ void ff_butterflies_float_neon(float *v1, float *v2, int len) | |||
| @ In-place butterfly over two float arrays: each v1 element becomes v1 + v2 and | |||
| @ each v2 element becomes v1 - v2, both computed from the values loaded before either store. | |||
| @ r0 = v1, r1 = v2, r2 = len. Processes 4 floats per pass with :128 alignment hints; | |||
| @ the body runs once before the count is tested. | |||
| function ff_butterflies_float_neon, export=1 | |||
| @ loads do not advance the pointers; the post-incrementing stores below do | |||
| 1: vld1.32 {q0},[r0,:128] | |||
| vld1.32 {q1},[r1,:128] | |||
| vsub.f32 q2, q0, q1 | |||
| vadd.f32 q1, q0, q1 | |||
| @ difference goes back to v2, sum goes back to v1 | |||
| vst1.32 {q2},[r1,:128]! | |||
| vst1.32 {q1},[r0,:128]! | |||
| subs r2, r2, #4 | |||
| bgt 1b | |||
| bx lr | |||
| .endfunc | |||