This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.
Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672)
tags/n0.8
| @@ -12,6 +12,7 @@ OBJS = allcodecs.o \ | |||||
| bitstream_filter.o \ | bitstream_filter.o \ | ||||
| dsputil.o \ | dsputil.o \ | ||||
| faanidct.o \ | faanidct.o \ | ||||
| fmtconvert.o \ | |||||
| imgconvert.o \ | imgconvert.o \ | ||||
| jrevdct.o \ | jrevdct.o \ | ||||
| opt.o \ | opt.o \ | ||||
| @@ -35,6 +35,7 @@ | |||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "mpeg4audio.h" | #include "mpeg4audio.h" | ||||
| #include "sbr.h" | #include "sbr.h" | ||||
| #include "fmtconvert.h" | |||||
| #include <stdint.h> | #include <stdint.h> | ||||
| @@ -268,6 +269,7 @@ typedef struct { | |||||
| FFTContext mdct; | FFTContext mdct; | ||||
| FFTContext mdct_small; | FFTContext mdct_small; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| FmtConvertContext fmt_conv; | |||||
| int random_state; | int random_state; | ||||
| /** @} */ | /** @} */ | ||||
| @@ -85,6 +85,7 @@ | |||||
| #include "get_bits.h" | #include "get_bits.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| #include "lpc.h" | #include "lpc.h" | ||||
| #include "aac.h" | #include "aac.h" | ||||
| @@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx) | |||||
| ff_aac_sbr_init(); | ff_aac_sbr_init(); | ||||
| dsputil_init(&ac->dsp, avctx); | dsputil_init(&ac->dsp, avctx); | ||||
| ff_fmt_convert_init(&ac->fmt_conv, avctx); | |||||
| ac->random_state = 0x1f2e3d4c; | ac->random_state = 0x1f2e3d4c; | ||||
| @@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data, | |||||
| *data_size = data_size_tmp; | *data_size = data_size_tmp; | ||||
| if (samples) | if (samples) | ||||
| ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); | |||||
| ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); | |||||
| if (ac->output_configured) | if (ac->output_configured) | ||||
| ac->output_configured = OC_LOCKED; | ac->output_configured = OC_LOCKED; | ||||
| @@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) | |||||
| ff_mdct_init(&s->imdct_512, 9, 1, 1.0); | ff_mdct_init(&s->imdct_512, 9, 1, 1.0); | ||||
| ff_kbd_window_init(s->window, 5.0, 256); | ff_kbd_window_init(s->window, 5.0, 256); | ||||
| dsputil_init(&s->dsp, avctx); | dsputil_init(&s->dsp, avctx); | ||||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||||
| av_lfg_init(&s->dith_state, 0); | av_lfg_init(&s->dith_state, 0); | ||||
| /* set scale value for float to int16 conversion */ | /* set scale value for float to int16 conversion */ | ||||
| @@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) | |||||
| } else { | } else { | ||||
| gain *= s->dynamic_range[0]; | gain *= s->dynamic_range[0]; | ||||
| } | } | ||||
| s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); | |||||
| s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); | |||||
| } | } | ||||
| /* apply spectral extension to high frequency bins */ | /* apply spectral extension to high frequency bins */ | ||||
| @@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, | |||||
| av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); | av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); | ||||
| err = 1; | err = 1; | ||||
| } | } | ||||
| s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels); | |||||
| s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels); | |||||
| out_samples += 256 * s->out_channels; | out_samples += 256 * s->out_channels; | ||||
| } | } | ||||
| *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); | *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); | ||||
| @@ -55,6 +55,7 @@ | |||||
| #include "get_bits.h" | #include "get_bits.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| /* override ac3.h to include coupling channel */ | /* override ac3.h to include coupling channel */ | ||||
| #undef AC3_MAX_CHANNELS | #undef AC3_MAX_CHANNELS | ||||
| @@ -190,6 +191,7 @@ typedef struct { | |||||
| ///@defgroup opt optimization | ///@defgroup opt optimization | ||||
| DSPContext dsp; ///< for optimization | DSPContext dsp; ///< for optimization | ||||
| FmtConvertContext fmt_conv; ///< optimized conversion functions | |||||
| float mul_bias; ///< scaling for float_to_int16 conversion | float mul_bias; ///< scaling for float_to_int16 conversion | ||||
| ///@} | ///@} | ||||
| @@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o | |||||
| OBJS += arm/dsputil_init_arm.o \ | OBJS += arm/dsputil_init_arm.o \ | ||||
| arm/dsputil_arm.o \ | arm/dsputil_arm.o \ | ||||
| arm/fft_init_arm.o \ | arm/fft_init_arm.o \ | ||||
| arm/fmtconvert_init_arm.o \ | |||||
| arm/jrevdct_arm.o \ | arm/jrevdct_arm.o \ | ||||
| arm/mpegvideo_arm.o \ | arm/mpegvideo_arm.o \ | ||||
| arm/simple_idct_arm.o \ | arm/simple_idct_arm.o \ | ||||
| @@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ | |||||
| arm/dsputil_armv6.o \ | arm/dsputil_armv6.o \ | ||||
| arm/simple_idct_armv6.o \ | arm/simple_idct_armv6.o \ | ||||
| VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ | |||||
| OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ | OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ | ||||
| arm/dsputil_init_vfp.o \ | arm/dsputil_init_vfp.o \ | ||||
| $(VFP-OBJS-yes) | |||||
| OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ | OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ | ||||
| arm/mpegvideo_iwmmxt.o \ | arm/mpegvideo_iwmmxt.o \ | ||||
| @@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ | |||||
| OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ | OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ | ||||
| arm/dsputil_neon.o \ | arm/dsputil_neon.o \ | ||||
| arm/fmtconvert_neon.o \ | |||||
| arm/int_neon.o \ | arm/int_neon.o \ | ||||
| arm/mpegvideo_neon.o \ | arm/mpegvideo_neon.o \ | ||||
| arm/simple_idct_neon.o \ | arm/simple_idct_neon.o \ | ||||
| @@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, | |||||
| int len); | int len); | ||||
| void ff_butterflies_float_neon(float *v1, float *v2, int len); | void ff_butterflies_float_neon(float *v1, float *v2, int len); | ||||
| float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); | float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); | ||||
| void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, | |||||
| float mul, int len); | |||||
| void ff_vector_fmul_reverse_neon(float *dst, const float *src0, | void ff_vector_fmul_reverse_neon(float *dst, const float *src0, | ||||
| const float *src1, int len); | const float *src1, int len); | ||||
| void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, | void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, | ||||
| @@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, | |||||
| void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, | void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, | ||||
| int len); | int len); | ||||
| void ff_float_to_int16_neon(int16_t *, const float *, long); | |||||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | |||||
| void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); | void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); | ||||
| @@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||||
| c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; | c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; | ||||
| c->butterflies_float = ff_butterflies_float_neon; | c->butterflies_float = ff_butterflies_float_neon; | ||||
| c->scalarproduct_float = ff_scalarproduct_float_neon; | c->scalarproduct_float = ff_scalarproduct_float_neon; | ||||
| c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; | |||||
| c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; | c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; | ||||
| c->vector_fmul_add = ff_vector_fmul_add_neon; | c->vector_fmul_add = ff_vector_fmul_add_neon; | ||||
| c->vector_clipf = ff_vector_clipf_neon; | c->vector_clipf = ff_vector_clipf_neon; | ||||
| @@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||||
| c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; | c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; | ||||
| c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; | c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; | ||||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||||
| c->float_to_int16 = ff_float_to_int16_neon; | |||||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; | |||||
| } | |||||
| if (CONFIG_VORBIS_DECODER) | if (CONFIG_VORBIS_DECODER) | ||||
| c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; | c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; | ||||
| @@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, | |||||
| const float *src1, int len); | const float *src1, int len); | ||||
| void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, | void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, | ||||
| const float *src1, int len); | const float *src1, int len); | ||||
| void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); | |||||
| void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) | void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) | ||||
| { | { | ||||
| c->vector_fmul = ff_vector_fmul_vfp; | c->vector_fmul = ff_vector_fmul_vfp; | ||||
| c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; | c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; | ||||
| #if HAVE_ARMV6 | |||||
| c->float_to_int16 = ff_float_to_int16_vfp; | |||||
| #endif | |||||
| } | } | ||||
| @@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 | |||||
| bx lr | bx lr | ||||
| endfunc | endfunc | ||||
| function ff_float_to_int16_neon, export=1 | |||||
| subs r2, r2, #8 | |||||
| vld1.64 {d0-d1}, [r1,:128]! | |||||
| vcvt.s32.f32 q8, q0, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vcvt.s32.f32 q9, q1, #16 | |||||
| beq 3f | |||||
| bics ip, r2, #15 | |||||
| beq 2f | |||||
| 1: subs ip, ip, #16 | |||||
| vshrn.s32 d4, q8, #16 | |||||
| vld1.64 {d0-d1}, [r1,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vshrn.s32 d6, q0, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| vshrn.s32 d7, q1, #16 | |||||
| vld1.64 {d16-d17},[r1,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r1,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.64 {d6-d7}, [r0,:128]! | |||||
| bne 1b | |||||
| ands r2, r2, #15 | |||||
| beq 3f | |||||
| 2: vld1.64 {d0-d1}, [r1,:128]! | |||||
| vshrn.s32 d4, q8, #16 | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vshrn.s32 d6, q0, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| vshrn.s32 d7, q1, #16 | |||||
| vst1.64 {d6-d7}, [r0,:128]! | |||||
| bx lr | |||||
| 3: vshrn.s32 d4, q8, #16 | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| bx lr | |||||
| endfunc | |||||
| function ff_float_to_int16_interleave_neon, export=1 | |||||
| cmp r3, #2 | |||||
| ldrlt r1, [r1] | |||||
| blt ff_float_to_int16_neon | |||||
| bne 4f | |||||
| ldr r3, [r1] | |||||
| ldr r1, [r1, #4] | |||||
| subs r2, r2, #8 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q8, q0, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q9, q1, #16 | |||||
| vld1.64 {d20-d21},[r1,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r1,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| beq 3f | |||||
| bics ip, r2, #15 | |||||
| beq 2f | |||||
| 1: subs ip, ip, #16 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 q10, q8, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vld1.64 {d24-d25},[r1,:128]! | |||||
| vcvt.s32.f32 q12, q12, #16 | |||||
| vld1.64 {d26-d27},[r1,:128]! | |||||
| vsri.32 q11, q9, #16 | |||||
| vst1.64 {d20-d21},[r0,:128]! | |||||
| vcvt.s32.f32 q13, q13, #16 | |||||
| vst1.64 {d22-d23},[r0,:128]! | |||||
| vsri.32 q12, q0, #16 | |||||
| vld1.64 {d16-d17},[r3,:128]! | |||||
| vsri.32 q13, q1, #16 | |||||
| vst1.64 {d24-d25},[r0,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r3,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r1,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r1,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.64 {d26-d27},[r0,:128]! | |||||
| bne 1b | |||||
| ands r2, r2, #15 | |||||
| beq 3f | |||||
| 2: vsri.32 q10, q8, #16 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vld1.64 {d24-d25},[r1,:128]! | |||||
| vcvt.s32.f32 q12, q12, #16 | |||||
| vsri.32 q11, q9, #16 | |||||
| vld1.64 {d26-d27},[r1,:128]! | |||||
| vcvt.s32.f32 q13, q13, #16 | |||||
| vst1.64 {d20-d21},[r0,:128]! | |||||
| vsri.32 q12, q0, #16 | |||||
| vst1.64 {d22-d23},[r0,:128]! | |||||
| vsri.32 q13, q1, #16 | |||||
| vst1.64 {d24-d27},[r0,:128]! | |||||
| bx lr | |||||
| 3: vsri.32 q10, q8, #16 | |||||
| vsri.32 q11, q9, #16 | |||||
| vst1.64 {d20-d23},[r0,:128]! | |||||
| bx lr | |||||
| 4: push {r4-r8,lr} | |||||
| cmp r3, #4 | |||||
| lsl ip, r3, #1 | |||||
| blt 4f | |||||
| @ 4 channels | |||||
| 5: ldmia r1!, {r4-r7} | |||||
| mov lr, r2 | |||||
| mov r8, r0 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r6,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r7,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| 6: subs lr, lr, #8 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 q9, q8, #16 | |||||
| vld1.64 {d2-d3}, [r5,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vsri.32 q11, q10, #16 | |||||
| vld1.64 {d4-d5}, [r6,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vzip.32 d18, d22 | |||||
| vld1.64 {d6-d7}, [r7,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vzip.32 d19, d23 | |||||
| vst1.64 {d18}, [r8], ip | |||||
| vsri.32 q1, q0, #16 | |||||
| vst1.64 {d22}, [r8], ip | |||||
| vsri.32 q3, q2, #16 | |||||
| vst1.64 {d19}, [r8], ip | |||||
| vzip.32 d2, d6 | |||||
| vst1.64 {d23}, [r8], ip | |||||
| vzip.32 d3, d7 | |||||
| beq 7f | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.64 {d2}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.64 {d6}, [r8], ip | |||||
| vld1.64 {d20-d21},[r6,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.64 {d3}, [r8], ip | |||||
| vld1.64 {d22-d23},[r7,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.64 {d7}, [r8], ip | |||||
| b 6b | |||||
| 7: vst1.64 {d2}, [r8], ip | |||||
| vst1.64 {d6}, [r8], ip | |||||
| vst1.64 {d3}, [r8], ip | |||||
| vst1.64 {d7}, [r8], ip | |||||
| subs r3, r3, #4 | |||||
| popeq {r4-r8,pc} | |||||
| cmp r3, #4 | |||||
| add r0, r0, #8 | |||||
| bge 5b | |||||
| @ 2 channels | |||||
| 4: cmp r3, #2 | |||||
| blt 4f | |||||
| ldmia r1!, {r4-r5} | |||||
| mov lr, r2 | |||||
| mov r8, r0 | |||||
| tst lr, #8 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| beq 6f | |||||
| subs lr, lr, #8 | |||||
| beq 7f | |||||
| vsri.32 d18, d16, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| 6: subs lr, lr, #16 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 d18, d16, #16 | |||||
| vld1.64 {d2-d3}, [r5,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vld1.64 {d4-d5}, [r4,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vld1.64 {d6-d7}, [r5,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vsri.32 d2, d0, #16 | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vsri.32 d3, d1, #16 | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vsri.32 d6, d4, #16 | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vsri.32 d7, d5, #16 | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| beq 6f | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.32 {d2[0]}, [r8], ip | |||||
| vst1.32 {d2[1]}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.32 {d3[0]}, [r8], ip | |||||
| vst1.32 {d3[1]}, [r8], ip | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.32 {d6[0]}, [r8], ip | |||||
| vst1.32 {d6[1]}, [r8], ip | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.32 {d7[0]}, [r8], ip | |||||
| vst1.32 {d7[1]}, [r8], ip | |||||
| bgt 6b | |||||
| 6: vst1.32 {d2[0]}, [r8], ip | |||||
| vst1.32 {d2[1]}, [r8], ip | |||||
| vst1.32 {d3[0]}, [r8], ip | |||||
| vst1.32 {d3[1]}, [r8], ip | |||||
| vst1.32 {d6[0]}, [r8], ip | |||||
| vst1.32 {d6[1]}, [r8], ip | |||||
| vst1.32 {d7[0]}, [r8], ip | |||||
| vst1.32 {d7[1]}, [r8], ip | |||||
| b 8f | |||||
| 7: vsri.32 d18, d16, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| 8: subs r3, r3, #2 | |||||
| add r0, r0, #4 | |||||
| popeq {r4-r8,pc} | |||||
| @ 1 channel | |||||
| 4: ldr r4, [r1],#4 | |||||
| tst r2, #8 | |||||
| mov lr, r2 | |||||
| mov r5, r0 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| bne 8f | |||||
| 6: subs lr, lr, #16 | |||||
| vld1.64 {d4-d5}, [r4,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vld1.64 {d6-d7}, [r4,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vst1.16 {d0[1]}, [r5,:16], ip | |||||
| vst1.16 {d0[3]}, [r5,:16], ip | |||||
| vst1.16 {d1[1]}, [r5,:16], ip | |||||
| vst1.16 {d1[3]}, [r5,:16], ip | |||||
| vst1.16 {d2[1]}, [r5,:16], ip | |||||
| vst1.16 {d2[3]}, [r5,:16], ip | |||||
| vst1.16 {d3[1]}, [r5,:16], ip | |||||
| vst1.16 {d3[3]}, [r5,:16], ip | |||||
| beq 7f | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| 7: vst1.16 {d4[1]}, [r5,:16], ip | |||||
| vst1.16 {d4[3]}, [r5,:16], ip | |||||
| vst1.16 {d5[1]}, [r5,:16], ip | |||||
| vst1.16 {d5[3]}, [r5,:16], ip | |||||
| vst1.16 {d6[1]}, [r5,:16], ip | |||||
| vst1.16 {d6[3]}, [r5,:16], ip | |||||
| vst1.16 {d7[1]}, [r5,:16], ip | |||||
| vst1.16 {d7[3]}, [r5,:16], ip | |||||
| bgt 6b | |||||
| pop {r4-r8,pc} | |||||
| 8: subs lr, lr, #8 | |||||
| vst1.16 {d0[1]}, [r5,:16], ip | |||||
| vst1.16 {d0[3]}, [r5,:16], ip | |||||
| vst1.16 {d1[1]}, [r5,:16], ip | |||||
| vst1.16 {d1[3]}, [r5,:16], ip | |||||
| vst1.16 {d2[1]}, [r5,:16], ip | |||||
| vst1.16 {d2[3]}, [r5,:16], ip | |||||
| vst1.16 {d3[1]}, [r5,:16], ip | |||||
| vst1.16 {d3[3]}, [r5,:16], ip | |||||
| popeq {r4-r8,pc} | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| b 6b | |||||
| endfunc | |||||
| function ff_vector_fmul_neon, export=1 | function ff_vector_fmul_neon, export=1 | ||||
| subs r3, r3, #8 | subs r3, r3, #8 | ||||
| vld1.64 {d0-d3}, [r1,:128]! | vld1.64 {d0-d3}, [r1,:128]! | ||||
| @@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] | |||||
| bx lr | bx lr | ||||
| endfunc | endfunc | ||||
| function ff_int32_to_float_fmul_scalar_neon, export=1 | |||||
| VFP vdup.32 q0, d0[0] | |||||
| VFP len .req r2 | |||||
| NOVFP vdup.32 q0, r2 | |||||
| NOVFP len .req r3 | |||||
| vld1.32 {q1},[r1,:128]! | |||||
| vcvt.f32.s32 q3, q1 | |||||
| vld1.32 {q2},[r1,:128]! | |||||
| vcvt.f32.s32 q8, q2 | |||||
| 1: subs len, len, #8 | |||||
| pld [r1, #16] | |||||
| vmul.f32 q9, q3, q0 | |||||
| vmul.f32 q10, q8, q0 | |||||
| beq 2f | |||||
| vld1.32 {q1},[r1,:128]! | |||||
| vcvt.f32.s32 q3, q1 | |||||
| vld1.32 {q2},[r1,:128]! | |||||
| vcvt.f32.s32 q8, q2 | |||||
| vst1.32 {q9}, [r0,:128]! | |||||
| vst1.32 {q10},[r0,:128]! | |||||
| b 1b | |||||
| 2: vst1.32 {q9}, [r0,:128]! | |||||
| vst1.32 {q10},[r0,:128]! | |||||
| bx lr | |||||
| .unreq len | |||||
| endfunc | |||||
| function ff_vector_fmul_reverse_neon, export=1 | function ff_vector_fmul_reverse_neon, export=1 | ||||
| add r2, r2, r3, lsl #2 | add r2, r2, r3, lsl #2 | ||||
| sub r2, r2, #32 | sub r2, r2, #32 | ||||
| @@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 | |||||
| vpop {d8-d15} | vpop {d8-d15} | ||||
| bx lr | bx lr | ||||
| endfunc | endfunc | ||||
| #if HAVE_ARMV6 | |||||
| /** | |||||
| * ARM VFP optimized float to int16 conversion. | |||||
| * Assume that len is a positive number and a multiple of 8, the destination | |||||
| * buffer is at least 4-byte aligned (8-byte alignment is better for | |||||
| * performance), and the byte order is little-endian. | |||||
| */ | |||||
| @ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) | |||||
| function ff_float_to_int16_vfp, export=1 | |||||
| push {r4-r8,lr} | |||||
| vpush {d8-d11} | |||||
| vldmia r1!, {s16-s23} | |||||
| vcvt.s32.f32 s0, s16 | |||||
| vcvt.s32.f32 s1, s17 | |||||
| vcvt.s32.f32 s2, s18 | |||||
| vcvt.s32.f32 s3, s19 | |||||
| vcvt.s32.f32 s4, s20 | |||||
| vcvt.s32.f32 s5, s21 | |||||
| vcvt.s32.f32 s6, s22 | |||||
| vcvt.s32.f32 s7, s23 | |||||
| 1: | |||||
| subs r2, r2, #8 | |||||
| vmov r3, r4, s0, s1 | |||||
| vmov r5, r6, s2, s3 | |||||
| vmov r7, r8, s4, s5 | |||||
| vmov ip, lr, s6, s7 | |||||
| vldmiagt r1!, {s16-s23} | |||||
| ssat r4, #16, r4 | |||||
| ssat r3, #16, r3 | |||||
| ssat r6, #16, r6 | |||||
| ssat r5, #16, r5 | |||||
| pkhbt r3, r3, r4, lsl #16 | |||||
| pkhbt r4, r5, r6, lsl #16 | |||||
| vcvtgt.s32.f32 s0, s16 | |||||
| vcvtgt.s32.f32 s1, s17 | |||||
| vcvtgt.s32.f32 s2, s18 | |||||
| vcvtgt.s32.f32 s3, s19 | |||||
| vcvtgt.s32.f32 s4, s20 | |||||
| vcvtgt.s32.f32 s5, s21 | |||||
| vcvtgt.s32.f32 s6, s22 | |||||
| vcvtgt.s32.f32 s7, s23 | |||||
| ssat r8, #16, r8 | |||||
| ssat r7, #16, r7 | |||||
| ssat lr, #16, lr | |||||
| ssat ip, #16, ip | |||||
| pkhbt r5, r7, r8, lsl #16 | |||||
| pkhbt r6, ip, lr, lsl #16 | |||||
| stmia r0!, {r3-r6} | |||||
| bgt 1b | |||||
| vpop {d8-d11} | |||||
| pop {r4-r8,pc} | |||||
| endfunc | |||||
| #endif | |||||
| @@ -0,0 +1,48 @@ | |||||
| /* | |||||
| * ARM optimized Format Conversion Utils | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include <stdint.h> | |||||
| #include "libavcodec/avcodec.h" | |||||
| #include "libavcodec/fmtconvert.h" | |||||
| void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, | |||||
| float mul, int len); | |||||
| void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); | |||||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | |||||
| void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); | |||||
| void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) | |||||
| { | |||||
| if (HAVE_ARMVFP && HAVE_ARMV6) { | |||||
| c->float_to_int16 = ff_float_to_int16_vfp; | |||||
| } | |||||
| if (HAVE_NEON) { | |||||
| c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; | |||||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||||
| c->float_to_int16 = ff_float_to_int16_neon; | |||||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,391 @@ | |||||
| /* | |||||
| * ARM NEON optimised Format Conversion Utils | |||||
| * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "config.h" | |||||
| #include "asm.S" | |||||
| preserve8 | |||||
| .text | |||||
| function ff_float_to_int16_neon, export=1 | |||||
| subs r2, r2, #8 | |||||
| vld1.64 {d0-d1}, [r1,:128]! | |||||
| vcvt.s32.f32 q8, q0, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vcvt.s32.f32 q9, q1, #16 | |||||
| beq 3f | |||||
| bics ip, r2, #15 | |||||
| beq 2f | |||||
| 1: subs ip, ip, #16 | |||||
| vshrn.s32 d4, q8, #16 | |||||
| vld1.64 {d0-d1}, [r1,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vshrn.s32 d6, q0, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| vshrn.s32 d7, q1, #16 | |||||
| vld1.64 {d16-d17},[r1,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r1,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.64 {d6-d7}, [r0,:128]! | |||||
| bne 1b | |||||
| ands r2, r2, #15 | |||||
| beq 3f | |||||
| 2: vld1.64 {d0-d1}, [r1,:128]! | |||||
| vshrn.s32 d4, q8, #16 | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r1,:128]! | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vshrn.s32 d6, q0, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| vshrn.s32 d7, q1, #16 | |||||
| vst1.64 {d6-d7}, [r0,:128]! | |||||
| bx lr | |||||
| 3: vshrn.s32 d4, q8, #16 | |||||
| vshrn.s32 d5, q9, #16 | |||||
| vst1.64 {d4-d5}, [r0,:128]! | |||||
| bx lr | |||||
| endfunc | |||||
| function ff_float_to_int16_interleave_neon, export=1 | |||||
| cmp r3, #2 | |||||
| ldrlt r1, [r1] | |||||
| blt ff_float_to_int16_neon | |||||
| bne 4f | |||||
| ldr r3, [r1] | |||||
| ldr r1, [r1, #4] | |||||
| subs r2, r2, #8 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q8, q0, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q9, q1, #16 | |||||
| vld1.64 {d20-d21},[r1,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r1,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| beq 3f | |||||
| bics ip, r2, #15 | |||||
| beq 2f | |||||
| 1: subs ip, ip, #16 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 q10, q8, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vld1.64 {d24-d25},[r1,:128]! | |||||
| vcvt.s32.f32 q12, q12, #16 | |||||
| vld1.64 {d26-d27},[r1,:128]! | |||||
| vsri.32 q11, q9, #16 | |||||
| vst1.64 {d20-d21},[r0,:128]! | |||||
| vcvt.s32.f32 q13, q13, #16 | |||||
| vst1.64 {d22-d23},[r0,:128]! | |||||
| vsri.32 q12, q0, #16 | |||||
| vld1.64 {d16-d17},[r3,:128]! | |||||
| vsri.32 q13, q1, #16 | |||||
| vst1.64 {d24-d25},[r0,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r3,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r1,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r1,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.64 {d26-d27},[r0,:128]! | |||||
| bne 1b | |||||
| ands r2, r2, #15 | |||||
| beq 3f | |||||
| 2: vsri.32 q10, q8, #16 | |||||
| vld1.64 {d0-d1}, [r3,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r3,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vld1.64 {d24-d25},[r1,:128]! | |||||
| vcvt.s32.f32 q12, q12, #16 | |||||
| vsri.32 q11, q9, #16 | |||||
| vld1.64 {d26-d27},[r1,:128]! | |||||
| vcvt.s32.f32 q13, q13, #16 | |||||
| vst1.64 {d20-d21},[r0,:128]! | |||||
| vsri.32 q12, q0, #16 | |||||
| vst1.64 {d22-d23},[r0,:128]! | |||||
| vsri.32 q13, q1, #16 | |||||
| vst1.64 {d24-d27},[r0,:128]! | |||||
| bx lr | |||||
| 3: vsri.32 q10, q8, #16 | |||||
| vsri.32 q11, q9, #16 | |||||
| vst1.64 {d20-d23},[r0,:128]! | |||||
| bx lr | |||||
| 4: push {r4-r8,lr} | |||||
| cmp r3, #4 | |||||
| lsl ip, r3, #1 | |||||
| blt 4f | |||||
| @ 4 channels | |||||
| 5: ldmia r1!, {r4-r7} | |||||
| mov lr, r2 | |||||
| mov r8, r0 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r6,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r7,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| 6: subs lr, lr, #8 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 q9, q8, #16 | |||||
| vld1.64 {d2-d3}, [r5,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vsri.32 q11, q10, #16 | |||||
| vld1.64 {d4-d5}, [r6,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vzip.32 d18, d22 | |||||
| vld1.64 {d6-d7}, [r7,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vzip.32 d19, d23 | |||||
| vst1.64 {d18}, [r8], ip | |||||
| vsri.32 q1, q0, #16 | |||||
| vst1.64 {d22}, [r8], ip | |||||
| vsri.32 q3, q2, #16 | |||||
| vst1.64 {d19}, [r8], ip | |||||
| vzip.32 d2, d6 | |||||
| vst1.64 {d23}, [r8], ip | |||||
| vzip.32 d3, d7 | |||||
| beq 7f | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.64 {d2}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.64 {d6}, [r8], ip | |||||
| vld1.64 {d20-d21},[r6,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.64 {d3}, [r8], ip | |||||
| vld1.64 {d22-d23},[r7,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.64 {d7}, [r8], ip | |||||
| b 6b | |||||
| 7: vst1.64 {d2}, [r8], ip | |||||
| vst1.64 {d6}, [r8], ip | |||||
| vst1.64 {d3}, [r8], ip | |||||
| vst1.64 {d7}, [r8], ip | |||||
| subs r3, r3, #4 | |||||
| popeq {r4-r8,pc} | |||||
| cmp r3, #4 | |||||
| add r0, r0, #8 | |||||
| bge 5b | |||||
| @ 2 channels | |||||
| 4: cmp r3, #2 | |||||
| blt 4f | |||||
| ldmia r1!, {r4-r5} | |||||
| mov lr, r2 | |||||
| mov r8, r0 | |||||
| tst lr, #8 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| beq 6f | |||||
| subs lr, lr, #8 | |||||
| beq 7f | |||||
| vsri.32 d18, d16, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| 6: subs lr, lr, #16 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vsri.32 d18, d16, #16 | |||||
| vld1.64 {d2-d3}, [r5,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vld1.64 {d4-d5}, [r4,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vld1.64 {d6-d7}, [r5,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vsri.32 d2, d0, #16 | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vsri.32 d3, d1, #16 | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vsri.32 d6, d4, #16 | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vsri.32 d7, d5, #16 | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| beq 6f | |||||
| vld1.64 {d16-d17},[r4,:128]! | |||||
| vcvt.s32.f32 q8, q8, #16 | |||||
| vst1.32 {d2[0]}, [r8], ip | |||||
| vst1.32 {d2[1]}, [r8], ip | |||||
| vld1.64 {d18-d19},[r5,:128]! | |||||
| vcvt.s32.f32 q9, q9, #16 | |||||
| vst1.32 {d3[0]}, [r8], ip | |||||
| vst1.32 {d3[1]}, [r8], ip | |||||
| vld1.64 {d20-d21},[r4,:128]! | |||||
| vcvt.s32.f32 q10, q10, #16 | |||||
| vst1.32 {d6[0]}, [r8], ip | |||||
| vst1.32 {d6[1]}, [r8], ip | |||||
| vld1.64 {d22-d23},[r5,:128]! | |||||
| vcvt.s32.f32 q11, q11, #16 | |||||
| vst1.32 {d7[0]}, [r8], ip | |||||
| vst1.32 {d7[1]}, [r8], ip | |||||
| bgt 6b | |||||
| 6: vst1.32 {d2[0]}, [r8], ip | |||||
| vst1.32 {d2[1]}, [r8], ip | |||||
| vst1.32 {d3[0]}, [r8], ip | |||||
| vst1.32 {d3[1]}, [r8], ip | |||||
| vst1.32 {d6[0]}, [r8], ip | |||||
| vst1.32 {d6[1]}, [r8], ip | |||||
| vst1.32 {d7[0]}, [r8], ip | |||||
| vst1.32 {d7[1]}, [r8], ip | |||||
| b 8f | |||||
| 7: vsri.32 d18, d16, #16 | |||||
| vsri.32 d19, d17, #16 | |||||
| vst1.32 {d18[0]}, [r8], ip | |||||
| vsri.32 d22, d20, #16 | |||||
| vst1.32 {d18[1]}, [r8], ip | |||||
| vsri.32 d23, d21, #16 | |||||
| vst1.32 {d19[0]}, [r8], ip | |||||
| vst1.32 {d19[1]}, [r8], ip | |||||
| vst1.32 {d22[0]}, [r8], ip | |||||
| vst1.32 {d22[1]}, [r8], ip | |||||
| vst1.32 {d23[0]}, [r8], ip | |||||
| vst1.32 {d23[1]}, [r8], ip | |||||
| 8: subs r3, r3, #2 | |||||
| add r0, r0, #4 | |||||
| popeq {r4-r8,pc} | |||||
| @ 1 channel | |||||
| 4: ldr r4, [r1],#4 | |||||
| tst r2, #8 | |||||
| mov lr, r2 | |||||
| mov r5, r0 | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| bne 8f | |||||
| 6: subs lr, lr, #16 | |||||
| vld1.64 {d4-d5}, [r4,:128]! | |||||
| vcvt.s32.f32 q2, q2, #16 | |||||
| vld1.64 {d6-d7}, [r4,:128]! | |||||
| vcvt.s32.f32 q3, q3, #16 | |||||
| vst1.16 {d0[1]}, [r5,:16], ip | |||||
| vst1.16 {d0[3]}, [r5,:16], ip | |||||
| vst1.16 {d1[1]}, [r5,:16], ip | |||||
| vst1.16 {d1[3]}, [r5,:16], ip | |||||
| vst1.16 {d2[1]}, [r5,:16], ip | |||||
| vst1.16 {d2[3]}, [r5,:16], ip | |||||
| vst1.16 {d3[1]}, [r5,:16], ip | |||||
| vst1.16 {d3[3]}, [r5,:16], ip | |||||
| beq 7f | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| 7: vst1.16 {d4[1]}, [r5,:16], ip | |||||
| vst1.16 {d4[3]}, [r5,:16], ip | |||||
| vst1.16 {d5[1]}, [r5,:16], ip | |||||
| vst1.16 {d5[3]}, [r5,:16], ip | |||||
| vst1.16 {d6[1]}, [r5,:16], ip | |||||
| vst1.16 {d6[3]}, [r5,:16], ip | |||||
| vst1.16 {d7[1]}, [r5,:16], ip | |||||
| vst1.16 {d7[3]}, [r5,:16], ip | |||||
| bgt 6b | |||||
| pop {r4-r8,pc} | |||||
| 8: subs lr, lr, #8 | |||||
| vst1.16 {d0[1]}, [r5,:16], ip | |||||
| vst1.16 {d0[3]}, [r5,:16], ip | |||||
| vst1.16 {d1[1]}, [r5,:16], ip | |||||
| vst1.16 {d1[3]}, [r5,:16], ip | |||||
| vst1.16 {d2[1]}, [r5,:16], ip | |||||
| vst1.16 {d2[3]}, [r5,:16], ip | |||||
| vst1.16 {d3[1]}, [r5,:16], ip | |||||
| vst1.16 {d3[3]}, [r5,:16], ip | |||||
| popeq {r4-r8,pc} | |||||
| vld1.64 {d0-d1}, [r4,:128]! | |||||
| vcvt.s32.f32 q0, q0, #16 | |||||
| vld1.64 {d2-d3}, [r4,:128]! | |||||
| vcvt.s32.f32 q1, q1, #16 | |||||
| b 6b | |||||
| endfunc | |||||
| function ff_int32_to_float_fmul_scalar_neon, export=1 /* writes (float)src[i] * mul, 8 elements per pass: r1 is the load pointer, r0 the store pointer */ | |||||
| VFP vdup.32 q0, d0[0] /* VFP-tagged lines (macro from asm.S, not shown here; presumably the hard-float case): multiplier is lane 0 of d0, broadcast to all of q0 */ | |||||
| VFP len .req r2 /* ...and the element count is in r2 */ | |||||
| NOVFP vdup.32 q0, r2 /* NOVFP-tagged lines: multiplier bits arrive in r2, broadcast to all of q0 */ | |||||
| NOVFP len .req r3 /* ...and the element count is in r3 */ | |||||
| vld1.32 {q1},[r1,:128]! /* prologue: load the first 4 ints (address hinted 128-bit aligned), post-increment r1 */ | |||||
| vcvt.f32.s32 q3, q1 /* signed 32-bit int -> float */ | |||||
| vld1.32 {q2},[r1,:128]! /* next 4 ints */ | |||||
| vcvt.f32.s32 q8, q2 | |||||
| 1: subs len, len, #8 /* account for the 8 elements already converted; Z set when none remain */ | |||||
| pld [r1, #16] /* prefetch hint for upcoming source data */ | |||||
| vmul.f32 q9, q3, q0 /* scale the converted batch by the multiplier */ | |||||
| vmul.f32 q10, q8, q0 | |||||
| beq 2f /* last batch: skip the look-ahead loads; the only exit is len == 0, so len must be a positive multiple of 8 */ | |||||
| vld1.32 {q1},[r1,:128]! /* load and convert the following batch before storing the current one */ | |||||
| vcvt.f32.s32 q3, q1 | |||||
| vld1.32 {q2},[r1,:128]! | |||||
| vcvt.f32.s32 q8, q2 | |||||
| vst1.32 {q9}, [r0,:128]! /* store the 8 scaled floats of the current batch */ | |||||
| vst1.32 {q10},[r0,:128]! | |||||
| b 1b | |||||
| 2: vst1.32 {q9}, [r0,:128]! /* store the final 8 results */ | |||||
| vst1.32 {q10},[r0,:128]! | |||||
| bx lr | |||||
| .unreq len /* drop the register alias set up by .req above */ | |||||
| endfunc | |||||
| @@ -0,0 +1,77 @@ | |||||
| /* | |||||
| * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "config.h" | |||||
| #include "asm.S" | |||||
| .syntax unified | |||||
| /** | |||||
| * ARM VFP optimized float to int16 conversion. | |||||
| * Assumes that len is a positive multiple of 8, that the destination | |||||
| * buffer is at least 4-byte aligned (8-byte alignment is better for | |||||
| * performance), and that the byte order is little-endian. | |||||
| */ | |||||
| @ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) | |||||
| function ff_float_to_int16_vfp, export=1 /* r0 = dst (int16), r1 = src (float), r2 = len; 8 samples per loop pass */ | |||||
| push {r4-r8,lr} /* r4-r8 and lr are all used as scratch below */ | |||||
| vpush {d8-d11} /* d8-d11 alias s16-s23, which hold the input floats below */ | |||||
| vldmia r1!, {s16-s23} /* prologue: load the first 8 floats, advancing r1 */ | |||||
| vcvt.s32.f32 s0, s16 /* float -> signed 32-bit int, results collected in s0-s7 */ | |||||
| vcvt.s32.f32 s1, s17 | |||||
| vcvt.s32.f32 s2, s18 | |||||
| vcvt.s32.f32 s3, s19 | |||||
| vcvt.s32.f32 s4, s20 | |||||
| vcvt.s32.f32 s5, s21 | |||||
| vcvt.s32.f32 s6, s22 | |||||
| vcvt.s32.f32 s7, s23 | |||||
| 1: | |||||
| subs r2, r2, #8 /* 8 samples are in flight; the flags set here drive every "gt" instruction below and the closing bgt */ | |||||
| vmov r3, r4, s0, s1 /* move the 8 converted ints into core registers */ | |||||
| vmov r5, r6, s2, s3 | |||||
| vmov r7, r8, s4, s5 | |||||
| vmov ip, lr, s6, s7 | |||||
| vldmiagt r1!, {s16-s23} /* if samples remain, start loading the next 8 floats */ | |||||
| ssat r4, #16, r4 /* saturate to the signed 16-bit range */ | |||||
| ssat r3, #16, r3 | |||||
| ssat r6, #16, r6 | |||||
| ssat r5, #16, r5 | |||||
| pkhbt r3, r3, r4, lsl #16 /* pack two samples per word: low half from r3, high half from r4 */ | |||||
| pkhbt r4, r5, r6, lsl #16 | |||||
| vcvtgt.s32.f32 s0, s16 /* if samples remain, convert the next batch while the current one is being packed */ | |||||
| vcvtgt.s32.f32 s1, s17 | |||||
| vcvtgt.s32.f32 s2, s18 | |||||
| vcvtgt.s32.f32 s3, s19 | |||||
| vcvtgt.s32.f32 s4, s20 | |||||
| vcvtgt.s32.f32 s5, s21 | |||||
| vcvtgt.s32.f32 s6, s22 | |||||
| vcvtgt.s32.f32 s7, s23 | |||||
| ssat r8, #16, r8 | |||||
| ssat r7, #16, r7 | |||||
| ssat lr, #16, lr | |||||
| ssat ip, #16, ip | |||||
| pkhbt r5, r7, r8, lsl #16 | |||||
| pkhbt r6, ip, lr, lsl #16 | |||||
| stmia r0!, {r3-r6} /* store 4 words = 8 packed int16 samples, advancing r0 */ | |||||
| bgt 1b /* loop while the count left by the subs above is still positive */ | |||||
| vpop {d8-d11} | |||||
| pop {r4-r8,pc} /* restore scratch registers and return by popping the saved lr into pc */ | |||||
| endfunc | |||||
| @@ -33,6 +33,7 @@ | |||||
| #include "get_bits.h" | #include "get_bits.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| extern const uint16_t ff_wma_critical_freqs[25]; | extern const uint16_t ff_wma_critical_freqs[25]; | ||||
| @@ -43,6 +44,7 @@ typedef struct { | |||||
| AVCodecContext *avctx; | AVCodecContext *avctx; | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| FmtConvertContext fmt_conv; | |||||
| int first; | int first; | ||||
| int channels; | int channels; | ||||
| int frame_len; ///< transform size (samples) | int frame_len; ///< transform size (samples) | ||||
| @@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx) | |||||
| s->avctx = avctx; | s->avctx = avctx; | ||||
| dsputil_init(&s->dsp, avctx); | dsputil_init(&s->dsp, avctx); | ||||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||||
| /* determine frame length */ | /* determine frame length */ | ||||
| if (avctx->sample_rate < 22050) { | if (avctx->sample_rate < 22050) { | ||||
| @@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct) | |||||
| ff_rdft_calc(&s->trans.rdft, coeffs); | ff_rdft_calc(&s->trans.rdft, coeffs); | ||||
| } | } | ||||
| s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels); | |||||
| s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, | |||||
| s->frame_len, s->channels); | |||||
| if (!s->first) { | if (!s->first) { | ||||
| int count = s->overlap_len * s->channels; | int count = s->overlap_len * s->channels; | ||||
| @@ -40,6 +40,7 @@ | |||||
| #include "dca.h" | #include "dca.h" | ||||
| #include "synth_filter.h" | #include "synth_filter.h" | ||||
| #include "dcadsp.h" | #include "dcadsp.h" | ||||
| #include "fmtconvert.h" | |||||
| //#define TRACE | //#define TRACE | ||||
| @@ -347,6 +348,7 @@ typedef struct { | |||||
| FFTContext imdct; | FFTContext imdct; | ||||
| SynthFilterContext synth; | SynthFilterContext synth; | ||||
| DCADSPContext dcadsp; | DCADSPContext dcadsp; | ||||
| FmtConvertContext fmt_conv; | |||||
| } DCAContext; | } DCAContext; | ||||
| static const uint16_t dca_vlc_offs[] = { | static const uint16_t dca_vlc_offs[] = { | ||||
| @@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index) | |||||
| block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); | block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); | ||||
| } | } | ||||
| s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l], | |||||
| s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l], | |||||
| block, rscale, 8); | block, rscale, 8); | ||||
| } | } | ||||
| @@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx, | |||||
| } | } | ||||
| } | } | ||||
| s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); | |||||
| s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); | |||||
| samples += 256 * channels; | samples += 256 * channels; | ||||
| } | } | ||||
| @@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx) | |||||
| ff_mdct_init(&s->imdct, 6, 1, 1.0); | ff_mdct_init(&s->imdct, 6, 1, 1.0); | ||||
| ff_synth_filter_init(&s->synth); | ff_synth_filter_init(&s->synth); | ||||
| ff_dcadsp_init(&s->dcadsp); | ff_dcadsp_init(&s->dcadsp); | ||||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||||
| for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) | for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) | ||||
| s->samples_chanptr[i] = s->samples + i * 256; | s->samples_chanptr[i] = s->samples + i * 256; | ||||
| @@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len) | |||||
| return p; | return p; | ||||
| } | } | ||||
| static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ | |||||
| int i; | |||||
| for(i=0; i<len; i++) | |||||
| dst[i] = src[i] * mul; | |||||
| } | |||||
| static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, | static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, | ||||
| uint32_t maxi, uint32_t maxisign) | uint32_t maxi, uint32_t maxisign) | ||||
| { | { | ||||
| @@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i | |||||
| } | } | ||||
| } | } | ||||
| static av_always_inline int float_to_int16_one(const float *src){ | |||||
| return av_clip_int16(lrintf(*src)); | |||||
| } | |||||
| static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){ | |||||
| int i; | |||||
| for(i=0; i<len; i++) | |||||
| dst[i] = float_to_int16_one(src+i); | |||||
| } | |||||
| static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){ | |||||
| int i,j,c; | |||||
| if(channels==2){ | |||||
| for(i=0; i<len; i++){ | |||||
| dst[2*i] = float_to_int16_one(src[0]+i); | |||||
| dst[2*i+1] = float_to_int16_one(src[1]+i); | |||||
| } | |||||
| }else{ | |||||
| for(c=0; c<channels; c++) | |||||
| for(i=0, j=c; i<len; i++, j+=channels) | |||||
| dst[j] = float_to_int16_one(src[c]+i); | |||||
| } | |||||
| } | |||||
| static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) | static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) | ||||
| { | { | ||||
| int res = 0; | int res = 0; | ||||
| @@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->vector_fmul_reverse = vector_fmul_reverse_c; | c->vector_fmul_reverse = vector_fmul_reverse_c; | ||||
| c->vector_fmul_add = vector_fmul_add_c; | c->vector_fmul_add = vector_fmul_add_c; | ||||
| c->vector_fmul_window = vector_fmul_window_c; | c->vector_fmul_window = vector_fmul_window_c; | ||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | |||||
| c->vector_clipf = vector_clipf_c; | c->vector_clipf = vector_clipf_c; | ||||
| c->float_to_int16 = ff_float_to_int16_c; | |||||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_c; | |||||
| c->scalarproduct_int16 = scalarproduct_int16_c; | c->scalarproduct_int16 = scalarproduct_int16_c; | ||||
| c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | ||||
| c->scalarproduct_float = scalarproduct_float_c; | c->scalarproduct_float = scalarproduct_float_c; | ||||
| @@ -392,7 +392,6 @@ typedef struct DSPContext { | |||||
| /* assume len is a multiple of 4, and arrays are 16-byte aligned */ | /* assume len is a multiple of 4, and arrays are 16-byte aligned */ | ||||
| void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); | void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); | ||||
| /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | ||||
| void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); | |||||
| void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); | void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); | ||||
| /** | /** | ||||
| * Multiply a vector of floats by a scalar float. Source and | * Multiply a vector of floats by a scalar float. Source and | ||||
| @@ -445,10 +444,6 @@ typedef struct DSPContext { | |||||
| */ | */ | ||||
| void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); | void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); | ||||
| /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ | |||||
| void (*float_to_int16)(int16_t *dst, const float *src, long len); | |||||
| void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); | |||||
| /* (I)DCT */ | /* (I)DCT */ | ||||
| void (*fdct)(DCTELEM *block/* align 16*/); | void (*fdct)(DCTELEM *block/* align 16*/); | ||||
| void (*fdct248)(DCTELEM *block/* align 16*/); | void (*fdct248)(DCTELEM *block/* align 16*/); | ||||
| @@ -0,0 +1,68 @@ | |||||
| /* | |||||
| * Format Conversion Utils | |||||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "avcodec.h" | |||||
| #include "fmtconvert.h" | |||||
| static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ /* C version: dst[i] = src[i] * mul for every i in [0, len) */ | |||||
| int i; | |||||
| for(i=0; i<len; i++) | |||||
| dst[i] = src[i] * mul; /* the int operand is converted to float by the multiplication with mul */ | |||||
| } | |||||
| static av_always_inline int float_to_int16_one(const float *src){ /* convert the single float at *src to an integer value for int16 output */ | |||||
| return av_clip_int16(lrintf(*src)); /* lrintf() rounds to an integer; av_clip_int16() is defined elsewhere (presumably clamps to the int16_t range - confirm) */ | |||||
| } | |||||
| static void float_to_int16_c(int16_t *dst, const float *src, long len) /* C version: convert len floats from src into dst, one sample at a time */ | |||||
| { | |||||
| int i; /* NOTE(review): int index is compared against a long len */ | |||||
| for(i=0; i<len; i++) | |||||
| dst[i] = float_to_int16_one(src+i); | |||||
| } | |||||
| static void float_to_int16_interleave_c(int16_t *dst, const float **src, /* C version: src[c] is the float array of channel c; dst receives the interleaved int16 samples */ | |||||
| long len, int channels) /* len is the number of samples taken from each channel */ | |||||
| { | |||||
| int i,j,c; /* i: sample index within a channel, j: index into dst, c: channel index */ | |||||
| if(channels==2){ /* dedicated two-channel path: both channels written in a single pass */ | |||||
| for(i=0; i<len; i++){ | |||||
| dst[2*i] = float_to_int16_one(src[0]+i); | |||||
| dst[2*i+1] = float_to_int16_one(src[1]+i); | |||||
| } | |||||
| }else{ /* any other channel count: one pass over dst per channel */ | |||||
| for(c=0; c<channels; c++) | |||||
| for(i=0, j=c; i<len; i++, j+=channels) /* channel c is written to dst[c], dst[c+channels], ... */ | |||||
| dst[j] = float_to_int16_one(src[c]+i); | |||||
| } | |||||
| } | |||||
| av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) /* fill the function pointers of *c; avctx is only forwarded to the per-architecture hooks */ | |||||
| { | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; /* start from the portable C implementations */ | |||||
| c->float_to_int16 = float_to_int16_c; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_c; | |||||
| if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); /* per-architecture hooks run after the defaults are set, presumably to replace them with optimized versions */ | |||||
| if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); | |||||
| if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); | |||||
| } | |||||
| @@ -0,0 +1,79 @@ | |||||
| /* | |||||
| * Format Conversion Utils | |||||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #ifndef AVCODEC_FMTCONVERT_H | |||||
| #define AVCODEC_FMTCONVERT_H | |||||
| #include "avcodec.h" | |||||
| typedef struct FmtConvertContext { /* table of sample-format conversion routines, filled in by ff_fmt_convert_init() */ | |||||
| /** | |||||
| * Convert an array of int32_t to float and multiply by a float value. | |||||
| * @param dst destination array of float. | |||||
| * constraints: 16-byte aligned | |||||
| * @param src source array of int32_t. | |||||
| * constraints: 16-byte aligned | |||||
| * @param mul value each converted element is multiplied by. | |||||
| * @param len number of elements to convert. | |||||
| * constraints: multiple of 8 | |||||
| */ | |||||
| void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); | |||||
| /** | |||||
| * Convert an array of float to an array of int16_t. | |||||
| * | |||||
| * Convert floats in the range [-32768.0,32767.0] to ints | |||||
| * without rescaling. | |||||
| * | |||||
| * @param dst destination array of int16_t. | |||||
| * constraints: 16-byte aligned | |||||
| * @param src source array of float. | |||||
| * constraints: 16-byte aligned | |||||
| * @param len number of elements to convert. | |||||
| * constraints: multiple of 8 | |||||
| */ | |||||
| void (*float_to_int16)(int16_t *dst, const float *src, long len); | |||||
| /** | |||||
| * Convert multiple arrays of float to an interleaved array of int16_t. | |||||
| * | |||||
| * Convert floats in the range [-32768.0,32767.0] to ints | |||||
| * without rescaling. | |||||
| * | |||||
| * @param dst destination array of interleaved int16_t. | |||||
| * constraints: 16-byte aligned | |||||
| * @param src source array of float arrays, one for each channel. | |||||
| * constraints: 16-byte aligned | |||||
| * @param len number of elements to convert from each channel. | |||||
| * constraints: multiple of 8 | |||||
| * @param channels number of channels | |||||
| */ | |||||
| void (*float_to_int16_interleave)(int16_t *dst, const float **src, | |||||
| long len, int channels); | |||||
| } FmtConvertContext; | |||||
| void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); /* generic entry point: sets every pointer in *c */ | |||||
| void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); /* invoked from ff_fmt_convert_init() under ARCH_ARM */ | |||||
| void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); /* invoked from ff_fmt_convert_init() under ARCH_PPC */ | |||||
| void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); /* invoked from ff_fmt_convert_init() under HAVE_MMX */ | |||||
| #endif /* AVCODEC_FMTCONVERT_H */ | |||||
| @@ -38,6 +38,7 @@ | |||||
| #include "avcodec.h" | #include "avcodec.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| #define ALT_BITSTREAM_READER_LE | #define ALT_BITSTREAM_READER_LE | ||||
| #include "get_bits.h" | #include "get_bits.h" | ||||
| @@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext { | |||||
| float scale_bias; | float scale_bias; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| FFTContext imdct_ctx; | FFTContext imdct_ctx; | ||||
| FmtConvertContext fmt_conv; | |||||
| DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; | DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; | ||||
| } NellyMoserDecodeContext; | } NellyMoserDecodeContext; | ||||
| @@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) { | |||||
| ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); | ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); | ||||
| dsputil_init(&s->dsp, avctx); | dsputil_init(&s->dsp, avctx); | ||||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||||
| s->scale_bias = 1.0/(1*8); | s->scale_bias = 1.0/(1*8); | ||||
| @@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx, | |||||
| for (i=0 ; i<blocks ; i++) { | for (i=0 ; i<blocks ; i++) { | ||||
| nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); | nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); | ||||
| s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); | |||||
| s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); | |||||
| *data_size += NELLY_SAMPLES*sizeof(int16_t); | *data_size += NELLY_SAMPLES*sizeof(int16_t); | ||||
| } | } | ||||
| @@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \ | |||||
| OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \ | OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \ | ||||
| ppc/fdct_altivec.o \ | ppc/fdct_altivec.o \ | ||||
| ppc/float_altivec.o \ | ppc/float_altivec.o \ | ||||
| ppc/fmtconvert_altivec.o \ | |||||
| ppc/gmc_altivec.o \ | ppc/gmc_altivec.o \ | ||||
| ppc/idct_altivec.o \ | ppc/idct_altivec.o \ | ||||
| ppc/int_altivec.o \ | ppc/int_altivec.o \ | ||||
| @@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa | |||||
| } | } | ||||
| } | } | ||||
| static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| union { | |||||
| vector float v; | |||||
| float s[4]; | |||||
| } mul_u; | |||||
| int i; | |||||
| vector float src1, src2, dst1, dst2, mul_v, zero; | |||||
| zero = (vector float)vec_splat_u32(0); | |||||
| mul_u.s[0] = mul; | |||||
| mul_v = vec_splat(mul_u.v, 0); | |||||
| for(i=0; i<len; i+=8) { | |||||
| src1 = vec_ctf(vec_ld(0, src+i), 0); | |||||
| src2 = vec_ctf(vec_ld(16, src+i), 0); | |||||
| dst1 = vec_madd(src1, mul_v, zero); | |||||
| dst2 = vec_madd(src2, mul_v, zero); | |||||
| vec_st(dst1, 0, dst+i); | |||||
| vec_st(dst2, 16, dst+i); | |||||
| } | |||||
| } | |||||
| static vector signed short | |||||
| float_to_int16_one_altivec(const float *src) | |||||
| { | |||||
| vector float s0 = vec_ld(0, src); | |||||
| vector float s1 = vec_ld(16, src); | |||||
| vector signed int t0 = vec_cts(s0, 0); | |||||
| vector signed int t1 = vec_cts(s1, 0); | |||||
| return vec_packs(t0,t1); | |||||
| } | |||||
| static void float_to_int16_altivec(int16_t *dst, const float *src, long len) | |||||
| { | |||||
| int i; | |||||
| vector signed short d0, d1, d; | |||||
| vector unsigned char align; | |||||
| if(((long)dst)&15) //FIXME | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d0 = vec_ld(0, dst+i); | |||||
| d = float_to_int16_one_altivec(src+i); | |||||
| d1 = vec_ld(15, dst+i); | |||||
| d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); | |||||
| align = vec_lvsr(0, dst+i); | |||||
| d0 = vec_perm(d1, d, align); | |||||
| d1 = vec_perm(d, d1, align); | |||||
| vec_st(d0, 0, dst+i); | |||||
| vec_st(d1,15, dst+i); | |||||
| } | |||||
| else | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d = float_to_int16_one_altivec(src+i); | |||||
| vec_st(d, 0, dst+i); | |||||
| } | |||||
| } | |||||
| static void | |||||
| float_to_int16_interleave_altivec(int16_t *dst, const float **src, | |||||
| long len, int channels) | |||||
| { | |||||
| int i; | |||||
| vector signed short d0, d1, d2, c0, c1, t0, t1; | |||||
| vector unsigned char align; | |||||
| if(channels == 1) | |||||
| float_to_int16_altivec(dst, src[0], len); | |||||
| else | |||||
| if (channels == 2) { | |||||
| if(((long)dst)&15) | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d0 = vec_ld(0, dst + i); | |||||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||||
| d1 = vec_ld(31, dst + i); | |||||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||||
| c0 = vec_mergeh(t0, t1); | |||||
| c1 = vec_mergel(t0, t1); | |||||
| d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); | |||||
| align = vec_lvsr(0, dst + i); | |||||
| d0 = vec_perm(d2, c0, align); | |||||
| d1 = vec_perm(c0, c1, align); | |||||
| vec_st(d0, 0, dst + i); | |||||
| d0 = vec_perm(c1, d2, align); | |||||
| vec_st(d1, 15, dst + i); | |||||
| vec_st(d0, 31, dst + i); | |||||
| dst+=8; | |||||
| } | |||||
| else | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||||
| d0 = vec_mergeh(t0, t1); | |||||
| d1 = vec_mergel(t0, t1); | |||||
| vec_st(d0, 0, dst + i); | |||||
| vec_st(d1, 16, dst + i); | |||||
| dst+=8; | |||||
| } | |||||
| } else { | |||||
| DECLARE_ALIGNED(16, int16_t, tmp)[len]; | |||||
| int c, j; | |||||
| for (c = 0; c < channels; c++) { | |||||
| float_to_int16_altivec(tmp, src[c], len); | |||||
| for (i = 0, j = c; i < len; i++, j+=channels) { | |||||
| dst[j] = tmp[i]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void float_init_altivec(DSPContext* c, AVCodecContext *avctx) | void float_init_altivec(DSPContext* c, AVCodecContext *avctx) | ||||
| { | { | ||||
| c->vector_fmul = vector_fmul_altivec; | c->vector_fmul = vector_fmul_altivec; | ||||
| c->vector_fmul_reverse = vector_fmul_reverse_altivec; | c->vector_fmul_reverse = vector_fmul_reverse_altivec; | ||||
| c->vector_fmul_add = vector_fmul_add_altivec; | c->vector_fmul_add = vector_fmul_add_altivec; | ||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; | |||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { | if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { | ||||
| c->vector_fmul_window = vector_fmul_window_altivec; | c->vector_fmul_window = vector_fmul_window_altivec; | ||||
| c->float_to_int16 = float_to_int16_altivec; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_altivec; | |||||
| } | } | ||||
| } | } | ||||
| @@ -0,0 +1,142 @@ | |||||
| /* | |||||
| * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "libavcodec/fmtconvert.h" | |||||
| #include "dsputil_altivec.h" | |||||
| #include "util_altivec.h" | |||||
| static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| union { | |||||
| vector float v; | |||||
| float s[4]; | |||||
| } mul_u; | |||||
| int i; | |||||
| vector float src1, src2, dst1, dst2, mul_v, zero; | |||||
| zero = (vector float)vec_splat_u32(0); | |||||
| mul_u.s[0] = mul; | |||||
| mul_v = vec_splat(mul_u.v, 0); | |||||
| for(i=0; i<len; i+=8) { | |||||
| src1 = vec_ctf(vec_ld(0, src+i), 0); | |||||
| src2 = vec_ctf(vec_ld(16, src+i), 0); | |||||
| dst1 = vec_madd(src1, mul_v, zero); | |||||
| dst2 = vec_madd(src2, mul_v, zero); | |||||
| vec_st(dst1, 0, dst+i); | |||||
| vec_st(dst2, 16, dst+i); | |||||
| } | |||||
| } | |||||
| static vector signed short | |||||
| float_to_int16_one_altivec(const float *src) | |||||
| { | |||||
| vector float s0 = vec_ld(0, src); | |||||
| vector float s1 = vec_ld(16, src); | |||||
| vector signed int t0 = vec_cts(s0, 0); | |||||
| vector signed int t1 = vec_cts(s1, 0); | |||||
| return vec_packs(t0,t1); | |||||
| } | |||||
| static void float_to_int16_altivec(int16_t *dst, const float *src, long len) | |||||
| { | |||||
| int i; | |||||
| vector signed short d0, d1, d; | |||||
| vector unsigned char align; | |||||
| if(((long)dst)&15) //FIXME | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d0 = vec_ld(0, dst+i); | |||||
| d = float_to_int16_one_altivec(src+i); | |||||
| d1 = vec_ld(15, dst+i); | |||||
| d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); | |||||
| align = vec_lvsr(0, dst+i); | |||||
| d0 = vec_perm(d1, d, align); | |||||
| d1 = vec_perm(d, d1, align); | |||||
| vec_st(d0, 0, dst+i); | |||||
| vec_st(d1,15, dst+i); | |||||
| } | |||||
| else | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d = float_to_int16_one_altivec(src+i); | |||||
| vec_st(d, 0, dst+i); | |||||
| } | |||||
| } | |||||
| static void | |||||
| float_to_int16_interleave_altivec(int16_t *dst, const float **src, | |||||
| long len, int channels) | |||||
| { | |||||
| int i; | |||||
| vector signed short d0, d1, d2, c0, c1, t0, t1; | |||||
| vector unsigned char align; | |||||
| if(channels == 1) | |||||
| float_to_int16_altivec(dst, src[0], len); | |||||
| else | |||||
| if (channels == 2) { | |||||
| if(((long)dst)&15) | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| d0 = vec_ld(0, dst + i); | |||||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||||
| d1 = vec_ld(31, dst + i); | |||||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||||
| c0 = vec_mergeh(t0, t1); | |||||
| c1 = vec_mergel(t0, t1); | |||||
| d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); | |||||
| align = vec_lvsr(0, dst + i); | |||||
| d0 = vec_perm(d2, c0, align); | |||||
| d1 = vec_perm(c0, c1, align); | |||||
| vec_st(d0, 0, dst + i); | |||||
| d0 = vec_perm(c1, d2, align); | |||||
| vec_st(d1, 15, dst + i); | |||||
| vec_st(d0, 31, dst + i); | |||||
| dst+=8; | |||||
| } | |||||
| else | |||||
| for(i=0; i<len-7; i+=8) { | |||||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||||
| d0 = vec_mergeh(t0, t1); | |||||
| d1 = vec_mergel(t0, t1); | |||||
| vec_st(d0, 0, dst + i); | |||||
| vec_st(d1, 16, dst + i); | |||||
| dst+=8; | |||||
| } | |||||
| } else { | |||||
| DECLARE_ALIGNED(16, int16_t, tmp)[len]; | |||||
| int c, j; | |||||
| for (c = 0; c < channels; c++) { | |||||
| float_to_int16_altivec(tmp, src[c], len); | |||||
| for (i = 0, j = c; i < len; i++, j+=channels) { | |||||
| dst[j] = tmp[i]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) | |||||
| { | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; | |||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||||
| c->float_to_int16 = float_to_int16_altivec; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_altivec; | |||||
| } | |||||
| } | |||||
| @@ -31,6 +31,7 @@ | |||||
| #include "get_bits.h" | #include "get_bits.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| #include "vorbis.h" | #include "vorbis.h" | ||||
| #include "xiph.h" | #include "xiph.h" | ||||
| @@ -127,6 +128,7 @@ typedef struct vorbis_context_s { | |||||
| AVCodecContext *avccontext; | AVCodecContext *avccontext; | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| FmtConvertContext fmt_conv; | |||||
| FFTContext mdct[2]; | FFTContext mdct[2]; | ||||
| uint_fast8_t first_frame; | uint_fast8_t first_frame; | ||||
| @@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext) | |||||
| vc->avccontext = avccontext; | vc->avccontext = avccontext; | ||||
| dsputil_init(&vc->dsp, avccontext); | dsputil_init(&vc->dsp, avccontext); | ||||
| ff_fmt_convert_init(&vc->fmt_conv, avccontext); | |||||
| vc->scale_bias = 32768.0f; | vc->scale_bias = 32768.0f; | ||||
| @@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext, | |||||
| len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; | len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; | ||||
| } | } | ||||
| vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); | |||||
| vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len, | |||||
| vc->audio_channels); | |||||
| *data_size = len * 2 * vc->audio_channels; | *data_size = len * 2 * vc->audio_channels; | ||||
| return buf_size ; | return buf_size ; | ||||
| @@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2) | |||||
| s->block_align = avctx->block_align; | s->block_align = avctx->block_align; | ||||
| dsputil_init(&s->dsp, avctx); | dsputil_init(&s->dsp, avctx); | ||||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||||
| if (avctx->codec->id == CODEC_ID_WMAV1) { | if (avctx->codec->id == CODEC_ID_WMAV1) { | ||||
| s->version = 1; | s->version = 1; | ||||
| @@ -26,6 +26,7 @@ | |||||
| #include "put_bits.h" | #include "put_bits.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "fft.h" | #include "fft.h" | ||||
| #include "fmtconvert.h" | |||||
| /* size of blocks */ | /* size of blocks */ | ||||
| #define BLOCK_MIN_BITS 7 | #define BLOCK_MIN_BITS 7 | ||||
| @@ -134,6 +135,7 @@ typedef struct WMACodecContext { | |||||
| float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; | float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; | ||||
| float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; | float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| FmtConvertContext fmt_conv; | |||||
| #ifdef TRACE | #ifdef TRACE | ||||
| int frame_count; | int frame_count; | ||||
| @@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples) | |||||
| incr = s->nb_channels; | incr = s->nb_channels; | ||||
| for (ch = 0; ch < MAX_CHANNELS; ch++) | for (ch = 0; ch < MAX_CHANNELS; ch++) | ||||
| output[ch] = s->frame_out[ch]; | output[ch] = s->frame_out[ch]; | ||||
| s->dsp.float_to_int16_interleave(samples, output, n, incr); | |||||
| s->fmt_conv.float_to_int16_interleave(samples, output, n, incr); | |||||
| for (ch = 0; ch < incr; ch++) { | for (ch = 0; ch < incr; ch++) { | ||||
| /* prepare for next block */ | /* prepare for next block */ | ||||
| memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); | memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); | ||||
| @@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o | |||||
| MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o | MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o | ||||
| MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ | MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ | ||||
| x86/deinterlace.o \ | x86/deinterlace.o \ | ||||
| x86/fmtconvert.o \ | |||||
| x86/h264_chromamc.o \ | x86/h264_chromamc.o \ | ||||
| $(YASM-OBJS-yes) | $(YASM-OBJS-yes) | ||||
| @@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o | |||||
| OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ | OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ | ||||
| x86/dsputil_mmx.o \ | x86/dsputil_mmx.o \ | ||||
| x86/fdct_mmx.o \ | x86/fdct_mmx.o \ | ||||
| x86/fmtconvert_mmx.o \ | |||||
| x86/idct_mmx_xvid.o \ | x86/idct_mmx_xvid.o \ | ||||
| x86/idct_sse2_xvid.o \ | x86/idct_sse2_xvid.o \ | ||||
| x86/motion_est_mmx.o \ | x86/motion_est_mmx.o \ | ||||
| @@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s | |||||
| } | } | ||||
| #endif /* HAVE_6REGS */ | #endif /* HAVE_6REGS */ | ||||
| static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| x86_reg i = -4*len; | |||||
| __asm__ volatile( | |||||
| "movss %3, %%xmm4 \n" | |||||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||||
| "1: \n" | |||||
| "cvtpi2ps (%2,%0), %%xmm0 \n" | |||||
| "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |||||
| "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |||||
| "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |||||
| "movlhps %%xmm1, %%xmm0 \n" | |||||
| "movlhps %%xmm3, %%xmm2 \n" | |||||
| "mulps %%xmm4, %%xmm0 \n" | |||||
| "mulps %%xmm4, %%xmm2 \n" | |||||
| "movaps %%xmm0, (%1,%0) \n" | |||||
| "movaps %%xmm2, 16(%1,%0) \n" | |||||
| "add $32, %0 \n" | |||||
| "jl 1b \n" | |||||
| :"+r"(i) | |||||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||||
| ); | |||||
| } | |||||
| static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| x86_reg i = -4*len; | |||||
| __asm__ volatile( | |||||
| "movss %3, %%xmm4 \n" | |||||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||||
| "1: \n" | |||||
| "cvtdq2ps (%2,%0), %%xmm0 \n" | |||||
| "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |||||
| "mulps %%xmm4, %%xmm0 \n" | |||||
| "mulps %%xmm4, %%xmm1 \n" | |||||
| "movaps %%xmm0, (%1,%0) \n" | |||||
| "movaps %%xmm1, 16(%1,%0) \n" | |||||
| "add $32, %0 \n" | |||||
| "jl 1b \n" | |||||
| :"+r"(i) | |||||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||||
| ); | |||||
| } | |||||
| static void vector_clipf_sse(float *dst, const float *src, float min, float max, | static void vector_clipf_sse(float *dst, const float *src, float min, float max, | ||||
| int len) | int len) | ||||
| { | { | ||||
| @@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, | |||||
| ); | ); | ||||
| } | } | ||||
| static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| // not bit-exact: pf2id uses different rounding than C and SSE | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "pf2id (%2,%0,2) , %%mm0 \n\t" | |||||
| "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |||||
| "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |||||
| "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |||||
| "packssdw %%mm1 , %%mm0 \n\t" | |||||
| "packssdw %%mm3 , %%mm2 \n\t" | |||||
| "movq %%mm0 , (%1,%0) \n\t" | |||||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| "femms \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
| static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |||||
| "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |||||
| "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |||||
| "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |||||
| "packssdw %%mm1 , %%mm0 \n\t" | |||||
| "packssdw %%mm3 , %%mm2 \n\t" | |||||
| "movq %%mm0 , (%1,%0) \n\t" | |||||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| "emms \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
| static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |||||
| "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |||||
| "packssdw %%xmm1 , %%xmm0 \n\t" | |||||
| "movdqa %%xmm0 , (%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
| void ff_vp3_idct_mmx(int16_t *input_data); | void ff_vp3_idct_mmx(int16_t *input_data); | ||||
| void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| @@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data); | |||||
| void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); | |||||
| void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | |||||
| void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | |||||
| int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); | int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); | int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | ||||
| @@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const | |||||
| int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | ||||
| int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | ||||
| #if !HAVE_YASM | |||||
| #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) | |||||
| #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||||
| #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||||
| #endif | |||||
| #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse | |||||
| #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ | |||||
| /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ | |||||
| static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||||
| DECLARE_ALIGNED(16, int16_t, tmp)[len];\ | |||||
| int i,j,c;\ | |||||
| for(c=0; c<channels; c++){\ | |||||
| float_to_int16_##cpu(tmp, src[c], len);\ | |||||
| for(i=0, j=c; i<len; i++, j+=channels)\ | |||||
| dst[j] = tmp[i];\ | |||||
| }\ | |||||
| }\ | |||||
| \ | |||||
| static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||||
| if(channels==1)\ | |||||
| float_to_int16_##cpu(dst, src[0], len);\ | |||||
| else if(channels==2){\ | |||||
| x86_reg reglen = len; \ | |||||
| const float *src0 = src[0];\ | |||||
| const float *src1 = src[1];\ | |||||
| __asm__ volatile(\ | |||||
| "shl $2, %0 \n"\ | |||||
| "add %0, %1 \n"\ | |||||
| "add %0, %2 \n"\ | |||||
| "add %0, %3 \n"\ | |||||
| "neg %0 \n"\ | |||||
| body\ | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ | |||||
| );\ | |||||
| }else if(channels==6){\ | |||||
| ff_float_to_int16_interleave6_##cpu(dst, src, len);\ | |||||
| }else\ | |||||
| float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ | |||||
| } | |||||
| FLOAT_TO_INT16_INTERLEAVE(3dnow, | |||||
| "1: \n" | |||||
| "pf2id (%2,%0), %%mm0 \n" | |||||
| "pf2id 8(%2,%0), %%mm1 \n" | |||||
| "pf2id (%3,%0), %%mm2 \n" | |||||
| "pf2id 8(%3,%0), %%mm3 \n" | |||||
| "packssdw %%mm1, %%mm0 \n" | |||||
| "packssdw %%mm3, %%mm2 \n" | |||||
| "movq %%mm0, %%mm1 \n" | |||||
| "punpcklwd %%mm2, %%mm0 \n" | |||||
| "punpckhwd %%mm2, %%mm1 \n" | |||||
| "movq %%mm0, (%1,%0)\n" | |||||
| "movq %%mm1, 8(%1,%0)\n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| "femms \n" | |||||
| ) | |||||
| FLOAT_TO_INT16_INTERLEAVE(sse, | |||||
| "1: \n" | |||||
| "cvtps2pi (%2,%0), %%mm0 \n" | |||||
| "cvtps2pi 8(%2,%0), %%mm1 \n" | |||||
| "cvtps2pi (%3,%0), %%mm2 \n" | |||||
| "cvtps2pi 8(%3,%0), %%mm3 \n" | |||||
| "packssdw %%mm1, %%mm0 \n" | |||||
| "packssdw %%mm3, %%mm2 \n" | |||||
| "movq %%mm0, %%mm1 \n" | |||||
| "punpcklwd %%mm2, %%mm0 \n" | |||||
| "punpckhwd %%mm2, %%mm1 \n" | |||||
| "movq %%mm0, (%1,%0)\n" | |||||
| "movq %%mm1, 8(%1,%0)\n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| "emms \n" | |||||
| ) | |||||
| FLOAT_TO_INT16_INTERLEAVE(sse2, | |||||
| "1: \n" | |||||
| "cvtps2dq (%2,%0), %%xmm0 \n" | |||||
| "cvtps2dq (%3,%0), %%xmm1 \n" | |||||
| "packssdw %%xmm1, %%xmm0 \n" | |||||
| "movhlps %%xmm0, %%xmm1 \n" | |||||
| "punpcklwd %%xmm1, %%xmm0 \n" | |||||
| "movdqa %%xmm0, (%1,%0) \n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| ) | |||||
| static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ | |||||
| if(channels==6) | |||||
| ff_float_to_int16_interleave6_3dn2(dst, src, len); | |||||
| else | |||||
| float_to_int16_interleave_3dnow(dst, src, len, channels); | |||||
| } | |||||
| float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | ||||
| void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | ||||
| @@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| if(mm_flags & AV_CPU_FLAG_3DNOW){ | if(mm_flags & AV_CPU_FLAG_3DNOW){ | ||||
| c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | ||||
| c->vector_fmul = vector_fmul_3dnow; | c->vector_fmul = vector_fmul_3dnow; | ||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||||
| c->float_to_int16 = float_to_int16_3dnow; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |||||
| } | |||||
| } | } | ||||
| if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | ||||
| c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | ||||
| #if HAVE_6REGS | #if HAVE_6REGS | ||||
| c->vector_fmul_window = vector_fmul_window_3dnow2; | c->vector_fmul_window = vector_fmul_window_3dnow2; | ||||
| #endif | #endif | ||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |||||
| } | |||||
| } | } | ||||
| if(mm_flags & AV_CPU_FLAG_MMX2){ | if(mm_flags & AV_CPU_FLAG_MMX2){ | ||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| @@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| #if HAVE_6REGS | #if HAVE_6REGS | ||||
| c->vector_fmul_window = vector_fmul_window_sse; | c->vector_fmul_window = vector_fmul_window_sse; | ||||
| #endif | #endif | ||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |||||
| c->vector_clipf = vector_clipf_sse; | c->vector_clipf = vector_clipf_sse; | ||||
| c->float_to_int16 = float_to_int16_sse; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_sse; | |||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| c->scalarproduct_float = ff_scalarproduct_float_sse; | c->scalarproduct_float = ff_scalarproduct_float_sse; | ||||
| #endif | #endif | ||||
| @@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| if(mm_flags & AV_CPU_FLAG_3DNOW) | if(mm_flags & AV_CPU_FLAG_3DNOW) | ||||
| c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse | c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse | ||||
| if(mm_flags & AV_CPU_FLAG_SSE2){ | if(mm_flags & AV_CPU_FLAG_SSE2){ | ||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |||||
| c->float_to_int16 = float_to_int16_sse2; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | ||||
| c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | ||||
| @@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | |||||
| section .text align=16 | section .text align=16 | ||||
| %macro PSWAPD_SSE 2 | |||||
| pshufw %1, %2, 0x4e | |||||
| %endmacro | |||||
| %macro PSWAPD_3DN1 2 | |||||
| movq %1, %2 | |||||
| psrlq %1, 32 | |||||
| punpckldq %1, %2 | |||||
| %endmacro | |||||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | |||||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | |||||
| %ifdef ARCH_X86_64 | |||||
| %define lend r10d | |||||
| mov lend, r2d | |||||
| %else | |||||
| %define lend dword r2m | |||||
| %endif | |||||
| mov src1q, [srcq+1*gprsize] | |||||
| mov src2q, [srcq+2*gprsize] | |||||
| mov src3q, [srcq+3*gprsize] | |||||
| mov src4q, [srcq+4*gprsize] | |||||
| mov src5q, [srcq+5*gprsize] | |||||
| mov srcq, [srcq] | |||||
| sub src1q, srcq | |||||
| sub src2q, srcq | |||||
| sub src3q, srcq | |||||
| sub src4q, srcq | |||||
| sub src5q, srcq | |||||
| .loop: | |||||
| cvtps2pi mm0, [srcq] | |||||
| cvtps2pi mm1, [srcq+src1q] | |||||
| cvtps2pi mm2, [srcq+src2q] | |||||
| cvtps2pi mm3, [srcq+src3q] | |||||
| cvtps2pi mm4, [srcq+src4q] | |||||
| cvtps2pi mm5, [srcq+src5q] | |||||
| packssdw mm0, mm3 | |||||
| packssdw mm1, mm4 | |||||
| packssdw mm2, mm5 | |||||
| pswapd mm3, mm0 | |||||
| punpcklwd mm0, mm1 | |||||
| punpckhwd mm1, mm2 | |||||
| punpcklwd mm2, mm3 | |||||
| pswapd mm3, mm0 | |||||
| punpckldq mm0, mm2 | |||||
| punpckhdq mm2, mm1 | |||||
| punpckldq mm1, mm3 | |||||
| movq [dstq ], mm0 | |||||
| movq [dstq+16], mm2 | |||||
| movq [dstq+ 8], mm1 | |||||
| add srcq, 8 | |||||
| add dstq, 24 | |||||
| sub lend, 2 | |||||
| jg .loop | |||||
| emms | |||||
| RET | |||||
| %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |||||
| %define pswapd PSWAPD_SSE | |||||
| FLOAT_TO_INT16_INTERLEAVE6 sse | |||||
| %define cvtps2pi pf2id | |||||
| %define pswapd PSWAPD_3DN1 | |||||
| FLOAT_TO_INT16_INTERLEAVE6 3dnow | |||||
| %undef pswapd | |||||
| FLOAT_TO_INT16_INTERLEAVE6 3dn2 | |||||
| %undef cvtps2pi | |||||
| %macro SCALARPRODUCT 1 | %macro SCALARPRODUCT 1 | ||||
| ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) | ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) | ||||
| cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift | cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift | ||||
| @@ -0,0 +1,91 @@ | |||||
| ;****************************************************************************** | |||||
| ;* x86 optimized Format Conversion Utils | |||||
| ;* Copyright (c) 2008 Loren Merritt | |||||
| ;* | |||||
| ;* This file is part of FFmpeg. | |||||
| ;* | |||||
| ;* FFmpeg is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with FFmpeg; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "x86inc.asm" | |||||
| section .text align=16 | |||||
| %macro PSWAPD_SSE 2 | |||||
| pshufw %1, %2, 0x4e | |||||
| %endmacro | |||||
| %macro PSWAPD_3DN1 2 | |||||
| movq %1, %2 | |||||
| psrlq %1, 32 | |||||
| punpckldq %1, %2 | |||||
| %endmacro | |||||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | |||||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | |||||
| %ifdef ARCH_X86_64 | |||||
| %define lend r10d | |||||
| mov lend, r2d | |||||
| %else | |||||
| %define lend dword r2m | |||||
| %endif | |||||
| mov src1q, [srcq+1*gprsize] | |||||
| mov src2q, [srcq+2*gprsize] | |||||
| mov src3q, [srcq+3*gprsize] | |||||
| mov src4q, [srcq+4*gprsize] | |||||
| mov src5q, [srcq+5*gprsize] | |||||
| mov srcq, [srcq] | |||||
| sub src1q, srcq | |||||
| sub src2q, srcq | |||||
| sub src3q, srcq | |||||
| sub src4q, srcq | |||||
| sub src5q, srcq | |||||
| .loop: | |||||
| cvtps2pi mm0, [srcq] | |||||
| cvtps2pi mm1, [srcq+src1q] | |||||
| cvtps2pi mm2, [srcq+src2q] | |||||
| cvtps2pi mm3, [srcq+src3q] | |||||
| cvtps2pi mm4, [srcq+src4q] | |||||
| cvtps2pi mm5, [srcq+src5q] | |||||
| packssdw mm0, mm3 | |||||
| packssdw mm1, mm4 | |||||
| packssdw mm2, mm5 | |||||
| pswapd mm3, mm0 | |||||
| punpcklwd mm0, mm1 | |||||
| punpckhwd mm1, mm2 | |||||
| punpcklwd mm2, mm3 | |||||
| pswapd mm3, mm0 | |||||
| punpckldq mm0, mm2 | |||||
| punpckhdq mm2, mm1 | |||||
| punpckldq mm1, mm3 | |||||
| movq [dstq ], mm0 | |||||
| movq [dstq+16], mm2 | |||||
| movq [dstq+ 8], mm1 | |||||
| add srcq, 8 | |||||
| add dstq, 24 | |||||
| sub lend, 2 | |||||
| jg .loop | |||||
| emms | |||||
| RET | |||||
| %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |||||
| %define pswapd PSWAPD_SSE | |||||
| FLOAT_TO_INT16_INTERLEAVE6 sse | |||||
| %define cvtps2pi pf2id | |||||
| %define pswapd PSWAPD_3DN1 | |||||
| FLOAT_TO_INT16_INTERLEAVE6 3dnow | |||||
| %undef pswapd | |||||
| FLOAT_TO_INT16_INTERLEAVE6 3dn2 | |||||
| %undef cvtps2pi | |||||
| @@ -0,0 +1,266 @@ | |||||
| /* | |||||
| * Format Conversion Utils | |||||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| * | |||||
| * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |||||
| */ | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavutil/x86_cpu.h" | |||||
| #include "libavcodec/fmtconvert.h" | |||||
| static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| x86_reg i = -4*len; | |||||
| __asm__ volatile( | |||||
| "movss %3, %%xmm4 \n" | |||||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||||
| "1: \n" | |||||
| "cvtpi2ps (%2,%0), %%xmm0 \n" | |||||
| "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |||||
| "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |||||
| "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |||||
| "movlhps %%xmm1, %%xmm0 \n" | |||||
| "movlhps %%xmm3, %%xmm2 \n" | |||||
| "mulps %%xmm4, %%xmm0 \n" | |||||
| "mulps %%xmm4, %%xmm2 \n" | |||||
| "movaps %%xmm0, (%1,%0) \n" | |||||
| "movaps %%xmm2, 16(%1,%0) \n" | |||||
| "add $32, %0 \n" | |||||
| "jl 1b \n" | |||||
| :"+r"(i) | |||||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||||
| ); | |||||
| } | |||||
| static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |||||
| { | |||||
| x86_reg i = -4*len; | |||||
| __asm__ volatile( | |||||
| "movss %3, %%xmm4 \n" | |||||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||||
| "1: \n" | |||||
| "cvtdq2ps (%2,%0), %%xmm0 \n" | |||||
| "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |||||
| "mulps %%xmm4, %%xmm0 \n" | |||||
| "mulps %%xmm4, %%xmm1 \n" | |||||
| "movaps %%xmm0, (%1,%0) \n" | |||||
| "movaps %%xmm1, 16(%1,%0) \n" | |||||
| "add $32, %0 \n" | |||||
| "jl 1b \n" | |||||
| :"+r"(i) | |||||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||||
| ); | |||||
| } | |||||
| static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| // not bit-exact: pf2id uses different rounding than C and SSE | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "pf2id (%2,%0,2) , %%mm0 \n\t" | |||||
| "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |||||
| "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |||||
| "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |||||
| "packssdw %%mm1 , %%mm0 \n\t" | |||||
| "packssdw %%mm3 , %%mm2 \n\t" | |||||
| "movq %%mm0 , (%1,%0) \n\t" | |||||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| "femms \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
| static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |||||
| "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |||||
| "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |||||
| "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |||||
| "packssdw %%mm1 , %%mm0 \n\t" | |||||
| "packssdw %%mm3 , %%mm2 \n\t" | |||||
| "movq %%mm0 , (%1,%0) \n\t" | |||||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| "emms \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
| static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |||||
| x86_reg reglen = len; | |||||
| __asm__ volatile( | |||||
| "add %0 , %0 \n\t" | |||||
| "lea (%2,%0,2) , %2 \n\t" | |||||
| "add %0 , %1 \n\t" | |||||
| "neg %0 \n\t" | |||||
| "1: \n\t" | |||||
| "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |||||
| "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |||||
| "packssdw %%xmm1 , %%xmm0 \n\t" | |||||
| "movdqa %%xmm0 , (%1,%0) \n\t" | |||||
| "add $16 , %0 \n\t" | |||||
| " js 1b \n\t" | |||||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||||
| ); | |||||
| } | |||||
/*
 * Six-channel interleaving converters.
 *
 * With yasm the three functions are only declared here (presumably they are
 * provided by the yasm-assembled sources -- confirm against the build).
 * Without yasm the same names become macros that forward to the generic
 * float_to_int16_interleave_misc_* helpers with a channel count of 6.
 */
#if HAVE_YASM
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
#else
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* There is no dedicated SSE2 variant; the SSE2 name aliases the SSE one. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
/*
 * FLOAT_TO_INT16_INTERLEAVE(cpu, body) defines two functions for one CPU
 * flavour:
 *
 *  float_to_int16_interleave_misc_<cpu>(dst, src, len, channels)
 *      Generic path: converts each channel into a temporary buffer with
 *      float_to_int16_<cpu>() and scatters it into dst with a stride of
 *      `channels`. The temporary is a variable-length array of len int16_t
 *      on the stack, so len must stay small enough for the stack.
 *
 *  float_to_int16_interleave_<cpu>(dst, src, len, channels)
 *      Dispatcher: 1 channel  -> float_to_int16_<cpu>(),
 *                  2 channels -> inline asm loop supplied as `body`,
 *                  6 channels -> ff_float_to_int16_interleave6_<cpu>(),
 *                  otherwise  -> the generic path above.
 *
 * `body` is the asm loop for the stereo case. Before it runs, %0 holds
 * -4*len (a negative byte offset counting up to zero), and %1 (dst),
 * %2 (src[0]) and %3 (src[1]) each point 4*len bytes past their start.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* the asm reads both sources and writes dst behind the compiler's back */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
| FLOAT_TO_INT16_INTERLEAVE(3dnow, | |||||
| "1: \n" | |||||
| "pf2id (%2,%0), %%mm0 \n" | |||||
| "pf2id 8(%2,%0), %%mm1 \n" | |||||
| "pf2id (%3,%0), %%mm2 \n" | |||||
| "pf2id 8(%3,%0), %%mm3 \n" | |||||
| "packssdw %%mm1, %%mm0 \n" | |||||
| "packssdw %%mm3, %%mm2 \n" | |||||
| "movq %%mm0, %%mm1 \n" | |||||
| "punpcklwd %%mm2, %%mm0 \n" | |||||
| "punpckhwd %%mm2, %%mm1 \n" | |||||
| "movq %%mm0, (%1,%0)\n" | |||||
| "movq %%mm1, 8(%1,%0)\n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| "femms \n" | |||||
| ) | |||||
| FLOAT_TO_INT16_INTERLEAVE(sse, | |||||
| "1: \n" | |||||
| "cvtps2pi (%2,%0), %%mm0 \n" | |||||
| "cvtps2pi 8(%2,%0), %%mm1 \n" | |||||
| "cvtps2pi (%3,%0), %%mm2 \n" | |||||
| "cvtps2pi 8(%3,%0), %%mm3 \n" | |||||
| "packssdw %%mm1, %%mm0 \n" | |||||
| "packssdw %%mm3, %%mm2 \n" | |||||
| "movq %%mm0, %%mm1 \n" | |||||
| "punpcklwd %%mm2, %%mm0 \n" | |||||
| "punpckhwd %%mm2, %%mm1 \n" | |||||
| "movq %%mm0, (%1,%0)\n" | |||||
| "movq %%mm1, 8(%1,%0)\n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| "emms \n" | |||||
| ) | |||||
| FLOAT_TO_INT16_INTERLEAVE(sse2, | |||||
| "1: \n" | |||||
| "cvtps2dq (%2,%0), %%xmm0 \n" | |||||
| "cvtps2dq (%3,%0), %%xmm1 \n" | |||||
| "packssdw %%xmm1, %%xmm0 \n" | |||||
| "movhlps %%xmm0, %%xmm1 \n" | |||||
| "punpcklwd %%xmm1, %%xmm0 \n" | |||||
| "movdqa %%xmm0, (%1,%0) \n" | |||||
| "add $16, %0 \n" | |||||
| "js 1b \n" | |||||
| ) | |||||
/**
 * Interleaving float -> int16 conversion for CPUs with extended 3DNow!.
 *
 * Only the six-channel case has a dedicated routine; every other channel
 * count is forwarded to the plain 3DNow! dispatcher.
 *
 * @param dst       interleaved int16_t output
 * @param src       array of per-channel float input pointers
 * @param len       number of samples per channel
 * @param channels  number of channels
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels!=6){
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
| void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) | |||||
| { | |||||
| int mm_flags = av_get_cpu_flags(); | |||||
| if (mm_flags & AV_CPU_FLAG_MMX) { | |||||
| if(mm_flags & AV_CPU_FLAG_3DNOW){ | |||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||||
| c->float_to_int16 = float_to_int16_3dnow; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |||||
| } | |||||
| } | |||||
| if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | |||||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |||||
| } | |||||
| } | |||||
| if(mm_flags & AV_CPU_FLAG_SSE){ | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |||||
| c->float_to_int16 = float_to_int16_sse; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_sse; | |||||
| } | |||||
| if(mm_flags & AV_CPU_FLAG_SSE2){ | |||||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |||||
| c->float_to_int16 = float_to_int16_sse2; | |||||
| c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |||||
| } | |||||
| } | |||||
| } | |||||