This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.
Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672)
tags/n0.8
| @@ -12,6 +12,7 @@ OBJS = allcodecs.o \ | |||
| bitstream_filter.o \ | |||
| dsputil.o \ | |||
| faanidct.o \ | |||
| fmtconvert.o \ | |||
| imgconvert.o \ | |||
| jrevdct.o \ | |||
| opt.o \ | |||
| @@ -35,6 +35,7 @@ | |||
| #include "fft.h" | |||
| #include "mpeg4audio.h" | |||
| #include "sbr.h" | |||
| #include "fmtconvert.h" | |||
| #include <stdint.h> | |||
| @@ -268,6 +269,7 @@ typedef struct { | |||
| FFTContext mdct; | |||
| FFTContext mdct_small; | |||
| DSPContext dsp; | |||
| FmtConvertContext fmt_conv; | |||
| int random_state; | |||
| /** @} */ | |||
| @@ -85,6 +85,7 @@ | |||
| #include "get_bits.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| #include "lpc.h" | |||
| #include "aac.h" | |||
| @@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx) | |||
| ff_aac_sbr_init(); | |||
| dsputil_init(&ac->dsp, avctx); | |||
| ff_fmt_convert_init(&ac->fmt_conv, avctx); | |||
| ac->random_state = 0x1f2e3d4c; | |||
| @@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data, | |||
| *data_size = data_size_tmp; | |||
| if (samples) | |||
| ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); | |||
| ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); | |||
| if (ac->output_configured) | |||
| ac->output_configured = OC_LOCKED; | |||
| @@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) | |||
| ff_mdct_init(&s->imdct_512, 9, 1, 1.0); | |||
| ff_kbd_window_init(s->window, 5.0, 256); | |||
| dsputil_init(&s->dsp, avctx); | |||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||
| av_lfg_init(&s->dith_state, 0); | |||
| /* set scale value for float to int16 conversion */ | |||
| @@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) | |||
| } else { | |||
| gain *= s->dynamic_range[0]; | |||
| } | |||
| s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); | |||
| s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); | |||
| } | |||
| /* apply spectral extension to high frequency bins */ | |||
| @@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, | |||
| av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); | |||
| err = 1; | |||
| } | |||
| s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels); | |||
| s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels); | |||
| out_samples += 256 * s->out_channels; | |||
| } | |||
| *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); | |||
| @@ -55,6 +55,7 @@ | |||
| #include "get_bits.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| /* override ac3.h to include coupling channel */ | |||
| #undef AC3_MAX_CHANNELS | |||
| @@ -190,6 +191,7 @@ typedef struct { | |||
| ///@defgroup opt optimization | |||
| DSPContext dsp; ///< for optimization | |||
| FmtConvertContext fmt_conv; ///< optimized conversion functions | |||
| float mul_bias; ///< scaling for float_to_int16 conversion | |||
| ///@} | |||
| @@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o | |||
| OBJS += arm/dsputil_init_arm.o \ | |||
| arm/dsputil_arm.o \ | |||
| arm/fft_init_arm.o \ | |||
| arm/fmtconvert_init_arm.o \ | |||
| arm/jrevdct_arm.o \ | |||
| arm/mpegvideo_arm.o \ | |||
| arm/simple_idct_arm.o \ | |||
| @@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ | |||
| arm/dsputil_armv6.o \ | |||
| arm/simple_idct_armv6.o \ | |||
| VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ | |||
| OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ | |||
| arm/dsputil_init_vfp.o \ | |||
| $(VFP-OBJS-yes) | |||
| OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ | |||
| arm/mpegvideo_iwmmxt.o \ | |||
| @@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ | |||
| OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ | |||
| arm/dsputil_neon.o \ | |||
| arm/fmtconvert_neon.o \ | |||
| arm/int_neon.o \ | |||
| arm/mpegvideo_neon.o \ | |||
| arm/simple_idct_neon.o \ | |||
| @@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, | |||
| int len); | |||
| void ff_butterflies_float_neon(float *v1, float *v2, int len); | |||
| float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); | |||
| void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, | |||
| float mul, int len); | |||
| void ff_vector_fmul_reverse_neon(float *dst, const float *src0, | |||
| const float *src1, int len); | |||
| void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, | |||
| @@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, | |||
| void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, | |||
| int len); | |||
| void ff_float_to_int16_neon(int16_t *, const float *, long); | |||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | |||
| void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); | |||
| @@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||
| c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; | |||
| c->butterflies_float = ff_butterflies_float_neon; | |||
| c->scalarproduct_float = ff_scalarproduct_float_neon; | |||
| c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; | |||
| c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; | |||
| c->vector_fmul_add = ff_vector_fmul_add_neon; | |||
| c->vector_clipf = ff_vector_clipf_neon; | |||
| @@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||
| c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; | |||
| c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; | |||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| c->float_to_int16 = ff_float_to_int16_neon; | |||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; | |||
| } | |||
| if (CONFIG_VORBIS_DECODER) | |||
| c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; | |||
| @@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, | |||
| const float *src1, int len); | |||
| void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, | |||
| const float *src1, int len); | |||
| void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); | |||
/*
 * Point the DSPContext entries vector_fmul and vector_fmul_reverse at the
 * VFP assembly routines. As shown here, float_to_int16 is also pointed at
 * the VFP routine when HAVE_ARMV6 is non-zero. avctx is not read.
 *
 * NOTE(review): the hunk header above this function (13 old lines -> 9 new
 * lines) suggests the HAVE_ARMV6 section is among the lines the commit
 * removes; the +/- markers are not visible here -- confirm against the
 * original diff.
 */
| void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| c->vector_fmul = ff_vector_fmul_vfp; | |||
| c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; | |||
| #if HAVE_ARMV6 | |||
| c->float_to_int16 = ff_float_to_int16_vfp; | |||
| #endif | |||
| } | |||
| @@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 | |||
| bx lr | |||
| endfunc | |||
| function ff_float_to_int16_neon, export=1 | |||
| subs r2, r2, #8 | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vcvt.s32.f32 q8, q0, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vcvt.s32.f32 q9, q1, #16 | |||
| beq 3f | |||
| bics ip, r2, #15 | |||
| beq 2f | |||
| 1: subs ip, ip, #16 | |||
| vshrn.s32 d4, q8, #16 | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vshrn.s32 d5, q9, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vshrn.s32 d6, q0, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| vshrn.s32 d7, q1, #16 | |||
| vld1.64 {d16-d17},[r1,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r1,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.64 {d6-d7}, [r0,:128]! | |||
| bne 1b | |||
| ands r2, r2, #15 | |||
| beq 3f | |||
| 2: vld1.64 {d0-d1}, [r1,:128]! | |||
| vshrn.s32 d4, q8, #16 | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vshrn.s32 d5, q9, #16 | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vshrn.s32 d6, q0, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| vshrn.s32 d7, q1, #16 | |||
| vst1.64 {d6-d7}, [r0,:128]! | |||
| bx lr | |||
| 3: vshrn.s32 d4, q8, #16 | |||
| vshrn.s32 d5, q9, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| bx lr | |||
| endfunc | |||
| function ff_float_to_int16_interleave_neon, export=1 | |||
| cmp r3, #2 | |||
| ldrlt r1, [r1] | |||
| blt ff_float_to_int16_neon | |||
| bne 4f | |||
| ldr r3, [r1] | |||
| ldr r1, [r1, #4] | |||
| subs r2, r2, #8 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q8, q0, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q9, q1, #16 | |||
| vld1.64 {d20-d21},[r1,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r1,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| beq 3f | |||
| bics ip, r2, #15 | |||
| beq 2f | |||
| 1: subs ip, ip, #16 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 q10, q8, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vld1.64 {d24-d25},[r1,:128]! | |||
| vcvt.s32.f32 q12, q12, #16 | |||
| vld1.64 {d26-d27},[r1,:128]! | |||
| vsri.32 q11, q9, #16 | |||
| vst1.64 {d20-d21},[r0,:128]! | |||
| vcvt.s32.f32 q13, q13, #16 | |||
| vst1.64 {d22-d23},[r0,:128]! | |||
| vsri.32 q12, q0, #16 | |||
| vld1.64 {d16-d17},[r3,:128]! | |||
| vsri.32 q13, q1, #16 | |||
| vst1.64 {d24-d25},[r0,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r3,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r1,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r1,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.64 {d26-d27},[r0,:128]! | |||
| bne 1b | |||
| ands r2, r2, #15 | |||
| beq 3f | |||
| 2: vsri.32 q10, q8, #16 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vld1.64 {d24-d25},[r1,:128]! | |||
| vcvt.s32.f32 q12, q12, #16 | |||
| vsri.32 q11, q9, #16 | |||
| vld1.64 {d26-d27},[r1,:128]! | |||
| vcvt.s32.f32 q13, q13, #16 | |||
| vst1.64 {d20-d21},[r0,:128]! | |||
| vsri.32 q12, q0, #16 | |||
| vst1.64 {d22-d23},[r0,:128]! | |||
| vsri.32 q13, q1, #16 | |||
| vst1.64 {d24-d27},[r0,:128]! | |||
| bx lr | |||
| 3: vsri.32 q10, q8, #16 | |||
| vsri.32 q11, q9, #16 | |||
| vst1.64 {d20-d23},[r0,:128]! | |||
| bx lr | |||
| 4: push {r4-r8,lr} | |||
| cmp r3, #4 | |||
| lsl ip, r3, #1 | |||
| blt 4f | |||
| @ 4 channels | |||
| 5: ldmia r1!, {r4-r7} | |||
| mov lr, r2 | |||
| mov r8, r0 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r6,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r7,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| 6: subs lr, lr, #8 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 q9, q8, #16 | |||
| vld1.64 {d2-d3}, [r5,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vsri.32 q11, q10, #16 | |||
| vld1.64 {d4-d5}, [r6,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vzip.32 d18, d22 | |||
| vld1.64 {d6-d7}, [r7,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vzip.32 d19, d23 | |||
| vst1.64 {d18}, [r8], ip | |||
| vsri.32 q1, q0, #16 | |||
| vst1.64 {d22}, [r8], ip | |||
| vsri.32 q3, q2, #16 | |||
| vst1.64 {d19}, [r8], ip | |||
| vzip.32 d2, d6 | |||
| vst1.64 {d23}, [r8], ip | |||
| vzip.32 d3, d7 | |||
| beq 7f | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.64 {d2}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.64 {d6}, [r8], ip | |||
| vld1.64 {d20-d21},[r6,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.64 {d3}, [r8], ip | |||
| vld1.64 {d22-d23},[r7,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.64 {d7}, [r8], ip | |||
| b 6b | |||
| 7: vst1.64 {d2}, [r8], ip | |||
| vst1.64 {d6}, [r8], ip | |||
| vst1.64 {d3}, [r8], ip | |||
| vst1.64 {d7}, [r8], ip | |||
| subs r3, r3, #4 | |||
| popeq {r4-r8,pc} | |||
| cmp r3, #4 | |||
| add r0, r0, #8 | |||
| bge 5b | |||
| @ 2 channels | |||
| 4: cmp r3, #2 | |||
| blt 4f | |||
| ldmia r1!, {r4-r5} | |||
| mov lr, r2 | |||
| mov r8, r0 | |||
| tst lr, #8 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| beq 6f | |||
| subs lr, lr, #8 | |||
| beq 7f | |||
| vsri.32 d18, d16, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| 6: subs lr, lr, #16 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 d18, d16, #16 | |||
| vld1.64 {d2-d3}, [r5,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vld1.64 {d4-d5}, [r4,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vld1.64 {d6-d7}, [r5,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vsri.32 d2, d0, #16 | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vsri.32 d3, d1, #16 | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vsri.32 d6, d4, #16 | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vsri.32 d7, d5, #16 | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
| beq 6f | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.32 {d2[0]}, [r8], ip | |||
| vst1.32 {d2[1]}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.32 {d3[0]}, [r8], ip | |||
| vst1.32 {d3[1]}, [r8], ip | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.32 {d6[0]}, [r8], ip | |||
| vst1.32 {d6[1]}, [r8], ip | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.32 {d7[0]}, [r8], ip | |||
| vst1.32 {d7[1]}, [r8], ip | |||
| bgt 6b | |||
| 6: vst1.32 {d2[0]}, [r8], ip | |||
| vst1.32 {d2[1]}, [r8], ip | |||
| vst1.32 {d3[0]}, [r8], ip | |||
| vst1.32 {d3[1]}, [r8], ip | |||
| vst1.32 {d6[0]}, [r8], ip | |||
| vst1.32 {d6[1]}, [r8], ip | |||
| vst1.32 {d7[0]}, [r8], ip | |||
| vst1.32 {d7[1]}, [r8], ip | |||
| b 8f | |||
| 7: vsri.32 d18, d16, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
| 8: subs r3, r3, #2 | |||
| add r0, r0, #4 | |||
| popeq {r4-r8,pc} | |||
| @ 1 channel | |||
| 4: ldr r4, [r1],#4 | |||
| tst r2, #8 | |||
| mov lr, r2 | |||
| mov r5, r0 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| bne 8f | |||
| 6: subs lr, lr, #16 | |||
| vld1.64 {d4-d5}, [r4,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vld1.64 {d6-d7}, [r4,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vst1.16 {d0[1]}, [r5,:16], ip | |||
| vst1.16 {d0[3]}, [r5,:16], ip | |||
| vst1.16 {d1[1]}, [r5,:16], ip | |||
| vst1.16 {d1[3]}, [r5,:16], ip | |||
| vst1.16 {d2[1]}, [r5,:16], ip | |||
| vst1.16 {d2[3]}, [r5,:16], ip | |||
| vst1.16 {d3[1]}, [r5,:16], ip | |||
| vst1.16 {d3[3]}, [r5,:16], ip | |||
| beq 7f | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| 7: vst1.16 {d4[1]}, [r5,:16], ip | |||
| vst1.16 {d4[3]}, [r5,:16], ip | |||
| vst1.16 {d5[1]}, [r5,:16], ip | |||
| vst1.16 {d5[3]}, [r5,:16], ip | |||
| vst1.16 {d6[1]}, [r5,:16], ip | |||
| vst1.16 {d6[3]}, [r5,:16], ip | |||
| vst1.16 {d7[1]}, [r5,:16], ip | |||
| vst1.16 {d7[3]}, [r5,:16], ip | |||
| bgt 6b | |||
| pop {r4-r8,pc} | |||
| 8: subs lr, lr, #8 | |||
| vst1.16 {d0[1]}, [r5,:16], ip | |||
| vst1.16 {d0[3]}, [r5,:16], ip | |||
| vst1.16 {d1[1]}, [r5,:16], ip | |||
| vst1.16 {d1[3]}, [r5,:16], ip | |||
| vst1.16 {d2[1]}, [r5,:16], ip | |||
| vst1.16 {d2[3]}, [r5,:16], ip | |||
| vst1.16 {d3[1]}, [r5,:16], ip | |||
| vst1.16 {d3[3]}, [r5,:16], ip | |||
| popeq {r4-r8,pc} | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| b 6b | |||
| endfunc | |||
| function ff_vector_fmul_neon, export=1 | |||
| subs r3, r3, #8 | |||
| vld1.64 {d0-d3}, [r1,:128]! | |||
| @@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] | |||
| bx lr | |||
| endfunc | |||
/*
 * ff_int32_to_float_fmul_scalar_neon: convert 32-bit signed integers to
 * float and multiply every element by one scalar.
 *
 * Integers are read through r1 and floats are written through r0, both with
 * a :128 alignment qualifier, eight elements per loop iteration. Eight
 * elements are loaded before the count is tested and the loop only exits
 * when the count reaches exactly zero, so len is presumably required to be
 * a positive multiple of 8 -- TODO confirm with callers.
 *
 * The VFP / NOVFP line prefixes are macros defined outside this view; they
 * presumably pick where the scalar and len arrive (d0[0] and r2, or r2 and
 * r3) depending on the float ABI -- confirm in asm.S.
 */
| function ff_int32_to_float_fmul_scalar_neon, export=1 | |||
/* Broadcast the scalar to all four lanes of q0 and name the count register. */
| VFP vdup.32 q0, d0[0] | |||
| VFP len .req r2 | |||
| NOVFP vdup.32 q0, r2 | |||
| NOVFP len .req r3 | |||
/* Prologue: load and convert the first eight integers into q3/q8. */
| vld1.32 {q1},[r1,:128]! | |||
| vcvt.f32.s32 q3, q1 | |||
| vld1.32 {q2},[r1,:128]! | |||
| vcvt.f32.s32 q8, q2 | |||
/* Main loop: multiply the converted eight, then fetch the next eight
   before storing, unless the count just reached zero. */
| 1: subs len, len, #8 | |||
| pld [r1, #16] | |||
| vmul.f32 q9, q3, q0 | |||
| vmul.f32 q10, q8, q0 | |||
| beq 2f | |||
| vld1.32 {q1},[r1,:128]! | |||
| vcvt.f32.s32 q3, q1 | |||
| vld1.32 {q2},[r1,:128]! | |||
| vcvt.f32.s32 q8, q2 | |||
| vst1.32 {q9}, [r0,:128]! | |||
| vst1.32 {q10},[r0,:128]! | |||
| b 1b | |||
/* Final eight results: store without loading any further input. */
| 2: vst1.32 {q9}, [r0,:128]! | |||
| vst1.32 {q10},[r0,:128]! | |||
| bx lr | |||
| .unreq len | |||
| endfunc | |||
| function ff_vector_fmul_reverse_neon, export=1 | |||
| add r2, r2, r3, lsl #2 | |||
| sub r2, r2, #32 | |||
| @@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 | |||
| vpop {d8-d15} | |||
| bx lr | |||
| endfunc | |||
| #if HAVE_ARMV6 | |||
| /** | |||
| * ARM VFP optimized float to int16 conversion. | |||
| * Assumes that len is a positive multiple of 8, that the destination | |||
| * buffer is at least 4-byte aligned (8-byte alignment is better for | |||
| * performance), and that the target is little-endian. | |||
| */ | |||
| @ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len) | |||
/*
 * ff_float_to_int16_vfp: convert floats read through r1 into saturated
 * 16-bit integers written through r0, eight samples per loop iteration;
 * r2 holds the remaining sample count.
 *
 * The loop is software-pipelined: the next eight floats are loaded and
 * converted (only while the count is still positive, via the "gt"
 * conditional forms) while the current eight are saturated and packed.
 * All of the "gt" instructions and the closing bgt test the flags set by
 * the single subs at the top of the loop, so nothing in between may alter
 * the flags.
 */
| function ff_float_to_int16_vfp, export=1 | |||
/* Save the core registers used as scratch, and d8-d11, which overlap the
   s16-s23 registers used as the load buffer below. */
| push {r4-r8,lr} | |||
| vpush {d8-d11} | |||
/* Prologue: load eight floats and convert them to 32-bit integers. */
| vldmia r1!, {s16-s23} | |||
| vcvt.s32.f32 s0, s16 | |||
| vcvt.s32.f32 s1, s17 | |||
| vcvt.s32.f32 s2, s18 | |||
| vcvt.s32.f32 s3, s19 | |||
| vcvt.s32.f32 s4, s20 | |||
| vcvt.s32.f32 s5, s21 | |||
| vcvt.s32.f32 s6, s22 | |||
| vcvt.s32.f32 s7, s23 | |||
| 1: | |||
| subs r2, r2, #8 | |||
/* Move the eight converted integers into core registers. */
| vmov r3, r4, s0, s1 | |||
| vmov r5, r6, s2, s3 | |||
| vmov r7, r8, s4, s5 | |||
| vmov ip, lr, s6, s7 | |||
/* Fetch the next eight floats only if samples remain. */
| vldmiagt r1!, {s16-s23} | |||
/* Saturate to the signed 16-bit range and pack pairs into one word, the
   earlier sample in the low halfword. */
| ssat r4, #16, r4 | |||
| ssat r3, #16, r3 | |||
| ssat r6, #16, r6 | |||
| ssat r5, #16, r5 | |||
| pkhbt r3, r3, r4, lsl #16 | |||
| pkhbt r4, r5, r6, lsl #16 | |||
/* Convert the next batch (skipped on the last iteration). */
| vcvtgt.s32.f32 s0, s16 | |||
| vcvtgt.s32.f32 s1, s17 | |||
| vcvtgt.s32.f32 s2, s18 | |||
| vcvtgt.s32.f32 s3, s19 | |||
| vcvtgt.s32.f32 s4, s20 | |||
| vcvtgt.s32.f32 s5, s21 | |||
| vcvtgt.s32.f32 s6, s22 | |||
| vcvtgt.s32.f32 s7, s23 | |||
| ssat r8, #16, r8 | |||
| ssat r7, #16, r7 | |||
| ssat lr, #16, lr | |||
| ssat ip, #16, ip | |||
| pkhbt r5, r7, r8, lsl #16 | |||
| pkhbt r6, ip, lr, lsl #16 | |||
/* Store four packed words = eight int16 samples. */
| stmia r0!, {r3-r6} | |||
| bgt 1b | |||
| vpop {d8-d11} | |||
| pop {r4-r8,pc} | |||
| endfunc | |||
| #endif | |||
| @@ -0,0 +1,48 @@ | |||
| /* | |||
| * ARM optimized Format Conversion Utils | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include <stdint.h> | |||
| #include "libavcodec/avcodec.h" | |||
| #include "libavcodec/fmtconvert.h" | |||
| void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, | |||
| float mul, int len); | |||
| void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); | |||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | |||
| void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); | |||
/**
 * Install ARM-specific implementations into a FmtConvertContext.
 *
 * When both HAVE_ARMVFP and HAVE_ARMV6 are non-zero, float_to_int16 is
 * pointed at the VFP routine. When HAVE_NEON is non-zero the NEON routines
 * are installed afterwards, so the NEON float_to_int16 replaces the VFP one
 * unless CODEC_FLAG_BITEXACT is set in avctx->flags.
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx codec context; only avctx->flags is read here
 */
| void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) | |||
| { | |||
| if (HAVE_ARMVFP && HAVE_ARMV6) { | |||
| c->float_to_int16 = ff_float_to_int16_vfp; | |||
| } | |||
| if (HAVE_NEON) { | |||
/* The int32 -> float routine is installed regardless of the bit-exact flag. */
| c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; | |||
/* The NEON float -> int16 routines are skipped when bit-exact output is
   requested. NOTE(review): presumably because their rounding differs
   from the C reference -- confirm. */
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| c->float_to_int16 = ff_float_to_int16_neon; | |||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,391 @@ | |||
| /* | |||
| * ARM NEON optimised Format Conversion Utils | |||
| * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "config.h" | |||
| #include "asm.S" | |||
| preserve8 | |||
| .text | |||
/*
 * ff_float_to_int16_neon: convert a run of floats to int16.
 *
 * Floats are read through r1 and int16 results written through r0; r2 is
 * the sample count (this matches the argument order of the C prototype
 * dst, src, len). Every load and store carries a :128 alignment qualifier.
 *
 * Each float is converted to signed fixed point with 16 fraction bits
 * (vcvt ... #16) and the upper 16 bits of that value are kept (vshrn #16).
 *
 * Eight samples are loaded before the count is examined and the tails
 * handle exactly 8 or 16 samples, so len is presumably required to be a
 * positive multiple of 8 -- TODO confirm with callers.
 */
| function ff_float_to_int16_neon, export=1 | |||
/* r2 = samples left after the eight preloaded into q8/q9 below. */
| subs r2, r2, #8 | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vcvt.s32.f32 q8, q0, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vcvt.s32.f32 q9, q1, #16 | |||
/* Nothing left beyond the preloaded eight: go store them. */
| beq 3f | |||
/* ip = remaining count with the low four bits cleared; zero means fewer
   than 16 remain, so skip the main loop. */
| bics ip, r2, #15 | |||
| beq 2f | |||
/* Main loop: store 16 samples and preload the next eight per iteration. */
| 1: subs ip, ip, #16 | |||
| vshrn.s32 d4, q8, #16 | |||
| vld1.64 {d0-d1}, [r1,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vshrn.s32 d5, q9, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vshrn.s32 d6, q0, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| vshrn.s32 d7, q1, #16 | |||
| vld1.64 {d16-d17},[r1,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r1,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.64 {d6-d7}, [r0,:128]! | |||
| bne 1b | |||
/* Leftover after the 16-sample loop: zero means only the preloaded eight
   are still pending. */
| ands r2, r2, #15 | |||
| beq 3f | |||
/* Tail: store the preloaded eight plus eight more loaded here. */
| 2: vld1.64 {d0-d1}, [r1,:128]! | |||
| vshrn.s32 d4, q8, #16 | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r1,:128]! | |||
| vshrn.s32 d5, q9, #16 | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vshrn.s32 d6, q0, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| vshrn.s32 d7, q1, #16 | |||
| vst1.64 {d6-d7}, [r0,:128]! | |||
| bx lr | |||
/* Tail: store only the preloaded eight. */
| 3: vshrn.s32 d4, q8, #16 | |||
| vshrn.s32 d5, q9, #16 | |||
| vst1.64 {d4-d5}, [r0,:128]! | |||
| bx lr | |||
| endfunc | |||
/*
 * ff_float_to_int16_interleave_neon: convert several planar float channels
 * to one interleaved int16 stream.
 *
 * r0 is the output pointer, r1 points to an array of per-channel input
 * pointers, r2 is the per-channel sample count and r3 the channel count
 * (this matches the argument order of the C prototype).
 *
 * Dispatch:
 *   - fewer than 2 channels: tail-call ff_float_to_int16_neon on the first
 *     channel pointer;
 *   - exactly 2 channels: dedicated path with contiguous 128-bit stores;
 *   - otherwise: consume channels in groups of 4, then 2, then 1, writing
 *     each group with a stride of (channels * 2) bytes between frames.
 *
 * Conversion is vcvt to signed fixed point with 16 fraction bits, after
 * which only the upper 16 bits of each 32-bit lane are used. vsri #16
 * merges two channels into one lane: the destination keeps its own upper
 * half and receives the other register's upper half in its lower half.
 *
 * Input loads carry a :128 alignment qualifier. Samples are preloaded
 * before counts are tested and handled in batches of 8 or 16, so the
 * sample count is presumably required to be a positive multiple of 8 --
 * TODO confirm with callers.
 *
 * Numeric local labels (4, 6, 7, 8) are reused; "4f"/"6f" always mean the
 * next such label further down. Branches placed after runs of NEON
 * loads/stores test flags set by the preceding subs/tst, so the statement
 * order must not be disturbed.
 */
| function ff_float_to_int16_interleave_neon, export=1 | |||
/* Fewer than two channels: hand the first channel to the mono routine. */
| cmp r3, #2 | |||
| ldrlt r1, [r1] | |||
| blt ff_float_to_int16_neon | |||
/* More than two channels: generic grouped path. */
| bne 4f | |||
/* Exactly two channels: r3 = first channel pointer, r1 = second. */
| ldr r3, [r1] | |||
| ldr r1, [r1, #4] | |||
/* r2 = frames left after the eight preloaded from each channel. */
| subs r2, r2, #8 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q8, q0, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q9, q1, #16 | |||
| vld1.64 {d20-d21},[r1,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r1,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| beq 3f | |||
/* ip = remaining frames with the low four bits cleared; zero skips the
   16-frame loop. */
| bics ip, r2, #15 | |||
| beq 2f | |||
/* Stereo main loop: store 16 frames, preload the next eight. */
| 1: subs ip, ip, #16 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 q10, q8, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vld1.64 {d24-d25},[r1,:128]! | |||
| vcvt.s32.f32 q12, q12, #16 | |||
| vld1.64 {d26-d27},[r1,:128]! | |||
| vsri.32 q11, q9, #16 | |||
| vst1.64 {d20-d21},[r0,:128]! | |||
| vcvt.s32.f32 q13, q13, #16 | |||
| vst1.64 {d22-d23},[r0,:128]! | |||
| vsri.32 q12, q0, #16 | |||
| vld1.64 {d16-d17},[r3,:128]! | |||
| vsri.32 q13, q1, #16 | |||
| vst1.64 {d24-d25},[r0,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r3,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r1,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r1,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.64 {d26-d27},[r0,:128]! | |||
| bne 1b | |||
| ands r2, r2, #15 | |||
| beq 3f | |||
/* Stereo tail: the preloaded eight frames plus eight more loaded here. */
| 2: vsri.32 q10, q8, #16 | |||
| vld1.64 {d0-d1}, [r3,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r3,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vld1.64 {d24-d25},[r1,:128]! | |||
| vcvt.s32.f32 q12, q12, #16 | |||
| vsri.32 q11, q9, #16 | |||
| vld1.64 {d26-d27},[r1,:128]! | |||
| vcvt.s32.f32 q13, q13, #16 | |||
| vst1.64 {d20-d21},[r0,:128]! | |||
| vsri.32 q12, q0, #16 | |||
| vst1.64 {d22-d23},[r0,:128]! | |||
| vsri.32 q13, q1, #16 | |||
| vst1.64 {d24-d27},[r0,:128]! | |||
| bx lr | |||
/* Stereo tail: only the preloaded eight frames. */
| 3: vsri.32 q10, q8, #16 | |||
| vsri.32 q11, q9, #16 | |||
| vst1.64 {d20-d23},[r0,:128]! | |||
| bx lr | |||
/* Generic path. ip = channels * 2, the byte distance between the same
   channel in consecutive output frames. */
| 4: push {r4-r8,lr} | |||
| cmp r3, #4 | |||
| lsl ip, r3, #1 | |||
| blt 4f | |||
| @ 4 channels | |||
/* Take the next four channel pointers; lr counts frames, r8 walks the
   output for this group. */
| 5: ldmia r1!, {r4-r7} | |||
| mov lr, r2 | |||
| mov r8, r0 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r6,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r7,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
/* Eight frames per iteration: vsri pairs the channels, vzip.32 gathers
   the two pairs of one frame into a single d register, and each 64-bit
   store writes one 4-channel frame. */
| 6: subs lr, lr, #8 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 q9, q8, #16 | |||
| vld1.64 {d2-d3}, [r5,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vsri.32 q11, q10, #16 | |||
| vld1.64 {d4-d5}, [r6,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vzip.32 d18, d22 | |||
| vld1.64 {d6-d7}, [r7,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vzip.32 d19, d23 | |||
| vst1.64 {d18}, [r8], ip | |||
| vsri.32 q1, q0, #16 | |||
| vst1.64 {d22}, [r8], ip | |||
| vsri.32 q3, q2, #16 | |||
| vst1.64 {d19}, [r8], ip | |||
| vzip.32 d2, d6 | |||
| vst1.64 {d23}, [r8], ip | |||
| vzip.32 d3, d7 | |||
/* Frame count exhausted: store the last four frames without preloading. */
| beq 7f | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.64 {d2}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.64 {d6}, [r8], ip | |||
| vld1.64 {d20-d21},[r6,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.64 {d3}, [r8], ip | |||
| vld1.64 {d22-d23},[r7,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.64 {d7}, [r8], ip | |||
| b 6b | |||
| 7: vst1.64 {d2}, [r8], ip | |||
| vst1.64 {d6}, [r8], ip | |||
| vst1.64 {d3}, [r8], ip | |||
| vst1.64 {d7}, [r8], ip | |||
/* Four channels done: return if none remain, otherwise move the output
   base past them (4 * 2 bytes) and repeat while at least four are left. */
| subs r3, r3, #4 | |||
| popeq {r4-r8,pc} | |||
| cmp r3, #4 | |||
| add r0, r0, #8 | |||
| bge 5b | |||
| @ 2 channels | |||
/* Fewer than two channels left: go to the single-channel code. */
| 4: cmp r3, #2 | |||
| blt 4f | |||
| ldmia r1!, {r4-r5} | |||
| mov lr, r2 | |||
| mov r8, r0 | |||
/* Bit 3 of the frame count decides whether an 8-frame batch must be
   handled before the 16-frame loop; the beq below tests this tst. */
| tst lr, #8 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| beq 6f | |||
/* Odd batch of eight: if it is the only batch, finish at 7f. */
| subs lr, lr, #8 | |||
| beq 7f | |||
/* Store the eight preloaded frames (one 32-bit lane = one channel pair
   per frame) while preloading the next eight. */
| vsri.32 d18, d16, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
/* Two-channel main loop: 16 frames per iteration. */
| 6: subs lr, lr, #16 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vsri.32 d18, d16, #16 | |||
| vld1.64 {d2-d3}, [r5,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vld1.64 {d4-d5}, [r4,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vld1.64 {d6-d7}, [r5,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vsri.32 d2, d0, #16 | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vsri.32 d3, d1, #16 | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vsri.32 d6, d4, #16 | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vsri.32 d7, d5, #16 | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
/* Count reached zero: store the last eight frames without preloading. */
| beq 6f | |||
| vld1.64 {d16-d17},[r4,:128]! | |||
| vcvt.s32.f32 q8, q8, #16 | |||
| vst1.32 {d2[0]}, [r8], ip | |||
| vst1.32 {d2[1]}, [r8], ip | |||
| vld1.64 {d18-d19},[r5,:128]! | |||
| vcvt.s32.f32 q9, q9, #16 | |||
| vst1.32 {d3[0]}, [r8], ip | |||
| vst1.32 {d3[1]}, [r8], ip | |||
| vld1.64 {d20-d21},[r4,:128]! | |||
| vcvt.s32.f32 q10, q10, #16 | |||
| vst1.32 {d6[0]}, [r8], ip | |||
| vst1.32 {d6[1]}, [r8], ip | |||
| vld1.64 {d22-d23},[r5,:128]! | |||
| vcvt.s32.f32 q11, q11, #16 | |||
| vst1.32 {d7[0]}, [r8], ip | |||
| vst1.32 {d7[1]}, [r8], ip | |||
| bgt 6b | |||
| 6: vst1.32 {d2[0]}, [r8], ip | |||
| vst1.32 {d2[1]}, [r8], ip | |||
| vst1.32 {d3[0]}, [r8], ip | |||
| vst1.32 {d3[1]}, [r8], ip | |||
| vst1.32 {d6[0]}, [r8], ip | |||
| vst1.32 {d6[1]}, [r8], ip | |||
| vst1.32 {d7[0]}, [r8], ip | |||
| vst1.32 {d7[1]}, [r8], ip | |||
| b 8f | |||
/* Only eight frames in total: store the preloaded batch. */
| 7: vsri.32 d18, d16, #16 | |||
| vsri.32 d19, d17, #16 | |||
| vst1.32 {d18[0]}, [r8], ip | |||
| vsri.32 d22, d20, #16 | |||
| vst1.32 {d18[1]}, [r8], ip | |||
| vsri.32 d23, d21, #16 | |||
| vst1.32 {d19[0]}, [r8], ip | |||
| vst1.32 {d19[1]}, [r8], ip | |||
| vst1.32 {d22[0]}, [r8], ip | |||
| vst1.32 {d22[1]}, [r8], ip | |||
| vst1.32 {d23[0]}, [r8], ip | |||
| vst1.32 {d23[1]}, [r8], ip | |||
/* Two channels done: advance the output base by 2 * 2 bytes and return
   if no channel remains. */
| 8: subs r3, r3, #2 | |||
| add r0, r0, #4 | |||
| popeq {r4-r8,pc} | |||
| @ 1 channel | |||
/* Last remaining channel: r4 = its input pointer, r5 walks the output.
   The odd 16-bit lanes stored below are the upper halves of the 32-bit
   fixed-point values. */
| 4: ldr r4, [r1],#4 | |||
| tst r2, #8 | |||
| mov lr, r2 | |||
| mov r5, r0 | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
/* Bit 3 of the count set: handle one batch of eight first (label 8). */
| bne 8f | |||
/* Single-channel main loop: 16 samples per iteration. */
| 6: subs lr, lr, #16 | |||
| vld1.64 {d4-d5}, [r4,:128]! | |||
| vcvt.s32.f32 q2, q2, #16 | |||
| vld1.64 {d6-d7}, [r4,:128]! | |||
| vcvt.s32.f32 q3, q3, #16 | |||
| vst1.16 {d0[1]}, [r5,:16], ip | |||
| vst1.16 {d0[3]}, [r5,:16], ip | |||
| vst1.16 {d1[1]}, [r5,:16], ip | |||
| vst1.16 {d1[3]}, [r5,:16], ip | |||
| vst1.16 {d2[1]}, [r5,:16], ip | |||
| vst1.16 {d2[3]}, [r5,:16], ip | |||
| vst1.16 {d3[1]}, [r5,:16], ip | |||
| vst1.16 {d3[3]}, [r5,:16], ip | |||
/* Count reached zero: skip the preload of the following batch. */
| beq 7f | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| 7: vst1.16 {d4[1]}, [r5,:16], ip | |||
| vst1.16 {d4[3]}, [r5,:16], ip | |||
| vst1.16 {d5[1]}, [r5,:16], ip | |||
| vst1.16 {d5[3]}, [r5,:16], ip | |||
| vst1.16 {d6[1]}, [r5,:16], ip | |||
| vst1.16 {d6[3]}, [r5,:16], ip | |||
| vst1.16 {d7[1]}, [r5,:16], ip | |||
| vst1.16 {d7[3]}, [r5,:16], ip | |||
| bgt 6b | |||
| pop {r4-r8,pc} | |||
/* Leading batch of eight samples; return if that was all, otherwise
   preload eight more and join the 16-sample loop. */
| 8: subs lr, lr, #8 | |||
| vst1.16 {d0[1]}, [r5,:16], ip | |||
| vst1.16 {d0[3]}, [r5,:16], ip | |||
| vst1.16 {d1[1]}, [r5,:16], ip | |||
| vst1.16 {d1[3]}, [r5,:16], ip | |||
| vst1.16 {d2[1]}, [r5,:16], ip | |||
| vst1.16 {d2[3]}, [r5,:16], ip | |||
| vst1.16 {d3[1]}, [r5,:16], ip | |||
| vst1.16 {d3[3]}, [r5,:16], ip | |||
| popeq {r4-r8,pc} | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| vld1.64 {d2-d3}, [r4,:128]! | |||
| vcvt.s32.f32 q1, q1, #16 | |||
| b 6b | |||
| endfunc | |||
/*
 * ff_int32_to_float_fmul_scalar_neon: convert 32-bit integers to float and
 * multiply each of them by one scalar, 8 elements per loop iteration.
 *
 * r0 is the output pointer and r1 the input pointer; both are used with
 * post-increment and a :128 alignment hint, so both buffers must be
 * 16-byte aligned. The multiplier is broadcast into all lanes of q0 and
 * the element count lives in the register aliased as "len".
 * NOTE(review): VFP/NOVFP are macros from asm.S (not shown here);
 * presumably they select between the hard-float (mul in s0, len in r2)
 * and soft-float (mul in r2, len in r3) argument layouts - confirm.
 *
 * The loop is only left when "subs" yields exactly zero, so len must be a
 * non-zero multiple of 8.
 */
| function ff_int32_to_float_fmul_scalar_neon, export=1 | |||
| VFP vdup.32 q0, d0[0] | |||
| VFP len .req r2 | |||
| NOVFP vdup.32 q0, r2 | |||
| NOVFP len .req r3 | |||
/* Prologue: load and convert the first 8 integers before entering the loop. */
| vld1.32 {q1},[r1,:128]! | |||
| vcvt.f32.s32 q3, q1 | |||
| vld1.32 {q2},[r1,:128]! | |||
| vcvt.f32.s32 q8, q2 | |||
/* Loop body: scale the batch converted earlier (q3/q8 -> q9/q10). */
| 1: subs len, len, #8 | |||
| pld [r1, #16] | |||
| vmul.f32 q9, q3, q0 | |||
| vmul.f32 q10, q8, q0 | |||
/* Zero remaining elements: skip the look-ahead load and go store the last batch. */
| beq 2f | |||
/* Load and convert the next 8 integers, then store the 8 scaled results. */
| vld1.32 {q1},[r1,:128]! | |||
| vcvt.f32.s32 q3, q1 | |||
| vld1.32 {q2},[r1,:128]! | |||
| vcvt.f32.s32 q8, q2 | |||
| vst1.32 {q9}, [r0,:128]! | |||
| vst1.32 {q10},[r0,:128]! | |||
| b 1b | |||
/* Epilogue: store the final 8 results and return. */
| 2: vst1.32 {q9}, [r0,:128]! | |||
| vst1.32 {q10},[r0,:128]! | |||
| bx lr | |||
| .unreq len | |||
| endfunc | |||
| @@ -0,0 +1,77 @@ | |||
| /* | |||
| * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "config.h" | |||
| #include "asm.S" | |||
| .syntax unified | |||
| /** | |||
| * ARM VFP optimized float to int16 conversion. | |||
| * Assumes that len is a positive multiple of 8, that the destination | |||
| * buffer is at least 4-byte aligned (8-byte alignment is better for | |||
| * performance), and that the byte order is little-endian. | |||
| */ | |||
| @ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) | |||
/*
 * Register use, as visible in the body below:
 *   r0 = output pointer (8 int16 samples = 4 words stored per iteration)
 *   r1 = input float pointer, r2 = number of samples still to process
 *   s16-s23 = raw floats of the batch being converted
 *   s0-s7   = the same batch converted to 32-bit integers
 *   r3-r8, ip, lr = integer scratch for saturating and packing
 * r4-r8/lr and d8-d11 (which overlay s16-s23) are saved on entry and
 * restored on exit because the body overwrites them.
 */
| function ff_float_to_int16_vfp, export=1 | |||
| push {r4-r8,lr} | |||
| vpush {d8-d11} | |||
/* Prologue: fetch and convert the first 8 floats before entering the loop. */
| vldmia r1!, {s16-s23} | |||
| vcvt.s32.f32 s0, s16 | |||
| vcvt.s32.f32 s1, s17 | |||
| vcvt.s32.f32 s2, s18 | |||
| vcvt.s32.f32 s3, s19 | |||
| vcvt.s32.f32 s4, s20 | |||
| vcvt.s32.f32 s5, s21 | |||
| vcvt.s32.f32 s6, s22 | |||
| vcvt.s32.f32 s7, s23 | |||
| 1: | |||
/*
 * The flags produced by this subs drive every "gt"-suffixed instruction
 * below as well as the closing bgt; none of the instructions in between
 * is written in a flag-setting form.
 */
| subs r2, r2, #8 | |||
/* Move the 8 converted integers to core registers. */
| vmov r3, r4, s0, s1 | |||
| vmov r5, r6, s2, s3 | |||
| vmov r7, r8, s4, s5 | |||
| vmov ip, lr, s6, s7 | |||
/* If samples remain, start loading the next 8 floats. */
| vldmiagt r1!, {s16-s23} | |||
/*
 * Saturate to the signed 16-bit range and pack two samples per word, the
 * earlier sample in the low halfword.
 */
| ssat r4, #16, r4 | |||
| ssat r3, #16, r3 | |||
| ssat r6, #16, r6 | |||
| ssat r5, #16, r5 | |||
| pkhbt r3, r3, r4, lsl #16 | |||
| pkhbt r4, r5, r6, lsl #16 | |||
/* If samples remain, convert the next batch for the following iteration. */
| vcvtgt.s32.f32 s0, s16 | |||
| vcvtgt.s32.f32 s1, s17 | |||
| vcvtgt.s32.f32 s2, s18 | |||
| vcvtgt.s32.f32 s3, s19 | |||
| vcvtgt.s32.f32 s4, s20 | |||
| vcvtgt.s32.f32 s5, s21 | |||
| vcvtgt.s32.f32 s6, s22 | |||
| vcvtgt.s32.f32 s7, s23 | |||
/* Saturate and pack the second half of the current batch. */
| ssat r8, #16, r8 | |||
| ssat r7, #16, r7 | |||
| ssat lr, #16, lr | |||
| ssat ip, #16, ip | |||
| pkhbt r5, r7, r8, lsl #16 | |||
| pkhbt r6, ip, lr, lsl #16 | |||
/* Store the 8 packed samples and advance the output pointer. */
| stmia r0!, {r3-r6} | |||
| bgt 1b | |||
| vpop {d8-d11} | |||
| pop {r4-r8,pc} | |||
| endfunc | |||
| @@ -33,6 +33,7 @@ | |||
| #include "get_bits.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| extern const uint16_t ff_wma_critical_freqs[25]; | |||
| @@ -43,6 +44,7 @@ typedef struct { | |||
| AVCodecContext *avctx; | |||
| GetBitContext gb; | |||
| DSPContext dsp; | |||
| FmtConvertContext fmt_conv; | |||
| int first; | |||
| int channels; | |||
| int frame_len; ///< transform size (samples) | |||
| @@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx) | |||
| s->avctx = avctx; | |||
| dsputil_init(&s->dsp, avctx); | |||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||
| /* determine frame length */ | |||
| if (avctx->sample_rate < 22050) { | |||
| @@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct) | |||
| ff_rdft_calc(&s->trans.rdft, coeffs); | |||
| } | |||
| s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels); | |||
| s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, | |||
| s->frame_len, s->channels); | |||
| if (!s->first) { | |||
| int count = s->overlap_len * s->channels; | |||
| @@ -40,6 +40,7 @@ | |||
| #include "dca.h" | |||
| #include "synth_filter.h" | |||
| #include "dcadsp.h" | |||
| #include "fmtconvert.h" | |||
| //#define TRACE | |||
| @@ -347,6 +348,7 @@ typedef struct { | |||
| FFTContext imdct; | |||
| SynthFilterContext synth; | |||
| DCADSPContext dcadsp; | |||
| FmtConvertContext fmt_conv; | |||
| } DCAContext; | |||
| static const uint16_t dca_vlc_offs[] = { | |||
| @@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index) | |||
| block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); | |||
| } | |||
| s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l], | |||
| s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l], | |||
| block, rscale, 8); | |||
| } | |||
| @@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx, | |||
| } | |||
| } | |||
| s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); | |||
| s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); | |||
| samples += 256 * channels; | |||
| } | |||
| @@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx) | |||
| ff_mdct_init(&s->imdct, 6, 1, 1.0); | |||
| ff_synth_filter_init(&s->synth); | |||
| ff_dcadsp_init(&s->dcadsp); | |||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||
| for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) | |||
| s->samples_chanptr[i] = s->samples + i * 256; | |||
| @@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len) | |||
| return p; | |||
| } | |||
| static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ | |||
| int i; | |||
| for(i=0; i<len; i++) | |||
| dst[i] = src[i] * mul; | |||
| } | |||
| static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, | |||
| uint32_t maxi, uint32_t maxisign) | |||
| { | |||
| @@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i | |||
| } | |||
| } | |||
| static av_always_inline int float_to_int16_one(const float *src){ | |||
| return av_clip_int16(lrintf(*src)); | |||
| } | |||
| static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){ | |||
| int i; | |||
| for(i=0; i<len; i++) | |||
| dst[i] = float_to_int16_one(src+i); | |||
| } | |||
| static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){ | |||
| int i,j,c; | |||
| if(channels==2){ | |||
| for(i=0; i<len; i++){ | |||
| dst[2*i] = float_to_int16_one(src[0]+i); | |||
| dst[2*i+1] = float_to_int16_one(src[1]+i); | |||
| } | |||
| }else{ | |||
| for(c=0; c<channels; c++) | |||
| for(i=0, j=c; i<len; i++, j+=channels) | |||
| dst[j] = float_to_int16_one(src[c]+i); | |||
| } | |||
| } | |||
| static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) | |||
| { | |||
| int res = 0; | |||
| @@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||
| c->vector_fmul_reverse = vector_fmul_reverse_c; | |||
| c->vector_fmul_add = vector_fmul_add_c; | |||
| c->vector_fmul_window = vector_fmul_window_c; | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | |||
| c->vector_clipf = vector_clipf_c; | |||
| c->float_to_int16 = ff_float_to_int16_c; | |||
| c->float_to_int16_interleave = ff_float_to_int16_interleave_c; | |||
| c->scalarproduct_int16 = scalarproduct_int16_c; | |||
| c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | |||
| c->scalarproduct_float = scalarproduct_float_c; | |||
| @@ -392,7 +392,6 @@ typedef struct DSPContext { | |||
| /* assume len is a multiple of 4, and arrays are 16-byte aligned */ | |||
| void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); | |||
| /* assume len is a multiple of 8, and arrays are 16-byte aligned */ | |||
| void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); | |||
| void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); | |||
| /** | |||
| * Multiply a vector of floats by a scalar float. Source and | |||
| @@ -445,10 +444,6 @@ typedef struct DSPContext { | |||
| */ | |||
| void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); | |||
| /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ | |||
| void (*float_to_int16)(int16_t *dst, const float *src, long len); | |||
| void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); | |||
| /* (I)DCT */ | |||
| void (*fdct)(DCTELEM *block/* align 16*/); | |||
| void (*fdct248)(DCTELEM *block/* align 16*/); | |||
| @@ -0,0 +1,68 @@ | |||
| /* | |||
| * Format Conversion Utils | |||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "avcodec.h" | |||
| #include "fmtconvert.h" | |||
/**
 * Portable implementation of FmtConvertContext.int32_to_float_fmul_scalar.
 *
 * Converts each integer of src to float, multiplies it by mul and writes
 * the product to the same position in dst.
 *
 * @param dst destination array of float
 * @param src source array of integers
 * @param mul factor applied to every converted element
 * @param len number of elements to process; nothing is written if len <= 0
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
{
    const int *in  = src;
    float     *out = dst;
    int remaining;

    for (remaining = len; remaining > 0; remaining--)
        *out++ = *in++ * mul;
}
| static av_always_inline int float_to_int16_one(const float *src){ | |||
| return av_clip_int16(lrintf(*src)); | |||
| } | |||
/**
 * Portable implementation of FmtConvertContext.float_to_int16.
 *
 * Every sample is converted independently with float_to_int16_one(),
 * i.e. rounded to an integer and clipped to the int16_t range.
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of samples to convert; nothing is written if len <= 0
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    /* Same type as len, so the index cannot overflow before reaching it. */
    long i;

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
/**
 * Portable implementation of FmtConvertContext.float_to_int16_interleave.
 *
 * Sample i of channel c ends up at dst[i * channels + c]; each sample is
 * converted with float_to_int16_one() (rounded, then clipped to int16_t).
 *
 * @param dst      destination array holding len * channels samples
 * @param src      array of per-channel source arrays, len floats each
 * @param len      number of samples to convert per channel
 * @param channels number of channels; nothing is written if it is <= 0
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    /*
     * i counts up to len and j up to len * channels, so both use the type
     * of len instead of int to avoid signed overflow on long buffers.
     */
    long i, j;
    int c;

    if (channels == 2) {
        /* Stereo fast path: write both channels of a frame in one pass. */
        for (i = 0; i < len; i++) {
            dst[2 * i]     = float_to_int16_one(src[0] + i);
            dst[2 * i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        /* Generic path: one strided pass over dst per channel. */
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
| av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) | |||
| { | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; | |||
| c->float_to_int16 = float_to_int16_c; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_c; | |||
| if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); | |||
| if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); | |||
| if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); | |||
| } | |||
| @@ -0,0 +1,79 @@ | |||
| /* | |||
| * Format Conversion Utils | |||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #ifndef AVCODEC_FMTCONVERT_H | |||
| #define AVCODEC_FMTCONVERT_H | |||
| #include "avcodec.h" | |||
/**
 * Table of sample format conversion routines.
 *
 * Each member points either to the portable C implementation or to an
 * architecture-specific replacement with the same contract.
 */
typedef struct FmtConvertContext {
    /**
     * Convert an array of int32_t to float and multiply each element by
     * a float value.
     *
     * @param dst destination array of float.
     *            constraints: 16-byte aligned
     * @param src source array of int32_t.
     *            constraints: 16-byte aligned
     * @param mul value every converted element is multiplied by
     * @param len number of elements to convert.
     *            constraints: multiple of 8
     */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src,
                                       float mul, int len);

    /**
     * Convert an array of float to an array of int16_t.
     *
     * The input is expected to be in the range [-32768.0, 32767.0];
     * no rescaling is applied.
     *
     * @param dst destination array of int16_t.
     *            constraints: 16-byte aligned
     * @param src source array of float.
     *            constraints: 16-byte aligned
     * @param len number of elements to convert.
     *            constraints: multiple of 8
     */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);

    /**
     * Convert multiple arrays of float to one interleaved array of int16_t.
     *
     * The input is expected to be in the range [-32768.0, 32767.0];
     * no rescaling is applied.
     *
     * @param dst      destination array of interleaved int16_t.
     *                 constraints: 16-byte aligned
     * @param src      source array of float arrays, one for each channel.
     *                 constraints: 16-byte aligned
     * @param len      number of elements to convert per channel.
     *                 constraints: multiple of 8
     * @param channels number of channels
     */
    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                      long len, int channels);
} FmtConvertContext;
/**
 * Fill in a FmtConvertContext: the portable C routines are installed
 * first, then the initializer for the target architecture may replace them.
 */
| void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); | |||
/* Architecture-specific initializers, called from ff_fmt_convert_init(). */
| void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); | |||
| void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); | |||
| void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); | |||
| #endif /* AVCODEC_FMTCONVERT_H */ | |||
| @@ -38,6 +38,7 @@ | |||
| #include "avcodec.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| #define ALT_BITSTREAM_READER_LE | |||
| #include "get_bits.h" | |||
| @@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext { | |||
| float scale_bias; | |||
| DSPContext dsp; | |||
| FFTContext imdct_ctx; | |||
| FmtConvertContext fmt_conv; | |||
| DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; | |||
| } NellyMoserDecodeContext; | |||
| @@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) { | |||
| ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); | |||
| dsputil_init(&s->dsp, avctx); | |||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||
| s->scale_bias = 1.0/(1*8); | |||
| @@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx, | |||
| for (i=0 ; i<blocks ; i++) { | |||
| nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); | |||
| s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); | |||
| s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); | |||
| *data_size += NELLY_SAMPLES*sizeof(int16_t); | |||
| } | |||
| @@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \ | |||
| OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \ | |||
| ppc/fdct_altivec.o \ | |||
| ppc/float_altivec.o \ | |||
| ppc/fmtconvert_altivec.o \ | |||
| ppc/gmc_altivec.o \ | |||
| ppc/idct_altivec.o \ | |||
| ppc/int_altivec.o \ | |||
| @@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa | |||
| } | |||
| } | |||
| static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) | |||
| { | |||
| union { | |||
| vector float v; | |||
| float s[4]; | |||
| } mul_u; | |||
| int i; | |||
| vector float src1, src2, dst1, dst2, mul_v, zero; | |||
| zero = (vector float)vec_splat_u32(0); | |||
| mul_u.s[0] = mul; | |||
| mul_v = vec_splat(mul_u.v, 0); | |||
| for(i=0; i<len; i+=8) { | |||
| src1 = vec_ctf(vec_ld(0, src+i), 0); | |||
| src2 = vec_ctf(vec_ld(16, src+i), 0); | |||
| dst1 = vec_madd(src1, mul_v, zero); | |||
| dst2 = vec_madd(src2, mul_v, zero); | |||
| vec_st(dst1, 0, dst+i); | |||
| vec_st(dst2, 16, dst+i); | |||
| } | |||
| } | |||
| static vector signed short | |||
| float_to_int16_one_altivec(const float *src) | |||
| { | |||
| vector float s0 = vec_ld(0, src); | |||
| vector float s1 = vec_ld(16, src); | |||
| vector signed int t0 = vec_cts(s0, 0); | |||
| vector signed int t1 = vec_cts(s1, 0); | |||
| return vec_packs(t0,t1); | |||
| } | |||
| static void float_to_int16_altivec(int16_t *dst, const float *src, long len) | |||
| { | |||
| int i; | |||
| vector signed short d0, d1, d; | |||
| vector unsigned char align; | |||
| if(((long)dst)&15) //FIXME | |||
| for(i=0; i<len-7; i+=8) { | |||
| d0 = vec_ld(0, dst+i); | |||
| d = float_to_int16_one_altivec(src+i); | |||
| d1 = vec_ld(15, dst+i); | |||
| d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); | |||
| align = vec_lvsr(0, dst+i); | |||
| d0 = vec_perm(d1, d, align); | |||
| d1 = vec_perm(d, d1, align); | |||
| vec_st(d0, 0, dst+i); | |||
| vec_st(d1,15, dst+i); | |||
| } | |||
| else | |||
| for(i=0; i<len-7; i+=8) { | |||
| d = float_to_int16_one_altivec(src+i); | |||
| vec_st(d, 0, dst+i); | |||
| } | |||
| } | |||
| static void | |||
| float_to_int16_interleave_altivec(int16_t *dst, const float **src, | |||
| long len, int channels) | |||
| { | |||
| int i; | |||
| vector signed short d0, d1, d2, c0, c1, t0, t1; | |||
| vector unsigned char align; | |||
| if(channels == 1) | |||
| float_to_int16_altivec(dst, src[0], len); | |||
| else | |||
| if (channels == 2) { | |||
| if(((long)dst)&15) | |||
| for(i=0; i<len-7; i+=8) { | |||
| d0 = vec_ld(0, dst + i); | |||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||
| d1 = vec_ld(31, dst + i); | |||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||
| c0 = vec_mergeh(t0, t1); | |||
| c1 = vec_mergel(t0, t1); | |||
| d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); | |||
| align = vec_lvsr(0, dst + i); | |||
| d0 = vec_perm(d2, c0, align); | |||
| d1 = vec_perm(c0, c1, align); | |||
| vec_st(d0, 0, dst + i); | |||
| d0 = vec_perm(c1, d2, align); | |||
| vec_st(d1, 15, dst + i); | |||
| vec_st(d0, 31, dst + i); | |||
| dst+=8; | |||
| } | |||
| else | |||
| for(i=0; i<len-7; i+=8) { | |||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||
| d0 = vec_mergeh(t0, t1); | |||
| d1 = vec_mergel(t0, t1); | |||
| vec_st(d0, 0, dst + i); | |||
| vec_st(d1, 16, dst + i); | |||
| dst+=8; | |||
| } | |||
| } else { | |||
| DECLARE_ALIGNED(16, int16_t, tmp)[len]; | |||
| int c, j; | |||
| for (c = 0; c < channels; c++) { | |||
| float_to_int16_altivec(tmp, src[c], len); | |||
| for (i = 0, j = c; i < len; i++, j+=channels) { | |||
| dst[j] = tmp[i]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void float_init_altivec(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| c->vector_fmul = vector_fmul_altivec; | |||
| c->vector_fmul_reverse = vector_fmul_reverse_altivec; | |||
| c->vector_fmul_add = vector_fmul_add_altivec; | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| c->vector_fmul_window = vector_fmul_window_altivec; | |||
| c->float_to_int16 = float_to_int16_altivec; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_altivec; | |||
| } | |||
| } | |||
| @@ -0,0 +1,142 @@ | |||
| /* | |||
| * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavcodec/fmtconvert.h" | |||
| #include "dsputil_altivec.h" | |||
| #include "util_altivec.h" | |||
| static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) | |||
| { | |||
| union { | |||
| vector float v; | |||
| float s[4]; | |||
| } mul_u; | |||
| int i; | |||
| vector float src1, src2, dst1, dst2, mul_v, zero; | |||
| zero = (vector float)vec_splat_u32(0); | |||
| mul_u.s[0] = mul; | |||
| mul_v = vec_splat(mul_u.v, 0); | |||
| for(i=0; i<len; i+=8) { | |||
| src1 = vec_ctf(vec_ld(0, src+i), 0); | |||
| src2 = vec_ctf(vec_ld(16, src+i), 0); | |||
| dst1 = vec_madd(src1, mul_v, zero); | |||
| dst2 = vec_madd(src2, mul_v, zero); | |||
| vec_st(dst1, 0, dst+i); | |||
| vec_st(dst2, 16, dst+i); | |||
| } | |||
| } | |||
| static vector signed short | |||
| float_to_int16_one_altivec(const float *src) | |||
| { | |||
| vector float s0 = vec_ld(0, src); | |||
| vector float s1 = vec_ld(16, src); | |||
| vector signed int t0 = vec_cts(s0, 0); | |||
| vector signed int t1 = vec_cts(s1, 0); | |||
| return vec_packs(t0,t1); | |||
| } | |||
/**
 * AltiVec implementation of FmtConvertContext.float_to_int16.
 *
 * Converts 8 samples per iteration with float_to_int16_one_altivec().
 * The loop condition is i < len-7, so a trailing group of fewer than 8
 * samples is not converted; callers are expected to pass a multiple of 8.
 *
 * @param dst destination array of int16_t; 16-byte aligned and unaligned
 *            addresses are handled by separate loops
 * @param src source array of float, read with aligned vector loads
 * @param len number of samples to convert
 */
| static void float_to_int16_altivec(int16_t *dst, const float *src, long len) | |||
| { | |||
| int i; | |||
| vector signed short d0, d1, d; | |||
| vector unsigned char align; | |||
/* Unaligned destination: merge the new samples into the existing memory. */
| if(((long)dst)&15) //FIXME | |||
| for(i=0; i<len-7; i+=8) { | |||
/*
 * Read the two aligned 16-byte blocks touched by the 16 output bytes
 * (offsets 0 and 15), combine them with the converted samples using
 * permutes derived from the address misalignment (lvsl/lvsr), then write
 * both blocks back.
 * NOTE(review): the intent appears to be that bytes outside the 16-byte
 * output window keep their previous contents - confirm.
 */
| d0 = vec_ld(0, dst+i); | |||
| d = float_to_int16_one_altivec(src+i); | |||
| d1 = vec_ld(15, dst+i); | |||
| d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); | |||
| align = vec_lvsr(0, dst+i); | |||
| d0 = vec_perm(d1, d, align); | |||
| d1 = vec_perm(d, d1, align); | |||
| vec_st(d0, 0, dst+i); | |||
| vec_st(d1,15, dst+i); | |||
| } | |||
| else | |||
| for(i=0; i<len-7; i+=8) { | |||
/* Aligned destination: one conversion and one vector store per 8 samples. */
| d = float_to_int16_one_altivec(src+i); | |||
| vec_st(d, 0, dst+i); | |||
| } | |||
| } | |||
/**
 * AltiVec implementation of FmtConvertContext.float_to_int16_interleave.
 *
 * Three cases are handled:
 *  - 1 channel: forwarded to float_to_int16_altivec();
 *  - 2 channels: both channels are converted 8 samples at a time and
 *    interleaved with vec_mergeh()/vec_mergel(), with separate loops for
 *    aligned and unaligned destinations;
 *  - any other count: each channel is converted into a temporary buffer and
 *    copied to its strided positions in dst by scalar code.
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      array of per-channel source arrays
 * @param len      number of samples per channel; the vector loops only
 *                 process complete groups of 8
 * @param channels number of channels
 */
| static void | |||
| float_to_int16_interleave_altivec(int16_t *dst, const float **src, | |||
| long len, int channels) | |||
| { | |||
| int i; | |||
| vector signed short d0, d1, d2, c0, c1, t0, t1; | |||
| vector unsigned char align; | |||
| if(channels == 1) | |||
| float_to_int16_altivec(dst, src[0], len); | |||
| else | |||
| if (channels == 2) { | |||
/*
 * In both stereo loops the output address is dst + i while dst itself is
 * also advanced by 8 per iteration, so the output moves forward by 16
 * int16_t (8 frames of 2 channels) for every 8 input samples per channel.
 */
| if(((long)dst)&15) | |||
| for(i=0; i<len-7; i+=8) { | |||
/*
 * Unaligned destination: the 32 output bytes span three aligned blocks.
 * The blocks at offsets 0 and 31 are read first and combined into d2,
 * which is then permuted together with the interleaved samples c0/c1
 * before the three blocks are stored.
 * NOTE(review): presumably this preserves the bytes before and after the
 * 32-byte output window - confirm.
 */
| d0 = vec_ld(0, dst + i); | |||
| t0 = float_to_int16_one_altivec(src[0] + i); | |||
| d1 = vec_ld(31, dst + i); | |||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||
| c0 = vec_mergeh(t0, t1); | |||
| c1 = vec_mergel(t0, t1); | |||
| d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); | |||
| align = vec_lvsr(0, dst + i); | |||
| d0 = vec_perm(d2, c0, align); | |||
| d1 = vec_perm(c0, c1, align); | |||
| vec_st(d0, 0, dst + i); | |||
| d0 = vec_perm(c1, d2, align); | |||
| vec_st(d1, 15, dst + i); | |||
| vec_st(d0, 31, dst + i); | |||
| dst+=8; | |||
| } | |||
| else | |||
| for(i=0; i<len-7; i+=8) { | |||
/* Aligned destination: interleave and store two vectors directly. */
| t0 = float_to_int16_one_altivec(src[0] + i); | |||
| t1 = float_to_int16_one_altivec(src[1] + i); | |||
| d0 = vec_mergeh(t0, t1); | |||
| d1 = vec_mergel(t0, t1); | |||
| vec_st(d0, 0, dst + i); | |||
| vec_st(d1, 16, dst + i); | |||
| dst+=8; | |||
| } | |||
| } else { | |||
/*
 * Generic channel count: tmp is a variable-length, 16-byte aligned stack
 * buffer of len samples. The copy loop reads all len entries of tmp, while
 * float_to_int16_altivec() only fills complete groups of 8, so len is
 * expected to be a multiple of 8 here.
 */
| DECLARE_ALIGNED(16, int16_t, tmp)[len]; | |||
| int c, j; | |||
| for (c = 0; c < channels; c++) { | |||
| float_to_int16_altivec(tmp, src[c], len); | |||
| for (i = 0, j = c; i < len; i++, j+=channels) { | |||
| dst[j] = tmp[i]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) | |||
| { | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| c->float_to_int16 = float_to_int16_altivec; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_altivec; | |||
| } | |||
| } | |||
| @@ -31,6 +31,7 @@ | |||
| #include "get_bits.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| #include "vorbis.h" | |||
| #include "xiph.h" | |||
| @@ -127,6 +128,7 @@ typedef struct vorbis_context_s { | |||
| AVCodecContext *avccontext; | |||
| GetBitContext gb; | |||
| DSPContext dsp; | |||
| FmtConvertContext fmt_conv; | |||
| FFTContext mdct[2]; | |||
| uint_fast8_t first_frame; | |||
| @@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext) | |||
| vc->avccontext = avccontext; | |||
| dsputil_init(&vc->dsp, avccontext); | |||
| ff_fmt_convert_init(&vc->fmt_conv, avccontext); | |||
| vc->scale_bias = 32768.0f; | |||
| @@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext, | |||
| len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; | |||
| } | |||
| vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); | |||
| vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len, | |||
| vc->audio_channels); | |||
| *data_size = len * 2 * vc->audio_channels; | |||
| return buf_size ; | |||
| @@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2) | |||
| s->block_align = avctx->block_align; | |||
| dsputil_init(&s->dsp, avctx); | |||
| ff_fmt_convert_init(&s->fmt_conv, avctx); | |||
| if (avctx->codec->id == CODEC_ID_WMAV1) { | |||
| s->version = 1; | |||
| @@ -26,6 +26,7 @@ | |||
| #include "put_bits.h" | |||
| #include "dsputil.h" | |||
| #include "fft.h" | |||
| #include "fmtconvert.h" | |||
| /* size of blocks */ | |||
| #define BLOCK_MIN_BITS 7 | |||
| @@ -134,6 +135,7 @@ typedef struct WMACodecContext { | |||
| float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; | |||
| float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; | |||
| DSPContext dsp; | |||
| FmtConvertContext fmt_conv; | |||
| #ifdef TRACE | |||
| int frame_count; | |||
| @@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples) | |||
| incr = s->nb_channels; | |||
| for (ch = 0; ch < MAX_CHANNELS; ch++) | |||
| output[ch] = s->frame_out[ch]; | |||
| s->dsp.float_to_int16_interleave(samples, output, n, incr); | |||
| s->fmt_conv.float_to_int16_interleave(samples, output, n, incr); | |||
| for (ch = 0; ch < incr; ch++) { | |||
| /* prepare for next block */ | |||
| memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); | |||
| @@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o | |||
| MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o | |||
| MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ | |||
| x86/deinterlace.o \ | |||
| x86/fmtconvert.o \ | |||
| x86/h264_chromamc.o \ | |||
| $(YASM-OBJS-yes) | |||
| @@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o | |||
| OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ | |||
| x86/dsputil_mmx.o \ | |||
| x86/fdct_mmx.o \ | |||
| x86/fmtconvert_mmx.o \ | |||
| x86/idct_mmx_xvid.o \ | |||
| x86/idct_sse2_xvid.o \ | |||
| x86/motion_est_mmx.o \ | |||
| @@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s | |||
| } | |||
| #endif /* HAVE_6REGS */ | |||
| static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |||
| { | |||
| x86_reg i = -4*len; | |||
| __asm__ volatile( | |||
| "movss %3, %%xmm4 \n" | |||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||
| "1: \n" | |||
| "cvtpi2ps (%2,%0), %%xmm0 \n" | |||
| "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |||
| "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |||
| "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |||
| "movlhps %%xmm1, %%xmm0 \n" | |||
| "movlhps %%xmm3, %%xmm2 \n" | |||
| "mulps %%xmm4, %%xmm0 \n" | |||
| "mulps %%xmm4, %%xmm2 \n" | |||
| "movaps %%xmm0, (%1,%0) \n" | |||
| "movaps %%xmm2, 16(%1,%0) \n" | |||
| "add $32, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i) | |||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||
| ); | |||
| } | |||
| static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |||
| { | |||
| x86_reg i = -4*len; | |||
| __asm__ volatile( | |||
| "movss %3, %%xmm4 \n" | |||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||
| "1: \n" | |||
| "cvtdq2ps (%2,%0), %%xmm0 \n" | |||
| "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |||
| "mulps %%xmm4, %%xmm0 \n" | |||
| "mulps %%xmm4, %%xmm1 \n" | |||
| "movaps %%xmm0, (%1,%0) \n" | |||
| "movaps %%xmm1, 16(%1,%0) \n" | |||
| "add $32, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i) | |||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||
| ); | |||
| } | |||
| static void vector_clipf_sse(float *dst, const float *src, float min, float max, | |||
| int len) | |||
| { | |||
| @@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, | |||
| ); | |||
| } | |||
| static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| // not bit-exact: pf2id uses different rounding than C and SSE | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "pf2id (%2,%0,2) , %%mm0 \n\t" | |||
| "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |||
| "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |||
| "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |||
| "packssdw %%mm1 , %%mm0 \n\t" | |||
| "packssdw %%mm3 , %%mm2 \n\t" | |||
| "movq %%mm0 , (%1,%0) \n\t" | |||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| "femms \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |||
| "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |||
| "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |||
| "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |||
| "packssdw %%mm1 , %%mm0 \n\t" | |||
| "packssdw %%mm3 , %%mm2 \n\t" | |||
| "movq %%mm0 , (%1,%0) \n\t" | |||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| "emms \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |||
| "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |||
| "packssdw %%xmm1 , %%xmm0 \n\t" | |||
| "movdqa %%xmm0 , (%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| void ff_vp3_idct_mmx(int16_t *input_data); | |||
| void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | |||
| void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | |||
| @@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data); | |||
| void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); | |||
| void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); | |||
| void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); | |||
| void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | |||
| void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | |||
| int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); | |||
| int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); | |||
| int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | |||
| @@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const | |||
| int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | |||
| int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | |||
| #if !HAVE_YASM | |||
| #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) | |||
| #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||
| #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||
| #endif | |||
| #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse | |||
| #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ | |||
| /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ | |||
| static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||
| DECLARE_ALIGNED(16, int16_t, tmp)[len];\ | |||
| int i,j,c;\ | |||
| for(c=0; c<channels; c++){\ | |||
| float_to_int16_##cpu(tmp, src[c], len);\ | |||
| for(i=0, j=c; i<len; i++, j+=channels)\ | |||
| dst[j] = tmp[i];\ | |||
| }\ | |||
| }\ | |||
| \ | |||
| static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||
| if(channels==1)\ | |||
| float_to_int16_##cpu(dst, src[0], len);\ | |||
| else if(channels==2){\ | |||
| x86_reg reglen = len; \ | |||
| const float *src0 = src[0];\ | |||
| const float *src1 = src[1];\ | |||
| __asm__ volatile(\ | |||
| "shl $2, %0 \n"\ | |||
| "add %0, %1 \n"\ | |||
| "add %0, %2 \n"\ | |||
| "add %0, %3 \n"\ | |||
| "neg %0 \n"\ | |||
| body\ | |||
| :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ | |||
| );\ | |||
| }else if(channels==6){\ | |||
| ff_float_to_int16_interleave6_##cpu(dst, src, len);\ | |||
| }else\ | |||
| float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ | |||
| } | |||
| FLOAT_TO_INT16_INTERLEAVE(3dnow, | |||
| "1: \n" | |||
| "pf2id (%2,%0), %%mm0 \n" | |||
| "pf2id 8(%2,%0), %%mm1 \n" | |||
| "pf2id (%3,%0), %%mm2 \n" | |||
| "pf2id 8(%3,%0), %%mm3 \n" | |||
| "packssdw %%mm1, %%mm0 \n" | |||
| "packssdw %%mm3, %%mm2 \n" | |||
| "movq %%mm0, %%mm1 \n" | |||
| "punpcklwd %%mm2, %%mm0 \n" | |||
| "punpckhwd %%mm2, %%mm1 \n" | |||
| "movq %%mm0, (%1,%0)\n" | |||
| "movq %%mm1, 8(%1,%0)\n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| "femms \n" | |||
| ) | |||
| FLOAT_TO_INT16_INTERLEAVE(sse, | |||
| "1: \n" | |||
| "cvtps2pi (%2,%0), %%mm0 \n" | |||
| "cvtps2pi 8(%2,%0), %%mm1 \n" | |||
| "cvtps2pi (%3,%0), %%mm2 \n" | |||
| "cvtps2pi 8(%3,%0), %%mm3 \n" | |||
| "packssdw %%mm1, %%mm0 \n" | |||
| "packssdw %%mm3, %%mm2 \n" | |||
| "movq %%mm0, %%mm1 \n" | |||
| "punpcklwd %%mm2, %%mm0 \n" | |||
| "punpckhwd %%mm2, %%mm1 \n" | |||
| "movq %%mm0, (%1,%0)\n" | |||
| "movq %%mm1, 8(%1,%0)\n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| "emms \n" | |||
| ) | |||
| FLOAT_TO_INT16_INTERLEAVE(sse2, | |||
| "1: \n" | |||
| "cvtps2dq (%2,%0), %%xmm0 \n" | |||
| "cvtps2dq (%3,%0), %%xmm1 \n" | |||
| "packssdw %%xmm1, %%xmm0 \n" | |||
| "movhlps %%xmm0, %%xmm1 \n" | |||
| "punpcklwd %%xmm1, %%xmm0 \n" | |||
| "movdqa %%xmm0, (%1,%0) \n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| ) | |||
| static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ | |||
| if(channels==6) | |||
| ff_float_to_int16_interleave6_3dn2(dst, src, len); | |||
| else | |||
| float_to_int16_interleave_3dnow(dst, src, len, channels); | |||
| } | |||
| float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | |||
| void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| @@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| if(mm_flags & AV_CPU_FLAG_3DNOW){ | |||
| c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | |||
| c->vector_fmul = vector_fmul_3dnow; | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
| c->float_to_int16 = float_to_int16_3dnow; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |||
| } | |||
| } | |||
| if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | |||
| c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | |||
| #if HAVE_6REGS | |||
| c->vector_fmul_window = vector_fmul_window_3dnow2; | |||
| #endif | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
| c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |||
| } | |||
| } | |||
| if(mm_flags & AV_CPU_FLAG_MMX2){ | |||
| #if HAVE_YASM | |||
| @@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| #if HAVE_6REGS | |||
| c->vector_fmul_window = vector_fmul_window_sse; | |||
| #endif | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |||
| c->vector_clipf = vector_clipf_sse; | |||
| c->float_to_int16 = float_to_int16_sse; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_sse; | |||
| #if HAVE_YASM | |||
| c->scalarproduct_float = ff_scalarproduct_float_sse; | |||
| #endif | |||
| @@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| if(mm_flags & AV_CPU_FLAG_3DNOW) | |||
| c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse | |||
| if(mm_flags & AV_CPU_FLAG_SSE2){ | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |||
| c->float_to_int16 = float_to_int16_sse2; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |||
| #if HAVE_YASM | |||
| c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |||
| c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | |||
| @@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | |||
| section .text align=16 | |||
| %macro PSWAPD_SSE 2 | |||
| pshufw %1, %2, 0x4e | |||
| %endmacro | |||
| %macro PSWAPD_3DN1 2 | |||
| movq %1, %2 | |||
| psrlq %1, 32 | |||
| punpckldq %1, %2 | |||
| %endmacro | |||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | |||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | |||
| %ifdef ARCH_X86_64 | |||
| %define lend r10d | |||
| mov lend, r2d | |||
| %else | |||
| %define lend dword r2m | |||
| %endif | |||
| mov src1q, [srcq+1*gprsize] | |||
| mov src2q, [srcq+2*gprsize] | |||
| mov src3q, [srcq+3*gprsize] | |||
| mov src4q, [srcq+4*gprsize] | |||
| mov src5q, [srcq+5*gprsize] | |||
| mov srcq, [srcq] | |||
| sub src1q, srcq | |||
| sub src2q, srcq | |||
| sub src3q, srcq | |||
| sub src4q, srcq | |||
| sub src5q, srcq | |||
| .loop: | |||
| cvtps2pi mm0, [srcq] | |||
| cvtps2pi mm1, [srcq+src1q] | |||
| cvtps2pi mm2, [srcq+src2q] | |||
| cvtps2pi mm3, [srcq+src3q] | |||
| cvtps2pi mm4, [srcq+src4q] | |||
| cvtps2pi mm5, [srcq+src5q] | |||
| packssdw mm0, mm3 | |||
| packssdw mm1, mm4 | |||
| packssdw mm2, mm5 | |||
| pswapd mm3, mm0 | |||
| punpcklwd mm0, mm1 | |||
| punpckhwd mm1, mm2 | |||
| punpcklwd mm2, mm3 | |||
| pswapd mm3, mm0 | |||
| punpckldq mm0, mm2 | |||
| punpckhdq mm2, mm1 | |||
| punpckldq mm1, mm3 | |||
| movq [dstq ], mm0 | |||
| movq [dstq+16], mm2 | |||
| movq [dstq+ 8], mm1 | |||
| add srcq, 8 | |||
| add dstq, 24 | |||
| sub lend, 2 | |||
| jg .loop | |||
| emms | |||
| RET | |||
| %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |||
| %define pswapd PSWAPD_SSE | |||
| FLOAT_TO_INT16_INTERLEAVE6 sse | |||
| %define cvtps2pi pf2id | |||
| %define pswapd PSWAPD_3DN1 | |||
| FLOAT_TO_INT16_INTERLEAVE6 3dnow | |||
| %undef pswapd | |||
| FLOAT_TO_INT16_INTERLEAVE6 3dn2 | |||
| %undef cvtps2pi | |||
| %macro SCALARPRODUCT 1 | |||
| ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) | |||
| cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift | |||
| @@ -0,0 +1,91 @@ | |||
| ;****************************************************************************** | |||
| ;* x86 optimized Format Conversion Utils | |||
| ;* Copyright (c) 2008 Loren Merritt | |||
| ;* | |||
| ;* This file is part of FFmpeg. | |||
| ;* | |||
| ;* FFmpeg is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with FFmpeg; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %include "x86inc.asm" | |||
| section .text align=16 | |||
| %macro PSWAPD_SSE 2 | |||
| pshufw %1, %2, 0x4e | |||
| %endmacro | |||
| %macro PSWAPD_3DN1 2 | |||
| movq %1, %2 | |||
| psrlq %1, 32 | |||
| punpckldq %1, %2 | |||
| %endmacro | |||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | |||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | |||
| %ifdef ARCH_X86_64 | |||
| %define lend r10d | |||
| mov lend, r2d | |||
| %else | |||
| %define lend dword r2m | |||
| %endif | |||
| mov src1q, [srcq+1*gprsize] | |||
| mov src2q, [srcq+2*gprsize] | |||
| mov src3q, [srcq+3*gprsize] | |||
| mov src4q, [srcq+4*gprsize] | |||
| mov src5q, [srcq+5*gprsize] | |||
| mov srcq, [srcq] | |||
| sub src1q, srcq | |||
| sub src2q, srcq | |||
| sub src3q, srcq | |||
| sub src4q, srcq | |||
| sub src5q, srcq | |||
| .loop: | |||
| cvtps2pi mm0, [srcq] | |||
| cvtps2pi mm1, [srcq+src1q] | |||
| cvtps2pi mm2, [srcq+src2q] | |||
| cvtps2pi mm3, [srcq+src3q] | |||
| cvtps2pi mm4, [srcq+src4q] | |||
| cvtps2pi mm5, [srcq+src5q] | |||
| packssdw mm0, mm3 | |||
| packssdw mm1, mm4 | |||
| packssdw mm2, mm5 | |||
| pswapd mm3, mm0 | |||
| punpcklwd mm0, mm1 | |||
| punpckhwd mm1, mm2 | |||
| punpcklwd mm2, mm3 | |||
| pswapd mm3, mm0 | |||
| punpckldq mm0, mm2 | |||
| punpckhdq mm2, mm1 | |||
| punpckldq mm1, mm3 | |||
| movq [dstq ], mm0 | |||
| movq [dstq+16], mm2 | |||
| movq [dstq+ 8], mm1 | |||
| add srcq, 8 | |||
| add dstq, 24 | |||
| sub lend, 2 | |||
| jg .loop | |||
| emms | |||
| RET | |||
| %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |||
| %define pswapd PSWAPD_SSE | |||
| FLOAT_TO_INT16_INTERLEAVE6 sse | |||
| %define cvtps2pi pf2id | |||
| %define pswapd PSWAPD_3DN1 | |||
| FLOAT_TO_INT16_INTERLEAVE6 3dnow | |||
| %undef pswapd | |||
| FLOAT_TO_INT16_INTERLEAVE6 3dn2 | |||
| %undef cvtps2pi | |||
| @@ -0,0 +1,266 @@ | |||
| /* | |||
| * Format Conversion Utils | |||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| * | |||
| * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |||
| */ | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/fmtconvert.h" | |||
| static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |||
| { | |||
| x86_reg i = -4*len; | |||
| __asm__ volatile( | |||
| "movss %3, %%xmm4 \n" | |||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||
| "1: \n" | |||
| "cvtpi2ps (%2,%0), %%xmm0 \n" | |||
| "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |||
| "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |||
| "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |||
| "movlhps %%xmm1, %%xmm0 \n" | |||
| "movlhps %%xmm3, %%xmm2 \n" | |||
| "mulps %%xmm4, %%xmm0 \n" | |||
| "mulps %%xmm4, %%xmm2 \n" | |||
| "movaps %%xmm0, (%1,%0) \n" | |||
| "movaps %%xmm2, 16(%1,%0) \n" | |||
| "add $32, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i) | |||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||
| ); | |||
| } | |||
| static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |||
| { | |||
| x86_reg i = -4*len; | |||
| __asm__ volatile( | |||
| "movss %3, %%xmm4 \n" | |||
| "shufps $0, %%xmm4, %%xmm4 \n" | |||
| "1: \n" | |||
| "cvtdq2ps (%2,%0), %%xmm0 \n" | |||
| "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |||
| "mulps %%xmm4, %%xmm0 \n" | |||
| "mulps %%xmm4, %%xmm1 \n" | |||
| "movaps %%xmm0, (%1,%0) \n" | |||
| "movaps %%xmm1, 16(%1,%0) \n" | |||
| "add $32, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i) | |||
| :"r"(dst+len), "r"(src+len), "m"(mul) | |||
| ); | |||
| } | |||
| static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| // not bit-exact: pf2id uses different rounding than C and SSE | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "pf2id (%2,%0,2) , %%mm0 \n\t" | |||
| "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |||
| "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |||
| "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |||
| "packssdw %%mm1 , %%mm0 \n\t" | |||
| "packssdw %%mm3 , %%mm2 \n\t" | |||
| "movq %%mm0 , (%1,%0) \n\t" | |||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| "femms \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |||
| "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |||
| "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |||
| "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |||
| "packssdw %%mm1 , %%mm0 \n\t" | |||
| "packssdw %%mm3 , %%mm2 \n\t" | |||
| "movq %%mm0 , (%1,%0) \n\t" | |||
| "movq %%mm2 , 8(%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| "emms \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |||
| x86_reg reglen = len; | |||
| __asm__ volatile( | |||
| "add %0 , %0 \n\t" | |||
| "lea (%2,%0,2) , %2 \n\t" | |||
| "add %0 , %1 \n\t" | |||
| "neg %0 \n\t" | |||
| "1: \n\t" | |||
| "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |||
| "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |||
| "packssdw %%xmm1 , %%xmm0 \n\t" | |||
| "movdqa %%xmm0 , (%1,%0) \n\t" | |||
| "add $16 , %0 \n\t" | |||
| " js 1b \n\t" | |||
| :"+r"(reglen), "+r"(dst), "+r"(src) | |||
| ); | |||
| } | |||
| void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); | |||
| void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | |||
| void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | |||
| #if !HAVE_YASM | |||
| #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) | |||
| #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||
| #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |||
| #endif | |||
| #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse | |||
| #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ | |||
| /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ | |||
| static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||
| DECLARE_ALIGNED(16, int16_t, tmp)[len];\ | |||
| int i,j,c;\ | |||
| for(c=0; c<channels; c++){\ | |||
| float_to_int16_##cpu(tmp, src[c], len);\ | |||
| for(i=0, j=c; i<len; i++, j+=channels)\ | |||
| dst[j] = tmp[i];\ | |||
| }\ | |||
| }\ | |||
| \ | |||
| static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||
| if(channels==1)\ | |||
| float_to_int16_##cpu(dst, src[0], len);\ | |||
| else if(channels==2){\ | |||
| x86_reg reglen = len; \ | |||
| const float *src0 = src[0];\ | |||
| const float *src1 = src[1];\ | |||
| __asm__ volatile(\ | |||
| "shl $2, %0 \n"\ | |||
| "add %0, %1 \n"\ | |||
| "add %0, %2 \n"\ | |||
| "add %0, %3 \n"\ | |||
| "neg %0 \n"\ | |||
| body\ | |||
| :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ | |||
| );\ | |||
| }else if(channels==6){\ | |||
| ff_float_to_int16_interleave6_##cpu(dst, src, len);\ | |||
| }else\ | |||
| float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ | |||
| } | |||
| FLOAT_TO_INT16_INTERLEAVE(3dnow, | |||
| "1: \n" | |||
| "pf2id (%2,%0), %%mm0 \n" | |||
| "pf2id 8(%2,%0), %%mm1 \n" | |||
| "pf2id (%3,%0), %%mm2 \n" | |||
| "pf2id 8(%3,%0), %%mm3 \n" | |||
| "packssdw %%mm1, %%mm0 \n" | |||
| "packssdw %%mm3, %%mm2 \n" | |||
| "movq %%mm0, %%mm1 \n" | |||
| "punpcklwd %%mm2, %%mm0 \n" | |||
| "punpckhwd %%mm2, %%mm1 \n" | |||
| "movq %%mm0, (%1,%0)\n" | |||
| "movq %%mm1, 8(%1,%0)\n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| "femms \n" | |||
| ) | |||
| FLOAT_TO_INT16_INTERLEAVE(sse, | |||
| "1: \n" | |||
| "cvtps2pi (%2,%0), %%mm0 \n" | |||
| "cvtps2pi 8(%2,%0), %%mm1 \n" | |||
| "cvtps2pi (%3,%0), %%mm2 \n" | |||
| "cvtps2pi 8(%3,%0), %%mm3 \n" | |||
| "packssdw %%mm1, %%mm0 \n" | |||
| "packssdw %%mm3, %%mm2 \n" | |||
| "movq %%mm0, %%mm1 \n" | |||
| "punpcklwd %%mm2, %%mm0 \n" | |||
| "punpckhwd %%mm2, %%mm1 \n" | |||
| "movq %%mm0, (%1,%0)\n" | |||
| "movq %%mm1, 8(%1,%0)\n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| "emms \n" | |||
| ) | |||
| FLOAT_TO_INT16_INTERLEAVE(sse2, | |||
| "1: \n" | |||
| "cvtps2dq (%2,%0), %%xmm0 \n" | |||
| "cvtps2dq (%3,%0), %%xmm1 \n" | |||
| "packssdw %%xmm1, %%xmm0 \n" | |||
| "movhlps %%xmm0, %%xmm1 \n" | |||
| "punpcklwd %%xmm1, %%xmm0 \n" | |||
| "movdqa %%xmm0, (%1,%0) \n" | |||
| "add $16, %0 \n" | |||
| "js 1b \n" | |||
| ) | |||
| static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ | |||
| if(channels==6) | |||
| ff_float_to_int16_interleave6_3dn2(dst, src, len); | |||
| else | |||
| float_to_int16_interleave_3dnow(dst, src, len, channels); | |||
| } | |||
| void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) | |||
| { | |||
| int mm_flags = av_get_cpu_flags(); | |||
| if (mm_flags & AV_CPU_FLAG_MMX) { | |||
| if(mm_flags & AV_CPU_FLAG_3DNOW){ | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
| c->float_to_int16 = float_to_int16_3dnow; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |||
| } | |||
| } | |||
| if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ | |||
| if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
| c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |||
| } | |||
| } | |||
| if(mm_flags & AV_CPU_FLAG_SSE){ | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |||
| c->float_to_int16 = float_to_int16_sse; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_sse; | |||
| } | |||
| if(mm_flags & AV_CPU_FLAG_SSE2){ | |||
| c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |||
| c->float_to_int16 = float_to_int16_sse2; | |||
| c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |||
| } | |||
| } | |||
| } | |||