Separate format conversion DSP functions from DSPContext.

This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil.

Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672)
15 years ago · fe2ff6d247
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -12,6 +12,7 @@ OBJS = allcodecs.o                                                      \
       bitstream_filter.o                                               \
       dsputil.o                                                        \
       faanidct.o                                                       \
       fmtconvert.o                                                     \
       imgconvert.o                                                     \
       jrevdct.o                                                        \
       opt.o                                                            \
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -35,6 +35,7 @@
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
 #include "fmtconvert.h"

 #include <stdint.h>

@@ -268,6 +269,7 @@ typedef struct {
    FFTContext mdct;
    FFTContext mdct_small;
    DSPContext dsp;
    FmtConvertContext fmt_conv;
    int random_state;
    /** @} */

--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -85,6 +85,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"
 #include "lpc.h"

 #include "aac.h"
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
    ff_aac_sbr_init();

    dsputil_init(&ac->dsp, avctx);
    ff_fmt_convert_init(&ac->fmt_conv, avctx);

    ac->random_state = 0x1f2e3d4c;

@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
    *data_size = data_size_tmp;

    if (samples)
        ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
        ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);

    if (ac->output_configured)
        ac->output_configured = OC_LOCKED;
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
    ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
    ff_kbd_window_init(s->window, 5.0, 256);
    dsputil_init(&s->dsp, avctx);
    ff_fmt_convert_init(&s->fmt_conv, avctx);
    av_lfg_init(&s->dith_state, 0);

    /* set scale value for float to int16 conversion */
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
        } else {
            gain *= s->dynamic_range[0];
        }
        s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
        s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
    }

    /* apply spectral extension to high frequency bins */
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
            av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
            err = 1;
        }
        s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
        s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
        out_samples += 256 * s->out_channels;
    }
    *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -55,6 +55,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"

 /* override ac3.h to include coupling channel */
 #undef AC3_MAX_CHANNELS
@@ -190,6 +191,7 @@ typedef struct {

 ///@defgroup opt optimization
    DSPContext dsp;                         ///< for optimization
    FmtConvertContext fmt_conv;             ///< optimized conversion functions
    float mul_bias;                         ///< scaling for float_to_int16 conversion
 ///@}

--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS                                   += arm/dsputil_init_arm.o        \
                                          arm/dsputil_arm.o             \
                                          arm/fft_init_arm.o            \
                                          arm/fmtconvert_init_arm.o     \
                                          arm/jrevdct_arm.o             \
                                          arm/mpegvideo_arm.o           \
                                          arm/simple_idct_arm.o         \
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                          arm/dsputil_armv6.o           \
                                          arm/simple_idct_armv6.o       \

 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \

 OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
                                          arm/dsputil_init_vfp.o        \
                                          $(VFP-OBJS-yes)

 OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
                                          arm/mpegvideo_iwmmxt.o        \
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o            \

 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                          arm/dsputil_neon.o            \
                                          arm/fmtconvert_neon.o         \
                                          arm/int_neon.o                \
                                          arm/mpegvideo_neon.o          \
                                          arm/simple_idct_neon.o        \
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
                              int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
 void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
                                        float mul, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
                                 const float *src1, int len);
 void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,

 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                          int len);
 void ff_float_to_int16_neon(int16_t *, const float *, long);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);

 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);

@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
    c->butterflies_float          = ff_butterflies_float_neon;
    c->scalarproduct_float        = ff_scalarproduct_float_neon;
    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
    c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
    c->vector_fmul_add            = ff_vector_fmul_add_neon;
    c->vector_clipf               = ff_vector_clipf_neon;
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = ff_float_to_int16_neon;
        c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
    }

    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;

--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
                        const float *src1, int len);
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
                                const float *src1, int len);
 void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);

 /**
  * Install the ARM VFP implementations into a DSPContext.
  *
  * Only the function pointers assigned below are changed; every other
  * member of c is left as the caller set it up.
  *
  * @param c     context whose function pointers are overwritten
  * @param avctx codec context; not read by this function
  */
 void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
 {
    c->vector_fmul = ff_vector_fmul_vfp;
    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
 #if HAVE_ARMV6
    /* The VFP float-to-int16 routine is itself only assembled when
     * HAVE_ARMV6 is set, so it may only be referenced under that guard. */
    c->float_to_int16 = ff_float_to_int16_vfp;
 #endif
 }
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
        bx              lr
 endfunc

 @ void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 @
 @ NEON float to int16 conversion.  Going by the C prototype and the register
 @ usage below: r0 = dst, r1 = src, r2 = len.
 @
 @ Each float is converted to signed fixed point with 16 fractional bits
 @ (vcvt with immediate 16) and then only the upper halfword of every 32-bit
 @ result is kept (vshrn by 16).
 @ NOTE(review): presumably this relies on the saturating behaviour of vcvt to
 @ clamp out-of-range input to the int16 limits -- confirm in the ARM ARM.
 @
 @ All loads and stores carry a :128 alignment qualifier, so src and dst are
 @ expected to be 16-byte aligned.  Eight samples are loaded unconditionally
 @ before any length test and the rest is handled in groups of 16 or 8.
 @ NOTE(review): this looks like it requires len to be a non-zero multiple
 @ of 8 -- confirm against the callers.
 function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        @ flags are still those of the subs above: len == 8 means only the
        @ 8 samples already converted in q8/q9 have to be stored
        beq             3f
        @ ip = (len - 8) with the low four bits cleared, i.e. the number of
        @ samples handled by the 16-per-iteration loop
        bics            ip,  r2,  #15
        beq             2f
        @ main loop: narrows and stores 16 samples per pass while loading and
        @ converting data for the next pass
 1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        @ low four bits of (len - 8): zero means only the pending 8 samples
        @ in q8/q9 remain, otherwise 8 more have to be loaded as well
        ands            r2,  r2,  #15
        beq             3f
        @ tail for 16 outstanding samples: 8 pending in q8/q9 plus 8 new ones
 2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
        @ tail for the 8 samples already converted in q8/q9
 3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
 endfunc

 @ void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
 @                                        long len, int channels)
 @
 @ NEON float to interleaved int16 conversion.  Going by the C prototype and
 @ the register usage below: r0 = dst, r1 = array of per-channel source
 @ pointers, r2 = samples per channel, r3 = channel count.
 @
 @ Conversion is done as in ff_float_to_int16_neon: vcvt to fixed point with
 @ 16 fractional bits, then the upper halfword of each 32-bit result is used.
 @ Where two channels are combined, vsri.32 by 16 shifts the first channel
 @ down and inserts it below the upper halfword of the second, which gives
 @ one 32-bit word holding a sample pair.
 @
 @ Dispatch on the channel count:
 @   below 2   tail call to ff_float_to_int16_neon on the first channel
 @   exactly 2 dedicated stereo path with contiguous 128-bit stores
 @   otherwise generic path that consumes channels in groups of 4, then 2,
 @             then 1, writing with a stride of channels * 2 bytes
 @
 @ Source loads use a :128 alignment qualifier throughout.
 @ NOTE(review): every path loads at least 8 samples before testing the
 @ length and counts down in steps of 8 or 16, so len presumably has to be a
 @ non-zero multiple of 8 -- confirm against the callers.
 @
 @ Numeric local labels (4, 6, 7, 8) are reused several times below.  A
 @ reference such as 4f or 6b always means the nearest definition in the
 @ given direction.
 function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        @ stereo: r3 = first channel, r1 = second channel
        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        @ flags are still those of the subs above: len == 8 goes straight to
        @ the short tail at 3
        beq             3f
        @ ip = (len - 8) with the low four bits cleared, the amount handled
        @ by the 16-per-iteration loop
        bics            ip,  r2,  #15
        beq             2f
        @ stereo main loop: 16 sample pairs per pass
 1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
        @ stereo tail for 16 outstanding sample pairs: 8 pending in
        @ q8-q11 plus 8 new ones
 2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
        @ stereo tail for the 8 sample pairs already converted in q8-q11
 3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

        @ generic path for other channel counts.  ip = channels * 2 is the
        @ byte distance between consecutive output frames.  With fewer than
        @ 4 channels the 4-channel section is skipped.
 4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
        @ r4-r7 = next four source pointers, lr = sample countdown,
        @ r8 = output cursor for this group of channels
 5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        @ each pass covers 8 samples of all four channels: channel pairs are
        @ merged with vsri, vzip.32 then brings the two pairs of one frame
        @ together so that each 64-bit store writes one 4-channel frame
 6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
        @ last four frames of this channel group, then account for the four
        @ channels done: return if none are left, otherwise step dst past the
        @ four int16 columns just written and repeat while 4 or more remain
 7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
        @ skipped when fewer than 2 channels remain.  r4/r5 = the two source
        @ pointers, lr = sample countdown, r8 = output cursor.  The tst
        @ checks bit 3 of len: when set, one group of 8 frames is written
        @ first so that the loop at 6 can work in steps of 16.
 4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        @ 2-channel main loop: 16 frames per pass, one 32-bit store per frame
 6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
        @ final 8 frames of the 2-channel loop (already merged in d2/d3/d6/d7)
 6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
        @ len == 8 case of the 2-channel section: merge and store the only
        @ 8 frames
 7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        @ two channels done: step dst past their two int16 columns and
        @ return unless one more channel is left
 8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
        @ r4 = source pointer, lr = sample countdown, r5 = output cursor.
        @ The odd 16-bit lanes (d0[1], d0[3], ...) are the upper halfwords of
        @ the converted 32-bit values.  Each one is stored on its own with a
        @ stride of ip bytes.  As in the 2-channel section, bit 3 of len
        @ selects an initial group of 8 (label 8) before the 16-per-pass loop.
 4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
 6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        @ flags still come from the subs at 6: skip the preload for the next
        @ pass when no samples remain
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
 7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
        @ initial group of 8 samples when bit 3 of len is set.  Returns if
        @ that was everything, otherwise preloads and joins the loop at 6.
 8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
 endfunc

 function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
@@ -1050,34 +713,6 @@ NOVFP   vmov.32         r0,  d0[0]
        bx              lr
 endfunc

 @ void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
 @                                         float mul, int len)
 @
 @ Convert 32-bit integers to float and multiply each by the scalar mul.
 @ r0 = dst and r1 = src.  Both are accessed with a :128 alignment qualifier,
 @ so they are expected to be 16-byte aligned.
 @
 @ The location of mul and len depends on the VFP / NOVFP line prefixes: in
 @ the VFP variant mul is taken from d0[0] and len from r2, in the NOVFP
 @ variant mul is taken from r2 and len from r3.
 @ NOTE(review): the prefixes are defined outside this view, presumably in
 @ asm.S to select hard-float versus soft-float argument passing -- confirm.
 @
 @ Eight values are loaded before the length is tested and the counter is
 @ decremented by 8 per pass.
 @ NOTE(review): len therefore looks like it must be a non-zero multiple
 @ of 8 -- confirm against the callers.
 function ff_int32_to_float_fmul_scalar_neon, export=1
 VFP     vdup.32         q0,  d0[0]
 VFP     len     .req    r2
 NOVFP   vdup.32         q0,  r2
 NOVFP   len     .req    r3

        @ prime the pipeline with the first 8 converted values in q3 and q8
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        @ each pass multiplies the 8 pending values and, unless they were the
        @ last ones, loads and converts the next 8 before storing the products
 1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
        @ store the final 8 products and return
 2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq  len
 endfunc

 function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
        vpop            {d8-d15}
        bx              lr
 endfunc

 #if HAVE_ARMV6
 /**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is multiple of 8, destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little-endian byte order.
 *
 * Register usage as seen in the code: r0 = dst, r1 = src, r2 = len.
 * Eight floats are converted per pass: vcvt produces 32-bit integers, ssat
 * clamps each one to the signed 16-bit range and pkhbt packs two clamped
 * values into one word with the lower-index sample in the bottom halfword.
 * Conversion of the next eight values is interleaved with the packing of the
 * current eight.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
 function ff_float_to_int16_vfp, export=1
        @ r4-r8 and d8-d11 (s16-s23) are used as scratch below, so they are
        @ saved here and restored before returning
        push            {r4-r8,lr}
        vpush           {d8-d11}
        @ prime the pipeline: load and convert the first 8 floats
        vldmia          r1!, {s16-s23}
        vcvt.s32.f32    s0,  s16
        vcvt.s32.f32    s1,  s17
        vcvt.s32.f32    s2,  s18
        vcvt.s32.f32    s3,  s19
        vcvt.s32.f32    s4,  s20
        vcvt.s32.f32    s5,  s21
        vcvt.s32.f32    s6,  s22
        vcvt.s32.f32    s7,  s23
 1:
        @ the flags set by this subs drive every gt-conditional instruction
        @ in the pass as well as the loop branch at the end
        subs            r2,  r2,  #8
        @ move the 8 converted integers into core registers
        vmov            r3,  r4,  s0, s1
        vmov            r5,  r6,  s2, s3
        vmov            r7,  r8,  s4, s5
        vmov            ip,  lr,  s6, s7
        @ fetch the next 8 floats only when samples remain
        vldmiagt        r1!, {s16-s23}
        ssat            r4,  #16, r4
        ssat            r3,  #16, r3
        ssat            r6,  #16, r6
        ssat            r5,  #16, r5
        pkhbt           r3,  r3,  r4, lsl #16
        pkhbt           r4,  r5,  r6, lsl #16
        vcvtgt.s32.f32  s0,  s16
        vcvtgt.s32.f32  s1,  s17
        vcvtgt.s32.f32  s2,  s18
        vcvtgt.s32.f32  s3,  s19
        vcvtgt.s32.f32  s4,  s20
        vcvtgt.s32.f32  s5,  s21
        vcvtgt.s32.f32  s6,  s22
        vcvtgt.s32.f32  s7,  s23
        ssat            r8,  #16, r8
        ssat            r7,  #16, r7
        ssat            lr,  #16, lr
        ssat            ip,  #16, ip
        pkhbt           r5,  r7,  r8, lsl #16
        pkhbt           r6,  ip,  lr, lsl #16
        @ four packed words = 8 int16 samples
        stmia           r0!, {r3-r6}
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
 endfunc
 #endif
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -0,0 +1,48 @@
 /*
 * ARM optimized Format Conversion Utils
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "libavcodec/avcodec.h"
 #include "libavcodec/fmtconvert.h"

 void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
                                        float mul, int len);

 void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);

 void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);

 /**
  * Install ARM-optimised format conversion routines into a FmtConvertContext.
  *
  * An entry is only overwritten when the matching HAVE_* build option is
  * enabled; all other members of c are left untouched.
  *
  * @param c     context whose function pointers are updated
  * @param avctx codec context; only its flags are read, and only to test
  *              CODEC_FLAG_BITEXACT when NEON is enabled
  */
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
 {
    if (HAVE_ARMVFP && HAVE_ARMV6)
        c->float_to_int16 = ff_float_to_int16_vfp;

    if (HAVE_NEON) {
        const int want_bitexact = avctx->flags & CODEC_FLAG_BITEXACT;

        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;

        /* The NEON float-to-int16 routines are not installed in bitexact
         * mode; otherwise they replace the VFP version chosen above. */
        if (!want_bitexact) {
            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
            c->float_to_int16            = ff_float_to_int16_neon;
        }
    }
 }
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -0,0 +1,391 @@
 /*
 * ARM NEON optimised Format Conversion Utils
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "config.h"
 #include "asm.S"

        preserve8
        .text

 /*
 * ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 * (prototype as declared in fmtconvert_init_arm.c)
 *
 * Convert len floats at src to int16_t at dst.  Going by that prototype and
 * the standard ARM argument registers: r0 = dst, r1 = src, r2 = len.
 *
 * Each float is converted to signed fixed point with 16 fraction bits
 * (vcvt.s32.f32 ..., #16) and then narrowed with vshrn.s32 #16, which keeps
 * the upper 16 bits of every 32-bit lane.
 *
 * The first 8 floats are loaded unconditionally and every path handles whole
 * groups of 8, so len is expected to be a non-zero multiple of 8.  All loads
 * and stores carry a :128 alignment qualifier, i.e. src and dst are expected
 * to be 16-byte aligned.  Only q0-q3 and q8-q9 are used.
 */
 function ff_float_to_int16_neon, export=1
        /* r2 = len - 8; the flags set here are tested by "beq 3f" below */
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        /* ip = (len - 8) rounded down to a multiple of 16: main loop count */
        bics            ip,  r2,  #15
        beq             2f
        /*
         * Main loop, 16 samples per iteration.  q8/q9 enter holding 8
         * already converted samples; the loop stores those plus 8 more and
         * converts the following 8 into q8/q9 for the next round or the tail.
         */
 1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        /* (len - 8) & 15 == 0: only the 8 samples in q8/q9 remain (3:),
         * otherwise 16 samples remain (2:) */
        ands            r2,  r2,  #15
        beq             3f
        /* tail of 16: the 8 samples pending in q8/q9 plus 8 freshly loaded */
 2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
        /* tail of 8: narrow and store the samples pending in q8/q9 */
 3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
 endfunc

 /*
 * ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
 *                                   long len, int channels)
 * (prototype as declared in fmtconvert_init_arm.c)
 *
 * Going by that prototype and the standard ARM argument registers:
 * r0 = dst, r1 = src (array of per-channel pointers), r2 = len,
 * r3 = channels.
 *
 * Samples are converted with vcvt.s32.f32 using 16 fraction bits, after
 * which the upper 16 bits of each 32-bit lane are the int16 result.
 *
 * Dispatch on the channel count:
 *   channels <  2: tail-call ff_float_to_int16_neon on src[0]
 *   channels == 2: dedicated stereo path with contiguous 128-bit stores
 *   channels >  2: generic path that peels off groups of 4, then 2, then 1
 *                  channel(s), writing with a stride of 2 * channels bytes
 *
 * Every path consumes whole groups of 8 samples per channel and uses :128
 * alignment qualifiers on the float loads, so len is expected to be a
 * non-zero multiple of 8 and the channel buffers 16-byte aligned.
 */
 function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        /* stereo: r3 = src[0], r1 = src[1] */
        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        /*
         * vsri.32 qB, qA, #16 below inserts the upper half of each qA lane
         * (src[0]) into the lower half of the matching qB lane while keeping
         * qB's upper half (src[1]), giving one interleaved sample pair per
         * 32-bit lane.  Loop structure mirrors ff_float_to_int16_neon:
         * r2 = len - 8, ip = main loop count, 16 sample pairs per iteration.
         */
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
 1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
        /* stereo tail of 16 sample pairs: 8 pending plus 8 freshly loaded */
 2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
        /* stereo tail of 8 sample pairs pending in q8-q11 */
 3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

        /*
         * Generic path for more than 2 channels.
         * ip = 2 * channels: byte distance between consecutive sample frames
         * in dst.  r3 counts the channels still to be written and r0 is
         * advanced past each finished channel group.
         */
 4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
        /* r4-r7 = next four channel pointers, lr = samples left,
         * r8 = output cursor for this channel group */
 5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        /*
         * 8 sample frames per iteration.  vsri pairs up two channels per
         * 32-bit lane; vzip.32 then brings both pairs of one frame into a
         * single d register, so each vst1.64 writes the four int16 of one
         * frame before stepping r8 by ip.
         */
 6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
 7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        /* four channels done: return if none remain, otherwise move dst past
         * them (4 * 2 bytes); "bge" tests the cmp, add leaves flags alone */
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
 4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        /* is len an odd multiple of 8?  Tested by "beq 6f" after the
         * preload; if so, one group of 8 frames is written first so that the
         * loop at 6: can work in steps of 16 */
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        /* each vst1.32 lane store writes one 2-channel frame, stride ip */
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        /* 2-channel main loop: 16 frames per iteration */
 6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
        /* last 8 frames of the final 16-frame group */
 6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
        /* len == 8: write the 8 preloaded frames and finish this pair */
 7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        /* two channels done: move dst past them (2 * 2 bytes) and return
         * unless a channel is still left ("popeq" tests the subs) */
 8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
        /*
         * r4 = channel pointer, lr = samples left, r5 = output cursor.
         * The vst1.16 lane stores of dN[1] / dN[3] write the upper halfword
         * of each 32-bit lane, i.e. the int16 result, one sample per store
         * with stride ip.  "tst r2, #8" picks the entry point: an odd
         * multiple of 8 first writes 8 samples at 8:, then joins the
         * 16-samples-per-iteration loop at 6:.
         */
 4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
 6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        /* skip the preload for the next iteration when nothing is left */
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
 7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
 8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
 endfunc

 /*
 * ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
 *                                    float mul, int len)
 * (prototype as declared in fmtconvert_init_arm.c)
 *
 * Convert 32-bit integers at src to float, multiply each by mul and store
 * the products at dst, 8 elements per loop iteration.  r0 = dst and
 * r1 = src, going by the prototype and the standard ARM argument registers.
 *
 * NOTE(review): the VFP / NOVFP line prefixes are not defined in this file
 * (presumably they come from asm.S -- confirm).  They select where mul and
 * len are read from: d0[0] and r2 on the VFP lines, r2 and r3 on the NOVFP
 * lines.
 *
 * The first 8 integers are loaded before the loop and the loop only exits
 * when the counter reaches exactly zero, so len is expected to be a
 * non-zero multiple of 8.  Loads and stores use :128 alignment qualifiers,
 * i.e. src and dst are expected to be 16-byte aligned.
 */
 function ff_int32_to_float_fmul_scalar_neon, export=1
        /* broadcast mul to all four lanes of q0 */
 VFP     vdup.32         q0,  d0[0]
 VFP     len     .req    r2
 NOVFP   vdup.32         q0,  r2
 NOVFP   len     .req    r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        /* q3/q8 hold 8 converted inputs on entry to every iteration */
 1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        /* more to do: fetch and convert the next 8 before storing q9/q10 */
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
        /* last group: store the final 8 products and return */
 2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq  len
 endfunc
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -0,0 +1,77 @@
 /*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "config.h"
 #include "asm.S"

        .syntax unified

 /**
 * ARM VFP optimized float to int16 conversion.
 * Assumes that len is positive and a multiple of 8, that the destination
 * buffer is at least 4-byte aligned (8-byte alignment is better for
 * performance), and that the byte order is little-endian.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
 /*
 * Register use, going by the C prototype in fmtconvert_init_arm.c and the
 * standard ARM argument registers: r0 = dst, r1 = src, r2 = len.
 *
 * Eight floats are handled per iteration: they are converted to 32-bit
 * integers in s0-s7, moved to core registers, saturated to 16 bits with
 * ssat, packed pairwise with pkhbt and stored as four words.  While one
 * group is being saturated and packed, the gt-conditional instructions
 * already load and convert the next group.
 */
 function ff_float_to_int16_vfp, export=1
        /* r4-r8/lr and d8-d11 (s16-s23) are overwritten below, so they are
         * saved here and restored before returning */
        push            {r4-r8,lr}
        vpush           {d8-d11}
        /* load and convert the first 8 floats */
        vldmia          r1!, {s16-s23}
        vcvt.s32.f32    s0,  s16
        vcvt.s32.f32    s1,  s17
        vcvt.s32.f32    s2,  s18
        vcvt.s32.f32    s3,  s19
        vcvt.s32.f32    s4,  s20
        vcvt.s32.f32    s5,  s21
        vcvt.s32.f32    s6,  s22
        vcvt.s32.f32    s7,  s23
 1:
        /* r2 = samples left after this group; "gt" below means more follow */
        subs            r2,  r2,  #8
        vmov            r3,  r4,  s0, s1
        vmov            r5,  r6,  s2, s3
        vmov            r7,  r8,  s4, s5
        vmov            ip,  lr,  s6, s7
        vldmiagt        r1!, {s16-s23}
        /* clamp to the signed 16-bit range, then pack two samples per word
         * with the earlier sample in the low halfword */
        ssat            r4,  #16, r4
        ssat            r3,  #16, r3
        ssat            r6,  #16, r6
        ssat            r5,  #16, r5
        pkhbt           r3,  r3,  r4, lsl #16
        pkhbt           r4,  r5,  r6, lsl #16
        vcvtgt.s32.f32  s0,  s16
        vcvtgt.s32.f32  s1,  s17
        vcvtgt.s32.f32  s2,  s18
        vcvtgt.s32.f32  s3,  s19
        vcvtgt.s32.f32  s4,  s20
        vcvtgt.s32.f32  s5,  s21
        vcvtgt.s32.f32  s6,  s22
        vcvtgt.s32.f32  s7,  s23
        ssat            r8,  #16, r8
        ssat            r7,  #16, r7
        ssat            lr,  #16, lr
        ssat            ip,  #16, ip
        pkhbt           r5,  r7,  r8, lsl #16
        pkhbt           r6,  ip,  lr, lsl #16
        /* four words = eight int16 samples */
        stmia           r0!, {r3-r6}
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
 endfunc
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"

 extern const uint16_t ff_wma_critical_freqs[25];

@@ -43,6 +44,7 @@ typedef struct {
    AVCodecContext *avctx;
    GetBitContext gb;
    DSPContext dsp;
    FmtConvertContext fmt_conv;
    int first;
    int channels;
    int frame_len;          ///< transform size (samples)
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)

    s->avctx = avctx;
    dsputil_init(&s->dsp, avctx);
    ff_fmt_convert_init(&s->fmt_conv, avctx);

    /* determine frame length */
    if (avctx->sample_rate < 22050) {
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
            ff_rdft_calc(&s->trans.rdft, coeffs);
    }

    s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
    s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
                                          s->frame_len, s->channels);

    if (!s->first) {
        int count = s->overlap_len * s->channels;
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -40,6 +40,7 @@
 #include "dca.h"
 #include "synth_filter.h"
 #include "dcadsp.h"
 #include "fmtconvert.h"

 //#define TRACE

@@ -347,6 +348,7 @@ typedef struct {
    FFTContext imdct;
    SynthFilterContext synth;
    DCADSPContext dcadsp;
    FmtConvertContext fmt_conv;
 } DCAContext;

 static const uint16_t dca_vlc_offs[] = {
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
                        block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
                }

                s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
                s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
                                                  block, rscale, 8);
            }

@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
            }
        }

        s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
        s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
        samples += 256 * channels;
    }

@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
    ff_mdct_init(&s->imdct, 6, 1, 1.0);
    ff_synth_filter_init(&s->synth);
    ff_dcadsp_init(&s->dcadsp);
    ff_fmt_convert_init(&s->fmt_conv, avctx);

    for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
        s->samples_chanptr[i] = s->samples + i * 256;
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
    return p;
 }

 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
 }

 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
 {
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
    }
 }

 static av_always_inline int float_to_int16_one(const float *src){
    return av_clip_int16(lrintf(*src));
 }

 static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
 }

 static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
 }

 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
 {
    int res = 0;
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -392,7 +392,6 @@ typedef struct DSPContext {
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
    /**
     * Multiply a vector of floats by a scalar float.  Source and
@@ -445,10 +444,6 @@ typedef struct DSPContext {
     */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);

    /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);
    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);

    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -0,0 +1,68 @@
 /*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "avcodec.h"
 #include "fmtconvert.h"

 /**
 * Portable C version: convert each of the len integers in src to float,
 * scale it by mul and write the product to the same index of dst.
 */
 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
 {
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
 }

 /**
 * Round the float at src with lrintf() and clip the result with
 * av_clip_int16().
 */
 static av_always_inline int float_to_int16_one(const float *src)
 {
    const float sample = *src;

    return av_clip_int16(lrintf(sample));
 }

 /**
 * Portable C version of FmtConvertContext.float_to_int16: convert len
 * floats from src, one by one, into dst.
 */
 static void float_to_int16_c(int16_t *dst, const float *src, long len)
 {
    int n = 0;

    while (n < len) {
        dst[n] = float_to_int16_one(&src[n]);
        n++;
    }
 }

 /**
 * Portable C version of FmtConvertContext.float_to_int16_interleave.
 *
 * Sample n of channel ch is written to dst[n * channels + ch].  Two
 * channels are handled frame by frame; any other channel count is
 * processed one whole channel at a time.
 */
 static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                         long len, int channels)
 {
    int ch, n;

    if (channels != 2) {
        /* generic case: walk each channel and scatter it into dst */
        for (ch = 0; ch < channels; ch++) {
            for (n = 0; n < len; n++)
                dst[n * channels + ch] = float_to_int16_one(&src[ch][n]);
        }
        return;
    }

    /* stereo: emit one left/right pair per iteration */
    for (n = 0; n < len; n++) {
        int16_t *frame = &dst[2 * n];

        frame[0] = float_to_int16_one(&src[0][n]);
        frame[1] = float_to_int16_one(&src[1][n]);
    }
 }

 /**
 * Fill c with the portable C conversion routines, then let the
 * architecture-specific initialisers replace them where applicable.
 */
 av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
 {
    /* portable defaults */
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_to_int16             = float_to_int16_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;

    /* architecture-specific replacements, chosen by build-time constants */
    if (ARCH_ARM) {
        ff_fmt_convert_init_arm(c, avctx);
    }
    if (ARCH_PPC) {
        ff_fmt_convert_init_ppc(c, avctx);
    }
    if (HAVE_MMX) {
        ff_fmt_convert_init_x86(c, avctx);
    }
 }
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -0,0 +1,79 @@
 /*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_FMTCONVERT_H
 #define AVCODEC_FMTCONVERT_H

 #include "avcodec.h"

 /**
 * Table of sample format conversion routines.
 *
 * ff_fmt_convert_init() fills in every pointer; the per-architecture
 * initialisers declared below may then replace individual entries.
 */
 typedef struct FmtConvertContext {
    /**
     * Convert an array of int32_t to float and multiply by a float value.
     * @param dst destination array of float.
     *            constraints: 16-byte aligned
     * @param src source array of int32_t.
     *            constraints: 16-byte aligned
     * @param mul value each converted element is multiplied by.
     * @param len number of elements to convert.
     *            constraints: multiple of 8
     */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);

    /**
     * Convert an array of float to an array of int16_t.
     *
     * Convert floats in the range [-32768.0,32767.0] to ints
     * without rescaling.
     *
     * @param dst destination array of int16_t.
     *            constraints: 16-byte aligned
     * @param src source array of float.
     *            constraints: 16-byte aligned
     * @param len number of elements to convert.
     *            constraints: multiple of 8
     */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);

    /**
     * Convert multiple arrays of float to an interleaved array of int16_t.
     *
     * Convert floats in the range [-32768.0,32767.0] to ints
     * without rescaling.
     *
     * @param dst destination array of interleaved int16_t.
     *            constraints: 16-byte aligned
     * @param src source array of float arrays, one for each channel.
     *            constraints: 16-byte aligned
     * @param len number of elements to convert.
     *            constraints: multiple of 8
     * @param channels number of channels
     */
    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                      long len, int channels);
 } FmtConvertContext;

 void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);

 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);

 #endif /* AVCODEC_FMTCONVERT_H */
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -38,6 +38,7 @@
 #include "avcodec.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"

 #define ALT_BITSTREAM_READER_LE
 #include "get_bits.h"
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
    float           scale_bias;
    DSPContext      dsp;
    FFTContext      imdct_ctx;
    FmtConvertContext fmt_conv;
    DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
 } NellyMoserDecodeContext;

@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
    ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);

    dsputil_init(&s->dsp, avctx);
    ff_fmt_convert_init(&s->fmt_conv, avctx);

    s->scale_bias = 1.0/(1*8);

@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,

    for (i=0 ; i<blocks ; i++) {
        nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
        s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
        s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
        *data_size += NELLY_SAMPLES*sizeof(int16_t);
    }

--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT)             += ppc/fft_altivec.o             \
 OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o         \
                                          ppc/fdct_altivec.o            \
                                          ppc/float_altivec.o           \
                                          ppc/fmtconvert_altivec.o      \
                                          ppc/gmc_altivec.o             \
                                          ppc/idct_altivec.o            \
                                          ppc/int_altivec.o             \
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
    }
 }

 static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
 {
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
 }


 static vector signed short
 float_to_int16_one_altivec(const float *src)
 {
    vector float s0 = vec_ld(0, src);
    vector float s1 = vec_ld(16, src);
    vector signed int t0 = vec_cts(s0, 0);
    vector signed int t1 = vec_cts(s1, 0);
    return vec_packs(t0,t1);
 }

 static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
 {
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld(0, dst+i);
        d = float_to_int16_one_altivec(src+i);
        d1 = vec_ld(15, dst+i);
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);
        d1 = vec_perm(d, d1, align);
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    }
    else
    for(i=0; i<len-7; i+=8) {
        d = float_to_int16_one_altivec(src+i);
        vec_st(d, 0, dst+i);
    }
 }

 static void
 float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
 {
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else
        if (channels == 2) {
        if(((long)dst)&15)
        for(i=0; i<len-7; i+=8) {
            d0 = vec_ld(0, dst + i);
            t0 = float_to_int16_one_altivec(src[0] + i);
            d1 = vec_ld(31, dst + i);
            t1 = float_to_int16_one_altivec(src[1] + i);
            c0 = vec_mergeh(t0, t1);
            c1 = vec_mergel(t0, t1);
            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
            align = vec_lvsr(0, dst + i);
            d0 = vec_perm(d2, c0, align);
            d1 = vec_perm(c0, c1, align);
            vec_st(d0,  0, dst + i);
            d0 = vec_perm(c1, d2, align);
            vec_st(d1, 15, dst + i);
            vec_st(d0, 31, dst + i);
            dst+=8;
        }
        else
        for(i=0; i<len-7; i+=8) {
            t0 = float_to_int16_one_altivec(src[0] + i);
            t1 = float_to_int16_one_altivec(src[1] + i);
            d0 = vec_mergeh(t0, t1);
            d1 = vec_mergel(t0, t1);
            vec_st(d0,  0, dst + i);
            vec_st(d1, 16, dst + i);
            dst+=8;
        }
    } else {
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++) {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j+=channels) {
                dst[j] = tmp[i];
            }
        }
   }
 }

 void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
    c->vector_fmul = vector_fmul_altivec;
    c->vector_fmul_reverse = vector_fmul_reverse_altivec;
    c->vector_fmul_add = vector_fmul_add_altivec;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->vector_fmul_window = vector_fmul_window_altivec;
        c->float_to_int16 = float_to_int16_altivec;
        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
    }
 }
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -0,0 +1,142 @@
 /*
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavcodec/fmtconvert.h"

 #include "dsputil_altivec.h"
 #include "util_altivec.h"

 /**
  * Convert 32-bit integers to float and scale each one by a constant (AltiVec).
  *
  * Eight elements (two 4-lane vectors) are handled per loop iteration, so the
  * loop reads and writes a multiple of 8 elements even when len is not one.
  * NOTE(review): vec_ld/vec_st are used without any realignment, so src and
  * dst are presumably expected to be 16-byte aligned -- confirm with callers.
  *
  * @param dst destination float buffer
  * @param src source integer buffer
  * @param mul factor every converted value is multiplied by
  * @param len number of elements; the loop advances in steps of 8
  */
 static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
 {
    /* union used to move the scalar multiplier into a vector register */
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    /* all-zero bit pattern reinterpreted as float: the addend for vec_madd */
    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    /* replicate element 0 (the multiplier) into all four lanes */
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        /* int -> float conversion with a scale argument of 0 */
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        /* multiply-add with a zero addend, i.e. a plain multiply by mul */
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
 }


 /**
  * Convert the 8 floats starting at src into one vector of 8 signed 16-bit
  * values.
  *
  * Two 4-float vectors are loaded (byte offsets 0 and 16), converted to
  * 32-bit signed integers with vec_cts (scale argument 0) and then narrowed
  * into a single vector with vec_packs.
  * NOTE(review): src is read with plain vec_ld, so it is presumably expected
  * to be 16-byte aligned -- confirm with callers.
  *
  * @param src pointer to at least 8 floats
  * @return vector holding the 8 converted samples
  */
 static vector signed short
 float_to_int16_one_altivec(const float *src)
 {
    vector float s0 = vec_ld(0, src);
    vector float s1 = vec_ld(16, src);
    vector signed int t0 = vec_cts(s0, 0);
    vector signed int t1 = vec_cts(s1, 0);
    /* pack the two int32 vectors into one int16 vector */
    return vec_packs(t0,t1);
 }

 /**
  * Convert a float buffer to signed 16-bit samples (AltiVec).
  *
  * Samples are processed in groups of 8; the loop condition is i < len-7, so
  * a trailing group of fewer than 8 samples is not converted.
  *
  * Two code paths are selected by the alignment of dst:
  *  - dst not 16-byte aligned: each group is merged with the bytes already in
  *    memory on either side of it and written back with two aligned stores;
  *  - dst 16-byte aligned: each group is written with a single store.
  *
  * @param dst destination int16 buffer (any alignment)
  * @param src source float buffer, read through float_to_int16_one_altivec()
  * @param len number of samples
  */
 static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
 {
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        /* aligned block containing the first destination byte */
        d0 = vec_ld(0, dst+i);
        d = float_to_int16_one_altivec(src+i);
        /* aligned block containing the last destination byte */
        d1 = vec_ld(15, dst+i);
        /* gather the existing bytes that lie outside the 16-byte target span */
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        /* low block: preserved leading bytes followed by the start of d */
        d0 = vec_perm(d1, d, align);
        /* high block: the rest of d followed by preserved trailing bytes */
        d1 = vec_perm(d, d1, align);
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    }
    else
    for(i=0; i<len-7; i+=8) {
        d = float_to_int16_one_altivec(src+i);
        vec_st(d, 0, dst+i);
    }
 }

 /**
  * Convert planar float channels to interleaved signed 16-bit samples
  * (AltiVec).
  *
  *  - channels == 1: forwarded to float_to_int16_altivec().
  *  - channels == 2: both channels are converted 8 samples at a time and
  *    interleaved with vec_mergeh/vec_mergel, with separate loops for an
  *    unaligned and a 16-byte aligned dst.
  *  - any other count: each channel is converted into a temporary buffer and
  *    copied into dst with a stride of `channels`.
  *
  * The vector loops use i < len-7, so only whole groups of 8 samples per
  * channel are converted. In the generic path the copy loop still runs over
  * all len entries of tmp, so len is presumably expected to be a multiple
  * of 8 -- TODO confirm with callers.
  *
  * @param dst      interleaved output, len * channels samples
  * @param src      array of `channels` pointers to planar float data
  * @param len      number of samples per channel
  * @param channels number of channels
  */
 static void
 float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                   long len, int channels)
 {
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else
        if (channels == 2) {
        if(((long)dst)&15)
        for(i=0; i<len-7; i+=8) {
            /* existing memory at both ends of the 32-byte output span */
            d0 = vec_ld(0, dst + i);
            t0 = float_to_int16_one_altivec(src[0] + i);
            d1 = vec_ld(31, dst + i);
            t1 = float_to_int16_one_altivec(src[1] + i);
            /* interleave channel 0 / channel 1 samples */
            c0 = vec_mergeh(t0, t1);
            c1 = vec_mergel(t0, t1);
            /* bytes outside the span that the aligned stores must preserve */
            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
            align = vec_lvsr(0, dst + i);
            d0 = vec_perm(d2, c0, align);
            d1 = vec_perm(c0, c1, align);
            vec_st(d0,  0, dst + i);
            d0 = vec_perm(c1, d2, align);
            vec_st(d1, 15, dst + i);
            vec_st(d0, 31, dst + i);
            /* with i += 8 this moves the output position by 16 samples */
            dst+=8;
        }
        else
        for(i=0; i<len-7; i+=8) {
            t0 = float_to_int16_one_altivec(src[0] + i);
            t1 = float_to_int16_one_altivec(src[1] + i);
            d0 = vec_mergeh(t0, t1);
            d1 = vec_mergel(t0, t1);
            vec_st(d0,  0, dst + i);
            vec_st(d1, 16, dst + i);
            /* with i += 8 this moves the output position by 16 samples */
            dst+=8;
        }
    } else {
        /* variable-length array: stack usage grows with len */
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++) {
            float_to_int16_altivec(tmp, src[c], len);
            /* scatter channel c into every channels-th output slot */
            for (i = 0, j = c; i < len; i++, j+=channels) {
                dst[j] = tmp[i];
            }
        }
   }
 }

 /**
  * Install the AltiVec format-conversion routines into a FmtConvertContext.
  *
  * The int32 -> float scaler is always installed. The float -> int16
  * converters are installed only when CODEC_FLAG_BITEXACT is not set in
  * avctx->flags.
  *
  * @param c     context whose function pointers are overridden
  * @param avctx codec context supplying the flags
  */
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
 {
    const int bitexact = avctx->flags & CODEC_FLAG_BITEXACT;

    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    /* leave the float -> int16 pointers untouched in bit-exact mode */
    if (bitexact)
        return;

    c->float_to_int16            = float_to_int16_altivec;
    c->float_to_int16_interleave = float_to_int16_interleave_altivec;
 }
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -31,6 +31,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"

 #include "vorbis.h"
 #include "xiph.h"
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
    AVCodecContext *avccontext;
    GetBitContext gb;
    DSPContext dsp;
    FmtConvertContext fmt_conv;

    FFTContext mdct[2];
    uint_fast8_t  first_frame;
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)

    vc->avccontext = avccontext;
    dsputil_init(&vc->dsp, avccontext);
    ff_fmt_convert_init(&vc->fmt_conv, avccontext);

    vc->scale_bias = 32768.0f;

@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
                              len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
    }

    vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
    vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
                                           vc->audio_channels);
    *data_size = len * 2 * vc->audio_channels;

    return buf_size ;
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
    s->block_align = avctx->block_align;

    dsputil_init(&s->dsp, avctx);
    ff_fmt_convert_init(&s->fmt_conv, avctx);

    if (avctx->codec->id == CODEC_ID_WMAV1) {
        s->version = 1;
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -26,6 +26,7 @@
 #include "put_bits.h"
 #include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"

 /* size of blocks */
 #define BLOCK_MIN_BITS 7
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
    float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
    float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
    DSPContext dsp;
    FmtConvertContext fmt_conv;

 #ifdef TRACE
    int frame_count;
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
    incr = s->nb_channels;
    for (ch = 0; ch < MAX_CHANNELS; ch++)
        output[ch] = s->frame_out[ch];
    s->dsp.float_to_int16_interleave(samples, output, n, incr);
    s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
    for (ch = 0; ch < incr; ch++) {
        /* prepare for next block */
        memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
 MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
 MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
                                          x86/deinterlace.o             \
                                          x86/fmtconvert.o              \
                                          x86/h264_chromamc.o           \
                                          $(YASM-OBJS-yes)

@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
 OBJS-$(HAVE_MMX)                       += x86/dnxhd_mmx.o               \
                                          x86/dsputil_mmx.o             \
                                          x86/fdct_mmx.o                \
                                          x86/fmtconvert_mmx.o          \
                                          x86/idct_mmx_xvid.o           \
                                          x86/idct_sse2_xvid.o          \
                                          x86/motion_est_mmx.o          \
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
 }
 #endif /* HAVE_6REGS */

 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
 {
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        "movlhps  %%xmm1,    %%xmm0 \n"
        "movlhps  %%xmm3,    %%xmm2 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm2 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm2, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
 }

 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
 {
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
 }

 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
                             int len)
 {
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
    );
 }

 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "femms                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }
 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "emms                               \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }

 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
        "packssdw   %%xmm1      , %%xmm0    \n\t"
        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }

 void ff_vp3_idct_mmx(int16_t *input_data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);

 #if !HAVE_YASM
 #define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
 #endif
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
 }\
 \
 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }

 FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
 )

 FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
 )

 FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw   %%xmm1, %%xmm0  \n"
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
 )

 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels==6)
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
 }

 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if(mm_flags & AV_CPU_FLAG_3DNOW){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16 = float_to_int16_3dnow;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
            }
        }
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_3dnow2;
 #endif
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
        }
        if(mm_flags & AV_CPU_FLAG_MMX2){
 #if HAVE_YASM
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_sse;
 #endif
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->vector_clipf = vector_clipf_sse;
            c->float_to_int16 = float_to_int16_sse;
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
 #if HAVE_YASM
            c->scalarproduct_float = ff_scalarproduct_float_sse;
 #endif
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if(mm_flags & AV_CPU_FLAG_3DNOW)
            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->float_to_int16 = float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

 section .text align=16

 %macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
 %endmacro
 %macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
 %endmacro

 %macro FLOAT_TO_INT16_INTERLEAVE6 1
 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
 %ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
 %else
    %define lend dword r2m
 %endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
 .loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

 %define pswapd PSWAPD_SSE
 FLOAT_TO_INT16_INTERLEAVE6 sse
 %define cvtps2pi pf2id
 %define pswapd PSWAPD_3DN1
 FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
 FLOAT_TO_INT16_INTERLEAVE6 3dn2
 %undef cvtps2pi



 %macro SCALARPRODUCT 1
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -0,0 +1,91 @@
 ;******************************************************************************
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "x86inc.asm"

 section .text align=16

 ; PSWAPD_SSE dst, src
 ; Swap the two 32-bit halves of an MMX register using pshufw.
 ; Shuffle control 0x4e selects source words 2,3,0,1.
 %macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
 %endmacro
 ; PSWAPD_3DN1 dst, src
 ; Same dword swap built from three plain MMX instructions.
 ; dst is written before src is read the second time, so dst and src must be
 ; different registers.
 %macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32            ; dst.low = src.high
    punpckldq %1, %2        ; dst.high = src.low
 %endmacro

 ; FLOAT_TO_INT16_INTERLEAVE6 suffix
 ; Emits float_to_int16_interleave6_<suffix>: converts six planar float
 ; channels to interleaved int16. Each loop iteration reads 2 floats (8 bytes)
 ; from every channel and writes 12 int16 values (24 bytes) to dst.
 ; The conversion and dword-swap instructions are the `cvtps2pi` and `pswapd`
 ; tokens, which may be redefined by the caller before expanding the macro.
 ; The loop body runs before len is tested and len drops by 2 per iteration,
 ; so len is presumably expected to be a positive even number -- TODO confirm.
 %macro FLOAT_TO_INT16_INTERLEAVE6 1
 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
 ; len is not loaded by cglobal (only 2 arguments are requested above):
 ; on x86-64 it is copied to r10d before the src1 load below (presumably
 ; because src1 takes over r2 -- confirm against x86inc), otherwise it is
 ; used directly from its stack slot.
 %ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
 %else
    %define lend dword r2m
 %endif
    ; fetch the six channel pointers from the src[] array
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    ; turn channels 1..5 into offsets relative to channel 0 so that a single
    ; increment of srcq advances every channel
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
 .loop:
    ; convert 2 floats from each channel to 2 int32
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    ; narrow to int16 with signed saturation, pairing channels 0/3, 1/4, 2/5
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    ; shuffle the packed words into the output order
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    ; three 8-byte stores = 12 interleaved samples
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

 ; sse: real cvtps2pi, dword swap done with pshufw
 %define pswapd PSWAPD_SSE
 FLOAT_TO_INT16_INTERLEAVE6 sse
 ; 3dnow: pf2id replaces cvtps2pi, dword swap emulated with plain MMX ops
 %define cvtps2pi pf2id
 %define pswapd PSWAPD_3DN1
 FLOAT_TO_INT16_INTERLEAVE6 3dnow
 ; 3dn2: still pf2id, but with the alias removed pswapd assembles as the
 ; instruction of that name
 %undef pswapd
 FLOAT_TO_INT16_INTERLEAVE6 3dn2
 %undef cvtps2pi
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -0,0 +1,266 @@
 /*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

 #include "libavutil/cpu.h"
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"

 /**
 * Convert 32-bit integers to float and multiply each by a constant (SSE).
 *
 * @param dst destination; written with movaps, which requires 16-byte alignment
 * @param src source integers
 * @param mul factor every converted value is multiplied by
 * @param len number of elements; the loop handles 8 per iteration and runs its
 *            body before testing the counter, so a positive multiple of 8 is
 *            expected
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber, and xmm0-xmm4 are not listed as
 * clobbered.
 */
 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
 {
    x86_reg i = -4*len; // negative byte offset from the end of both buffers
    __asm__ volatile(
        // broadcast mul into all four lanes of xmm4
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        // each cvtpi2ps converts two integers into the low half of its register
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        // merge the pairs into two vectors of four floats
        "movlhps  %%xmm1,    %%xmm0 \n"
        "movlhps  %%xmm3,    %%xmm2 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm2 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm2, 16(%1,%0) \n"
        // advance by 8 elements; loop while the offset is still negative
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
 }

 /**
 * Convert 32-bit integers to float and multiply each by a constant (SSE2).
 *
 * @param dst destination; written with movaps, which requires 16-byte alignment
 * @param src source integers; read as 16-byte memory operands of cvtdq2ps,
 *            which likewise requires 16-byte alignment
 * @param mul factor every converted value is multiplied by
 * @param len number of elements; the loop handles 8 per iteration and runs its
 *            body before testing the counter, so a positive multiple of 8 is
 *            expected
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber, and xmm0, xmm1 and xmm4 are not listed
 * as clobbered.
 */
 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
 {
    x86_reg i = -4*len; // negative byte offset from the end of both buffers
    __asm__ volatile(
        // broadcast mul into all four lanes of xmm4
        "movss  %3, %%xmm4 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        // four integers converted per instruction
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        // advance by 8 elements; loop while the offset is still negative
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
 }

 /**
 * Convert floats to int16 with signed saturation using 3DNow! pf2id.
 *
 * @param dst destination samples
 * @param src source floats
 * @param len number of samples; 8 are converted per iteration and the loop
 *            body runs before the counter is tested, so a positive multiple of
 *            8 is expected
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber, and mm0-mm3 are not listed as
 * clobbered.
 */
 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        // %0 = 2*len (size of dst in bytes); move %2 and %1 to the ends of
        // src and dst, then count %0 up from -2*len to 0
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        // src is indexed with scale 2 because a float is twice as wide as an int16
        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "femms                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }

 /**
 * Convert floats to int16 with signed saturation using SSE cvtps2pi and MMX
 * registers.
 *
 * @param dst destination samples
 * @param src source floats
 * @param len number of samples; 8 are converted per iteration and the loop
 *            body runs before the counter is tested, so a positive multiple of
 *            8 is expected
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber, and mm0-mm3 are not listed as
 * clobbered.
 */
 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        // %0 = 2*len (size of dst in bytes); move %2 and %1 to the ends of
        // src and dst, then count %0 up from -2*len to 0
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        // src is indexed with scale 2 because a float is twice as wide as an int16
        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       ,  (%1,%0)  \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "emms                               \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }

 /**
 * Convert floats to int16 with signed saturation using SSE2.
 *
 * @param dst destination samples; written with movdqa, which requires 16-byte
 *            alignment
 * @param src source floats; read as 16-byte memory operands of cvtps2dq, which
 *            likewise requires 16-byte alignment
 * @param len number of samples; 8 are converted per iteration and the loop
 *            body runs before the counter is tested, so a positive multiple of
 *            8 is expected
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber, and xmm0/xmm1 are not listed as
 * clobbered.
 */
 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        // %0 = 2*len (size of dst in bytes); move %2 and %1 to the ends of
        // src and dst, then count %0 up from -2*len to 0
        "add        %0          , %0        \n\t"
        "lea         (%2,%0,2)  , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        // src is indexed with scale 2 because a float is twice as wide as an int16
        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
        "packssdw   %%xmm1      , %%xmm0    \n\t"
        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
 }

 /* Six-channel float -> interleaved int16 converters.
 * NOTE(review): presumably implemented in the accompanying yasm source --
 * confirm against the build. */
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

 /* Without yasm, route the six-channel names to the generic per-channel
 * interleaver with channels fixed to 6; the 3dn2 name reuses the 3dnow one. */
 #if !HAVE_YASM
 #define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
 #define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
 #endif
 /* There is no SSE2-specific six-channel routine; the sse2 name aliases the SSE one. */
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

 /*
 * FLOAT_TO_INT16_INTERLEAVE(cpu, body) defines two functions for one
 * instruction-set variant:
 *
 * float_to_int16_interleave_misc_<cpu>(dst, src, len, channels)
 *     Generic path. Each channel is converted with float_to_int16_<cpu>() into
 *     the temporary array tmp (len elements, declared per call), then copied
 *     to dst with a stride of channels.
 *
 * float_to_int16_interleave_<cpu>(dst, src, len, channels)
 *     Dispatcher: 1 channel calls float_to_int16_<cpu>() directly, 2 channels
 *     run the inline asm below, 6 channels call
 *     ff_float_to_int16_interleave6_<cpu>(), anything else uses the generic
 *     path.
 *
 * For the two-channel asm, the prologue scales len to 4*len bytes, advances
 * dst (%1), src[0] (%2) and src[1] (%3) by that amount so they point at the
 * ends of their buffers, and negates the counter (%0). body is the loop, given
 * as a string literal, and is expected to index the three pointers with %0
 * and count %0 up to zero.
 *
 * NOTE(review): the asm statement stores through dst but declares neither a
 * memory output nor a "memory" clobber.
 */
 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
 }\
 \
 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }

 /* 3DNow! instantiation. The two-channel loop converts four samples of each
 * channel per iteration with pf2id, saturates them to int16 and stores eight
 * interleaved values (16 bytes). %0 is the negative byte offset, %1 the end of
 * dst, %2 and %3 the ends of channels 0 and 1. */
 FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    /* mm0 = four int16 of channel 0, mm2 = four int16 of channel 1 */
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    /* interleave: mm0 = samples 0-1 of both channels, mm1 = samples 2-3 */
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
 )

 /* SSE instantiation. The two-channel loop converts four samples of each
 * channel per iteration with cvtps2pi, saturates them to int16 and stores
 * eight interleaved values (16 bytes). %0 is the negative byte offset, %1 the
 * end of dst, %2 and %3 the ends of channels 0 and 1. */
 FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    /* mm0 = four int16 of channel 0, mm2 = four int16 of channel 1 */
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    /* interleave: mm0 = samples 0-1 of both channels, mm1 = samples 2-3 */
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
 )

 /* SSE2 instantiation. The two-channel loop converts four samples of each
 * channel per iteration with cvtps2dq and stores eight interleaved int16
 * values with movdqa, which requires a 16-byte aligned destination. %0 is the
 * negative byte offset, %1 the end of dst, %2 and %3 the ends of channels 0
 * and 1. */
 FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    /* xmm0 = four int16 of channel 0 followed by four int16 of channel 1 */
    "packssdw   %%xmm1, %%xmm0  \n"
    /* copy the channel 1 half into the low half of xmm1, then interleave */
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
 )

 /**
 * Interleaving float -> int16 conversion for CPUs with extended 3DNow!.
 *
 * Only the six-channel layout has a dedicated 3dn2 routine; every other
 * channel count is forwarded unchanged to the plain 3DNow! implementation.
 */
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
 }

 /**
 * Install the x86 SIMD format-conversion routines supported by the running CPU.
 *
 * Nothing is changed unless av_get_cpu_flags() reports MMX. Candidates are
 * considered in the order 3DNow!, extended 3DNow!, SSE, SSE2, so a later match
 * overrides pointers set by an earlier one. The 3DNow! based routines are
 * skipped when CODEC_FLAG_BITEXACT is set in avctx->flags; avctx is only
 * read when one of the 3DNow! CPU flags is present.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context supplying the flags
 */
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
    int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    if ((cpu_flags & AV_CPU_FLAG_3DNOW) && !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = float_to_int16_3dnow;
        c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
    }

    if ((cpu_flags & AV_CPU_FLAG_3DNOWEXT) && !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
        c->float_to_int16             = float_to_int16_sse;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
        c->float_to_int16             = float_to_int16_sse2;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
    }
 }