dsputil: Split audio operations off into a separate context

11 years ago · 9a9e2f1c8a
--- a/configure
+++ b/configure
@@ -1529,6 +1529,7 @@ CONFIG_EXTRA="
    aandcttables
    ac3dsp
    audio_frame_queue
    audiodsp
    blockdsp
    cabac
    dsputil
@@ -1713,8 +1714,8 @@ aac_decoder_select="mdct sinewin"
 aac_encoder_select="audio_frame_queue mdct sinewin"
 aac_latm_decoder_select="aac_decoder aac_latm_parser"
 ac3_decoder_select="mdct ac3dsp ac3_parser dsputil"
 ac3_encoder_select="mdct ac3dsp dsputil"
 ac3_fixed_encoder_select="mdct ac3dsp dsputil"
 ac3_encoder_select="ac3dsp audiodsp dsputil mdct"
 ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct"
 aic_decoder_select="dsputil golomb"
 alac_encoder_select="lpc"
 als_decoder_select="dsputil"
@@ -1735,7 +1736,7 @@ binkaudio_rdft_decoder_select="mdct rdft sinewin"
 cavs_decoder_select="blockdsp dsputil golomb h264chroma qpeldsp videodsp"
 cllc_decoder_select="dsputil"
 comfortnoise_encoder_select="lpc"
 cook_decoder_select="dsputil mdct sinewin"
 cook_decoder_select="audiodsp mdct sinewin"
 cscd_decoder_select="lzo"
 cscd_decoder_suggest="zlib"
 dca_decoder_select="mdct"
@@ -1849,7 +1850,7 @@ svq1_decoder_select="hpeldsp"
 svq1_encoder_select="aandcttables dsputil hpeldsp mpegvideoenc"
 svq3_decoder_select="h264_decoder hpeldsp tpeldsp"
 svq3_decoder_suggest="zlib"
 tak_decoder_select="dsputil"
 tak_decoder_select="audiodsp"
 theora_decoder_select="vp3_decoder"
 thp_decoder_select="mjpeg_decoder"
 tiff_decoder_suggest="zlib"
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -28,6 +28,7 @@ OBJS = allcodecs.o                                                      \
 OBJS-$(CONFIG_AANDCTTABLES)            += aandcttab.o
 OBJS-$(CONFIG_AC3DSP)                  += ac3dsp.o
 OBJS-$(CONFIG_AUDIO_FRAME_QUEUE)       += audio_frame_queue.o
 OBJS-$(CONFIG_AUDIODSP)                += audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -37,6 +37,7 @@
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
 #include "ac3.h"
 #include "fft.h"
@@ -2480,6 +2481,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
    if (ret)
        goto init_fail;

    ff_audiodsp_init(&s->adsp);
    ff_dsputil_init(&s->dsp, avctx);
    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);

--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -39,6 +39,7 @@
 #include "fft.h"
 #include "mathops.h"
 #include "put_bits.h"
 #include "audiodsp.h"

 #ifndef CONFIG_AC3ENC_FLOAT
 #define CONFIG_AC3ENC_FLOAT 0
@@ -162,6 +163,7 @@ typedef struct AC3EncodeContext {
    AVCodecContext *avctx;                  ///< parent AVCodecContext
    PutBitContext pb;                       ///< bitstream writer context
    DSPContext dsp;
    AudioDSPContext adsp;
    AVFloatDSPContext fdsp;
    AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
    FFTContext mdct;                        ///< FFT context for MDCT calculation
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -29,6 +29,7 @@
 #define FFT_FLOAT 0
 #undef CONFIG_AC3ENC_FLOAT
 #include "internal.h"
 #include "audiodsp.h"
 #include "ac3enc.h"
 #include "eac3enc.h"

@@ -100,9 +101,10 @@ static void scale_coefficients(AC3EncodeContext *s)
 /*
 * Clip MDCT coefficients to allowable range.
 */
 static void clip_coefficients(DSPContext *dsp, int32_t *coef, unsigned int len)
 static void clip_coefficients(AudioDSPContext *adsp, int32_t *coef,
                              unsigned int len)
 {
    dsp->vector_clip_int32(coef, coef, COEF_MIN, COEF_MAX, len);
    adsp->vector_clip_int32(coef, coef, COEF_MIN, COEF_MAX, len);
 }


--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -28,6 +28,7 @@

 #define CONFIG_AC3ENC_FLOAT 1
 #include "internal.h"
 #include "audiodsp.h"
 #include "ac3enc.h"
 #include "eac3enc.h"
 #include "kbdwin.h"
@@ -107,9 +108,10 @@ static void scale_coefficients(AC3EncodeContext *s)
 /*
 * Clip MDCT coefficients to allowable range.
 */
 static void clip_coefficients(DSPContext *dsp, float *coef, unsigned int len)
 static void clip_coefficients(AudioDSPContext *adsp, float *coef,
                              unsigned int len)
 {
    dsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len);
    adsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len);
 }


--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -30,6 +30,8 @@

 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"

 #include "audiodsp.h"
 #include "internal.h"
 #include "ac3enc.h"
 #include "eac3enc.h"
@@ -40,7 +42,8 @@ static void scale_coefficients(AC3EncodeContext *s);

 static int normalize_samples(AC3EncodeContext *s);

 static void clip_coefficients(DSPContext *dsp, CoefType *coef, unsigned int len);
 static void clip_coefficients(AudioDSPContext *adsp, CoefType *coef,
                              unsigned int len);

 static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl);

@@ -161,7 +164,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
        }

        /* coefficients must be clipped in order to be encoded */
        clip_coefficients(&s->dsp, cpl_coef, num_cpl_coefs);
        clip_coefficients(&s->adsp, cpl_coef, num_cpl_coefs);
    }

    /* calculate energy in each band in coupling channel and each fbw channel */
@@ -412,7 +415,7 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
    if (s->fixed_point)
        scale_coefficients(s);

    clip_coefficients(&s->dsp, s->blocks[0].mdct_coef[1],
    clip_coefficients(&s->adsp, s->blocks[0].mdct_coef[1],
                      AC3_MAX_COEFS * s->num_blocks * s->channels);

    s->cpl_on = s->cpl_enabled;
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -26,6 +26,7 @@
 #include "avcodec.h"
 #include "acelp_pitch_delay.h"
 #include "celp_math.h"
 #include "audiodsp.h"

 int ff_acelp_decode_8bit_to_1st_delay3(int ac_index)
 {
@@ -90,7 +91,7 @@ void ff_acelp_update_past_gain(
 }

 int16_t ff_acelp_decode_gain_code(
    DSPContext *dsp,
    AudioDSPContext *adsp,
    int gain_corr_factor,
    const int16_t* fc_v,
    int mr_energy,
@@ -107,7 +108,7 @@ int16_t ff_acelp_decode_gain_code(
        mr_energy += quant_energy[i] * ma_prediction_coeff[i];

    mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
                sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
                sqrt(adsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
    return mr_energy >> 12;
 }

--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -24,7 +24,8 @@
 #define AVCODEC_ACELP_PITCH_DELAY_H

 #include <stdint.h>
 #include "dsputil.h"

 #include "audiodsp.h"

 #define PITCH_DELAY_MIN             20
 #define PITCH_DELAY_MAX             143
@@ -139,7 +140,7 @@ void ff_acelp_update_past_gain(
 /**
 * @brief Decode the adaptive codebook gain and add
 *        correction (4.1.5 and 3.9.1 of G.729).
 * @param dsp initialized dsputil context
 * @param adsp initialized audio DSP context
 * @param gain_corr_factor gain correction factor (2.13)
 * @param fc_v fixed-codebook vector (2.13)
 * @param mr_energy mean innovation energy and fixed-point correction (7.13)
@@ -208,7 +209,7 @@ void ff_acelp_update_past_gain(
 * @remark The routine is used in G.729 and AMR (all modes).
 */
 int16_t ff_acelp_decode_gain_code(
    DSPContext *dsp,
    AudioDSPContext *adsp,
    int gain_corr_factor,
    const int16_t* fc_v,
    int mr_energy,
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -4,6 +4,7 @@ OBJS                                   += arm/fmtconvert_init_arm.o

 OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                          arm/ac3dsp_arm.o
 OBJS-$(CONFIG_AUDIODSP)                += arm/audiodsp_init_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
 OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o        \
                                          arm/dsputil_arm.o             \
@@ -77,11 +78,13 @@ VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
 NEON-OBJS                              += arm/fmtconvert_neon.o

 NEON-OBJS-$(CONFIG_AC3DSP)             += arm/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_AUDIODSP)           += arm/audiodsp_init_neon.o      \
                                          arm/audiodsp_neon.o           \
                                          arm/int_neon.o
 NEON-OBJS-$(CONFIG_BLOCKDSP)           += arm/blockdsp_init_neon.o      \
                                          arm/blockdsp_neon.o
 NEON-OBJS-$(CONFIG_DSPUTIL)            += arm/dsputil_init_neon.o       \
                                          arm/dsputil_neon.o            \
                                          arm/int_neon.o                \
                                          arm/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
                                          arm/fft_fixed_neon.o
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -0,0 +1,26 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_ARM_AUDIODSP_ARM_H
 #define AVCODEC_ARM_AUDIODSP_ARM_H

 #include "libavcodec/audiodsp.h"

 /* Point the function pointers of c at the NEON implementations.
  * Called from ff_audiodsp_init_arm() once have_neon() reports support. */
 void ff_audiodsp_init_neon(AudioDSPContext *c);

 #endif /* AVCODEC_ARM_AUDIODSP_ARM_H */
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -0,0 +1,33 @@
 /*
 * ARM optimized audio functions
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/audiodsp.h"
 #include "audiodsp_arm.h"

 /**
  * Install ARM-specific audio DSP routines into a context.
  *
  * The CPU flags are queried once; when they report NEON support the
  * NEON initializer overrides the function pointers already stored in c.
  *
  * @param c context to update
  */
 av_cold void ff_audiodsp_init_arm(AudioDSPContext *c)
 {
    const int flags = av_get_cpu_flags();

    if (have_neon(flags)) {
        ff_audiodsp_init_neon(c);
    }
 }
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -0,0 +1,41 @@
 /*
 * ARM NEON optimised audio functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "libavutil/attributes.h"
 #include "libavcodec/audiodsp.h"
 #include "audiodsp_arm.h"

 /* Clipping routines implemented in assembly (audiodsp_neon.S). */
 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                           int len);
 void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len);

 /* NOTE(review): not defined in audiodsp_neon.S; presumably provided by
  * int_neon.S, which the Makefile links for CONFIG_AUDIODSP -- confirm. */
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);

 /**
  * Replace every AudioDSPContext entry with its NEON counterpart.
  *
  * @param c context whose function pointers are overwritten
  */
 av_cold void ff_audiodsp_init_neon(AudioDSPContext *c)
 {
    c->scalarproduct_int16 = ff_scalarproduct_int16_neon;

    c->vector_clipf        = ff_vector_clipf_neon;
    c->vector_clip_int32   = ff_vector_clip_int32_neon;
 }
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -0,0 +1,64 @@
 /*
 * ARM NEON optimised audio functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/arm/asm.S"

 /*
  * ff_vector_clipf_neon: clamp floats read through r1 to the range held in
  * q0 (lower bound) and q1 (upper bound), writing the results through r0.
  * Eight elements are handled per iteration and r2 is the remaining count.
  *
  * Per the C prototype (dst, src, min, max, len) this presumably means
  * r0 = dst and r1 = src. NOTE(review): the VFP/NOVFP prefixes come from
  * asm.S; they look like they select between min/max arriving in s0/s1
  * (len in r2) and min/max arriving in r2/r3 (len on the stack) -- confirm.
  *
  * The loop only exits when the counter hits exactly zero (beq), so len
  * must be a positive multiple of 8. Loads and stores carry a :128
  * alignment qualifier, i.e. both arrays are expected to be 16-byte aligned.
  */
 function ff_vector_clipf_neon, export=1
 VFP     vdup.32         q1,  d0[1]
 VFP     vdup.32         q0,  d0[0]
 NOVFP   vdup.32         q0,  r2
 NOVFP   vdup.32         q1,  r3
 NOVFP   ldr             r2,  [sp]
 /* Prologue: load the first 8 floats and apply the upper bound. */
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
 /* Loop: finish the current batch with the lower bound, then start the
  * next batch (load + vmin) before storing the current one. */
 1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
 /* Epilogue: store the final batch without loading past the input. */
 2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
 endfunc

 /*
  * ff_vector_clip_int32_neon: clamp signed 32-bit integers read through r1
  * to the range [r2, r3] and write them through r0, eight per iteration.
  *
  * Per the C prototype (dst, src, min, max, len) this presumably means
  * r0 = dst, r1 = src, r2 = min, r3 = max, with len fetched from the stack.
  * The body runs before the counter is tested, so at least eight elements
  * are always processed; it repeats while the counter stays above zero.
  * Loads and stores carry a :128 alignment qualifier.
  */
 function ff_vector_clip_int32_neon, export=1
 /* Broadcast the bounds, then reuse r2 as the element counter. */
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
 1:
        vld1.32         {q2-q3},  [r1,:128]!
 /* Upper bound first (vmin against q1), then lower bound (vmax against q0). */
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
 endfunc
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -34,13 +34,6 @@ void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);

 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                          int len);
 void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
                               int32_t max, unsigned int len);

 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);

 av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
@@ -57,9 +50,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
    c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;

    c->vector_clipf      = ff_vector_clipf_neon;
    c->vector_clip_int32 = ff_vector_clip_int32_neon;

    c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
 }
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -126,45 +126,3 @@ function ff_add_pixels_clamped_neon, export=1
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
 endfunc

 function ff_vector_clipf_neon, export=1
 VFP     vdup.32         q1,  d0[1]
 VFP     vdup.32         q0,  d0[0]
 NOVFP   vdup.32         q0,  r2
 NOVFP   vdup.32         q1,  r3
 NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
 1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
 2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
 endfunc

 function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
 1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
 endfunc
--- a/libavcodec/audiodsp.c
+++ b/libavcodec/audiodsp.c
@@ -0,0 +1,118 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "audiodsp.h"

 /**
  * Clip one float, given as its raw 32-bit pattern, between a negative
  * minimum and a positive maximum using only integer comparisons.
  *
  * @param a        bit pattern of the value to clip
  * @param mini     bit pattern of the (negative) minimum
  * @param maxi     bit pattern of the (positive) maximum
  * @param maxisign maxi with its sign bit flipped
  * @return mini, maxi or a, as a bit pattern
  */
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                    uint32_t maxi, uint32_t maxisign)
 {
    /* Flipping the sign bit ranks positive patterns above negative ones. */
    const uint32_t flipped = a ^ (1U << 31);

    /* A pattern above mini is a negative value beyond the minimum. */
    if (a > mini)
        return mini;

    return flipped > maxisign ? maxi : a;
 }

 /**
  * Clip a float array when the bounds have opposite signs, working on the
  * raw bit patterns through clipf_c_one().
  *
  * Elements are processed in groups of eight, so a len that is not a
  * multiple of eight is effectively rounded up.
  *
  * @param dst destination array
  * @param src source array
  * @param min pointer to the minimum; the caller only takes this path
  *            when it is negative
  * @param max pointer to the maximum; the caller only takes this path
  *            when it is positive
  * @param len number of elements
  */
 static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                          float *min, float *max, int len)
 {
    const uint32_t lo_bits = *(uint32_t *) min;
    const uint32_t hi_bits = *(uint32_t *) max;
    const uint32_t hi_flip = hi_bits ^ (1U << 31);
    const uint32_t *in     = (const uint32_t *) src;
    uint32_t *out          = (uint32_t *) dst;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits,
                                        hi_flip);
 }

 /**
  * Clip every element of a float array to [min, max].
  *
  * When min is negative and max is positive the integer bit-pattern
  * variant is used; otherwise each element goes through av_clipf().
  * Elements are processed in groups of eight.
  *
  * @param dst destination array
  * @param src source array
  * @param min lower bound
  * @param max upper bound
  * @param len number of elements
  */
 static void vector_clipf_c(float *dst, const float *src,
                            float min, float max, int len)
 {
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
 }

 /**
  * Compute the scalar (dot) product of two int16_t vectors.
  *
  * @param v1    first vector
  * @param v2    second vector
  * @param order number of elements to multiply and accumulate
  * @return sum of v1[i] * v2[i], reduced modulo 2^32 if it does not fit
  */
 static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                      int order)
 {
    /* Accumulate in unsigned arithmetic: each product fits in an int, but
     * the running sum may not, and signed overflow is undefined behavior
     * whereas unsigned arithmetic wraps. */
    unsigned res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
 }

 /**
  * Clip every element of an int32_t array to [min, max].
  *
  * Eight elements are handled per pass and the first pass runs before the
  * counter is checked, so at least eight elements are always written. The
  * counter is unsigned and decremented by eight per pass.
  *
  * @param dst destination array
  * @param src source array
  * @param min lower bound
  * @param max upper bound
  * @param len number of elements
  */
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
 }

 /**
  * Initialize an AudioDSPContext.
  *
  * The portable C implementations are installed first; each enabled
  * architecture initializer then gets a chance to override them.
  *
  * @param c context to fill in
  */
 av_cold void ff_audiodsp_init(AudioDSPContext *c)
 {
    /* Portable defaults. */
    c->vector_clipf        = vector_clipf_c;
    c->vector_clip_int32   = vector_clip_int32_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;

    /* Architecture-specific overrides. */
    if (ARCH_ARM) {
        ff_audiodsp_init_arm(c);
    }
    if (ARCH_PPC) {
        ff_audiodsp_init_ppc(c);
    }
    if (ARCH_X86) {
        ff_audiodsp_init_x86(c);
    }
 }
--- a/libavcodec/audiodsp.h
+++ b/libavcodec/audiodsp.h
@@ -0,0 +1,59 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_AUDIODSP_H
 #define AVCODEC_AUDIODSP_H

 #include <stdint.h>

 /**
  * Table of audio DSP routines. ff_audiodsp_init() fills it with the C
  * implementations and lets architecture-specific code override entries.
  */
 typedef struct AudioDSPContext {
    /**
     * Calculate scalar product of two vectors.
     * @param v1  first vector
     * @param v2  second vector
     *            constraints: 16-byte aligned
     * @param len length of vectors, should be multiple of 16
     * @return sum of the element-wise products
     */
    int32_t (*scalarproduct_int16)(const int16_t *v1,
                                   const int16_t *v2 /* align 16 */, int len);

    /**
     * Clip each element in an array of int32_t to a given minimum and
     * maximum value.
     * @param dst  destination array
     *             constraints: 16-byte aligned
     * @param src  source array
     *             constraints: 16-byte aligned
     * @param min  minimum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param max  maximum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param len  number of elements in the array
     *             constraints: multiple of 32 greater than zero
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);
    /**
     * Clip each element in an array of float to a given minimum and
     * maximum value.
     * assume len is a multiple of 8, and arrays are 16-byte aligned
     * NOTE(review): the inline "align 16" remark on len disagrees with the
     * "multiple of 8" wording above -- confirm which constraint is intended.
     * @param dst  destination array
     * @param src  source array
     * @param min  minimum value
     * @param max  maximum value
     * @param len  number of elements in the array
     */
    void (*vector_clipf)(float *dst /* align 16 */,
                         const float *src /* align 16 */,
                         float min, float max, int len /* align 16 */);
 } AudioDSPContext;

 /* Fill c with the C implementations, then apply the overrides of every
  * architecture enabled at build time (ARCH_ARM, ARCH_PPC, ARCH_X86). */
 void ff_audiodsp_init(AudioDSPContext *c);
 /* Per-architecture initializers invoked by ff_audiodsp_init(). */
 void ff_audiodsp_init_arm(AudioDSPContext *c);
 void ff_audiodsp_init_ppc(AudioDSPContext *c);
 void ff_audiodsp_init_x86(AudioDSPContext *c);

 #endif /* AVCODEC_AUDIODSP_H */
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -44,9 +44,10 @@

 #include "libavutil/channel_layout.h"
 #include "libavutil/lfg.h"

 #include "audiodsp.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "dsputil.h"
 #include "bytestream.h"
 #include "fft.h"
 #include "internal.h"
@@ -122,7 +123,7 @@ typedef struct cook {
    void (*saturate_output)(struct cook *q, float *out);

    AVCodecContext*     avctx;
    DSPContext          dsp;
    AudioDSPContext     adsp;
    GetBitContext       gb;
    /* stream data */
    int                 num_vectors;
@@ -865,8 +866,8 @@ static inline void decode_bytes_and_gain(COOKContext *q, COOKSubpacket *p,
 */
 static void saturate_output_float(COOKContext *q, float *out)
 {
    q->dsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel,
                        -1.0f, 1.0f, FFALIGN(q->samples_per_channel, 8));
    q->adsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel,
                         -1.0f, 1.0f, FFALIGN(q->samples_per_channel, 8));
 }


@@ -1065,7 +1066,7 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
    /* Initialize RNG. */
    av_lfg_init(&q->random_state, 0);

    ff_dsputil_init(&q->dsp, avctx);
    ff_audiodsp_init(&q->adsp);

    while (edata_ptr < edata_ptr_end) {
        /* 8 for mono, 16 for stereo, ? for multichannel
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -1267,87 +1267,6 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
 WRAPPER8_16_SQ(bit8x8_c, bit16_c)

 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
 {
    if (a > mini)
        return mini;
    else if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    else
        return a;
 }

 static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
 {
    int i;
    uint32_t mini        = *(uint32_t *) min;
    uint32_t maxi        = *(uint32_t *) max;
    uint32_t maxisign    = maxi ^ (1U << 31);
    uint32_t *dsti       = (uint32_t *) dst;
    const uint32_t *srci = (const uint32_t *) src;

    for (i = 0; i < len; i += 8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
 }

 static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
 {
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            dst[i]     = av_clipf(src[i], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
 }

 static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
 {
    int res = 0;

    while (order--)
        res += *v1++ **v2++;

    return res;
 }

 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
 {
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len   -= 8;
    } while (len > 0);
 }

 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
 {
    ff_j_rev_dct(block);
@@ -1502,10 +1421,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    c->try_8x8basis = try_8x8basis_c;
    c->add_8x8basis = add_8x8basis_c;

    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->vector_clip_int32   = vector_clip_int32_c;
    c->vector_clipf        = vector_clipf_c;

    c->shrink[0] = av_image_copy_plane;
    c->shrink[1] = ff_shrink22;
    c->shrink[2] = ff_shrink44;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -125,11 +125,6 @@ typedef struct DSPContext {
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);

    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*vector_clipf)(float *dst /* align 16 */,
                         const float *src /* align 16 */,
                         float min, float max, int len /* align 16 */);

    /* (I)DCT */
    void (*fdct)(int16_t *block /* align 16 */);
    void (*fdct248)(int16_t *block /* align 16 */);
@@ -189,30 +184,6 @@ typedef struct DSPContext {

    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
                      int src_wrap, int width, int height);

    /**
     * Calculate scalar product of two vectors.
     * @param len length of vectors, should be multiple of 16
     */
    int32_t (*scalarproduct_int16)(const int16_t *v1,
                                   const int16_t *v2 /* align 16 */, int len);

    /**
     * Clip each element in an array of int32_t to a given minimum and
     * maximum value.
     * @param dst  destination array
     *             constraints: 16-byte aligned
     * @param src  source array
     *             constraints: 16-byte aligned
     * @param min  minimum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param max  maximum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param len  number of elements in the array
     *             constraints: multiple of 32 greater than zero
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);
 } DSPContext;

 void ff_dsputil_static_init(void);
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -1,5 +1,6 @@
 OBJS                                   += ppc/fmtconvert_altivec.o      \

 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
 OBJS-$(CONFIG_DSPUTIL)                 += ppc/dsputil_ppc.o
 OBJS-$(CONFIG_FFT)                     += ppc/fft_altivec.o
@@ -24,7 +25,6 @@ ALTIVEC-OBJS-$(CONFIG_DSPUTIL)         += ppc/dsputil_altivec.o         \
                                          ppc/fdct_altivec.o            \
                                          ppc/gmc_altivec.o             \
                                          ppc/idct_altivec.o            \
                                          ppc/int_altivec.o             \

 FFT-OBJS-$(HAVE_GNU_AS)                += ppc/fft_altivec_s.o
 ALTIVEC-OBJS-$(CONFIG_FFT)             += $(FFT-OBJS-yes)
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -20,7 +20,7 @@

 /**
 * @file
 * miscellaneous integer operations
 * miscellaneous audio operations
 */

 #include "config.h"
@@ -29,10 +29,13 @@
 #endif

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/dsputil.h"
 #include "dsputil_altivec.h"
 #include "libavcodec/audiodsp.h"

 #if HAVE_ALTIVEC

 static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
@@ -56,7 +59,14 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
    return ires;
 }

 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 #endif /* HAVE_ALTIVEC */

 /**
 * Install the PowerPC-specific AudioDSPContext routines.
 *
 * When the build has AltiVec support and the running CPU reports AltiVec,
 * scalarproduct_int16 is pointed at the AltiVec implementation. In every
 * other case the context is left untouched.
 *
 * @param c context whose function pointers are updated in place
 */
 av_cold void ff_audiodsp_init_ppc(AudioDSPContext *c)
 {
 #if HAVE_ALTIVEC
    const int cpu_flags = av_get_cpu_flags();

    /* Only override the pointer when the CPU actually has AltiVec. */
    if (PPC_ALTIVEC(cpu_flags))
        c->scalarproduct_int16 = scalarproduct_int16_altivec;
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
@@ -35,6 +35,5 @@ void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

 void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);
 void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx);

 #endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -34,7 +34,7 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx,
 {
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        ff_dsputil_init_altivec(c, avctx, high_bit_depth);
        ff_int_init_altivec(c, avctx);

        c->gmc1 = ff_gmc1_altivec;

        if (!high_bit_depth) {
--- a/libavcodec/takdec.c
+++ b/libavcodec/takdec.c
@@ -28,8 +28,8 @@
 #include "libavutil/internal.h"
 #include "libavutil/samplefmt.h"
 #include "tak.h"
 #include "audiodsp.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "internal.h"
 #include "unary.h"

@@ -45,7 +45,7 @@ typedef struct MCDParam {

 typedef struct TAKDecContext {
    AVCodecContext *avctx;                          // parent AVCodecContext
    DSPContext      dsp;
    AudioDSPContext adsp;
    TAKStreamInfo   ti;
    GetBitContext   gb;                             // bitstream reader initialized to start at the current frame

@@ -172,7 +172,7 @@ static av_cold int tak_decode_init(AVCodecContext *avctx)
 {
    TAKDecContext *s = avctx->priv_data;

    ff_dsputil_init(&s->dsp, avctx);
    ff_audiodsp_init(&s->adsp);

    s->avctx = avctx;

@@ -484,8 +484,8 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
    for (i = 0; i < subframe_size - filter_order; i++) {
        int v = 1 << (filter_quant - 1);

        v += s->dsp.scalarproduct_int16(&s->residues[i], filter,
                                        FFALIGN(filter_order, 16));
        v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
                                         FFALIGN(filter_order, 16));

        v = (av_clip(v >> filter_quant, -8192, 8191) << dshift) - *decoded;
        *decoded++ = v;
@@ -654,8 +654,8 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
        for (i = 0; i < length2; i++) {
            int v = 1 << 9;

            v += s->dsp.scalarproduct_int16(&s->residues[i], filter,
                                            FFALIGN(filter_order, 16));
            v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
                                             FFALIGN(filter_order, 16));

            p1[i] = (av_clip(v >> 10, -8192, 8191) << dshift) - p1[i];
        }
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -2,6 +2,7 @@ OBJS                                   += x86/constants.o               \
                                          x86/fmtconvert_init.o         \

 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
 OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
 OBJS-$(CONFIG_ENCODERS)                += x86/dsputilenc_mmx.o          \
@@ -44,6 +45,7 @@ OBJS-$(CONFIG_VP7_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o

 MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
 MMX-OBJS-$(CONFIG_BLOCKDSP)            += x86/blockdsp_mmx.o
 MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
                                          x86/idct_mmx_xvid.o           \
@@ -61,6 +63,7 @@ YASM-OBJS                              += x86/deinterlace.o             \
                                          x86/fmtconvert.o              \

 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputil.o
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -0,0 +1,137 @@
 ;******************************************************************************
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
 ;* This file is part of Libav.
 ;*
 ;* Libav is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* Libav is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with Libav; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION_TEXT

 ; Emit one scalarproduct_int16 function for the currently selected
 ; instruction set (vector registers of mmsize bytes).
 ;
 ; The function returns, in eax, the sum of v1[i] * v2[i] over `order`
 ; int16_t elements, accumulated in 32-bit lanes.
 ; v1 is read with unaligned loads (movu); v2 is used directly as a memory
 ; operand of pmaddwd, consistent with only v2 being marked as aligned in
 ; the C prototype.
 ; Each loop iteration consumes 2*mmsize bytes (mmsize elements) and the
 ; loop is bottom-tested, so `order` is expected to be a positive multiple
 ; of mmsize elements.
 %macro SCALARPRODUCT 0
 ; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
 cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    ; element count -> byte count (2 bytes per int16_t)
    shl orderq, 1
    ; point both vectors at their end and walk a negative offset up to zero
    add v1q, orderq
    add v2q, orderq
    neg orderq
    ; m2 is the running accumulator of 32-bit partial sums
    pxor    m2, m2
 .loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    ; multiply 16-bit pairs and add adjacent products into 32-bit lanes
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    ; continue while the offset is still negative
    jl .loop
    ; horizontal reduction of the 32-bit lanes of m2 into its lowest lane
 %if mmsize == 16
    ; fold the upper 64 bits onto the lower 64 bits
    movhlps m0, m2
    paddd   m2, m0
    ; 0x4e swaps the two dwords of the low 64 bits
    pshuflw m0, m2, 0x4e
 %else
    ; 0x4e swaps the two dwords of the 64-bit register
    pshufw  m0, m2, 0x4e
 %endif
    paddd   m2, m0
    ; the lowest dword now holds the full sum; return it
    movd   eax, m2
    RET
 %endmacro

 ; 64-bit MMX-register version (needs mmxext because of pshufw)
 INIT_MMX mmxext
 SCALARPRODUCT
 ; 128-bit XMM-register version
 INIT_XMM sse2
 SCALARPRODUCT


 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------

 ; %1 = number of xmm registers used
 ; %2 = number of inline load/process/store loops per asm loop
 ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
 ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
 ; %5 = suffix
 ;
 ; The generated function clips every int32 of src to [min, max] and stores
 ; the result to dst. src and dst are accessed with aligned moves (mova).
 ; m4/m5 hold min/max broadcast to all lanes; m6 is passed to CLIPD as a
 ; temporary. The loop is bottom-tested, so len must be greater than zero
 ; and a multiple of the number of elements handled per iteration.
 %macro VECTOR_CLIP_INT32 4-5
 cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
 %if %4
    ; CLIPD works on floats: convert the integer bounds first
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
 %else
    movd      m4, minm
    movd      m5, maxm
 %endif
    SPLATD    m4
    SPLATD    m5
 .loop:
 ; %%i is the number of mmsize-sized blocks already handled by earlier
 ; unrolled passes of this iteration. It is added to the block index so
 ; that successive passes touch consecutive, non-overlapping blocks.
 ; Multiplying the index by the pass number instead would make a second
 ; pass revisit blocks 0 and 2 and never touch blocks 5 and 7.
 %assign %%i 0
 %rep %2
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
 %if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
 %endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
 ; each pass covers 4 blocks, or 8 when %3 is set
 %assign %%i (%%i+4*(1+%3))
 %endrep
    ; NOTE: (%2+%3) equals the true block factor %2*(1+%3) only when %2 == 1
    ; or %3 == 0, which holds for every instantiation below.
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    ; len counts int32 elements: bytes advanced / 4
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
 %endmacro

 ; MMX version, clipping with CLIPD_MMX
 INIT_MMX mmx
 %define CLIPD CLIPD_MMX
 VECTOR_CLIP_INT32 0, 1, 0, 0
 ; SSE2 "_int" version: CLIPD is still CLIPD_MMX at this point, so the
 ; bounds stay integers (%4 = 0)
 INIT_XMM sse2
 VECTOR_CLIP_INT32 6, 1, 0, 0, _int
 ; SSE2 version using CLIPD_SSE2 with float bounds (%4 = 1) and two
 ; unrolled passes per loop iteration (%2 = 2)
 %define CLIPD CLIPD_SSE2
 VECTOR_CLIP_INT32 6, 2, 0, 1
 ; SSE4 version using CLIPD_SSE41; when m8 is defined (presumably only on
 ; targets with 16 vector registers) the 8-block variant is used
 INIT_XMM sse4
 %define CLIPD CLIPD_SSE41
 %ifdef m8
 VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
--- a/libavcodec/x86/audiodsp.h
+++ b/libavcodec/x86/audiodsp.h
@@ -0,0 +1,25 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_X86_AUDIODSP_H
 #define AVCODEC_X86_AUDIODSP_H

 void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len);

 #endif /* AVCODEC_X86_AUDIODSP_H */
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -0,0 +1,66 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
 #include "audiodsp.h"

 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);

 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);

 /**
 * Install the x86-specific AudioDSPContext routines.
 *
 * For each function pointer the most capable implementation allowed by
 * both the build configuration and the runtime CPU flags is selected.
 * Pointers for which no implementation qualifies are left untouched.
 *
 * @param c context whose function pointers are updated in place
 */
 av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 {
    const int flags = av_get_cpu_flags();

    /* vector_clip_int32: SSE4 > SSE2 (the _int variant on Atom) > MMX */
    if (EXTERNAL_SSE4(flags)) {
        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
    } else if (EXTERNAL_SSE2(flags)) {
        c->vector_clip_int32 = (flags & AV_CPU_FLAG_ATOM)
                               ? ff_vector_clip_int32_int_sse2
                               : ff_vector_clip_int32_sse2;
    } else if (EXTERNAL_MMX(flags)) {
        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
    }

    /* scalarproduct_int16: SSE2 > MMXEXT */
    if (EXTERNAL_SSE2(flags))
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
    else if (EXTERNAL_MMXEXT(flags))
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;

    /* vector_clipf: only an inline-asm SSE version exists */
    if (INLINE_SSE(flags))
        c->vector_clipf = ff_vector_clipf_sse;
 }
--- a/libavcodec/x86/audiodsp_mmx.c
+++ b/libavcodec/x86/audiodsp_mmx.c
@@ -0,0 +1,58 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "config.h"
 #include "libavutil/x86/asm.h"
 #include "audiodsp.h"

 #if HAVE_INLINE_ASM

 /**
 * Clip every element of a float array to the range [min, max] using SSE.
 *
 * The array is processed from its end towards its start, 16 floats
 * (64 bytes) per iteration. The loop body runs before the exit test, so
 * len must be a positive multiple of 16. Both arrays are accessed with
 * movaps, which needs 16-byte alignment.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower bound applied to every element
 * @param max upper bound applied to every element
 * @param len number of floats to process
 */
 void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len)
 {
    /* byte offset of the last group of 16 floats (4 bytes each) */
    x86_reg i = (len - 16) * 4;
    /* NOTE(review): xmm0-xmm5 are written but not listed as clobbers. */
    __asm__ volatile (
        /* broadcast min into all lanes of xmm4 and max into xmm5 */
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        /* load 16 floats from src + i */
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        /* raise values below min up to min */
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        /* lower values above max down to max */
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        /* store the clipped floats to dst + i */
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        /* step back one 64-byte group; loop while the offset is >= 0 */
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r" (i)
        : "r" (dst), "r" (src), "m" (min), "m" (max)
        : "memory");
 }

 #endif /* HAVE_INLINE_ASM */
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -26,119 +26,6 @@ pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

 SECTION_TEXT

 ; Emit one scalarproduct_int16 function for the currently selected
 ; instruction set (vector registers of mmsize bytes).
 ;
 ; The function returns, in eax, the sum of v1[i] * v2[i] over `order`
 ; int16_t elements, accumulated in 32-bit lanes.
 ; v1 is read with unaligned loads (movu); v2 is used directly as a memory
 ; operand of pmaddwd.
 ; Each loop iteration consumes 2*mmsize bytes (mmsize elements) and the
 ; loop is bottom-tested, so `order` is expected to be a positive multiple
 ; of mmsize elements.
 %macro SCALARPRODUCT 0
 ; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
 cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    ; element count -> byte count (2 bytes per int16_t)
    shl orderq, 1
    ; point both vectors at their end and walk a negative offset up to zero
    add v1q, orderq
    add v2q, orderq
    neg orderq
    ; m2 is the running accumulator of 32-bit partial sums
    pxor    m2, m2
 .loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    ; multiply 16-bit pairs and add adjacent products into 32-bit lanes
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    ; continue while the offset is still negative
    jl .loop
    ; horizontal reduction of the 32-bit lanes of m2 into its lowest lane
 %if mmsize == 16
    ; fold the upper 64 bits onto the lower 64 bits
    movhlps m0, m2
    paddd   m2, m0
    ; 0x4e swaps the two dwords of the low 64 bits
    pshuflw m0, m2, 0x4e
 %else
    ; 0x4e swaps the two dwords of the 64-bit register
    pshufw  m0, m2, 0x4e
 %endif
    paddd   m2, m0
    ; the lowest dword now holds the full sum; return it
    movd   eax, m2
    RET
 %endmacro

 ; 64-bit MMX-register version (needs mmxext because of pshufw)
 INIT_MMX mmxext
 SCALARPRODUCT
 ; 128-bit XMM-register version
 INIT_XMM sse2
 SCALARPRODUCT


 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------

 ; %1 = number of xmm registers used
 ; %2 = number of inline load/process/store loops per asm loop
 ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
 ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
 ; %5 = suffix
 ;
 ; The generated function clips every int32 of src to [min, max] and stores
 ; the result to dst. src and dst are accessed with aligned moves (mova).
 ; m4/m5 hold min/max broadcast to all lanes; m6 is passed to CLIPD as a
 ; temporary. The loop is bottom-tested, so len must be greater than zero
 ; and a multiple of the number of elements handled per iteration.
 %macro VECTOR_CLIP_INT32 4-5
 cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
 %if %4
    ; CLIPD works on floats: convert the integer bounds first
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
 %else
    movd      m4, minm
    movd      m5, maxm
 %endif
    SPLATD    m4
    SPLATD    m5
 .loop:
 ; %%i is the number of mmsize-sized blocks already handled by earlier
 ; unrolled passes of this iteration. It is added to the block index so
 ; that successive passes touch consecutive, non-overlapping blocks.
 ; Multiplying the index by the pass number instead would make a second
 ; pass revisit blocks 0 and 2 and never touch blocks 5 and 7.
 %assign %%i 0
 %rep %2
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
 %if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
 %endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
 ; each pass covers 4 blocks, or 8 when %3 is set
 %assign %%i (%%i+4*(1+%3))
 %endrep
    ; NOTE: (%2+%3) equals the true block factor %2*(1+%3) only when %2 == 1
    ; or %3 == 0, which holds for every instantiation below.
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    ; len counts int32 elements: bytes advanced / 4
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
 %endmacro

 ; MMX version, clipping with CLIPD_MMX
 INIT_MMX mmx
 %define CLIPD CLIPD_MMX
 VECTOR_CLIP_INT32 0, 1, 0, 0
 ; SSE2 "_int" version: CLIPD is still CLIPD_MMX at this point, so the
 ; bounds stay integers (%4 = 0)
 INIT_XMM sse2
 VECTOR_CLIP_INT32 6, 1, 0, 0, _int
 ; SSE2 version using CLIPD_SSE2 with float bounds (%4 = 1) and two
 ; unrolled passes per loop iteration (%2 = 2)
 %define CLIPD CLIPD_SSE2
 VECTOR_CLIP_INT32 6, 2, 0, 1
 ; SSE4 version using CLIPD_SSE41; when m8 is defined (presumably only on
 ; targets with 16 vector registers) the 8-block variant is used
 INIT_XMM sse4
 %define CLIPD CLIPD_SSE41
 %ifdef m8
 VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif

 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS  1
    mov      r3, r2
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -26,23 +26,9 @@
 #include "dsputil_x86.h"
 #include "idct_xvid.h"

 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);

 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);

 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int cpu_flags, unsigned high_bit_depth)
 {
@@ -72,10 +58,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,

    c->gmc = ff_gmc_mmx;
 #endif /* HAVE_MMX_INLINE */

 #if HAVE_MMX_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }

 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
@@ -88,18 +70,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
        c->idct     = ff_idct_xvid_mmxext;
    }
 #endif /* HAVE_MMXEXT_INLINE */

 #if HAVE_MMXEXT_EXTERNAL
    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }

 /* Install the SSE routines into the DSPContext: when the build has inline
 * SSE assembly (HAVE_SSE_INLINE), vector_clipf is pointed at
 * ff_vector_clipf_sse. The avctx, cpu_flags and high_bit_depth arguments
 * are not used by this function. */
 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSE_INLINE
    c->vector_clipf = ff_vector_clipf_sse;
 #endif /* HAVE_SSE_INLINE */
 }

 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
@@ -115,12 +85,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_SSE2_INLINE */

 #if HAVE_SSE2_EXTERNAL
    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    if (cpu_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
@@ -133,14 +97,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_SSSE3_EXTERNAL */
 }

 /* Install the SSE4 routines into the DSPContext: when the build has
 * external SSE4 assembly (HAVE_SSE4_EXTERNAL), vector_clip_int32 is pointed
 * at ff_vector_clip_int32_sse4. The avctx, cpu_flags and high_bit_depth
 * arguments are not used by this function. */
 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
 #endif /* HAVE_SSE4_EXTERNAL */
 }

 av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
 {
@@ -152,18 +108,12 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
    if (X86_MMXEXT(cpu_flags))
        dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);

    if (X86_SSE(cpu_flags))
        dsputil_init_sse(c, avctx, cpu_flags, high_bit_depth);

    if (X86_SSE2(cpu_flags))
        dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);

    if (EXTERNAL_SSSE3(cpu_flags))
        dsputil_init_ssse3(c, avctx, cpu_flags, high_bit_depth);

    if (EXTERNAL_SSE4(cpu_flags))
        dsputil_init_sse4(c, avctx, cpu_flags, high_bit_depth);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
 }
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -25,7 +25,6 @@
 #include "config.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "constants.h"
 #include "dsputil_x86.h"
 #include "inline_asm.h"

@@ -375,37 +374,4 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
    }
 }

 /**
 * Clip every element of a float array to the range [min, max] using SSE.
 *
 * The array is processed from its end towards its start, 16 floats
 * (64 bytes) per iteration. The loop body runs before the exit test, so
 * len must be a positive multiple of 16. Both arrays are accessed with
 * movaps, which needs 16-byte alignment.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower bound applied to every element
 * @param max upper bound applied to every element
 * @param len number of floats to process
 */
 void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len)
 {
    /* byte offset of the last group of 16 floats (4 bytes each) */
    x86_reg i = (len - 16) * 4;
    /* NOTE(review): xmm0-xmm5 are written but not listed as clobbers. */
    __asm__ volatile (
        /* broadcast min into all lanes of xmm4 and max into xmm5 */
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        /* load 16 floats from src + i */
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        /* raise values below min up to min */
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        /* lower values above max down to max */
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        /* store the clipped floats to dst + i */
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        /* step back one 64-byte group; loop while the offset is >= 0 */
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r" (i)
        : "r" (dst), "r" (src), "m" (min), "m" (max)
        : "memory");
 }

 #endif /* HAVE_INLINE_ASM */
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -46,7 +46,4 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height);

 void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len);

 #endif /* AVCODEC_X86_DSPUTIL_X86_H */