@@ -25,6 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
+#include "apedsp.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "bytestream.h"
| @@ -136,6 +137,7 @@ typedef struct APEContext { | |||||
| AVClass *class; ///< class for AVOptions | AVClass *class; ///< class for AVOptions | ||||
| AVCodecContext *avctx; | AVCodecContext *avctx; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| APEDSPContext adsp; | |||||
| int channels; | int channels; | ||||
| int samples; ///< samples left to decode in current frame | int samples; ///< samples left to decode in current frame | ||||
| int bps; | int bps; | ||||
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
 static void predictor_decode_mono_3950(APEContext *ctx, int count);
 static void predictor_decode_stereo_3950(APEContext *ctx, int count);
 
-// TODO: dsputilize
-
 static av_cold int ape_decode_close(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+
+    return res;
+}
+
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
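
The C version above fixes the semantics that every SIMD implementation below must reproduce: the return value is the dot product of `v1` and `v2` as they were on entry, and `v1` is rewritten in place as `v1[i] += mul * v3[i]`. A standalone sketch of those semantics (illustrative only; the `_ref` suffix and the 4-element vectors are made up for the demo — the SIMD versions additionally require the length to be a multiple of 16 and `v1` to be 16-byte aligned):

```c
#include <stdint.h>
#include <stdio.h>

/* Reference model of scalarproduct_and_madd_int16(): dot product of the
 * old v1 with v2, then v1 updated in place from v3. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
                                                int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;   /* uses the pre-update v1 value  */
        *v1++ += mul * *v3++; /* in-place multiply-accumulate */
    }
    return res;
}

int main(void)
{
    int16_t v1[4]       = { 1, 2, 3, 4 };
    const int16_t v2[4] = { 5, 6, 7, 8 };
    const int16_t v3[4] = { 1, 1, 1, 1 };

    /* 1*5 + 2*6 + 3*7 + 4*8 = 70; afterwards v1 = { 3, 4, 5, 6 } */
    printf("%d, v1[0] = %d\n",
           scalarproduct_and_madd_int16_ref(v1, v2, v3, 4, 2), v1[0]);
    return 0;
}
```
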
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
+    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+    if (ARCH_ARM)
+        ff_apedsp_init_arm(&s->adsp);
+    if (ARCH_PPC)
+        ff_apedsp_init_ppc(&s->adsp);
+    if (ARCH_X86)
+        ff_apedsp_init_x86(&s->adsp);
+
     ff_dsputil_init(&s->dsp, avctx);
 
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
     while (count--) {
         /* round fixedpoint scalar product */
-        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
-                                                    f->adaptcoeffs - order,
-                                                    order, APESIGN(*data));
+        res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+                                                     f->delay - order,
+                                                     f->adaptcoeffs - order,
+                                                     order, APESIGN(*data));
         res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
         *data++ = res;
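
The line following the scalar product, `res = (res + (1 << (fracbits - 1))) >> fracbits`, rescales the fixed-point result with round-to-nearest rather than truncation, by adding half of the divisor before shifting. A worked example of the arithmetic (values chosen for illustration, not taken from the codec):

```c
#include <stdio.h>

int main(void)
{
    int fracbits = 4;   /* scale factor 2^4 = 16 */
    int res      = 40;  /* fixed-point value representing 40/16 = 2.5 */

    /* a plain arithmetic shift truncates toward negative infinity: 2 */
    printf("truncated: %d\n", res >> fracbits);

    /* adding the half-step 1 << 3 = 8 first rounds to nearest: 3 */
    printf("rounded:   %d\n", (res + (1 << (fracbits - 1))) >> fracbits);
    return 0;
}
```
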
@@ -0,0 +1,44 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
+
+#include <stdint.h>
+
+typedef struct APEDSPContext {
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+                                            const int16_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} APEDSPContext;
+
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
+
+#endif /* AVCODEC_APEDSP_H */
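
The header encodes the same usage contract that `ape.c` follows above: install the C fallback first, let whichever per-arch init is compiled in override the pointer, then call only through the context. A minimal sketch of that flow (the `apedsp_setup` and `madd_c` names are hypothetical; it assumes config.h provides the `ARCH_*` macros, as it does for `ape.c`):

```c
#include <stdint.h>
#include "config.h"  /* ARCH_ARM / ARCH_PPC / ARCH_X86, always defined 0 or 1 */
#include "apedsp.h"

/* Stand-in for the static C fallback defined in ape.c (illustrative). */
static int32_t madd_c(int16_t *v1, const int16_t *v2, const int16_t *v3,
                      int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

static void apedsp_setup(APEDSPContext *c)
{
    c->scalarproduct_and_madd_int16 = madd_c; /* always-valid fallback */
    if (ARCH_ARM)
        ff_apedsp_init_arm(c);  /* each init may override with SIMD */
    if (ARCH_PPC)
        ff_apedsp_init_ppc(c);
    if (ARCH_X86)
        ff_apedsp_init_x86(c);
}
```

Setting the C pointer before the arch inits run is what lets the per-arch files assign only when a usable CPU feature is actually detected.
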
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o        \
                                           arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
 OBJS-$(CONFIG_FLAC_DECODER)            += arm/flacdsp_init_arm.o         \
                                           arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o            \
                                           arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o              \
                                           arm/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3, int len, int mul);
+
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+    }
+}
@@ -0,0 +1,62 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+        vld1.16         {d28[],d29[]}, [sp]
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        mov             r12, r0
+
+1:      vld1.16         {d16-d17}, [r0,:128]!
+        vld1.16         {d18-d19}, [r1]!
+        vld1.16         {d20-d21}, [r2]!
+        vld1.16         {d22-d23}, [r0,:128]!
+        vld1.16         {d24-d25}, [r1]!
+        vld1.16         {d26-d27}, [r2]!
+        vmul.s16        q10, q10, q14
+        vmul.s16        q13, q13, q14
+        vmlal.s16       q0,  d16, d18
+        vmlal.s16       q1,  d17, d19
+        vadd.s16        q10, q8,  q10
+        vadd.s16        q13, q11, q13
+        vmlal.s16       q2,  d22, d24
+        vmlal.s16       q3,  d23, d25
+        vst1.16         {q10}, [r12,:128]!
+        subs            r3,  r3,  #16
+        vst1.16         {q13}, [r12,:128]!
+        bne             1b
+
+        vpadd.s32       d16, d0,  d1
+        vpadd.s32       d17, d2,  d3
+        vpadd.s32       d18, d4,  d5
+        vpadd.s32       d19, d6,  d7
+        vpadd.s32       d0,  d16, d17
+        vpadd.s32       d1,  d18, d19
+        vpadd.s32       d2,  d0,  d1
+        vpaddl.s32      d3,  d2
+        vmov.32         r0,  d3[0]
+        bx              lr
+endfunc
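
Each pass of the loop consumes 16 `int16_t` elements in two 8-element halves: the widening `vmlal.s16` instructions spread the products across the sixteen 32-bit lanes of q0–q3, and the `vpadd`/`vpaddl` tail folds those partial sums into one scalar in r0. A plain-C model of that structure (illustrative, not part of the patch; like the asm it assumes `len` is a multiple of 16):

```c
#include <stdint.h>

/* C model of the NEON kernel: lane-wise accumulation plus a final
 * reduction, instead of one running scalar sum. */
static int32_t madd_neon_model(int16_t *v1, const int16_t *v2,
                               const int16_t *v3, int len, int mul)
{
    int32_t acc[16] = { 0 }; /* the 16 s32 lanes of q0..q3 */
    int32_t sum = 0;
    int i, j;

    for (i = 0; i < len; i += 16) {            /* subs r3, r3, #16      */
        for (j = 0; j < 16; j++) {
            acc[j] += v1[i + j] * v2[i + j];   /* vmlal.s16 into q0..q3 */
            v1[i + j] += mul * v3[i + j];      /* vmul + vadd + vst1    */
        }
    }
    for (j = 0; j < 16; j++)                   /* vpadd/vpaddl reduction */
        sum += acc[j];
    return sum;
}
```
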
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3, int len, int mul);
-
 av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
     c->vector_clip_int32 = ff_vector_clip_int32_neon;
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
 }
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
-        vld1.16         {d28[],d29[]}, [sp]
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        mov             r12, r0
-
-1:      vld1.16         {d16-d17}, [r0,:128]!
-        vld1.16         {d18-d19}, [r1]!
-        vld1.16         {d20-d21}, [r2]!
-        vld1.16         {d22-d23}, [r0,:128]!
-        vld1.16         {d24-d25}, [r1]!
-        vld1.16         {d26-d27}, [r2]!
-        vmul.s16        q10, q10, q14
-        vmul.s16        q13, q13, q14
-        vmlal.s16       q0,  d16, d18
-        vmlal.s16       q1,  d17, d19
-        vadd.s16        q10, q8,  q10
-        vadd.s16        q13, q11, q13
-        vmlal.s16       q2,  d22, d24
-        vmlal.s16       q3,  d23, d25
-        vst1.16         {q10}, [r12,:128]!
-        subs            r3,  r3,  #16
-        vst1.16         {q13}, [r12,:128]!
-        bne             1b
-
-        vpadd.s32       d16, d0,  d1
-        vpadd.s32       d17, d2,  d3
-        vpadd.s32       d18, d4,  d5
-        vpadd.s32       d19, d6,  d7
-        vpadd.s32       d0,  d16, d17
-        vpadd.s32       d1,  d18, d19
-        vpadd.s32       d2,  d0,  d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
     return res;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-
-    return res;
-}
-
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->vector_clip_int32 = vector_clip_int32_c;
     c->vector_clipf = vector_clipf_c;
@@ -255,16 +255,6 @@ typedef struct DSPContext {
      */
     int32_t (*scalarproduct_int16)(const int16_t *v1,
                                    const int16_t *v2 /* align 16 */, int len);
-    /* ape functions */
-    /**
-     * Calculate scalar product of v1 and v2,
-     * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
-     */
-    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
-                                            const int16_t *v2,
-                                            const int16_t *v3,
-                                            int len, int mul);
 
     /**
      * Clip each element in an array of int32_t to a given minimum and
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 
+OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavcodec/apedsp.h"
+
+#if HAVE_ALTIVEC
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
+                                                    const int16_t *v2,
+                                                    const int16_t *v3,
+                                                    int order, int mul)
+{
+    LOAD_ZERO;
+    vec_s16 *pv1 = (vec_s16 *) v1;
+    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
+    register vec_s16 t0, t1, i0, i1, i4;
+    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s32 res = zero_s32v;
+    register vec_u8 align = vec_lvsl(0, v2);
+    int32_t ires;
+
+    order >>= 4;
+    do {
+        i1     = vec_ld(16, v2);
+        t0     = vec_perm(i2, i1, align);
+        i2     = vec_ld(32, v2);
+        t1     = vec_perm(i1, i2, align);
+        i0     = pv1[0];
+        i1     = pv1[1];
+        res    = vec_msum(t0, i0, res);
+        res    = vec_msum(t1, i1, res);
+        i4     = vec_ld(16, v3);
+        t0     = vec_perm(i3, i4, align);
+        i3     = vec_ld(32, v3);
+        t1     = vec_perm(i4, i3, align);
+        pv1[0] = vec_mladd(t0, muls, i0);
+        pv1[1] = vec_mladd(t1, muls, i1);
+        pv1   += 2;
+        v2    += 16;
+        v3    += 16;
+    } while (--order);
+    res = vec_splat(vec_sums(res, zero_s32v), 3);
+    vec_ste(res, 0, &ires);
+
+    return ires;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+{
+#if HAVE_ALTIVEC
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
+}
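
Only `v1` is required to be 16-byte aligned, so the loop uses the classic AltiVec idiom for the other two inputs: `vec_lvsl` turns the low bits of the pointer into a permute mask, and each unaligned vector is assembled by `vec_perm` from the two aligned loads that straddle it. Note the mask is built from `v2` alone, so `v2` and `v3` are assumed to share the same misalignment, which holds for the APE filter buffers since both pointers are offset by the same `order`. A scalar model of the idiom (`load_unaligned16` is a hypothetical name, not part of the patch):

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of vec_perm(lo, hi, vec_lvsl(0, p)): build the 16 bytes at
 * an unaligned p from the two aligned 16-byte blocks around it. */
static void load_unaligned16(uint8_t dst[16], const uint8_t *p)
{
    const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
    size_t shift = (uintptr_t)p & 15; /* what vec_lvsl encodes in the mask */
    uint8_t blocks[32];

    memcpy(blocks,      base,      16); /* vec_ld(0, p)           */
    memcpy(blocks + 16, base + 16, 16); /* vec_ld(16, p)          */
    memcpy(dst, blocks + shift, 16);    /* the vec_perm selection */
}
```
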
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
     return ires;
 }
 
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
-                                                    const int16_t *v2,
-                                                    const int16_t *v3,
-                                                    int order, int mul)
-{
-    LOAD_ZERO;
-    vec_s16 *pv1 = (vec_s16 *) v1;
-    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
-    register vec_s32 res = zero_s32v;
-    register vec_u8 align = vec_lvsl(0, v2);
-    int32_t ires;
-
-    order >>= 4;
-    do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
-        i0     = pv1[0];
-        i1     = pv1[1];
-        res    = vec_msum(t0, i0, res);
-        res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
-        pv1[0] = vec_mladd(t0, muls, i0);
-        pv1[1] = vec_mladd(t1, muls, i1);
-        pv1   += 2;
-        v2    += 16;
-        v3    += 16;
-    } while (--order);
-    res = vec_splat(vec_sums(res, zero_s32v), 3);
-    vec_ste(res, 0, &ires);
-
-    return ires;
-}
-
 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 {
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add     v1q, orderq
+    add     v2q, orderq
+    add     v3q, orderq
+    neg     orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd    eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov     r4d, v2d
+    and     r4d, 15
+    and     v2q, ~15
+    and     v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp     r4d, 0
+    je .loop0
+    cmp     r4d, 2
+    je .loop2
+    cmp     r4d, 4
+    je .loop4
+    cmp     r4d, 6
+    je .loop6
+    cmp     r4d, 8
+    je .loop8
+    cmp     r4d, 10
+    je .loop10
+    cmp     r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd    eax, m6
+    RET
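
The SSSE3 variant avoids unaligned loads entirely: it rounds `v2`/`v3` down to a 16-byte boundary and realigns in registers with `palignr`. Since `palignr` takes its byte count as an immediate, `SCALARPRODUCT_LOOP` is instantiated once per possible (even) misalignment 0–14 and the entry code branches to the matching copy; the run-time shift is computed from `v2` alone, so both pointers are expected to be misaligned identically, as in the AltiVec version. A scalar model of what `palignr` does (illustrative only; `palignr_model` is a made-up name):

```c
#include <stdint.h>
#include <string.h>

/* Model of 'palignr dst, src, imm': concatenate src (low) with dst (high),
 * then take 16 bytes starting imm bytes into the pair. The shift must be
 * a compile-time immediate, hence one specialized loop per misalignment. */
static void palignr_model(uint8_t dst[16], const uint8_t src[16], int imm)
{
    uint8_t cat[32];

    memcpy(cat,      src, 16); /* low half:  src operand */
    memcpy(cat + 16, dst, 16); /* high half: dst operand */
    memcpy(dst, cat + imm, 16);
}
```
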
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
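
The checks run in ascending ISA order and each one overwrites the pointer, so the strongest available version wins; the SSSE3 version is additionally skipped on CPUs that also report SSE4.2 or 3DNow, which the source flags as "cachesplit". A toy model of that selection ladder (flag names invented for the demo):

```c
#include <stdio.h>

enum { MMXEXT = 1, SSE2 = 2, SSSE3 = 4, SSE42 = 8 };

static const char *pick(int flags)
{
    const char *impl = "c";            /* fallback installed by ape.c  */
    if (flags & MMXEXT) impl = "mmxext";
    if (flags & SSE2)   impl = "sse2"; /* overwrites the mmxext choice */
    if ((flags & SSSE3) && !(flags & SSE42))
        impl = "ssse3";                /* skipped on cachesplit-prone CPUs */
    return impl;
}

int main(void)
{
    printf("%s\n", pick(MMXEXT | SSE2 | SSSE3));         /* -> ssse3 */
    printf("%s\n", pick(MMXEXT | SSE2 | SSSE3 | SSE42)); /* -> sse2  */
    return 0;
}
```
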
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd    eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add     v1q, orderq
-    add     v2q, orderq
-    add     v3q, orderq
-    neg     orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd    eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0  [v1q + orderq]
-    %define t1  [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0  m8
-    %define t1  m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov     r4d, v2d
-    and     r4d, 15
-    and     v2q, ~15
-    and     v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp     r4d, 0
-    je .loop0
-    cmp     r4d, 2
-    je .loop2
-    cmp     r4d, 4
-    je .loop4
-    cmp     r4d, 6
-    je .loop6
-    cmp     r4d, 8
-    je .loop8
-    cmp     r4d, 10
-    je .loop10
-    cmp     r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd    eax, m6
-    RET
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }