@@ -25,6 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
+#include "apedsp.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "bytestream.h"
| @@ -136,6 +137,7 @@ typedef struct APEContext { | |||||
| AVClass *class; ///< class for AVOptions | AVClass *class; ///< class for AVOptions | ||||
| AVCodecContext *avctx; | AVCodecContext *avctx; | ||||
| DSPContext dsp; | DSPContext dsp; | ||||
| APEDSPContext adsp; | |||||
| int channels; | int channels; | ||||
| int samples; ///< samples left to decode in current frame | int samples; ///< samples left to decode in current frame | ||||
| int bps; | int bps; | ||||
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
 static void predictor_decode_mono_3950(APEContext *ctx, int count);
 static void predictor_decode_stereo_3950(APEContext *ctx, int count);
 
-// TODO: dsputilize
-
 static av_cold int ape_decode_close(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+
+    return res;
+}
+
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
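
The C version above fixes the semantics that every SIMD implementation below must reproduce: the return value is the dot product of `v1` and `v2` as they were on entry, and `v1` is rewritten in place as `v1[i] += mul * v3[i]`. A standalone sketch of those semantics (illustrative only; the `_ref` suffix and the 4-element vectors are made up for the demo — the SIMD versions additionally require the length to be a multiple of 16 and `v1` to be 16-byte aligned):

```c
#include <stdint.h>
#include <stdio.h>

/* Reference model of scalarproduct_and_madd_int16(): dot product of the
 * old v1 with v2, then v1 updated in place from v3. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
                                                int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;   /* uses the pre-update v1 value  */
        *v1++ += mul * *v3++; /* in-place multiply-accumulate */
    }
    return res;
}

int main(void)
{
    int16_t v1[4]       = { 1, 2, 3, 4 };
    const int16_t v2[4] = { 5, 6, 7, 8 };
    const int16_t v3[4] = { 1, 1, 1, 1 };

    /* 1*5 + 2*6 + 3*7 + 4*8 = 70; afterwards v1 = { 3, 4, 5, 6 } */
    printf("%d, v1[0] = %d\n",
           scalarproduct_and_madd_int16_ref(v1, v2, v3, 4, 2), v1[0]);
    return 0;
}
```
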
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
+    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+    if (ARCH_ARM)
+        ff_apedsp_init_arm(&s->adsp);
+    if (ARCH_PPC)
+        ff_apedsp_init_ppc(&s->adsp);
+    if (ARCH_X86)
+        ff_apedsp_init_x86(&s->adsp);
+
     ff_dsputil_init(&s->dsp, avctx);
 
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
     while (count--) {
         /* round fixedpoint scalar product */
-        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
-                                                    f->adaptcoeffs - order,
-                                                    order, APESIGN(*data));
+        res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+                                                     f->delay - order,
+                                                     f->adaptcoeffs - order,
+                                                     order, APESIGN(*data));
         res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
         *data++ = res;
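
The line following the scalar product, `res = (res + (1 << (fracbits - 1))) >> fracbits`, rescales the fixed-point result with round-to-nearest rather than truncation, by adding half of the divisor before shifting. A worked example of the arithmetic (values chosen for illustration, not taken from the codec):

```c
#include <stdio.h>

int main(void)
{
    int fracbits = 4;   /* scale factor 2^4 = 16 */
    int res      = 40;  /* fixed-point value representing 40/16 = 2.5 */

    /* a plain arithmetic shift truncates toward negative infinity: 2 */
    printf("truncated: %d\n", res >> fracbits);

    /* adding the half-step 1 << 3 = 8 first rounds to nearest: 3 */
    printf("rounded:   %d\n", (res + (1 << (fracbits - 1))) >> fracbits);
    return 0;
}
```
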
@@ -0,0 +1,44 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
+
+#include <stdint.h>
+
+typedef struct APEDSPContext {
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+                                            const int16_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} APEDSPContext;
+
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
+
+#endif /* AVCODEC_APEDSP_H */
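
The header encodes the same usage contract that `ape.c` follows above: install the C fallback first, let whichever per-arch init is compiled in override the pointer, then call only through the context. A minimal sketch of that flow (the `apedsp_setup` and `madd_c` names are hypothetical; it assumes config.h provides the `ARCH_*` macros, as it does for `ape.c`):

```c
#include <stdint.h>
#include "config.h"  /* ARCH_ARM / ARCH_PPC / ARCH_X86, always defined 0 or 1 */
#include "apedsp.h"

/* Stand-in for the static C fallback defined in ape.c (illustrative). */
static int32_t madd_c(int16_t *v1, const int16_t *v2, const int16_t *v3,
                      int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

static void apedsp_setup(APEDSPContext *c)
{
    c->scalarproduct_and_madd_int16 = madd_c; /* always-valid fallback */
    if (ARCH_ARM)
        ff_apedsp_init_arm(c);  /* each init may override with SIMD */
    if (ARCH_PPC)
        ff_apedsp_init_ppc(c);
    if (ARCH_X86)
        ff_apedsp_init_x86(c);
}
```

Setting the C pointer before the arch inits run is what lets the per-arch files assign only when a usable CPU feature is actually detected.
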
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o        \
                                           arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
 OBJS-$(CONFIG_FLAC_DECODER)            += arm/flacdsp_init_arm.o         \
                                           arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o            \
                                           arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o              \
                                           arm/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3, int len, int mul);
+
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+    }
+}
@@ -0,0 +1,62 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+        vld1.16         {d28[],d29[]}, [sp]
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        mov             r12, r0
+
+1:      vld1.16         {d16-d17}, [r0,:128]!
+        vld1.16         {d18-d19}, [r1]!
+        vld1.16         {d20-d21}, [r2]!
+        vld1.16         {d22-d23}, [r0,:128]!
+        vld1.16         {d24-d25}, [r1]!
+        vld1.16         {d26-d27}, [r2]!
+        vmul.s16        q10, q10, q14
+        vmul.s16        q13, q13, q14
+        vmlal.s16       q0,  d16, d18
+        vmlal.s16       q1,  d17, d19
+        vadd.s16        q10, q8,  q10
+        vadd.s16        q13, q11, q13
+        vmlal.s16       q2,  d22, d24
+        vmlal.s16       q3,  d23, d25
+        vst1.16         {q10}, [r12,:128]!
+        subs            r3,  r3,  #16
+        vst1.16         {q13}, [r12,:128]!
+        bne             1b
+
+        vpadd.s32       d16, d0,  d1
+        vpadd.s32       d17, d2,  d3
+        vpadd.s32       d18, d4,  d5
+        vpadd.s32       d19, d6,  d7
+        vpadd.s32       d0,  d16, d17
+        vpadd.s32       d1,  d18, d19
+        vpadd.s32       d2,  d0,  d1
+        vpaddl.s32      d3,  d2
+        vmov.32         r0,  d3[0]
+        bx              lr
+endfunc
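
Each pass of the loop consumes 16 `int16_t` elements in two 8-element halves: the widening `vmlal.s16` instructions spread the products across the sixteen 32-bit lanes of q0–q3, and the `vpadd`/`vpaddl` tail folds those partial sums into one scalar in r0. A plain-C model of that structure (illustrative, not part of the patch; like the asm it assumes `len` is a multiple of 16):

```c
#include <stdint.h>

/* C model of the NEON kernel: lane-wise accumulation plus a final
 * reduction, instead of one running scalar sum. */
static int32_t madd_neon_model(int16_t *v1, const int16_t *v2,
                               const int16_t *v3, int len, int mul)
{
    int32_t acc[16] = { 0 }; /* the 16 s32 lanes of q0..q3 */
    int32_t sum = 0;
    int i, j;

    for (i = 0; i < len; i += 16) {            /* subs r3, r3, #16      */
        for (j = 0; j < 16; j++) {
            acc[j] += v1[i + j] * v2[i + j];   /* vmlal.s16 into q0..q3 */
            v1[i + j] += mul * v3[i + j];      /* vmul + vadd + vst1    */
        }
    }
    for (j = 0; j < 16; j++)                   /* vpadd/vpaddl reduction */
        sum += acc[j];
    return sum;
}
```
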
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3, int len, int mul);
-
 av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
     c->vector_clip_int32 = ff_vector_clip_int32_neon;
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
 }
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
-        vld1.16         {d28[],d29[]}, [sp]
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        mov             r12, r0
-
-1:      vld1.16         {d16-d17}, [r0,:128]!
-        vld1.16         {d18-d19}, [r1]!
-        vld1.16         {d20-d21}, [r2]!
-        vld1.16         {d22-d23}, [r0,:128]!
-        vld1.16         {d24-d25}, [r1]!
-        vld1.16         {d26-d27}, [r2]!
-        vmul.s16        q10, q10, q14
-        vmul.s16        q13, q13, q14
-        vmlal.s16       q0,  d16, d18
-        vmlal.s16       q1,  d17, d19
-        vadd.s16        q10, q8,  q10
-        vadd.s16        q13, q11, q13
-        vmlal.s16       q2,  d22, d24
-        vmlal.s16       q3,  d23, d25
-        vst1.16         {q10}, [r12,:128]!
-        subs            r3,  r3,  #16
-        vst1.16         {q13}, [r12,:128]!
-        bne             1b
-
-        vpadd.s32       d16, d0,  d1
-        vpadd.s32       d17, d2,  d3
-        vpadd.s32       d18, d4,  d5
-        vpadd.s32       d19, d6,  d7
-        vpadd.s32       d0,  d16, d17
-        vpadd.s32       d1,  d18, d19
-        vpadd.s32       d2,  d0,  d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
     return res;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-
-    return res;
-}
-
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->vector_clip_int32 = vector_clip_int32_c;
     c->vector_clipf = vector_clipf_c;
@@ -255,16 +255,6 @@ typedef struct DSPContext {
      */
     int32_t (*scalarproduct_int16)(const int16_t *v1,
                                    const int16_t *v2 /* align 16 */, int len);
-    /* ape functions */
-    /**
-     * Calculate scalar product of v1 and v2,
-     * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
-     */
-    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
-                                            const int16_t *v2,
-                                            const int16_t *v3,
-                                            int len, int mul);
 
     /**
      * Clip each element in an array of int32_t to a given minimum and
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 
+OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavcodec/apedsp.h"
+
+#if HAVE_ALTIVEC
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
+                                                    const int16_t *v2,
+                                                    const int16_t *v3,
+                                                    int order, int mul)
+{
+    LOAD_ZERO;
+    vec_s16 *pv1 = (vec_s16 *) v1;
+    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
+    register vec_s16 t0, t1, i0, i1, i4;
+    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s32 res = zero_s32v;
+    register vec_u8 align = vec_lvsl(0, v2);
+    int32_t ires;
+
+    order >>= 4;
+    do {
+        i1     = vec_ld(16, v2);
+        t0     = vec_perm(i2, i1, align);
+        i2     = vec_ld(32, v2);
+        t1     = vec_perm(i1, i2, align);
+        i0     = pv1[0];
+        i1     = pv1[1];
+        res    = vec_msum(t0, i0, res);
+        res    = vec_msum(t1, i1, res);
+        i4     = vec_ld(16, v3);
+        t0     = vec_perm(i3, i4, align);
+        i3     = vec_ld(32, v3);
+        t1     = vec_perm(i4, i3, align);
+        pv1[0] = vec_mladd(t0, muls, i0);
+        pv1[1] = vec_mladd(t1, muls, i1);
+        pv1   += 2;
+        v2    += 16;
+        v3    += 16;
+    } while (--order);
+    res = vec_splat(vec_sums(res, zero_s32v), 3);
+    vec_ste(res, 0, &ires);
+
+    return ires;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+{
+#if HAVE_ALTIVEC
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
+}
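
Only `v1` is required to be 16-byte aligned, so the loop uses the classic AltiVec idiom for the other two inputs: `vec_lvsl` turns the low bits of the pointer into a permute mask, and each unaligned vector is assembled by `vec_perm` from the two aligned loads that straddle it. Note the mask is built from `v2` alone, so `v2` and `v3` are assumed to share the same misalignment, which holds for the APE filter buffers since both pointers are offset by the same `order`. A scalar model of the idiom (`load_unaligned16` is a hypothetical name, not part of the patch):

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of vec_perm(lo, hi, vec_lvsl(0, p)): build the 16 bytes at
 * an unaligned p from the two aligned 16-byte blocks around it. */
static void load_unaligned16(uint8_t dst[16], const uint8_t *p)
{
    const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
    size_t shift = (uintptr_t)p & 15; /* what vec_lvsl encodes in the mask */
    uint8_t blocks[32];

    memcpy(blocks,      base,      16); /* vec_ld(0, p)           */
    memcpy(blocks + 16, base + 16, 16); /* vec_ld(16, p)          */
    memcpy(dst, blocks + shift, 16);    /* the vec_perm selection */
}
```
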
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
     return ires;
 }
 
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
-                                                    const int16_t *v2,
-                                                    const int16_t *v3,
-                                                    int order, int mul)
-{
-    LOAD_ZERO;
-    vec_s16 *pv1 = (vec_s16 *) v1;
-    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
-    register vec_s32 res = zero_s32v;
-    register vec_u8 align = vec_lvsl(0, v2);
-    int32_t ires;
-
-    order >>= 4;
-    do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
-        i0     = pv1[0];
-        i1     = pv1[1];
-        res    = vec_msum(t0, i0, res);
-        res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
-        pv1[0] = vec_mladd(t0, muls, i0);
-        pv1[1] = vec_mladd(t1, muls, i1);
-        pv1   += 2;
-        v2    += 16;
-        v3    += 16;
-    } while (--order);
-    res = vec_splat(vec_sums(res, zero_s32v), 3);
-    vec_ste(res, 0, &ires);
-
-    return ires;
-}
-
 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 {
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add     v1q, orderq
+    add     v2q, orderq
+    add     v3q, orderq
+    neg     orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd    eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl     orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov     r4d, v2d
+    and     r4d, 15
+    and     v2q, ~15
+    and     v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp     r4d, 0
+    je .loop0
+    cmp     r4d, 2
+    je .loop2
+    cmp     r4d, 4
+    je .loop4
+    cmp     r4d, 6
+    je .loop6
+    cmp     r4d, 8
+    je .loop8
+    cmp     r4d, 10
+    je .loop10
+    cmp     r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd    eax, m6
+    RET
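
The SSSE3 variant avoids unaligned loads entirely: it rounds `v2`/`v3` down to a 16-byte boundary and realigns in registers with `palignr`. Since `palignr` takes its byte count as an immediate, `SCALARPRODUCT_LOOP` is instantiated once per possible (even) misalignment 0–14 and the entry code branches to the matching copy; the run-time shift is computed from `v2` alone, so both pointers are expected to be misaligned identically, as in the AltiVec version. A scalar model of what `palignr` does (illustrative only; `palignr_model` is a made-up name):

```c
#include <stdint.h>
#include <string.h>

/* Model of 'palignr dst, src, imm': concatenate src (low) with dst (high),
 * then take 16 bytes starting imm bytes into the pair. The shift must be
 * a compile-time immediate, hence one specialized loop per misalignment. */
static void palignr_model(uint8_t dst[16], const uint8_t src[16], int imm)
{
    uint8_t cat[32];

    memcpy(cat,      src, 16); /* low half:  src operand */
    memcpy(cat + 16, dst, 16); /* high half: dst operand */
    memcpy(dst, cat + imm, 16);
}
```
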
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
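
The checks run in ascending ISA order and each one overwrites the pointer, so the strongest available version wins; the SSSE3 version is additionally skipped on CPUs that also report SSE4.2 or 3DNow, which the source flags as "cachesplit". A toy model of that selection ladder (flag names invented for the demo):

```c
#include <stdio.h>

enum { MMXEXT = 1, SSE2 = 2, SSSE3 = 4, SSE42 = 8 };

static const char *pick(int flags)
{
    const char *impl = "c";            /* fallback installed by ape.c  */
    if (flags & MMXEXT) impl = "mmxext";
    if (flags & SSE2)   impl = "sse2"; /* overwrites the mmxext choice */
    if ((flags & SSSE3) && !(flags & SSE42))
        impl = "ssse3";                /* skipped on cachesplit-prone CPUs */
    return impl;
}

int main(void)
{
    printf("%s\n", pick(MMXEXT | SSE2 | SSSE3));         /* -> ssse3 */
    printf("%s\n", pick(MMXEXT | SSE2 | SSSE3 | SSE42)); /* -> sse2  */
    return 0;
}
```
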
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd    eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add     v1q, orderq
-    add     v2q, orderq
-    add     v3q, orderq
-    neg     orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd    eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0  [v1q + orderq]
-    %define t1  [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0  m8
-    %define t1  m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl     orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov     r4d, v2d
-    and     r4d, 15
-    and     v2q, ~15
-    and     v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp     r4d, 0
-    je .loop0
-    cmp     r4d, 2
-    je .loop2
-    cmp     r4d, 4
-    je .loop4
-    cmp     r4d, 6
-    je .loop6
-    cmp     r4d, 8
-    je .loop8
-    cmp     r4d, 10
-    je .loop10
-    cmp     r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd    eax, m6
-    RET
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }