Merge commit 'c166148409fe8f0dbccef2fe684286a40ba1e37d'

* commit 'c166148409fe8f0dbccef2fe684286a40ba1e37d':
  dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc

Conflicts:
	libavcodec/dsputil.c
	libavcodec/mpegvideo_enc.c
	libavcodec/x86/dsputilenc.asm
	libavcodec/x86/dsputilenc_mmx.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · 020865f557
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -22,6 +22,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
 OBJS-$(CONFIG_LLAUDDSP)                += arm/lossless_audiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                += arm/videodsp_init_arm.o
 OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
@@ -61,6 +62,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                          arm/idctdsp_armv6.o           \
                                          arm/simple_idct_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_VC1_DECODER)       += arm/startcode_armv6.o
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1
        pop             {r4-r9, pc}
 endfunc
 @ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
 @ r0 = pix, r1 = line_size. Walks 16 rows of 16 bytes each and returns,
 @ in r0, the sum of the squares of all bytes read.
 @ NOTE(review): each row is fetched with ldm, so pix is presumably
 @ word-aligned on every row -- confirm against callers.
 function ff_pix_norm1_armv6, export=1
        push            {r4-r6, lr}
        mov             r12, #16                @ rows left to process
        mov             lr,  #0                 @ running sum of squares
 1:
        ldm             r0,  {r2-r5}            @ one row: 16 bytes in r2-r5
        uxtb16          r6,  r2                 @ bytes 0 and 2 of r2 as two halfwords
        uxtb16          r2,  r2,  ror #8        @ bytes 1 and 3 of r2 as two halfwords
        smlad           lr,  r6,  r6,  lr       @ lr += square of each halfword in r6
        uxtb16          r6,  r3                 @ same pattern for r3 ...
        smlad           lr,  r2,  r2,  lr
        uxtb16          r3,  r3,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r4                 @ ... r4 ...
        smlad           lr,  r3,  r3,  lr
        uxtb16          r4,  r4,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r5                 @ ... and r5
        smlad           lr,  r4,  r4,  lr
        uxtb16          r5,  r5,  ror #8
        smlad           lr,  r6,  r6,  lr
        subs            r12, r12, #1            @ sets the flags tested by bgt below
        add             r0,  r0,  r1            @ step to the next row
        smlad           lr,  r5,  r5,  lr       @ last pair of this row
        bgt             1b
        mov             r0,  lr                 @ return value
        pop             {r4-r6, pc}
 endfunc
 @ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
 @ r0 = pix, r1 = line_size. Returns, in r0, the sum of the bytes in 16 rows
 @ of 16 bytes each. Two accumulators (r2, r3) are used alternately, and the
 @ first word of the next row is loaded before the last word of the current
 @ row has been accumulated.
 function ff_pix_sum_armv6, export=1
        push            {r4-r7, lr}
        mov             r12, #16                @ rows left to process
        mov             r2,  #0                 @ first accumulator
        mov             r3,  #0                 @ second accumulator
        mov             lr,  #0                 @ all-zero operand: usada8 against it adds the 4 bytes
        ldr             r4,  [r0]               @ first word of the first row
 1:
        subs            r12, r12, #1            @ flags are consumed by beq and bgt below
        ldr             r5,  [r0, #4]
        usada8          r2,  r4,  lr,  r2       @ r2 += the four bytes of r4
        ldr             r6,  [r0, #8]
        usada8          r3,  r5,  lr,  r3       @ r3 += the four bytes of r5
        ldr             r7,  [r0, #12]
        usada8          r2,  r6,  lr,  r2       @ r2 += the four bytes of r6
        beq             2f                      @ final row: no further row to load
        @ ldr_pre is a macro defined outside this file; presumably it loads
        @ r4 from [r0 + r1] and leaves r0 pointing at that row -- confirm.
        ldr_pre         r4,  r0,  r1
        usada8          r3,  r7,  lr,  r3       @ r3 += the four bytes of r7
        bgt             1b
 2:
        usada8          r3,  r7,  lr,  r3       @ fourth word of the final row
        add             r0,  r2,  r3            @ return value: both accumulators combined
        pop             {r4-r7, pc}
 endfunc
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                   int line_size, int h);
 int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
 int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
    c->sad[1] = ff_pix_abs8_armv6;
    c->sse[0] = ff_sse16_armv6;
    c->pix_norm1 = ff_pix_norm1_armv6;
    c->pix_sum   = ff_pix_sum_armv6;
 }
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -0,0 +1,76 @@
 /*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "libavutil/arm/asm.S"
 @ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
 @ r0 = pix, r1 = line_size. Walks 16 rows of 16 bytes each and returns,
 @ in r0, the sum of the squares of all bytes read.
 @ NOTE(review): each row is fetched with ldm, so pix is presumably
 @ word-aligned on every row -- confirm against callers.
 function ff_pix_norm1_armv6, export=1
        push            {r4-r6, lr}
        mov             r12, #16                @ rows left to process
        mov             lr,  #0                 @ running sum of squares
 1:
        ldm             r0,  {r2-r5}            @ one row: 16 bytes in r2-r5
        uxtb16          r6,  r2                 @ bytes 0 and 2 of r2 as two halfwords
        uxtb16          r2,  r2,  ror #8        @ bytes 1 and 3 of r2 as two halfwords
        smlad           lr,  r6,  r6,  lr       @ lr += square of each halfword in r6
        uxtb16          r6,  r3                 @ same pattern for r3 ...
        smlad           lr,  r2,  r2,  lr
        uxtb16          r3,  r3,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r4                 @ ... r4 ...
        smlad           lr,  r3,  r3,  lr
        uxtb16          r4,  r4,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r5                 @ ... and r5
        smlad           lr,  r4,  r4,  lr
        uxtb16          r5,  r5,  ror #8
        smlad           lr,  r6,  r6,  lr
        subs            r12, r12, #1            @ sets the flags tested by bgt below
        add             r0,  r0,  r1            @ step to the next row
        smlad           lr,  r5,  r5,  lr       @ last pair of this row
        bgt             1b
        mov             r0,  lr                 @ return value
        pop             {r4-r6, pc}
 endfunc
 @ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
 @ r0 = pix, r1 = line_size. Returns, in r0, the sum of the bytes in 16 rows
 @ of 16 bytes each. Two accumulators (r2, r3) are used alternately, and the
 @ first word of the next row is loaded before the last word of the current
 @ row has been accumulated.
 function ff_pix_sum_armv6, export=1
        push            {r4-r7, lr}
        mov             r12, #16                @ rows left to process
        mov             r2,  #0                 @ first accumulator
        mov             r3,  #0                 @ second accumulator
        mov             lr,  #0                 @ all-zero operand: usada8 against it adds the 4 bytes
        ldr             r4,  [r0]               @ first word of the first row
 1:
        subs            r12, r12, #1            @ flags are consumed by beq and bgt below
        ldr             r5,  [r0, #4]
        usada8          r2,  r4,  lr,  r2       @ r2 += the four bytes of r4
        ldr             r6,  [r0, #8]
        usada8          r3,  r5,  lr,  r3       @ r3 += the four bytes of r5
        ldr             r7,  [r0, #12]
        usada8          r2,  r6,  lr,  r2       @ r2 += the four bytes of r6
        beq             2f                      @ final row: no further row to load
        @ ldr_pre is a macro defined outside this file; presumably it loads
        @ r4 from [r0 + r1] and leaves r0 pointing at that row -- confirm.
        ldr_pre         r4,  r0,  r1
        usada8          r3,  r7,  lr,  r3       @ r3 += the four bytes of r7
        bgt             1b
 2:
        usada8          r3,  r7,  lr,  r3       @ fourth word of the final row
        add             r0,  r2,  r3            @ return value: both accumulators combined
        pop             {r4-r7, pc}
 endfunc
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -0,0 +1,38 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include <stdint.h>
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
 int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 /**
  * Hook the ARM implementations of pix_sum / pix_norm1 into the context.
  *
  * The pointers are replaced only when have_armv6() accepts the flags
  * reported by av_get_cpu_flags(); otherwise c is left untouched.
  *
  * @param c     function-pointer table to update
  * @param avctx not used by this function
  */
 av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
    const int flags = av_get_cpu_flags();

    /* Keep the references inside the guarded branch: have_armv6() is a
     * project macro, presumably a compile-time 0 on builds without ARMv6,
     * in which case the symbols below must not be referenced -- confirm. */
    if (have_armv6(flags)) {
        c->pix_sum   = ff_pix_sum_armv6;
        c->pix_norm1 = ff_pix_norm1_armv6;
    }
 }
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -323,6 +323,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
    ff_blockdsp_init(&ctx->bdsp, avctx);
    ff_idctdsp_init(&ctx->m.idsp, avctx);
    ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
    ff_dct_common_init(&ctx->m);
    ff_dct_encode_init(&ctx->m);
@@ -733,8 +734,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
            int varc;
            if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {
                sum  = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
                varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize);
                sum  = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize);
                varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize);
            } else {
                int bw = FFMIN(avctx->width - 16 * mb_x, 16);
                int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -26,7 +26,6 @@
 */
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "copy_block.h"
@@ -34,8 +33,6 @@
 #include "dsputil.h"
 #include "simple_idct.h"
 #include "faandct.h"
 #include "imgconvert.h"
 #include "mathops.h"
 #include "mpegvideo.h"
 #include "config.h"
@@ -48,74 +45,6 @@ uint32_t ff_square_tab[512] = { 0, };
 #define BIT_DEPTH 8
 #include "dsputilenc_template.c"
 /**
  * Add up every pixel of a 16x16 block.
  *
  * @param pix       first pixel of the first row
  * @param line_size distance in bytes between the starts of two rows
  * @return sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t *pix, int line_size)
 {
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* Consume one full 16-pixel row, then jump to the next one. */
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }

    return total;
 }
 /**
  * Accumulate ff_square_tab[256 + p] for every pixel p of a 16x16 block.
  *
  * The table is filled in outside this function; presumably entry 256 + p
  * holds p * p, making the result the sum of squared pixel values -- confirm.
  *
  * Pixels are deliberately read one byte at a time. Fetching them through a
  * uint8_t pointer cast to uint64_t * or uint32_t * violates the C
  * effective-type (strict aliasing) rule and demands an alignment that a
  * uint8_t pointer does not guarantee. Every byte indexes the table on its
  * own, so the result does not depend on load width or endianness.
  *
  * @param pix       first pixel of the first row
  * @param line_size distance in bytes between the starts of two rows
  * @return sum of the table entries selected by the 256 pixels
  */
 static int pix_norm1_c(uint8_t *pix, int line_size)
 {
    int s = 0, i, j;
    uint32_t *sq = ff_square_tab + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += sq[pix[j]];
        pix += line_size;
    }
    return s;
 }
 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h)
 {
@@ -1094,9 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->pix_sum   = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
@@ -1141,11 +1067,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    ff_dsputil_init_dwt(c);
 #endif
    c->shrink[0] = av_image_copy_plane;
    c->shrink[1] = ff_shrink22;
    c->shrink[2] = ff_shrink44;
    c->shrink[3] = ff_shrink88;
    c->draw_edges = draw_edges_8_c;
    switch (avctx->bits_per_raw_sample) {
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -72,9 +72,6 @@ typedef struct DSPContext {
                        int stride);
    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
    int (*pix_sum)(uint8_t *pix, int line_size);
    int (*pix_norm1)(uint8_t *pix, int line_size);
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
@@ -108,9 +105,6 @@ typedef struct DSPContext {
 #define EDGE_WIDTH 16
 #define EDGE_TOP    1
 #define EDGE_BOTTOM 2
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
                      int src_wrap, int width, int height);
 } DSPContext;
 void ff_dsputil_static_init(void);
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -903,8 +903,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
    /* intra / predictive decision */
    pix = c->src[0][0];
    sum = s->dsp.pix_sum(pix, s->linesize);
    varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500;
    sum  = s->mpvencdsp.pix_sum(pix, s->linesize);
    varc = s->mpvencdsp.pix_norm1(pix, s->linesize) -
           (((unsigned) sum * sum) >> 8) + 500;
    pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
    pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8;
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -1010,7 +1010,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
            int offset = x + y * stride;
            int sad  = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
                                     16);
            int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8;
            int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8;
            int sae  = get_sae(src + offset, mean, stride);
            acc += sae + 500 < sad;
@@ -1278,15 +1278,21 @@ static int estimate_best_b_count(MpegEncContext *s)
                data[2] += INPLACE_OFFSET;
            }
            s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0],
                                 data[0], pre_input.f->linesize[0],
                                 c->width,      c->height);
            s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1],
                                 data[1], pre_input.f->linesize[1],
                                 c->width >> 1, c->height >> 1);
            s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2],
                                 data[2], pre_input.f->linesize[2],
                                 c->width >> 1, c->height >> 1);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
                                       s->tmp_frames[i]->linesize[0],
                                       data[0],
                                       pre_input.f->linesize[0],
                                       c->width, c->height);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
                                       s->tmp_frames[i]->linesize[1],
                                       data[1],
                                       pre_input.f->linesize[1],
                                       c->width >> 1, c->height >> 1);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
                                       s->tmp_frames[i]->linesize[2],
                                       data[2],
                                       pre_input.f->linesize[2],
                                       c->width >> 1, c->height >> 1);
        }
    }
@@ -2585,9 +2591,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
            int yy = mb_y * 16;
            uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx;
            int varc;
            int sum = s->dsp.pix_sum(pix, s->linesize);
            int sum = s->mpvencdsp.pix_sum(pix, s->linesize);
            varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8;
            varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) -
                    (((unsigned) sum * sum) >> 8) + 500 + 128) >> 8;
            s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
            s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -22,7 +22,10 @@
 #include "config.h"
 #include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "imgconvert.h"
 #include "mpegvideoencdsp.h"
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
@@ -54,12 +57,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
                  (BASIS_SHIFT - RECON_SHIFT);
 }
 /**
  * Add up every pixel of a 16x16 block.
  *
  * @param pix       first pixel of the first row
  * @param line_size distance in bytes between the starts of two rows
  * @return sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t *pix, int line_size)
 {
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* Consume one full 16-pixel row, then jump to the next one. */
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }

    return total;
 }
 /**
  * Accumulate ff_square_tab[256 + p] for every pixel p of a 16x16 block.
  *
  * The table is filled in outside this function; presumably entry 256 + p
  * holds p * p, making the result the sum of squared pixel values -- confirm.
  *
  * Pixels are deliberately read one byte at a time. Fetching them through a
  * uint8_t pointer cast to uint64_t * or uint32_t * violates the C
  * effective-type (strict aliasing) rule and demands an alignment that a
  * uint8_t pointer does not guarantee. Every byte indexes the table on its
  * own, so the result does not depend on load width or endianness.
  *
  * @param pix       first pixel of the first row
  * @param line_size distance in bytes between the starts of two rows
  * @return sum of the table entries selected by the 256 pixels
  */
 static int pix_norm1_c(uint8_t *pix, int line_size)
 {
    int s = 0, i, j;
    uint32_t *sq = ff_square_tab + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += sq[pix[j]];
        pix += line_size;
    }
    return s;
 }
 /**
  * Fill an MpegvideoEncDSPContext with the C implementations, then give the
  * architecture-specific init routines the chance to override them.
  *
  * @param c     function-pointer table to initialise
  * @param avctx passed through unchanged to the per-architecture init
  */
 av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                                     AVCodecContext *avctx)
 {
    /* Portable defaults; these must be installed before any override. */
    c->pix_sum      = pix_sum_c;
    c->pix_norm1    = pix_norm1_c;
    c->try_8x8basis = try_8x8basis_c;
    c->add_8x8basis = add_8x8basis_c;

    /* Index 0 is a plain plane copy; going by their names, the other
     * entries presumably downscale by 2, 4 and 8 -- confirm. */
    c->shrink[3] = ff_shrink88;
    c->shrink[2] = ff_shrink44;
    c->shrink[1] = ff_shrink22;
    c->shrink[0] = av_image_copy_plane;

    /* ARCH_* are build-configuration constants; at most the branches for
     * the configured architecture make a call. */
    if (ARCH_ARM)
        ff_mpegvideoencdsp_init_arm(c, avctx);
    if (ARCH_PPC)
        ff_mpegvideoencdsp_init_ppc(c, avctx);
    if (ARCH_X86)
        ff_mpegvideoencdsp_init_x86(c, avctx);
 }
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext {
                        int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
    int (*pix_sum)(uint8_t *pix, int line_size);
    int (*pix_norm1)(uint8_t *pix, int line_size);
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
                      int src_wrap, int width, int height);
 } MpegvideoEncDSPContext;
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                             AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
                                          ppc/mpegvideodsp.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += ppc/mpegvideoencdsp.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
 }
 /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
 }
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               int line_size)
 {
@@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->pix_sum   = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    if (!high_bit_depth) {
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -0,0 +1,103 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "config.h"
 #include <stdint.h>
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 #if HAVE_ALTIVEC
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
 }
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
 }
 #endif /* HAVE_ALTIVEC */
 /**
  * Install the AltiVec pix_sum / pix_norm1 routines.
  *
  * Does nothing unless the build has HAVE_ALTIVEC set and PPC_ALTIVEC()
  * accepts the flags returned by av_get_cpu_flags().
  *
  * @param c     function-pointer table to update
  * @param avctx not used by this function
  */
 av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
 #if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->pix_sum   = pix_sum_altivec;
        c->pix_norm1 = pix_norm1_altivec;
    }
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -517,6 +517,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
    ff_dsputil_init(&s->dsp, avctx);
    ff_hpeldsp_init(&s->hdsp, avctx->flags);
    ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
    avctx->coded_frame = av_frame_alloc();
    s->current_picture = av_frame_alloc();
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -109,6 +109,7 @@ YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
 YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                          x86/fpel.o                    \
                                          x86/qpel.o
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -23,10 +23,6 @@
 %include "libavutil/x86/x86util.asm"
 SECTION_RODATA
 cextern pw_1
 SECTION .text
 %macro DIFF_PIXELS_1 4
@@ -465,113 +461,6 @@ cglobal diff_pixels, 4, 5, 5
    jne .loop
    RET
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 ; (the same macro is also instantiated for sse2 and, when available, xop)
 ; Sums the bytes of 16 rows of 16 bytes each and returns the total in eax.
 ; %1 = number of xmm registers used
 ; %2 = number of loops
 ; %3 = number of GPRs used
 ; %4 = multiple of line_size added to the row pointer per loop iteration
 ;      (ignored when mmsize == 8, where the pointer advances by one row)
 ; Register roles as used below: r0 = row pointer, r1 = line_size,
 ; r2 = loop counter, r3 = 3 * line_size (xop only).
 ; NOTE(review): rows are read with mova; presumably that requires pix to be
 ; aligned to mmsize -- confirm against x86inc and the callers.
 %macro PIX_SUM16 4
 cglobal pix_sum16, 2, %3, %1
    movsxdifnidn r1, r1d
    mov          r2, %2
 %if cpuflag(xop)
    lea          r3, [r1*3]            ; offset of the fourth row of each group
 %else
    pxor         m5, m5                ; zero, used to widen bytes to words
 %endif
    pxor         m4, m4                ; running sum
 .loop:
 %if cpuflag(xop)
    ; four rows per iteration, bytes added horizontally by vphaddubq
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
 %else
    mova         m0, [r0]
 %if mmsize == 8
    mova         m1, [r0+8]            ; second half of the same row
 %else
    mova         m1, [r0+r1]           ; the following row
 %endif
    ; interleave with zero: bytes become words in m0..m3
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
 %endif ; cpuflag(xop)
    ; fold m0..m3 together and add them to the running sum
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m4, m3
 %if mmsize == 8
    add          r0, r1
 %else
    lea          r0, [r0+r1*%4]
 %endif
    dec r2
    jne .loop
 %if cpuflag(xop)
    ; add the upper half of the accumulator onto the lower half
    pshufd       m0, m4, q0032
    paddd        m4, m0
 %else
    ; HADDW comes from x86util.asm; presumably a horizontal add of the
    ; words in m4 using m5 as scratch -- confirm.
    HADDW        m4, m5
 %endif
    movd        eax, m4                ; return value
    RET
 %endmacro
 ; mmx:  16 iterations, one row each (two 8-byte loads per row)
 INIT_MMX mmx
 PIX_SUM16 0, 16, 3, 0
 ; sse2: 8 iterations, two rows each
 INIT_XMM sse2
 PIX_SUM16 6, 8,  3, 2
 %if HAVE_XOP_EXTERNAL
 ; xop:  4 iterations, four rows each
 INIT_XMM xop
 PIX_SUM16 5, 4,  4, 4
 %endif
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
 ; (the same macro is also instantiated for sse2)
 ; Sums the squares of the bytes of 16 rows of 16 bytes each and returns
 ; the total in eax.
 ; %1 = number of xmm registers used
 ; %2 = number of loops
 ; Register roles as used below: r0 = row pointer, r1 = line_size,
 ; r2 = loop counter.
 ; NOTE(review): rows are read with mova; presumably that requires pix to be
 ; aligned to mmsize -- confirm against x86inc and the callers.
 %macro PIX_NORM1 2
 cglobal pix_norm1, 2, 3, %1
    movsxdifnidn r1, r1d
    mov          r2, %2
    pxor         m0, m0                ; zero, used to widen bytes to words
    pxor         m5, m5                ; running sum of squares (dwords)
 .loop:
    mova         m2, [r0+0]
 %if mmsize == 8
    mova         m3, [r0+8]            ; second half of the same row
 %else
    mova         m3, [r0+r1]           ; the following row
 %endif
    ; interleave with zero: bytes become words in m1..m4
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    ; square each word and add neighbouring products into dwords
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
 %if mmsize == 8
    add          r0, r1                ; one row consumed
 %else
    lea          r0, [r0+r1*2]         ; two rows consumed
 %endif
    dec r2
    jne .loop
    ; HADDD comes from x86util.asm; presumably a horizontal add of the
    ; dwords in m5 using m1 as scratch -- confirm.
    HADDD        m5, m1
    movd        eax, m5                ; return value
    RET
 %endmacro
 ; mmx:  16 iterations, one row each (two 8-byte loads per row)
 INIT_MMX mmx
 PIX_NORM1 0, 16
 ; sse2: 8 iterations, two rows each
 INIT_XMM sse2
 PIX_NORM1 6, 8
 ;-----------------------------------------------
 ;int ff_sum_abs_dctelem(int16_t *block)
 ;-----------------------------------------------
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -37,11 +37,6 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
 int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
@@ -364,8 +359,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags))
@@ -431,8 +424,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
        c->diff_pixels = ff_diff_pixels_sse2;
        c->pix_sum     = ff_pix_sum16_sse2;
        c->pix_norm1   = ff_pix_norm1_sse2;
 #if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
@@ -448,9 +439,5 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif
    }
    if (EXTERNAL_XOP(cpu_flags)) {
        c->pix_sum           = ff_pix_sum16_xop;
    }
    ff_dsputil_init_pix_mmx(c, avctx);
 }
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -0,0 +1,137 @@
 ;*****************************************************************************
 ;* SIMD-optimized MPEG encoding functions
 ;*****************************************************************************
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION_RODATA
 cextern pw_1
 SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 ; Sum of the byte values over a block of pixels; result is returned in eax.
 ; %1 = number of xmm registers used
 ; %2 = number of loops
 ; %3 = number of GPRs used
 ; %4 = multiplier applied to line_size when advancing pix after each
 ;      iteration in the XMM builds (i.e. rows consumed per iteration);
 ;      not referenced when mmsize == 8
 %macro PIX_SUM16 4
 cglobal pix_sum16, 2, %3, %1
    ; widen the 32-bit line_size to a full-width register where needed
    movsxdifnidn r1, r1d
    ; r2 = remaining loop iterations
    mov          r2, %2
 %if cpuflag(xop)
    ; r3 = 3 * line_size, used to address the fourth row of each iteration
    lea          r3, [r1*3]
 %else
    ; m5 = all-zero register used to widen bytes to words
    pxor         m5, m5
 %endif
    ; m4 = running accumulator
    pxor         m4, m4
 .loop:
 %if cpuflag(xop)
    ; XOP: each vphaddubq horizontally adds the unsigned bytes of one row
    ; into quadword lanes; four consecutive rows are handled per iteration
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
 %else
    mova         m0, [r0]
 %if mmsize == 8
    ; MMX: second load is the next 8 bytes of the same row
    mova         m1, [r0+8]
 %else
    ; XMM: second load is the row one line_size further down
    mova         m1, [r0+r1]
 %endif
    ; interleave with zero: bytes become 16-bit words (high and low halves)
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
 %endif ; cpuflag(xop)
    ; fold the four partial results m0..m3 into the accumulator m4
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m4, m3
 %if mmsize == 8
    ; MMX consumed one row per iteration
    add          r0, r1
 %else
    ; XMM builds advance by %4 rows
    lea          r0, [r0+r1*%4]
 %endif
    dec r2
    jne .loop
 %if cpuflag(xop)
    ; add the upper quadword lane onto the lower one
    pshufd       m0, m4, q0032
    paddd        m4, m0
 %else
    ; HADDW comes from x86util.asm - presumably reduces the word lanes of m4
    ; to a single sum using m5 as scratch; verify against the macro.
    HADDW        m4, m5
 %endif
    movd        eax, m4
    RET
 %endmacro
 ; MMX: 16 iterations, one row each (%4 unused, passed as 0)
 INIT_MMX mmx
 PIX_SUM16 0, 16, 3, 0
 ; SSE2: 8 iterations, two rows each
 INIT_XMM sse2
 PIX_SUM16 6, 8,  3, 2
 ; XOP: 4 iterations, four rows each; needs a fourth GPR for r3
 %if HAVE_XOP_EXTERNAL
 INIT_XMM xop
 PIX_SUM16 5, 4,  4, 4
 %endif
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
 ; Sum of squared byte values over a block of pixels; the 32-bit result is
 ; returned in eax.
 ; %1 = number of xmm registers used
 ; %2 = number of loops
 %macro PIX_NORM1 2
 cglobal pix_norm1, 2, 3, %1
    ; widen the 32-bit line_size to a full-width register where needed
    movsxdifnidn r1, r1d
    ; r2 = remaining loop iterations
    mov          r2, %2
    ; m0 = all-zero register used to widen bytes to words,
    ; m5 = dword accumulator
    pxor         m0, m0
    pxor         m5, m5
 .loop:
    mova         m2, [r0+0]
 %if mmsize == 8
    ; MMX: second load is the next 8 bytes of the same row
    mova         m3, [r0+8]
 %else
    ; XMM: second load is the row one line_size further down
    ; NOTE(review): mova is presumably an aligned load - confirm that pix and
    ; line_size are suitably aligned in all callers.
    mova         m3, [r0+r1]
 %endif
    ; interleave with zero: bytes become 16-bit words (high and low halves)
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    ; square each word and add adjacent pairs into dwords
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    ; fold the four partial results into the accumulator
    paddd        m2, m1
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
 %if mmsize == 8
    ; MMX consumed one row per iteration
    add          r0, r1
 %else
    ; XMM consumed two rows per iteration
    lea          r0, [r0+r1*2]
 %endif
    dec r2
    jne .loop
    ; HADDD comes from x86util.asm - presumably reduces the dword lanes of m5
    ; into its low dword using m1 as scratch; verify against the macro.
    HADDD        m5, m1
    movd        eax, m5
    RET
 %endmacro
 ; MMX: 16 iterations, 16 bytes of one row each
 INIT_MMX mmx
 PIX_NORM1 0, 16
 ; SSE2: 8 iterations, two 16-byte rows each
 INIT_XMM sse2
 PIX_NORM1 6, 8
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -22,6 +22,12 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 /* Prototypes for the externally assembled pix_sum16 / pix_norm1 variants.
  * The name suffix identifies the instruction set each variant is built for;
  * both take a pointer to the pixel data and the line size. */
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 #if HAVE_INLINE_ASM
 #define PHADDD(a, t)                            \
@@ -95,9 +101,24 @@
 av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
 #if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_MMX(cpu_flags)) {
        c->pix_sum   = ff_pix_sum16_mmx;
        c->pix_norm1 = ff_pix_norm1_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->pix_sum     = ff_pix_sum16_sse2;
        c->pix_norm1   = ff_pix_norm1_sse2;
    }
    if (EXTERNAL_XOP(cpu_flags)) {
        c->pix_sum     = ff_pix_sum16_xop;
    }
 #if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_mmx;