dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc

12 years ago · c166148409
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,6 +21,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                          arm/simple_idct_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                += arm/videodsp_init_arm.o
 OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
@@ -60,6 +61,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                          arm/idctdsp_armv6.o           \
                                          arm/simple_idct_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_VP7_DECODER)       += arm/vp8_armv6.o               \
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1
        pop             {r4-r9, pc}
 endfunc
 @ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
 @
 @ Returns the sum of the squared values of the pixels in a block of
 @ 16 rows by 16 bytes.
 @   r0  = pix        (advanced by line_size after each row)
 @   r1  = line_size
 @   r12 = remaining row count
 @   lr  = running sum of squares, moved to r0 on return
 function ff_pix_norm1_armv6, export=1
        push            {r4-r6, lr}
        mov             r12, #16
        mov             lr,  #0
 1:
        @ Load one 16-byte row into r2-r5 (four pixels per register).
        ldm             r0,  {r2-r5}
        @ For each word: uxtb16 extracts bytes 0 and 2 as two halfwords,
        @ uxtb16 with "ror #8" extracts bytes 1 and 3.  smlad x, x then
        @ adds both squared halfwords to the accumulator.  The extracts
        @ and multiply-accumulates of neighbouring words are interleaved.
        uxtb16          r6,  r2
        uxtb16          r2,  r2,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r3
        smlad           lr,  r2,  r2,  lr
        uxtb16          r3,  r3,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r4
        smlad           lr,  r3,  r3,  lr
        uxtb16          r4,  r4,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r5
        smlad           lr,  r4,  r4,  lr
        uxtb16          r5,  r5,  ror #8
        smlad           lr,  r6,  r6,  lr
        @ Row bookkeeping; the flags set by subs are consumed by bgt below
        @ (add and smlad do not modify them).
        subs            r12, r12, #1
        add             r0,  r0,  r1
        smlad           lr,  r5,  r5,  lr
        bgt             1b
        mov             r0,  lr
        pop             {r4-r6, pc}
 endfunc
 @ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
 @
 @ Returns the sum of the pixel values in a block of 16 rows by 16 bytes.
 @   r0     = pix, r1 = line_size
 @   r12    = remaining row count
 @   r2, r3 = two independent partial sums, added together at the end
 @   lr     = constant zero, so usada8 (sum of absolute byte differences
 @            plus accumulator) reduces to "add the four bytes of a word"
 function ff_pix_sum_armv6, export=1
        push            {r4-r7, lr}
        mov             r12, #16
        mov             r2,  #0
        mov             r3,  #0
        mov             lr,  #0
        @ The first word of the first row is loaded ahead of the loop.
        ldr             r4,  [r0]
 1:
        subs            r12, r12, #1
        ldr             r5,  [r0, #4]
        usada8          r2,  r4,  lr,  r2
        ldr             r6,  [r0, #8]
        usada8          r3,  r5,  lr,  r3
        ldr             r7,  [r0, #12]
        usada8          r2,  r6,  lr,  r2
        @ Last row: leave before fetching a word from a following row.
        beq             2f
        @ ldr_pre is a macro from the included asm.S; presumably a
        @ pre-indexed load with writeback (r0 += r1, r4 = [r0]) -- confirm.
        ldr_pre         r4,  r0,  r1
        usada8          r3,  r7,  lr,  r3
        bgt             1b
 2:
        @ Accumulate the final word of the last row.
        usada8          r3,  r7,  lr,  r3
        add             r0,  r2,  r3
        pop             {r4-r7, pc}
 endfunc
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                   int line_size, int h);
 int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
 int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
    c->sad[1] = ff_pix_abs8_armv6;
    c->sse[0] = ff_sse16_armv6;
    c->pix_norm1 = ff_pix_norm1_armv6;
    c->pix_sum   = ff_pix_sum_armv6;
 }
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -0,0 +1,76 @@
 /*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "libavutil/arm/asm.S"
 @ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
 @
 @ Sum of squares of the pixel values of a 16-row, 16-byte-wide block.
 @ Register use:
 @   r0  pix pointer, stepped by line_size per row
 @   r1  line_size
 @   r12 rows left to process
 @   lr  accumulator; copied into r0 as the return value
 function ff_pix_norm1_armv6, export=1
        push            {r4-r6, lr}
        mov             r12, #16
        mov             lr,  #0
 1:
        @ r2-r5 receive the 16 bytes of the current row.
        ldm             r0,  {r2-r5}
        @ uxtb16 unpacks the even bytes of a word into two halfwords; the
        @ "ror #8" form unpacks the odd bytes.  smlad with both operands
        @ equal accumulates the squares of those two halfwords into lr.
        uxtb16          r6,  r2
        uxtb16          r2,  r2,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r3
        smlad           lr,  r2,  r2,  lr
        uxtb16          r3,  r3,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r4
        smlad           lr,  r3,  r3,  lr
        uxtb16          r4,  r4,  ror #8
        smlad           lr,  r6,  r6,  lr
        uxtb16          r6,  r5
        smlad           lr,  r4,  r4,  lr
        uxtb16          r5,  r5,  ror #8
        smlad           lr,  r6,  r6,  lr
        @ Decrement the row counter and advance to the next row; bgt uses
        @ the flags from subs.
        subs            r12, r12, #1
        add             r0,  r0,  r1
        smlad           lr,  r5,  r5,  lr
        bgt             1b
        mov             r0,  lr
        pop             {r4-r6, pc}
 endfunc
 @ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
 @
 @ Sum of the pixel values of a 16-row, 16-byte-wide block.
 @ Register use:
 @   r0, r1  pix pointer and line_size
 @   r12     rows left to process
 @   r2, r3  partial sums (combined into r0 on return)
 @   lr      zero; "usada8 rd, rn, lr, ra" therefore adds the four bytes
 @           of rn to ra
 function ff_pix_sum_armv6, export=1
        push            {r4-r7, lr}
        mov             r12, #16
        mov             r2,  #0
        mov             r3,  #0
        mov             lr,  #0
        @ Prime the loop with the first word of row 0.
        ldr             r4,  [r0]
 1:
        subs            r12, r12, #1
        ldr             r5,  [r0, #4]
        usada8          r2,  r4,  lr,  r2
        ldr             r6,  [r0, #8]
        usada8          r3,  r5,  lr,  r3
        ldr             r7,  [r0, #12]
        usada8          r2,  r6,  lr,  r2
        @ On the last row skip the load of the next row's first word.
        beq             2f
        @ ldr_pre comes from asm.S; presumably it loads from r0 + r1 and
        @ writes the new address back to r0 -- confirm against the macro.
        ldr_pre         r4,  r0,  r1
        usada8          r3,  r7,  lr,  r3
        bgt             1b
 2:
        @ Fold in the last word of the last row, then merge both sums.
        usada8          r3,  r7,  lr,  r3
        add             r0,  r2,  r3
        pop             {r4-r7, pc}
 endfunc
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -0,0 +1,38 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include <stdint.h>
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
 int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 /**
  * Install the ARM-specific implementations into the encoder DSP context.
  *
  * Entries are only replaced when the running CPU reports ARMv6 support;
  * otherwise the context is left exactly as the caller set it up.
  *
  * @param c     DSP context whose pix_norm1 / pix_sum pointers may be replaced
  * @param avctx codec context (not read by this function)
  */
 av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
    int flags = av_get_cpu_flags();

    if (!have_armv6(flags))
        return;

    c->pix_sum   = ff_pix_sum_armv6;
    c->pix_norm1 = ff_pix_norm1_armv6;
 }
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -309,6 +309,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
    ff_blockdsp_init(&ctx->bdsp, avctx);
    ff_dsputil_init(&ctx->m.dsp, avctx);
    ff_idctdsp_init(&ctx->m.idsp, avctx);
    ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
    ff_dct_common_init(&ctx->m);
    if (!ctx->m.dct_quantize)
        ctx->m.dct_quantize = ff_dct_quantize_c;
@@ -719,8 +720,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
            int varc;
            if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {
                sum  = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
                varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize);
                sum  = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize);
                varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize);
            } else {
                int bw = FFMIN(avctx->width - 16 * mb_x, 16);
                int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -26,15 +26,12 @@
 */
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "copy_block.h"
 #include "dct.h"
 #include "dsputil.h"
 #include "simple_idct.h"
 #include "faandct.h"
 #include "imgconvert.h"
 #include "mathops.h"
 #include "mpegvideo.h"
 #include "config.h"
@@ -47,74 +44,6 @@ uint32_t ff_square_tab[512] = { 0, };
 #define BIT_DEPTH 8
 #include "dsputilenc_template.c"
 /**
  * Sum the pixel values of a 16x16 block.
  *
  * @param pix       pointer to the top-left pixel of the block
  * @param line_size distance in bytes between the starts of two rows
  * @return sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t *pix, int line_size)
 {
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }

    return total;
 }
 /**
  * Sum the squared pixel values of a 16x16 block.
  *
  * Squares are looked up in ff_square_tab (offset by 256), so the result
  * depends on that table having been filled in by the caller's setup code.
  *
  * Pixels are read one byte at a time. The previous implementation loaded
  * them through uint64_t / uint32_t pointers cast from the byte pointer,
  * which breaks the C effective-type (strict aliasing) rule and requires
  * an alignment the interface never promised. The accumulated value is
  * unchanged: the same 256 table entries are added.
  *
  * @param pix       pointer to the top-left pixel of the block
  * @param line_size distance in bytes between the starts of two rows
  * @return sum over the block of ff_square_tab[256 + pixel]
  */
 static int pix_norm1_c(uint8_t *pix, int line_size)
 {
    int s = 0, i, j;
    uint32_t *sq = ff_square_tab + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += sq[pix[j]];
        pix += line_size;
    }
    return s;
 }
 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h)
 {
@@ -1055,9 +984,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->pix_sum   = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
@@ -1097,11 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    c->nsse[0] = nsse16_c;
    c->nsse[1] = nsse8_c;
    c->shrink[0] = av_image_copy_plane;
    c->shrink[1] = ff_shrink22;
    c->shrink[2] = ff_shrink44;
    c->shrink[3] = ff_shrink88;
    c->draw_edges = draw_edges_8_c;
    switch (avctx->bits_per_raw_sample) {
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -58,9 +58,6 @@ typedef struct DSPContext {
                        int stride);
    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
    int (*pix_sum)(uint8_t *pix, int line_size);
    int (*pix_norm1)(uint8_t *pix, int line_size);
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
@@ -92,9 +89,6 @@ typedef struct DSPContext {
 #define EDGE_WIDTH 16
 #define EDGE_TOP    1
 #define EDGE_BOTTOM 2
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
                      int src_wrap, int width, int height);
 } DSPContext;
 void ff_dsputil_static_init(void);
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -881,8 +881,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
    /* intra / predictive decision */
    pix = c->src[0][0];
    sum = s->dsp.pix_sum(pix, s->linesize);
    varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500;
    sum  = s->mpvencdsp.pix_sum(pix, s->linesize);
    varc = s->mpvencdsp.pix_norm1(pix, s->linesize) -
           (((unsigned) sum * sum) >> 8) + 500;
    pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
    pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8;
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -895,7 +895,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
            int offset = x + y * stride;
            int sad  = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
                                     16);
            int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8;
            int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8;
            int sae  = get_sae(src + offset, mean, stride);
            acc += sae + 500 < sad;
@@ -1138,15 +1138,21 @@ static int estimate_best_b_count(MpegEncContext *s)
                pre_input.f->data[2] += INPLACE_OFFSET;
            }
            s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0],
                                 pre_input.f->data[0], pre_input.f->linesize[0],
                                 c->width,      c->height);
            s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1],
                                 pre_input.f->data[1], pre_input.f->linesize[1],
                                 c->width >> 1, c->height >> 1);
            s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2],
                                 pre_input.f->data[2], pre_input.f->linesize[2],
                                 c->width >> 1, c->height >> 1);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
                                       s->tmp_frames[i]->linesize[0],
                                       pre_input.f->data[0],
                                       pre_input.f->linesize[0],
                                       c->width, c->height);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
                                       s->tmp_frames[i]->linesize[1],
                                       pre_input.f->data[1],
                                       pre_input.f->linesize[1],
                                       c->width >> 1, c->height >> 1);
            s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
                                       s->tmp_frames[i]->linesize[2],
                                       pre_input.f->data[2],
                                       pre_input.f->linesize[2],
                                       c->width >> 1, c->height >> 1);
        }
    }
@@ -2420,9 +2426,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
            int yy = mb_y * 16;
            uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx;
            int varc;
            int sum = s->dsp.pix_sum(pix, s->linesize);
            int sum = s->mpvencdsp.pix_sum(pix, s->linesize);
            varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8;
            varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) -
                    (((unsigned) sum * sum) >> 8) + 500 + 128) >> 8;
            s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
            s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -21,7 +21,10 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "imgconvert.h"
 #include "mpegvideoencdsp.h"
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
@@ -53,12 +56,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
                  (BASIS_SHIFT - RECON_SHIFT);
 }
 /**
  * Add up all pixel values of a 16x16 block.
  *
  * @param pix       pointer to the first pixel of the first row
  * @param line_size byte offset from one row to the next
  * @return sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t *pix, int line_size)
 {
    int acc = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        const uint8_t *row = pix + y * line_size;

        for (x = 0; x < 16; x++)
            acc += row[x];
    }

    return acc;
 }
 /**
  * Sum the squared pixel values of a 16x16 block.
  *
  * Squares come from ff_square_tab (indexed from its midpoint, +256), so
  * the table must have been initialised before this function is used.
  *
  * Each pixel is read as a single byte. The earlier code fetched 8 (or 4)
  * pixels at once by casting the byte pointer to uint64_t * / uint32_t *;
  * that violates the C effective-type (strict aliasing) rule and silently
  * demands 8- or 4-byte alignment of pix. Reading bytes adds exactly the
  * same table entries, so the return value is unchanged.
  *
  * @param pix       pointer to the top-left pixel of the block
  * @param line_size distance in bytes between the starts of two rows
  * @return sum over the block of ff_square_tab[256 + pixel]
  */
 static int pix_norm1_c(uint8_t *pix, int line_size)
 {
    int s = 0, i, j;
    uint32_t *sq = ff_square_tab + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += sq[pix[j]];
        pix += line_size;
    }
    return s;
 }
 /**
  * Fill an MpegvideoEncDSPContext with the portable C implementations and
  * then let the architecture-specific initialisers override entries.
  *
  * @param c     context to initialise
  * @param avctx codec context, forwarded to the per-architecture init
  */
 av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                                     AVCodecContext *avctx)
 {
    /* Block statistics. */
    c->pix_sum   = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* 8x8 basis helpers. */
    c->try_8x8basis = try_8x8basis_c;
    c->add_8x8basis = add_8x8basis_c;

    /* Plane shrinkers, selected by index 0..3; index 0 is a plain copy. */
    c->shrink[0] = av_image_copy_plane;
    c->shrink[1] = ff_shrink22;
    c->shrink[2] = ff_shrink44;
    c->shrink[3] = ff_shrink88;

    /* Architecture overrides run last so they can replace the defaults. */
    if (ARCH_ARM)
        ff_mpegvideoencdsp_init_arm(c, avctx);
    if (ARCH_PPC)
        ff_mpegvideoencdsp_init_ppc(c, avctx);
    if (ARCH_X86)
        ff_mpegvideoencdsp_init_x86(c, avctx);
 }
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext {
                        int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
    int (*pix_sum)(uint8_t *pix, int line_size);
    int (*pix_norm1)(uint8_t *pix, int line_size);
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
                      int src_wrap, int width, int height);
 } MpegvideoEncDSPContext;
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                             AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                 AVCodecContext *avctx);
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
                                          ppc/mpegvideodsp.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += ppc/mpegvideoencdsp.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
 }
 /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
 }
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               int line_size)
 {
@@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->pix_sum   = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    if (!high_bit_depth) {
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -0,0 +1,103 @@
 /*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "config.h"
 #include <stdint.h>
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 #if HAVE_ALTIVEC
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
 }
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);
        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
 }
 #endif /* HAVE_ALTIVEC */
 /**
  * Install the AltiVec implementations into the encoder DSP context.
  *
  * Does nothing when built without AltiVec or when the running CPU does
  * not report AltiVec support.
  *
  * @param c     DSP context whose pix_norm1 / pix_sum pointers may be replaced
  * @param avctx codec context (not read by this function)
  */
 av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
 #if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->pix_sum   = pix_sum_altivec;
        c->pix_norm1 = pix_norm1_altivec;
    }
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -511,6 +511,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
    ff_dsputil_init(&s->dsp, avctx);
    ff_hpeldsp_init(&s->hdsp, avctx->flags);
    ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
    avctx->coded_frame = av_frame_alloc();
    s->current_picture = av_frame_alloc();
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -92,6 +92,7 @@ YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                          x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                          x86/fpel.o                    \
                                          x86/qpel.o
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -418,72 +418,3 @@ cglobal diff_pixels, 4,5
    add          r4, 16
    jne .loop
    REP_RET
 INIT_MMX mmx
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 ;
 ; Returns the sum of the pixel values of a block of 16 rows by 16 bytes.
 ;   r0 = pix, r1 = line_size
 ;   r2 = negative byte offset that counts up to zero (row index)
 ;   m6 = four 16-bit partial sums, m7 = zero (for byte->word unpacking)
 cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    ; r2 = -16 * line_size and r0 = pix + 16 * line_size, so [r0+r2] starts
    ; at the first row and reaches the end of the block when r2 hits zero.
    ; NOTE(review): the js-based loop exit relies on line_size > 0 -- confirm.
    mov          r2, r1
    neg          r2
    shl          r2, 4
    sub          r0, r2
    pxor         m7, m7
    pxor         m6, m6
 .loop:
    ; Each 8-byte half of the row is loaded twice so that its low and high
    ; four bytes can be zero-extended to words separately.
    mova         m0, [r0+r2+0]
    mova         m1, [r0+r2+0]
    mova         m2, [r0+r2+8]
    mova         m3, [r0+r2+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    ; Add the four word vectors of this row into the running sums in m6.
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m6, m3
    add          r2, r1
    js .loop
    ; Horizontal add: fold the four words of m6 into its lowest word.
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd        eax, m6
    ; Only the low 16 bits hold the total; the word above it is a leftover
    ; partial sum.
    and         eax, 0xffff
    RET
 INIT_MMX mmx
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
 ;
 ; Returns the sum of the squared pixel values of a block of 16 rows by
 ; 16 bytes.
 ;   r0 = pix (advanced by line_size per row), r1 = line_size
 ;   r2 = remaining row count
 ;   m0 = zero (for byte->word unpacking), m7 = two 32-bit partial sums
 ; Note: four GPRs are requested from cglobal but only r0-r2 are used.
 cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
 .loop:
    ; Load the 16 pixels of the row and zero-extend them to four vectors
    ; of four words each (m1/m2 from the first 8 bytes, m3/m4 from the rest).
    mova         m2, [r0+0]
    mova         m3, [r0+8]
    mova         m1, m2
    punpckhbw    m1, m0
    punpcklbw    m2, m0
    mova         m4, m3
    punpckhbw    m3, m0
    punpcklbw    m4, m0
    ; pmaddwd x, x squares each word and adds adjacent pairs into dwords.
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec r2
    jne .loop
    ; Add the high dword of the accumulator to the low one and return it.
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd        eax, m1
    RET
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -35,8 +35,6 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 #if HAVE_INLINE_ASM
@@ -831,8 +829,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags))
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -0,0 +1,95 @@
 ;*****************************************************************************
 ;* SIMD-optimized MPEG encoding functions
 ;*****************************************************************************
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
 ;* This file is part of Libav.
 ;*
 ;* Libav is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* Libav is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with Libav; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION .text
 INIT_MMX mmx
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 ;
 ; Sum of the pixel values of a 16-row, 16-byte-wide block.
 ; Register use:
 ;   r0  pix, rebased to the end of the block
 ;   r1  line_size
 ;   r2  negative offset from r0, stepped by line_size until it reaches 0
 ;   m6  accumulator of four 16-bit lanes
 ;   m7  zero, used to widen bytes to words
 cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    ; Set r2 = -16 * line_size and move r0 forward by the same amount, so
    ; that [r0+r2] addresses the current row.
    ; NOTE(review): loop termination via js assumes line_size > 0 -- confirm.
    mov          r2, r1
    neg          r2
    shl          r2, 4
    sub          r0, r2
    pxor         m7, m7
    pxor         m6, m6
 .loop:
    ; Load both 8-byte halves of the row twice: one copy is unpacked from
    ; its low four bytes, the other from its high four bytes.
    mova         m0, [r0+r2+0]
    mova         m1, [r0+r2+0]
    mova         m2, [r0+r2+8]
    mova         m3, [r0+r2+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    ; Reduce the row to one vector of four words and accumulate it.
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m6, m3
    add          r2, r1
    js .loop
    ; Fold the four word lanes of m6 down into lane 0.
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd        eax, m6
    ; Keep only lane 0; the upper half of eax is another lane's leftover.
    and         eax, 0xffff
    RET
 INIT_MMX mmx
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
 ;
 ; Sum of the squared pixel values of a 16-row, 16-byte-wide block.
 ; Register use:
 ;   r0  pix, advanced by line_size after each row
 ;   r1  line_size
 ;   r2  rows left to process
 ;   m0  zero, used to widen bytes to words
 ;   m7  accumulator of two 32-bit lanes
 ; Note: cglobal reserves four GPRs here although only r0-r2 are touched.
 cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
 .loop:
    ; Widen the row's 16 bytes into four vectors of words (m1, m2, m3, m4).
    mova         m2, [r0+0]
    mova         m3, [r0+8]
    mova         m1, m2
    punpckhbw    m1, m0
    punpcklbw    m2, m0
    mova         m4, m3
    punpckhbw    m3, m0
    punpcklbw    m4, m0
    ; Multiplying a vector by itself with pmaddwd yields pairwise sums of
    ; squares as dwords.
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec r2
    jne .loop
    ; Combine the two dword lanes; the low dword of m1 is the result.
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd        eax, m1
    RET
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -22,6 +22,9 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 #if HAVE_INLINE_ASM
 #define PHADDD(a, t)                            \
@@ -95,9 +98,15 @@
 av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
 {
 #if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_MMX(cpu_flags)) {
        c->pix_sum   = ff_pix_sum16_mmx;
        c->pix_norm1 = ff_pix_norm1_mmx;
    }
 #if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_mmx;