Merge commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e'

* commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e': dsputil: Split off pixel block routines into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/mpegvideo_enc.c libavcodec/pixblockdsp_template.c libavcodec/x86/dsputilenc.asm libavcodec/x86/dsputilenc_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · 2d5e9451de
--- a/configure
+++ b/configure
@@ -1829,6 +1829,7 @@ CONFIG_EXTRA="
    mpegvideo
    mpegvideoenc
    nettle
    pixblockdsp
    qpeldsp
    rangecoder
    riffdec
@@ -1997,7 +1998,7 @@ threads_if_any="$THREADS_LIST"

 # subsystems
 dct_select="rdft"
 dsputil_select="fdctdsp idctdsp"
 dsputil_select="fdctdsp idctdsp pixblockdsp"
 error_resilience_select="dsputil"
 frame_thread_encoder_deps="encoders threads"
 intrax8_select="error_resilience"
@@ -2007,7 +2008,7 @@ mpeg_er_select="error_resilience"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
 mpegvideo_select="blockdsp dsputil h264chroma hpeldsp idctdsp videodsp"
 mpegvideoenc_select="dsputil mpegvideo qpeldsp"
 mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp"

 # decoders / encoders
 aac_decoder_select="mdct sinewin"
@@ -2026,9 +2027,9 @@ amv_decoder_select="sp5x_decoder exif"
 amv_encoder_select="aandcttables mpegvideoenc"
 ape_decoder_select="bswapdsp llauddsp"
 asv1_decoder_select="blockdsp bswapdsp idctdsp"
 asv1_encoder_select="bswapdsp dsputil fdctdsp"
 asv1_encoder_select="bswapdsp fdctdsp pixblockdsp"
 asv2_decoder_select="blockdsp bswapdsp idctdsp"
 asv2_encoder_select="bswapdsp dsputil fdctdsp"
 asv2_encoder_select="bswapdsp fdctdsp pixblockdsp"
 atrac1_decoder_select="mdct sinewin"
 atrac3_decoder_select="mdct"
 atrac3p_decoder_select="mdct sinewin"
@@ -2045,9 +2046,9 @@ cscd_decoder_suggest="zlib"
 dca_decoder_select="mdct"
 dirac_decoder_select="dsputil dwt golomb videodsp"
 dnxhd_decoder_select="blockdsp idctdsp"
 dnxhd_encoder_select="aandcttables blockdsp dsputil fdctdsp idctdsp mpegvideoenc"
 dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp"
 dvvideo_decoder_select="dvprofile idctdsp"
 dvvideo_encoder_select="dsputil dvprofile fdctdsp"
 dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp"
 dxa_decoder_select="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -82,6 +82,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += mpegvideo.o mpegvideodsp.o    \
 OBJS-$(CONFIG_MPEGVIDEOENC)            += mpegvideo_enc.o mpeg12data.o  \
                                          motion_est.o ratecontrol.o    \
                                          mpegvideoencdsp.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += pixblockdsp.o
 OBJS-$(CONFIG_QPELDSP)                 += qpeldsp.o
 OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
 RDFT-OBJS-$(CONFIG_HARDCODED_TABLES)   += sin_tables.o
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += arm/pixblockdsp_init_arm.o
 OBJS-$(CONFIG_VIDEODSP)                += arm/videodsp_init_arm.o
 OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o

@@ -63,6 +64,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                          arm/simple_idct_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)       += arm/pixblockdsp_armv6.o

 ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_VC1_DECODER)       += arm/startcode_armv6.o
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -20,61 +20,6 @@

 #include "libavutil/arm/asm.S"

 /*
  * ff_get_pixels_armv6: widen an 8x8 block of bytes to 16-bit values.
  *
  * As the code uses them, r0 is the store pointer, r1 the load pointer and
  * r2 the per-row increment; this matches the (block, pixels, stride)
  * arguments of the C prototype.  Each of the 8 loop iterations reads
  * 8 source bytes and writes 8 halfwords (16 bytes) through r0.
  * The pixel ordering described below assumes little-endian byte order.
  */
 function ff_get_pixels_armv6, export=1
        /* prefetch at r1 + r2, i.e. one row ahead of the current pointer */
        pld             [r1, r2]
        push            {r4-r8, lr}
        /* lr doubles as the row counter (8 rows) */
        mov             lr,  #8
 1:
        /* ldrd_post comes from asm.S (not shown here); presumably it loads
         * r4/r5 from [r1] and then advances r1 by r2 */
        ldrd_post       r4,  r5,  r1,  r2
        /* decrement the row counter; nothing below touches the flags, so
         * the bgt at the end of the loop tests this result */
        subs            lr,  lr,  #1
        /* zero-extend bytes 0,2 of each word, then (after ror #8) bytes 1,3 */
        uxtb16          r6,  r4
        uxtb16          r4,  r4,  ror #8
        uxtb16          r12, r5
        uxtb16          r8,  r5,  ror #8
        pld             [r1, r2]
        /* re-interleave the halfwords into pixel order:
         * r5 = {p0,p1}, r6 = {p2,p3}, r7 = {p4,p5}, r12 = {p6,p7} */
        pkhbt           r5,  r6,  r4,  lsl #16
        pkhtb           r6,  r4,  r6,  asr #16
        pkhbt           r7,  r12, r8,  lsl #16
        pkhtb           r12, r8,  r12, asr #16
        /* store the 8 widened pixels and advance r0 by 16 bytes */
        stm             r0!, {r5,r6,r7,r12}
        bgt             1b

        /* restore callee-saved registers and return by popping into pc */
        pop             {r4-r8, pc}
 endfunc

 /*
  * ff_diff_pixels_armv6: 8x8 byte-wise difference stored as 16-bit values.
  *
  * As the code uses them, r0 is the store pointer, r1 and r2 are the two
  * load pointers and r3 the per-row increment applied to both; this matches
  * the (block, s1, s2, stride) arguments of the C prototype.  Each of the
  * 8 loop iterations writes 8 halfwords holding s1[x] - s2[x].
  * The pixel ordering described below assumes little-endian byte order.
  */
 function ff_diff_pixels_armv6, export=1
        /* prefetch one row ahead on both sources */
        pld             [r1, r3]
        pld             [r2, r3]
        push            {r4-r9, lr}
        /* lr doubles as the row counter (8 rows) */
        mov             lr,  #8
 1:
        /* ldrd_post comes from asm.S (not shown here); presumably it loads
         * the register pair from the pointer and then advances it by r3 */
        ldrd_post       r4,  r5,  r1,  r3
        ldrd_post       r6,  r7,  r2,  r3
        /* first word: split s1 and s2 into even bytes (0,2) and odd bytes (1,3),
         * each zero-extended to halfwords */
        uxtb16          r8,  r4
        uxtb16          r4,  r4,  ror #8
        uxtb16          r9,  r6
        uxtb16          r6,  r6,  ror #8
        pld             [r1, r3]
        /* per-halfword subtract: r9 = {d0,d2}, r6 = {d1,d3} */
        ssub16          r9,  r8,  r9
        ssub16          r6,  r4,  r6
        /* second word of s1: even bytes (4,6) and odd bytes (5,7) */
        uxtb16          r8,  r5
        uxtb16          r5,  r5,  ror #8
        pld             [r2, r3]
        /* re-interleave: r4 = {d0,d1}, r6 = {d2,d3} */
        pkhbt           r4,  r9,  r6,  lsl #16
        pkhtb           r6,  r6,  r9,  asr #16
        /* second word of s2, then subtract: r9 = {d4,d6}, r5 = {d5,d7} */
        uxtb16          r9,  r7
        uxtb16          r7,  r7,  ror #8
        ssub16          r9,  r8,  r9
        ssub16          r5,  r5,  r7
        /* decrement the row counter; the bgt below tests this result */
        subs            lr,  lr,  #1
        /* re-interleave: r8 = {d4,d5}, r9 = {d6,d7} */
        pkhbt           r8,  r9,  r5,  lsl #16
        pkhtb           r9,  r5,  r9,  asr #16
        /* store the 8 differences and advance r0 by 16 bytes */
        stm             r0!, {r4,r6,r8,r9}
        bgt             1b

        /* restore callee-saved registers and return by popping into pc */
        pop             {r4-r9, pc}
 endfunc

 function ff_pix_abs16_armv6, export=1
        ldr             r0,  [sp]
        push            {r4-r9, lr}
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -26,10 +26,6 @@
 #include "libavcodec/mpegvideo.h"
 #include "dsputil_arm.h"

 void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
 void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
                          const uint8_t *s2, int stride);

 int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                       int line_size, int h);
 int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
@@ -46,10 +42,6 @@ int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
    if (!high_bit_depth)
        c->get_pixels = ff_get_pixels_armv6;
    c->diff_pixels = ff_diff_pixels_armv6;

    c->pix_abs[0][0] = ff_pix_abs16_armv6;
    c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
    c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -0,0 +1,76 @@
 /*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/arm/asm.S"

 /*
  * ff_get_pixels_armv6: widen an 8x8 block of bytes to 16-bit values.
  *
  * As the code uses them, r0 is the store pointer, r1 the load pointer and
  * r2 the per-row increment; this matches the (block, pixels, stride)
  * arguments of the C prototype.  Each of the 8 loop iterations reads
  * 8 source bytes and writes 8 halfwords (16 bytes) through r0.
  * The pixel ordering described below assumes little-endian byte order.
  */
 function ff_get_pixels_armv6, export=1
        /* prefetch at r1 + r2, i.e. one row ahead of the current pointer */
        pld             [r1, r2]
        push            {r4-r8, lr}
        /* lr doubles as the row counter (8 rows) */
        mov             lr,  #8
 1:
        /* ldrd_post comes from asm.S (not shown here); presumably it loads
         * r4/r5 from [r1] and then advances r1 by r2 */
        ldrd_post       r4,  r5,  r1,  r2
        /* decrement the row counter; nothing below touches the flags, so
         * the bgt at the end of the loop tests this result */
        subs            lr,  lr,  #1
        /* zero-extend bytes 0,2 of each word, then (after ror #8) bytes 1,3 */
        uxtb16          r6,  r4
        uxtb16          r4,  r4,  ror #8
        uxtb16          r12, r5
        uxtb16          r8,  r5,  ror #8
        pld             [r1, r2]
        /* re-interleave the halfwords into pixel order:
         * r5 = {p0,p1}, r6 = {p2,p3}, r7 = {p4,p5}, r12 = {p6,p7} */
        pkhbt           r5,  r6,  r4,  lsl #16
        pkhtb           r6,  r4,  r6,  asr #16
        pkhbt           r7,  r12, r8,  lsl #16
        pkhtb           r12, r8,  r12, asr #16
        /* store the 8 widened pixels and advance r0 by 16 bytes */
        stm             r0!, {r5,r6,r7,r12}
        bgt             1b

        /* restore callee-saved registers and return by popping into pc */
        pop             {r4-r8, pc}
 endfunc

 /*
  * ff_diff_pixels_armv6: 8x8 byte-wise difference stored as 16-bit values.
  *
  * As the code uses them, r0 is the store pointer, r1 and r2 are the two
  * load pointers and r3 the per-row increment applied to both; this matches
  * the (block, s1, s2, stride) arguments of the C prototype.  Each of the
  * 8 loop iterations writes 8 halfwords holding s1[x] - s2[x].
  * The pixel ordering described below assumes little-endian byte order.
  */
 function ff_diff_pixels_armv6, export=1
        /* prefetch one row ahead on both sources */
        pld             [r1, r3]
        pld             [r2, r3]
        push            {r4-r9, lr}
        /* lr doubles as the row counter (8 rows) */
        mov             lr,  #8
 1:
        /* ldrd_post comes from asm.S (not shown here); presumably it loads
         * the register pair from the pointer and then advances it by r3 */
        ldrd_post       r4,  r5,  r1,  r3
        ldrd_post       r6,  r7,  r2,  r3
        /* first word: split s1 and s2 into even bytes (0,2) and odd bytes (1,3),
         * each zero-extended to halfwords */
        uxtb16          r8,  r4
        uxtb16          r4,  r4,  ror #8
        uxtb16          r9,  r6
        uxtb16          r6,  r6,  ror #8
        pld             [r1, r3]
        /* per-halfword subtract: r9 = {d0,d2}, r6 = {d1,d3} */
        ssub16          r9,  r8,  r9
        ssub16          r6,  r4,  r6
        /* second word of s1: even bytes (4,6) and odd bytes (5,7) */
        uxtb16          r8,  r5
        uxtb16          r5,  r5,  ror #8
        pld             [r2, r3]
        /* re-interleave: r4 = {d0,d1}, r6 = {d2,d3} */
        pkhbt           r4,  r9,  r6,  lsl #16
        pkhtb           r6,  r6,  r9,  asr #16
        /* second word of s2, then subtract: r9 = {d4,d6}, r5 = {d5,d7} */
        uxtb16          r9,  r7
        uxtb16          r7,  r7,  ror #8
        ssub16          r9,  r8,  r9
        ssub16          r5,  r5,  r7
        /* decrement the row counter; the bgt below tests this result */
        subs            lr,  lr,  #1
        /* re-interleave: r8 = {d4,d5}, r9 = {d6,d7} */
        pkhbt           r8,  r9,  r5,  lsl #16
        pkhtb           r9,  r5,  r9,  asr #16
        /* store the 8 differences and advance r0 by 16 bytes */
        stm             r0!, {r4,r6,r8,r9}
        bgt             1b

        /* restore callee-saved registers and return by popping into pc */
        pop             {r4-r9, pc}
 endfunc
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -0,0 +1,42 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"

 void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
 void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
                          const uint8_t *s2, int stride);

 /**
  * Install the ARMv6 pixel-block routines into @p c when the running CPU
  * reports ARMv6 support; otherwise leave the context untouched.
  *
  * @param c              context whose function pointers may be overridden
  * @param avctx          codec context (not read by this function)
  * @param high_bit_depth non-zero when samples are wider than 8 bits; the
  *                       ARMv6 get_pixels is only installed when this is zero
  */
 av_cold void ff_pixblockdsp_init_arm(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
    const int flags = av_get_cpu_flags();

    if (!have_armv6(flags))
        return;

    /* diff_pixels is installed regardless of the bit depth */
    c->diff_pixels = ff_diff_pixels_armv6;

    if (!high_bit_depth)
        c->get_pixels = ff_get_pixels_armv6;
 }
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -33,19 +33,19 @@
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "bswapdsp.h"
 #include "dsputil.h"
 #include "fdctdsp.h"
 #include "idctdsp.h"
 #include "get_bits.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"

 typedef struct ASV1Context{
    AVCodecContext *avctx;
    BlockDSPContext bdsp;
    BswapDSPContext bbdsp;
    DSPContext dsp;
    FDCTDSPContext fdsp;
    IDCTDSPContext idsp;
    PixblockDSPContext pdsp;
    PutBitContext pb;
    GetBitContext gb;
    ScanTable scantable;
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -160,16 +160,16 @@ static inline void dct_get(ASV1Context *a, const AVFrame *frame,
    uint8_t *ptr_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8;
    uint8_t *ptr_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8;

    a->dsp.get_pixels(block[0], ptr_y                 , linesize);
    a->dsp.get_pixels(block[1], ptr_y              + 8, linesize);
    a->dsp.get_pixels(block[2], ptr_y + 8*linesize    , linesize);
    a->dsp.get_pixels(block[3], ptr_y + 8*linesize + 8, linesize);
    a->pdsp.get_pixels(block[0], ptr_y,                    linesize);
    a->pdsp.get_pixels(block[1], ptr_y + 8,                linesize);
    a->pdsp.get_pixels(block[2], ptr_y + 8 * linesize,     linesize);
    a->pdsp.get_pixels(block[3], ptr_y + 8 * linesize + 8, linesize);
    for(i=0; i<4; i++)
        a->fdsp.fdct(block[i]);

    if(!(a->avctx->flags&CODEC_FLAG_GRAY)){
        a->dsp.get_pixels(block[4], ptr_cb, frame->linesize[1]);
        a->dsp.get_pixels(block[5], ptr_cr, frame->linesize[2]);
        a->pdsp.get_pixels(block[4], ptr_cb, frame->linesize[1]);
        a->pdsp.get_pixels(block[5], ptr_cr, frame->linesize[2]);
        for(i=4; i<6; i++)
            a->fdsp.fdct(block[i]);
    }
@@ -282,8 +282,8 @@ static av_cold int encode_init(AVCodecContext *avctx){
    const int scale= avctx->codec_id == AV_CODEC_ID_ASV1 ? 1 : 2;

    ff_asv_common_init(avctx);
    ff_dsputil_init(&a->dsp, avctx);
    ff_fdctdsp_init(&a->fdsp, avctx);
    ff_pixblockdsp_init(&a->pdsp, avctx);

    if(avctx->global_quality <= 0) avctx->global_quality= 4*FF_QUALITY_SCALE;

--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -30,10 +30,10 @@

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "fdctdsp.h"
 #include "internal.h"
 #include "mpegvideo.h"
 #include "pixblockdsp.h"
 #include "dnxhdenc.h"


@@ -326,6 +326,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
    ff_fdctdsp_init(&ctx->m.fdsp, avctx);
    ff_idctdsp_init(&ctx->m.idsp, avctx);
    ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
    ff_pixblockdsp_init(&ctx->m.pdsp, avctx);
    ff_dct_common_init(&ctx->m);
    ff_dct_encode_init(&ctx->m);

@@ -561,12 +562,12 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    const uint8_t *ptr_v = ctx->thread[0]->src[2] +
                           ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    DSPContext *dsp = &ctx->m.dsp;
    PixblockDSPContext *pdsp = &ctx->m.pdsp;

    dsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);
    pdsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
    pdsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
    pdsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
    pdsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);

    if (mb_y + 1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
        if (ctx->interlaced) {
@@ -589,14 +590,14 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
            ctx->bdsp.clear_block(ctx->blocks[7]);
        }
    } else {
        dsp->get_pixels(ctx->blocks[4],
                        ptr_y + ctx->dct_y_offset, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[5],
                        ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[6],
                        ptr_u + ctx->dct_uv_offset, ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[7],
                        ptr_v + ctx->dct_uv_offset, ctx->m.uvlinesize);
        pdsp->get_pixels(ctx->blocks[4],
                         ptr_y + ctx->dct_y_offset, ctx->m.linesize);
        pdsp->get_pixels(ctx->blocks[5],
                         ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
        pdsp->get_pixels(ctx->blocks[6],
                         ptr_u + ctx->dct_uv_offset, ctx->m.uvlinesize);
        pdsp->get_pixels(ctx->blocks[7],
                         ptr_v + ctx->dct_uv_offset, ctx->m.uvlinesize);
    }
 }

--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -36,13 +36,6 @@

 uint32_t ff_square_tab[512] = { 0, };

 #define BIT_DEPTH 16
 #include "dsputilenc_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 8
 #include "dsputilenc_template.c"

 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h)
 {
@@ -111,27 +104,6 @@ static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }

 /**
  * Store the element-wise difference of two 8x8 byte blocks.
  *
  * @param block  destination, 64 contiguous int16_t values (8 per row)
  * @param s1     first source block (minuend)
  * @param s2     second source block (subtrahend)
  * @param stride distance in bytes between consecutive rows of s1 and of s2
  */
 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, int stride)
 {
    int row, col;

    for (row = 0; row < 8; row++) {
        /* one row: 8 differences, each in the range -255..255 */
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];

        block += 8;
        s1    += stride;
        s2    += stride;
    }
 }

 static int sum_abs_dctelem_c(int16_t *block)
 {
    int sum = 0, i;
@@ -586,7 +558,7 @@ static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->pdsp.diff_pixels(temp, src1, src2, stride);
    s->fdsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
 }
@@ -626,7 +598,7 @@ static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
    int16_t dct[8][8];
    int i, sum = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);
    s->pdsp.diff_pixels(dct[0], src1, src2, stride);

 #define SRC(x) dct[i][x]
 #define DST(x, v) dct[i][x] = v
@@ -653,7 +625,7 @@ static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->pdsp.diff_pixels(temp, src1, src2, stride);
    s->fdsp.fdct(temp);

    for (i = 0; i < 64; i++)
@@ -672,7 +644,7 @@ static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
    av_assert2(h == 8);
    s->mb_intra = 0;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->pdsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64 * sizeof(int16_t));

@@ -703,7 +675,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0 /* FIXME */] =
    last                               =
@@ -775,7 +747,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->pdsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0 /* FIXME */] =
    last                               =
@@ -971,8 +943,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)

    ff_check_alignment();

    c->diff_pixels = diff_pixels_c;

    c->sum_abs_dctelem = sum_abs_dctelem_c;

    /* TODO [0] 16  [1] 8 */
@@ -1019,21 +989,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    ff_dsputil_init_dwt(c);
 #endif

    switch (avctx->bits_per_raw_sample) {
    case 9:
    case 10:
    case 12:
    case 14:
        c->get_pixels = get_pixels_16_c;
        break;
    default:
        if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
            c->get_pixels = get_pixels_8_c;
        }
        break;
    }


    if (ARCH_ALPHA)
        ff_dsputil_init_alpha(c, avctx);
    if (ARCH_ARM)
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -62,14 +62,6 @@ typedef int (*me_cmp_func)(struct MpegEncContext *c,
 * DSPContext.
 */
 typedef struct DSPContext {
    /* pixel ops : interface with DCT */
    void (*get_pixels)(int16_t *block /* align 16 */,
                       const uint8_t *pixels /* align 8 */,
                       int line_size);
    void (*diff_pixels)(int16_t *block /* align 16 */,
                        const uint8_t *s1 /* align 8 */,
                        const uint8_t *s2 /* align 8 */,
                        int stride);
    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);

    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
--- a/libavcodec/dvenc.c
+++ b/libavcodec/dvenc.c
@@ -31,6 +31,7 @@
 #include "dsputil.h"
 #include "fdctdsp.h"
 #include "internal.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
 #include "dv.h"
 #include "dv_tablegen.h"
@@ -41,6 +42,7 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
    DVVideoContext *s = avctx->priv_data;
    DSPContext dsp;
    FDCTDSPContext fdsp;
    PixblockDSPContext pdsp;
    int ret;

    s->sys = avpriv_dv_codec_profile(avctx);
@@ -70,9 +72,10 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
    memset(&dsp,0, sizeof(dsp));
    ff_dsputil_init(&dsp, avctx);
    ff_fdctdsp_init(&fdsp, avctx);
    ff_pixblockdsp_init(&pdsp, avctx);
    ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp);

    s->get_pixels = dsp.get_pixels;
    s->get_pixels = pdsp.get_pixels;
    s->ildct_cmp  = dsp.ildct_cmp[5];

    s->fdct[0]    = fdsp.fdct;
--- a/libavcodec/libavcodec.v
+++ b/libavcodec/libavcodec.v
@@ -29,5 +29,6 @@ LIBAVCODEC_$MAJOR {
                ff_dnxhd_cid_table;
                ff_idctdsp_init;
                ff_fdctdsp_init;
                ff_pixblockdsp_init;
        local:  *;
 };
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -40,6 +40,7 @@
 #include "idctdsp.h"
 #include "mpegvideodsp.h"
 #include "mpegvideoencdsp.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
 #include "ratecontrol.h"
 #include "parser.h"
@@ -371,6 +372,7 @@ typedef struct MpegEncContext {
    IDCTDSPContext idsp;
    MpegVideoDSPContext mdsp;
    MpegvideoEncDSPContext mpvencdsp;
    PixblockDSPContext pdsp;
    QpelDSPContext qdsp;
    VideoDSPContext vdsp;
    H263DSPContext h263dsp;
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -37,7 +37,6 @@
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "dct.h"
 #include "dsputil.h"
 #include "idctdsp.h"
 #include "mpeg12.h"
 #include "mpegvideo.h"
@@ -48,6 +47,7 @@
 #include "mpegutils.h"
 #include "mjpegenc.h"
 #include "msmpeg4.h"
 #include "pixblockdsp.h"
 #include "qpeldsp.h"
 #include "faandct.h"
 #include "thread.h"
@@ -820,6 +820,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)

    ff_fdctdsp_init(&s->fdsp, avctx);
    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
    ff_pixblockdsp_init(&s->pdsp, avctx);
    ff_qpeldsp_init(&s->qdsp);

    s->avctx->coded_frame = s->current_picture.f;
@@ -2093,27 +2094,27 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
            }
        }

        s->dsp.get_pixels(s->block[0], ptr_y                  , wrap_y);
        s->dsp.get_pixels(s->block[1], ptr_y              + 8 , wrap_y);
        s->dsp.get_pixels(s->block[2], ptr_y + dct_offset     , wrap_y);
        s->dsp.get_pixels(s->block[3], ptr_y + dct_offset + 8 , wrap_y);
        s->pdsp.get_pixels(s->block[0], ptr_y,                  wrap_y);
        s->pdsp.get_pixels(s->block[1], ptr_y + 8,              wrap_y);
        s->pdsp.get_pixels(s->block[2], ptr_y + dct_offset,     wrap_y);
        s->pdsp.get_pixels(s->block[3], ptr_y + dct_offset + 8, wrap_y);

        if (s->flags & CODEC_FLAG_GRAY) {
            skip_dct[4] = 1;
            skip_dct[5] = 1;
        } else {
            s->dsp.get_pixels(s->block[4], ptr_cb, wrap_c);
            s->dsp.get_pixels(s->block[5], ptr_cr, wrap_c);
            s->pdsp.get_pixels(s->block[4], ptr_cb, wrap_c);
            s->pdsp.get_pixels(s->block[5], ptr_cr, wrap_c);
            if (!s->chroma_y_shift && s->chroma_x_shift) { /* 422 */
                s->dsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
                s->dsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
                s->pdsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
                s->pdsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
            } else if (!s->chroma_y_shift && !s->chroma_x_shift) { /* 444 */
                s->dsp.get_pixels(s->block[6], ptr_cb + 8, wrap_c);
                s->dsp.get_pixels(s->block[7], ptr_cr + 8, wrap_c);
                s->dsp.get_pixels(s->block[8], ptr_cb + uv_dct_offset, wrap_c);
                s->dsp.get_pixels(s->block[9], ptr_cr + uv_dct_offset, wrap_c);
                s->dsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
                s->dsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
                s->pdsp.get_pixels(s->block[ 6], ptr_cb + 8, wrap_c);
                s->pdsp.get_pixels(s->block[ 7], ptr_cr + 8, wrap_c);
                s->pdsp.get_pixels(s->block[ 8], ptr_cb + uv_dct_offset, wrap_c);
                s->pdsp.get_pixels(s->block[ 9], ptr_cr + uv_dct_offset, wrap_c);
                s->pdsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
                s->pdsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
            }
        }
    } else {
@@ -2180,24 +2181,24 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
            }
        }

        s->dsp.diff_pixels(s->block[0], ptr_y, dest_y, wrap_y);
        s->dsp.diff_pixels(s->block[1], ptr_y + 8, dest_y + 8, wrap_y);
        s->dsp.diff_pixels(s->block[2], ptr_y + dct_offset,
                           dest_y + dct_offset, wrap_y);
        s->dsp.diff_pixels(s->block[3], ptr_y + dct_offset + 8,
                           dest_y + dct_offset + 8, wrap_y);
        s->pdsp.diff_pixels(s->block[0], ptr_y, dest_y, wrap_y);
        s->pdsp.diff_pixels(s->block[1], ptr_y + 8, dest_y + 8, wrap_y);
        s->pdsp.diff_pixels(s->block[2], ptr_y + dct_offset,
                            dest_y + dct_offset, wrap_y);
        s->pdsp.diff_pixels(s->block[3], ptr_y + dct_offset + 8,
                            dest_y + dct_offset + 8, wrap_y);

        if (s->flags & CODEC_FLAG_GRAY) {
            skip_dct[4] = 1;
            skip_dct[5] = 1;
        } else {
            s->dsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
            s->dsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
            s->pdsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
            s->pdsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
            if (!s->chroma_y_shift) { /* 422 */
                s->dsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
                                   dest_cb + uv_dct_offset, wrap_c);
                s->dsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
                                   dest_cr + uv_dct_offset, wrap_c);
                s->pdsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
                                    dest_cb + uv_dct_offset, wrap_c);
                s->pdsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
                                    dest_cr + uv_dct_offset, wrap_c);
            }
        }
        /* pre quantization */
--- a/libavcodec/pixblockdsp.c
+++ b/libavcodec/pixblockdsp.c
@@ -0,0 +1,80 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "avcodec.h"
 #include "pixblockdsp.h"

 #define BIT_DEPTH 16
 #include "pixblockdsp_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 8
 #include "pixblockdsp_template.c"

 /**
  * Store the element-wise difference of two 8x8 byte blocks.
  *
  * @param block  destination, 64 contiguous int16_t values (8 per row)
  * @param s1     first source block (minuend)
  * @param s2     second source block (subtrahend)
  * @param stride distance in bytes between consecutive rows of s1 and of s2
  */
 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, int stride)
 {
    int row, col;

    for (row = 0; row < 8; row++) {
        /* one row: 8 differences, each in the range -255..255 */
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];

        block += 8;
        s1    += stride;
        s2    += stride;
    }
 }

 /**
  * Populate a PixblockDSPContext with the C reference routines, then give
  * the per-architecture initializers a chance to override them.
  *
  * get_pixels is chosen from avctx->bits_per_raw_sample: the 16-bit variant
  * for depths 9, 10, 12 and 14; the 8-bit variant when the depth is at most
  * 8 or the codec is not a video codec.  For any other combination
  * get_pixels is not assigned by this function.
  *
  * @param c     context to fill in
  * @param avctx codec context supplying bits_per_raw_sample and codec_type
  */
 av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx)
 {
    const int depth = avctx->bits_per_raw_sample;
    const unsigned high_bit_depth = depth > 8;

    if (depth == 9 || depth == 10 || depth == 12 || depth == 14)
        c->get_pixels = get_pixels_16_c;
    else if (depth <= 8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO)
        c->get_pixels = get_pixels_8_c;
    /* otherwise: get_pixels is intentionally left as the caller provided it */

    c->diff_pixels = diff_pixels_c;

    /* ARCH_* are compile-time constants; only the matching call survives */
    if (ARCH_ARM)
        ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
    if (ARCH_PPC)
        ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
    if (ARCH_X86)
        ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
 }
--- a/libavcodec/pixblockdsp.h
+++ b/libavcodec/pixblockdsp.h
@@ -0,0 +1,44 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_PIXBLOCKDSP_H
 #define AVCODEC_PIXBLOCKDSP_H

 #include <stdint.h>

 #include "avcodec.h"

 /**
  * Function table for the pixel-block routines that feed a forward DCT.
  * The pointers are filled in by ff_pixblockdsp_init(), which may replace
  * the C versions with architecture-specific ones.
  */
 typedef struct PixblockDSPContext {
    /**
     * Read a block of pixels into 16-bit coefficients.
     * NOTE(review): the ARMv6 version reads 8 rows of 8 bytes and writes
     * 64 int16_t values; the high-bit-depth variant presumably reads
     * 16-bit samples instead - confirm against pixblockdsp_template.c.
     */
    void (*get_pixels)(int16_t *block /* align 16 */,
                       const uint8_t *pixels /* align 8 */,
                       int line_size);
    /**
     * Store s1[x] - s2[x] for an 8x8 block into 64 int16_t values.
     * Both sources advance by stride bytes after each row of 8 pixels.
     */
    void (*diff_pixels)(int16_t *block /* align 16 */,
                        const uint8_t *s1 /* align 8 */,
                        const uint8_t *s2 /* align 8 */,
                        int stride);
 } PixblockDSPContext;

 void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx);
 void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);
 void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);
 void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);

 #endif /* AVCODEC_PIXBLOCKDSP_H */
--- a/libavcodec/pixblockdsp_template.c
+++ b/libavcodec/pixblockdsp_template.c
@@ -1,10 +1,4 @@
 /*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
@@ -22,11 +16,6 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 /**
 * @file
 * DSP utils
 */

 #include "bit_depth_template.c"

 static void FUNCC(get_pixels)(int16_t *av_restrict block, const uint8_t *_pixels,
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -15,6 +15,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
                                          ppc/mpegvideodsp.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += ppc/mpegvideoencdsp.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += ppc/pixblockdsp.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o

--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -402,105 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    return s;
 }

 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               int line_size)
 {
    int i;
    vector unsigned char perm = vec_lvsl(0, pixels);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl = vec_ld(0, pixels);
        vector unsigned char pixr = vec_ld(7, pixels);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        vector signed short shorts = (vector signed short) vec_mergeh(zero,
                                                                      bytes);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vector signed short *) block);

        pixels += line_size;
    }
 }

 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
 {
    int i;
    vector unsigned char perm1 = vec_lvsl(0, s1);
    vector unsigned char perm2 = vec_lvsl(0, s2);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl  = vec_ld(0,  s1);
        vector unsigned char pixr  = vec_ld(15, s1);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
 }

 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, int stride, int h)
 {
@@ -854,12 +755,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->diff_pixels = diff_pixels_altivec;

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
 }
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -0,0 +1,153 @@
 /*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "config.h"
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"

 #if HAVE_ALTIVEC

 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               int line_size)
 {
    int i;
    vector unsigned char perm = vec_lvsl(0, pixels);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl = vec_ld(0, pixels);
        vector unsigned char pixr = vec_ld(7, pixels);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        vector signed short shorts = (vector signed short) vec_mergeh(zero,
                                                                      bytes);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vector signed short *) block);

        pixels += line_size;
    }
 }

 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
 {
    int i;
    vector unsigned char perm1 = vec_lvsl(0, s1);
    vector unsigned char perm2 = vec_lvsl(0, s2);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl  = vec_ld(0,  s1);
        vector unsigned char pixr  = vec_ld(15, s1);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
 }

 #endif /* HAVE_ALTIVEC */

 /**
 * Install the AltiVec pixel-block routines into c.
 *
 * Nothing is changed when the build lacks AltiVec support or the running
 * CPU does not report it. get_pixels is only replaced for the
 * non-high-bit-depth case; diff_pixels is replaced in either case.
 *
 * @param c              context whose function pointers are updated
 * @param avctx          not used by this implementation
 * @param high_bit_depth non-zero to leave get_pixels untouched
 */
 av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
 #if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        if (!high_bit_depth)
            c->get_pixels = get_pixels_altivec;

        c->diff_pixels = diff_pixels_altivec;
    }
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -31,6 +31,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
                                          x86/mpegvideodsp.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += x86/mpegvideoenc.o           \
                                          x86/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
 OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
@@ -110,6 +111,7 @@ YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
 YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                          x86/fpel.o                    \
                                          x86/qpel.o
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -352,115 +352,6 @@ SUM_SQUARED_ERRORS 16
 INIT_XMM sse2
 SUM_SQUARED_ERRORS 16

 INIT_MMX mmx
 ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
 ; Zero-extends an 8x8 block of bytes to 16-bit words. Each pass of the loop
 ; handles two 8-byte source rows and writes 32 bytes of output; r3 runs from
 ; -128 to 0 in steps of 32, i.e. four passes.
 cglobal get_pixels, 3,4
    ; widen line_size to pointer size where the ABI requires it
    movsxdifnidn r2, r2d
    ; r0 points one past the 128-byte output block, r3 is the negative offset
    add          r0, 128
    mov          r3, -128
    ; m7 = 0, supplies the high byte of every widened word
    pxor         m7, m7
 .loop:
    ; load row n and row n+1
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    ; split each row into its low and high four bytes, widened to words
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    ; advance two source rows
    lea          r1, [r1+r2*2]
    add          r3, 32
    ; keep looping while the output offset is still negative
    js .loop
    REP_RET

 INIT_XMM sse2
 ; SSE2 version of get_pixels (same arguments as the MMX routine above):
 ; fully unrolled, converts rows 0-3 and then rows 4-7, one 8-byte row per
 ; register, and stores each widened row as 16 bytes.
 cglobal get_pixels, 3, 4, 5
    movsxdifnidn r2, r2d
    ; r3 = 3 * line_size, used to address the fourth row of each group
    lea          r3, [r2*3]
    ; m4 = 0, supplies the high byte of every widened word
    pxor         m4, m4
    ; rows 0-3
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    ; move the source pointer down four rows
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    ; rows 4-7
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET

 INIT_MMX mmx
 ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 ;                         int stride);
 ; Writes the 16-bit difference s1 - s2 of two 8x8 byte blocks to block.
 ; One row (16 output bytes) per pass; r4 runs from -128 to 0 in steps of 16,
 ; i.e. eight passes.
 cglobal diff_pixels, 4,5
    ; widen stride to pointer size where the ABI requires it
    movsxdifnidn r3, r3d
    ; m7 = 0, supplies the high byte of every widened word
    pxor         m7, m7
    ; r0 points one past the 128-byte output block, r4 is the negative offset
    add          r0,  128
    mov          r4, -128
 .loop:
    ; load one 8-byte row from each source
    mova         m0, [r1]
    mova         m2, [r2]
    mova         m1, m0
    mova         m3, m2
    ; widen low/high halves of both rows to words
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    ; s1 - s2, word-wise
    psubw        m0, m2
    psubw        m1, m3
    mova  [r0+r4+0], m0
    mova  [r0+r4+8], m1
    ; next row in both sources
    add          r1, r3
    add          r2, r3
    add          r4, 16
    ; loop until the output offset reaches zero
    jne .loop
    REP_RET

 INIT_XMM sse2
 ; SSE2 version of diff_pixels (same arguments as the MMX routine above).
 ; Two rows (32 output bytes) per pass; r4 runs from -128 to 0 in steps of 32,
 ; i.e. four passes.
 cglobal diff_pixels, 4, 5, 5
    movsxdifnidn r3, r3d
    ; m4 = 0, supplies the high byte of every widened word
    pxor         m4, m4
    ; r0 points one past the 128-byte output block, r4 is the negative offset
    add          r0,  128
    mov          r4, -128
 .loop:
    ; m0/m1 = rows n and n+1 of s1, m2/m3 = rows n and n+1 of s2
    movh         m0, [r1]
    movh         m2, [r2]
    movh         m1, [r1+r3]
    movh         m3, [r2+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    ; s1 - s2, word-wise
    psubw        m0, m2
    psubw        m1, m3
    mova [r0+r4+0 ], m0
    mova [r0+r4+16], m1
    ; advance both sources by two rows
    lea          r1, [r1+r3*2]
    lea          r2, [r2+r3*2]
    add          r4, 32
    ; loop until the output offset reaches zero
    jne .loop
    RET

 ;-----------------------------------------------
 ;int ff_sum_abs_dctelem(int16_t *block)
 ;-----------------------------------------------
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,12 +30,6 @@
 #include "libavcodec/mpegvideo.h"
 #include "dsputil_x86.h"

 void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
 int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
@@ -353,16 +347,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
 {
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

 #if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmx;
@@ -410,7 +394,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
        c->diff_pixels = ff_diff_pixels_sse2;

 #if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -0,0 +1,135 @@
 ;*****************************************************************************
 ;* SIMD-optimized pixel operations
 ;*****************************************************************************
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION .text

 INIT_MMX mmx
 ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
 ; Zero-extends an 8x8 block of bytes to 16-bit words. Each pass of the loop
 ; handles two 8-byte source rows and writes 32 bytes of output; r3 runs from
 ; -128 to 0 in steps of 32, i.e. four passes.
 cglobal get_pixels, 3,4
    ; widen line_size to pointer size where the ABI requires it
    movsxdifnidn r2, r2d
    ; r0 points one past the 128-byte output block, r3 is the negative offset
    add          r0, 128
    mov          r3, -128
    ; m7 = 0, supplies the high byte of every widened word
    pxor         m7, m7
 .loop:
    ; load row n and row n+1
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    ; split each row into its low and high four bytes, widened to words
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    ; advance two source rows
    lea          r1, [r1+r2*2]
    add          r3, 32
    ; keep looping while the output offset is still negative
    js .loop
    REP_RET

 INIT_XMM sse2
 ; SSE2 version of get_pixels (same arguments as the MMX routine above):
 ; fully unrolled, converts rows 0-3 and then rows 4-7, one 8-byte row per
 ; register, and stores each widened row as 16 bytes.
 cglobal get_pixels, 3, 4, 5
    movsxdifnidn r2, r2d
    ; r3 = 3 * line_size, used to address the fourth row of each group
    lea          r3, [r2*3]
    ; m4 = 0, supplies the high byte of every widened word
    pxor         m4, m4
    ; rows 0-3
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    ; move the source pointer down four rows
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    ; rows 4-7
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET

 INIT_MMX mmx
 ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 ;                         int stride);
 ; Writes the 16-bit difference s1 - s2 of two 8x8 byte blocks to block.
 ; One row (16 output bytes) per pass; r4 runs from -128 to 0 in steps of 16,
 ; i.e. eight passes.
 cglobal diff_pixels, 4,5
    ; widen stride to pointer size where the ABI requires it
    movsxdifnidn r3, r3d
    ; m7 = 0, supplies the high byte of every widened word
    pxor         m7, m7
    ; r0 points one past the 128-byte output block, r4 is the negative offset
    add          r0,  128
    mov          r4, -128
 .loop:
    ; load one 8-byte row from each source
    mova         m0, [r1]
    mova         m2, [r2]
    mova         m1, m0
    mova         m3, m2
    ; widen low/high halves of both rows to words
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    ; s1 - s2, word-wise
    psubw        m0, m2
    psubw        m1, m3
    mova  [r0+r4+0], m0
    mova  [r0+r4+8], m1
    ; next row in both sources
    add          r1, r3
    add          r2, r3
    add          r4, 16
    ; loop until the output offset reaches zero
    jne .loop
    REP_RET

 INIT_XMM sse2
 ; SSE2 version of diff_pixels (same arguments as the MMX routine above).
 ; Two rows (32 output bytes) per pass; r4 runs from -128 to 0 in steps of 32,
 ; i.e. four passes.
 cglobal diff_pixels, 4, 5, 5
    movsxdifnidn r3, r3d
    ; m4 = 0, supplies the high byte of every widened word
    pxor         m4, m4
    ; r0 points one past the 128-byte output block, r4 is the negative offset
    add          r0,  128
    mov          r4, -128
 .loop:
    ; m0/m1 = rows n and n+1 of s1, m2/m3 = rows n and n+1 of s2
    movh         m0, [r1]
    movh         m2, [r2]
    movh         m1, [r1+r3]
    movh         m3, [r2+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    ; s1 - s2, word-wise
    psubw        m0, m2
    psubw        m1, m3
    mova [r0+r4+0 ], m0
    mova [r0+r4+16], m1
    ; advance both sources by two rows
    lea          r1, [r1+r3*2]
    lea          r2, [r2+r3*2]
    add          r4, 32
    ; loop until the output offset reaches zero
    jne .loop
    RET
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -0,0 +1,50 @@
 /*
 * SIMD-optimized pixel operations
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/pixblockdsp.h"

 /* Assembly implementations of the PixblockDSPContext entries, in MMX and
 * SSE2 variants. get_pixels widens an 8x8 block of bytes to int16_t;
 * diff_pixels stores the 16-bit difference s1 - s2 of two 8x8 byte blocks. */
 void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);

 /**
 * Install the x86 assembly pixel-block routines supported by the running CPU.
 *
 * The MMX versions are applied first and the SSE2 versions afterwards, so
 * SSE2 wins when both are available. get_pixels is only replaced for the
 * non-high-bit-depth case; diff_pixels is replaced in either case.
 *
 * @param c              context whose function pointers are updated
 * @param avctx          not used by this implementation
 * @param high_bit_depth non-zero to leave get_pixels untouched
 */
 av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
    int flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(flags)) {
        c->diff_pixels = ff_diff_pixels_mmx;
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
    }

    if (EXTERNAL_SSE2(flags)) {
        c->diff_pixels = ff_diff_pixels_sse2;
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;
    }
 }
--- a/libavfilter/vf_mpdecimate.c
+++ b/libavfilter/vf_mpdecimate.c
@@ -28,6 +28,7 @@
 #include "libavutil/pixdesc.h"
 #include "libavutil/timestamp.h"
 #include "libavcodec/dsputil.h"
 #include "libavcodec/pixblockdsp.h"
 #include "avfilter.h"
 #include "internal.h"
 #include "formats.h"
@@ -49,6 +50,7 @@ typedef struct {
    int hsub, vsub;                ///< chroma subsampling values
    AVFrame *ref;                  ///< reference picture
    DSPContext dspctx;             ///< context providing optimized diff routines
    PixblockDSPContext pdsp;
    AVCodecContext *avctx;         ///< codec context required for the DSPContext
 } DecimateContext;

@@ -75,6 +77,7 @@ static int diff_planes(AVFilterContext *ctx,
 {
    DecimateContext *decimate = ctx->priv;
    DSPContext *dspctx = &decimate->dspctx;
    PixblockDSPContext *pdsp = &decimate->pdsp;

    int x, y;
    int d, c = 0;
@@ -84,7 +87,7 @@ static int diff_planes(AVFilterContext *ctx,
    /* compute difference for blocks of 8x8 bytes */
    for (y = 0; y < h-7; y += 4) {
        for (x = 8; x < w-7; x += 4) {
            dspctx->diff_pixels(block,
            pdsp->diff_pixels(block,
                                cur+x+y*linesize,
                                ref+x+y*linesize, linesize);
            d = dspctx->sum_abs_dctelem(block);
@@ -141,6 +144,7 @@ static av_cold int init(AVFilterContext *ctx)
    if (!decimate->avctx)
        return AVERROR(ENOMEM);
    avpriv_dsputil_init(&decimate->dspctx, decimate->avctx);
    ff_pixblockdsp_init(&decimate->pdsp, decimate->avctx);

    return 0;
 }
--- a/libavfilter/vf_spp.c
+++ b/libavfilter/vf_spp.c
@@ -232,7 +232,7 @@ static void filter(SPPContext *p, uint8_t *dst, uint8_t *src,
                const int x1 = x + offset[i + count - 1][0];
                const int y1 = y + offset[i + count - 1][1];
                const int index = x1 + y1*linesize;
                p->dsp.get_pixels(block, p->src + index, linesize);
                p->pdsp.get_pixels(block, p->src + index, linesize);
                p->fdsp.fdct(block);
                p->requantize(block2, block, qp, p->idsp.idct_permutation);
                p->idsp.idct(block2);
@@ -380,9 +380,9 @@ static av_cold int init(AVFilterContext *ctx)
    spp->avctx = avcodec_alloc_context3(NULL);
    if (!spp->avctx)
        return AVERROR(ENOMEM);
    avpriv_dsputil_init(&spp->dsp, spp->avctx);
    ff_idctdsp_init(&spp->idsp, spp->avctx);
    ff_fdctdsp_init(&spp->fdsp, spp->avctx);
    ff_pixblockdsp_init(&spp->pdsp, spp->avctx);
    spp->store_slice = store_slice_c;
    switch (spp->mode) {
    case MODE_HARD: spp->requantize = hardthresh_c; break;
--- a/libavfilter/vf_spp.h
+++ b/libavfilter/vf_spp.h
@@ -23,7 +23,7 @@
 #define AVFILTER_SPP_H

 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 #include "libavcodec/pixblockdsp.h"
 #include "libavcodec/idctdsp.h"
 #include "libavcodec/fdctdsp.h"
 #include "avfilter.h"
@@ -41,9 +41,9 @@ typedef struct {
    uint8_t *src;
    int16_t *temp;
    AVCodecContext *avctx;
    DSPContext dsp;
    IDCTDSPContext idsp;
    FDCTDSPContext fdsp;
    PixblockDSPContext pdsp;
    int8_t *non_b_qp_table;
    int non_b_qp_alloc_size;
    int use_bframe_qp;