Merge commit 'e74433a8e6fc00c8dbde293c97a3e45384c2c1d9'

* commit 'e74433a8e6fc00c8dbde293c97a3e45384c2c1d9': dsputil: Split clear_block*/fill_block* off into a separate context Conflicts: configure libavcodec/asvdec.c libavcodec/dnxhddec.c libavcodec/dnxhdenc.c libavcodec/dsputil.h libavcodec/eamad.c libavcodec/intrax8.c libavcodec/mjpegdec.c libavcodec/ppc/dsputil_ppc.c libavcodec/vc1dec.c libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · 2b05db4f81
--- a/configure
+++ b/configure
@@ -1796,6 +1796,7 @@ CONFIG_EXTRA="
    aandcttables
    ac3dsp
    audio_frame_queue
    blockdsp
    cabac
    dsputil
    exif
@@ -1995,7 +1996,7 @@ mdct_select="fft"
 rdft_select="fft"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
 mpegvideo_select="dsputil h264chroma hpeldsp videodsp"
 mpegvideo_select="blockdsp dsputil h264chroma hpeldsp videodsp"
 mpegvideoenc_select="dsputil mpegvideo qpeldsp"

 # decoders / encoders
@@ -2014,18 +2015,18 @@ amrwb_decoder_select="lsp"
 amv_decoder_select="sp5x_decoder exif"
 amv_encoder_select="aandcttables"
 ape_decoder_select="dsputil llauddsp"
 asv1_decoder_select="dsputil"
 asv1_decoder_select="blockdsp dsputil"
 asv1_encoder_select="dsputil"
 asv2_decoder_select="dsputil"
 asv2_decoder_select="blockdsp dsputil"
 asv2_encoder_select="dsputil"
 atrac1_decoder_select="mdct sinewin"
 atrac3_decoder_select="mdct"
 atrac3p_decoder_select="mdct sinewin"
 avrn_decoder_select="exif"
 bink_decoder_select="dsputil hpeldsp"
 bink_decoder_select="blockdsp hpeldsp"
 binkaudio_dct_decoder_select="mdct rdft dct sinewin"
 binkaudio_rdft_decoder_select="mdct rdft sinewin"
 cavs_decoder_select="dsputil golomb h264chroma qpeldsp videodsp"
 cavs_decoder_select="blockdsp dsputil golomb h264chroma qpeldsp videodsp"
 cllc_decoder_select="dsputil"
 comfortnoise_encoder_select="lpc"
 cook_decoder_select="dsputil mdct sinewin"
@@ -2033,16 +2034,16 @@ cscd_decoder_select="lzo"
 cscd_decoder_suggest="zlib"
 dca_decoder_select="mdct"
 dirac_decoder_select="dsputil dwt golomb videodsp"
 dnxhd_decoder_select="dsputil"
 dnxhd_encoder_select="aandcttables dsputil mpegvideoenc"
 dnxhd_decoder_select="blockdsp dsputil"
 dnxhd_encoder_select="aandcttables blockdsp dsputil mpegvideoenc"
 dvvideo_decoder_select="dsputil"
 dvvideo_encoder_select="dsputil"
 dxa_decoder_select="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
 eamad_decoder_select="aandcttables dsputil mpegvideo"
 eamad_decoder_select="aandcttables blockdsp dsputil mpegvideo"
 eatgq_decoder_select="aandcttables"
 eatqi_decoder_select="aandcttables dsputil error_resilience mpegvideo"
 eatqi_decoder_select="aandcttables blockdsp dsputil error_resilience mpegvideo"
 exr_decoder_select="zlib"
 ffv1_decoder_select="golomb rangecoder"
 ffv1_encoder_select="rangecoder"
@@ -2057,9 +2058,9 @@ flashsv2_encoder_select="zlib"
 flashsv2_decoder_select="zlib"
 flv_decoder_select="h263_decoder"
 flv_encoder_select="h263_encoder"
 fourxm_decoder_select="dsputil"
 fourxm_decoder_select="blockdsp dsputil"
 fraps_decoder_select="dsputil huffman"
 g2m_decoder_select="dsputil zlib"
 g2m_decoder_select="blockdsp dsputil zlib"
 g729_decoder_select="dsputil"
 h261_decoder_select="error_resilience mpegvideo"
 h261_encoder_select="aandcttables mpegvideoenc"
@@ -2078,14 +2079,14 @@ indeo3_decoder_select="hpeldsp"
 interplay_video_decoder_select="hpeldsp"
 jpegls_decoder_select="golomb mjpeg_decoder"
 jpegls_encoder_select="golomb"
 jv_decoder_select="dsputil"
 jv_decoder_select="blockdsp"
 lagarith_decoder_select="huffyuvdsp"
 ljpeg_encoder_select="aandcttables mpegvideoenc"
 loco_decoder_select="golomb"
 mdec_decoder_select="dsputil error_resilience mpegvideo"
 mdec_decoder_select="blockdsp dsputil error_resilience mpegvideo"
 metasound_decoder_select="lsp mdct sinewin"
 mimic_decoder_select="dsputil hpeldsp"
 mjpeg_decoder_select="dsputil hpeldsp exif"
 mimic_decoder_select="blockdsp dsputil hpeldsp"
 mjpeg_decoder_select="blockdsp dsputil hpeldsp exif"
 mjpeg_encoder_select="aandcttables mpegvideoenc"
 mjpegb_decoder_select="mjpeg_decoder"
 mlp_decoder_select="mlp_parser"
@@ -2124,7 +2125,7 @@ on2avc_decoder_select="mdct"
 opus_decoder_deps="swresample"
 png_decoder_select="zlib"
 png_encoder_select="huffyuvencdsp zlib"
 prores_decoder_select="dsputil"
 prores_decoder_select="blockdsp dsputil"
 prores_encoder_select="dsputil"
 qcelp_decoder_select="lsp"
 qdm2_decoder_select="mdct rdft mpegaudiodsp"
@@ -2163,7 +2164,7 @@ twinvq_decoder_select="mdct lsp sinewin"
 utvideo_decoder_select="dsputil"
 utvideo_encoder_select="dsputil huffman huffyuvencdsp"
 vble_decoder_select="huffyuvdsp"
 vc1_decoder_select="error_resilience h263_decoder h264chroma h264qpel intrax8 qpeldsp"
 vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel intrax8 qpeldsp"
 vc1image_decoder_select="vc1_decoder"
 vorbis_decoder_select="mdct"
 vorbis_encoder_select="mdct"
@@ -2185,7 +2186,7 @@ wmav2_encoder_select="mdct sinewin"
 wmavoice_decoder_select="lsp rdft dct mdct sinewin"
 wmv1_decoder_select="h263_decoder"
 wmv1_encoder_select="h263_encoder"
 wmv2_decoder_select="h263_decoder intrax8 videodsp"
 wmv2_decoder_select="blockdsp h263_decoder intrax8 videodsp"
 wmv2_encoder_select="h263_encoder"
 wmv3_decoder_select="vc1_decoder"
 wmv3image_decoder_select="wmv3_decoder"
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -31,6 +31,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "bytestream.h"
 #include "dsputil.h"
 #include "get_bits.h"
@@ -134,6 +135,7 @@ typedef struct CFrameBuffer {
 typedef struct FourXContext {
    AVCodecContext *avctx;
    DSPContext dsp;
    BlockDSPContext bdsp;
    uint16_t *frame_buffer;
    uint16_t *last_frame_buffer;
    GetBitContext pre_gb;          ///< ac/dc prefix
@@ -592,7 +594,7 @@ static int decode_i_mb(FourXContext *f)
    int ret;
    int i;

    f->dsp.clear_blocks(f->block[0]);
    f->bdsp.clear_blocks(f->block[0]);

    for (i = 0; i < 6; i++)
        if ((ret = decode_i_block(f, f->block[i])) < 0)
@@ -998,6 +1000,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    }

    f->version = AV_RL32(avctx->extradata) >> 16;
    ff_blockdsp_init(&f->bdsp, avctx);
    ff_dsputil_init(&f->dsp, avctx);
    f->avctx = avctx;
    init_vlcs(f);
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -33,6 +33,7 @@ OBJS = allcodecs.o                                                      \
 OBJS-$(CONFIG_AANDCTTABLES)            += aandcttab.o
 OBJS-$(CONFIG_AC3DSP)                  += ac3dsp.o
 OBJS-$(CONFIG_AUDIO_FRAME_QUEUE)       += audio_frame_queue.o
 OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -4,6 +4,7 @@ OBJS                                   += arm/fmtconvert_init_arm.o

 OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                          arm/ac3dsp_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
 OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o        \
                                          arm/dsputil_arm.o             \
                                          arm/jrevdct_arm.o             \
@@ -79,6 +80,8 @@ VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
 NEON-OBJS                              += arm/fmtconvert_neon.o

 NEON-OBJS-$(CONFIG_AC3DSP)             += arm/ac3dsp_neon.o
 NEON-OBJS-$(CONFIG_BLOCKDSP)           += arm/blockdsp_init_neon.o      \
                                          arm/blockdsp_neon.o
 NEON-OBJS-$(CONFIG_DSPUTIL)            += arm/dsputil_init_neon.o       \
                                          arm/dsputil_neon.o            \
                                          arm/int_neon.o                \
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -0,0 +1,26 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_ARM_BLOCKDSP_ARM_H
 #define AVCODEC_ARM_BLOCKDSP_ARM_H

 #include "libavcodec/blockdsp.h"

 /* Install the NEON clear_block / clear_blocks routines into c.
 * Nothing is changed when high_bit_depth is non-zero.
 * Called from ff_blockdsp_init_arm() when the CPU flags report NEON. */
 void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);

 #endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -0,0 +1,33 @@
 /*
 * ARM optimized block operations
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/blockdsp.h"
 #include "blockdsp_arm.h"

 /* ARM entry point for BlockDSPContext setup: query the CPU flags once and
 * hand over to the NEON initialiser when NEON is reported as available.
 * c and high_bit_depth are forwarded unchanged. */
 av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
 {
    const int flags = av_get_cpu_flags();

    /* Without NEON there is nothing to install; keep what c already has. */
    if (!have_neon(flags))
        return;

    ff_blockdsp_init_neon(c, high_bit_depth);
 }
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -0,0 +1,37 @@
 /*
 * ARM NEON optimised block operations
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "libavutil/attributes.h"
 #include "libavcodec/blockdsp.h"
 #include "blockdsp_arm.h"

 /* NEON routines implemented in arm/blockdsp_neon.S. */
 void ff_clear_block_neon(int16_t *block);
 void ff_clear_blocks_neon(int16_t *blocks);

 /* Point the block-clearing callbacks of c at their NEON versions.
 * The NEON routines are only installed when high_bit_depth is zero;
 * otherwise c is left exactly as the caller prepared it. */
 av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
 {
    if (high_bit_depth)
        return;

    c->clear_blocks = ff_clear_blocks_neon;
    c->clear_block  = ff_clear_block_neon;
 }
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -0,0 +1,38 @@
 /*
 * ARM NEON optimised block functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/arm/asm.S"

 @ void ff_clear_block_neon(int16_t *block)
 @ Zero 128 bytes (64 int16_t) starting at r0.
 @ r0 is used with a 128-bit alignment qualifier and is advanced past the
 @ cleared area by the post-increment stores; q0 is overwritten.
 function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0                 @ q0 = eight zero halfwords
        .rept           8                       @ 8 stores x 16 bytes = 128 bytes
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
 endfunc

 @ void ff_clear_blocks_neon(int16_t *blocks)
 @ Zero 768 bytes (6 * 64 int16_t) starting at r0.
 @ r0 is used with a 128-bit alignment qualifier and is advanced past the
 @ cleared area by the post-increment stores; q0 is overwritten.
 function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0                 @ q0 = eight zero halfwords
        .rept           8*6                     @ 48 stores x 16 bytes = 768 bytes
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
 endfunc
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -30,9 +30,6 @@ void ff_simple_idct_neon(int16_t *data);
 void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
 void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);

 void ff_clear_block_neon(int16_t *block);
 void ff_clear_blocks_neon(int16_t *blocks);

 void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
@@ -61,11 +58,6 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;

    if (!high_bit_depth) {
        c->clear_block  = ff_clear_block_neon;
        c->clear_blocks = ff_clear_blocks_neon;
    }

    c->vector_clipf      = ff_vector_clipf_neon;
    c->vector_clip_int32 = ff_vector_clip_int32_neon;

--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -21,22 +21,6 @@

 #include "libavutil/arm/asm.S"

 function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
 endfunc

 function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
 endfunc

 function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -31,12 +31,14 @@
 #include "libavutil/mem.h"

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "get_bits.h"
 #include "put_bits.h"

 typedef struct ASV1Context{
    AVCodecContext *avctx;
    BlockDSPContext bdsp;
    DSPContext dsp;
    PutBitContext pb;
    GetBitContext gb;
--- a/libavcodec/asvdec.c
+++ b/libavcodec/asvdec.c
@@ -28,6 +28,7 @@

 #include "asv.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mpeg12data.h"
@@ -163,7 +164,7 @@ static inline int decode_mb(ASV1Context *a, int16_t block[6][64])
 {
    int i;

    a->dsp.clear_blocks(block[0]);
    a->bdsp.clear_blocks(block[0]);

    if (a->avctx->codec_id == AV_CODEC_ID_ASV1) {
        for (i = 0; i < 6; i++) {
@@ -276,6 +277,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    }

    ff_asv_common_init(avctx);
    ff_blockdsp_init(&a->bdsp, avctx);
    init_vlcs(a);
    ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_asv_scantab);
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -24,9 +24,9 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "binkdata.h"
 #include "binkdsp.h"
 #include "blockdsp.h"
 #include "hpeldsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -113,7 +113,7 @@ typedef struct Bundle {
 */
 typedef struct BinkContext {
    AVCodecContext *avctx;
    DSPContext     dsp;
    BlockDSPContext bdsp;
    HpelDSPContext hdsp;
    BinkDSPContext binkdsp;
    AVFrame        *last;
@@ -886,7 +886,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                } else {
                    put_pixels8x8_overlapped(dst, ref, stride);
                }
                c->dsp.clear_block(block);
                c->bdsp.clear_block(block);
                v = binkb_get_value(c, BINKB_SRC_INTER_COEFS);
                read_residue(gb, block, v);
                c->binkdsp.add_pixels8(dst, block, stride);
@@ -910,7 +910,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                break;
            case 5:
                v = binkb_get_value(c, BINKB_SRC_COLORS);
                c->dsp.fill_block_tab[1](dst, v, stride, 8);
                c->bdsp.fill_block_tab[1](dst, v, stride, 8);
                break;
            case 6:
                for (i = 0; i < 2; i++)
@@ -1053,7 +1053,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                    break;
                case FILL_BLOCK:
                    v = get_value(c, BINK_SRC_COLORS);
                    c->dsp.fill_block_tab[0](dst, v, stride, 16);
                    c->bdsp.fill_block_tab[0](dst, v, stride, 16);
                    break;
                case PATTERN_BLOCK:
                    for (i = 0; i < 2; i++)
@@ -1123,7 +1123,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                    return AVERROR_INVALIDDATA;
                }
                c->hdsp.put_pixels_tab[1][0](dst, ref, stride, 8);
                c->dsp.clear_block(block);
                c->bdsp.clear_block(block);
                v = get_bits(gb, 7);
                read_residue(gb, block, v);
                c->binkdsp.add_pixels8(dst, block, stride);
@@ -1136,7 +1136,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                break;
            case FILL_BLOCK:
                v = get_value(c, BINK_SRC_COLORS);
                c->dsp.fill_block_tab[1](dst, v, stride, 8);
                c->bdsp.fill_block_tab[1](dst, v, stride, 8);
                break;
            case INTER_BLOCK:
                xoff = get_value(c, BINK_SRC_X_OFF);
@@ -1306,7 +1306,7 @@ static av_cold int decode_init(AVCodecContext *avctx)

    avctx->pix_fmt = c->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;

    ff_dsputil_init(&c->dsp, avctx);
    ff_blockdsp_init(&c->bdsp, avctx);
    ff_hpeldsp_init(&c->hdsp, avctx->flags);
    ff_binkdsp_init(&c->binkdsp);

--- a/libavcodec/blockdsp.c
+++ b/libavcodec/blockdsp.c
@@ -0,0 +1,78 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>
 #include <string.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "version.h"

 /* Portable C fallback: set one block of 64 int16_t coefficients to zero. */
 static void clear_block_8_c(int16_t *block)
 {
    enum { COEFFS_PER_BLOCK = 64 };

    memset(block, 0, COEFFS_PER_BLOCK * sizeof(*block));
 }

 /* Portable C fallback: zero six consecutive blocks of 64 int16_t
 * coefficients, i.e. 384 coefficients starting at blocks. */
 static void clear_blocks_8_c(int16_t *blocks)
 {
    enum { BLOCK_COUNT = 6, COEFFS_PER_BLOCK = 64 };

    memset(blocks, 0, BLOCK_COUNT * COEFFS_PER_BLOCK * sizeof(*blocks));
 }

 /* Portable C fallback: write value into the first 16 bytes of each of h
 * rows. The first row starts at block and successive rows are line_size
 * bytes apart. Nothing is written when h is zero or negative. */
 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
    uint8_t *row = block;
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--, row += line_size)
        memset(row, value, 16);
 }

 /* Portable C fallback: write value into the first 8 bytes of each of h
 * rows. The first row starts at block and successive rows are line_size
 * bytes apart. Nothing is written when h is zero or negative. */
 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
    uint8_t *row = block;
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--, row += line_size)
        memset(row, value, 8);
 }

 /* Fill c with the portable C implementations, then give the initialiser
 * of the architecture selected at build time the chance to replace them.
 *
 * c     - context to populate; every member is written here.
 * avctx - codec context; bits_per_raw_sample > 8 is passed on to the
 *         architecture initialisers as high_bit_depth, and with
 *         FF_API_XVMC enabled the x86 initialiser also receives avctx. */
 av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
 {
    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;

    /* C defaults; set before any architecture initialiser runs. */
    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->clear_block       = clear_block_8_c;
    c->clear_blocks      = clear_blocks_8_c;

    if (ARCH_ARM) {
        ff_blockdsp_init_arm(c, high_bit_depth);
    }
    if (ARCH_PPC) {
        ff_blockdsp_init_ppc(c, high_bit_depth);
    }
    if (ARCH_X86) {
        /* The x86 prototype differs depending on FF_API_XVMC. */
 #if FF_API_XVMC
        ff_blockdsp_init_x86(c, high_bit_depth, avctx);
 #else
        ff_blockdsp_init_x86(c, high_bit_depth);
 #endif /* FF_API_XVMC */
    }
 }
--- a/libavcodec/blockdsp.h
+++ b/libavcodec/blockdsp.h
@@ -0,0 +1,52 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_BLOCKDSP_H
 #define AVCODEC_BLOCKDSP_H

 #include <stdint.h>

 #include "avcodec.h"
 #include "version.h"

 /* Fill a block of 8-bit samples with a constant.
 * block points at the first row, value is the byte stored in every
 * sample, line_size is the distance in bytes between the starts of
 * successive rows and h is the number of rows written. The width
 * (8 or 16 samples) is fixed by the implementation, not passed in.
 * NOTE(review): the comment inherited from dsputil.h described
 * op_pixels_func (block sizes 8x4, 8x8, 16x8, 16x16; h limited to
 * { width / 2, width }, never larger than 16 and never smaller than 4).
 * Whether those limits on h also hold for fill functions is not
 * established by this header -- TODO confirm. */
 typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
                             uint8_t value, int line_size, int h);

 /* Function table for block clearing and filling.
 * Populated by ff_blockdsp_init(), which installs the C versions and
 * then lets the architecture-specific initialiser replace entries. */
 typedef struct BlockDSPContext {
    /* Zero one block; the C version clears 64 int16_t coefficients. */
    void (*clear_block)(int16_t *block /* align 16 */);
    /* Zero a run of blocks; the C version clears 6 * 64 int16_t. */
    void (*clear_blocks)(int16_t *blocks /* align 16 */);

    /* [0] fills rows 16 samples wide, [1] fills rows 8 samples wide. */
    op_fill_func fill_block_tab[2];
 } BlockDSPContext;

 /* Initialise every member of c with the C implementations, then call the
 * initialiser of the architecture the build targets (ARM, PPC or x86).
 * The high_bit_depth argument those initialisers receive is
 * avctx->bits_per_raw_sample > 8. */
 void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx);

 /* Architecture-specific initialisers, called from ff_blockdsp_init().
 * The x86 variant additionally takes avctx while FF_API_XVMC is enabled. */
 void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth);
 void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth);
 #if FF_API_XVMC
 void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
                          AVCodecContext *avctx);
 #else
 void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth);
 #endif /* FF_API_XVMC */

 #endif /* AVCODEC_BLOCKDSP_H */
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -761,6 +761,7 @@ av_cold int ff_cavs_init(AVCodecContext *avctx)
 {
    AVSContext *h = avctx->priv_data;

    ff_blockdsp_init(&h->bdsp, avctx);
    ff_dsputil_init(&h->dsp, avctx);
    ff_h264chroma_init(&h->h264chroma, 8);
    ff_videodsp_init(&h->vdsp, 8);
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -23,6 +23,7 @@
 #define AVCODEC_CAVS_H

 #include "cavsdsp.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "h264chroma.h"
 #include "get_bits.h"
@@ -162,6 +163,7 @@ typedef struct AVSFrame {
 typedef struct AVSContext {
    AVCodecContext *avctx;
    DSPContext       dsp;
    BlockDSPContext bdsp;
    H264ChromaContext h264chroma;
    VideoDSPContext vdsp;
    CAVSDSPContext  cdsp;
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -589,7 +589,7 @@ static int decode_residual_block(AVSContext *h, GetBitContext *gb,
                      dequant_shift[qp], i)) < 0)
        return ret;
    h->cdsp.cavs_idct8_add(dst, block, stride);
    h->dsp.clear_block(block);
    h->bdsp.clear_block(block);
    return 0;
 }

--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -25,6 +25,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "get_bits.h"
 #include "dnxhddata.h"
 #include "dsputil.h"
@@ -34,6 +35,7 @@
 typedef struct DNXHDContext {
    AVCodecContext *avctx;
    GetBitContext gb;
    BlockDSPContext bdsp;
    int64_t cid;                        ///< compression id
    unsigned int width, height;
    unsigned int mb_width, mb_height;
@@ -142,6 +144,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
        ctx->avctx->bits_per_raw_sample = 10;
        if (ctx->bit_depth != 10) {
            ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
            ff_dsputil_init(&ctx->dsp, ctx->avctx);
            ctx->bit_depth = 10;
            ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
@@ -151,6 +154,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
        ctx->avctx->bits_per_raw_sample = 10;
        if (ctx->bit_depth != 10) {
            ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
            ff_dsputil_init(&ctx->dsp, ctx->avctx);
            ctx->bit_depth = 10;
            ctx->decode_dct_block = dnxhd_decode_dct_block_10;
@@ -159,6 +163,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
        ctx->avctx->bits_per_raw_sample = 8;
        if (ctx->bit_depth != 8) {
            ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
            ff_dsputil_init(&ctx->dsp, ctx->avctx);
            ctx->bit_depth = 8;
            ctx->decode_dct_block = dnxhd_decode_dct_block_8;
@@ -338,12 +343,12 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
    }

    for (i = 0; i < 8; i++) {
        ctx->dsp.clear_block(ctx->blocks[i]);
        ctx->bdsp.clear_block(ctx->blocks[i]);
        ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
    }
    if (ctx->is_444) {
        for (; i < 12; i++) {
            ctx->dsp.clear_block(ctx->blocks[i]);
            ctx->bdsp.clear_block(ctx->blocks[i]);
            ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
        }
    }
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -29,6 +29,7 @@
 #include "libavutil/timer.h"

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "internal.h"
 #include "mpegvideo.h"
@@ -320,6 +321,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)

    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;

    ff_blockdsp_init(&ctx->bdsp, avctx);
    ff_dct_common_init(&ctx->m);
    ff_dct_encode_init(&ctx->m);

@@ -577,10 +579,10 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
                                    ptr_v + ctx->dct_uv_offset,
                                    ctx->m.uvlinesize);
        } else {
            dsp->clear_block(ctx->blocks[4]);
            dsp->clear_block(ctx->blocks[5]);
            dsp->clear_block(ctx->blocks[6]);
            dsp->clear_block(ctx->blocks[7]);
            ctx->bdsp.clear_block(ctx->blocks[4]);
            ctx->bdsp.clear_block(ctx->blocks[5]);
            ctx->bdsp.clear_block(ctx->blocks[6]);
            ctx->bdsp.clear_block(ctx->blocks[7]);
        }
    } else {
        dsp->get_pixels(ctx->blocks[4],
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -41,6 +41,7 @@ typedef struct RCEntry {

 typedef struct DNXHDEncContext {
    AVClass *class;
    BlockDSPContext bdsp;
    MpegEncContext m; ///< Used for quantization dsp functions

    int cid;
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -436,26 +436,6 @@ static int sum_abs_dctelem_c(int16_t *block)
    return sum;
 }

 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
 }

 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
 }

 #define avg2(a, b) ((a + b + 1) >> 1)
 #define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)

@@ -1517,16 +1497,6 @@ static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
 }

 static void clear_block_8_c(int16_t *block)
 {
    memset(block, 0, sizeof(int16_t) * 64);
 }

 static void clear_blocks_8_c(int16_t *blocks)
 {
    memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 }

 /* init static data */
 av_cold void ff_dsputil_static_init(void)
 {
@@ -1641,9 +1611,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    c->pix_sum   = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
@@ -1705,9 +1672,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)

    c->draw_edges = draw_edges_8_c;

    c->clear_block  = clear_block_8_c;
    c->clear_blocks = clear_blocks_8_c;

    switch (avctx->bits_per_raw_sample) {
    case 9:
    case 10:
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -51,13 +51,6 @@ void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 * !future video codecs might need functions with less strict alignment
 */

 /* add and put pixel (decoding)
 * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
 * h for op_pixels_func is limited to { width / 2, width },
 * but never larger than 16 and never smaller than 4. */
 typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
                             uint8_t value, int line_size, int h);

 struct MpegEncContext;
 /* Motion estimation:
 * h is limited to { width / 2, width, 2 * width },
@@ -116,8 +109,7 @@ typedef struct DSPContext {
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height);
    void (*clear_block)(int16_t *block /* align 16 */);
    void (*clear_blocks)(int16_t *blocks /* align 16 */);

    int (*pix_sum)(uint8_t *pix, int line_size);
    int (*pix_norm1)(uint8_t *pix, int line_size);

@@ -236,8 +228,6 @@ typedef struct DSPContext {
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);

    op_fill_func fill_block_tab[2];
 } DSPContext;

 void ff_dsputil_static_init(void);
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -44,6 +44,7 @@

 typedef struct MadContext {
    AVCodecContext *avctx;
    BlockDSPContext bdsp;
    DSPContext dsp;
    AVFrame *last_frame;
    GetBitContext gb;
@@ -61,6 +62,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    MadContext *s = avctx->priv_data;
    s->avctx = avctx;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
    ff_blockdsp_init(&s->bdsp, avctx);
    ff_dsputil_init(&s->dsp, avctx);
    ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM);
    ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct);
@@ -213,7 +215,7 @@ static int decode_mb(MadContext *s, AVFrame *frame, int inter)
            if (s->last_frame->data[0])
                comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
        } else {
            s->dsp.clear_block(s->block);
            s->bdsp.clear_block(s->block);
            if(decode_block_intra(s, s->block) < 0)
                return -1;
            idct_put(s, frame, s->block, s->mb_x, s->mb_y, j);
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -27,6 +27,7 @@
 */

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "get_bits.h"
 #include "aandcttab.h"
 #include "eaidct.h"
@@ -46,6 +47,7 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx)
    TqiContext *t = avctx->priv_data;
    MpegEncContext *s = &t->s;
    s->avctx = avctx;
    ff_blockdsp_init(&s->bdsp, avctx);
    ff_dsputil_init(&s->dsp, avctx);
    ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM);
    ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
@@ -59,7 +61,7 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx)
 static int tqi_decode_mb(MpegEncContext *s, int16_t (*block)[64])
 {
    int n;
    s->dsp.clear_blocks(block[0]);
    s->bdsp.clear_blocks(block[0]);
    for (n=0; n<6; n++)
        if (ff_mpeg1_decode_block_intra(s, block[n], n) < 0)
            return -1;
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -29,6 +29,7 @@

 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "bytestream.h"
 #include "dsputil.h"
 #include "get_bits.h"
@@ -72,6 +73,7 @@ static const uint8_t chroma_quant[64] = {
 };

 typedef struct JPGContext {
    BlockDSPContext bdsp;
    DSPContext dsp;
    ScanTable  scantable;

@@ -150,6 +152,7 @@ static av_cold int jpg_init(AVCodecContext *avctx, JPGContext *c)
    if (ret)
        return ret;

    ff_blockdsp_init(&c->bdsp, avctx);
    ff_dsputil_init(&c->dsp, avctx);
    ff_init_scantable(c->dsp.idct_permutation, &c->scantable,
                      ff_zigzag_direct);
@@ -193,7 +196,7 @@ static int jpg_decode_block(JPGContext *c, GetBitContext *gb,
    const int is_chroma = !!plane;
    const uint8_t *qmat = is_chroma ? chroma_quant : luma_quant;

    c->dsp.clear_block(block);
    c->bdsp.clear_block(block);
    dc = get_vlc2(gb, c->dc_vlc[is_chroma].table, 9, 3);
    if (dc < 0)
        return AVERROR_INVALIDDATA;
@@ -259,7 +262,7 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
    for (i = 0; i < 3; i++)
        c->prev_dc[i] = 1024;
    bx = by = 0;
    c->dsp.clear_blocks(c->block[0]);
    c->bdsp.clear_blocks(c->block[0]);
    for (mb_y = 0; mb_y < mb_h; mb_y++) {
        for (mb_x = 0; mb_x < mb_w; mb_x++) {
            if (mask && !mask[mb_x * 2] && !mask[mb_x * 2 + 1] &&
--- a/libavcodec/h261dec.c
+++ b/libavcodec/h261dec.c
@@ -442,7 +442,7 @@ static int h261_decode_mb(H261Context *h)
 intra:
    /* decode each block */
    if (s->mb_intra || HAS_CBP(h->mtype)) {
        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);
        for (i = 0; i < 6; i++) {
            if (h261_decode_block(h, s->block[i], i, cbp & 32) < 0)
                return SLICE_ERROR;
--- a/libavcodec/h263.h
+++ b/libavcodec/h263.h
@@ -196,7 +196,7 @@ static inline int get_p_cbp(MpegEncContext * s,
        for (i = 0; i < 6; i++) {
            if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){
                s->block_last_index[i]= -1;
                s->dsp.clear_block(s->block[i]);
                s->bdsp.clear_block(s->block[i]);
            }
        }
    }else{
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -535,7 +535,7 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){
    int sign;

    av_assert2(w->orient<12);
    s->dsp.clear_block(s->block[0]);
    s->bdsp.clear_block(s->block[0]);

    if(chroma){
        dc_mode=2;
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -517,7 +517,7 @@ retry:
                rl = &ff_rl_intra_aic;
                i = 0;
                s->gb= gb;
                s->dsp.clear_block(block);
                s->bdsp.clear_block(block);
                goto retry;
            }
            av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d i:%d\n", s->mb_x, s->mb_y, s->mb_intra);
@@ -610,7 +610,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
            }
        }while(cbpc == 20);

        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);

        dquant = cbpc & 8;
        s->mb_intra = ((cbpc & 4) != 0);
@@ -705,7 +705,7 @@ int ff_h263_decode_mb(MpegEncContext *s,

        s->mb_intra = IS_INTRA(mb_type);
        if(HAS_CBP(mb_type)){
            s->dsp.clear_blocks(s->block[0]);
            s->bdsp.clear_blocks(s->block[0]);
            cbpc = get_vlc2(&s->gb, cbpc_b_vlc.table, CBPC_B_VLC_BITS, 1);
            if(s->mb_intra){
                dquant = IS_QUANT(mb_type);
@@ -777,7 +777,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
            }
        }while(cbpc == 8);

        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);

        dquant = cbpc & 4;
        s->mb_intra = 1;
--- a/libavcodec/jvdec.c
+++ b/libavcodec/jvdec.c
@@ -28,12 +28,12 @@
 #include "libavutil/intreadwrite.h"

 #include "avcodec.h"
 #include "dsputil.h"
 #include "blockdsp.h"
 #include "get_bits.h"
 #include "internal.h"

 typedef struct JvContext {
    DSPContext dsp;
    BlockDSPContext bdsp;
    AVFrame   *frame;
    uint32_t   palette[AVPALETTE_COUNT];
    int        palette_has_changed;
@@ -48,7 +48,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
        return AVERROR(ENOMEM);

    avctx->pix_fmt = AV_PIX_FMT_PAL8;
    ff_dsputil_init(&s->dsp, avctx);
    ff_blockdsp_init(&s->bdsp, avctx);
    return 0;
 }

@@ -113,14 +113,14 @@ static inline void decode4x4(GetBitContext *gb, uint8_t *dst, int linesize)
 * Decode 8x8 block
 */
 static inline void decode8x8(GetBitContext *gb, uint8_t *dst, int linesize,
                             DSPContext *dsp)
                             BlockDSPContext *bdsp)
 {
    int i, j, v[2];

    switch (get_bits(gb, 2)) {
    case 1:
        v[0] = get_bits(gb, 8);
        dsp->fill_block_tab[1](dst, v[0], linesize, 8);
        bdsp->fill_block_tab[1](dst, v[0], linesize, 8);
        break;
    case 2:
        v[0] = get_bits(gb, 8);
@@ -167,7 +167,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                for (i = 0; i < avctx->width; i += 8)
                    decode8x8(&gb,
                              s->frame->data[0] + j * s->frame->linesize[0] + i,
                              s->frame->linesize[0], &s->dsp);
                              s->frame->linesize[0], &s->bdsp);

            buf += video_size;
        } else if (video_type == 2) {
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -28,12 +28,14 @@
 */

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "mpegvideo.h"
 #include "mpeg12.h"
 #include "thread.h"

 typedef struct MDECContext {
    AVCodecContext *avctx;
    BlockDSPContext bdsp;
    DSPContext dsp;
    ThreadFrame frame;
    GetBitContext gb;
@@ -123,7 +125,7 @@ static inline int decode_mb(MDECContext *a, int16_t block[6][64])
    int i, ret;
    static const int block_index[6] = { 5, 4, 0, 1, 2, 3 };

    a->dsp.clear_blocks(block[0]);
    a->bdsp.clear_blocks(block[0]);

    for (i = 0; i < 6; i++) {
        if ((ret = mdec_decode_block_intra(a, block[block_index[i]],
@@ -208,6 +210,7 @@ static av_cold int decode_init(AVCodecContext *avctx)

    a->avctx           = avctx;

    ff_blockdsp_init(&a->bdsp, avctx);
    ff_dsputil_init(&a->dsp, avctx);
    ff_mpeg12_init_vlcs();
    ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_zigzag_direct);
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -24,6 +24,7 @@
 #include <stdint.h>

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "bytestream.h"
@@ -52,6 +53,7 @@ typedef struct {

    GetBitContext   gb;
    ScanTable       scantable;
    BlockDSPContext bdsp;
    DSPContext      dsp;
    HpelDSPContext  hdsp;
    VLC             vlc;
@@ -146,6 +148,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
        av_log(avctx, AV_LOG_ERROR, "error initializing vlc table\n");
        return ret;
    }
    ff_blockdsp_init(&ctx->bdsp, avctx);
    ff_dsputil_init(&ctx->dsp, avctx);
    ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
    ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, col_zag);
@@ -228,7 +231,7 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
    int16_t *block = ctx->dct_block;
    unsigned int pos;

    ctx->dsp.clear_block(block);
    ctx->bdsp.clear_block(block);

    block[0] = get_bits(&ctx->gb, 8) << 3;

--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -34,6 +34,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "copy_block.h"
 #include "internal.h"
 #include "mjpeg.h"
@@ -106,6 +107,7 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
    }

    s->avctx = avctx;
    ff_blockdsp_init(&s->bdsp, avctx);
    ff_hpeldsp_init(&s->hdsp, avctx->flags);
    ff_dsputil_init(&s->dsp, avctx);
    ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct);
@@ -652,7 +654,7 @@ static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block,
                                 int16_t *quant_matrix, int Al)
 {
    int val;
    s->dsp.clear_block(block);
    s->bdsp.clear_block(block);
    val = mjpeg_decode_dc(s, dc_index);
    if (val == 0xfffff) {
        av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
@@ -1217,7 +1219,7 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                                             linesize[c], s->avctx->lowres);

                        else {
                            s->dsp.clear_block(s->block);
                            s->bdsp.clear_block(s->block);
                            if (decode_block(s, s->block, i,
                                             s->dc_index[i], s->ac_index[i],
                                             s->quant_matrixes[s->quant_sindex[i]]) < 0) {
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -34,6 +34,7 @@
 #include "libavutil/stereo3d.h"

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "get_bits.h"
 #include "dsputil.h"
 #include "hpeldsp.h"
@@ -104,6 +105,7 @@ typedef struct MJpegDecodeContext {
    uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
    int palette_index;
    ScanTable scantable;
    BlockDSPContext bdsp;
    DSPContext dsp;
    HpelDSPContext hdsp;

--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -799,10 +799,10 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
    av_dlog(s->avctx, "mb_type=%x\n", mb_type);
 //    motion_type = 0; /* avoid warning */
    if (IS_INTRA(mb_type)) {
        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);

        if (!s->chroma_y_shift)
            s->dsp.clear_blocks(s->block[6]);
            s->bdsp.clear_blocks(s->block[6]);

        /* compute DCT type */
        // FIXME: add an interlaced_dct coded var?
@@ -1039,13 +1039,13 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])

        s->mb_intra = 0;
        if (HAS_CBP(mb_type)) {
            s->dsp.clear_blocks(s->block[0]);
            s->bdsp.clear_blocks(s->block[0]);

            cbp = get_vlc2(&s->gb, ff_mb_pat_vlc.table, MB_PAT_VLC_BITS, 1);
            if (mb_block_count > 6) {
                cbp <<= mb_block_count - 6;
                cbp  |= get_bits(&s->gb, mb_block_count - 6);
                s->dsp.clear_blocks(s->block[6]);
                s->bdsp.clear_blocks(s->block[6]);
            }
            if (cbp <= 0) {
                av_log(s->avctx, AV_LOG_ERROR,
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -1261,7 +1261,7 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])

    if (!IS_SKIP(mb_type)) {
        int i;
        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);
        /* decode each block */
        for (i = 0; i < 6; i++) {
            if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, s->mb_intra, ctx->rvlc) < 0) {
@@ -1339,7 +1339,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
            }
        } while (cbpc == 20);

        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);
        dquant      = cbpc & 8;
        s->mb_intra = ((cbpc & 4) != 0);
        if (s->mb_intra)
@@ -1485,7 +1485,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
            if (modb2) {
                cbp = 0;
            } else {
                s->dsp.clear_blocks(s->block[0]);
                s->bdsp.clear_blocks(s->block[0]);
                cbp = get_bits(&s->gb, 6);
            }

@@ -1620,7 +1620,7 @@ intra:
        if (!s->progressive_sequence)
            s->interlaced_dct = get_bits1(&s->gb);

        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);
        /* decode each block */
        for (i = 0; i < 6; i++) {
            if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, 1, 0) < 0)
--- a/libavcodec/mpeg4videoenc.c
+++ b/libavcodec/mpeg4videoenc.c
@@ -485,7 +485,7 @@ static inline int get_b_cbp(MpegEncContext *s, int16_t block[6][64],
        for (i = 0; i < 6; i++) {
            if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i)) & 1) == 0) {
                s->block_last_index[i] = -1;
                s->dsp.clear_block(s->block[i]);
                s->bdsp.clear_block(s->block[i]);
            }
        }
    } else {
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -33,6 +33,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "h264chroma.h"
 #include "internal.h"
@@ -352,7 +353,7 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
    ff_init_block_index(s);
    ff_update_block_index(s);

    s->dsp.clear_blocks(s->block[0]);
    s->bdsp.clear_blocks(s->block[0]);

    s->dest[0] = s->current_picture.f->data[0] + (s->mb_y *  16                       * s->linesize)   + s->mb_x *  16;
    s->dest[1] = s->current_picture.f->data[1] + (s->mb_y * (16 >> s->chroma_y_shift) * s->uvlinesize) + s->mb_x * (16 >> s->chroma_x_shift);
@@ -378,6 +379,7 @@ static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
 /* init common dct for both encoder and decoder */
 av_cold int ff_dct_common_init(MpegEncContext *s)
 {
    ff_blockdsp_init(&s->bdsp, s->avctx);
    ff_dsputil_init(&s->dsp, s->avctx);
    ff_h264chroma_init(&s->h264chroma, 8); //for lowres
    ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -29,6 +29,7 @@
 #define AVCODEC_MPEGVIDEO_H

 #include "avcodec.h"
 #include "blockdsp.h"
 #include "dsputil.h"
 #include "error_resilience.h"
 #include "get_bits.h"
@@ -356,6 +357,7 @@ typedef struct MpegEncContext {
    int unrestricted_mv;        ///< mv can point outside of the coded picture
    int h263_long_vectors;      ///< use horrible h263v1 long vector mode

    BlockDSPContext bdsp;
    DSPContext dsp;             ///< pointers for accelerated dsp functions
    H264ChromaContext h264chroma;
    HpelDSPContext hdsp;
--- a/libavcodec/msmpeg4dec.c
+++ b/libavcodec/msmpeg4dec.c
@@ -179,7 +179,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
        *mb_type_ptr = MB_TYPE_INTRA;
    }

    s->dsp.clear_blocks(s->block[0]);
    s->bdsp.clear_blocks(s->block[0]);
    for (i = 0; i < 6; i++) {
        if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
        {
@@ -270,7 +270,7 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64])
        }
    }

    s->dsp.clear_blocks(s->block[0]);
    s->bdsp.clear_blocks(s->block[0]);
    for (i = 0; i < 6; i++) {
        if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
        {
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -1,5 +1,6 @@
 OBJS                                   += ppc/fmtconvert_altivec.o      \

 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
 OBJS-$(CONFIG_DSPUTIL)                 += ppc/dsputil_ppc.o
 OBJS-$(CONFIG_FFT)                     += ppc/fft_altivec.o
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -0,0 +1,169 @@
 /*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "config.h"
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif
 #include <string.h>

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavcodec/blockdsp.h"

 /* ***** WARNING ***** WARNING ***** WARNING ***** */
 /*
 * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
 * a cache line size not equal to 32 bytes. Fortunately all processors used
 * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
 * cache lines. This is due to the use of the 'dcbz' instruction. It simply
 * clears a single cache line to zero, so you need to know the cache line
 * size to use it! It's absurd, but it's fast...
 *
 * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
 * cache line size: 128 bytes. Oops.
 * The semantics of dcbz was changed, it always clears 32 bytes. So the function
 * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
 * which is defined to clear a cache line (as dcbz before). So we can still
 * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
 *
 * see <http://developer.apple.com/technotes/tn/tn2087.html>
 * and <http://developer.apple.com/technotes/tn/tn2086.html>
 */
 /**
  * Zero 6 * 64 int16_t coefficients (768 bytes) using 'dcbz', which is
  * assumed here to clear exactly 32 bytes per invocation.
  *
  * @param blocks start of the coefficient buffer. Bit 4 of the address decides
  *               whether the buffer starts in the upper half of a 32-byte
  *               line; in that case the first and last 16 bytes are cleared
  *               separately so that dcbz never touches bytes outside the
  *               buffer.
  */
 static void clear_blocks_dcbz32_ppc(int16_t *blocks)
 {
    register int misal = (unsigned long) blocks & 0x00000010, i = 0;

    if (misal) {
        /* Clear exactly the leading 16 bytes. memset keeps the byte count
         * independent of sizeof(unsigned long) (8 on 64-bit PowerPC) and
         * avoids writing int16_t storage through an incompatible type. */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* Each dcbz clears the 32 bytes starting at blocks + i. */
    for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32)
        __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory");
    if (misal) {
        /* The loop stopped 16 bytes short of the end; clear that tail. */
        memset((char *) blocks + sizeof(int16_t) * 6 * 64 - 16, 0, 16);
    }
 }

 /* Same as above, when dcbzl clears a whole 128 bytes cache line
 * i.e. the PPC970 AKA G5. */
 /**
  * Zero 6 * 64 int16_t coefficients (768 bytes).
  *
  * When HAVE_DCBZL is set and the buffer is 128-byte aligned, one 'dcbzl'
  * is issued per 128 bytes; in every other case the buffer is cleared with
  * a plain memset().
  *
  * @param blocks start of the coefficient buffer
  */
 static void clear_blocks_dcbz128_ppc(int16_t *blocks)
 {
 #if HAVE_DCBZL
    register int offset = 0;

    /* Buffer not 128-byte aligned: take the generic path rather than
     * special-casing the partial lines. */
    if ((unsigned long) blocks & 0x0000007f) {
        memset(blocks, 0, sizeof(int16_t) * 6 * 64);
        return;
    }

    while (offset < sizeof(int16_t) * 6 * 64) {
        __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (offset) : "memory");
        offset += 128;
    }
 #else
    memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 #endif
 }

 /* Check dcbz: report how many bytes a single dcbz sets to 0. */
 /* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl ... */
 /**
  * Probe how many bytes one 'dcbzl' instruction clears.
  *
  * A 1024-byte scratch buffer is filled with 0xFF, a single dcbzl is issued
  * at its midpoint, and the zero bytes are counted afterwards.
  *
  * @return the number of zeroed bytes; 0 when HAVE_DCBZL is not set or the
  *         scratch buffer could not be allocated.
  */
 static long check_dcbzl_effect(void)
 {
    long count = 0;
 #if HAVE_DCBZL
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0, i = 0;

    if (!fakedata)
        return 0L;

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so....
     * The "memory" clobber tells the compiler that the instruction writes
     * to the buffer, so the reads below cannot be folded to the 0xFF fill. */
    __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++)
        if (fakedata[i] == (char) 0)
            count++;

    av_free(fakedata);
 #endif

    return count;
 }

 #if HAVE_ALTIVEC
 /**
  * Zero one block with eight 16-byte AltiVec stores (byte offsets 0..112).
  *
  * @param block start of the block to clear
  */
 static void clear_block_altivec(int16_t *block)
 {
    int byte_off;
    LOAD_ZERO;

    for (byte_off = 0; byte_off < 128; byte_off += 16)
        vec_st(zero_s16v, byte_off, block);
 }
 #endif /* HAVE_ALTIVEC */

 /**
  * Install the PowerPC block-clearing routines into a BlockDSPContext.
  *
  * Nothing is installed unless high_bit_depth is zero. clear_blocks is chosen
  * from the byte count reported by check_dcbzl_effect() (32 or 128; any other
  * value leaves the existing pointer alone). clear_block is replaced only
  * when built with HAVE_ALTIVEC and the CPU flags report AltiVec.
  *
  * @param c              context whose function pointers are updated
  * @param high_bit_depth non-zero disables all overrides done here
  */
 av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth)
 {
    // common optimizations whether AltiVec is available or not
    if (!high_bit_depth) {
        long zeroed_bytes = check_dcbzl_effect();

        if (zeroed_bytes == 32)
            c->clear_blocks = clear_blocks_dcbz32_ppc;
        else if (zeroed_bytes == 128)
            c->clear_blocks = clear_blocks_dcbz128_ppc;
    }

 #if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()) && !high_bit_depth)
        c->clear_block = clear_block_altivec;
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -558,19 +558,6 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
    }
 }

 static void clear_block_altivec(int16_t *block)
 {
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
 }

 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, int stride, int h)
 {
@@ -931,7 +918,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
        c->clear_block = clear_block_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -24,125 +24,15 @@

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 #include "dsputil_altivec.h"

 /* ***** WARNING ***** WARNING ***** WARNING ***** */
 /*
 * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
 * a cache line size not equal to 32 bytes. Fortunately all processors used
 * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
 * cache lines. This is due to the use of the 'dcbz' instruction. It simply
 * clears a single cache line to zero, so you need to know the cache line
 * size to use it! It's absurd, but it's fast...
 *
 * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
 * cache line size: 128 bytes. Oups.
 * The semantics of dcbz was changed, it always clears 32 bytes. So the function
 * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
 * which is defined to clear a cache line (as dcbz before). So we can still
 * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
 *
 * see <http://developer.apple.com/technotes/tn/tn2087.html>
 * and <http://developer.apple.com/technotes/tn/tn2086.html>
 */
 static void clear_blocks_dcbz32_ppc(int16_t *blocks)
 {
    register int misal = (unsigned long) blocks & 0x00000010, i = 0;

    if (misal) {
        ((unsigned long *) blocks)[0] = 0L;
        ((unsigned long *) blocks)[1] = 0L;
        ((unsigned long *) blocks)[2] = 0L;
        ((unsigned long *) blocks)[3] = 0L;
        i += 16;
    }
    for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32)
        __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory");
    if (misal) {
        ((unsigned long *) blocks)[188] = 0L;
        ((unsigned long *) blocks)[189] = 0L;
        ((unsigned long *) blocks)[190] = 0L;
        ((unsigned long *) blocks)[191] = 0L;
        i += 16;
    }
 }

 /* Same as above, when dcbzl clears a whole 128 bytes cache line
 * i.e. the PPC970 AKA G5. */
 static void clear_blocks_dcbz128_ppc(int16_t *blocks)
 {
 #if HAVE_DCBZL
    register int misal = (unsigned long) blocks & 0x0000007f, i = 0;

    if (misal) {
        /* We could probably also optimize this case,
         * but there's not much point as the machines
         * aren't available yet (2003-06-26). */
        memset(blocks, 0, sizeof(int16_t) * 6 * 64);
    } else {
        for (; i < sizeof(int16_t) * 6 * 64; i += 128)
            __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory");
    }
 #else
    memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 #endif
 }

 /* Check dcbz report how many bytes are set to 0 by dcbz. */
 /* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl ... */
 static long check_dcbzl_effect(void)
 {
    long count = 0;
 #if HAVE_DCBZL
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0, i = 0;

    if (!fakedata)
        return 0L;

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */
    __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero));

    for (i = 0; i < 1024; i++)
        if (fakedata[i] == (char) 0)
            count++;

    av_free(fakedata);
 #endif

    return count;
 }

 av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
 {
    int mm_flags = av_get_cpu_flags();
    // common optimizations whether AltiVec is available or not
    if (!high_bit_depth) {
        switch (check_dcbzl_effect()) {
        case 32:
            c->clear_blocks = clear_blocks_dcbz32_ppc;
            break;
        case 128:
            c->clear_blocks = clear_blocks_dcbz128_ppc;
            break;
        default:
            break;
        }
    }

    if (PPC_ALTIVEC(mm_flags)) {
        ff_dsputil_init_altivec(c, avctx, high_bit_depth);
        ff_int_init_altivec(c, avctx);
--- a/libavcodec/proresdec.h
+++ b/libavcodec/proresdec.h
@@ -23,6 +23,7 @@
 #define AVCODEC_PRORESDEC_H

 #include "dsputil.h"
 #include "blockdsp.h"
 #include "proresdsp.h"

 typedef struct {
@@ -36,6 +37,7 @@ typedef struct {

 typedef struct {
    DSPContext dsp;
    BlockDSPContext bdsp;
    ProresDSPContext prodsp;
    AVFrame *frame;
    int frame_type;              ///< 0 = progressive, 1 = tff, 2 = bff
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -50,6 +50,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    avctx->bits_per_raw_sample = 10;

    ff_dsputil_init(&ctx->dsp, avctx);
    ff_blockdsp_init(&ctx->bdsp, avctx);
    ff_proresdsp_init(&ctx->prodsp, avctx);

    ff_init_scantable_permutation(idct_permutation,
@@ -366,7 +367,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
    int ret;

    for (i = 0; i < blocks_per_slice; i++)
        ctx->dsp.clear_block(blocks+(i<<6));
        ctx->bdsp.clear_block(blocks+(i<<6));

    init_get_bits(&gb, buf, buf_size << 3);

@@ -399,7 +400,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
    int ret;

    for (i = 0; i < blocks_per_slice; i++)
        ctx->dsp.clear_block(blocks+(i<<6));
        ctx->bdsp.clear_block(blocks+(i<<6));

    init_get_bits(&gb, buf, buf_size << 3);

@@ -478,7 +479,7 @@ static void decode_slice_alpha(ProresContext *ctx,
    int16_t *block;

    for (i = 0; i < blocks_per_slice<<2; i++)
        ctx->dsp.clear_block(blocks+(i<<6));
        ctx->bdsp.clear_block(blocks+(i<<6));

    init_get_bits(&gb, buf, buf_size << 3);

--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -3016,7 +3016,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
    int scale;
    int q1, q2 = 0;

    s->dsp.clear_block(block);
    s->bdsp.clear_block(block);

    /* XXX: Guard against dumb values of mquant */
    mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 31 : mquant);
@@ -3223,7 +3223,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
    int ttblk = ttmb & 7;
    int pat = 0;

    s->dsp.clear_block(block);
    s->bdsp.clear_block(block);

    if (ttmb == -1) {
        ttblk = ff_vc1_ttblk_to_tt[v->tt_index][get_vlc2(gb, ff_vc1_ttblk_vlc[v->tt_index].table, VC1_TTBLK_VLC_BITS, 1)];
@@ -4801,7 +4801,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
            dst[3] = dst[2] + 8;
            dst[4] = s->dest[1];
            dst[5] = s->dest[2];
            s->dsp.clear_blocks(s->block[0]);
            s->bdsp.clear_blocks(s->block[0]);
            mb_pos = s->mb_x + s->mb_y * s->mb_width;
            s->current_picture.mb_type[mb_pos]                     = MB_TYPE_INTRA;
            s->current_picture.qscale_table[mb_pos]                = v->pq;
@@ -4941,7 +4941,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
        for (;s->mb_x < s->mb_width; s->mb_x++) {
            int16_t (*block)[64] = v->block[v->cur_blk_idx];
            ff_update_block_index(s);
            s->dsp.clear_blocks(block[0]);
            s->bdsp.clear_blocks(block[0]);
            mb_pos = s->mb_x + s->mb_y * s->mb_stride;
            s->current_picture.mb_type[mb_pos + v->mb_off]                         = MB_TYPE_INTRA;
            s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
@@ -5626,6 +5626,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
    // That this is necessary might indicate a bug.
    ff_vc1_decode_end(avctx);

    ff_blockdsp_init(&s->bdsp, avctx);
    ff_h264chroma_init(&v->h264chroma, 8);
    ff_qpeldsp_init(&s->qdsp);

--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -28,6 +28,7 @@
 av_cold void ff_wmv2_common_init(Wmv2Context * w){
    MpegEncContext * const s= &w->s;

    ff_blockdsp_init(&s->bdsp, s->avctx);
    ff_wmv2dsp_init(&w->wdsp);
    s->dsp.idct_permutation_type = w->wdsp.idct_perm;
    ff_init_scantable_permutation(s->dsp.idct_permutation,
@@ -60,12 +61,12 @@ static void wmv2_add_block(Wmv2Context *w, int16_t *block1, uint8_t *dst, int st
    case 1:
        ff_simple_idct84_add(dst           , stride, block1);
        ff_simple_idct84_add(dst + 4*stride, stride, w->abt_block2[n]);
        s->dsp.clear_block(w->abt_block2[n]);
        s->bdsp.clear_block(w->abt_block2[n]);
        break;
    case 2:
        ff_simple_idct48_add(dst           , stride, block1);
        ff_simple_idct48_add(dst + 4       , stride, w->abt_block2[n]);
        s->dsp.clear_block(w->abt_block2[n]);
        s->bdsp.clear_block(w->abt_block2[n]);
        break;
    default:
        av_log(s->avctx, AV_LOG_ERROR, "internal error in WMV2 abt\n");
--- a/libavcodec/wmv2dec.c
+++ b/libavcodec/wmv2dec.c
@@ -385,7 +385,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64])
        wmv2_pred_motion(w, &mx, &my);

        if(cbp){
            s->dsp.clear_blocks(s->block[0]);
            s->bdsp.clear_blocks(s->block[0]);
            if(s->per_mb_rl_table){
                s->rl_table_index = decode012(&s->gb);
                s->rl_chroma_table_index = s->rl_table_index;
@@ -431,7 +431,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64])
            s->rl_chroma_table_index = s->rl_table_index;
        }

        s->dsp.clear_blocks(s->block[0]);
        s->bdsp.clear_blocks(s->block[0]);
        for (i = 0; i < 6; i++) {
            if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
            {
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -53,6 +53,7 @@ OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
 OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o

 MMX-OBJS-$(CONFIG_BLOCKDSP)            += x86/blockdsp_mmx.o
 MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
                                          x86/idct_mmx_xvid.o           \
                                          x86/idct_sse2_xvid.o          \
--- a/libavcodec/x86/blockdsp_mmx.c
+++ b/libavcodec/x86/blockdsp_mmx.c
@@ -0,0 +1,61 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/blockdsp.h"
 #include "libavcodec/version.h"

 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);

 /*
  * Install x86 assembly implementations of clear_block / clear_blocks into
  * the given BlockDSPContext, chosen according to the runtime CPU flags.
  *
  * c              context whose clear_block / clear_blocks pointers may be
  *                overwritten
  * high_bit_depth when non-zero, no pointer is touched and the caller's
  *                existing implementations stay in place
  * avctx          only present while FF_API_XVMC is enabled; used to detect
  *                an XvMC-style hwaccel, for which the SSE versions are
  *                skipped
  *
  * The ff_clear_block*() routines are only prototyped in this file and the
  * whole body is gated on HAVE_YASM, so CPU support is tested with the
  * EXTERNAL_*() macros, which pair the runtime flag with external-assembly
  * availability, rather than the INLINE_*() ones.
  */
 #if FF_API_XVMC
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
                                   AVCodecContext *avctx)
 #else
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
 #endif /* FF_API_XVMC */
 {
 #if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    if (high_bit_depth)
        return;

    if (EXTERNAL_MMX(cpu_flags)) {
        c->clear_block  = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }

 #if FF_API_XVMC
    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks, so keep
     * whatever was selected above instead of switching to the SSE versions.
     * avctx only exists in the FF_API_XVMC signature, hence the guard. */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;
 #endif /* FF_API_XVMC */

    if (EXTERNAL_SSE(cpu_flags)) {
        c->clear_block  = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
 #endif /* HAVE_YASM */
 }
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -22,12 +22,10 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/internal.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 #include "libavcodec/simple_idct.h"
 #include "libavcodec/version.h"
 #include "dsputil_x86.h"
 #include "idct_xvid.h"

@@ -82,10 +80,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_MMX_INLINE */

 #if HAVE_MMX_EXTERNAL
    if (!high_bit_depth) {
        c->clear_block  = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }
    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
@@ -113,15 +107,6 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
 #if HAVE_YASM
 #if HAVE_SSE_EXTERNAL
    c->vector_clipf = ff_vector_clipf_sse;

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;

    if (!high_bit_depth) {
        c->clear_block  = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
 #endif
 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = ff_gmc_sse;
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -134,7 +134,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
    } while (--i);
 }


 /* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16. */
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -40,11 +40,6 @@ void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
 void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                       int line_size);

 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);

 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                       int w, int h, int sides);