libavcodec/blockdsp : add AVX version

Also raise the required alignment of the block buffers from 16 to 32 bytes in several codecs, so that they can be cleared with the aligned AVX (YMM) stores.

Signed-off-by: James Almer <jamrial@gmail.com>
8 years ago · cbbec68847
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -54,7 +54,7 @@ typedef struct ASV1Context {
    int mb_height;
    int mb_width2;
    int mb_height2;
    DECLARE_ALIGNED(16, int16_t, block)[6][64];
    DECLARE_ALIGNED(32, int16_t, block)[6][64];
    uint16_t intra_matrix[64];
    int q_intra_matrix[64];
    uint8_t *bitstream_buffer;
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -813,7 +813,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
    int v, col[2];
    const uint8_t *scan;
    int xoff, yoff;
    LOCAL_ALIGNED_16(int16_t, block, [64]);
    LOCAL_ALIGNED_32(int16_t, block, [64]);
    LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
    int coordmap[64];
    int ybias = is_key ? -15 : 0;
@@ -976,7 +976,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
    uint8_t *dst, *prev, *ref_start, *ref_end;
    int v, col[2];
    const uint8_t *scan;
    LOCAL_ALIGNED_16(int16_t, block, [64]);
    LOCAL_ALIGNED_32(int16_t, block, [64]);
    LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
    LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
    int coordmap[64];
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -74,7 +74,7 @@ typedef struct DNXHDEncContext {
    unsigned min_padding;
    int intra_quant_bias;
    DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
    DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
    DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10
    DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -54,7 +54,7 @@ typedef struct MadContext {
    GetBitContext gb;
    void *bitstream_buf;
    unsigned int bitstream_buf_size;
    DECLARE_ALIGNED(16, int16_t, block)[64];
    DECLARE_ALIGNED(32, int16_t, block)[64];
    ScanTable scantable;
    uint16_t quant_matrix[64];
    int mb_x;
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -51,7 +51,7 @@ typedef struct TqiContext {
    uint16_t intra_matrix[64];
    int last_dc[3];
    DECLARE_ALIGNED(16, int16_t, block)[6][64];
    DECLARE_ALIGNED(32, int16_t, block)[6][64];
 } TqiContext;
 static av_cold int tqi_decode_init(AVCodecContext *avctx)
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -122,7 +122,7 @@ typedef struct JPGContext {
    VLC        dc_vlc[2], ac_vlc[2];
    int        prev_dc[3];
    DECLARE_ALIGNED(16, int16_t, block)[6][64];
    DECLARE_ALIGNED(32, int16_t, block)[6][64];
    uint8_t    *buf;
 } JPGContext;
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -574,7 +574,7 @@ not_coded:
 static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
    LOCAL_ALIGNED_16(int16_t, dblock, [64]);
    LOCAL_ALIGNED_32(int16_t, dblock, [64]);
    int i, mbi;
    int bli[6];
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -48,7 +48,7 @@ typedef struct MDECContext {
    int mb_width;
    int mb_height;
    int mb_x, mb_y;
    DECLARE_ALIGNED(16, int16_t, block)[6][64];
    DECLARE_ALIGNED(32, int16_t, block)[6][64];
    DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64];
    uint8_t *bitstream_buffer;
    unsigned int bitstream_buffer_size;
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -49,7 +49,7 @@ typedef struct MimicContext {
    ThreadFrame     frames     [16];
    DECLARE_ALIGNED(16, int16_t, dct_block)[64];
    DECLARE_ALIGNED(32, int16_t, dct_block)[64];
    GetBitContext   gb;
    ScanTable       scantable;
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -98,7 +98,7 @@ typedef struct MJpegDecodeContext {
    int got_picture;                                ///< we found a SOF and picture is valid, too.
    int linesize[MAX_COMPONENTS];                   ///< linesize << interlaced
    int8_t *qscale_table;
    DECLARE_ALIGNED(16, int16_t, block)[64];
    DECLARE_ALIGNED(32, int16_t, block)[64];
    int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
    uint8_t *last_nnz[MAX_COMPONENTS];
    uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -368,7 +368,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
                             const int16_t *qmat)
 {
    ProresContext *ctx = avctx->priv_data;
    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
    int16_t *block;
    GetBitContext gb;
    int i, blocks_per_slice = slice->mb_count<<2;
@@ -402,7 +402,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
                               const int16_t *qmat, int log2_blocks_per_mb)
 {
    ProresContext *ctx = avctx->priv_data;
    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
    int16_t *block;
    GetBitContext gb;
    int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
@@ -485,7 +485,7 @@ static void decode_slice_alpha(ProresContext *ctx,
 {
    GetBitContext gb;
    int i;
    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
    int16_t *block;
    for (i = 0; i < blocks_per_slice<<2; i++)
--- a/libavcodec/speedhq.c
+++ b/libavcodec/speedhq.c
@@ -224,7 +224,7 @@ static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l
 {
    const int *quant_matrix = s->quant_matrix;
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, block, [64]);
    LOCAL_ALIGNED_32(int16_t, block, [64]);
    int dc_offset;
    s->bdsp.clear_block(block);
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -51,7 +51,7 @@ typedef struct Wmv2Context {
    int hshift;
    ScanTable abt_scantable[2];
    DECLARE_ALIGNED(16, int16_t, abt_block2)[6][64];
    DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64];
 } Wmv2Context;
 void ff_wmv2_common_init(Wmv2Context *w);
--- a/libavcodec/x86/blockdsp.asm
+++ b/libavcodec/x86/blockdsp.asm
@@ -4,6 +4,8 @@
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2009 Fiona Glaser
 ;*
 ;* AVX version by Jokyo Images
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
@@ -39,20 +41,18 @@ cglobal clear_block, 1, 1, %1, blocks
    mova  [blocksq+mmsize*(1+%%i)], m0
    mova  [blocksq+mmsize*(2+%%i)], m0
    mova  [blocksq+mmsize*(3+%%i)], m0
    mova  [blocksq+mmsize*(4+%%i)], m0
    mova  [blocksq+mmsize*(5+%%i)], m0
    mova  [blocksq+mmsize*(6+%%i)], m0
    mova  [blocksq+mmsize*(7+%%i)], m0
 %assign %%i %%i+8
 %assign %%i %%i+4
 %endrep
    RET
 %endmacro
 INIT_MMX mmx
 %define ZERO pxor
 CLEAR_BLOCK 0, 2
 CLEAR_BLOCK 0, 4
 INIT_XMM sse
 %define ZERO xorps
 CLEAR_BLOCK 1, 2
 INIT_YMM avx
 CLEAR_BLOCK 1, 1
 ;-----------------------------------------
@@ -84,3 +84,5 @@ CLEAR_BLOCKS 0
 INIT_XMM sse
 %define ZERO xorps
 CLEAR_BLOCKS 1
 INIT_YMM avx
 CLEAR_BLOCKS 1
--- a/libavcodec/x86/blockdsp_init.c
+++ b/libavcodec/x86/blockdsp_init.c
@@ -28,8 +28,10 @@
 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);
 void ff_clear_block_avx(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 void ff_clear_blocks_avx(int16_t *blocks);
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                  AVCodecContext *avctx)
@@ -50,5 +52,9 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
        c->clear_block  = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        c->clear_block  = ff_clear_block_avx;
        c->clear_blocks = ff_clear_blocks_avx;
    }
 #endif /* HAVE_X86ASM */
 }
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -53,8 +53,8 @@ do {                                                                \
 void checkasm_check_blockdsp(void)
 {
    LOCAL_ALIGNED_16(uint16_t, buf0, [6 * 8 * 8]);
    LOCAL_ALIGNED_16(uint16_t, buf1, [6 * 8 * 8]);
    LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
    LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]);
    AVCodecContext avctx = { 0 };
    BlockDSPContext h;