Also modify the required alignment to 32 instead of 16 for several codecs.
Signed-off-by: James Almer <jamrial@gmail.com>
tags/n3.4
| @@ -54,7 +54,7 @@ typedef struct ASV1Context { | |||||
| int mb_height; | int mb_height; | ||||
| int mb_width2; | int mb_width2; | ||||
| int mb_height2; | int mb_height2; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[6][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[6][64]; | |||||
| uint16_t intra_matrix[64]; | uint16_t intra_matrix[64]; | ||||
| int q_intra_matrix[64]; | int q_intra_matrix[64]; | ||||
| uint8_t *bitstream_buffer; | uint8_t *bitstream_buffer; | ||||
| @@ -813,7 +813,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, | |||||
| int v, col[2]; | int v, col[2]; | ||||
| const uint8_t *scan; | const uint8_t *scan; | ||||
| int xoff, yoff; | int xoff, yoff; | ||||
| LOCAL_ALIGNED_16(int16_t, block, [64]); | |||||
| LOCAL_ALIGNED_32(int16_t, block, [64]); | |||||
| LOCAL_ALIGNED_16(int32_t, dctblock, [64]); | LOCAL_ALIGNED_16(int32_t, dctblock, [64]); | ||||
| int coordmap[64]; | int coordmap[64]; | ||||
| int ybias = is_key ? -15 : 0; | int ybias = is_key ? -15 : 0; | ||||
| @@ -976,7 +976,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, | |||||
| uint8_t *dst, *prev, *ref_start, *ref_end; | uint8_t *dst, *prev, *ref_start, *ref_end; | ||||
| int v, col[2]; | int v, col[2]; | ||||
| const uint8_t *scan; | const uint8_t *scan; | ||||
| LOCAL_ALIGNED_16(int16_t, block, [64]); | |||||
| LOCAL_ALIGNED_32(int16_t, block, [64]); | |||||
| LOCAL_ALIGNED_16(uint8_t, ublock, [64]); | LOCAL_ALIGNED_16(uint8_t, ublock, [64]); | ||||
| LOCAL_ALIGNED_16(int32_t, dctblock, [64]); | LOCAL_ALIGNED_16(int32_t, dctblock, [64]); | ||||
| int coordmap[64]; | int coordmap[64]; | ||||
| @@ -74,7 +74,7 @@ typedef struct DNXHDEncContext { | |||||
| unsigned min_padding; | unsigned min_padding; | ||||
| int intra_quant_bias; | int intra_quant_bias; | ||||
| DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, blocks)[12][64]; | |||||
| DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10 | DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10 | ||||
| DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10 | DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10 | ||||
| @@ -54,7 +54,7 @@ typedef struct MadContext { | |||||
| GetBitContext gb; | GetBitContext gb; | ||||
| void *bitstream_buf; | void *bitstream_buf; | ||||
| unsigned int bitstream_buf_size; | unsigned int bitstream_buf_size; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[64]; | |||||
| ScanTable scantable; | ScanTable scantable; | ||||
| uint16_t quant_matrix[64]; | uint16_t quant_matrix[64]; | ||||
| int mb_x; | int mb_x; | ||||
| @@ -51,7 +51,7 @@ typedef struct TqiContext { | |||||
| uint16_t intra_matrix[64]; | uint16_t intra_matrix[64]; | ||||
| int last_dc[3]; | int last_dc[3]; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[6][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[6][64]; | |||||
| } TqiContext; | } TqiContext; | ||||
| static av_cold int tqi_decode_init(AVCodecContext *avctx) | static av_cold int tqi_decode_init(AVCodecContext *avctx) | ||||
| @@ -122,7 +122,7 @@ typedef struct JPGContext { | |||||
| VLC dc_vlc[2], ac_vlc[2]; | VLC dc_vlc[2], ac_vlc[2]; | ||||
| int prev_dc[3]; | int prev_dc[3]; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[6][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[6][64]; | |||||
| uint8_t *buf; | uint8_t *buf; | ||||
| } JPGContext; | } JPGContext; | ||||
| @@ -574,7 +574,7 @@ not_coded: | |||||
| static int h263_skip_b_part(MpegEncContext *s, int cbp) | static int h263_skip_b_part(MpegEncContext *s, int cbp) | ||||
| { | { | ||||
| LOCAL_ALIGNED_16(int16_t, dblock, [64]); | |||||
| LOCAL_ALIGNED_32(int16_t, dblock, [64]); | |||||
| int i, mbi; | int i, mbi; | ||||
| int bli[6]; | int bli[6]; | ||||
| @@ -48,7 +48,7 @@ typedef struct MDECContext { | |||||
| int mb_width; | int mb_width; | ||||
| int mb_height; | int mb_height; | ||||
| int mb_x, mb_y; | int mb_x, mb_y; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[6][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[6][64]; | |||||
| DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64]; | DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64]; | ||||
| uint8_t *bitstream_buffer; | uint8_t *bitstream_buffer; | ||||
| unsigned int bitstream_buffer_size; | unsigned int bitstream_buffer_size; | ||||
| @@ -49,7 +49,7 @@ typedef struct MimicContext { | |||||
| ThreadFrame frames [16]; | ThreadFrame frames [16]; | ||||
| DECLARE_ALIGNED(16, int16_t, dct_block)[64]; | |||||
| DECLARE_ALIGNED(32, int16_t, dct_block)[64]; | |||||
| GetBitContext gb; | GetBitContext gb; | ||||
| ScanTable scantable; | ScanTable scantable; | ||||
| @@ -98,7 +98,7 @@ typedef struct MJpegDecodeContext { | |||||
| int got_picture; ///< we found a SOF and picture is valid, too. | int got_picture; ///< we found a SOF and picture is valid, too. | ||||
| int linesize[MAX_COMPONENTS]; ///< linesize << interlaced | int linesize[MAX_COMPONENTS]; ///< linesize << interlaced | ||||
| int8_t *qscale_table; | int8_t *qscale_table; | ||||
| DECLARE_ALIGNED(16, int16_t, block)[64]; | |||||
| DECLARE_ALIGNED(32, int16_t, block)[64]; | |||||
| int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode) | int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode) | ||||
| uint8_t *last_nnz[MAX_COMPONENTS]; | uint8_t *last_nnz[MAX_COMPONENTS]; | ||||
| uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) | uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) | ||||
| @@ -368,7 +368,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice, | |||||
| const int16_t *qmat) | const int16_t *qmat) | ||||
| { | { | ||||
| ProresContext *ctx = avctx->priv_data; | ProresContext *ctx = avctx->priv_data; | ||||
| LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); | |||||
| LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); | |||||
| int16_t *block; | int16_t *block; | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| int i, blocks_per_slice = slice->mb_count<<2; | int i, blocks_per_slice = slice->mb_count<<2; | ||||
| @@ -402,7 +402,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice, | |||||
| const int16_t *qmat, int log2_blocks_per_mb) | const int16_t *qmat, int log2_blocks_per_mb) | ||||
| { | { | ||||
| ProresContext *ctx = avctx->priv_data; | ProresContext *ctx = avctx->priv_data; | ||||
| LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); | |||||
| LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); | |||||
| int16_t *block; | int16_t *block; | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb; | int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb; | ||||
| @@ -485,7 +485,7 @@ static void decode_slice_alpha(ProresContext *ctx, | |||||
| { | { | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| int i; | int i; | ||||
| LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); | |||||
| LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); | |||||
| int16_t *block; | int16_t *block; | ||||
| for (i = 0; i < blocks_per_slice<<2; i++) | for (i = 0; i < blocks_per_slice<<2; i++) | ||||
| @@ -224,7 +224,7 @@ static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l | |||||
| { | { | ||||
| const int *quant_matrix = s->quant_matrix; | const int *quant_matrix = s->quant_matrix; | ||||
| const uint8_t *scantable = s->intra_scantable.permutated; | const uint8_t *scantable = s->intra_scantable.permutated; | ||||
| LOCAL_ALIGNED_16(int16_t, block, [64]); | |||||
| LOCAL_ALIGNED_32(int16_t, block, [64]); | |||||
| int dc_offset; | int dc_offset; | ||||
| s->bdsp.clear_block(block); | s->bdsp.clear_block(block); | ||||
| @@ -51,7 +51,7 @@ typedef struct Wmv2Context { | |||||
| int hshift; | int hshift; | ||||
| ScanTable abt_scantable[2]; | ScanTable abt_scantable[2]; | ||||
| DECLARE_ALIGNED(16, int16_t, abt_block2)[6][64]; | |||||
| DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64]; | |||||
| } Wmv2Context; | } Wmv2Context; | ||||
| void ff_wmv2_common_init(Wmv2Context *w); | void ff_wmv2_common_init(Wmv2Context *w); | ||||
| @@ -4,6 +4,8 @@ | |||||
| ;* Copyright (c) 2008 Loren Merritt | ;* Copyright (c) 2008 Loren Merritt | ||||
| ;* Copyright (c) 2009 Fiona Glaser | ;* Copyright (c) 2009 Fiona Glaser | ||||
| ;* | ;* | ||||
| ;* AVX version by Jokyo Images | |||||
| ;* | |||||
| ;* This file is part of FFmpeg. | ;* This file is part of FFmpeg. | ||||
| ;* | ;* | ||||
| ;* FFmpeg is free software; you can redistribute it and/or | ;* FFmpeg is free software; you can redistribute it and/or | ||||
| @@ -39,20 +41,18 @@ cglobal clear_block, 1, 1, %1, blocks | |||||
| mova [blocksq+mmsize*(1+%%i)], m0 | mova [blocksq+mmsize*(1+%%i)], m0 | ||||
| mova [blocksq+mmsize*(2+%%i)], m0 | mova [blocksq+mmsize*(2+%%i)], m0 | ||||
| mova [blocksq+mmsize*(3+%%i)], m0 | mova [blocksq+mmsize*(3+%%i)], m0 | ||||
| mova [blocksq+mmsize*(4+%%i)], m0 | |||||
| mova [blocksq+mmsize*(5+%%i)], m0 | |||||
| mova [blocksq+mmsize*(6+%%i)], m0 | |||||
| mova [blocksq+mmsize*(7+%%i)], m0 | |||||
| %assign %%i %%i+8 | |||||
| %assign %%i %%i+4 | |||||
| %endrep | %endrep | ||||
| RET | RET | ||||
| %endmacro | %endmacro | ||||
| INIT_MMX mmx | INIT_MMX mmx | ||||
| %define ZERO pxor | %define ZERO pxor | ||||
| CLEAR_BLOCK 0, 2 | |||||
| CLEAR_BLOCK 0, 4 | |||||
| INIT_XMM sse | INIT_XMM sse | ||||
| %define ZERO xorps | %define ZERO xorps | ||||
| CLEAR_BLOCK 1, 2 | |||||
| INIT_YMM avx | |||||
| CLEAR_BLOCK 1, 1 | CLEAR_BLOCK 1, 1 | ||||
| ;----------------------------------------- | ;----------------------------------------- | ||||
| @@ -84,3 +84,5 @@ CLEAR_BLOCKS 0 | |||||
| INIT_XMM sse | INIT_XMM sse | ||||
| %define ZERO xorps | %define ZERO xorps | ||||
| CLEAR_BLOCKS 1 | CLEAR_BLOCKS 1 | ||||
| INIT_YMM avx | |||||
| CLEAR_BLOCKS 1 | |||||
| @@ -28,8 +28,10 @@ | |||||
| void ff_clear_block_mmx(int16_t *block); | void ff_clear_block_mmx(int16_t *block); | ||||
| void ff_clear_block_sse(int16_t *block); | void ff_clear_block_sse(int16_t *block); | ||||
| void ff_clear_block_avx(int16_t *block); | |||||
| void ff_clear_blocks_mmx(int16_t *blocks); | void ff_clear_blocks_mmx(int16_t *blocks); | ||||
| void ff_clear_blocks_sse(int16_t *blocks); | void ff_clear_blocks_sse(int16_t *blocks); | ||||
| void ff_clear_blocks_avx(int16_t *blocks); | |||||
| av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, | av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, | ||||
| AVCodecContext *avctx) | AVCodecContext *avctx) | ||||
| @@ -50,5 +52,9 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, | |||||
| c->clear_block = ff_clear_block_sse; | c->clear_block = ff_clear_block_sse; | ||||
| c->clear_blocks = ff_clear_blocks_sse; | c->clear_blocks = ff_clear_blocks_sse; | ||||
| } | } | ||||
| if (EXTERNAL_AVX_FAST(cpu_flags)) { | |||||
| c->clear_block = ff_clear_block_avx; | |||||
| c->clear_blocks = ff_clear_blocks_avx; | |||||
| } | |||||
| #endif /* HAVE_X86ASM */ | #endif /* HAVE_X86ASM */ | ||||
| } | } | ||||
| @@ -53,8 +53,8 @@ do { \ | |||||
| void checkasm_check_blockdsp(void) | void checkasm_check_blockdsp(void) | ||||
| { | { | ||||
| LOCAL_ALIGNED_16(uint16_t, buf0, [6 * 8 * 8]); | |||||
| LOCAL_ALIGNED_16(uint16_t, buf1, [6 * 8 * 8]); | |||||
| LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]); | |||||
| LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]); | |||||
| AVCodecContext avctx = { 0 }; | AVCodecContext avctx = { 0 }; | ||||
| BlockDSPContext h; | BlockDSPContext h; | ||||