These functions are mostly H264-specific (the only other user I can spot is bink), and this allows us to special-case some functionality for H264. Also remove the 16-bit-coeff with >8bpp versions (unused) and merge the duplicate 32-bit-coeff versions for >8bpp (identical). Signed-off-by: Michael Niedermayer <michaelni@gmx.at> tags/n1.2
| @@ -443,6 +443,27 @@ static void put_signed_pixels_clamped_c(const int16_t *block, | |||||
| } | } | ||||
| } | } | ||||
| static void add_pixels8_c(uint8_t *av_restrict pixels, | |||||
| int16_t *block, | |||||
| int line_size) | |||||
| { | |||||
| int i; | |||||
| for(i=0;i<8;i++) { | |||||
| pixels[0] += block[0]; | |||||
| pixels[1] += block[1]; | |||||
| pixels[2] += block[2]; | |||||
| pixels[3] += block[3]; | |||||
| pixels[4] += block[4]; | |||||
| pixels[5] += block[5]; | |||||
| pixels[6] += block[6]; | |||||
| pixels[7] += block[7]; | |||||
| pixels += line_size; | |||||
| block += 8; | |||||
| } | |||||
| } | |||||
| static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels, | static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels, | ||||
| int line_size) | int line_size) | ||||
| { | { | ||||
| @@ -2852,6 +2873,8 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->shrink[2]= ff_shrink44; | c->shrink[2]= ff_shrink44; | ||||
| c->shrink[3]= ff_shrink88; | c->shrink[3]= ff_shrink88; | ||||
| c->add_pixels8 = add_pixels8_c; | |||||
| #define hpel_funcs(prefix, idx, num) \ | #define hpel_funcs(prefix, idx, num) \ | ||||
| c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \ | c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \ | ||||
| c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \ | c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \ | ||||
| @@ -2879,9 +2902,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->get_pixels = FUNCC(get_pixels ## dct , depth);\ | c->get_pixels = FUNCC(get_pixels ## dct , depth);\ | ||||
| c->draw_edges = FUNCC(draw_edges , depth);\ | c->draw_edges = FUNCC(draw_edges , depth);\ | ||||
| c->clear_block = FUNCC(clear_block ## dct , depth);\ | c->clear_block = FUNCC(clear_block ## dct , depth);\ | ||||
| c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\ | |||||
| c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\ | |||||
| c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\ | |||||
| c->clear_blocks = FUNCC(clear_blocks ## dct , depth) | |||||
| switch (avctx->bits_per_raw_sample) { | switch (avctx->bits_per_raw_sample) { | ||||
| case 9: | case 9: | ||||
| @@ -155,7 +155,6 @@ typedef struct DSPContext { | |||||
| void (*put_signed_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | void (*put_signed_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | ||||
| void (*add_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | void (*add_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | ||||
| void (*add_pixels8)(uint8_t *pixels, int16_t *block, int line_size); | void (*add_pixels8)(uint8_t *pixels, int16_t *block, int line_size); | ||||
| void (*add_pixels4)(uint8_t *pixels, int16_t *block, int line_size); | |||||
| int (*sum_abs_dctelem)(int16_t *block/*align 16*/); | int (*sum_abs_dctelem)(int16_t *block/*align 16*/); | ||||
| /** | /** | ||||
| * translational global motion compensation. | * translational global motion compensation. | ||||
| @@ -89,48 +89,6 @@ static void FUNCC(get_pixels ## suffix)(int16_t *av_restrict _block, \ | |||||
| } \ | } \ | ||||
| } \ | } \ | ||||
| \ | \ | ||||
| static void FUNCC(add_pixels8 ## suffix)(uint8_t *av_restrict _pixels, \ | |||||
| int16_t *_block, \ | |||||
| int line_size) \ | |||||
| { \ | |||||
| int i; \ | |||||
| pixel *av_restrict pixels = (pixel *av_restrict)_pixels; \ | |||||
| dctcoef *block = (dctcoef*)_block; \ | |||||
| line_size /= sizeof(pixel); \ | |||||
| \ | |||||
| for(i=0;i<8;i++) { \ | |||||
| pixels[0] += block[0]; \ | |||||
| pixels[1] += block[1]; \ | |||||
| pixels[2] += block[2]; \ | |||||
| pixels[3] += block[3]; \ | |||||
| pixels[4] += block[4]; \ | |||||
| pixels[5] += block[5]; \ | |||||
| pixels[6] += block[6]; \ | |||||
| pixels[7] += block[7]; \ | |||||
| pixels += line_size; \ | |||||
| block += 8; \ | |||||
| } \ | |||||
| } \ | |||||
| \ | |||||
| static void FUNCC(add_pixels4 ## suffix)(uint8_t *av_restrict _pixels, \ | |||||
| int16_t *_block, \ | |||||
| int line_size) \ | |||||
| { \ | |||||
| int i; \ | |||||
| pixel *av_restrict pixels = (pixel *av_restrict)_pixels; \ | |||||
| dctcoef *block = (dctcoef*)_block; \ | |||||
| line_size /= sizeof(pixel); \ | |||||
| \ | |||||
| for(i=0;i<4;i++) { \ | |||||
| pixels[0] += block[0]; \ | |||||
| pixels[1] += block[1]; \ | |||||
| pixels[2] += block[2]; \ | |||||
| pixels[3] += block[3]; \ | |||||
| pixels += line_size; \ | |||||
| block += 4; \ | |||||
| } \ | |||||
| } \ | |||||
| \ | |||||
| static void FUNCC(clear_block ## suffix)(int16_t *block) \ | static void FUNCC(clear_block ## suffix)(int16_t *block) \ | ||||
| { \ | { \ | ||||
| memset(block, 0, sizeof(dctcoef)*64); \ | memset(block, 0, sizeof(dctcoef)*64); \ | ||||
| @@ -1818,7 +1818,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, | |||||
| if (IS_8x8DCT(mb_type)) { | if (IS_8x8DCT(mb_type)) { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| idct_dc_add = | idct_dc_add = | ||||
| idct_add = s->dsp.add_pixels8; | |||||
| idct_add = h->h264dsp.h264_add_pixels8; | |||||
| } else { | } else { | ||||
| idct_dc_add = h->h264dsp.h264_idct8_dc_add; | idct_dc_add = h->h264dsp.h264_idct8_dc_add; | ||||
| idct_add = h->h264dsp.h264_idct8_add; | idct_add = h->h264dsp.h264_idct8_add; | ||||
| @@ -1843,7 +1843,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, | |||||
| } else { | } else { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| idct_dc_add = | idct_dc_add = | ||||
| idct_add = s->dsp.add_pixels4; | |||||
| idct_add = h->h264dsp.h264_add_pixels4; | |||||
| } else { | } else { | ||||
| idct_dc_add = h->h264dsp.h264_idct_dc_add; | idct_dc_add = h->h264dsp.h264_idct_dc_add; | ||||
| idct_add = h->h264dsp.h264_idct_add; | idct_add = h->h264dsp.h264_idct_add; | ||||
| @@ -1942,9 +1942,9 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, | |||||
| for (i = 0; i < 16; i++) | for (i = 0; i < 16; i++) | ||||
| if (h->non_zero_count_cache[scan8[i + p * 16]] || | if (h->non_zero_count_cache[scan8[i + p * 16]] || | ||||
| dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | ||||
| s->dsp.add_pixels4(dest_y + block_offset[i], | |||||
| h->mb + (i * 16 + p * 256 << pixel_shift), | |||||
| linesize); | |||||
| h->h264dsp.h264_add_pixels4(dest_y + block_offset[i], | |||||
| h->mb + (i * 16 + p * 256 << pixel_shift), | |||||
| linesize); | |||||
| } | } | ||||
| } else { | } else { | ||||
| h->h264dsp.h264_idct_add16intra(dest_y, block_offset, | h->h264dsp.h264_idct_add16intra(dest_y, block_offset, | ||||
| @@ -1955,8 +1955,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, | |||||
| } else if (h->cbp & 15) { | } else if (h->cbp & 15) { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| const int di = IS_8x8DCT(mb_type) ? 4 : 1; | const int di = IS_8x8DCT(mb_type) ? 4 : 1; | ||||
| idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 | |||||
| : s->dsp.add_pixels4; | |||||
| idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8 | |||||
| : h->h264dsp.h264_add_pixels4; | |||||
| for (i = 0; i < 16; i += di) | for (i = 0; i < 16; i += di) | ||||
| if (h->non_zero_count_cache[scan8[i + p * 16]]) | if (h->non_zero_count_cache[scan8[i + p * 16]]) | ||||
| idct_add(dest_y + block_offset[i], | idct_add(dest_y + block_offset[i], | ||||
| @@ -205,7 +205,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) | |||||
| h->mb + (16 * 16 * 2 << PIXEL_SHIFT), | h->mb + (16 * 16 * 2 << PIXEL_SHIFT), | ||||
| uvlinesize); | uvlinesize); | ||||
| } else { | } else { | ||||
| idct_add = s->dsp.add_pixels4; | |||||
| idct_add = h->h264dsp.h264_add_pixels4; | |||||
| for (j = 1; j < 3; j++) { | for (j = 1; j < 3; j++) { | ||||
| for (i = j * 16; i < j * 16 + 4; i++) | for (i = j * 16; i < j * 16 + 4; i++) | ||||
| if (h->non_zero_count_cache[scan8[i]] || | if (h->non_zero_count_cache[scan8[i]] || | ||||
| @@ -0,0 +1,68 @@ | |||||
| /* | |||||
| * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |||||
| * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| /** | |||||
| * @file | |||||
| * H.264 / AVC / MPEG4 part10 DSP functions. | |||||
| * @author Michael Niedermayer <michaelni@gmx.at> | |||||
| */ | |||||
| #include "bit_depth_template.c" | |||||
| static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride) | |||||
| { | |||||
| int i; | |||||
| pixel *dst = (pixel *) _dst; | |||||
| dctcoef *src = (dctcoef *) _src; | |||||
| stride /= sizeof(pixel); | |||||
| for (i = 0; i < 4; i++) { | |||||
| dst[0] += src[0]; | |||||
| dst[1] += src[1]; | |||||
| dst[2] += src[2]; | |||||
| dst[3] += src[3]; | |||||
| dst += stride; | |||||
| src += 4; | |||||
| } | |||||
| } | |||||
| static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) | |||||
| { | |||||
| int i; | |||||
| pixel *dst = (pixel *) _dst; | |||||
| dctcoef *src = (dctcoef *) _src; | |||||
| stride /= sizeof(pixel); | |||||
| for (i = 0; i < 8; i++) { | |||||
| dst[0] += src[0]; | |||||
| dst[1] += src[1]; | |||||
| dst[2] += src[2]; | |||||
| dst[3] += src[3]; | |||||
| dst[4] += src[4]; | |||||
| dst[5] += src[5]; | |||||
| dst[6] += src[6]; | |||||
| dst[7] += src[7]; | |||||
| dst += stride; | |||||
| src += 8; | |||||
| } | |||||
| } | |||||
| @@ -52,11 +52,29 @@ | |||||
| #include "h264dsp_template.c" | #include "h264dsp_template.c" | ||||
| #undef BIT_DEPTH | #undef BIT_DEPTH | ||||
| #define BIT_DEPTH 8 | |||||
| #include "h264addpx_template.c" | |||||
| #undef BIT_DEPTH | |||||
| #define BIT_DEPTH 16 | |||||
| #include "h264addpx_template.c" | |||||
| #undef BIT_DEPTH | |||||
| void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) | void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) | ||||
| { | { | ||||
| #undef FUNC | #undef FUNC | ||||
| #define FUNC(a, depth) a ## _ ## depth ## _c | #define FUNC(a, depth) a ## _ ## depth ## _c | ||||
| #define ADDPX_DSP(depth) \ | |||||
| c->h264_add_pixels4 = FUNC(ff_h264_add_pixels4, depth);\ | |||||
| c->h264_add_pixels8 = FUNC(ff_h264_add_pixels8, depth) | |||||
| if (bit_depth > 8 && bit_depth <= 16) { | |||||
| ADDPX_DSP(16); | |||||
| } else { | |||||
| ADDPX_DSP(8); | |||||
| } | |||||
| #define H264_DSP(depth) \ | #define H264_DSP(depth) \ | ||||
| c->h264_idct_add= FUNC(ff_h264_idct_add, depth);\ | c->h264_idct_add= FUNC(ff_h264_idct_add, depth);\ | ||||
| c->h264_idct8_add= FUNC(ff_h264_idct8_add, depth);\ | c->h264_idct8_add= FUNC(ff_h264_idct8_add, depth);\ | ||||
| @@ -101,6 +101,10 @@ typedef struct H264DSPContext { | |||||
| void (*h264_luma_dc_dequant_idct)(int16_t *output, | void (*h264_luma_dc_dequant_idct)(int16_t *output, | ||||
| int16_t *input /*align 16*/, int qmul); | int16_t *input /*align 16*/, int qmul); | ||||
| void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); | void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); | ||||
| /* bypass-transform */ | |||||
| void (*h264_add_pixels8)(uint8_t *dst, int16_t *block, int stride); | |||||
| void (*h264_add_pixels4)(uint8_t *dst, int16_t *block, int stride); | |||||
| } H264DSPContext; | } H264DSPContext; | ||||
| void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||