From f188a1e0ca12822fd6c607924169d678c7254838 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Sun, 5 Jun 2011 18:33:23 -0400 Subject: [PATCH 1/5] H.264: Add x86 assembly for 10-bit MC Chroma H.264 functions. Mainly ported from 8-bit H.264 MC Chroma. Signed-off-by: Ronald S. Bultje --- libavcodec/x86/Makefile | 1 + libavcodec/x86/dsputil_mmx.c | 32 +++ libavcodec/x86/h264_chromamc_10bit.asm | 273 +++++++++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 libavcodec/x86/h264_chromamc_10bit.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 1c451c8352..ea57bd1db6 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ x86/deinterlace.o \ x86/fmtconvert.o \ x86/h264_chromamc.o \ + x86/h264_chromamc_10bit.o \ $(YASM-OBJS-yes) MMX-OBJS-$(CONFIG_FFT) += x86/fft.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 214c6a3945..b174b8393f 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -1938,6 +1938,19 @@ void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); +#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ +void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, uint8_t *src,\ + int stride, int h, int x, int y); + +CHROMA_MC(put, 2, 10, mmxext) +CHROMA_MC(avg, 2, 10, mmxext) +CHROMA_MC(put, 4, 10, mmxext) +CHROMA_MC(avg, 4, 10, mmxext) +CHROMA_MC(put, 8, 10, sse2) +CHROMA_MC(avg, 8, 10, sse2) +CHROMA_MC(put, 8, 10, avx) +CHROMA_MC(avg, 8, 10, avx) /* CAVS specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { @@ -2420,6 +2433,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int 
bit_depth = avctx->bits_per_raw_sample; if (avctx->dsp_mask) { if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) @@ -2651,6 +2665,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2; c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2; } + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext; + c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext; + } c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; #endif @@ -2756,6 +2776,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 2, sse2); H264_QPEL_FUNCS(3, 3, sse2); } + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2; + } } #if HAVE_SSSE3 if(mm_flags & AV_CPU_FLAG_SSSE3){ @@ -2854,6 +2878,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } #endif } +#if HAVE_AVX + if (mm_flags & AV_CPU_FLAG_AVX) { + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; + } + } +#endif } if (CONFIG_ENCODERS) diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm new file mode 100644 index 0000000000..9d075434fe --- /dev/null +++ b/libavcodec/x86/h264_chromamc_10bit.asm @@ -0,0 +1,273 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang +;* +;* This file is part of Libav. 
+;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +cextern pw_4 +cextern pw_8 +cextern pw_32 +cextern pw_64 + +SECTION .text + + +%macro MV0_PIXELS_MC8 0 + lea r4, [r2*3 ] ; r4 = 3*stride + lea r5, [r2*4 ] ; r5 = 4*stride +.next4rows + movu m0, [r1 ] + movu m1, [r1+r2 ] + CHROMAMC_AVG m0, [r0 ] + CHROMAMC_AVG m1, [r0+r2 ] + mova [r0 ], m0 + mova [r0+r2 ], m1 + movu m0, [r1+r2*2] + movu m1, [r1+r4 ] + CHROMAMC_AVG m0, [r0+r2*2] + CHROMAMC_AVG m1, [r0+r4 ] + mova [r0+r2*2], m0 + mova [r0+r4 ], m1 + add r1, r5 + add r0, r5 + sub r3d, 4 ; four rows are copied/averaged per iteration + jne .next4rows +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +%macro CHROMA_MC8 2 +; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 16*/, uint8_t *src /*align 1*/, +; int stride, int h, int mx, int my) +cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 + movsxdifnidn r2, r2d + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + MV0_PIXELS_MC8 + REP_RET + +.at_least_one_non_zero + mov r6d, 2 ; dxy = 2 bytes (next pixel) unless my != 0 below + test
r5d, r5d + je .x_interpolation + mov r6, r2 ; dxy = x ? 2 : stride + test r4d, r4d + jne .xy_interpolation +.x_interpolation + ; mx == 0 XOR my == 0 - 1 dimensional filter only + or r4d, r5d ; x + y + movd m5, r4d + mova m4, [pw_8] + mova m6, [pw_4] ; mm6 = rnd >> 3 + SPLATW m5, m5 ; mm5 = B = x + psubw m4, m5 ; mm4 = A = 8-x + +.next1drow + movu m0, [r1 ] ; mm0 = src[0..7] + movu m2, [r1+r6] ; mm2 = src[0..7] at +dxy (next pixel or next row) + + pmullw m0, m4 ; mm0 = A * src[0..7] + pmullw m2, m5 ; mm2 = B * src[0..7] at +dxy + + paddw m0, m6 + paddw m0, m2 + psrlw m0, 3 + CHROMAMC_AVG m0, [r0] + mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 + + add r0, r2 + add r1, r2 + dec r3d + jne .next1drow + REP_RET + +.xy_interpolation ; general case, bilinear + movd m4, r4m ; x + movd m6, r5m ; y + + SPLATW m4, m4 ; mm4 = x words + SPLATW m6, m6 ; mm6 = y words + psllw m5, m4, 3 ; mm5 = 8x + pmullw m4, m6 ; mm4 = x * y + psllw m6, 3 ; mm6 = 8y + paddw m1, m5, m6 ; mm1 = 8x+8y + mova m7, m4 ; mm7 = D = x * y + psubw m5, m4 ; mm5 = B = 8x - xy + psubw m6, m4 ; mm6 = C = 8y - xy + paddw m4, [pw_64] + psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64 + + movu m0, [r1 ] ; mm0 = src[0..7] + movu m1, [r1+2] ; mm1 = src[1..8] +.next2drow + add r1, r2 + + pmullw m2, m0, m4 + pmullw m1, m5 + paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8] + + movu m0, [r1] ; next row, reused as the top row of the following iteration + movu m1, [r1+2] + pmullw m3, m0, m6 + paddw m2, m3 ; mm2 += C * src[0..7+stride] + pmullw m3, m1, m7 + paddw m2, m3 ; mm2 += D * src[1..8+stride] + + paddw m2, [pw_32] + psrlw m2, 6 + CHROMAMC_AVG m2, [r0] + mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6 + + add r0, r2 + dec r3d + jne .next2drow + REP_RET +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +;TODO: xmm mc4 +%macro MC4_OP 2 + movq %1, [r1 ] ; %1 = src[0..3] of the next row + movq m1, [r1+2] ; m1 = src[1..4] of the next row + add r1, r2 +
pmullw %1, m4 + pmullw m1, m2 + paddw m1, %1 ; m1 = (8-x)*src[0..3] + x*src[1..4] + mova %1, m1 ; %1 = this row's horizontal result, kept for the next call + + pmullw %2, m5 ; %2 = previous row's horizontal result * (8-y) + pmullw m1, m3 ; m1 = this row's horizontal result * y + paddw %2, [pw_32] + paddw m1, %2 + psrlw m1, 6 + CHROMAMC_AVG m1, %2, [r0] ; avg variant loads dst into %2, which is no longer needed + movq [r0], m1 + add r0, r2 +%endmacro + +%macro CHROMA_MC4 2 +cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 + movsxdifnidn r2, r2d + movd m2, r4m ; x + movd m3, r5m ; y + mova m4, [pw_8] + mova m5, m4 + SPLATW m2, m2 + SPLATW m3, m3 + psubw m4, m2 ; m4 = 8-x + psubw m5, m3 ; m5 = 8-y + + movq m0, [r1 ] + movq m6, [r1+2] + add r1, r2 + pmullw m0, m4 + pmullw m6, m2 + paddw m6, m0 ; m6 = horizontal result of the first row + +.next2rows + MC4_OP m0, m6 + MC4_OP m6, m0 + sub r3d, 2 + jnz .next2rows + REP_RET +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +%macro CHROMA_MC2 2 +cglobal %1_h264_chroma_mc2_10_%2, 6,7 + movsxdifnidn r2, r2d + mov r6d, r4d + shl r4d, 16 + sub r4d, r6d + add r4d, 8 + imul r5d, r4d ; x*y<<16 | y*(8-x) + shl r4d, 3 + sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) + + movd m5, r4d + movd m6, r5d + punpckldq m5, m5 ; mm5 = {A,B,A,B} + punpckldq m6, m6 ; mm6 = {C,D,C,D} + pxor m7, m7 + pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2] + +.nextrow + add r1, r2 + movq m1, m2 + pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] + pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2] + movq m2, m0 ; keep this row as the top row of the next iteration + pmaddwd m0, m6 + paddw m1, [pw_32] + paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] + psrlw m1, 6 + packssdw m1, m7 ; pack the two dword results into words + CHROMAMC_AVG m1, m3, [r0] + movd [r0], m1 + add r0, r2 + dec r3d + jnz .nextrow + REP_RET +%endmacro + +%macro NOTHING 2-3 +%endmacro +%macro AVG 2-3 +%if %0==3 + movq %2, %3 ; three-argument form: load %3 into scratch register %2 first +%endif + PAVG %1, %2 +%endmacro + +%define CHROMAMC_AVG NOTHING +INIT_XMM +CHROMA_MC8 put, sse2 +%ifdef HAVE_AVX +INIT_AVX +CHROMA_MC8 put, avx +%endif +INIT_MMX +CHROMA_MC4 put, mmxext +CHROMA_MC2 put, mmxext + +%define CHROMAMC_AVG AVG +%define PAVG pavgw +INIT_XMM
+CHROMA_MC8 avg, sse2 +%ifdef HAVE_AVX +INIT_AVX +CHROMA_MC8 avg, avx +%endif +INIT_MMX +CHROMA_MC4 avg, mmxext +CHROMA_MC2 avg, mmxext From ed63f527f28d1d73589a9e0bac3aed2197f14887 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 18 Jun 2011 08:34:14 -0400 Subject: [PATCH 2/5] Fix build if yasm is not available. --- libavcodec/x86/dsputil_mmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index b174b8393f..5c5ecb2d65 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2776,10 +2776,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 2, sse2); H264_QPEL_FUNCS(3, 3, sse2); } +#if HAVE_YASM if (bit_depth == 10) { c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2; c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2; } +#endif } #if HAVE_SSSE3 if(mm_flags & AV_CPU_FLAG_SSSE3){ @@ -2878,7 +2880,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } #endif } -#if HAVE_AVX +#if HAVE_AVX && HAVE_YASM if (mm_flags & AV_CPU_FLAG_AVX) { if (bit_depth == 10) { c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; From 84bd2b4bf5ca544e29c48634ac8b2899c58b0d9d Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 18 Jun 2011 17:26:40 +0200 Subject: [PATCH 3/5] lavf: add a forgotten NULL check in convert_format_parameters(). 
--- libavformat/utils.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libavformat/utils.c b/libavformat/utils.c index e3485e6ffa..81bc9b70c3 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -395,6 +395,9 @@ static AVDictionary *convert_format_parameters(AVFormatParameters *ap) char buf[1024]; AVDictionary *opts = NULL; + if (!ap) + return NULL; + if (ap->time_base.num) { snprintf(buf, sizeof(buf), "%d/%d", ap->time_base.den, ap->time_base.num); av_dict_set(&opts, "framerate", buf, 0); From bed31c7e3c530f9237bbb9fb96507be5977d9f65 Mon Sep 17 00:00:00 2001 From: Reinhard Tartler Date: Thu, 16 Jun 2011 19:59:47 +0200 Subject: [PATCH 4/5] more Changelog additions --- Changelog | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Changelog b/Changelog index 38f07b1440..a5044498a5 100644 --- a/Changelog +++ b/Changelog @@ -2,23 +2,27 @@ Entries are sorted chronologically from oldest to youngest within each release, releases are sorted from youngest to oldest. -version : +version 0.7: - E-AC-3 audio encoder - ac3enc: add channel coupling support - floating-point sample format support to the ac3, eac3, dca, aac, and vorbis decoders. - H264/MPEG frame-level multi-threading - All av_metadata_* functions renamed to av_dict_* and moved to libavutil +- 4:4:4 H.264 decoding support +- 10-bit H.264 optimizations for x86 version 0.7_beta2: +- VP8 frame-multithreading +- NEON optimizations for VP8 - Lots of deprecated API cruft removed - fft and imdct optimizations for AVX (Sandy Bridge) processors - DPX image encoder - SMPTE 302M AES3 audio decoder - Remove support for quitting ffmpeg with 'q', ctrl+c should be used. -- 9bit and 10bit per sample support in the h264 decoder +- 9bit and 10bit per sample support in the H.264 decoder version 0.7_beta1: From dbafb0e06faa092f60e53d845957fbab7f2a3f2d Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Sat, 18 Jun 2011 15:33:49 -0400 Subject: [PATCH 5/5] lavf: prevent crash in av_open_input_file() if ap == NULL. Needed for proper behaviour in our old API compatibility code. --- libavformat/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/utils.c b/libavformat/utils.c index 81bc9b70c3..7370c60bdf 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -549,7 +549,7 @@ int av_open_input_file(AVFormatContext **ic_ptr, const char *filename, int err; AVDictionary *opts = convert_format_parameters(ap); - if (!ap->prealloced_context) + if (!ap || !ap->prealloced_context) *ic_ptr = NULL; err = avformat_open_input(ic_ptr, filename, fmt, &opts);