Merge remote-tracking branch 'qatar/master'

* qatar/master:
  vc1dec: Remove separate scaling function for interlaced field MVs
  vc1dec: Invoke edge_emulation regardless of MV precision
  x86: Use consistent 3dnowext function and macro name suffixes
  g723_1: scale output as supposed for the case with postfilter disabled
  g723_1: increase excitation storage by 4
  g723_1: fix upper bound parameter from inverse maximum autocorrelation
  g723_1: make scale_vector() behave like the reference
  g723_1: fix off-by-one error in normalize_bits()
  g723_1: save/restore excitation with offset to store LPC history
  wmapro: prevent division by zero when sample rate is unspecified
  x86: proresdsp: improve SIGNEXTEND macro comments
  x86: h264dsp: K&R formatting cosmetics
  LICENSE: Document all GPL files

Conflicts:
  libavcodec/g723_1.c
  libavcodec/wmaprodec.c
  libavcodec/x86/h264dsp_mmx.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
13 years ago · a7acab6cda
--- a/LICENSE
+++ b/LICENSE
@@ -17,6 +17,14 @@ Specifically, the GPL parts of FFmpeg are
 - optional x86 optimizations in the files
  libavcodec/x86/idct_mmx.c
 - the X11 grabber in libavdevice/x11grab.c
 - the texi2pod.pl tool
 - the following filters in libavfilter:
    - vf_blackframe.c
    - vf_boxblur.c
    - vf_cropdetect.c
    - vf_delogo.c
    - vf_hqdn3d.c
    - vf_yadif.c
 There are a handful of files under other licensing terms, namely:
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -52,7 +52,7 @@ typedef struct g723_1_context {
    int16_t prev_lsp[LPC_ORDER];
    int16_t prev_excitation[PITCH_MAX];
    int16_t excitation[PITCH_MAX + FRAME_LEN];
    int16_t excitation[PITCH_MAX + FRAME_LEN + 4];
    int16_t synth_mem[LPC_ORDER];
    int16_t fir_mem[LPC_ORDER];
    int     iir_mem[LPC_ORDER];
@@ -267,8 +267,10 @@ static int scale_vector(int16_t *vector, int length)
    bits  = normalize_bits(max, 15);
    scale = shift_table[bits];
    for (i = 0; i < length; i++)
    for (i = 0; i < length; i++) {
        av_assert2(av_clipl_int32(vector[i] * (int64_t)scale << 1) == vector[i] * (int64_t)scale << 1);
        vector[i] = (vector[i] * scale) >> 3;
    }
    return bits - 3;
 }
@@ -592,7 +594,10 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
    int i;
    pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag);
    limit     = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
    if (dir > 0)
        limit = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
    else
        limit = pitch_lag + 3;
    for (i = pitch_lag - 3; i <= limit; i++) {
        ccr = ff_dot_product(buf, buf + dir * i, length)<<1;
@@ -967,7 +972,6 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
    G723_1_Context *p  = avctx->priv_data;
    const uint8_t *buf = avpkt->data;
    int buf_size       = avpkt->size;
    int16_t *out;
    int dec_mode       = buf[0] & 3;
    PPFParam ppf[SUBFRAMES];
@@ -975,6 +979,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
    int16_t lpc[SUBFRAMES * LPC_ORDER];
    int16_t acb_vector[SUBFRAME_LEN];
    int16_t *vector_ptr;
    int16_t *out;
    int bad_frame = 0, i, j, ret;
    if (!buf_size || buf_size < frame_size[dec_mode]) {
@@ -995,8 +1000,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
        return ret;
    }
    out= (int16_t*)p->frame.data[0];
    out = (int16_t *)p->frame.data[0];
    if (p->cur_frame_type == ACTIVE_FRAME) {
        if (!bad_frame)
@@ -1079,7 +1084,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
        memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
               PITCH_MAX * sizeof(*p->excitation));
    } else {
        memset(out, 0, sizeof(int16_t)*FRAME_LEN);
        memset(out, 0, FRAME_LEN * 2);
        av_log(avctx, AV_LOG_WARNING,
               "G.723.1: Comfort noise generation not supported yet\n");
        return frame_size[dec_mode];
@@ -1094,13 +1099,18 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                    0, 1, 1 << 12);
    memcpy(p->synth_mem, out + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
    if (p->postfilter)
    if (p->postfilter) {
        formant_postfilter(p, lpc, out);
    } else { // if output is not postfiltered it should be scaled by 2
        for (i = 0; i < FRAME_LEN; i++)
            out[LPC_ORDER + i] = av_clip_int16(out[LPC_ORDER + i] << 1);
    }
    memmove(out, out + LPC_ORDER, sizeof(int16_t)*FRAME_LEN);
    p->frame.nb_samples = FRAME_LEN;
    *(AVFrame*)data = p->frame;
    *got_frame_ptr = 1;
    *got_frame_ptr   = 1;
    *(AVFrame *)data = p->frame;
    return frame_size[dec_mode];
 }
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -1882,8 +1882,8 @@ static void vc1_interp_mc(VC1Context *v)
    }
    if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22
        || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 16 - s->mspel * 3
        || (unsigned)(src_y - s->mspel) > v_edge_pos    - (my & 3) - 16 - s->mspel * 3) {
        || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
        || (unsigned)(src_y - 1) > v_edge_pos    - (my & 3) - 16 - 3) {
        uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize;
        srcY -= s->mspel * (1 + s->linesize);
@@ -1979,20 +1979,6 @@ static av_always_inline int scale_mv(int value, int bfrac, int inv, int qs)
 #endif
 }
 /**
  * Scale a motion vector component for interlaced-field pictures.
  *
  * The multiplier starts as bfrac, is reduced by 256 when inv is set, and is
  * doubled when qs_last is zero. The product is then rounded and shifted down.
  *
  * @param value   motion vector component to scale
  * @param bfrac   base multiplier (presumably the B-fraction on a 0..256
  *                scale -- confirm against the caller)
  * @param inv     when nonzero, use (bfrac - 256) as the multiplier
  * @param qs      selects the rounding: zero gives (x + 255) >> 9, nonzero
  *                gives (x + 128) >> 8
  * @param qs_last when zero, the multiplier is doubled before use
  * @return the scaled component
  */
 static av_always_inline int scale_mv_intfi(int value, int bfrac, int inv,
                                           int qs, int qs_last)
 {
    int n = bfrac;
    if (inv)
        n -= 256;
    /* n may be negative here; left-shifting a negative int is undefined
     * behaviour, so double it by multiplication instead. */
    if (!qs_last)
        n *= 2;
    if (!qs)
        return (value * n + 255) >> 9;
    else
        return (value * n + 128) >> 8;
 }
 /** Reconstruct motion vector for B-frame and do motion compensation
 */
 static inline void vc1_b_mc(VC1Context *v, int dmv_x[2], int dmv_y[2],
@@ -2246,14 +2232,14 @@ static inline void vc1_pred_b_mv_intfi(VC1Context *v, int n, int *dmv_x, int *dm
    if (v->bmvtype == BMV_TYPE_DIRECT) {
        int total_opp, k, f;
        if (s->next_picture.f.mb_type[mb_pos + v->mb_off] != MB_TYPE_INTRA) {
            s->mv[0][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
            s->mv[0][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
            s->mv[1][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
            s->mv[1][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
            s->mv[0][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
                                      v->bfraction, 0, s->quarter_sample);
            s->mv[0][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
                                      v->bfraction, 0, s->quarter_sample);
            s->mv[1][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
                                      v->bfraction, 1, s->quarter_sample);
            s->mv[1][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
                                      v->bfraction, 1, s->quarter_sample);
            total_opp = v->mv_f_next[0][s->block_index[0] + v->blocks_off]
                      + v->mv_f_next[0][s->block_index[1] + v->blocks_off]
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -341,6 +341,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
        return AVERROR_INVALIDDATA;
    }
    if (s->avctx->sample_rate <= 0) {
        av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
        return AVERROR_INVALIDDATA;
    }
    s->num_channels = avctx->channels;
    if (s->num_channels < 0) {
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2485,9 +2485,9 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
 }
 #if HAVE_6REGS
 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
 {
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
@@ -2939,11 +2939,11 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 #endif
 }
 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
                                  int mm_flags)
 {
 #if HAVE_6REGS && HAVE_INLINE_ASM
    c->vector_fmul_window  = vector_fmul_window_3dnow2;
    c->vector_fmul_window  = vector_fmul_window_3dnowext;
 #endif
 }
@@ -3194,7 +3194,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
        dsputil_init_3dnow(c, avctx, mm_flags);
    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnow2(c, avctx, mm_flags);
        dsputil_init_3dnowext(c, avctx, mm_flags);
    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -34,9 +34,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
    }
    if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
        /* 3DNowExt for K7 */
        s->imdct_calc = ff_imdct_calc_3dnow2;
        s->imdct_half = ff_imdct_half_3dnow2;
        s->fft_calc   = ff_fft_calc_3dnow2;
        s->imdct_calc = ff_imdct_calc_3dnowext;
        s->imdct_half = ff_imdct_half_3dnowext;
        s->fft_calc   = ff_fft_calc_3dnowext;
    }
 #endif
    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -25,12 +25,12 @@ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -93,14 +93,14 @@ cextern cos_ %+ i
 SECTION_TEXT
 %macro T2_3DN 4 ; z0, z1, mem0, mem1
 %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
 %endmacro
 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
 %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
@@ -445,13 +445,13 @@ fft16_sse:
    ret
 %macro FFT48_3DN 0
 %macro FFT48_3DNOW 0
 align 16
 fft4 %+ SUFFIX:
    T2_3DN   m0, m1, Z(0), Z(1)
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
@@ -462,14 +462,14 @@ fft4 %+ SUFFIX:
 align 16
 fft8 %+ SUFFIX:
    T2_3DN   m0, m1, Z(0), Z(1)
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5,  Z(4),  Z(5)
    T2_3DN   m6, m7, Z2(6), Z2(7)
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
@@ -478,12 +478,12 @@ fft8 %+ SUFFIX:
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
@@ -501,7 +501,7 @@ fft8 %+ SUFFIX:
 %if ARCH_X86_32
 %macro PSWAPD 2
 %if cpuflag(3dnow2)
 %if cpuflag(3dnowext)
    pswapd %1, %2
 %elifidn %1, %2
    movd [r0+12], %1
@@ -513,11 +513,11 @@ fft8 %+ SUFFIX:
 %endif
 %endmacro
 INIT_MMX 3dnow2
 FFT48_3DN
 INIT_MMX 3dnowext
 FFT48_3DNOW
 INIT_MMX 3dnow
 FFT48_3DN
 FFT48_3DNOW
 %endif
 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
@@ -634,7 +634,7 @@ cglobal fft_calc, 2,5,8
 %if ARCH_X86_32
 INIT_MMX 3dnow
 FFT_CALC_FUNC
 INIT_MMX 3dnow2
 INIT_MMX 3dnowext
 FFT_CALC_FUNC
 %endif
 INIT_XMM sse
@@ -728,7 +728,7 @@ cglobal imdct_calc, 3,5,3
 %if ARCH_X86_32
 INIT_MMX 3dnow
 IMDCT_CALC_FUNC
 INIT_MMX 3dnow2
 INIT_MMX 3dnowext
 IMDCT_CALC_FUNC
 %endif
@@ -744,8 +744,8 @@ INIT_MMX 3dnow
 %define unpckhps punpckhdq
 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
 %define pass_3dnow2 pass_3dnow
 %define pass_interleave_3dnow2 pass_interleave_3dnow
 %define pass_3dnowext pass_3dnow
 %define pass_interleave_3dnowext pass_interleave_3dnow
 %endif
 %ifdef PIC
@@ -814,7 +814,7 @@ DECL_FFT 5, _interleave
 INIT_MMX 3dnow
 DECL_FFT 4
 DECL_FFT 4, _interleave
 INIT_MMX 3dnow2
 INIT_MMX 3dnowext
 DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
@@ -846,7 +846,7 @@ INIT_XMM sse
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
 %if cpuflag(3dnow2)
 %if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
 %else
@@ -1019,7 +1019,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
    xor   r4, r4
    sub   r4, r3
 %endif
 %if notcpuflag(3dnow2) && mmsize == 8
 %if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_m1m1m1m1]
 %endif
 .pre:
@@ -1103,7 +1103,7 @@ DECL_IMDCT POSROTATESHUF
 INIT_MMX 3dnow
 DECL_IMDCT POSROTATESHUF_3DNOW
 INIT_MMX 3dnow2
 INIT_MMX 3dnowext
 DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -249,7 +249,7 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
 ; PSWAPD_SSE dst, src: stand-in for pswapd built on pshufw.
 ; Immediate 0x4e picks source words 2,3,0,1, i.e. it swaps the two
 ; 32-bit halves of the 64-bit operand.
 %macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
 %endmacro
 %macro PSWAPD_3DN1 2
 %macro PSWAPD_3DNOW 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
@@ -306,10 +306,10 @@ cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4,
 %define pswapd PSWAPD_SSE
 FLOAT_TO_INT16_INTERLEAVE6 sse
 %define cvtps2pi pf2id
 %define pswapd PSWAPD_3DN1
 %define pswapd PSWAPD_3DNOW
 FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
 FLOAT_TO_INT16_INTERLEAVE6 3dn2
 FLOAT_TO_INT16_INTERLEAVE6 3dnowext
 %undef cvtps2pi
 ;-----------------------------------------------------------------------------
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -46,7 +46,7 @@ void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long l
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
@@ -74,9 +74,11 @@ FLOAT_TO_INT16_INTERLEAVE(3dnow)
 FLOAT_TO_INT16_INTERLEAVE(sse)
 FLOAT_TO_INT16_INTERLEAVE(sse2)
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
 static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
                                               long len, int channels)
 {
    if(channels==6)
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
        ff_float_to_int16_interleave6_3dnowext(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
@@ -126,7 +128,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
        }
        if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
            }
        }
        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -25,8 +25,10 @@
 /***********************************/
 /* IDCT */
 #define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
 void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);
 #define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
 void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int16_t *block,  \
                                                       int stride);
 IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 10, sse2)
@@ -44,10 +46,10 @@ IDCT_ADD_FUNC(8, 10, avx)
 #endif
 #define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
 void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t *dst, const int *block_offset, \
                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 #define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
 void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
    (uint8_t *dst, const int *block_offset,                             \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
@@ -68,10 +70,11 @@ IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
 #endif
 #define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
 void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t **dst, const int *block_offset, \
                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 #define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                      \
 void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT     \
    (uint8_t **dst, const int *block_offset,                          \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
 IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
@@ -80,7 +83,7 @@ IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 #endif
 void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 /***********************************/
@@ -91,273 +94,292 @@ void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
                                       int bidir, int edges, int step,
                                       int mask_mv0, int mask_mv1, int field);
 #define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta, int8_t *tc0);
 #define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta,      \
                                                               int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta);
 #define LF_FUNCS(type, depth)\
 LF_FUNC (h,  chroma,       depth, mmx2)\
 LF_IFUNC(h,  chroma_intra, depth, mmx2)\
 LF_FUNC (v,  chroma,       depth, mmx2)\
 LF_IFUNC(v,  chroma_intra, depth, mmx2)\
 LF_FUNC (h,  luma,         depth, mmx2)\
 LF_IFUNC(h,  luma_intra,   depth, mmx2)\
 LF_FUNC (h,  luma,         depth, sse2)\
 LF_IFUNC(h,  luma_intra,   depth, sse2)\
 LF_FUNC (v,  luma,         depth, sse2)\
 LF_IFUNC(v,  luma_intra,   depth, sse2)\
 LF_FUNC (h,  chroma,       depth, sse2)\
 LF_IFUNC(h,  chroma_intra, depth, sse2)\
 LF_FUNC (v,  chroma,       depth, sse2)\
 LF_IFUNC(v,  chroma_intra, depth, sse2)\
 LF_FUNC (h,  luma,         depth,  avx)\
 LF_IFUNC(h,  luma_intra,   depth,  avx)\
 LF_FUNC (v,  luma,         depth,  avx)\
 LF_IFUNC(v,  luma_intra,   depth,  avx)\
 LF_FUNC (h,  chroma,       depth,  avx)\
 LF_IFUNC(h,  chroma_intra, depth,  avx)\
 LF_FUNC (v,  chroma,       depth,  avx)\
 LF_IFUNC(v,  chroma_intra, depth,  avx)
 LF_FUNCS( uint8_t,  8)
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta);
 #define LF_FUNCS(type, depth)                   \
 LF_FUNC(h,  chroma,       depth, mmx2)          \
 LF_IFUNC(h, chroma_intra, depth, mmx2)          \
 LF_FUNC(v,  chroma,       depth, mmx2)          \
 LF_IFUNC(v, chroma_intra, depth, mmx2)          \
 LF_FUNC(h,  luma,         depth, mmx2)          \
 LF_IFUNC(h, luma_intra,   depth, mmx2)          \
 LF_FUNC(h,  luma,         depth, sse2)          \
 LF_IFUNC(h, luma_intra,   depth, sse2)          \
 LF_FUNC(v,  luma,         depth, sse2)          \
 LF_IFUNC(v, luma_intra,   depth, sse2)          \
 LF_FUNC(h,  chroma,       depth, sse2)          \
 LF_IFUNC(h, chroma_intra, depth, sse2)          \
 LF_FUNC(v,  chroma,       depth, sse2)          \
 LF_IFUNC(v, chroma_intra, depth, sse2)          \
 LF_FUNC(h,  luma,         depth, avx)           \
 LF_IFUNC(h, luma_intra,   depth, avx)           \
 LF_FUNC(v,  luma,         depth, avx)           \
 LF_IFUNC(v, luma_intra,   depth, avx)           \
 LF_FUNC(h,  chroma,       depth, avx)           \
 LF_IFUNC(h, chroma_intra, depth, avx)           \
 LF_FUNC(v,  chroma,       depth, avx)           \
 LF_IFUNC(v, chroma_intra, depth, avx)
 LF_FUNCS(uint8_t,   8)
 LF_FUNCS(uint16_t, 10)
 #if ARCH_X86_32 && HAVE_YASM
 LF_FUNC (v8, luma,             8, mmx2)
 static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 LF_FUNC(v8, luma, 8, mmx2)
 static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha,
                                     int beta, int8_t *tc0)
 {
    if((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix+8, stride, alpha, beta, tc0+2);
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2);
 }
 LF_IFUNC(v8, luma_intra,        8, mmx2)
 static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 LF_IFUNC(v8, luma_intra, 8, mmx2)
 static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride,
                                           int alpha, int beta)
 {
    ff_deblock_v8_luma_intra_8_mmx2(pix+0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmx2(pix+8, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta);
 }
 #endif /* ARCH_X86_32 */
 LF_FUNC (v,  luma,            10, mmx2)
 LF_IFUNC(v,  luma_intra,      10, mmx2)
 LF_FUNC(v,  luma,       10, mmx2)
 LF_IFUNC(v, luma_intra, 10, mmx2)
 /***********************************/
 /* weighted prediction */
 #define H264_WEIGHT(W, OPT) \
 void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);
 #define H264_WEIGHT(W, OPT)                                             \
 void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,         \
                                      int height, int log2_denom,       \
                                      int weight, int offset);
 #define H264_BIWEIGHT(W, OPT) \
 void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
    int weights, int offset);
 #define H264_BIWEIGHT(W, OPT)                                           \
 void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        int stride, int height,         \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);
 #define H264_BIWEIGHT_MMX(W) \
 H264_WEIGHT  (W, mmx2) \
 H264_BIWEIGHT(W, mmx2)
 #define H264_BIWEIGHT_MMX(W)                    \
    H264_WEIGHT(W, mmx2)                        \
    H264_BIWEIGHT(W, mmx2)
 #define H264_BIWEIGHT_MMX_SSE(W) \
 H264_BIWEIGHT_MMX(W) \
 H264_WEIGHT      (W, sse2) \
 H264_BIWEIGHT    (W, sse2) \
 H264_BIWEIGHT    (W, ssse3)
 #define H264_BIWEIGHT_MMX_SSE(W)                \
    H264_BIWEIGHT_MMX(W)                        \
    H264_WEIGHT(W, sse2)                        \
    H264_BIWEIGHT(W, sse2)                      \
    H264_BIWEIGHT(W, ssse3)
 H264_BIWEIGHT_MMX_SSE(16)
 H264_BIWEIGHT_MMX_SSE( 8)
 H264_BIWEIGHT_MMX    ( 4)
 #define H264_WEIGHT_10(W, DEPTH, OPT) \
 void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);
 #define H264_BIWEIGHT_10(W, DEPTH, OPT) \
 void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
     int weightd, int weights, int offset);
 #define H264_BIWEIGHT_10_SSE(W, DEPTH) \
 H264_WEIGHT_10  (W, DEPTH, sse2) \
 H264_WEIGHT_10  (W, DEPTH, sse4) \
 H264_BIWEIGHT_10(W, DEPTH, sse2) \
 H264_BIWEIGHT_10(W, DEPTH, sse4)
 H264_BIWEIGHT_MMX_SSE(8)
 H264_BIWEIGHT_MMX(4)
 #define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
 void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    int stride,         \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);
 #define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
 void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      int stride,       \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);
 #define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
    H264_WEIGHT_10(W, DEPTH, sse2)              \
    H264_WEIGHT_10(W, DEPTH, sse4)              \
    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
    H264_BIWEIGHT_10(W, DEPTH, sse4)
 H264_BIWEIGHT_10_SSE(16, 10)
 H264_BIWEIGHT_10_SSE( 8, 10)
 H264_BIWEIGHT_10_SSE( 4, 10)
 H264_BIWEIGHT_10_SSE(8,  10)
 H264_BIWEIGHT_10_SSE(4,  10)
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                         const int chroma_format_idc)
 {
 #if HAVE_YASM
    int mm_flags = av_get_cpu_flags();
    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2)
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
    }
    if (bit_depth == 8) {
    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->h264_idct_dc_add         =
        c->h264_idct_add            = ff_h264_idct_add_8_mmx;
        c->h264_idct8_dc_add        =
        c->h264_idct8_add           = ff_h264_idct8_add_8_mmx;
        c->h264_idct_add16          = ff_h264_idct_add16_8_mmx;
        c->h264_idct8_add4          = ff_h264_idct8_add4_8_mmx;
        if (chroma_format_idc == 1)
            c->h264_idct_add8       = ff_h264_idct_add8_8_mmx;
        c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_mmx;
        if (mm_flags & AV_CPU_FLAG_CMOV)
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_idct_dc_add    = ff_h264_idct_dc_add_8_mmx2;
            c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_8_mmx2;
            c->h264_idct_add16     = ff_h264_idct_add16_8_mmx2;
            c->h264_idct8_add4     = ff_h264_idct8_add4_8_mmx2;
        if (mm_flags & AV_CPU_FLAG_MMX) {
            c->h264_idct_dc_add   =
            c->h264_idct_add      = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add  =
            c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;
            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc == 1)
                c->h264_idct_add8  = ff_h264_idct_add8_8_mmx2;
            c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;
            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmx2;
            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmx2;
            if (chroma_format_idc == 1) {
                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmx2;
                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmx2;
            }
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            if (mm_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
            if (mm_flags & AV_CPU_FLAG_MMX2) {
                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
                c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmx2;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2;
                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmx2;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2;
                if (chroma_format_idc == 1) {
                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmx2;
                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2;
                }
 #if ARCH_X86_32
            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmx2;
            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmx2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
 #endif
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmx2;
                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmx2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
 #endif /* ARCH_X86_32 */
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2;
            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2;
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->h264_idct8_add  = ff_h264_idct8_add_8_sse2;
                c->h264_idct_add16          = ff_h264_idct_add16_8_sse2;
                c->h264_idct8_add4          = ff_h264_idct8_add4_8_sse2;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8       = ff_h264_idct_add8_8_sse2;
                c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
                    c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
                    c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
                    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSSE3) {
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
            }
            if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) {
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
 #endif /* HAVE_ALIGNED_STACK */
                }
                if (mm_flags & AV_CPU_FLAG_SSSE3) {
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
                }
                if (HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
 #endif
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
 #endif /* HAVE_ALIGNED_STACK */
                }
            }
        }
    }
    } else if (bit_depth == 10) {
    if (mm_flags & AV_CPU_FLAG_MMX) {
        if (mm_flags & AV_CPU_FLAG_MMX2) {
        if (mm_flags & AV_CPU_FLAG_MMX) {
            if (mm_flags & AV_CPU_FLAG_MMX2) {
 #if ARCH_X86_32
            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmx2;
            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmx2;
            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmx2;
            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmx2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2;
 #endif
            c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct_add       = ff_h264_idct_add_10_sse2;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;
                c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8  = ff_h264_idct_add8_10_sse2;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmx2;
                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmx2;
                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmx2;
                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmx2;
 #endif /* ARCH_X86_32 */
                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2;
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->h264_idct_add     = ff_h264_idct_add_10_sse2;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
                    c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
 #if HAVE_ALIGNED_STACK
                c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
                    c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
 #endif /* HAVE_ALIGNED_STACK */
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSE4) {
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
            }
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
 #endif /* HAVE_ALIGNED_STACK */
                }
                if (mm_flags & AV_CPU_FLAG_SSE4) {
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
                }
 #if HAVE_AVX
            if (mm_flags&AV_CPU_FLAG_AVX) {
                c->h264_idct_dc_add    =
                c->h264_idct_add       = ff_h264_idct_add_10_avx;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;
                c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8  = ff_h264_idct_add8_10_avx;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
                if (mm_flags & AV_CPU_FLAG_AVX) {
                    c->h264_idct_dc_add  =
                    c->h264_idct_add     = ff_h264_idct_add_10_avx;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
                    c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
 #if HAVE_ALIGNED_STACK
                c->h264_idct8_add      = ff_h264_idct8_add_10_avx;
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_avx;
 #endif
                    c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
 #endif /* HAVE_ALIGNED_STACK */
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
 #endif
            }
                    c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
                    c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
                    c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_avx;
                    c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_avx;
 #endif /* HAVE_ALIGNED_STACK */
                }
 #endif /* HAVE_AVX */
            }
        }
    }
    }
 #endif
 #endif /* HAVE_YASM */
 }
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -301,12 +301,12 @@ cglobal prores_idct_put_10, 4, 4, %1
    RET
 %endmacro
 %macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
 %if cpuflag(sse4)
 %macro SIGNEXTEND 2-3
 %if cpuflag(sse4) ; dstlow, dsthigh
    movhlps     %2,  %1
    pmovsxwd    %1,  %1
    pmovsxwd    %2,  %2
 %else ; sse2
 %elif cpuflag(sse2) ; dstlow, dsthigh, tmp
    pxor        %3,  %3
    pcmpgtw     %3,  %1
    mova        %2,  %1
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -590,7 +590,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_mmx      (1<<0)
 %assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
 %assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
 %assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
 %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
--- a/tests/ref/fate/acodec-g723_1
+++ b/tests/ref/fate/acodec-g723_1
@@ -1,4 +1,4 @@
 dec0deb2425e908d232d2471acff04a3 *tests/data/fate/acodec-g723_1.g723_1
 4800 tests/data/fate/acodec-g723_1.g723_1
 90b20555c962b638dad0e98ac2c05b25 *tests/data/fate/acodec-g723_1.out.wav
 stddev: 8418.34 PSNR: 17.82 MAXDIFF:52968 bytes:    95992/    96000
 87fd529c9e41914f73a865d147cc9516 *tests/data/fate/acodec-g723_1.out.wav
 stddev: 8425.98 PSNR: 17.82 MAXDIFF:53268 bytes:    95992/    96000