H264: change weight/biweight functions to take a height argument.

Neon parts by Mans Rullgard <mans@mansr.com>.
14 years ago · c2d337429c
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);
 /* Pre-change prototypes: one exported routine per block size (WxH).
  * The block height is implied by the symbol name and is not passed as an
  * argument.  This change replaces them with the per-width prototypes
  * further down in this group. */
 void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
                                      int weight, int offset);
 void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 /* Post-change prototypes: one exported routine per block width
  * (16, 8 or 4 pixels).  The new third argument, height, is supplied by
  * the caller instead of being baked into the symbol name. */
 void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
                                   int log2_den, int weight, int offset);
 void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
                                  int log2_den, int weight, int offset);
 void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
                                  int log2_den, int weight, int offset);
 /* Pre-change bidirectional prototypes: one exported routine per block
  * size (WxH), with no height argument.  This change replaces them with
  * the per-width prototypes further down in this group. */
 void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
                                        int log2_den, int weightd, int weights,
                                        int offset);
 void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
                                       int log2_den, int weightd, int weights,
                                       int offset);
 void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
                                       int log2_den, int weightd, int weights,
                                       int offset);
 void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 /* Post-change bidirectional prototypes: one exported routine per block
  * width (16, 8 or 4 pixels).  The new fourth argument, height, is
  * supplied by the caller; with it the routine takes eight arguments. */
 void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
 void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);
 void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
    c->h264_idct_add        = ff_h264_idct_add_neon;
    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
 1:      subs            ip,  ip,  #2
 1:      subs            r3,  r3,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
@@ -1632,7 +1632,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
 1:      subs            ip,  ip,  #2
 1:      subs            r3,  r3,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
@@ -1662,7 +1662,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
 1:      subs            ip,  ip,  #4
 1:      subs            r3,  r3,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
@@ -1700,16 +1700,17 @@ endfunc
        .endm
        .macro  biweight_func w
 function biweight_h264_pixels_\w\()_neon
 function ff_biweight_h264_pixels_\w\()_neon, export=1
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldr             r12, [sp, #16]
        add             r4,  sp,  #20
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vdup.16         q9,  r12
        lsl             r6,  r6,  r12
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
 endfunc
        .endm
        .macro  biweight_entry w, h, b=1
 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
 .if \b
        b               biweight_h264_pixels_\w\()_neon
 .endif
 endfunc
        .endm
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16
        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8
        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
@ Weighted prediction
        .macro  weight_16 add
        vdup.8          d0,  r3
 1:      subs            ip,  ip,  #2
        vdup.8          d0,  r12
 1:      subs            r2,  r2,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
@@ -1785,8 +1767,8 @@ endfunc
        .endm
        .macro  weight_8 add
        vdup.8          d0,  r3
 1:      subs            ip,  ip,  #2
        vdup.8          d0,  r12
 1:      subs            r2,  r2,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
@@ -1806,10 +1788,10 @@ endfunc
        .endm
        .macro  weight_4 add
        vdup.8          d0,  r3
        vdup.8          d0,  r12
        vmov            q1,  q8
        vmov            q10, q8
 1:      subs            ip,  ip,  #4
 1:      subs            r2,  r2,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
@@ -1842,50 +1824,32 @@ endfunc
        .endm
        @ weight_func w: emits the unidirectional weighting routine for
        @ block width \w.  This is a patch view, so the pre-change and
        @ post-change instructions appear interleaved in the body.
        @
        @ Pre-change register use (height came from the per-size entry
        @ stubs via "mov ip, #h"):
        @   r0 = dst, r1 = stride, r2 = log2_den, r3 = weight,
        @   [sp, #8] after the push = offset, ip = height.
        @ Post-change register use, matching the C prototype
        @ (dst, stride, height, log2_den, weight, offset) -- assumes the
        @ standard ARM calling convention (first four args in r0-r3):
        @   r0 = dst, r1 = stride, r2 = height, r3 = log2_den,
        @   r12 = weight and r4 = offset, both loaded from the stack.
        .macro  weight_func w
        @ Pre-change label (file-local), then post-change exported label.
 function weight_h264_pixels_\w\()_neon
 function ff_weight_h264_pixels_\w\()_neon, export=1
        push            {r4, lr}
        @ Pre-change prologue: offset is the only stack argument; it is
        @ shifted left by log2_den (r2).
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        @ Post-change prologue: the push moved sp down by 8 bytes, so
        @ [sp, #8] is the first stack argument (weight) and [sp, #12] the
        @ second (offset); offset is shifted left by log2_den (r3).
        ldr             r12, [sp, #8]
        ldr             r4,  [sp, #12]
        cmp             r3,  #1
        lsl             r4,  r4,  r3
        vdup.16         q8,  r4
        mov             r4,  r0
        @ Uses the flags from the cmp above: log2_den <= 1 goes to 20.
        ble             20f
        @ lr = 1 - log2_den (pre-change reads r2, post-change reads r3).
        rsb             lr,  r2,  #1
        rsb             lr,  r3,  #1
        vdup.16         q9,  lr
        @ Sign test on weight (pre-change r3, post-change r12).
        cmp             r3,  #0
        cmp             r12, #0
        blt             10f
        weight_\w       vhadd.s16
        @ Negative weight: negate it and use the subtracting variant.
 10:     rsb             r3,  r3,  #0
 10:     rsb             r12, r12, #0
        weight_\w       vhsub.s16
        @ log2_den <= 1 path: lr = -log2_den, then the same sign test.
 20:     rsb             lr,  r2,  #0
 20:     rsb             lr,  r3,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        cmp             r12, #0
        blt             10f
        weight_\w       vadd.s16
 10:     rsb             r3,  r3,  #0
 10:     rsb             r12, r12, #0
        weight_\w       vsub.s16
 endfunc
        .endm
        .macro  weight_entry w, h, b=1
 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
 .if \b
        b               weight_h264_pixels_\w\()_neon
 .endif
 endfunc
        .endm
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16
        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8
        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                               int height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
    if(emu){
        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 }
 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
 static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
        mc_dir_part(h, ref, n, square, height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op, pixel_shift, chroma444);
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
        mc_dir_part(h, ref, n, square, height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op, pixel_shift, chroma444);
    }
 }
 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
 static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1, int pixel_shift, int chroma444){
    MpegEncContext * const s = &h->s;
    int chroma_height;
    dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
    if(chroma444){
        chroma_height = height;
        chroma_weight_avg = luma_weight_avg;
        chroma_weight_op = luma_weight_op;
        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
    } else if (CHROMA422) {
        chroma_height = height;
        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
    }else{
        chroma_height = height >> 1;
        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
    }
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];
        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
        if(h->use_weight == 2){
            int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
            if (CHROMA422) {
                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
                                  tmp_cb + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
            }
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
                              height,        5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
                              chroma_height, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
                              chroma_height, 5, weight0, weight1, 0);
        }else{
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                            h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                            h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                            h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                            h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
            if (CHROMA422) {
                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
                                  tmp_cb + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
            }
        }
    }else{
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
        mc_dir_part(h, ref, n, square, height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put, pixel_shift, chroma444);
        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                       h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
            if (CHROMA422) {
                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
            }
        }
    }
 }
 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
 static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
    if((h->use_weight==2 && list0 && list1
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
       || h->use_weight==1)
        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                         x_offset, y_offset, qpix_put, chroma_put,
                         weight_op[0], weight_op[3], weight_avg[0],
                         weight_avg[3], list0, list1, pixel_shift, chroma444);
                         weight_op[0], weight_op[1], weight_avg[0],
                         weight_avg[1], list0, list1, pixel_shift, chroma444);
    else
        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
                    chroma_avg, list0, list1, pixel_shift, chroma444);
 }
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
    prefetch_motion(h, 0, pixel_shift, chroma444);
    if(IS_16X16(mb_type)){
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                weight_op, weight_avg,
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
    }else if(IS_16X8(mb_type)){
        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                weight_op, weight_avg,
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                weight_op, weight_avg,
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                pixel_shift, chroma444);
    }else if(IS_8X16(mb_type)){
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                pixel_shift, chroma444);
    }else{
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
            int y_offset= (i&2)<<1;
            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    &weight_op[2], &weight_avg[2],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    &weight_op[2], &weight_avg[2],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else{
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        &weight_op[2], &weight_avg[2],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                        pixel_shift, chroma444);
                }
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
    else\
        c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
 \
    c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
    c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -31,16 +31,18 @@
 #include "dsputil.h"
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
 /* Function-pointer types for the weighted-MC kernels stored in
  * H264DSPContext (weight_h264_pixels_tab / biweight_h264_pixels_tab).
  * In the variants that take `height`, the caller supplies the row count;
  * the reference C kernels iterate `for (y = 0; y < height; y++)`. */
 typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
 typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
                                 int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
                                   int log2_denom, int weightd, int weights, int offset);
 /**
 * Context for storing H.264 DSP functions
 */
 typedef struct H264DSPContext{
    /* weighted MC */
    h264_weight_func weight_h264_pixels_tab[10];
    h264_biweight_func biweight_h264_pixels_tab[10];
    h264_weight_func weight_h264_pixels_tab[4];
    h264_biweight_func biweight_h264_pixels_tab[4];
    /* loop filter */
    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -29,14 +29,16 @@
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 #define H264_WEIGHT(W,H) \
 static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
 #define H264_WEIGHT(W) \
 static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
                                           int log2_denom, int weight, int offset) \
 { \
    int y; \
    pixel *block = (pixel*)_block; \
    stride /= sizeof(pixel); \
    offset <<= (log2_denom + (BIT_DEPTH-8)); \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
    for (y = 0; y < height; y++, block += stride) { \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
        op_scale1(15); \
    } \
 } \
 static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
 static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
                                             int log2_denom, int weightd, int weights, int offset) \
 { \
    int y; \
    pixel *dst = (pixel*)_dst; \
    pixel *src = (pixel*)_src; \
    stride /= sizeof(pixel); \
    offset <<= (BIT_DEPTH-8); \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
    for (y = 0; y < height; y++, dst += stride, src += stride) { \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
    } \
 }
 H264_WEIGHT(16,16)
 H264_WEIGHT(16,8)
 H264_WEIGHT(8,16)
 H264_WEIGHT(8,8)
 H264_WEIGHT(8,4)
 H264_WEIGHT(4,8)
 H264_WEIGHT(4,4)
 H264_WEIGHT(4,2)
 H264_WEIGHT(2,4)
 H264_WEIGHT(2,2)
 H264_WEIGHT(16)
 H264_WEIGHT(8)
 H264_WEIGHT(4)
 H264_WEIGHT(2)
 #undef op_scale1
 #undef op_scale2
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
 }
 static av_always_inline
 void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
 void weight_h264_W_altivec(uint8_t *block, int stride, int height,
                           int log2_denom, int weight, int offset, int w)
 {
    int y, aligned;
    vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
    voffset = vec_splat(vtemp, 5);
    aligned = !((unsigned long)block & 0xf);
    for (y=0; y<h; y++) {
    for (y = 0; y < height; y++) {
        vblock = vec_ld(0, block);
        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
 }
 static av_always_inline
 void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
                               int weightd, int weights, int offset, int w, int h)
 void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
                             int log2_denom, int weightd, int weights, int offset, int w)
 {
    int y, dst_aligned, src_aligned;
    vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
    dst_aligned = !((unsigned long)dst & 0xf);
    src_aligned = !((unsigned long)src & 0xf);
    for (y=0; y<h; y++) {
    for (y = 0; y < height; y++) {
        vdst = vec_ld(0, dst);
        vsrc = vec_ld(0, src);
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
    }
 }
 /* Generates the per-width AltiVec entry points.  Each wrapper forwards its
  * arguments to the shared always-inline helper and passes the block width as
  * the trailing constant W.  The height-taking wrappers must call the helpers
  * under their new names, weight_h264_W_altivec()/biweight_h264_W_altivec(),
  * which take `height` right after `stride`. */
 #define H264_WEIGHT(W,H) \
 static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
 #define H264_WEIGHT(W) \
 static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
                                                   int log2_denom, int weight, int offset){ \
    weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
 static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
 static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
                                                     int log2_denom, int weightd, int weights, int offset){ \
    biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
 H264_WEIGHT(16,16)
 H264_WEIGHT(16, 8)
 H264_WEIGHT( 8,16)
 H264_WEIGHT( 8, 8)
 H264_WEIGHT( 8, 4)
 H264_WEIGHT(16)
 H264_WEIGHT( 8)
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
    }
    }
 }
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -28,21 +28,20 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; biweight pred:
 ;
 ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
 ;                               int log2_denom, int weightd, int weights,
 ;                               int offset);
 ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
 ;                            int height, int log2_denom, int weightd,
 ;                            int weights, int offset);
 ; and
 ; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
 ;                             int log2_denom, int weight,
 ;                             int offset);
 ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
 ;                          int log2_denom, int weight, int offset);
 ;-----------------------------------------------------------------------------
 %macro WEIGHT_SETUP 0
    add        r4, r4
    inc        r4
    movd       m3, r3d
    movd       m5, r4d
    movd       m6, r2d
    add        r5, r5
    inc        r5
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
 %if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
    packuswb      m0, m1
 %endmacro
 %macro WEIGHT_FUNC_DBL_MM 1
 cglobal h264_weight_16x%1_mmx2, 5, 5, 0
 INIT_MMX
 cglobal h264_weight_16_mmx2, 6, 6, 0
    WEIGHT_SETUP
    mov        r2, %1
 %if %1 == 16
 .nextrow
    WEIGHT_OP 0,  4
    mova     [r0  ], m0
    WEIGHT_OP 8, 12
    mova     [r0+8], m0
    add        r0, r1
    dec        r2
    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
 %endif
 %endmacro
 INIT_MMX
 WEIGHT_FUNC_DBL_MM 16
 WEIGHT_FUNC_DBL_MM  8
 ; WEIGHT_FUNC_MM: emits one h264_weight_<width>_<opt> function.
 ; Three-argument form: %1 is the width used in the symbol name, %2 is passed
 ; through as cglobal's last numeric argument, %3 is the symbol suffix.
 ; Row loop: r0 = dst pointer, advanced by r1 (stride) after each store of m0;
 ; r2d is the remaining-row counter (the `height` argument in the height-taking
 ; form, a constant loaded by `mov r2, %2` in the fixed-size form).
 %macro WEIGHT_FUNC_MM 4
 cglobal h264_weight_%1x%2_%4, 7, 7, %3
 %macro WEIGHT_FUNC_MM 3
 cglobal h264_weight_%1_%3, 6, 6, %2
    WEIGHT_SETUP
    mov        r2, %2
 %if %2 == 16
 .nextrow
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1
    dec        r2
    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
 %endif
 %endmacro
 INIT_MMX
 WEIGHT_FUNC_MM  8, 16,  0, mmx2
 WEIGHT_FUNC_MM  8,  8,  0, mmx2
 WEIGHT_FUNC_MM  8,  4,  0, mmx2
 WEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
 WEIGHT_FUNC_MM 16, 16,  8, sse2
 WEIGHT_FUNC_MM 16,  8,  8, sse2
 WEIGHT_FUNC_MM 16, 8, sse2
 %macro WEIGHT_FUNC_HALF_MM 5
 cglobal h264_weight_%1x%2_%5, 5, 5, %4
 %macro WEIGHT_FUNC_HALF_MM 3
 cglobal h264_weight_%1_%3, 6, 6, %2
    WEIGHT_SETUP
    mov        r2, %2/2
    sar       r2d, 1
    lea        r3, [r1*2]
 %if %2 == mmsize
 .nextrow
    WEIGHT_OP 0, r1
    movh     [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
    movh     [r0+r1], m0
 %endif
    add        r0, r3
    dec        r2
    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
 %endif
 %endmacro
 ; Instantiations of WEIGHT_FUNC_HALF_MM.  Height is a run-time argument in
 ; the three-parameter form, so each width/ISA pair is instantiated exactly
 ; once; repeating the invocation would emit the same cglobal symbol again.
 INIT_MMX
 WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
 WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
 WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
 WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
 WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
 WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
 WEIGHT_FUNC_HALF_MM 8, 8, sse2
 %macro BIWEIGHT_SETUP 0
    add        r6, 1
    or         r6, 1
    add        r3, 1
    movd       m3, r4d
    movd       m4, r5d
    movd       m5, r6d
    movd       m6, r3d
 %ifdef ARCH_X86_64
 %define off_regd r11d
 %else
 %define off_regd r3d
 %endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add        r4, 1
    movd       m3, r5d
    movd       m4, r6d
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
 %if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
    packuswb   m0, m1
 %endmacro
 %macro BIWEIGHT_FUNC_DBL_MM 1
 cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
 INIT_MMX
 cglobal h264_biweight_16_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
    mov        r3, %1
 %if %1 == 16
    movifnidn r3d, r3m
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    mova     [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
 %endif
 %endmacro
 INIT_MMX
 BIWEIGHT_FUNC_DBL_MM 16
 BIWEIGHT_FUNC_DBL_MM  8
 %macro BIWEIGHT_FUNC_MM 4
 cglobal h264_biweight_%1x%2_%4, 7, 7, %3
 %macro BIWEIGHT_FUNC_MM 3
 cglobal h264_biweight_%1_%3, 7, 7, %2
    BIWEIGHT_SETUP
    mov        r3, %2
 %if %2 == 16
    movifnidn r3d, r3m
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
 %endif
 %endmacro
 INIT_MMX
 BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
 BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
 BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
 BIWEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
 BIWEIGHT_FUNC_MM 16, 16,  8, sse2
 BIWEIGHT_FUNC_MM 16,  8,  8, sse2
 BIWEIGHT_FUNC_MM 16, 8, sse2
 %macro BIWEIGHT_FUNC_HALF_MM 5
 cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %macro BIWEIGHT_FUNC_HALF_MM 3
 cglobal h264_biweight_%1_%3, 7, 7, %2
    BIWEIGHT_SETUP
    mov        r3, %2/2
    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]
 %if %2 == mmsize
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %endif
    add        r0, r4
    add        r1, r4
    dec        r3
    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
 %endif
 %endmacro
 INIT_MMX
 BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
 BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
 BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
 BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
 BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
 BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
 BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
 BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 %macro BIWEIGHT_SSSE3_SETUP 0
    add        r6, 1
    or         r6, 1
    add        r3, 1
    movd       m4, r4d
    movd       m0, r5d
    movd       m5, r6d
    movd       m6, r3d
 %ifdef ARCH_X86_64
 %define off_regd r11d
 %else
 %define off_regd r3d
 %endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add        r4, 1
    movd       m4, r5d
    movd       m0, r6d
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
    punpcklbw  m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
    packuswb   m0, m2
 %endmacro
 %macro BIWEIGHT_SSSE3_16 1
 cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
 INIT_XMM
 cglobal h264_biweight_16_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1
    movifnidn r3d, r3m
 %if %1 == 16
 .nextrow
    movh       m0, [r0]
    movh       m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
 %endif
 %endmacro
 INIT_XMM
 BIWEIGHT_SSSE3_16 16
 BIWEIGHT_SSSE3_16  8
 %macro BIWEIGHT_SSSE3_8 1
 cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
 cglobal h264_biweight_8_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1/2
    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]
 %if %1 == 16
 .nextrow
    movh       m0, [r0]
    movh       m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3
    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
 %endif
 %endmacro
 INIT_XMM
 BIWEIGHT_SSSE3_8 16
 BIWEIGHT_SSSE3_8  8
 BIWEIGHT_SSSE3_8  4
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -36,33 +36,26 @@ cextern pw_1
 SECTION .text
 ;-----------------------------------------------------------------------------
 ; void h264_weight(uint8_t *dst, int stride, int log2_denom,
 ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
 ;                  int weight, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
 DECLARE_REG_TMP 2
 %else
 DECLARE_REG_TMP 10
 %endif
 ; WEIGHT_PROLOGUE: shared entry sequence of the 10-bit weight functions.
 ; PROLOGUE is invoked with 0 auto-loaded arguments, so every argument that is
 ; later used as a register is fetched here with movifnidn.  r2d (height) is
 ; the row counter of every .nextrow loop and therefore has to be loaded too;
 ; log2_denom is read by WEIGHT_SETUP directly through r3m.
 %macro WEIGHT_PROLOGUE 1
    mov t0, %1
 %macro WEIGHT_PROLOGUE 0
 .prologue
    PROLOGUE 0,5,8
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r3d, r3m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
 %endmacro
 %macro WEIGHT_SETUP 1
    mova       m0, [pw_1]
    movd       m2, r2m
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r4, 19       ; *8, move to upper half of dword
    lea        r4, [r4+r3*2+0x10000]
    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
    shl        r5, 19       ; *8, move to upper half of dword
    lea        r5, [r5+r4*2+0x10000]
    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
 %endmacro
 %macro WEIGHT_FUNC_DBL 1
 cglobal h264_weight_16x16_10_%1
    WEIGHT_PROLOGUE 16
 cglobal h264_weight_16_10_%1
    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
 .nextrow
    WEIGHT_OP %1,  0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
    WEIGHT_OP %1, 16
    mova [r0+16], m5
    add       r0, r1
    dec       t0
    dec       r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_16x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
 %macro WEIGHT_FUNC_MM 1
 cglobal h264_weight_8x16_10_%1
    WEIGHT_PROLOGUE 16
 cglobal h264_weight_8_10_%1
    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
 .nextrow
    WEIGHT_OP  %1, 0
    mova     [r0], m5
    add        r0, r1
    dec        t0
    dec        r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_8x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 cglobal h264_weight_8x4_10_%1
    mov t0, 4
    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
 %macro WEIGHT_FUNC_HALF_MM 1
 cglobal h264_weight_4x8_10_%1
    WEIGHT_PROLOGUE 4
 cglobal h264_weight_4_10_%1
    WEIGHT_PROLOGUE
    sar         r2d, 1
    WEIGHT_SETUP %1
    lea         r3, [r1*2]
 .nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
    movh      [r0], m5
    movhps [r0+r1], m5
    add         r0, r3
    dec         t0
    dec         r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_4x4_10_%1
    mov t0, 2
    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 cglobal h264_weight_4x2_10_%1
    mov t0, 1
    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 %endmacro
 INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
 ;-----------------------------------------------------------------------------
 ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
 ;                    int weightd, int weights, int offset);
 ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
 ;                    int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
 DECLARE_REG_TMP 2,3
 DECLARE_REG_TMP 3
 %else
 DECLARE_REG_TMP 10,2
 DECLARE_REG_TMP 10
 %endif
 %macro BIWEIGHT_PROLOGUE 1
    mov t0, %1
 %macro BIWEIGHT_PROLOGUE 0
 .prologue
    PROLOGUE 0,7,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn t1d, r2m
    movifnidn r4d, r4m
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
 %endmacro
 %macro BIWEIGHT_SETUP 1
    lea        r6, [r6*4+1] ; (offset<<2)+1
    or         r6, 1
    shl        r5, 16
    or         r4, r5
    movd       m4, r4d      ; weightd | weights
    movd       m5, r6d      ; (offset+1)|1
    movd       m6, r3m      ; log2_denom
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weightd | weights
    movd       m5, t0d      ; (offset+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m
 %ifnidn %1, sse4
    pxor       m7, m7
 %endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
 %endmacro
 %macro BIWEIGHT_FUNC_DBL 1
 cglobal h264_biweight_16x16_10_%1
    BIWEIGHT_PROLOGUE 16
 cglobal h264_biweight_16_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
 .nextrow
    BIWEIGHT  %1,  0
    mova [r0   ], m0
    BIWEIGHT  %1, 16
    mova [r0+16], m0
    add       r0, t1
    add       r1, t1
    dec       t0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_16x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
 BIWEIGHT_FUNC_DBL sse4
 %macro BIWEIGHT_FUNC 1
 cglobal h264_biweight_8x16_10_%1
    BIWEIGHT_PROLOGUE 16
 cglobal h264_biweight_8_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
 .nextrow
    BIWEIGHT %1, 0
    mova   [r0], m0
    add      r0, t1
    add      r1, t1
    dec      t0
    add      r0, r2
    add      r1, r2
    dec      r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_8x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 cglobal h264_biweight_8x4_10_%1
    mov t0, 4
    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
 BIWEIGHT_FUNC sse4
 %macro BIWEIGHT_FUNC_HALF 1
 cglobal h264_biweight_4x8_10_%1
    BIWEIGHT_PROLOGUE 4
 cglobal h264_biweight_4_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
    lea        r4, [t1*2]
    sar        r3d, 1
    lea        r4, [r2*2]
 .nextrow
    BIWEIGHT    %1, 0, t1
    BIWEIGHT    %1, 0, r2
    movh   [r0   ], m0
    movhps [r0+t1], m0
    movhps [r0+r2], m0
    add         r0, r4
    add         r1, r4
    dec         t0
    dec         r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_4x4_10_%1
    mov t0, 2
    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 cglobal h264_biweight_4x2_10_%1
    mov t0, 1
    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 %endmacro
 INIT_XMM
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
 /***********************************/
 /* weighted prediction */
 /* Prototype generators for the assembly kernels: each expansion declares one
  * ff_h264_weight_* or ff_h264_biweight_* symbol whose name is built from the
  * width W (plus H in the fixed-size form) and the instruction-set suffix OPT.
  * The height-taking forms add `int height` right after `stride`. */
 #define H264_WEIGHT(W, H, OPT) \
 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);
 #define H264_WEIGHT(W, OPT) \
 void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);
 #define H264_BIWEIGHT(W, H, OPT) \
 void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
 #define H264_BIWEIGHT(W, OPT) \
 void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
    int weights, int offset);
 #define H264_BIWEIGHT_MMX(W,H) \
 H264_WEIGHT  (W, H, mmx2) \
 H264_BIWEIGHT(W, H, mmx2)
 #define H264_BIWEIGHT_MMX_SSE(W,H) \
 H264_BIWEIGHT_MMX(W, H) \
 H264_WEIGHT      (W, H, sse2) \
 H264_BIWEIGHT    (W, H, sse2) \
 H264_BIWEIGHT    (W, H, ssse3)
 H264_BIWEIGHT_MMX_SSE(16, 16)
 H264_BIWEIGHT_MMX_SSE(16,  8)
 H264_BIWEIGHT_MMX_SSE( 8, 16)
 H264_BIWEIGHT_MMX_SSE( 8,  8)
 H264_BIWEIGHT_MMX_SSE( 8,  4)
 H264_BIWEIGHT_MMX    ( 4,  8)
 H264_BIWEIGHT_MMX    ( 4,  4)
 H264_BIWEIGHT_MMX    ( 4,  2)
 #define H264_WEIGHT_10(W, H, DEPTH, OPT) \
 void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);
 #define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
 void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
 #define H264_BIWEIGHT_MMX(W) \
 H264_WEIGHT  (W, mmx2) \
 H264_BIWEIGHT(W, mmx2)
 #define H264_BIWEIGHT_MMX_SSE(W) \
 H264_BIWEIGHT_MMX(W) \
 H264_WEIGHT      (W, sse2) \
 H264_BIWEIGHT    (W, sse2) \
 H264_BIWEIGHT    (W, ssse3)
 H264_BIWEIGHT_MMX_SSE(16)
 H264_BIWEIGHT_MMX_SSE( 8)
 H264_BIWEIGHT_MMX    ( 4)
 #define H264_WEIGHT_10(W, DEPTH, OPT) \
 void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);
 #define H264_BIWEIGHT_10(W, DEPTH, OPT) \
 void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
     int weightd, int weights, int offset);
 #define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
 H264_WEIGHT_10  (W, H, DEPTH, sse2) \
 H264_WEIGHT_10  (W, H, DEPTH, sse4) \
 H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
 H264_BIWEIGHT_10(W, H, DEPTH, sse4)
 H264_BIWEIGHT_10_SSE(16, 16, 10)
 H264_BIWEIGHT_10_SSE(16,  8, 10)
 H264_BIWEIGHT_10_SSE( 8, 16, 10)
 H264_BIWEIGHT_10_SSE( 8,  8, 10)
 H264_BIWEIGHT_10_SSE( 8,  4, 10)
 H264_BIWEIGHT_10_SSE( 4,  8, 10)
 H264_BIWEIGHT_10_SSE( 4,  4, 10)
 H264_BIWEIGHT_10_SSE( 4,  2, 10)
 #define H264_BIWEIGHT_10_SSE(W, DEPTH) \
 H264_WEIGHT_10  (W, DEPTH, sse2) \
 H264_WEIGHT_10  (W, DEPTH, sse4) \
 H264_BIWEIGHT_10(W, DEPTH, sse2) \
 H264_BIWEIGHT_10(W, DEPTH, sse4)
 H264_BIWEIGHT_10_SSE(16, 10)
 H264_BIWEIGHT_10_SSE( 8, 10)
 H264_BIWEIGHT_10_SSE( 4, 10)
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSSE3) {
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
            }
            if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSE4) {
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
            }
 #if HAVE_AVX
            if (mm_flags&AV_CPU_FLAG_AVX) {