Neon parts by Mans Rullgard <mans@mansr.com>.tags/n0.9
| @@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, | |||
| void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, | |||
| int beta, int8_t *tc0); | |||
| void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, | |||
| int weight, int offset); | |||
| void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, | |||
| int log2_den, int weight, int offset); | |||
| void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, | |||
| int log2_den, int weight, int offset); | |||
| void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, | |||
| int log2_den, int weight, int offset); | |||
| void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int log2_den, int weightd, int weights, | |||
| int offset); | |||
| void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int height, int log2_den, int weightd, | |||
| int weights, int offset); | |||
| void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int height, int log2_den, int weightd, | |||
| int weights, int offset); | |||
| void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, | |||
| int height, int log2_den, int weightd, | |||
| int weights, int offset); | |||
| void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); | |||
| void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); | |||
| @@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i | |||
| c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; | |||
| c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; | |||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; | |||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; | |||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; | |||
| c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; | |||
| c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; | |||
| c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; | |||
| c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; | |||
| c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; | |||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; | |||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; | |||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; | |||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; | |||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; | |||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; | |||
| c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; | |||
| c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; | |||
| c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; | |||
| c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; | |||
| c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; | |||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; | |||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; | |||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; | |||
| c->h264_idct_add = ff_h264_idct_add_neon; | |||
| c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; | |||
| @@ -1592,7 +1592,7 @@ endfunc | |||
| vdup.8 d1, r5 | |||
| vmov q2, q8 | |||
| vmov q3, q8 | |||
| 1: subs ip, ip, #2 | |||
| 1: subs r3, r3, #2 | |||
| vld1.8 {d20-d21},[r0,:128], r2 | |||
| \macd q2, d0, d20 | |||
| pld [r0] | |||
| @@ -1632,7 +1632,7 @@ endfunc | |||
| vdup.8 d1, r5 | |||
| vmov q1, q8 | |||
| vmov q10, q8 | |||
| 1: subs ip, ip, #2 | |||
| 1: subs r3, r3, #2 | |||
| vld1.8 {d4},[r0,:64], r2 | |||
| \macd q1, d0, d4 | |||
| pld [r0] | |||
| @@ -1662,7 +1662,7 @@ endfunc | |||
| vdup.8 d1, r5 | |||
| vmov q1, q8 | |||
| vmov q10, q8 | |||
| 1: subs ip, ip, #4 | |||
| 1: subs r3, r3, #4 | |||
| vld1.32 {d4[0]},[r0,:32], r2 | |||
| vld1.32 {d4[1]},[r0,:32], r2 | |||
| \macd q1, d0, d4 | |||
| @@ -1700,16 +1700,17 @@ endfunc | |||
| .endm | |||
| .macro biweight_func w | |||
| function biweight_h264_pixels_\w\()_neon | |||
| function ff_biweight_h264_pixels_\w\()_neon, export=1 | |||
| push {r4-r6, lr} | |||
| add r4, sp, #16 | |||
| ldr r12, [sp, #16] | |||
| add r4, sp, #20 | |||
| ldm r4, {r4-r6} | |||
| lsr lr, r4, #31 | |||
| add r6, r6, #1 | |||
| eors lr, lr, r5, lsr #30 | |||
| orr r6, r6, #1 | |||
| vdup.16 q9, r3 | |||
| lsl r6, r6, r3 | |||
| vdup.16 q9, r12 | |||
| lsl r6, r6, r12 | |||
| vmvn q9, q9 | |||
| vdup.16 q8, r6 | |||
| mov r6, r0 | |||
| @@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon | |||
| endfunc | |||
| .endm | |||
| .macro biweight_entry w, h, b=1 | |||
| function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |||
| mov ip, #\h | |||
| .if \b | |||
| b biweight_h264_pixels_\w\()_neon | |||
| .endif | |||
| endfunc | |||
| .endm | |||
| biweight_entry 16, 8 | |||
| biweight_entry 16, 16, b=0 | |||
| biweight_func 16 | |||
| biweight_entry 8, 16 | |||
| biweight_entry 8, 4 | |||
| biweight_entry 8, 8, b=0 | |||
| biweight_func 8 | |||
| biweight_entry 4, 8 | |||
| biweight_entry 4, 2 | |||
| biweight_entry 4, 4, b=0 | |||
| biweight_func 4 | |||
| @ Weighted prediction | |||
| .macro weight_16 add | |||
| vdup.8 d0, r3 | |||
| 1: subs ip, ip, #2 | |||
| vdup.8 d0, r12 | |||
| 1: subs r2, r2, #2 | |||
| vld1.8 {d20-d21},[r0,:128], r1 | |||
| vmull.u8 q2, d0, d20 | |||
| pld [r0] | |||
| @@ -1785,8 +1767,8 @@ endfunc | |||
| .endm | |||
| .macro weight_8 add | |||
| vdup.8 d0, r3 | |||
| 1: subs ip, ip, #2 | |||
| vdup.8 d0, r12 | |||
| 1: subs r2, r2, #2 | |||
| vld1.8 {d4},[r0,:64], r1 | |||
| vmull.u8 q1, d0, d4 | |||
| pld [r0] | |||
| @@ -1806,10 +1788,10 @@ endfunc | |||
| .endm | |||
| .macro weight_4 add | |||
| vdup.8 d0, r3 | |||
| vdup.8 d0, r12 | |||
| vmov q1, q8 | |||
| vmov q10, q8 | |||
| 1: subs ip, ip, #4 | |||
| 1: subs r2, r2, #4 | |||
| vld1.32 {d4[0]},[r0,:32], r1 | |||
| vld1.32 {d4[1]},[r0,:32], r1 | |||
| vmull.u8 q1, d0, d4 | |||
| @@ -1842,50 +1824,32 @@ endfunc | |||
| .endm | |||
| .macro weight_func w | |||
| function weight_h264_pixels_\w\()_neon | |||
| function ff_weight_h264_pixels_\w\()_neon, export=1 | |||
| push {r4, lr} | |||
| ldr r4, [sp, #8] | |||
| cmp r2, #1 | |||
| lsl r4, r4, r2 | |||
| ldr r12, [sp, #8] | |||
| ldr r4, [sp, #12] | |||
| cmp r3, #1 | |||
| lsl r4, r4, r3 | |||
| vdup.16 q8, r4 | |||
| mov r4, r0 | |||
| ble 20f | |||
| rsb lr, r2, #1 | |||
| rsb lr, r3, #1 | |||
| vdup.16 q9, lr | |||
| cmp r3, #0 | |||
| cmp r12, #0 | |||
| blt 10f | |||
| weight_\w vhadd.s16 | |||
| 10: rsb r3, r3, #0 | |||
| 10: rsb r12, r12, #0 | |||
| weight_\w vhsub.s16 | |||
| 20: rsb lr, r2, #0 | |||
| 20: rsb lr, r3, #0 | |||
| vdup.16 q9, lr | |||
| cmp r3, #0 | |||
| cmp r12, #0 | |||
| blt 10f | |||
| weight_\w vadd.s16 | |||
| 10: rsb r3, r3, #0 | |||
| 10: rsb r12, r12, #0 | |||
| weight_\w vsub.s16 | |||
| endfunc | |||
| .endm | |||
| .macro weight_entry w, h, b=1 | |||
| function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |||
| mov ip, #\h | |||
| .if \b | |||
| b weight_h264_pixels_\w\()_neon | |||
| .endif | |||
| endfunc | |||
| .endm | |||
| weight_entry 16, 8 | |||
| weight_entry 16, 16, b=0 | |||
| weight_func 16 | |||
| weight_entry 8, 16 | |||
| weight_entry 8, 4 | |||
| weight_entry 8, 8, b=0 | |||
| weight_func 8 | |||
| weight_entry 4, 8 | |||
| weight_entry 4, 2 | |||
| weight_entry 4, 4, b=0 | |||
| weight_func 4 | |||
| @@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){ | |||
| } | |||
| #endif | |||
| static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, | |||
| static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, | |||
| int height, int delta, int list, | |||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | |||
| int src_x_offset, int src_y_offset, | |||
| qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, | |||
| @@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, | |||
| s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | |||
| src_cb= s->edge_emu_buffer; | |||
| } | |||
| chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||
| chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||
| if(emu){ | |||
| s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | |||
| src_cr= s->edge_emu_buffer; | |||
| } | |||
| chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||
| chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||
| } | |||
| static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, | |||
| static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, | |||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | |||
| int x_offset, int y_offset, | |||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | |||
| @@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei | |||
| if(list0){ | |||
| Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; | |||
| mc_dir_part(h, ref, n, square, chroma_height, delta, 0, | |||
| mc_dir_part(h, ref, n, square, height, delta, 0, | |||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_op, chroma_op, pixel_shift, chroma444); | |||
| @@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei | |||
| if(list1){ | |||
| Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; | |||
| mc_dir_part(h, ref, n, square, chroma_height, delta, 1, | |||
| mc_dir_part(h, ref, n, square, height, delta, 1, | |||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_op, chroma_op, pixel_shift, chroma444); | |||
| } | |||
| } | |||
| static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, | |||
| static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta, | |||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | |||
| int x_offset, int y_offset, | |||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | |||
| @@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom | |||
| h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, | |||
| int list0, int list1, int pixel_shift, int chroma444){ | |||
| MpegEncContext * const s = &h->s; | |||
| int chroma_height; | |||
| dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | |||
| if(chroma444){ | |||
| chroma_height = height; | |||
| chroma_weight_avg = luma_weight_avg; | |||
| chroma_weight_op = luma_weight_op; | |||
| dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | |||
| dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | |||
| } else if (CHROMA422) { | |||
| chroma_height = height; | |||
| dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | |||
| dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | |||
| }else{ | |||
| chroma_height = height >> 1; | |||
| dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | |||
| dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | |||
| } | |||
| @@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom | |||
| int refn0 = h->ref_cache[0][ scan8[n] ]; | |||
| int refn1 = h->ref_cache[1][ scan8[n] ]; | |||
| mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, | |||
| mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, | |||
| dest_y, dest_cb, dest_cr, | |||
| x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | |||
| mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, | |||
| mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, | |||
| tmp_y, tmp_cb, tmp_cr, | |||
| x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | |||
| if(h->use_weight == 2){ | |||
| int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; | |||
| int weight1 = 64 - weight0; | |||
| luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); | |||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); | |||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); | |||
| if (CHROMA422) { | |||
| chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, | |||
| tmp_cb + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, 5, weight0, weight1, 0); | |||
| chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, | |||
| tmp_cr + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, 5, weight0, weight1, 0); | |||
| } | |||
| luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, | |||
| height, 5, weight0, weight1, 0); | |||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, | |||
| chroma_height, 5, weight0, weight1, 0); | |||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, | |||
| chroma_height, 5, weight0, weight1, 0); | |||
| }else{ | |||
| luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, | |||
| luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom, | |||
| h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], | |||
| h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); | |||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], | |||
| h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); | |||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], | |||
| h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); | |||
| if (CHROMA422) { | |||
| chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, | |||
| tmp_cb + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], | |||
| h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); | |||
| chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, | |||
| tmp_cr + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], | |||
| h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); | |||
| } | |||
| } | |||
| }else{ | |||
| int list = list1 ? 1 : 0; | |||
| int refn = h->ref_cache[list][ scan8[n] ]; | |||
| Picture *ref= &h->ref_list[list][refn]; | |||
| mc_dir_part(h, ref, n, square, chroma_height, delta, list, | |||
| mc_dir_part(h, ref, n, square, height, delta, list, | |||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_put, chroma_put, pixel_shift, chroma444); | |||
| luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, | |||
| luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, | |||
| h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); | |||
| if(h->use_weight_chroma){ | |||
| chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); | |||
| chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); | |||
| if (CHROMA422) { | |||
| chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); | |||
| chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, | |||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||
| h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, | |||
| static inline void mc_part(H264Context *h, int n, int square, int height, int delta, | |||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | |||
| int x_offset, int y_offset, | |||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | |||
| @@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height, | |||
| if((h->use_weight==2 && list0 && list1 | |||
| && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) | |||
| || h->use_weight==1) | |||
| mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, | |||
| mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, | |||
| x_offset, y_offset, qpix_put, chroma_put, | |||
| weight_op[0], weight_op[3], weight_avg[0], | |||
| weight_avg[3], list0, list1, pixel_shift, chroma444); | |||
| weight_op[0], weight_op[1], weight_avg[0], | |||
| weight_avg[1], list0, list1, pixel_shift, chroma444); | |||
| else | |||
| mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, | |||
| mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, | |||
| x_offset, y_offset, qpix_put, chroma_put, qpix_avg, | |||
| chroma_avg, list0, list1, pixel_shift, chroma444); | |||
| } | |||
| @@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||
| prefetch_motion(h, 0, pixel_shift, chroma444); | |||
| if(IS_16X16(mb_type)){ | |||
| mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, | |||
| mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, | |||
| qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], | |||
| weight_op, weight_avg, | |||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| }else if(IS_16X8(mb_type)){ | |||
| mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, | |||
| mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, | |||
| qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | |||
| &weight_op[1], &weight_avg[1], | |||
| weight_op, weight_avg, | |||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, | |||
| mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, | |||
| qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | |||
| &weight_op[1], &weight_avg[1], | |||
| weight_op, weight_avg, | |||
| IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | |||
| pixel_shift, chroma444); | |||
| }else if(IS_8X16(mb_type)){ | |||
| mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, | |||
| mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, | |||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | |||
| &weight_op[2], &weight_avg[2], | |||
| &weight_op[1], &weight_avg[1], | |||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, | |||
| mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, | |||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | |||
| &weight_op[2], &weight_avg[2], | |||
| &weight_op[1], &weight_avg[1], | |||
| IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | |||
| pixel_shift, chroma444); | |||
| }else{ | |||
| @@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||
| int y_offset= (i&2)<<1; | |||
| if(IS_SUB_8X8(sub_mb_type)){ | |||
| mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | |||
| &weight_op[3], &weight_avg[3], | |||
| &weight_op[1], &weight_avg[1], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| }else if(IS_SUB_8X4(sub_mb_type)){ | |||
| mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | |||
| &weight_op[4], &weight_avg[4], | |||
| &weight_op[1], &weight_avg[1], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, | |||
| mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, | |||
| qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | |||
| &weight_op[4], &weight_avg[4], | |||
| &weight_op[1], &weight_avg[1], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| }else if(IS_SUB_4X8(sub_mb_type)){ | |||
| mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | |||
| &weight_op[5], &weight_avg[5], | |||
| &weight_op[2], &weight_avg[2], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, | |||
| mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, | |||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | |||
| &weight_op[5], &weight_avg[5], | |||
| &weight_op[2], &weight_avg[2], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| }else{ | |||
| @@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||
| for(j=0; j<4; j++){ | |||
| int sub_x_offset= x_offset + 2*(j&1); | |||
| int sub_y_offset= y_offset + (j&2); | |||
| mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, | |||
| mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, | |||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | |||
| &weight_op[6], &weight_avg[6], | |||
| &weight_op[2], &weight_avg[2], | |||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | |||
| pixel_shift, chroma444); | |||
| } | |||
| @@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo | |||
| else\ | |||
| c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ | |||
| \ | |||
| c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ | |||
| c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ | |||
| c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ | |||
| c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ | |||
| c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ | |||
| c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ | |||
| c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ | |||
| c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ | |||
| c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ | |||
| c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ | |||
| c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ | |||
| c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ | |||
| c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ | |||
| c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ | |||
| c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ | |||
| c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ | |||
| c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ | |||
| c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ | |||
| c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ | |||
| c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ | |||
| c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\ | |||
| c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\ | |||
| c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\ | |||
| c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\ | |||
| c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\ | |||
| c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\ | |||
| c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\ | |||
| c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\ | |||
| \ | |||
| c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ | |||
| c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ | |||
| @@ -31,16 +31,18 @@ | |||
| #include "dsputil.h" | |||
| //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); | |||
| typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); | |||
| typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); | |||
| typedef void (*h264_weight_func)(uint8_t *block, int stride, int height, | |||
| int log2_denom, int weight, int offset); | |||
| typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height, | |||
| int log2_denom, int weightd, int weights, int offset); | |||
| /** | |||
| * Context for storing H.264 DSP functions | |||
| */ | |||
| typedef struct H264DSPContext{ | |||
| /* weighted MC */ | |||
| h264_weight_func weight_h264_pixels_tab[10]; | |||
| h264_biweight_func biweight_h264_pixels_tab[10]; | |||
| h264_weight_func weight_h264_pixels_tab[4]; | |||
| h264_biweight_func biweight_h264_pixels_tab[4]; | |||
| /* loop filter */ | |||
| void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); | |||
| @@ -29,14 +29,16 @@ | |||
| #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) | |||
| #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) | |||
| #define H264_WEIGHT(W,H) \ | |||
| static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ | |||
| #define H264_WEIGHT(W) \ | |||
| static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \ | |||
| int log2_denom, int weight, int offset) \ | |||
| { \ | |||
| int y; \ | |||
| pixel *block = (pixel*)_block; \ | |||
| stride /= sizeof(pixel); \ | |||
| offset <<= (log2_denom + (BIT_DEPTH-8)); \ | |||
| if(log2_denom) offset += 1<<(log2_denom-1); \ | |||
| for(y=0; y<H; y++, block += stride){ \ | |||
| for (y = 0; y < height; y++, block += stride) { \ | |||
| op_scale1(0); \ | |||
| op_scale1(1); \ | |||
| if(W==2) continue; \ | |||
| @@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride | |||
| op_scale1(15); \ | |||
| } \ | |||
| } \ | |||
| static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ | |||
| static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \ | |||
| int log2_denom, int weightd, int weights, int offset) \ | |||
| { \ | |||
| int y; \ | |||
| pixel *dst = (pixel*)_dst; \ | |||
| pixel *src = (pixel*)_src; \ | |||
| stride /= sizeof(pixel); \ | |||
| offset <<= (BIT_DEPTH-8); \ | |||
| offset = ((offset + 1) | 1) << log2_denom; \ | |||
| for(y=0; y<H; y++, dst += stride, src += stride){ \ | |||
| for (y = 0; y < height; y++, dst += stride, src += stride) { \ | |||
| op_scale2(0); \ | |||
| op_scale2(1); \ | |||
| if(W==2) continue; \ | |||
| @@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_ | |||
| } \ | |||
| } | |||
| H264_WEIGHT(16,16) | |||
| H264_WEIGHT(16,8) | |||
| H264_WEIGHT(8,16) | |||
| H264_WEIGHT(8,8) | |||
| H264_WEIGHT(8,4) | |||
| H264_WEIGHT(4,8) | |||
| H264_WEIGHT(4,4) | |||
| H264_WEIGHT(4,2) | |||
| H264_WEIGHT(2,4) | |||
| H264_WEIGHT(2,2) | |||
| H264_WEIGHT(16) | |||
| H264_WEIGHT(8) | |||
| H264_WEIGHT(4) | |||
| H264_WEIGHT(2) | |||
| #undef op_scale1 | |||
| #undef op_scale2 | |||
| @@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, | |||
| } | |||
| static av_always_inline | |||
| void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) | |||
| void weight_h264_W_altivec(uint8_t *block, int stride, int height, | |||
| int log2_denom, int weight, int offset, int w) | |||
| { | |||
| int y, aligned; | |||
| vec_u8 vblock; | |||
| @@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei | |||
| voffset = vec_splat(vtemp, 5); | |||
| aligned = !((unsigned long)block & 0xf); | |||
| for (y=0; y<h; y++) { | |||
| for (y = 0; y < height; y++) { | |||
| vblock = vec_ld(0, block); | |||
| v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); | |||
| @@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei | |||
| } | |||
| static av_always_inline | |||
| void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, | |||
| int weightd, int weights, int offset, int w, int h) | |||
| void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height, | |||
| int log2_denom, int weightd, int weights, int offset, int w) | |||
| { | |||
| int y, dst_aligned, src_aligned; | |||
| vec_u8 vsrc, vdst; | |||
| @@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ | |||
| dst_aligned = !((unsigned long)dst & 0xf); | |||
| src_aligned = !((unsigned long)src & 0xf); | |||
| for (y=0; y<h; y++) { | |||
| for (y = 0; y < height; y++) { | |||
| vdst = vec_ld(0, dst); | |||
| vsrc = vec_ld(0, src); | |||
| @@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ | |||
| } | |||
| } | |||
| #define H264_WEIGHT(W,H) \ | |||
| static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ | |||
| weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ | |||
| #define H264_WEIGHT(W) \ | |||
| static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \ | |||
| int log2_denom, int weight, int offset){ \ | |||
| weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \ | |||
| }\ | |||
| static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ | |||
| biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ | |||
| static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \ | |||
| int log2_denom, int weightd, int weights, int offset){ \ | |||
| biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ | |||
| } | |||
| H264_WEIGHT(16,16) | |||
| H264_WEIGHT(16, 8) | |||
| H264_WEIGHT( 8,16) | |||
| H264_WEIGHT( 8, 8) | |||
| H264_WEIGHT( 8, 4) | |||
| H264_WEIGHT(16) | |||
| H264_WEIGHT( 8) | |||
| void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { | |||
| const int high_bit_depth = avctx->bits_per_raw_sample > 8; | |||
| @@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom | |||
| c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; | |||
| c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; | |||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; | |||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; | |||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; | |||
| c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; | |||
| c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; | |||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; | |||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; | |||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; | |||
| c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; | |||
| c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; | |||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; | |||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; | |||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; | |||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; | |||
| } | |||
| } | |||
| } | |||
| @@ -28,21 +28,20 @@ SECTION .text | |||
| ;----------------------------------------------------------------------------- | |||
| ; biweight pred: | |||
| ; | |||
| ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, | |||
| ; int log2_denom, int weightd, int weights, | |||
| ; int offset); | |||
| ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, | |||
| ; int height, int log2_denom, int weightd, | |||
| ; int weights, int offset); | |||
| ; and | |||
| ; void h264_weight_16x16_sse2(uint8_t *dst, int stride, | |||
| ; int log2_denom, int weight, | |||
| ; int offset); | |||
| ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, | |||
| ; int log2_denom, int weight, int offset); | |||
| ;----------------------------------------------------------------------------- | |||
| %macro WEIGHT_SETUP 0 | |||
| add r4, r4 | |||
| inc r4 | |||
| movd m3, r3d | |||
| movd m5, r4d | |||
| movd m6, r2d | |||
| add r5, r5 | |||
| inc r5 | |||
| movd m3, r4d | |||
| movd m5, r5d | |||
| movd m6, r3d | |||
| pslld m5, m6 | |||
| psrld m5, 1 | |||
| %if mmsize == 16 | |||
| @@ -71,60 +70,41 @@ SECTION .text | |||
| packuswb m0, m1 | |||
| %endmacro | |||
| %macro WEIGHT_FUNC_DBL_MM 1 | |||
| cglobal h264_weight_16x%1_mmx2, 5, 5, 0 | |||
| INIT_MMX | |||
| cglobal h264_weight_16_mmx2, 6, 6, 0 | |||
| WEIGHT_SETUP | |||
| mov r2, %1 | |||
| %if %1 == 16 | |||
| .nextrow | |||
| WEIGHT_OP 0, 4 | |||
| mova [r0 ], m0 | |||
| WEIGHT_OP 8, 12 | |||
| mova [r0+8], m0 | |||
| add r0, r1 | |||
| dec r2 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| WEIGHT_FUNC_DBL_MM 16 | |||
| WEIGHT_FUNC_DBL_MM 8 | |||
| %macro WEIGHT_FUNC_MM 4 | |||
| cglobal h264_weight_%1x%2_%4, 7, 7, %3 | |||
| %macro WEIGHT_FUNC_MM 3 | |||
| cglobal h264_weight_%1_%3, 6, 6, %2 | |||
| WEIGHT_SETUP | |||
| mov r2, %2 | |||
| %if %2 == 16 | |||
| .nextrow | |||
| WEIGHT_OP 0, mmsize/2 | |||
| mova [r0], m0 | |||
| add r0, r1 | |||
| dec r2 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_weight_%1x16_%4.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| WEIGHT_FUNC_MM 8, 16, 0, mmx2 | |||
| WEIGHT_FUNC_MM 8, 8, 0, mmx2 | |||
| WEIGHT_FUNC_MM 8, 4, 0, mmx2 | |||
| WEIGHT_FUNC_MM 8, 0, mmx2 | |||
| INIT_XMM | |||
| WEIGHT_FUNC_MM 16, 16, 8, sse2 | |||
| WEIGHT_FUNC_MM 16, 8, 8, sse2 | |||
| WEIGHT_FUNC_MM 16, 8, sse2 | |||
| %macro WEIGHT_FUNC_HALF_MM 5 | |||
| cglobal h264_weight_%1x%2_%5, 5, 5, %4 | |||
| %macro WEIGHT_FUNC_HALF_MM 3 | |||
| cglobal h264_weight_%1_%3, 6, 6, %2 | |||
| WEIGHT_SETUP | |||
| mov r2, %2/2 | |||
| sar r2d, 1 | |||
| lea r3, [r1*2] | |||
| %if %2 == mmsize | |||
| .nextrow | |||
| WEIGHT_OP 0, r1 | |||
| movh [r0], m0 | |||
| @@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 | |||
| movh [r0+r1], m0 | |||
| %endif | |||
| add r0, r3 | |||
| dec r2 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |||
| WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |||
| WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||
| INIT_XMM | |||
| WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |||
| WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |||
| WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||
| %macro BIWEIGHT_SETUP 0 | |||
| add r6, 1 | |||
| or r6, 1 | |||
| add r3, 1 | |||
| movd m3, r4d | |||
| movd m4, r5d | |||
| movd m5, r6d | |||
| movd m6, r3d | |||
| %ifdef ARCH_X86_64 | |||
| %define off_regd r11d | |||
| %else | |||
| %define off_regd r3d | |||
| %endif | |||
| mov off_regd, r7m | |||
| add off_regd, 1 | |||
| or off_regd, 1 | |||
| add r4, 1 | |||
| movd m3, r5d | |||
| movd m4, r6d | |||
| movd m5, off_regd | |||
| movd m6, r4d | |||
| pslld m5, m6 | |||
| psrld m5, 1 | |||
| %if mmsize == 16 | |||
| @@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||
| packuswb m0, m1 | |||
| %endmacro | |||
| %macro BIWEIGHT_FUNC_DBL_MM 1 | |||
| cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 | |||
| INIT_MMX | |||
| cglobal h264_biweight_16_mmx2, 7, 7, 0 | |||
| BIWEIGHT_SETUP | |||
| mov r3, %1 | |||
| %if %1 == 16 | |||
| movifnidn r3d, r3m | |||
| .nextrow | |||
| BIWEIGHT_STEPA 0, 1, 0 | |||
| BIWEIGHT_STEPA 1, 2, 4 | |||
| @@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 | |||
| mova [r0+8], m0 | |||
| add r0, r2 | |||
| add r1, r2 | |||
| dec r3 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| BIWEIGHT_FUNC_DBL_MM 16 | |||
| BIWEIGHT_FUNC_DBL_MM 8 | |||
| %macro BIWEIGHT_FUNC_MM 4 | |||
| cglobal h264_biweight_%1x%2_%4, 7, 7, %3 | |||
| %macro BIWEIGHT_FUNC_MM 3 | |||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||
| BIWEIGHT_SETUP | |||
| mov r3, %2 | |||
| %if %2 == 16 | |||
| movifnidn r3d, r3m | |||
| .nextrow | |||
| BIWEIGHT_STEPA 0, 1, 0 | |||
| BIWEIGHT_STEPA 1, 2, mmsize/2 | |||
| @@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 | |||
| mova [r0], m0 | |||
| add r0, r2 | |||
| add r1, r2 | |||
| dec r3 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 | |||
| BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 | |||
| BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 | |||
| BIWEIGHT_FUNC_MM 8, 0, mmx2 | |||
| INIT_XMM | |||
| BIWEIGHT_FUNC_MM 16, 16, 8, sse2 | |||
| BIWEIGHT_FUNC_MM 16, 8, 8, sse2 | |||
| BIWEIGHT_FUNC_MM 16, 8, sse2 | |||
| %macro BIWEIGHT_FUNC_HALF_MM 5 | |||
| cglobal h264_biweight_%1x%2_%5, 7, 7, %4 | |||
| %macro BIWEIGHT_FUNC_HALF_MM 3 | |||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||
| BIWEIGHT_SETUP | |||
| mov r3, %2/2 | |||
| movifnidn r3d, r3m | |||
| sar r3, 1 | |||
| lea r4, [r2*2] | |||
| %if %2 == mmsize | |||
| .nextrow | |||
| BIWEIGHT_STEPA 0, 1, 0 | |||
| BIWEIGHT_STEPA 1, 2, r2 | |||
| @@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 | |||
| %endif | |||
| add r0, r4 | |||
| add r1, r4 | |||
| dec r3 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_MMX | |||
| BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |||
| BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |||
| BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |||
| BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||
| INIT_XMM | |||
| BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |||
| BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |||
| BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||
| BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||
| %macro BIWEIGHT_SSSE3_SETUP 0 | |||
| add r6, 1 | |||
| or r6, 1 | |||
| add r3, 1 | |||
| movd m4, r4d | |||
| movd m0, r5d | |||
| movd m5, r6d | |||
| movd m6, r3d | |||
| %ifdef ARCH_X86_64 | |||
| %define off_regd r11d | |||
| %else | |||
| %define off_regd r3d | |||
| %endif | |||
| mov off_regd, r7m | |||
| add off_regd, 1 | |||
| or off_regd, 1 | |||
| add r4, 1 | |||
| movd m4, r5d | |||
| movd m0, r6d | |||
| movd m5, off_regd | |||
| movd m6, r4d | |||
| pslld m5, m6 | |||
| psrld m5, 1 | |||
| punpcklbw m4, m0 | |||
| @@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||
| packuswb m0, m2 | |||
| %endmacro | |||
| %macro BIWEIGHT_SSSE3_16 1 | |||
| cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 | |||
| INIT_XMM | |||
| cglobal h264_biweight_16_ssse3, 7, 7, 8 | |||
| BIWEIGHT_SSSE3_SETUP | |||
| mov r3, %1 | |||
| movifnidn r3d, r3m | |||
| %if %1 == 16 | |||
| .nextrow | |||
| movh m0, [r0] | |||
| movh m2, [r0+8] | |||
| @@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 | |||
| mova [r0], m0 | |||
| add r0, r2 | |||
| add r1, r2 | |||
| dec r3 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_XMM | |||
| BIWEIGHT_SSSE3_16 16 | |||
| BIWEIGHT_SSSE3_16 8 | |||
| %macro BIWEIGHT_SSSE3_8 1 | |||
| cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 | |||
| cglobal h264_biweight_8_ssse3, 7, 7, 8 | |||
| BIWEIGHT_SSSE3_SETUP | |||
| mov r3, %1/2 | |||
| movifnidn r3d, r3m | |||
| sar r3, 1 | |||
| lea r4, [r2*2] | |||
| %if %1 == 16 | |||
| .nextrow | |||
| movh m0, [r0] | |||
| movh m1, [r1] | |||
| @@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 | |||
| movhps [r0+r2], m0 | |||
| add r0, r4 | |||
| add r1, r4 | |||
| dec r3 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| %else | |||
| jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) | |||
| %endif | |||
| %endmacro | |||
| INIT_XMM | |||
| BIWEIGHT_SSSE3_8 16 | |||
| BIWEIGHT_SSSE3_8 8 | |||
| BIWEIGHT_SSSE3_8 4 | |||
| @@ -36,33 +36,26 @@ cextern pw_1 | |||
| SECTION .text | |||
| ;----------------------------------------------------------------------------- | |||
| ; void h264_weight(uint8_t *dst, int stride, int log2_denom, | |||
| ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, | |||
| ; int weight, int offset); | |||
| ;----------------------------------------------------------------------------- | |||
| %ifdef ARCH_X86_32 | |||
| DECLARE_REG_TMP 2 | |||
| %else | |||
| DECLARE_REG_TMP 10 | |||
| %endif | |||
| %macro WEIGHT_PROLOGUE 1 | |||
| mov t0, %1 | |||
| %macro WEIGHT_PROLOGUE 0 | |||
| .prologue | |||
| PROLOGUE 0,5,8 | |||
| PROLOGUE 0,6,8 | |||
| movifnidn r0, r0mp | |||
| movifnidn r1d, r1m | |||
| movifnidn r3d, r3m | |||
| movifnidn r4d, r4m | |||
| movifnidn r5d, r5m | |||
| %endmacro | |||
| %macro WEIGHT_SETUP 1 | |||
| mova m0, [pw_1] | |||
| movd m2, r2m | |||
| movd m2, r3m | |||
| pslld m0, m2 ; 1<<log2_denom | |||
| SPLATW m0, m0 | |||
| shl r4, 19 ; *8, move to upper half of dword | |||
| lea r4, [r4+r3*2+0x10000] | |||
| movd m3, r4d ; weight<<1 | 1+(offset<<(3)) | |||
| shl r5, 19 ; *8, move to upper half of dword | |||
| lea r5, [r5+r4*2+0x10000] | |||
| movd m3, r5d ; weight<<1 | 1+(offset<<(3)) | |||
| pshufd m3, m3, 0 | |||
| mova m4, [pw_pixel_max] | |||
| paddw m2, [sq_1] ; log2_denom+1 | |||
| @@ -96,8 +89,8 @@ DECLARE_REG_TMP 10 | |||
| %endmacro | |||
| %macro WEIGHT_FUNC_DBL 1 | |||
| cglobal h264_weight_16x16_10_%1 | |||
| WEIGHT_PROLOGUE 16 | |||
| cglobal h264_weight_16_10_%1 | |||
| WEIGHT_PROLOGUE | |||
| WEIGHT_SETUP %1 | |||
| .nextrow | |||
| WEIGHT_OP %1, 0 | |||
| @@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1 | |||
| WEIGHT_OP %1, 16 | |||
| mova [r0+16], m5 | |||
| add r0, r1 | |||
| dec t0 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_weight_16x8_10_%1 | |||
| mov t0, 8 | |||
| jmp mangle(ff_h264_weight_16x16_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4 | |||
| %macro WEIGHT_FUNC_MM 1 | |||
| cglobal h264_weight_8x16_10_%1 | |||
| WEIGHT_PROLOGUE 16 | |||
| cglobal h264_weight_8_10_%1 | |||
| WEIGHT_PROLOGUE | |||
| WEIGHT_SETUP %1 | |||
| .nextrow | |||
| WEIGHT_OP %1, 0 | |||
| mova [r0], m5 | |||
| add r0, r1 | |||
| dec t0 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_weight_8x8_10_%1 | |||
| mov t0, 8 | |||
| jmp mangle(ff_h264_weight_8x16_10_%1.prologue) | |||
| cglobal h264_weight_8x4_10_%1 | |||
| mov t0, 4 | |||
| jmp mangle(ff_h264_weight_8x16_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4 | |||
| %macro WEIGHT_FUNC_HALF_MM 1 | |||
| cglobal h264_weight_4x8_10_%1 | |||
| WEIGHT_PROLOGUE 4 | |||
| cglobal h264_weight_4_10_%1 | |||
| WEIGHT_PROLOGUE | |||
| sar r2d, 1 | |||
| WEIGHT_SETUP %1 | |||
| lea r3, [r1*2] | |||
| .nextrow | |||
| @@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1 | |||
| movh [r0], m5 | |||
| movhps [r0+r1], m5 | |||
| add r0, r3 | |||
| dec t0 | |||
| dec r2d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_weight_4x4_10_%1 | |||
| mov t0, 2 | |||
| jmp mangle(ff_h264_weight_4x8_10_%1.prologue) | |||
| cglobal h264_weight_4x2_10_%1 | |||
| mov t0, 1 | |||
| jmp mangle(ff_h264_weight_4x8_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4 | |||
| ;----------------------------------------------------------------------------- | |||
| ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, | |||
| ; int weightd, int weights, int offset); | |||
| ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, | |||
| ; int log2_denom, int weightd, int weights, int offset); | |||
| ;----------------------------------------------------------------------------- | |||
| %ifdef ARCH_X86_32 | |||
| DECLARE_REG_TMP 2,3 | |||
| DECLARE_REG_TMP 3 | |||
| %else | |||
| DECLARE_REG_TMP 10,2 | |||
| DECLARE_REG_TMP 10 | |||
| %endif | |||
| %macro BIWEIGHT_PROLOGUE 1 | |||
| mov t0, %1 | |||
| %macro BIWEIGHT_PROLOGUE 0 | |||
| .prologue | |||
| PROLOGUE 0,7,8 | |||
| movifnidn r0, r0mp | |||
| movifnidn r1, r1mp | |||
| movifnidn t1d, r2m | |||
| movifnidn r4d, r4m | |||
| movifnidn r2d, r2m | |||
| movifnidn r5d, r5m | |||
| movifnidn r6d, r6m | |||
| movifnidn t0d, r7m | |||
| %endmacro | |||
| %macro BIWEIGHT_SETUP 1 | |||
| lea r6, [r6*4+1] ; (offset<<2)+1 | |||
| or r6, 1 | |||
| shl r5, 16 | |||
| or r4, r5 | |||
| movd m4, r4d ; weightd | weights | |||
| movd m5, r6d ; (offset+1)|1 | |||
| movd m6, r3m ; log2_denom | |||
| lea t0, [t0*4+1] ; (offset<<2)+1 | |||
| or t0, 1 | |||
| shl r6, 16 | |||
| or r5, r6 | |||
| movd m4, r5d ; weightd | weights | |||
| movd m5, t0d ; (offset+1)|1 | |||
| movd m6, r4m ; log2_denom | |||
| pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom | |||
| paddd m6, [sq_1] | |||
| pshufd m4, m4, 0 | |||
| pshufd m5, m5, 0 | |||
| mova m3, [pw_pixel_max] | |||
| movifnidn r3d, r3m | |||
| %ifnidn %1, sse4 | |||
| pxor m7, m7 | |||
| %endif | |||
| @@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2 | |||
| %endmacro | |||
| %macro BIWEIGHT_FUNC_DBL 1 | |||
| cglobal h264_biweight_16x16_10_%1 | |||
| BIWEIGHT_PROLOGUE 16 | |||
| cglobal h264_biweight_16_10_%1 | |||
| BIWEIGHT_PROLOGUE | |||
| BIWEIGHT_SETUP %1 | |||
| .nextrow | |||
| BIWEIGHT %1, 0 | |||
| mova [r0 ], m0 | |||
| BIWEIGHT %1, 16 | |||
| mova [r0+16], m0 | |||
| add r0, t1 | |||
| add r1, t1 | |||
| dec t0 | |||
| add r0, r2 | |||
| add r1, r2 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_biweight_16x8_10_%1 | |||
| mov t0, 8 | |||
| jmp mangle(ff_h264_biweight_16x16_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2 | |||
| BIWEIGHT_FUNC_DBL sse4 | |||
| %macro BIWEIGHT_FUNC 1 | |||
| cglobal h264_biweight_8x16_10_%1 | |||
| BIWEIGHT_PROLOGUE 16 | |||
| cglobal h264_biweight_8_10_%1 | |||
| BIWEIGHT_PROLOGUE | |||
| BIWEIGHT_SETUP %1 | |||
| .nextrow | |||
| BIWEIGHT %1, 0 | |||
| mova [r0], m0 | |||
| add r0, t1 | |||
| add r1, t1 | |||
| dec t0 | |||
| add r0, r2 | |||
| add r1, r2 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_biweight_8x8_10_%1 | |||
| mov t0, 8 | |||
| jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) | |||
| cglobal h264_biweight_8x4_10_%1 | |||
| mov t0, 4 | |||
| jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2 | |||
| BIWEIGHT_FUNC sse4 | |||
| %macro BIWEIGHT_FUNC_HALF 1 | |||
| cglobal h264_biweight_4x8_10_%1 | |||
| BIWEIGHT_PROLOGUE 4 | |||
| cglobal h264_biweight_4_10_%1 | |||
| BIWEIGHT_PROLOGUE | |||
| BIWEIGHT_SETUP %1 | |||
| lea r4, [t1*2] | |||
| sar r3d, 1 | |||
| lea r4, [r2*2] | |||
| .nextrow | |||
| BIWEIGHT %1, 0, t1 | |||
| BIWEIGHT %1, 0, r2 | |||
| movh [r0 ], m0 | |||
| movhps [r0+t1], m0 | |||
| movhps [r0+r2], m0 | |||
| add r0, r4 | |||
| add r1, r4 | |||
| dec t0 | |||
| dec r3d | |||
| jnz .nextrow | |||
| REP_RET | |||
| cglobal h264_biweight_4x4_10_%1 | |||
| mov t0, 2 | |||
| jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) | |||
| cglobal h264_biweight_4x2_10_%1 | |||
| mov t0, 1 | |||
| jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) | |||
| %endmacro | |||
| INIT_XMM | |||
| @@ -298,57 +298,47 @@ LF_IFUNC(v, luma_intra, 10, mmxext) | |||
| /***********************************/ | |||
| /* weighted prediction */ | |||
| #define H264_WEIGHT(W, H, OPT) \ | |||
| void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ | |||
| int stride, int log2_denom, int weight, int offset); | |||
| #define H264_WEIGHT(W, OPT) \ | |||
| void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \ | |||
| int stride, int height, int log2_denom, int weight, int offset); | |||
| #define H264_BIWEIGHT(W, H, OPT) \ | |||
| void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ | |||
| uint8_t *src, int stride, int log2_denom, int weightd, \ | |||
| #define H264_BIWEIGHT(W, OPT) \ | |||
| void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \ | |||
| uint8_t *src, int stride, int height, int log2_denom, int weightd, \ | |||
| int weights, int offset); | |||
| #define H264_BIWEIGHT_MMX(W,H) \ | |||
| H264_WEIGHT (W, H, mmx2) \ | |||
| H264_BIWEIGHT(W, H, mmx2) | |||
| #define H264_BIWEIGHT_MMX_SSE(W,H) \ | |||
| H264_BIWEIGHT_MMX(W, H) \ | |||
| H264_WEIGHT (W, H, sse2) \ | |||
| H264_BIWEIGHT (W, H, sse2) \ | |||
| H264_BIWEIGHT (W, H, ssse3) | |||
| H264_BIWEIGHT_MMX_SSE(16, 16) | |||
| H264_BIWEIGHT_MMX_SSE(16, 8) | |||
| H264_BIWEIGHT_MMX_SSE( 8, 16) | |||
| H264_BIWEIGHT_MMX_SSE( 8, 8) | |||
| H264_BIWEIGHT_MMX_SSE( 8, 4) | |||
| H264_BIWEIGHT_MMX ( 4, 8) | |||
| H264_BIWEIGHT_MMX ( 4, 4) | |||
| H264_BIWEIGHT_MMX ( 4, 2) | |||
| #define H264_WEIGHT_10(W, H, DEPTH, OPT) \ | |||
| void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||
| int stride, int log2_denom, int weight, int offset); | |||
| #define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \ | |||
| void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \ | |||
| (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \ | |||
| #define H264_BIWEIGHT_MMX(W) \ | |||
| H264_WEIGHT (W, mmx2) \ | |||
| H264_BIWEIGHT(W, mmx2) | |||
| #define H264_BIWEIGHT_MMX_SSE(W) \ | |||
| H264_BIWEIGHT_MMX(W) \ | |||
| H264_WEIGHT (W, sse2) \ | |||
| H264_BIWEIGHT (W, sse2) \ | |||
| H264_BIWEIGHT (W, ssse3) | |||
| H264_BIWEIGHT_MMX_SSE(16) | |||
| H264_BIWEIGHT_MMX_SSE( 8) | |||
| H264_BIWEIGHT_MMX ( 4) | |||
| #define H264_WEIGHT_10(W, DEPTH, OPT) \ | |||
| void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||
| int stride, int height, int log2_denom, int weight, int offset); | |||
| #define H264_BIWEIGHT_10(W, DEPTH, OPT) \ | |||
| void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \ | |||
| (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \ | |||
| int weightd, int weights, int offset); | |||
| #define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ | |||
| H264_WEIGHT_10 (W, H, DEPTH, sse2) \ | |||
| H264_WEIGHT_10 (W, H, DEPTH, sse4) \ | |||
| H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ | |||
| H264_BIWEIGHT_10(W, H, DEPTH, sse4) | |||
| H264_BIWEIGHT_10_SSE(16, 16, 10) | |||
| H264_BIWEIGHT_10_SSE(16, 8, 10) | |||
| H264_BIWEIGHT_10_SSE( 8, 16, 10) | |||
| H264_BIWEIGHT_10_SSE( 8, 8, 10) | |||
| H264_BIWEIGHT_10_SSE( 8, 4, 10) | |||
| H264_BIWEIGHT_10_SSE( 4, 8, 10) | |||
| H264_BIWEIGHT_10_SSE( 4, 4, 10) | |||
| H264_BIWEIGHT_10_SSE( 4, 2, 10) | |||
| #define H264_BIWEIGHT_10_SSE(W, DEPTH) \ | |||
| H264_WEIGHT_10 (W, DEPTH, sse2) \ | |||
| H264_WEIGHT_10 (W, DEPTH, sse4) \ | |||
| H264_BIWEIGHT_10(W, DEPTH, sse2) \ | |||
| H264_BIWEIGHT_10(W, DEPTH, sse4) | |||
| H264_BIWEIGHT_10_SSE(16, 10) | |||
| H264_BIWEIGHT_10_SSE( 8, 10) | |||
| H264_BIWEIGHT_10_SSE( 4, 10) | |||
| void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) | |||
| { | |||
| @@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||
| c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; | |||
| c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; | |||
| #endif | |||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | |||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | |||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; | |||
| c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; | |||
| c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; | |||
| c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; | |||
| c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; | |||
| c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; | |||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; | |||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; | |||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; | |||
| c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; | |||
| c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; | |||
| c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; | |||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; | |||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; | |||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2; | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2; | |||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2; | |||
| if (mm_flags&AV_CPU_FLAG_SSE2) { | |||
| c->h264_idct8_add = ff_h264_idct8_add_8_sse2; | |||
| @@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||
| c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; | |||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; | |||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; | |||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; | |||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; | |||
| c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; | |||
| c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; | |||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2; | |||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2; | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; | |||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; | |||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; | |||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2; | |||
| #if HAVE_ALIGNED_STACK | |||
| c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; | |||
| @@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||
| #endif | |||
| } | |||
| if (mm_flags&AV_CPU_FLAG_SSSE3) { | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; | |||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; | |||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; | |||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; | |||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3; | |||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3; | |||
| } | |||
| if (mm_flags&AV_CPU_FLAG_AVX) { | |||
| #if HAVE_ALIGNED_STACK | |||
| @@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||
| c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; | |||
| #endif | |||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; | |||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; | |||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; | |||
| c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; | |||
| c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; | |||
| c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; | |||
| c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; | |||
| c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; | |||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; | |||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; | |||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; | |||
| c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; | |||
| c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; | |||
| c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; | |||
| c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; | |||
| c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; | |||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; | |||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; | |||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; | |||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; | |||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; | |||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; | |||
| c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; | |||
| c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; | |||
| @@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||
| #endif | |||
| } | |||
| if (mm_flags&AV_CPU_FLAG_SSE4) { | |||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; | |||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; | |||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; | |||
| c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; | |||
| c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; | |||
| c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; | |||
| c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; | |||
| c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; | |||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; | |||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; | |||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; | |||
| c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; | |||
| c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; | |||
| c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; | |||
| c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; | |||
| c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; | |||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; | |||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; | |||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; | |||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; | |||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; | |||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; | |||
| } | |||
| #if HAVE_AVX | |||
| if (mm_flags&AV_CPU_FLAG_AVX) { | |||