Neon parts by Mans Rullgard <mans@mansr.com>.tags/n0.9
| @@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, | |||||
| void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, | void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, | ||||
| int beta, int8_t *tc0); | int beta, int8_t *tc0); | ||||
| void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, | |||||
| int weight, int offset); | |||||
| void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, | |||||
| int log2_den, int weight, int offset); | |||||
| void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, | |||||
| int log2_den, int weight, int offset); | |||||
| void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, | |||||
| int log2_den, int weight, int offset); | |||||
| void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int log2_den, int weightd, int weights, | |||||
| int offset); | |||||
| void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int height, int log2_den, int weightd, | |||||
| int weights, int offset); | |||||
| void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int height, int log2_den, int weightd, | |||||
| int weights, int offset); | |||||
| void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, | |||||
| int height, int log2_den, int weightd, | |||||
| int weights, int offset); | |||||
| void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); | void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); | ||||
| void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); | void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); | ||||
| @@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i | |||||
| c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; | c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; | ||||
| c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; | c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; | ||||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; | |||||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; | |||||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; | |||||
| c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; | |||||
| c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; | |||||
| c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; | |||||
| c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; | |||||
| c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; | |||||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; | |||||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; | |||||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; | |||||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; | |||||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; | |||||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; | |||||
| c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; | |||||
| c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; | |||||
| c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; | |||||
| c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; | |||||
| c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; | |||||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; | |||||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; | |||||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; | |||||
| c->h264_idct_add = ff_h264_idct_add_neon; | c->h264_idct_add = ff_h264_idct_add_neon; | ||||
| c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; | c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; | ||||
| @@ -1592,7 +1592,7 @@ endfunc | |||||
| vdup.8 d1, r5 | vdup.8 d1, r5 | ||||
| vmov q2, q8 | vmov q2, q8 | ||||
| vmov q3, q8 | vmov q3, q8 | ||||
| 1: subs ip, ip, #2 | |||||
| 1: subs r3, r3, #2 | |||||
| vld1.8 {d20-d21},[r0,:128], r2 | vld1.8 {d20-d21},[r0,:128], r2 | ||||
| \macd q2, d0, d20 | \macd q2, d0, d20 | ||||
| pld [r0] | pld [r0] | ||||
| @@ -1632,7 +1632,7 @@ endfunc | |||||
| vdup.8 d1, r5 | vdup.8 d1, r5 | ||||
| vmov q1, q8 | vmov q1, q8 | ||||
| vmov q10, q8 | vmov q10, q8 | ||||
| 1: subs ip, ip, #2 | |||||
| 1: subs r3, r3, #2 | |||||
| vld1.8 {d4},[r0,:64], r2 | vld1.8 {d4},[r0,:64], r2 | ||||
| \macd q1, d0, d4 | \macd q1, d0, d4 | ||||
| pld [r0] | pld [r0] | ||||
| @@ -1662,7 +1662,7 @@ endfunc | |||||
| vdup.8 d1, r5 | vdup.8 d1, r5 | ||||
| vmov q1, q8 | vmov q1, q8 | ||||
| vmov q10, q8 | vmov q10, q8 | ||||
| 1: subs ip, ip, #4 | |||||
| 1: subs r3, r3, #4 | |||||
| vld1.32 {d4[0]},[r0,:32], r2 | vld1.32 {d4[0]},[r0,:32], r2 | ||||
| vld1.32 {d4[1]},[r0,:32], r2 | vld1.32 {d4[1]},[r0,:32], r2 | ||||
| \macd q1, d0, d4 | \macd q1, d0, d4 | ||||
| @@ -1700,16 +1700,17 @@ endfunc | |||||
| .endm | .endm | ||||
| .macro biweight_func w | .macro biweight_func w | ||||
| function biweight_h264_pixels_\w\()_neon | |||||
| function ff_biweight_h264_pixels_\w\()_neon, export=1 | |||||
| push {r4-r6, lr} | push {r4-r6, lr} | ||||
| add r4, sp, #16 | |||||
| ldr r12, [sp, #16] | |||||
| add r4, sp, #20 | |||||
| ldm r4, {r4-r6} | ldm r4, {r4-r6} | ||||
| lsr lr, r4, #31 | lsr lr, r4, #31 | ||||
| add r6, r6, #1 | add r6, r6, #1 | ||||
| eors lr, lr, r5, lsr #30 | eors lr, lr, r5, lsr #30 | ||||
| orr r6, r6, #1 | orr r6, r6, #1 | ||||
| vdup.16 q9, r3 | |||||
| lsl r6, r6, r3 | |||||
| vdup.16 q9, r12 | |||||
| lsl r6, r6, r12 | |||||
| vmvn q9, q9 | vmvn q9, q9 | ||||
| vdup.16 q8, r6 | vdup.16 q8, r6 | ||||
| mov r6, r0 | mov r6, r0 | ||||
| @@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon | |||||
| endfunc | endfunc | ||||
| .endm | .endm | ||||
| .macro biweight_entry w, h, b=1 | |||||
| function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |||||
| mov ip, #\h | |||||
| .if \b | |||||
| b biweight_h264_pixels_\w\()_neon | |||||
| .endif | |||||
| endfunc | |||||
| .endm | |||||
| biweight_entry 16, 8 | |||||
| biweight_entry 16, 16, b=0 | |||||
| biweight_func 16 | biweight_func 16 | ||||
| biweight_entry 8, 16 | |||||
| biweight_entry 8, 4 | |||||
| biweight_entry 8, 8, b=0 | |||||
| biweight_func 8 | biweight_func 8 | ||||
| biweight_entry 4, 8 | |||||
| biweight_entry 4, 2 | |||||
| biweight_entry 4, 4, b=0 | |||||
| biweight_func 4 | biweight_func 4 | ||||
| @ Weighted prediction | @ Weighted prediction | ||||
| .macro weight_16 add | .macro weight_16 add | ||||
| vdup.8 d0, r3 | |||||
| 1: subs ip, ip, #2 | |||||
| vdup.8 d0, r12 | |||||
| 1: subs r2, r2, #2 | |||||
| vld1.8 {d20-d21},[r0,:128], r1 | vld1.8 {d20-d21},[r0,:128], r1 | ||||
| vmull.u8 q2, d0, d20 | vmull.u8 q2, d0, d20 | ||||
| pld [r0] | pld [r0] | ||||
| @@ -1785,8 +1767,8 @@ endfunc | |||||
| .endm | .endm | ||||
| .macro weight_8 add | .macro weight_8 add | ||||
| vdup.8 d0, r3 | |||||
| 1: subs ip, ip, #2 | |||||
| vdup.8 d0, r12 | |||||
| 1: subs r2, r2, #2 | |||||
| vld1.8 {d4},[r0,:64], r1 | vld1.8 {d4},[r0,:64], r1 | ||||
| vmull.u8 q1, d0, d4 | vmull.u8 q1, d0, d4 | ||||
| pld [r0] | pld [r0] | ||||
| @@ -1806,10 +1788,10 @@ endfunc | |||||
| .endm | .endm | ||||
| .macro weight_4 add | .macro weight_4 add | ||||
| vdup.8 d0, r3 | |||||
| vdup.8 d0, r12 | |||||
| vmov q1, q8 | vmov q1, q8 | ||||
| vmov q10, q8 | vmov q10, q8 | ||||
| 1: subs ip, ip, #4 | |||||
| 1: subs r2, r2, #4 | |||||
| vld1.32 {d4[0]},[r0,:32], r1 | vld1.32 {d4[0]},[r0,:32], r1 | ||||
| vld1.32 {d4[1]},[r0,:32], r1 | vld1.32 {d4[1]},[r0,:32], r1 | ||||
| vmull.u8 q1, d0, d4 | vmull.u8 q1, d0, d4 | ||||
| @@ -1842,50 +1824,32 @@ endfunc | |||||
| .endm | .endm | ||||
| .macro weight_func w | .macro weight_func w | ||||
| function weight_h264_pixels_\w\()_neon | |||||
| function ff_weight_h264_pixels_\w\()_neon, export=1 | |||||
| push {r4, lr} | push {r4, lr} | ||||
| ldr r4, [sp, #8] | |||||
| cmp r2, #1 | |||||
| lsl r4, r4, r2 | |||||
| ldr r12, [sp, #8] | |||||
| ldr r4, [sp, #12] | |||||
| cmp r3, #1 | |||||
| lsl r4, r4, r3 | |||||
| vdup.16 q8, r4 | vdup.16 q8, r4 | ||||
| mov r4, r0 | mov r4, r0 | ||||
| ble 20f | ble 20f | ||||
| rsb lr, r2, #1 | |||||
| rsb lr, r3, #1 | |||||
| vdup.16 q9, lr | vdup.16 q9, lr | ||||
| cmp r3, #0 | |||||
| cmp r12, #0 | |||||
| blt 10f | blt 10f | ||||
| weight_\w vhadd.s16 | weight_\w vhadd.s16 | ||||
| 10: rsb r3, r3, #0 | |||||
| 10: rsb r12, r12, #0 | |||||
| weight_\w vhsub.s16 | weight_\w vhsub.s16 | ||||
| 20: rsb lr, r2, #0 | |||||
| 20: rsb lr, r3, #0 | |||||
| vdup.16 q9, lr | vdup.16 q9, lr | ||||
| cmp r3, #0 | |||||
| cmp r12, #0 | |||||
| blt 10f | blt 10f | ||||
| weight_\w vadd.s16 | weight_\w vadd.s16 | ||||
| 10: rsb r3, r3, #0 | |||||
| 10: rsb r12, r12, #0 | |||||
| weight_\w vsub.s16 | weight_\w vsub.s16 | ||||
| endfunc | endfunc | ||||
| .endm | .endm | ||||
| .macro weight_entry w, h, b=1 | |||||
| function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |||||
| mov ip, #\h | |||||
| .if \b | |||||
| b weight_h264_pixels_\w\()_neon | |||||
| .endif | |||||
| endfunc | |||||
| .endm | |||||
| weight_entry 16, 8 | |||||
| weight_entry 16, 16, b=0 | |||||
| weight_func 16 | weight_func 16 | ||||
| weight_entry 8, 16 | |||||
| weight_entry 8, 4 | |||||
| weight_entry 8, 8, b=0 | |||||
| weight_func 8 | weight_func 8 | ||||
| weight_entry 4, 8 | |||||
| weight_entry 4, 2 | |||||
| weight_entry 4, 4, b=0 | |||||
| weight_func 4 | weight_func 4 | ||||
| @@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){ | |||||
| } | } | ||||
| #endif | #endif | ||||
| static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, | |||||
| static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, | |||||
| int height, int delta, int list, | |||||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | ||||
| int src_x_offset, int src_y_offset, | int src_x_offset, int src_y_offset, | ||||
| qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, | qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, | ||||
| @@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, | |||||
| s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | ||||
| src_cb= s->edge_emu_buffer; | src_cb= s->edge_emu_buffer; | ||||
| } | } | ||||
| chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||||
| chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||||
| if(emu){ | if(emu){ | ||||
| s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); | ||||
| src_cr= s->edge_emu_buffer; | src_cr= s->edge_emu_buffer; | ||||
| } | } | ||||
| chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||||
| chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); | |||||
| } | } | ||||
| static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, | |||||
| static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, | |||||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | ||||
| int x_offset, int y_offset, | int x_offset, int y_offset, | ||||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | ||||
| @@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei | |||||
| if(list0){ | if(list0){ | ||||
| Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; | Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; | ||||
| mc_dir_part(h, ref, n, square, chroma_height, delta, 0, | |||||
| mc_dir_part(h, ref, n, square, height, delta, 0, | |||||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | dest_y, dest_cb, dest_cr, x_offset, y_offset, | ||||
| qpix_op, chroma_op, pixel_shift, chroma444); | qpix_op, chroma_op, pixel_shift, chroma444); | ||||
| @@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei | |||||
| if(list1){ | if(list1){ | ||||
| Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; | Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; | ||||
| mc_dir_part(h, ref, n, square, chroma_height, delta, 1, | |||||
| mc_dir_part(h, ref, n, square, height, delta, 1, | |||||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | dest_y, dest_cb, dest_cr, x_offset, y_offset, | ||||
| qpix_op, chroma_op, pixel_shift, chroma444); | qpix_op, chroma_op, pixel_shift, chroma444); | ||||
| } | } | ||||
| } | } | ||||
| static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, | |||||
| static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta, | |||||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | ||||
| int x_offset, int y_offset, | int x_offset, int y_offset, | ||||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | ||||
| @@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom | |||||
| h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, | h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, | ||||
| int list0, int list1, int pixel_shift, int chroma444){ | int list0, int list1, int pixel_shift, int chroma444){ | ||||
| MpegEncContext * const s = &h->s; | MpegEncContext * const s = &h->s; | ||||
| int chroma_height; | |||||
| dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | ||||
| if(chroma444){ | if(chroma444){ | ||||
| chroma_height = height; | |||||
| chroma_weight_avg = luma_weight_avg; | chroma_weight_avg = luma_weight_avg; | ||||
| chroma_weight_op = luma_weight_op; | chroma_weight_op = luma_weight_op; | ||||
| dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | ||||
| dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; | ||||
| } else if (CHROMA422) { | } else if (CHROMA422) { | ||||
| chroma_height = height; | |||||
| dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | ||||
| dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; | ||||
| }else{ | }else{ | ||||
| chroma_height = height >> 1; | |||||
| dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | ||||
| dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; | ||||
| } | } | ||||
| @@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom | |||||
| int refn0 = h->ref_cache[0][ scan8[n] ]; | int refn0 = h->ref_cache[0][ scan8[n] ]; | ||||
| int refn1 = h->ref_cache[1][ scan8[n] ]; | int refn1 = h->ref_cache[1][ scan8[n] ]; | ||||
| mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, | |||||
| mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, | |||||
| dest_y, dest_cb, dest_cr, | dest_y, dest_cb, dest_cr, | ||||
| x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | ||||
| mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, | |||||
| mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, | |||||
| tmp_y, tmp_cb, tmp_cr, | tmp_y, tmp_cb, tmp_cr, | ||||
| x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); | ||||
| if(h->use_weight == 2){ | if(h->use_weight == 2){ | ||||
| int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; | int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; | ||||
| int weight1 = 64 - weight0; | int weight1 = 64 - weight0; | ||||
| luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); | |||||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); | |||||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); | |||||
| if (CHROMA422) { | |||||
| chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, | |||||
| tmp_cb + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, 5, weight0, weight1, 0); | |||||
| chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, | |||||
| tmp_cr + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, 5, weight0, weight1, 0); | |||||
| } | |||||
| luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, | |||||
| height, 5, weight0, weight1, 0); | |||||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, | |||||
| chroma_height, 5, weight0, weight1, 0); | |||||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, | |||||
| chroma_height, 5, weight0, weight1, 0); | |||||
| }else{ | }else{ | ||||
| luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, | |||||
| luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom, | |||||
| h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], | h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], | ||||
| h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); | h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); | ||||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], | h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], | ||||
| h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); | h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); | ||||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], | h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], | ||||
| h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); | h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); | ||||
| if (CHROMA422) { | |||||
| chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, | |||||
| tmp_cb + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], | |||||
| h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); | |||||
| chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, | |||||
| tmp_cr + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], | |||||
| h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); | |||||
| } | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| int list = list1 ? 1 : 0; | int list = list1 ? 1 : 0; | ||||
| int refn = h->ref_cache[list][ scan8[n] ]; | int refn = h->ref_cache[list][ scan8[n] ]; | ||||
| Picture *ref= &h->ref_list[list][refn]; | Picture *ref= &h->ref_list[list][refn]; | ||||
| mc_dir_part(h, ref, n, square, chroma_height, delta, list, | |||||
| mc_dir_part(h, ref, n, square, height, delta, list, | |||||
| dest_y, dest_cb, dest_cr, x_offset, y_offset, | dest_y, dest_cb, dest_cr, x_offset, y_offset, | ||||
| qpix_put, chroma_put, pixel_shift, chroma444); | qpix_put, chroma_put, pixel_shift, chroma444); | ||||
| luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, | |||||
| luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, | |||||
| h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); | h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); | ||||
| if(h->use_weight_chroma){ | if(h->use_weight_chroma){ | ||||
| chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); | h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); | ||||
| chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); | h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); | ||||
| if (CHROMA422) { | |||||
| chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); | |||||
| chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, | |||||
| h->mb_uvlinesize, h->chroma_log2_weight_denom, | |||||
| h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, | |||||
| static inline void mc_part(H264Context *h, int n, int square, int height, int delta, | |||||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | ||||
| int x_offset, int y_offset, | int x_offset, int y_offset, | ||||
| qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, | ||||
| @@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height, | |||||
| if((h->use_weight==2 && list0 && list1 | if((h->use_weight==2 && list0 && list1 | ||||
| && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) | && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) | ||||
| || h->use_weight==1) | || h->use_weight==1) | ||||
| mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, | |||||
| mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, | |||||
| x_offset, y_offset, qpix_put, chroma_put, | x_offset, y_offset, qpix_put, chroma_put, | ||||
| weight_op[0], weight_op[3], weight_avg[0], | |||||
| weight_avg[3], list0, list1, pixel_shift, chroma444); | |||||
| weight_op[0], weight_op[1], weight_avg[0], | |||||
| weight_avg[1], list0, list1, pixel_shift, chroma444); | |||||
| else | else | ||||
| mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, | |||||
| mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, | |||||
| x_offset, y_offset, qpix_put, chroma_put, qpix_avg, | x_offset, y_offset, qpix_put, chroma_put, qpix_avg, | ||||
| chroma_avg, list0, list1, pixel_shift, chroma444); | chroma_avg, list0, list1, pixel_shift, chroma444); | ||||
| } | } | ||||
| @@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||||
| prefetch_motion(h, 0, pixel_shift, chroma444); | prefetch_motion(h, 0, pixel_shift, chroma444); | ||||
| if(IS_16X16(mb_type)){ | if(IS_16X16(mb_type)){ | ||||
| mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], | qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], | ||||
| weight_op, weight_avg, | weight_op, weight_avg, | ||||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else if(IS_16X8(mb_type)){ | }else if(IS_16X8(mb_type)){ | ||||
| mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | ||||
| &weight_op[1], &weight_avg[1], | |||||
| weight_op, weight_avg, | |||||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, | |||||
| mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, | |||||
| qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], | ||||
| &weight_op[1], &weight_avg[1], | |||||
| weight_op, weight_avg, | |||||
| IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else if(IS_8X16(mb_type)){ | }else if(IS_8X16(mb_type)){ | ||||
| mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, | |||||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | ||||
| &weight_op[2], &weight_avg[2], | |||||
| &weight_op[1], &weight_avg[1], | |||||
| IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, | |||||
| mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, | |||||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | ||||
| &weight_op[2], &weight_avg[2], | |||||
| &weight_op[1], &weight_avg[1], | |||||
| IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else{ | }else{ | ||||
| @@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||||
| int y_offset= (i&2)<<1; | int y_offset= (i&2)<<1; | ||||
| if(IS_SUB_8X8(sub_mb_type)){ | if(IS_SUB_8X8(sub_mb_type)){ | ||||
| mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], | ||||
| &weight_op[3], &weight_avg[3], | |||||
| &weight_op[1], &weight_avg[1], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else if(IS_SUB_8X4(sub_mb_type)){ | }else if(IS_SUB_8X4(sub_mb_type)){ | ||||
| mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | ||||
| &weight_op[4], &weight_avg[4], | |||||
| &weight_op[1], &weight_avg[1], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, | |||||
| mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, | |||||
| qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], | ||||
| &weight_op[4], &weight_avg[4], | |||||
| &weight_op[1], &weight_avg[1], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else if(IS_SUB_4X8(sub_mb_type)){ | }else if(IS_SUB_4X8(sub_mb_type)){ | ||||
| mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, | |||||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | ||||
| &weight_op[5], &weight_avg[5], | |||||
| &weight_op[2], &weight_avg[2], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, | |||||
| mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, | |||||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | ||||
| &weight_op[5], &weight_avg[5], | |||||
| &weight_op[2], &weight_avg[2], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| }else{ | }else{ | ||||
| @@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t | |||||
| for(j=0; j<4; j++){ | for(j=0; j<4; j++){ | ||||
| int sub_x_offset= x_offset + 2*(j&1); | int sub_x_offset= x_offset + 2*(j&1); | ||||
| int sub_y_offset= y_offset + (j&2); | int sub_y_offset= y_offset + (j&2); | ||||
| mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, | |||||
| mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, | |||||
| qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], | ||||
| &weight_op[6], &weight_avg[6], | |||||
| &weight_op[2], &weight_avg[2], | |||||
| IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), | ||||
| pixel_shift, chroma444); | pixel_shift, chroma444); | ||||
| } | } | ||||
| @@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo | |||||
| else\ | else\ | ||||
| c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ | c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ | ||||
| \ | \ | ||||
| c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ | |||||
| c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ | |||||
| c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ | |||||
| c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ | |||||
| c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ | |||||
| c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ | |||||
| c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ | |||||
| c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ | |||||
| c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ | |||||
| c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ | |||||
| c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ | |||||
| c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ | |||||
| c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ | |||||
| c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ | |||||
| c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ | |||||
| c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ | |||||
| c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ | |||||
| c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ | |||||
| c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ | |||||
| c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ | |||||
| c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\ | |||||
| c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\ | |||||
| c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\ | |||||
| c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\ | |||||
| c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\ | |||||
| c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\ | |||||
| c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\ | |||||
| c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\ | |||||
| \ | \ | ||||
| c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ | c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ | ||||
| c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ | c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ | ||||
| @@ -31,16 +31,18 @@ | |||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); | //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); | ||||
| typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); | |||||
| typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); | |||||
| typedef void (*h264_weight_func)(uint8_t *block, int stride, int height, | |||||
| int log2_denom, int weight, int offset); | |||||
| typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height, | |||||
| int log2_denom, int weightd, int weights, int offset); | |||||
| /** | /** | ||||
| * Context for storing H.264 DSP functions | * Context for storing H.264 DSP functions | ||||
| */ | */ | ||||
| typedef struct H264DSPContext{ | typedef struct H264DSPContext{ | ||||
| /* weighted MC */ | /* weighted MC */ | ||||
| h264_weight_func weight_h264_pixels_tab[10]; | |||||
| h264_biweight_func biweight_h264_pixels_tab[10]; | |||||
| h264_weight_func weight_h264_pixels_tab[4]; | |||||
| h264_biweight_func biweight_h264_pixels_tab[4]; | |||||
| /* loop filter */ | /* loop filter */ | ||||
| void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); | void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); | ||||
| @@ -29,14 +29,16 @@ | |||||
| #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) | #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) | ||||
| #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) | #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) | ||||
| #define H264_WEIGHT(W,H) \ | |||||
| static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ | |||||
| #define H264_WEIGHT(W) \ | |||||
| static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \ | |||||
| int log2_denom, int weight, int offset) \ | |||||
| { \ | |||||
| int y; \ | int y; \ | ||||
| pixel *block = (pixel*)_block; \ | pixel *block = (pixel*)_block; \ | ||||
| stride /= sizeof(pixel); \ | stride /= sizeof(pixel); \ | ||||
| offset <<= (log2_denom + (BIT_DEPTH-8)); \ | offset <<= (log2_denom + (BIT_DEPTH-8)); \ | ||||
| if(log2_denom) offset += 1<<(log2_denom-1); \ | if(log2_denom) offset += 1<<(log2_denom-1); \ | ||||
| for(y=0; y<H; y++, block += stride){ \ | |||||
| for (y = 0; y < height; y++, block += stride) { \ | |||||
| op_scale1(0); \ | op_scale1(0); \ | ||||
| op_scale1(1); \ | op_scale1(1); \ | ||||
| if(W==2) continue; \ | if(W==2) continue; \ | ||||
| @@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride | |||||
| op_scale1(15); \ | op_scale1(15); \ | ||||
| } \ | } \ | ||||
| } \ | } \ | ||||
| static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ | |||||
| static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \ | |||||
| int log2_denom, int weightd, int weights, int offset) \ | |||||
| { \ | |||||
| int y; \ | int y; \ | ||||
| pixel *dst = (pixel*)_dst; \ | pixel *dst = (pixel*)_dst; \ | ||||
| pixel *src = (pixel*)_src; \ | pixel *src = (pixel*)_src; \ | ||||
| stride /= sizeof(pixel); \ | stride /= sizeof(pixel); \ | ||||
| offset <<= (BIT_DEPTH-8); \ | offset <<= (BIT_DEPTH-8); \ | ||||
| offset = ((offset + 1) | 1) << log2_denom; \ | offset = ((offset + 1) | 1) << log2_denom; \ | ||||
| for(y=0; y<H; y++, dst += stride, src += stride){ \ | |||||
| for (y = 0; y < height; y++, dst += stride, src += stride) { \ | |||||
| op_scale2(0); \ | op_scale2(0); \ | ||||
| op_scale2(1); \ | op_scale2(1); \ | ||||
| if(W==2) continue; \ | if(W==2) continue; \ | ||||
| @@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_ | |||||
| } \ | } \ | ||||
| } | } | ||||
| H264_WEIGHT(16,16) | |||||
| H264_WEIGHT(16,8) | |||||
| H264_WEIGHT(8,16) | |||||
| H264_WEIGHT(8,8) | |||||
| H264_WEIGHT(8,4) | |||||
| H264_WEIGHT(4,8) | |||||
| H264_WEIGHT(4,4) | |||||
| H264_WEIGHT(4,2) | |||||
| H264_WEIGHT(2,4) | |||||
| H264_WEIGHT(2,2) | |||||
| H264_WEIGHT(16) | |||||
| H264_WEIGHT(8) | |||||
| H264_WEIGHT(4) | |||||
| H264_WEIGHT(2) | |||||
| #undef op_scale1 | #undef op_scale1 | ||||
| #undef op_scale2 | #undef op_scale2 | ||||
| @@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, | |||||
| } | } | ||||
| static av_always_inline | static av_always_inline | ||||
| void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) | |||||
| void weight_h264_W_altivec(uint8_t *block, int stride, int height, | |||||
| int log2_denom, int weight, int offset, int w) | |||||
| { | { | ||||
| int y, aligned; | int y, aligned; | ||||
| vec_u8 vblock; | vec_u8 vblock; | ||||
| @@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei | |||||
| voffset = vec_splat(vtemp, 5); | voffset = vec_splat(vtemp, 5); | ||||
| aligned = !((unsigned long)block & 0xf); | aligned = !((unsigned long)block & 0xf); | ||||
| for (y=0; y<h; y++) { | |||||
| for (y = 0; y < height; y++) { | |||||
| vblock = vec_ld(0, block); | vblock = vec_ld(0, block); | ||||
| v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); | v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); | ||||
| @@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei | |||||
| } | } | ||||
| static av_always_inline | static av_always_inline | ||||
| void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, | |||||
| int weightd, int weights, int offset, int w, int h) | |||||
| void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height, | |||||
| int log2_denom, int weightd, int weights, int offset, int w) | |||||
| { | { | ||||
| int y, dst_aligned, src_aligned; | int y, dst_aligned, src_aligned; | ||||
| vec_u8 vsrc, vdst; | vec_u8 vsrc, vdst; | ||||
| @@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ | |||||
| dst_aligned = !((unsigned long)dst & 0xf); | dst_aligned = !((unsigned long)dst & 0xf); | ||||
| src_aligned = !((unsigned long)src & 0xf); | src_aligned = !((unsigned long)src & 0xf); | ||||
| for (y=0; y<h; y++) { | |||||
| for (y = 0; y < height; y++) { | |||||
| vdst = vec_ld(0, dst); | vdst = vec_ld(0, dst); | ||||
| vsrc = vec_ld(0, src); | vsrc = vec_ld(0, src); | ||||
| @@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ | |||||
| } | } | ||||
| } | } | ||||
| #define H264_WEIGHT(W,H) \ | |||||
| static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ | |||||
| weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ | |||||
| #define H264_WEIGHT(W) \ | |||||
| static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \ | |||||
| int log2_denom, int weight, int offset){ \ | |||||
| weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \ | |||||
| }\ | }\ | ||||
| static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ | |||||
| biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ | |||||
| static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \ | |||||
| int log2_denom, int weightd, int weights, int offset){ \ | |||||
| biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ | |||||
| } | } | ||||
| H264_WEIGHT(16,16) | |||||
| H264_WEIGHT(16, 8) | |||||
| H264_WEIGHT( 8,16) | |||||
| H264_WEIGHT( 8, 8) | |||||
| H264_WEIGHT( 8, 4) | |||||
| H264_WEIGHT(16) | |||||
| H264_WEIGHT( 8) | |||||
| void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { | void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { | ||||
| const int high_bit_depth = avctx->bits_per_raw_sample > 8; | const int high_bit_depth = avctx->bits_per_raw_sample > 8; | ||||
| @@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; | c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; | ||||
| c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; | c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; | ||||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; | |||||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; | |||||
| c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; | |||||
| c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; | |||||
| c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; | |||||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; | |||||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; | |||||
| c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; | |||||
| c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; | |||||
| c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; | |||||
| c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; | |||||
| c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; | |||||
| c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; | |||||
| c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -28,21 +28,20 @@ SECTION .text | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; biweight pred: | ; biweight pred: | ||||
| ; | ; | ||||
| ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, | |||||
| ; int log2_denom, int weightd, int weights, | |||||
| ; int offset); | |||||
| ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, | |||||
| ; int height, int log2_denom, int weightd, | |||||
| ; int weights, int offset); | |||||
| ; and | ; and | ||||
| ; void h264_weight_16x16_sse2(uint8_t *dst, int stride, | |||||
| ; int log2_denom, int weight, | |||||
| ; int offset); | |||||
| ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, | |||||
| ; int log2_denom, int weight, int offset); | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro WEIGHT_SETUP 0 | %macro WEIGHT_SETUP 0 | ||||
| add r4, r4 | |||||
| inc r4 | |||||
| movd m3, r3d | |||||
| movd m5, r4d | |||||
| movd m6, r2d | |||||
| add r5, r5 | |||||
| inc r5 | |||||
| movd m3, r4d | |||||
| movd m5, r5d | |||||
| movd m6, r3d | |||||
| pslld m5, m6 | pslld m5, m6 | ||||
| psrld m5, 1 | psrld m5, 1 | ||||
| %if mmsize == 16 | %if mmsize == 16 | ||||
| @@ -71,60 +70,41 @@ SECTION .text | |||||
| packuswb m0, m1 | packuswb m0, m1 | ||||
| %endmacro | %endmacro | ||||
| %macro WEIGHT_FUNC_DBL_MM 1 | |||||
| cglobal h264_weight_16x%1_mmx2, 5, 5, 0 | |||||
| INIT_MMX | |||||
| cglobal h264_weight_16_mmx2, 6, 6, 0 | |||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| mov r2, %1 | |||||
| %if %1 == 16 | |||||
| .nextrow | .nextrow | ||||
| WEIGHT_OP 0, 4 | WEIGHT_OP 0, 4 | ||||
| mova [r0 ], m0 | mova [r0 ], m0 | ||||
| WEIGHT_OP 8, 12 | WEIGHT_OP 8, 12 | ||||
| mova [r0+8], m0 | mova [r0+8], m0 | ||||
| add r0, r1 | add r0, r1 | ||||
| dec r2 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) | |||||
| %endif | |||||
| %endmacro | |||||
| INIT_MMX | |||||
| WEIGHT_FUNC_DBL_MM 16 | |||||
| WEIGHT_FUNC_DBL_MM 8 | |||||
| %macro WEIGHT_FUNC_MM 4 | |||||
| cglobal h264_weight_%1x%2_%4, 7, 7, %3 | |||||
| %macro WEIGHT_FUNC_MM 3 | |||||
| cglobal h264_weight_%1_%3, 6, 6, %2 | |||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| mov r2, %2 | |||||
| %if %2 == 16 | |||||
| .nextrow | .nextrow | ||||
| WEIGHT_OP 0, mmsize/2 | WEIGHT_OP 0, mmsize/2 | ||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, r1 | add r0, r1 | ||||
| dec r2 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_weight_%1x16_%4.nextrow) | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| WEIGHT_FUNC_MM 8, 16, 0, mmx2 | |||||
| WEIGHT_FUNC_MM 8, 8, 0, mmx2 | |||||
| WEIGHT_FUNC_MM 8, 4, 0, mmx2 | |||||
| WEIGHT_FUNC_MM 8, 0, mmx2 | |||||
| INIT_XMM | INIT_XMM | ||||
| WEIGHT_FUNC_MM 16, 16, 8, sse2 | |||||
| WEIGHT_FUNC_MM 16, 8, 8, sse2 | |||||
| WEIGHT_FUNC_MM 16, 8, sse2 | |||||
| %macro WEIGHT_FUNC_HALF_MM 5 | |||||
| cglobal h264_weight_%1x%2_%5, 5, 5, %4 | |||||
| %macro WEIGHT_FUNC_HALF_MM 3 | |||||
| cglobal h264_weight_%1_%3, 6, 6, %2 | |||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| mov r2, %2/2 | |||||
| sar r2d, 1 | |||||
| lea r3, [r1*2] | lea r3, [r1*2] | ||||
| %if %2 == mmsize | |||||
| .nextrow | .nextrow | ||||
| WEIGHT_OP 0, r1 | WEIGHT_OP 0, r1 | ||||
| movh [r0], m0 | movh [r0], m0 | ||||
| @@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 | |||||
| movh [r0+r1], m0 | movh [r0+r1], m0 | ||||
| %endif | %endif | ||||
| add r0, r3 | add r0, r3 | ||||
| dec r2 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |||||
| WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |||||
| WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |||||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||||
| WEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||||
| INIT_XMM | INIT_XMM | ||||
| WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |||||
| WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |||||
| WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| WEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %macro BIWEIGHT_SETUP 0 | %macro BIWEIGHT_SETUP 0 | ||||
| add r6, 1 | |||||
| or r6, 1 | |||||
| add r3, 1 | |||||
| movd m3, r4d | |||||
| movd m4, r5d | |||||
| movd m5, r6d | |||||
| movd m6, r3d | |||||
| %ifdef ARCH_X86_64 | |||||
| %define off_regd r11d | |||||
| %else | |||||
| %define off_regd r3d | |||||
| %endif | |||||
| mov off_regd, r7m | |||||
| add off_regd, 1 | |||||
| or off_regd, 1 | |||||
| add r4, 1 | |||||
| movd m3, r5d | |||||
| movd m4, r6d | |||||
| movd m5, off_regd | |||||
| movd m6, r4d | |||||
| pslld m5, m6 | pslld m5, m6 | ||||
| psrld m5, 1 | psrld m5, 1 | ||||
| %if mmsize == 16 | %if mmsize == 16 | ||||
| @@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||||
| packuswb m0, m1 | packuswb m0, m1 | ||||
| %endmacro | %endmacro | ||||
| %macro BIWEIGHT_FUNC_DBL_MM 1 | |||||
| cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 | |||||
| INIT_MMX | |||||
| cglobal h264_biweight_16_mmx2, 7, 7, 0 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| mov r3, %1 | |||||
| %if %1 == 16 | |||||
| movifnidn r3d, r3m | |||||
| .nextrow | .nextrow | ||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, 4 | BIWEIGHT_STEPA 1, 2, 4 | ||||
| @@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 | |||||
| mova [r0+8], m0 | mova [r0+8], m0 | ||||
| add r0, r2 | add r0, r2 | ||||
| add r1, r2 | add r1, r2 | ||||
| dec r3 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) | |||||
| %endif | |||||
| %endmacro | |||||
| INIT_MMX | |||||
| BIWEIGHT_FUNC_DBL_MM 16 | |||||
| BIWEIGHT_FUNC_DBL_MM 8 | |||||
| %macro BIWEIGHT_FUNC_MM 4 | |||||
| cglobal h264_biweight_%1x%2_%4, 7, 7, %3 | |||||
| %macro BIWEIGHT_FUNC_MM 3 | |||||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| mov r3, %2 | |||||
| %if %2 == 16 | |||||
| movifnidn r3d, r3m | |||||
| .nextrow | .nextrow | ||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, mmsize/2 | BIWEIGHT_STEPA 1, 2, mmsize/2 | ||||
| @@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 | |||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, r2 | add r0, r2 | ||||
| add r1, r2 | add r1, r2 | ||||
| dec r3 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 | |||||
| BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 | |||||
| BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 | |||||
| BIWEIGHT_FUNC_MM 8, 0, mmx2 | |||||
| INIT_XMM | INIT_XMM | ||||
| BIWEIGHT_FUNC_MM 16, 16, 8, sse2 | |||||
| BIWEIGHT_FUNC_MM 16, 8, 8, sse2 | |||||
| BIWEIGHT_FUNC_MM 16, 8, sse2 | |||||
| %macro BIWEIGHT_FUNC_HALF_MM 5 | |||||
| cglobal h264_biweight_%1x%2_%5, 7, 7, %4 | |||||
| %macro BIWEIGHT_FUNC_HALF_MM 3 | |||||
| cglobal h264_biweight_%1_%3, 7, 7, %2 | |||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| mov r3, %2/2 | |||||
| movifnidn r3d, r3m | |||||
| sar r3, 1 | |||||
| lea r4, [r2*2] | lea r4, [r2*2] | ||||
| %if %2 == mmsize | |||||
| .nextrow | .nextrow | ||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, r2 | BIWEIGHT_STEPA 1, 2, r2 | ||||
| @@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 | |||||
| %endif | %endif | ||||
| add r0, r4 | add r0, r4 | ||||
| add r1, r4 | add r1, r4 | ||||
| dec r3 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 | |||||
| BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 | |||||
| BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 | |||||
| BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 | |||||
| INIT_XMM | INIT_XMM | ||||
| BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 | |||||
| BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 | |||||
| BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||||
| BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 | |||||
| %macro BIWEIGHT_SSSE3_SETUP 0 | %macro BIWEIGHT_SSSE3_SETUP 0 | ||||
| add r6, 1 | |||||
| or r6, 1 | |||||
| add r3, 1 | |||||
| movd m4, r4d | |||||
| movd m0, r5d | |||||
| movd m5, r6d | |||||
| movd m6, r3d | |||||
| %ifdef ARCH_X86_64 | |||||
| %define off_regd r11d | |||||
| %else | |||||
| %define off_regd r3d | |||||
| %endif | |||||
| mov off_regd, r7m | |||||
| add off_regd, 1 | |||||
| or off_regd, 1 | |||||
| add r4, 1 | |||||
| movd m4, r5d | |||||
| movd m0, r6d | |||||
| movd m5, off_regd | |||||
| movd m6, r4d | |||||
| pslld m5, m6 | pslld m5, m6 | ||||
| psrld m5, 1 | psrld m5, 1 | ||||
| punpcklbw m4, m0 | punpcklbw m4, m0 | ||||
| @@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 | |||||
| packuswb m0, m2 | packuswb m0, m2 | ||||
| %endmacro | %endmacro | ||||
| %macro BIWEIGHT_SSSE3_16 1 | |||||
| cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 | |||||
| INIT_XMM | |||||
| cglobal h264_biweight_16_ssse3, 7, 7, 8 | |||||
| BIWEIGHT_SSSE3_SETUP | BIWEIGHT_SSSE3_SETUP | ||||
| mov r3, %1 | |||||
| movifnidn r3d, r3m | |||||
| %if %1 == 16 | |||||
| .nextrow | .nextrow | ||||
| movh m0, [r0] | movh m0, [r0] | ||||
| movh m2, [r0+8] | movh m2, [r0+8] | ||||
| @@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 | |||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, r2 | add r0, r2 | ||||
| add r1, r2 | add r1, r2 | ||||
| dec r3 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) | |||||
| %endif | |||||
| %endmacro | |||||
| INIT_XMM | INIT_XMM | ||||
| BIWEIGHT_SSSE3_16 16 | |||||
| BIWEIGHT_SSSE3_16 8 | |||||
| %macro BIWEIGHT_SSSE3_8 1 | |||||
| cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 | |||||
| cglobal h264_biweight_8_ssse3, 7, 7, 8 | |||||
| BIWEIGHT_SSSE3_SETUP | BIWEIGHT_SSSE3_SETUP | ||||
| mov r3, %1/2 | |||||
| movifnidn r3d, r3m | |||||
| sar r3, 1 | |||||
| lea r4, [r2*2] | lea r4, [r2*2] | ||||
| %if %1 == 16 | |||||
| .nextrow | .nextrow | ||||
| movh m0, [r0] | movh m0, [r0] | ||||
| movh m1, [r1] | movh m1, [r1] | ||||
| @@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 | |||||
| movhps [r0+r2], m0 | movhps [r0+r2], m0 | ||||
| add r0, r4 | add r0, r4 | ||||
| add r1, r4 | add r1, r4 | ||||
| dec r3 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| %else | |||||
| jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) | |||||
| %endif | |||||
| %endmacro | |||||
| INIT_XMM | |||||
| BIWEIGHT_SSSE3_8 16 | |||||
| BIWEIGHT_SSSE3_8 8 | |||||
| BIWEIGHT_SSSE3_8 4 | |||||
| @@ -36,33 +36,26 @@ cextern pw_1 | |||||
| SECTION .text | SECTION .text | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void h264_weight(uint8_t *dst, int stride, int log2_denom, | |||||
| ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, | |||||
| ; int weight, int offset); | ; int weight, int offset); | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %ifdef ARCH_X86_32 | |||||
| DECLARE_REG_TMP 2 | |||||
| %else | |||||
| DECLARE_REG_TMP 10 | |||||
| %endif | |||||
| %macro WEIGHT_PROLOGUE 1 | |||||
| mov t0, %1 | |||||
| %macro WEIGHT_PROLOGUE 0 | |||||
| .prologue | .prologue | ||||
| PROLOGUE 0,5,8 | |||||
| PROLOGUE 0,6,8 | |||||
| movifnidn r0, r0mp | movifnidn r0, r0mp | ||||
| movifnidn r1d, r1m | movifnidn r1d, r1m | ||||
| movifnidn r3d, r3m | |||||
| movifnidn r4d, r4m | movifnidn r4d, r4m | ||||
| movifnidn r5d, r5m | |||||
| %endmacro | %endmacro | ||||
| %macro WEIGHT_SETUP 1 | %macro WEIGHT_SETUP 1 | ||||
| mova m0, [pw_1] | mova m0, [pw_1] | ||||
| movd m2, r2m | |||||
| movd m2, r3m | |||||
| pslld m0, m2 ; 1<<log2_denom | pslld m0, m2 ; 1<<log2_denom | ||||
| SPLATW m0, m0 | SPLATW m0, m0 | ||||
| shl r4, 19 ; *8, move to upper half of dword | |||||
| lea r4, [r4+r3*2+0x10000] | |||||
| movd m3, r4d ; weight<<1 | 1+(offset<<(3)) | |||||
| shl r5, 19 ; *8, move to upper half of dword | |||||
| lea r5, [r5+r4*2+0x10000] | |||||
| movd m3, r5d ; weight<<1 | 1+(offset<<(3)) | |||||
| pshufd m3, m3, 0 | pshufd m3, m3, 0 | ||||
| mova m4, [pw_pixel_max] | mova m4, [pw_pixel_max] | ||||
| paddw m2, [sq_1] ; log2_denom+1 | paddw m2, [sq_1] ; log2_denom+1 | ||||
| @@ -96,8 +89,8 @@ DECLARE_REG_TMP 10 | |||||
| %endmacro | %endmacro | ||||
| %macro WEIGHT_FUNC_DBL 1 | %macro WEIGHT_FUNC_DBL 1 | ||||
| cglobal h264_weight_16x16_10_%1 | |||||
| WEIGHT_PROLOGUE 16 | |||||
| cglobal h264_weight_16_10_%1 | |||||
| WEIGHT_PROLOGUE | |||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| .nextrow | .nextrow | ||||
| WEIGHT_OP %1, 0 | WEIGHT_OP %1, 0 | ||||
| @@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1 | |||||
| WEIGHT_OP %1, 16 | WEIGHT_OP %1, 16 | ||||
| mova [r0+16], m5 | mova [r0+16], m5 | ||||
| add r0, r1 | add r0, r1 | ||||
| dec t0 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_weight_16x8_10_%1 | |||||
| mov t0, 8 | |||||
| jmp mangle(ff_h264_weight_16x16_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4 | |||||
| %macro WEIGHT_FUNC_MM 1 | %macro WEIGHT_FUNC_MM 1 | ||||
| cglobal h264_weight_8x16_10_%1 | |||||
| WEIGHT_PROLOGUE 16 | |||||
| cglobal h264_weight_8_10_%1 | |||||
| WEIGHT_PROLOGUE | |||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| .nextrow | .nextrow | ||||
| WEIGHT_OP %1, 0 | WEIGHT_OP %1, 0 | ||||
| mova [r0], m5 | mova [r0], m5 | ||||
| add r0, r1 | add r0, r1 | ||||
| dec t0 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_weight_8x8_10_%1 | |||||
| mov t0, 8 | |||||
| jmp mangle(ff_h264_weight_8x16_10_%1.prologue) | |||||
| cglobal h264_weight_8x4_10_%1 | |||||
| mov t0, 4 | |||||
| jmp mangle(ff_h264_weight_8x16_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4 | |||||
| %macro WEIGHT_FUNC_HALF_MM 1 | %macro WEIGHT_FUNC_HALF_MM 1 | ||||
| cglobal h264_weight_4x8_10_%1 | |||||
| WEIGHT_PROLOGUE 4 | |||||
| cglobal h264_weight_4_10_%1 | |||||
| WEIGHT_PROLOGUE | |||||
| sar r2d, 1 | |||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| lea r3, [r1*2] | lea r3, [r1*2] | ||||
| .nextrow | .nextrow | ||||
| @@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1 | |||||
| movh [r0], m5 | movh [r0], m5 | ||||
| movhps [r0+r1], m5 | movhps [r0+r1], m5 | ||||
| add r0, r3 | add r0, r3 | ||||
| dec t0 | |||||
| dec r2d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_weight_4x4_10_%1 | |||||
| mov t0, 2 | |||||
| jmp mangle(ff_h264_weight_4x8_10_%1.prologue) | |||||
| cglobal h264_weight_4x2_10_%1 | |||||
| mov t0, 1 | |||||
| jmp mangle(ff_h264_weight_4x8_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, | |||||
| ; int weightd, int weights, int offset); | |||||
| ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, | |||||
| ; int log2_denom, int weightd, int weights, int offset); | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %ifdef ARCH_X86_32 | %ifdef ARCH_X86_32 | ||||
| DECLARE_REG_TMP 2,3 | |||||
| DECLARE_REG_TMP 3 | |||||
| %else | %else | ||||
| DECLARE_REG_TMP 10,2 | |||||
| DECLARE_REG_TMP 10 | |||||
| %endif | %endif | ||||
| %macro BIWEIGHT_PROLOGUE 1 | |||||
| mov t0, %1 | |||||
| %macro BIWEIGHT_PROLOGUE 0 | |||||
| .prologue | .prologue | ||||
| PROLOGUE 0,7,8 | PROLOGUE 0,7,8 | ||||
| movifnidn r0, r0mp | movifnidn r0, r0mp | ||||
| movifnidn r1, r1mp | movifnidn r1, r1mp | ||||
| movifnidn t1d, r2m | |||||
| movifnidn r4d, r4m | |||||
| movifnidn r2d, r2m | |||||
| movifnidn r5d, r5m | movifnidn r5d, r5m | ||||
| movifnidn r6d, r6m | movifnidn r6d, r6m | ||||
| movifnidn t0d, r7m | |||||
| %endmacro | %endmacro | ||||
| %macro BIWEIGHT_SETUP 1 | %macro BIWEIGHT_SETUP 1 | ||||
| lea r6, [r6*4+1] ; (offset<<2)+1 | |||||
| or r6, 1 | |||||
| shl r5, 16 | |||||
| or r4, r5 | |||||
| movd m4, r4d ; weightd | weights | |||||
| movd m5, r6d ; (offset+1)|1 | |||||
| movd m6, r3m ; log2_denom | |||||
| lea t0, [t0*4+1] ; (offset<<2)+1 | |||||
| or t0, 1 | |||||
| shl r6, 16 | |||||
| or r5, r6 | |||||
| movd m4, r5d ; weightd | weights | |||||
| movd m5, t0d ; (offset+1)|1 | |||||
| movd m6, r4m ; log2_denom | |||||
| pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom | pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom | ||||
| paddd m6, [sq_1] | paddd m6, [sq_1] | ||||
| pshufd m4, m4, 0 | pshufd m4, m4, 0 | ||||
| pshufd m5, m5, 0 | pshufd m5, m5, 0 | ||||
| mova m3, [pw_pixel_max] | mova m3, [pw_pixel_max] | ||||
| movifnidn r3d, r3m | |||||
| %ifnidn %1, sse4 | %ifnidn %1, sse4 | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| %endif | %endif | ||||
| @@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2 | |||||
| %endmacro | %endmacro | ||||
| %macro BIWEIGHT_FUNC_DBL 1 | %macro BIWEIGHT_FUNC_DBL 1 | ||||
| cglobal h264_biweight_16x16_10_%1 | |||||
| BIWEIGHT_PROLOGUE 16 | |||||
| cglobal h264_biweight_16_10_%1 | |||||
| BIWEIGHT_PROLOGUE | |||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| .nextrow | .nextrow | ||||
| BIWEIGHT %1, 0 | BIWEIGHT %1, 0 | ||||
| mova [r0 ], m0 | mova [r0 ], m0 | ||||
| BIWEIGHT %1, 16 | BIWEIGHT %1, 16 | ||||
| mova [r0+16], m0 | mova [r0+16], m0 | ||||
| add r0, t1 | |||||
| add r1, t1 | |||||
| dec t0 | |||||
| add r0, r2 | |||||
| add r1, r2 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_biweight_16x8_10_%1 | |||||
| mov t0, 8 | |||||
| jmp mangle(ff_h264_biweight_16x16_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2 | |||||
| BIWEIGHT_FUNC_DBL sse4 | BIWEIGHT_FUNC_DBL sse4 | ||||
| %macro BIWEIGHT_FUNC 1 | %macro BIWEIGHT_FUNC 1 | ||||
| cglobal h264_biweight_8x16_10_%1 | |||||
| BIWEIGHT_PROLOGUE 16 | |||||
| cglobal h264_biweight_8_10_%1 | |||||
| BIWEIGHT_PROLOGUE | |||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| .nextrow | .nextrow | ||||
| BIWEIGHT %1, 0 | BIWEIGHT %1, 0 | ||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, t1 | |||||
| add r1, t1 | |||||
| dec t0 | |||||
| add r0, r2 | |||||
| add r1, r2 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_biweight_8x8_10_%1 | |||||
| mov t0, 8 | |||||
| jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) | |||||
| cglobal h264_biweight_8x4_10_%1 | |||||
| mov t0, 4 | |||||
| jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2 | |||||
| BIWEIGHT_FUNC sse4 | BIWEIGHT_FUNC sse4 | ||||
| %macro BIWEIGHT_FUNC_HALF 1 | %macro BIWEIGHT_FUNC_HALF 1 | ||||
| cglobal h264_biweight_4x8_10_%1 | |||||
| BIWEIGHT_PROLOGUE 4 | |||||
| cglobal h264_biweight_4_10_%1 | |||||
| BIWEIGHT_PROLOGUE | |||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| lea r4, [t1*2] | |||||
| sar r3d, 1 | |||||
| lea r4, [r2*2] | |||||
| .nextrow | .nextrow | ||||
| BIWEIGHT %1, 0, t1 | |||||
| BIWEIGHT %1, 0, r2 | |||||
| movh [r0 ], m0 | movh [r0 ], m0 | ||||
| movhps [r0+t1], m0 | |||||
| movhps [r0+r2], m0 | |||||
| add r0, r4 | add r0, r4 | ||||
| add r1, r4 | add r1, r4 | ||||
| dec t0 | |||||
| dec r3d | |||||
| jnz .nextrow | jnz .nextrow | ||||
| REP_RET | REP_RET | ||||
| cglobal h264_biweight_4x4_10_%1 | |||||
| mov t0, 2 | |||||
| jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) | |||||
| cglobal h264_biweight_4x2_10_%1 | |||||
| mov t0, 1 | |||||
| jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) | |||||
| %endmacro | %endmacro | ||||
| INIT_XMM | INIT_XMM | ||||
| @@ -298,57 +298,47 @@ LF_IFUNC(v, luma_intra, 10, mmxext) | |||||
| /***********************************/ | /***********************************/ | ||||
| /* weighted prediction */ | /* weighted prediction */ | ||||
| #define H264_WEIGHT(W, H, OPT) \ | |||||
| void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ | |||||
| int stride, int log2_denom, int weight, int offset); | |||||
| #define H264_WEIGHT(W, OPT) \ | |||||
| void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \ | |||||
| int stride, int height, int log2_denom, int weight, int offset); | |||||
| #define H264_BIWEIGHT(W, H, OPT) \ | |||||
| void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ | |||||
| uint8_t *src, int stride, int log2_denom, int weightd, \ | |||||
| #define H264_BIWEIGHT(W, OPT) \ | |||||
| void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \ | |||||
| uint8_t *src, int stride, int height, int log2_denom, int weightd, \ | |||||
| int weights, int offset); | int weights, int offset); | ||||
| #define H264_BIWEIGHT_MMX(W,H) \ | |||||
| H264_WEIGHT (W, H, mmx2) \ | |||||
| H264_BIWEIGHT(W, H, mmx2) | |||||
| #define H264_BIWEIGHT_MMX_SSE(W,H) \ | |||||
| H264_BIWEIGHT_MMX(W, H) \ | |||||
| H264_WEIGHT (W, H, sse2) \ | |||||
| H264_BIWEIGHT (W, H, sse2) \ | |||||
| H264_BIWEIGHT (W, H, ssse3) | |||||
| H264_BIWEIGHT_MMX_SSE(16, 16) | |||||
| H264_BIWEIGHT_MMX_SSE(16, 8) | |||||
| H264_BIWEIGHT_MMX_SSE( 8, 16) | |||||
| H264_BIWEIGHT_MMX_SSE( 8, 8) | |||||
| H264_BIWEIGHT_MMX_SSE( 8, 4) | |||||
| H264_BIWEIGHT_MMX ( 4, 8) | |||||
| H264_BIWEIGHT_MMX ( 4, 4) | |||||
| H264_BIWEIGHT_MMX ( 4, 2) | |||||
| #define H264_WEIGHT_10(W, H, DEPTH, OPT) \ | |||||
| void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||||
| int stride, int log2_denom, int weight, int offset); | |||||
| #define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \ | |||||
| void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \ | |||||
| (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \ | |||||
| #define H264_BIWEIGHT_MMX(W) \ | |||||
| H264_WEIGHT (W, mmx2) \ | |||||
| H264_BIWEIGHT(W, mmx2) | |||||
| #define H264_BIWEIGHT_MMX_SSE(W) \ | |||||
| H264_BIWEIGHT_MMX(W) \ | |||||
| H264_WEIGHT (W, sse2) \ | |||||
| H264_BIWEIGHT (W, sse2) \ | |||||
| H264_BIWEIGHT (W, ssse3) | |||||
| H264_BIWEIGHT_MMX_SSE(16) | |||||
| H264_BIWEIGHT_MMX_SSE( 8) | |||||
| H264_BIWEIGHT_MMX ( 4) | |||||
| #define H264_WEIGHT_10(W, DEPTH, OPT) \ | |||||
| void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||||
| int stride, int height, int log2_denom, int weight, int offset); | |||||
| #define H264_BIWEIGHT_10(W, DEPTH, OPT) \ | |||||
| void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \ | |||||
| (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \ | |||||
| int weightd, int weights, int offset); | int weightd, int weights, int offset); | ||||
| #define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ | |||||
| H264_WEIGHT_10 (W, H, DEPTH, sse2) \ | |||||
| H264_WEIGHT_10 (W, H, DEPTH, sse4) \ | |||||
| H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ | |||||
| H264_BIWEIGHT_10(W, H, DEPTH, sse4) | |||||
| H264_BIWEIGHT_10_SSE(16, 16, 10) | |||||
| H264_BIWEIGHT_10_SSE(16, 8, 10) | |||||
| H264_BIWEIGHT_10_SSE( 8, 16, 10) | |||||
| H264_BIWEIGHT_10_SSE( 8, 8, 10) | |||||
| H264_BIWEIGHT_10_SSE( 8, 4, 10) | |||||
| H264_BIWEIGHT_10_SSE( 4, 8, 10) | |||||
| H264_BIWEIGHT_10_SSE( 4, 4, 10) | |||||
| H264_BIWEIGHT_10_SSE( 4, 2, 10) | |||||
| #define H264_BIWEIGHT_10_SSE(W, DEPTH) \ | |||||
| H264_WEIGHT_10 (W, DEPTH, sse2) \ | |||||
| H264_WEIGHT_10 (W, DEPTH, sse4) \ | |||||
| H264_BIWEIGHT_10(W, DEPTH, sse2) \ | |||||
| H264_BIWEIGHT_10(W, DEPTH, sse4) | |||||
| H264_BIWEIGHT_10_SSE(16, 10) | |||||
| H264_BIWEIGHT_10_SSE( 8, 10) | |||||
| H264_BIWEIGHT_10_SSE( 4, 10) | |||||
| void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) | void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) | ||||
| { | { | ||||
| @@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; | c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; | ||||
| c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; | c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; | ||||
| #endif | #endif | ||||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | |||||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | |||||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; | |||||
| c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; | |||||
| c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; | |||||
| c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; | |||||
| c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; | |||||
| c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; | |||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; | |||||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; | |||||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; | |||||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; | |||||
| c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; | |||||
| c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; | |||||
| c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; | |||||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; | |||||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; | |||||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2; | |||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2; | |||||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2; | |||||
| if (mm_flags&AV_CPU_FLAG_SSE2) { | if (mm_flags&AV_CPU_FLAG_SSE2) { | ||||
| c->h264_idct8_add = ff_h264_idct8_add_8_sse2; | c->h264_idct8_add = ff_h264_idct8_add_8_sse2; | ||||
| @@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; | c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; | ||||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; | c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; | ||||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; | |||||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; | |||||
| c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; | |||||
| c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; | |||||
| c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; | |||||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2; | |||||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2; | |||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; | |||||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; | |||||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; | |||||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; | |||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2; | |||||
| #if HAVE_ALIGNED_STACK | #if HAVE_ALIGNED_STACK | ||||
| c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; | c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; | ||||
| @@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| #endif | #endif | ||||
| } | } | ||||
| if (mm_flags&AV_CPU_FLAG_SSSE3) { | if (mm_flags&AV_CPU_FLAG_SSSE3) { | ||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; | |||||
| c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; | |||||
| c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; | |||||
| c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; | |||||
| c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3; | |||||
| c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3; | |||||
| } | } | ||||
| if (mm_flags&AV_CPU_FLAG_AVX) { | if (mm_flags&AV_CPU_FLAG_AVX) { | ||||
| #if HAVE_ALIGNED_STACK | #if HAVE_ALIGNED_STACK | ||||
| @@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; | c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; | ||||
| #endif | #endif | ||||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; | |||||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; | |||||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; | |||||
| c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; | |||||
| c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; | |||||
| c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; | |||||
| c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; | |||||
| c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; | |||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; | |||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; | |||||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; | |||||
| c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; | |||||
| c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; | |||||
| c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; | |||||
| c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; | |||||
| c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; | |||||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; | |||||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; | |||||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; | |||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; | |||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; | |||||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; | |||||
| c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; | c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; | ||||
| c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; | c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; | ||||
| @@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom | |||||
| #endif | #endif | ||||
| } | } | ||||
| if (mm_flags&AV_CPU_FLAG_SSE4) { | if (mm_flags&AV_CPU_FLAG_SSE4) { | ||||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; | |||||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; | |||||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; | |||||
| c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; | |||||
| c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; | |||||
| c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; | |||||
| c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; | |||||
| c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; | |||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; | |||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; | |||||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; | |||||
| c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; | |||||
| c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; | |||||
| c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; | |||||
| c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; | |||||
| c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; | |||||
| c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; | |||||
| c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; | |||||
| c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; | |||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; | |||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; | |||||
| c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; | |||||
| } | } | ||||
| #if HAVE_AVX | #if HAVE_AVX | ||||
| if (mm_flags&AV_CPU_FLAG_AVX) { | if (mm_flags&AV_CPU_FLAG_AVX) { | ||||