Quite often, the original weights are multiples of 512. By prescaling them by 1/512 when they are computed (once per frame), no intermediate shifting is needed, and no prescaling is needed on each call either. The x86 code already used that trick. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> tags/n0.11
| @@ -128,8 +128,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) | |||
| c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; | |||
| c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; | |||
| c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon; | |||
| c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon; | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon; | |||
| c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; | |||
| c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; | |||
| @@ -521,7 +521,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int | |||
| */ | |||
| static int calc_add_mv(RV34DecContext *r, int dir, int val) | |||
| { | |||
| int mul = dir ? -r->weight2 : r->weight1; | |||
| int mul = dir ? -r->mv_weight2 : r->mv_weight1; | |||
| return (val * mul + 0x2000) >> 14; | |||
| } | |||
| @@ -776,24 +776,24 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type, | |||
| static void rv4_weight(RV34DecContext *r) | |||
| { | |||
| r->rdsp.rv40_weight_pixels_tab[0](r->s.dest[0], | |||
| r->tmp_b_block_y[0], | |||
| r->tmp_b_block_y[1], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.linesize); | |||
| r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[1], | |||
| r->tmp_b_block_uv[0], | |||
| r->tmp_b_block_uv[2], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.uvlinesize); | |||
| r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[2], | |||
| r->tmp_b_block_uv[1], | |||
| r->tmp_b_block_uv[3], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.uvlinesize); | |||
| r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][0](r->s.dest[0], | |||
| r->tmp_b_block_y[0], | |||
| r->tmp_b_block_y[1], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.linesize); | |||
| r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[1], | |||
| r->tmp_b_block_uv[0], | |||
| r->tmp_b_block_uv[2], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.uvlinesize); | |||
| r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[2], | |||
| r->tmp_b_block_uv[1], | |||
| r->tmp_b_block_uv[3], | |||
| r->weight1, | |||
| r->weight2, | |||
| r->s.uvlinesize); | |||
| } | |||
| static void rv34_mc_2mv(RV34DecContext *r, const int block_type) | |||
| @@ -1703,11 +1703,21 @@ int ff_rv34_decode_frame(AVCodecContext *avctx, | |||
| int dist0 = GET_PTS_DIFF(r->cur_pts, r->last_pts); | |||
| int dist1 = GET_PTS_DIFF(r->next_pts, r->cur_pts); | |||
| if (!refdist) { | |||
| r->weight1 = r->weight2 = 8192; | |||
| } else { | |||
| r->weight1 = (dist0 << 14) / refdist; | |||
| r->weight2 = (dist1 << 14) / refdist; | |||
| if(!refdist){ | |||
| r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192; | |||
| r->scaled_weight = 0; | |||
| }else{ | |||
| r->mv_weight1 = (dist0 << 14) / refdist; | |||
| r->mv_weight2 = (dist1 << 14) / refdist; | |||
| if((r->mv_weight1|r->mv_weight2) & 511){ | |||
| r->weight1 = r->mv_weight1; | |||
| r->weight2 = r->mv_weight2; | |||
| r->scaled_weight = 0; | |||
| }else{ | |||
| r->weight1 = r->mv_weight1 >> 9; | |||
| r->weight2 = r->mv_weight2 >> 9; | |||
| r->scaled_weight = 1; | |||
| } | |||
| } | |||
| } | |||
| s->mb_x = s->mb_y = 0; | |||
| @@ -106,7 +106,9 @@ typedef struct RV34DecContext{ | |||
| int rpr; ///< one field size in RV30 slice header | |||
| int cur_pts, last_pts, next_pts; | |||
| int scaled_weight; | |||
| int weight1, weight2; ///< B frame distance fractions (0.14) used in motion compensation | |||
| int mv_weight1, mv_weight2; | |||
| uint16_t *cbp_luma; ///< CBP values for luma subblocks | |||
| uint8_t *cbp_chroma; ///< CBP values for chroma subblocks | |||
| @@ -58,7 +58,12 @@ typedef struct RV34DSPContext { | |||
| qpel_mc_func avg_pixels_tab[4][16]; | |||
| h264_chroma_mc_func put_chroma_pixels_tab[3]; | |||
| h264_chroma_mc_func avg_chroma_pixels_tab[3]; | |||
| rv40_weight_func rv40_weight_pixels_tab[2]; | |||
| /** | |||
| * Biweight functions, first dimension is whether the weight is | |||
| * prescaled by 1/512 to skip the intermediate shifting (0: no, 1: yes), | |||
| * second is the block size (0: 16, 1: 8). | |||
| */ | |||
| rv40_weight_func rv40_weight_pixels_tab[2][2]; | |||
| rv34_inv_transform_func rv34_inv_transform; | |||
| rv34_inv_transform_func rv34_inv_transform_dc; | |||
| rv34_idct_add_func rv34_idct_add; | |||
| @@ -278,7 +278,7 @@ RV40_CHROMA_MC(put_, op_put) | |||
| RV40_CHROMA_MC(avg_, op_avg) | |||
| #define RV40_WEIGHT_FUNC(size) \ | |||
| static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ | |||
| static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ | |||
| {\ | |||
| int i, j;\ | |||
| \ | |||
| @@ -289,6 +289,18 @@ static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src | |||
| src2 += stride;\ | |||
| dst += stride;\ | |||
| }\ | |||
| }\ | |||
| static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ | |||
| {\ | |||
| int i, j;\ | |||
| \ | |||
| for (j = 0; j < size; j++) {\ | |||
| for (i = 0; i < size; i++)\ | |||
| dst[i] = (w2 * src1[i] + w1 * src2[i] + 0x10) >> 5;\ | |||
| src1 += stride;\ | |||
| src2 += stride;\ | |||
| dst += stride;\ | |||
| }\ | |||
| } | |||
| RV40_WEIGHT_FUNC(16) | |||
| @@ -578,8 +590,10 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) { | |||
| c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c; | |||
| c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c; | |||
| c->rv40_weight_pixels_tab[0] = rv40_weight_func_16; | |||
| c->rv40_weight_pixels_tab[1] = rv40_weight_func_8; | |||
| c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16; | |||
| c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8; | |||
| c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16; | |||
| c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8; | |||
| c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter; | |||
| c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter; | |||
| @@ -139,69 +139,61 @@ SECTION .text | |||
| ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) | |||
| ; %1=rnd/nornd %2=size %3=num of xmm regs | |||
| %macro RV40_WEIGHT 2 | |||
| cglobal rv40_weight_func_%1, 6, 7, %2 | |||
| ; The weights are FP0.14 notation of fractions depending on pts. | |||
| ; For timebases without rounding error (e.g. PAL), the fractions | |||
| ; can be simplified, and several operations can be avoided. | |||
| ; Therefore, the caller checks whether they are multiples of 2^9 and, | |||
| ; if so, prescales them by 1/512 and uses the nornd variant. | |||
| %macro RV40_WEIGHT 3 | |||
| cglobal rv40_weight_func_%1_%2, 6, 7, %3 | |||
| %if cpuflag(ssse3) | |||
| mova m1, [shift_round] | |||
| %else | |||
| mova m1, [pw_16] | |||
| %endif | |||
| pxor m0, m0 | |||
| mov r6, r3 | |||
| or r6, r4 | |||
| ; The weights are FP0.14 notation of fractions depending on pts. | |||
| ; For timebases without rounding error (i.e. PAL), the fractions | |||
| ; can be simplified, and several operations can be avoided. | |||
| ; Therefore, we check here whether they are multiples of 2^9 for | |||
| ; those simplifications to occur. | |||
| and r6, 0x1FF | |||
| ; Set loop counter and increments | |||
| %if mmsize == 8 | |||
| mov r6, %1 | |||
| mov r6, %2 | |||
| %else | |||
| mov r6, (%1 * %1) / mmsize | |||
| mov r6, (%2 * %2) / mmsize | |||
| %endif | |||
| ; Use result of test now | |||
| jz .loop_512 | |||
| movd m2, r3 | |||
| movd m3, r4 | |||
| %ifidn %1,rnd | |||
| %define RND 0 | |||
| SPLATW m2, m2 | |||
| SPLATW m3, m3 | |||
| .loop: | |||
| MAIN_LOOP %1, 0 | |||
| jnz .loop | |||
| REP_RET | |||
| ; Weights are multiple of 512, which allows some shortcuts | |||
| .loop_512: | |||
| sar r3, 9 | |||
| sar r4, 9 | |||
| movd m2, r3 | |||
| movd m3, r4 | |||
| %else | |||
| %define RND 1 | |||
| %if cpuflag(ssse3) | |||
| punpcklbw m3, m2 | |||
| SPLATW m3, m3 | |||
| %else | |||
| SPLATW m2, m2 | |||
| SPLATW m3, m3 | |||
| %endif | |||
| .loop2: | |||
| MAIN_LOOP %1, 1 | |||
| jnz .loop2 | |||
| REP_RET | |||
| %endif | |||
| SPLATW m3, m3 | |||
| .loop: | |||
| MAIN_LOOP %2, RND | |||
| jnz .loop | |||
| REP_RET | |||
| %endmacro | |||
| INIT_MMX mmx | |||
| RV40_WEIGHT 8, 0 | |||
| RV40_WEIGHT 16, 0 | |||
| RV40_WEIGHT rnd, 8, 3 | |||
| RV40_WEIGHT rnd, 16, 4 | |||
| RV40_WEIGHT nornd, 8, 3 | |||
| RV40_WEIGHT nornd, 16, 4 | |||
| INIT_XMM sse2 | |||
| RV40_WEIGHT 8, 8 | |||
| RV40_WEIGHT 16, 8 | |||
| RV40_WEIGHT rnd, 8, 3 | |||
| RV40_WEIGHT rnd, 16, 4 | |||
| RV40_WEIGHT nornd, 8, 3 | |||
| RV40_WEIGHT nornd, 16, 4 | |||
| INIT_XMM ssse3 | |||
| RV40_WEIGHT 8, 8 | |||
| RV40_WEIGHT 16, 8 | |||
| RV40_WEIGHT rnd, 8, 3 | |||
| RV40_WEIGHT rnd, 16, 4 | |||
| RV40_WEIGHT nornd, 8, 3 | |||
| RV40_WEIGHT nornd, 16, 4 | |||
| @@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, | |||
| int stride, int h, int x, int y); | |||
| #define DECLARE_WEIGHT(opt) \ | |||
| void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); \ | |||
| void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); | |||
| void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); \ | |||
| void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); \ | |||
| void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); \ | |||
| void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ | |||
| int w1, int w2, ptrdiff_t stride); | |||
| DECLARE_WEIGHT(mmx) | |||
| DECLARE_WEIGHT(sse2) | |||
| DECLARE_WEIGHT(ssse3) | |||
| @@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) | |||
| if (mm_flags & AV_CPU_FLAG_MMX) { | |||
| c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; | |||
| c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; | |||
| c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx; | |||
| c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx; | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_MMX2) { | |||
| c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2; | |||
| @@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) | |||
| c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSE2) { | |||
| c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2; | |||
| c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2; | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSSE3) { | |||
| c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3; | |||
| c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3; | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; | |||
| } | |||
| #endif | |||
| } | |||