Lets us do the zeroing in asm instead of C. Also makes it consistent with the way the regular iDCT code does it. Originally committed as revision 24668 to svn://svn.ffmpeg.org/ffmpeg/trunk (tag: n0.8)
| @@ -117,6 +117,7 @@ typedef struct { | |||||
| */ | */ | ||||
| DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; | DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; | ||||
| DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; | DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; | ||||
| DECLARE_ALIGNED(16, DCTELEM, block_dc)[16]; | |||||
| uint8_t intra4x4_pred_mode_mb[16]; | uint8_t intra4x4_pred_mode_mb[16]; | ||||
| int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock | int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock | ||||
| @@ -864,22 +865,19 @@ static av_always_inline | |||||
| void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, | void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, | ||||
| uint8_t t_nnz[9], uint8_t l_nnz[9]) | uint8_t t_nnz[9], uint8_t l_nnz[9]) | ||||
| { | { | ||||
| LOCAL_ALIGNED_16(DCTELEM, dc,[16]); | |||||
| int i, x, y, luma_start = 0, luma_ctx = 3; | int i, x, y, luma_start = 0, luma_ctx = 3; | ||||
| int nnz_pred, nnz, nnz_total = 0; | int nnz_pred, nnz, nnz_total = 0; | ||||
| int segment = s->segment; | int segment = s->segment; | ||||
| if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { | if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { | ||||
| AV_ZERO128(dc); | |||||
| AV_ZERO128(dc+8); | |||||
| nnz_pred = t_nnz[8] + l_nnz[8]; | nnz_pred = t_nnz[8] + l_nnz[8]; | ||||
| // decode DC values and do hadamard | // decode DC values and do hadamard | ||||
| nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred, | |||||
| nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred, | |||||
| s->qmat[segment].luma_dc_qmul); | s->qmat[segment].luma_dc_qmul); | ||||
| l_nnz[8] = t_nnz[8] = !!nnz; | l_nnz[8] = t_nnz[8] = !!nnz; | ||||
| nnz_total += nnz; | nnz_total += nnz; | ||||
| s->vp8dsp.vp8_luma_dc_wht(s->block, dc); | |||||
| s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); | |||||
| luma_start = 1; | luma_start = 1; | ||||
| luma_ctx = 0; | luma_ctx = 0; | ||||
| } | } | ||||
| @@ -46,6 +46,10 @@ static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16]) | |||||
| t1 = dc[i*4+1] + dc[i*4+2]; | t1 = dc[i*4+1] + dc[i*4+2]; | ||||
| t2 = dc[i*4+1] - dc[i*4+2]; | t2 = dc[i*4+1] - dc[i*4+2]; | ||||
| t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding | t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding | ||||
| dc[i*4+0] = 0; | |||||
| dc[i*4+1] = 0; | |||||
| dc[i*4+2] = 0; | |||||
| dc[i*4+3] = 0; | |||||
| *block[i][0] = (t0 + t1) >> 3; | *block[i][0] = (t0 + t1) >> 3; | ||||
| *block[i][1] = (t3 + t2) >> 3; | *block[i][1] = (t3 + t2) >> 3; | ||||
| @@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str | |||||
| extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); | extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); | ||||
| extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); | extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); | ||||
| extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | ||||
| extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]); | |||||
| extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| @@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| if (mm_flags & FF_MM_SSE) { | if (mm_flags & FF_MM_SSE) { | ||||
| c->vp8_idct_add = ff_vp8_idct_add_sse; | c->vp8_idct_add = ff_vp8_idct_add_sse; | ||||
| c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; | |||||
| c->put_vp8_epel_pixels_tab[0][0][0] = | c->put_vp8_epel_pixels_tab[0][0][0] = | ||||
| c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; | c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; | ||||
| } | } | ||||
| @@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse | |||||
| SWAP %1, %4, %3 | SWAP %1, %4, %3 | ||||
| %endmacro | %endmacro | ||||
| INIT_MMX | |||||
| cglobal vp8_luma_dc_wht_mmx, 2,3 | |||||
| %macro VP8_DC_WHT 1 | |||||
| cglobal vp8_luma_dc_wht_%1, 2,3 | |||||
| movq m0, [r1] | movq m0, [r1] | ||||
| movq m1, [r1+8] | movq m1, [r1+8] | ||||
| movq m2, [r1+16] | movq m2, [r1+16] | ||||
| movq m3, [r1+24] | movq m3, [r1+24] | ||||
| %ifidn %1, sse | |||||
| xorps xmm0, xmm0 | |||||
| movaps [r1+ 0], xmm0 | |||||
| movaps [r1+16], xmm0 | |||||
| %else | |||||
| pxor m4, m4 | |||||
| movq [r1+ 0], m4 | |||||
| movq [r1+ 8], m4 | |||||
| movq [r1+16], m4 | |||||
| movq [r1+24], m4 | |||||
| %endif | |||||
| HADAMARD4_1D 0, 1, 2, 3 | HADAMARD4_1D 0, 1, 2, 3 | ||||
| TRANSPOSE4x4W 0, 1, 2, 3, 4 | TRANSPOSE4x4W 0, 1, 2, 3, 4 | ||||
| paddw m0, [pw_3] | paddw m0, [pw_3] | ||||
| @@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 | |||||
| SCATTER_WHT 0, 1, 0 | SCATTER_WHT 0, 1, 0 | ||||
| SCATTER_WHT 2, 3, 2 | SCATTER_WHT 2, 3, 2 | ||||
| RET | RET | ||||
| %endmacro | |||||
| INIT_MMX | |||||
| VP8_DC_WHT mmx | |||||
| VP8_DC_WHT sse | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); | ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); | ||||