~0.3% faster overall. Originally committed as revision 24448 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
| @@ -835,8 +835,6 @@ static void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb | |||||
| int nnz_pred, nnz, nnz_total = 0; | int nnz_pred, nnz, nnz_total = 0; | ||||
| int segment = s->segment; | int segment = s->segment; | ||||
| s->dsp.clear_blocks((DCTELEM *)s->block); | |||||
| if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { | if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { | ||||
| AV_ZERO128(dc); | AV_ZERO128(dc); | ||||
| AV_ZERO128(dc+8); | AV_ZERO128(dc+8); | ||||
| @@ -69,6 +69,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride) | |||||
| t1 = block[0*4+i] - block[2*4+i]; | t1 = block[0*4+i] - block[2*4+i]; | ||||
| t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]); | t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]); | ||||
| t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]); | t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]); | ||||
| block[0*4+i] = 0; | |||||
| block[1*4+i] = 0; | |||||
| block[2*4+i] = 0; | |||||
| block[3*4+i] = 0; | |||||
| tmp[i*4+0] = t0 + t3; | tmp[i*4+0] = t0 + t3; | ||||
| tmp[i*4+1] = t1 + t2; | tmp[i*4+1] = t1 + t2; | ||||
| @@ -94,6 +98,7 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride) | |||||
| { | { | ||||
| int i, dc = (block[0] + 4) >> 3; | int i, dc = (block[0] + 4) >> 3; | ||||
| uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; | uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; | ||||
| block[0] = 0; | |||||
| for (i = 0; i < 4; i++) { | for (i = 0; i < 4; i++) { | ||||
| dst[0] = cm[dst[0]]; | dst[0] = cm[dst[0]]; | ||||
| @@ -222,6 +222,7 @@ extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | |||||
| extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | ||||
| extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | |||||
| #define DECLARE_LOOP_FILTER(NAME)\ | #define DECLARE_LOOP_FILTER(NAME)\ | ||||
| extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ | extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ | ||||
| @@ -328,6 +329,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| } | } | ||||
| if (mm_flags & FF_MM_SSE) { | if (mm_flags & FF_MM_SSE) { | ||||
| c->vp8_idct_add = ff_vp8_idct_add_sse; | |||||
| c->put_vp8_epel_pixels_tab[0][0][0] = | c->put_vp8_epel_pixels_tab[0][0][0] = | ||||
| c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; | c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; | ||||
| } | } | ||||
| @@ -913,6 +913,7 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 | |||||
| paddw mm0, [pw_4] | paddw mm0, [pw_4] | ||||
| pxor mm1, mm1 | pxor mm1, mm1 | ||||
| psraw mm0, 3 | psraw mm0, 3 | ||||
| movd [r1], mm1 | |||||
| psubw mm1, mm0 | psubw mm1, mm0 | ||||
| packuswb mm0, mm0 | packuswb mm0, mm0 | ||||
| packuswb mm1, mm1 | packuswb mm1, mm1 | ||||
| @@ -944,11 +945,12 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 | |||||
| cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | ||||
| ; load data | ; load data | ||||
| movd xmm0, [r1] | movd xmm0, [r1] | ||||
| lea r1, [r0+r2*2] | |||||
| pxor xmm1, xmm1 | pxor xmm1, xmm1 | ||||
| ; calculate DC | ; calculate DC | ||||
| paddw xmm0, [pw_4] | paddw xmm0, [pw_4] | ||||
| movd [r1], xmm1 | |||||
| lea r1, [r0+r2*2] | |||||
| movd xmm2, [r0] | movd xmm2, [r0] | ||||
| movd xmm3, [r0+r2] | movd xmm3, [r0+r2] | ||||
| movd xmm4, [r1] | movd xmm4, [r1] | ||||
| @@ -1005,14 +1007,26 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | |||||
| %endmacro | %endmacro | ||||
| INIT_MMX | INIT_MMX | ||||
| cglobal vp8_idct_add_mmx, 3, 3 | |||||
| %macro VP8_IDCT_ADD 1 | |||||
| cglobal vp8_idct_add_%1, 3, 3 | |||||
| ; load block data | ; load block data | ||||
| movq m0, [r1] | |||||
| movq m1, [r1+8] | |||||
| movq m0, [r1+ 0] | |||||
| movq m1, [r1+ 8] | |||||
| movq m2, [r1+16] | movq m2, [r1+16] | ||||
| movq m3, [r1+24] | movq m3, [r1+24] | ||||
| movq m6, [pw_20091] | movq m6, [pw_20091] | ||||
| movq m7, [pw_17734] | movq m7, [pw_17734] | ||||
| %ifidn %1, sse | |||||
| xorps xmm0, xmm0 | |||||
| movaps [r1+ 0], xmm0 | |||||
| movaps [r1+16], xmm0 | |||||
| %else | |||||
| pxor m4, m4 | |||||
| movq [r1+ 0], m4 | |||||
| movq [r1+ 8], m4 | |||||
| movq [r1+16], m4 | |||||
| movq [r1+24], m4 | |||||
| %endif | |||||
| ; actual IDCT | ; actual IDCT | ||||
| VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | ||||
| @@ -1028,6 +1042,10 @@ cglobal vp8_idct_add_mmx, 3, 3 | |||||
| STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | ||||
| RET | RET | ||||
| %endmacro | |||||
| VP8_IDCT_ADD mmx | |||||
| VP8_IDCT_ADD sse | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) | ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) | ||||