Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (MMX and SSE2), since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
| @@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, | |||||
| } | } | ||||
| } | } | ||||
/*
 * Diff hunk for idct_mb(): removed and added rows are interleaved below.
 * The two-row signature taking y_dst/u_dst/v_dst is the removed form; the
 * one-row signature taking dst[3] is the added form (the call-site rows
 * further down change from idct_mb(s, dst[0], dst[1], dst[2], mb) to
 * idct_mb(s, dst, mb) accordingly).
 *
 * For each 4x4 block of the macroblock the function looks at the block's
 * s->non_zero_count_cache entry: 0 does nothing, 1 calls
 * s->vp8dsp.vp8_idct_dc_add, anything larger calls s->vp8dsp.vp8_idct_add.
 * Cache rows 0..3 drive the Y plane (4x4 blocks, s->linesize); cache rows
 * 4 and 5 drive the two remaining planes (2x2 blocks each, s->uvlinesize).
 * The Y pass is skipped when mb->mode is MODE_I4x4.
 */
| static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst, | |||||
| VP8Macroblock *mb) | |||||
| static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb) | |||||
| { | { | ||||
| int x, y, nnz; | |||||
| int x, y, ch, nnz; | |||||
| if (mb->mode != MODE_I4x4) | |||||
| if (mb->mode != MODE_I4x4) { | |||||
| uint8_t *y_dst = dst[0]; | |||||
| for (y = 0; y < 4; y++) { | for (y = 0; y < 4; y++) { | ||||
/* Removed form of the Y row: every block tested individually. */
| for (x = 0; x < 4; x++) { | |||||
| nnz = s->non_zero_count_cache[y][x]; | |||||
| if (nnz) { | |||||
| if (nnz == 1) | |||||
| s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize); | |||||
| else | |||||
| s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize); | |||||
/* Added form of the Y row. The row's cache entries [y][0..3] are fetched
 * as one 32-bit word (assumes one-byte entries and that AV_RN32A is an
 * aligned native-endian 32-bit read -- confirm against its definition).
 * This declaration shadows the function-level int nnz inside the loop. */
| uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]); | |||||
| if (nnz) { | |||||
/* A bit other than bit 0 of some byte is set, i.e. at least one entry is
 * neither 0 nor 1: fall back to the per-block dispatch. */
| if (nnz&~0x01010101) { | |||||
| for (x = 0; x < 4; x++) { | |||||
| nnz = s->non_zero_count_cache[y][x]; | |||||
| if (nnz) { | |||||
| if (nnz == 1) | |||||
| s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize); | |||||
| else | |||||
| s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize); | |||||
| } | |||||
| } | |||||
| } else { | |||||
/* Every entry is 0 or 1: one vp8_idct_dc_add4 call is issued for the whole
 * row of four blocks. */
| s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize); | |||||
| } | } | ||||
| } | } | ||||
| y_dst += 4*s->linesize; | y_dst += 4*s->linesize; | ||||
| } | } | ||||
| } | |||||
/* Removed form of the second pass: cache rows 4 (u_dst) and 5 (v_dst) are
 * handled side by side inside one 2x2 loop. */
| for (y = 0; y < 2; y++) { | |||||
| for (x = 0; x < 2; x++) { | |||||
| nnz = s->non_zero_count_cache[4][(y<<1)+x]; | |||||
| if (nnz) { | |||||
| if (nnz == 1) | |||||
| s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); | |||||
| else | |||||
| s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize); | |||||
| } | |||||
| nnz = s->non_zero_count_cache[5][(y<<1)+x]; | |||||
| if (nnz) { | |||||
| if (nnz == 1) | |||||
| s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize); | |||||
| else | |||||
| s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize); | |||||
/* Added form of the second pass: one iteration per plane (ch 0 uses cache
 * row 4 and dst[1], ch 1 uses cache row 5 and dst[2]). A plane is skipped
 * outright when the 32-bit word over its four cache entries is zero. */
| for (ch = 0; ch < 2; ch++) { | |||||
| if (AV_RN32A(s->non_zero_count_cache[4+ch])) { | |||||
| uint8_t *ch_dst = dst[1+ch]; | |||||
| for (y = 0; y < 2; y++) { | |||||
| for (x = 0; x < 2; x++) { | |||||
| nnz = s->non_zero_count_cache[4+ch][(y<<1)+x]; | |||||
| if (nnz) { | |||||
| if (nnz == 1) | |||||
| s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize); | |||||
| else | |||||
| s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize); | |||||
| } | |||||
| } | |||||
| ch_dst += 4*s->uvlinesize; | |||||
| } | } | ||||
| } | } | ||||
| u_dst += 4*s->uvlinesize; | |||||
| v_dst += 4*s->uvlinesize; | |||||
| } | } | ||||
| } | } | ||||
| @@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, | |||||
| prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); | prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); | ||||
| if (!mb->skip) { | if (!mb->skip) { | ||||
| idct_mb(s, dst[0], dst[1], dst[2], mb); | |||||
| idct_mb(s, dst, mb); | |||||
| } else { | } else { | ||||
| AV_ZERO64(s->left_nnz); | AV_ZERO64(s->left_nnz); | ||||
| AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned | AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned | ||||
| @@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride) | |||||
| } | } | ||||
| } | } | ||||
/*
 * C version of the 4-at-a-time DC-only add; ff_vp8dsp_init installs it as
 * dsp->vp8_idct_dc_add4.
 *
 * dst     top-left byte of a 16-column, 4-row pixel area; block j covers
 *         columns 4*j .. 4*j+3
 * block   four coefficient blocks; only element [j][0] of each is read,
 *         and it is reset to 0
 * stride  byte distance between consecutive rows of dst
 */
| static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride) | |||||
| { | |||||
| int i, j; | |||||
| for (j = 0; j < 4; j++) { | |||||
| uint8_t *pix = dst+j*4; | |||||
/* DC term of block j, biased by 4 and shifted right by 3. */
| int dc = (block[j][0] + 4) >> 3; | |||||
/* cm[p] reads ff_cropTbl[MAX_NEG_CROP + dc + p]; presumably the table
 * clamps p + dc to the 8-bit pixel range -- confirm against the
 * definition of ff_cropTbl. */
| uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; | |||||
/* The coefficient is cleared before the zero test, so it is reset even
 * when the block ends up contributing nothing. */
| block[j][0] = 0; | |||||
| if (!dc) | |||||
| continue; | |||||
/* Four rows of four pixels each, remapped through cm. */
| for (i = 0; i < 4; i++) { | |||||
| pix[0] = cm[pix[0]]; | |||||
| pix[1] = cm[pix[1]]; | |||||
| pix[2] = cm[pix[2]]; | |||||
| pix[3] = cm[pix[3]]; | |||||
| pix += stride; | |||||
| } | |||||
| } | |||||
| } | |||||
| // because I like only having two parameters to pass functions... | // because I like only having two parameters to pass functions... | ||||
| #define LOAD_PIXELS\ | #define LOAD_PIXELS\ | ||||
| @@ -460,9 +479,10 @@ VP8_BILINEAR(4) | |||||
| av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) | av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) | ||||
| { | { | ||||
| dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; | |||||
| dsp->vp8_idct_add = vp8_idct_add_c; | |||||
| dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; | |||||
| dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; | |||||
| dsp->vp8_idct_add = vp8_idct_add_c; | |||||
| dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; | |||||
| dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c; | |||||
| dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; | dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; | ||||
| dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; | dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; | ||||
| @@ -33,6 +33,7 @@ typedef struct VP8DSPContext { | |||||
| void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]); | void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]); | ||||
| void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride); | void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride); | void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| void (*vp8_idct_dc_add4)(uint8_t *dst, DCTELEM block[4][16], int stride); | |||||
| // loop filter applied to edges between macroblocks | // loop filter applied to edges between macroblocks | ||||
| void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride, | void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride, | ||||
| @@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16) | |||||
| extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride); | |||||
| extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); | |||||
| extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | ||||
| extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| @@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| if (mm_flags & FF_MM_MMX) { | if (mm_flags & FF_MM_MMX) { | ||||
| c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; | c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; | ||||
| c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx; | |||||
| c->vp8_idct_add = ff_vp8_idct_add_mmx; | c->vp8_idct_add = ff_vp8_idct_add_mmx; | ||||
| c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; | c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; | ||||
| c->put_vp8_epel_pixels_tab[0][0][0] = | c->put_vp8_epel_pixels_tab[0][0][0] = | ||||
| @@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| } | } | ||||
| if (mm_flags & FF_MM_SSE2) { | if (mm_flags & FF_MM_SSE2) { | ||||
| c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2; | |||||
| c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | ||||
| c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | ||||
| @@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2 | |||||
| REP_RET | REP_RET | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; IDCT functions: | |||||
| ; | |||||
| ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
; ADD_DC %1, %2, %3, %4
;   %1  register added to the pixels with paddusb (unsigned saturating add)
;   %2  register subtracted from the pixels with psubusb (unsigned
;       saturating subtract)
;   %3  byte offset applied to every row address
;   %4  move instruction used for all loads and stores
; Updates the four rows at [r0], [r0+r2], [r1] and [r1+r2]; every caller in
; this hunk sets r1 = r0 + 2*r2 beforehand. m2-m5 are used as scratch.
| %macro ADD_DC 4 | |||||
| %4 m2, [r0+%3] | |||||
| %4 m3, [r0+r2+%3] | |||||
| %4 m4, [r1+%3] | |||||
| %4 m5, [r1+r2+%3] | |||||
| paddusb m2, %1 | |||||
| paddusb m3, %1 | |||||
| paddusb m4, %1 | |||||
| paddusb m5, %1 | |||||
| psubusb m2, %2 | |||||
| psubusb m3, %2 | |||||
| psubusb m4, %2 | |||||
| psubusb m5, %2 | |||||
| %4 [r0+%3], m2 | |||||
| %4 [r0+r2+%3], m3 | |||||
| %4 [r1+%3], m4 | |||||
| %4 [r1+r2+%3], m5 | |||||
| %endmacro | |||||
; Diff hunk for vp8_idct_dc_add_mmx: rows naming mm0-mm5 are the removed
; form; rows naming m0/m1 and the ADD_DC invocation are the replacement.
; Per the prototype comment above, r0 = dst, r1 = block, r2 = stride.
| INIT_MMX | |||||
| cglobal vp8_idct_dc_add_mmx, 3, 3 | cglobal vp8_idct_dc_add_mmx, 3, 3 | ||||
| ; load data | ; load data | ||||
| movd mm0, [r1] | |||||
| movd m0, [r1] | |||||
| ; calculate DC | ; calculate DC | ||||
| paddw mm0, [pw_4] | |||||
| pxor mm1, mm1 | |||||
| psraw mm0, 3 | |||||
| movd [r1], mm1 | |||||
| psubw mm1, mm0 | |||||
| packuswb mm0, mm0 | |||||
| packuswb mm1, mm1 | |||||
| punpcklbw mm0, mm0 | |||||
| punpcklbw mm1, mm1 | |||||
| punpcklwd mm0, mm0 | |||||
| punpcklwd mm1, mm1 | |||||
; Replacement rows: word 0 becomes (block[0] + 4) >> 3, assuming pw_4 holds
; the word value 4. The first four bytes of the block are overwritten with
; zero. m1 = 0 - m0, so after the unsigned-saturating pack m0 carries the
; value when it is positive and m1 carries its magnitude when it is
; negative. The two unpacks replicate byte 0 across the low four bytes.
| paddw m0, [pw_4] | |||||
| pxor m1, m1 | |||||
| psraw m0, 3 | |||||
| movd [r1], m1 | |||||
| psubw m1, m0 | |||||
| packuswb m0, m0 | |||||
| packuswb m1, m1 | |||||
| punpcklbw m0, m0 | |||||
| punpcklbw m1, m1 | |||||
| punpcklwd m0, m0 | |||||
| punpcklwd m1, m1 | |||||
| ; add DC | ; add DC | ||||
| lea r1, [r0+r2*2] | |||||
| movd mm2, [r0] | |||||
| movd mm3, [r0+r2] | |||||
| movd mm4, [r1] | |||||
| movd mm5, [r1+r2] | |||||
| paddusb mm2, mm0 | |||||
| paddusb mm3, mm0 | |||||
| paddusb mm4, mm0 | |||||
| paddusb mm5, mm0 | |||||
| psubusb mm2, mm1 | |||||
| psubusb mm3, mm1 | |||||
| psubusb mm4, mm1 | |||||
| psubusb mm5, mm1 | |||||
| movd [r0], mm2 | |||||
| movd [r0+r2], mm3 | |||||
| movd [r1], mm4 | |||||
| movd [r1+r2], mm5 | |||||
; Replacement rows: r1 is reused as the address of the third row, then the
; shared macro performs the load / add / subtract / store sequence that the
; removed rows above spelled out by hand.
| lea r1, [r0+r2*2] | |||||
| ADD_DC m0, m1, 0, movh | |||||
| RET | RET | ||||
; Diff hunk for vp8_idct_dc_add_sse4: the two rows naming xmm0/xmm1 are the
; start of the removed form (its remaining rows appear further down in the
; diff); the rows naming m0-m5 are the replacement.
; Per the prototype comment above, r0 = dst, r1 = block, r2 = stride.
| INIT_XMM | |||||
| cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | ||||
| ; load data | ; load data | ||||
| movd xmm0, [r1] | |||||
| pxor xmm1, xmm1 | |||||
| movd m0, [r1] | |||||
| pxor m1, m1 | |||||
| ; calculate DC | |||||
| paddw m0, [pw_4] | |||||
; Overwrite the first four bytes of the block with zero.
| movd [r1], m1 | |||||
| lea r1, [r0+r2*2] | |||||
; Fetch four pixels from each of the four rows.
| movd m2, [r0] | |||||
| movd m3, [r0+r2] | |||||
| movd m4, [r1] | |||||
| movd m5, [r1+r2] | |||||
| psraw m0, 3 | |||||
; Replicate word 0 (the DC value) into all eight words of m0.
| pshuflw m0, m0, 0 | |||||
| punpcklqdq m0, m0 | |||||
; Pair rows 0/1 in m2 and rows 2/3 in m4, then widen the bytes to words
; against the zero register m1.
| punpckldq m2, m3 | |||||
| punpckldq m4, m5 | |||||
| punpcklbw m2, m1 | |||||
| punpcklbw m4, m1 | |||||
; Signed word add of the DC value, then pack back to bytes with unsigned
; saturation; m2 ends up holding rows 0..3 in dwords 0..3.
| paddw m2, m0 | |||||
| paddw m4, m0 | |||||
| packuswb m2, m4 | |||||
| movd [r0], m2 | |||||
| pextrd [r0+r2], m2, 1 | |||||
| pextrd [r1], m2, 2 | |||||
| pextrd [r1+r2], m2, 3 | |||||
| RET | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); | |||||
| ;----------------------------------------------------------------------------- | |||||
; MMX version of the 4-at-a-time DC-only add.
; Per the prototype comment above, r0 = dst, r1 = block, r2 = stride.
; The four blocks (labelled A, B, C, D in the row comments) start 32 bytes
; apart at r1.
; NOTE: the rows naming xmm0-xmm5 in the middle of this hunk are the removed
; body of the old vp8_idct_dc_add_sse4, placed here by the diff alignment;
; they are not part of this function.
| INIT_MMX | |||||
| cglobal vp8_idct_dc_add4_mmx, 3, 3 | |||||
| ; load data | |||||
| movd m0, [r1+32*0] ; A | |||||
| movd m1, [r1+32*2] ; C | |||||
| punpcklwd m0, [r1+32*1] ; A B | |||||
| punpcklwd m1, [r1+32*3] ; C D | |||||
| punpckldq m0, m1 ; A B C D | |||||
| pxor m6, m6 | |||||
| ; calculate DC | ; calculate DC | ||||
| paddw xmm0, [pw_4] | |||||
| movd [r1], xmm1 | |||||
| lea r1, [r0+r2*2] | |||||
| movd xmm2, [r0] | |||||
| movd xmm3, [r0+r2] | |||||
| movd xmm4, [r1] | |||||
| movd xmm5, [r1+r2] | |||||
| psraw xmm0, 3 | |||||
| pshuflw xmm0, xmm0, 0 | |||||
| punpcklqdq xmm0, xmm0 | |||||
| punpckldq xmm2, xmm3 | |||||
| punpckldq xmm4, xmm5 | |||||
| punpcklbw xmm2, xmm1 | |||||
| punpcklbw xmm4, xmm1 | |||||
| paddw xmm2, xmm0 | |||||
| paddw xmm4, xmm0 | |||||
| packuswb xmm2, xmm4 | |||||
| movd [r0], xmm2 | |||||
| pextrd [r0+r2], xmm2, 1 | |||||
| pextrd [r1], xmm2, 2 | |||||
| pextrd [r1+r2], xmm2, 3 | |||||
; Each of the four words becomes (value + 4) >> 3, assuming pw_4 holds the
; word value 4. The first four bytes of every block are overwritten with
; zero.
| paddw m0, [pw_4] | |||||
| movd [r1+32*0], m6 | |||||
| movd [r1+32*1], m6 | |||||
| movd [r1+32*2], m6 | |||||
| movd [r1+32*3], m6 | |||||
| psraw m0, 3 | |||||
; m6 = 0 - m0. After the unsigned-saturating packs m0 carries the positive
; DC values and m6 the magnitudes of the negative ones; ADD_DC adds the
; first and subtracts the second.
| psubw m6, m0 | |||||
| packuswb m0, m0 | |||||
| packuswb m6, m6 | |||||
| punpcklbw m0, m0 ; AABBCCDD | |||||
| punpcklbw m6, m6 ; AABBCCDD | |||||
| movq m1, m0 | |||||
| movq m7, m6 | |||||
| punpcklbw m0, m0 ; AAAABBBB | |||||
| punpckhbw m1, m1 ; CCCCDDDD | |||||
| punpcklbw m6, m6 ; AAAABBBB | |||||
| punpckhbw m7, m7 ; CCCCDDDD | |||||
| ; add DC | |||||
; r1 is reused as the address of the third row. The first ADD_DC handles
; byte offsets 0..7 of each row (blocks A and B), the second handles byte
; offsets 8..15 (blocks C and D).
| lea r1, [r0+r2*2] | |||||
| ADD_DC m0, m6, 0, mova | |||||
| ADD_DC m1, m7, 8, mova | |||||
| RET | |||||
; SSE2 version of the 4-at-a-time DC-only add.
; Per the prototype comment above, r0 = dst, r1 = block, r2 = stride.
; The four blocks (A, B, C, D) start 32 bytes apart at r1.
; NOTE(review): vp8_idct_dc_add_sse4 passes 6 as a fourth cglobal argument
; while this declaration passes none, although m0-m5 are used here too --
; confirm whether the register count should be declared for consistency.
; NOTE(review): the punpcklwd memory operands and the mova accesses in
; ADD_DC presumably require 16-byte-aligned block and dst rows in the XMM
; build -- confirm that callers guarantee this.
| INIT_XMM | |||||
| cglobal vp8_idct_dc_add4_sse2, 3, 3 | |||||
| ; load data | |||||
| movd m0, [r1+32*0] ; A | |||||
| movd m1, [r1+32*2] ; C | |||||
| punpcklwd m0, [r1+32*1] ; A B | |||||
| punpcklwd m1, [r1+32*3] ; C D | |||||
| punpckldq m0, m1 ; A B C D | |||||
| pxor m1, m1 | |||||
| ; calculate DC | |||||
; Each word becomes (value + 4) >> 3, assuming pw_4 holds the word value 4.
; The first four bytes of every block are overwritten with zero.
| paddw m0, [pw_4] | |||||
| movd [r1+32*0], m1 | |||||
| movd [r1+32*1], m1 | |||||
| movd [r1+32*2], m1 | |||||
| movd [r1+32*3], m1 | |||||
| psraw m0, 3 | |||||
; m1 = 0 - m0. After the unsigned-saturating packs m0 carries the positive
; DC values and m1 the magnitudes of the negative ones. Two byte-unpacks
; then replicate each of the four low bytes four times, giving one 16-byte
; row pattern per register.
| psubw m1, m0 | |||||
| packuswb m0, m0 | |||||
| packuswb m1, m1 | |||||
| punpcklbw m0, m0 | |||||
| punpcklbw m1, m1 | |||||
| punpcklbw m0, m0 | |||||
| punpcklbw m1, m1 | |||||
| ; add DC | |||||
; r1 is reused as the address of the third row; a single ADD_DC covers all
; 16 bytes of each of the four rows.
| lea r1, [r0+r2*2] | |||||
| ADD_DC m0, m1, 0, mova | |||||
| RET | RET | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||