2-4% faster overall decode. Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.6)
| @@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); | |||
| void ff_vp3_idct_neon(DCTELEM *data); | |||
| void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); | |||
| void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); | |||
| void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data); | |||
| void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); | |||
| void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); | |||
| @@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||
| if (CONFIG_VP3_DECODER) { | |||
| c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon; | |||
| c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon; | |||
| c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon; | |||
| } | |||
| c->vector_fmul = ff_vector_fmul_neon; | |||
| @@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1 | |||
| vst1.64 {d7}, [r2,:64], r1 | |||
| bx lr | |||
| endfunc | |||
/*
 * ff_vp3_idct_dc_add_neon: add a single DC-derived value to every pixel of
 * an 8x8 block, saturating each result to the unsigned 8-bit range.
 *
 * Register use as written below (matches the C prototype
 * (uint8_t *dest, int line_size, const DCTELEM *data) under the usual ARM
 * calling convention -- NOTE(review): confirm the ABI assumption):
 *   r0 = dest       read pointer, advanced by r1 after every row load
 *   r1 = line_size  byte stride between rows
 *   r2 = data       only data[0] is read; r2 is then reused for the DC value
 *   r3 = scratch    first the constant 46341, then the write pointer
 */
| function ff_vp3_idct_dc_add_neon, export=1 | |||
/* r2 = data[0], loaded as a sign-extended 16-bit value */
| ldrsh r2, [r2] | |||
/*
 * Scale the DC coefficient by 46341 twice:
 *   mul    : r2 = 46341 * dc
 *   smulwt : r2 = (46341 * (top halfword of r2)) >> 16
 * i.e. (46341 * ((46341 * dc) >> 16)) >> 16.
 * 46341 is approximately 2^16 / sqrt(2); presumably this is the VP3 IDCT
 * DC gain -- confirm against the full IDCT.
 */
| movw r3, #46341 | |||
| mul r2, r3, r2 | |||
| smulwt r2, r3, r2 | |||
/* keep the original dest for the stores; r0 is consumed by the loads */
| mov r3, r0 | |||
/* broadcast the scaled DC to all eight 16-bit lanes of q15 ... */
| vdup.16 q15, r2 | |||
/* ... and finish with a rounding right shift by 4: (x + 8) >> 4 per lane */
| vrshr.s16 q15, q15, #4 | |||
/*
 * Load the eight rows (8 bytes each, 64-bit aligned addresses) into d0-d7.
 * The widening adds (u8 row + 16-bit DC -> q8..q15) are interleaved with
 * the remaining loads.
 */
| vld1.8 {d0}, [r0,:64], r1 | |||
| vld1.8 {d1}, [r0,:64], r1 | |||
| vld1.8 {d2}, [r0,:64], r1 | |||
| vaddw.u8 q8, q15, d0 | |||
| vld1.8 {d3}, [r0,:64], r1 | |||
| vaddw.u8 q9, q15, d1 | |||
| vld1.8 {d4}, [r0,:64], r1 | |||
| vaddw.u8 q10, q15, d2 | |||
| vld1.8 {d5}, [r0,:64], r1 | |||
| vaddw.u8 q11, q15, d3 | |||
| vld1.8 {d6}, [r0,:64], r1 | |||
| vaddw.u8 q12, q15, d4 | |||
| vld1.8 {d7}, [r0,:64], r1 | |||
| vaddw.u8 q13, q15, d5 | |||
/*
 * From here on each sum is narrowed back to bytes with signed-to-unsigned
 * saturation (vqmovun) and the rows are written out through r3.
 */
| vqmovun.s16 d0, q8 | |||
| vaddw.u8 q14, q15, d6 | |||
| vqmovun.s16 d1, q9 | |||
/* last use of the DC vector: q15 is overwritten with the sum for row 7 */
| vaddw.u8 q15, q15, d7 | |||
| vqmovun.s16 d2, q10 | |||
| vst1.8 {d0}, [r3,:64], r1 | |||
| vqmovun.s16 d3, q11 | |||
| vst1.8 {d1}, [r3,:64], r1 | |||
| vqmovun.s16 d4, q12 | |||
| vst1.8 {d2}, [r3,:64], r1 | |||
| vqmovun.s16 d5, q13 | |||
| vst1.8 {d3}, [r3,:64], r1 | |||
| vqmovun.s16 d6, q14 | |||
| vst1.8 {d4}, [r3,:64], r1 | |||
| vqmovun.s16 d7, q15 | |||
| vst1.8 {d5}, [r3,:64], r1 | |||
| vst1.8 {d6}, [r3,:64], r1 | |||
| vst1.8 {d7}, [r3,:64], r1 | |||
/* no stack or callee-saved core registers were touched; plain return */
| bx lr | |||
| endfunc | |||
| @@ -4467,6 +4467,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||
| if (CONFIG_VP3_DECODER) { | |||
| c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c; | |||
| c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c; | |||
| c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c; | |||
| } | |||
| if (CONFIG_VP6_DECODER) { | |||
| c->vp6_filter_diag4= ff_vp6_filter_diag4_c; | |||
| @@ -86,6 +86,7 @@ extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP]; | |||
| void ff_vp3_idct_c(DCTELEM *block/* align 16*/); | |||
| void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); | |||
| void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); | |||
| void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/); | |||
| void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values); | |||
| void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values); | |||
| @@ -373,6 +374,7 @@ typedef struct DSPContext { | |||
| void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale); | |||
| void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale); | |||
| void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/); | |||
| void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values); | |||
| void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values); | |||
| @@ -1395,8 +1395,6 @@ static void render_slice(Vp3DecodeContext *s, int slice) | |||
| /* transform if this block was coded */ | |||
| if (s->all_fragments[i].coding_method != MODE_COPY) { | |||
| int intra = s->all_fragments[i].coding_method == MODE_INTRA; | |||
| if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) || | |||
| (s->all_fragments[i].coding_method == MODE_GOLDEN_MV)) | |||
| motion_source= golden_plane; | |||
| @@ -1456,11 +1454,11 @@ static void render_slice(Vp3DecodeContext *s, int slice) | |||
| } | |||
| s->dsp.clear_block(block); | |||
| vp3_dequant(s, s->all_fragments + i, plane, !intra, block); | |||
| /* invert DCT and place (or add) in final output */ | |||
| if (s->all_fragments[i].coding_method == MODE_INTRA) { | |||
| vp3_dequant(s, s->all_fragments + i, plane, 0, block); | |||
| if(s->avctx->idct_algo!=FF_IDCT_VP3) | |||
| block[0] += 128<<3; | |||
| s->dsp.idct_put( | |||
| @@ -1468,10 +1466,14 @@ static void render_slice(Vp3DecodeContext *s, int slice) | |||
| stride, | |||
| block); | |||
| } else { | |||
| if (vp3_dequant(s, s->all_fragments + i, plane, 1, block)) { | |||
| s->dsp.idct_add( | |||
| output_plane + first_pixel, | |||
| stride, | |||
| block); | |||
| } else { | |||
| s->dsp.vp3_idct_dc_add(output_plane + first_pixel, stride, block); | |||
| } | |||
| } | |||
| } else { | |||
| @@ -223,6 +223,25 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/* | |||
| idct(dest, line_size, block, 2); | |||
| } | |||
| void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){ | |||
| const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; | |||
| int i, dc = block[0]; | |||
| dc = (46341*dc)>>16; | |||
| dc = (46341*dc + (8<<16))>>20; | |||
| for(i = 0; i < 8; i++){ | |||
| dest[0] = cm[dest[0]+dc]; | |||
| dest[1] = cm[dest[1]+dc]; | |||
| dest[2] = cm[dest[2]+dc]; | |||
| dest[3] = cm[dest[3]+dc]; | |||
| dest[4] = cm[dest[4]+dc]; | |||
| dest[5] = cm[dest[5]+dc]; | |||
| dest[6] = cm[dest[6]+dc]; | |||
| dest[7] = cm[dest[7]+dc]; | |||
| dest += line_size; | |||
| } | |||
| } | |||
| void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values) | |||
| { | |||
| unsigned char *end; | |||
| @@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; | |||
| } | |||
| } | |||
| if (CONFIG_VP3_DECODER) { | |||
| c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; | |||
| } | |||
| #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |||
| c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ | |||
| @@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) | |||
| ff_vp3_idct_mmx(block); | |||
| add_pixels_clamped_mmx(block, dest, line_size); | |||
| } | |||
| void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block) | |||
| { | |||
| int dc = block[0]; | |||
| dc = (46341*dc)>>16; | |||
| dc = (46341*dc + (8<<16))>>20; | |||
| __asm__ volatile( | |||
| "movd %3, %%mm0 \n\t" | |||
| "pshufw $0, %%mm0, %%mm0 \n\t" | |||
| "pxor %%mm1, %%mm1 \n\t" | |||
| "psubw %%mm0, %%mm1 \n\t" | |||
| "packuswb %%mm0, %%mm0 \n\t" | |||
| "packuswb %%mm1, %%mm1 \n\t" | |||
| #define DC_ADD \ | |||
| "movq (%0), %%mm2 \n\t" \ | |||
| "movq (%0,%1), %%mm3 \n\t" \ | |||
| "paddusb %%mm0, %%mm2 \n\t" \ | |||
| "movq (%0,%1,2), %%mm4 \n\t" \ | |||
| "paddusb %%mm0, %%mm3 \n\t" \ | |||
| "movq (%0,%2), %%mm5 \n\t" \ | |||
| "paddusb %%mm0, %%mm4 \n\t" \ | |||
| "paddusb %%mm0, %%mm5 \n\t" \ | |||
| "psubusb %%mm1, %%mm2 \n\t" \ | |||
| "psubusb %%mm1, %%mm3 \n\t" \ | |||
| "movq %%mm2, (%0) \n\t" \ | |||
| "psubusb %%mm1, %%mm4 \n\t" \ | |||
| "movq %%mm3, (%0,%1) \n\t" \ | |||
| "psubusb %%mm1, %%mm5 \n\t" \ | |||
| "movq %%mm4, (%0,%1,2) \n\t" \ | |||
| "movq %%mm5, (%0,%2) \n\t" | |||
| DC_ADD | |||
| "lea (%0,%1,4), %0 \n\t" | |||
| DC_ADD | |||
| : "+r"(dest) | |||
| : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) | |||
| ); | |||
| } | |||
| @@ -28,6 +28,7 @@ | |||
| void ff_vp3_idct_mmx(int16_t *data); | |||
| void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | |||
| void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | |||
| void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); | |||
| void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); | |||
| void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); | |||