Originally committed as revision 15630 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.5)
| @@ -63,7 +63,9 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; | |||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; | ||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; | ||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; | ||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; | |||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; | ||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; | |||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; | ||||
| DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; | DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; | ||||
| @@ -2591,6 +2593,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| c->h263_v_loop_filter= h263_v_loop_filter_mmx; | c->h263_v_loop_filter= h263_v_loop_filter_mmx; | ||||
| c->h263_h_loop_filter= h263_h_loop_filter_mmx; | c->h263_h_loop_filter= h263_h_loop_filter_mmx; | ||||
| } | } | ||||
| if ((ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) && | |||||
| !(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||||
| c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx; | |||||
| c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx; | |||||
| } | |||||
| c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; | c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; | ||||
| c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; | c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; | ||||
| c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; | c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; | ||||
| @@ -50,7 +50,9 @@ extern const uint64_t ff_pw_255; | |||||
| extern const uint64_t ff_pb_1; | extern const uint64_t ff_pb_1; | ||||
| extern const uint64_t ff_pb_3; | extern const uint64_t ff_pb_3; | ||||
| extern const uint64_t ff_pb_7; | extern const uint64_t ff_pb_7; | ||||
| extern const uint64_t ff_pb_1F; | |||||
| extern const uint64_t ff_pb_3F; | extern const uint64_t ff_pb_3F; | ||||
| extern const uint64_t ff_pb_81; | |||||
| extern const uint64_t ff_pb_A1; | extern const uint64_t ff_pb_A1; | ||||
| extern const uint64_t ff_pb_FC; | extern const uint64_t ff_pb_FC; | ||||
| @@ -86,6 +88,22 @@ extern const double ff_pd_2[2]; | |||||
| SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ | SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ | ||||
| SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ | SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ | ||||
/*
 * TRANSPOSE8x4(a,b,c,d,e,f,g,h,t): emits the asm text for a byte transpose.
 * Rows e..h are first byte-interleaved into a..d with punpcklbw, then four
 * SBUTTERFLY steps (two at "bw" granularity, two at "wd") finish the job.
 * According to the lane comments below, the four result rows end up in
 * a, d, t and c; b is overwritten along the way and t must be a free
 * register. SBUTTERFLY is defined outside this excerpt, so the lane layout
 * is documented as annotated by the original author.
 */
| // e,f,g,h can be memory | |||||
| // out: a,d,t,c | |||||
| #define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ | |||||
| "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ | |||||
| "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ | |||||
| "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\ | |||||
| "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ | |||||
| SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ | |||||
| /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ | |||||
| SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ | |||||
| /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ | |||||
| SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ | |||||
| /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ | |||||
| SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ | |||||
| /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ | |||||
| #ifdef ARCH_X86_64 | #ifdef ARCH_X86_64 | ||||
| // permutes 01234567 -> 05736421 | // permutes 01234567 -> 05736421 | ||||
| #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ | #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ | ||||
| @@ -23,11 +23,112 @@ | |||||
| * MMX-optimized functions cribbed from the original VP3 source code. | * MMX-optimized functions cribbed from the original VP3 source code. | ||||
| */ | */ | ||||
| #include "libavutil/x86_cpu.h" | |||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "dsputil_mmx.h" | #include "dsputil_mmx.h" | ||||
| extern const uint16_t ff_vp3_idct_data[]; | extern const uint16_t ff_vp3_idct_data[]; | ||||
/*
 * VP3_LOOP_FILTER(flim): asm text for the VP3 edge filter on eight byte
 * lanes at once.
 *
 * Inputs are four rows of pixels in mm6 (p0), mm4 (p1), mm2 (p2), mm1 (p3);
 * flim is a memory operand holding the per-byte limit, loaded into mm5.
 * Results are the filtered p1 in mm4 and the filtered p2 in mm3. Every
 * other MMX register (mm0-mm2, mm5-mm7) is used as scratch and overwritten.
 *
 * Outline of the steps below:
 *   1. Split p0 into (p0 & 7) and (p0 >> 3) per byte; fold the low bits
 *      plus 3*((p2^p1)&1) into a rounding-correction term.
 *   2. Use complemented inputs and repeated pavgb to build a biased value
 *      that the inline comments call "d+128+1".
 *   3. Separate it into two unsigned magnitudes around the 0x81 bias with
 *      saturating subtraction (mm6 = 0x81 - v, mm7 = v - 0x81).
 *   4. Clamp each magnitude m against the limit L as
 *      min(2*min(m,L), L) - min(m,L); the doubling uses wrapping paddb.
 *   5. Apply the two magnitudes to p1 and p2 with saturating add/sub.
 *
 * NOTE(review): pavgb and pminub are not part of the baseline MMX
 * instruction set (they arrived with the SSE/extended-MMX integer
 * additions). The code that installs the functions built from this macro
 * should be gated on that CPU capability -- the gate is not visible in
 * this excerpt, please confirm.
 */
| // this is off by one or two for some cases when filter_limit is greater than 63 | |||||
| // in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 | |||||
| // out: p1 in mm4, p2 in mm3 | |||||
| #define VP3_LOOP_FILTER(flim) \ | |||||
| "movq %%mm6, %%mm7 \n\t" \ | |||||
| "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ | |||||
| "psrlw $3, %%mm7 \n\t" \ | |||||
| "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ | |||||
| "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ | |||||
| "pxor %%mm4, %%mm2 \n\t" \ | |||||
| "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ | |||||
| "movq %%mm2, %%mm5 \n\t" \ | |||||
| "paddb %%mm2, %%mm2 \n\t" \ | |||||
| "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ | |||||
| "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ | |||||
| "pcmpeqb %%mm0, %%mm0 \n\t" \ | |||||
| "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ | |||||
| "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ | |||||
| "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ | |||||
| "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ | |||||
| "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ | |||||
| "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ | |||||
| "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ | |||||
| "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ | |||||
| "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ | |||||
| "psubusb %%mm7, %%mm6 \n\t" \ | |||||
| "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ | |||||
| \ | |||||
| "movq "#flim", %%mm5 \n\t" \ | |||||
| "pminub %%mm5, %%mm6 \n\t" \ | |||||
| "pminub %%mm5, %%mm7 \n\t" \ | |||||
| "movq %%mm6, %%mm0 \n\t" \ | |||||
| "movq %%mm7, %%mm1 \n\t" \ | |||||
| "paddb %%mm6, %%mm6 \n\t" \ | |||||
| "paddb %%mm7, %%mm7 \n\t" \ | |||||
| "pminub %%mm5, %%mm6 \n\t" \ | |||||
| "pminub %%mm5, %%mm7 \n\t" \ | |||||
| "psubb %%mm0, %%mm6 \n\t" \ | |||||
| "psubb %%mm1, %%mm7 \n\t" \ | |||||
| "paddusb %%mm7, %%mm4 \n\t" \ | |||||
| "psubusb %%mm6, %%mm4 \n\t" \ | |||||
| "psubusb %%mm7, %%mm3 \n\t" \ | |||||
| "paddusb %%mm6, %%mm3 \n\t" | |||||
/*
 * STORE_4_WORDS(dst0, dst1, dst2, dst3, mm): asm text that scatters the four
 * 16-bit words of MMX register mm to four memory destinations.
 *
 * Each dstN is an address expression such as (%1,%3); the literal "-1" is
 * pasted in front of it, so every store lands one byte before that address.
 * Asm operand %0 must be a general-purpose scratch register: the macro moves
 * the low dword of mm into it, stores its low word (%w0), shifts right by 16
 * and stores again, then repeats for the upper dword after psrlq $32.
 * Side effects: mm is shifted right by 32 bits and %0 is overwritten.
 */
| #define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ | |||||
| "movd "#mm", %0 \n\t" \ | |||||
| "movw %w0, -1"#dst0" \n\t" \ | |||||
| "psrlq $32, "#mm" \n\t" \ | |||||
| "shr $16, %0 \n\t" \ | |||||
| "movw %w0, -1"#dst1" \n\t" \ | |||||
| "movd "#mm", %0 \n\t" \ | |||||
| "movw %w0, -1"#dst2" \n\t" \ | |||||
| "shr $16, %0 \n\t" \ | |||||
| "movw %w0, -1"#dst3" \n\t" | |||||
/*
 * Applies VP3_LOOP_FILTER across a horizontal edge, eight pixels wide.
 *
 * Loads 8 bytes from each of the rows src-2*stride, src-stride, src and
 * src+stride into mm6, mm4, mm2 and mm1, runs the filter, then writes the
 * filtered mm4 back to row src-stride and mm3 back to row src. The two outer
 * rows are only read by the visible asm, although all four are declared as
 * read-write ("+m") operands.
 *
 * src             - pointer into the picture; presumably the first row below
 *                   the edge being filtered (TODO confirm against caller).
 * stride          - distance in bytes between consecutive rows.
 * bounding_values - table whose 8 bytes starting at int index 129 are used
 *                   as the packed per-byte filter limit.
 *
 * No emms is issued here, and no MMX registers are listed as clobbered.
 * NOTE(review): VP3_LOOP_FILTER uses pavgb/pminub, which are beyond baseline
 * MMX; confirm the dispatcher only installs this on capable CPUs.
 */
| void ff_vp3_v_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values) | |||||
| { | |||||
| __asm__ volatile( | |||||
| "movq %0, %%mm6 \n\t" | |||||
| "movq %1, %%mm4 \n\t" | |||||
| "movq %2, %%mm2 \n\t" | |||||
| "movq %3, %%mm1 \n\t" | |||||
| VP3_LOOP_FILTER(%4) | |||||
| "movq %%mm4, %1 \n\t" | |||||
| "movq %%mm3, %2 \n\t" | |||||
| : "+m" (*(uint64_t*)(src - 2*stride)), | |||||
| "+m" (*(uint64_t*)(src - 1*stride)), | |||||
| "+m" (*(uint64_t*)(src + 0*stride)), | |||||
| "+m" (*(uint64_t*)(src + 1*stride)) | |||||
| : "m"(*(uint64_t*)(bounding_values+129)) | |||||
| ); | |||||
| } | |||||
/*
 * Applies VP3_LOOP_FILTER across a vertical edge, eight rows tall.
 *
 * Asm operands: %0 = tmp (scratch), %1 = src, %2 = src + 4*stride,
 * %3 = stride, %4 = 3*stride, %5 = packed limit at bounding_values+129.
 *
 * Steps:
 *   1. Load 4 bytes starting at column -2 from rows 0..3 (via %1) into
 *      mm6, mm0, mm1, mm4.
 *   2. TRANSPOSE8x4 merges them with the same columns of rows 4..7 (via %2,
 *      read straight from memory) so the registers match the layout
 *      VP3_LOOP_FILTER expects.
 *   3. Run the filter; results are in mm4 and mm3.
 *   4. SBUTTERFLY interleaves mm4/mm3 into mm4 and mm5 (macro defined outside
 *      this excerpt -- presumably pairing the two output columns per row).
 *   5. STORE_4_WORDS writes one 16-bit pair per row at column -1: rows 0..3
 *      from mm4, rows 4..7 from mm5.
 *
 * src             - pointer into the picture; presumably the first column to
 *                   the right of the edge (TODO confirm against caller).
 * stride          - distance in bytes between consecutive rows.
 * bounding_values - table whose 8 bytes starting at int index 129 are used
 *                   as the packed per-byte filter limit.
 *
 * The asm declares a "memory" clobber because the pixel accesses go through
 * register-based addresses rather than memory operands. No emms is issued.
 * NOTE(review): VP3_LOOP_FILTER uses pavgb/pminub, which are beyond baseline
 * MMX; confirm the dispatcher only installs this on capable CPUs.
 */
| void ff_vp3_h_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values) | |||||
| { | |||||
| x86_reg tmp; | |||||
| __asm__ volatile( | |||||
| "movd -2(%1), %%mm6 \n\t" | |||||
| "movd -2(%1,%3), %%mm0 \n\t" | |||||
| "movd -2(%1,%3,2), %%mm1 \n\t" | |||||
| "movd -2(%1,%4), %%mm4 \n\t" | |||||
| TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) | |||||
| VP3_LOOP_FILTER(%5) | |||||
| SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) | |||||
| STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) | |||||
| STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) | |||||
| : "=&r"(tmp) | |||||
| : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), | |||||
| "m"(*(uint64_t*)(bounding_values+129)) | |||||
| : "memory" | |||||
| ); | |||||
| } | |||||
| /* from original comments: The Macro does IDct on 4 1-D Dcts */ | /* from original comments: The Macro does IDct on 4 1-D Dcts */ | ||||
| #define BeginIDCT() \ | #define BeginIDCT() \ | ||||
| "movq "I(3)", %%mm2 \n\t" \ | "movq "I(3)", %%mm2 \n\t" \ | ||||
| @@ -29,4 +29,7 @@ void ff_vp3_idct_mmx(int16_t *data); | |||||
| void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); | ||||
| void ff_vp3_v_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values); | |||||
| void ff_vp3_h_loop_filter_mmx(uint8_t *src, int stride, int *bounding_values); | |||||
| #endif /* AVCODEC_I386_VP3DSP_MMX_H */ | #endif /* AVCODEC_I386_VP3DSP_MMX_H */ | ||||
| @@ -229,7 +229,7 @@ typedef struct Vp3DecodeContext { | |||||
| uint16_t huffman_table[80][32][2]; | uint16_t huffman_table[80][32][2]; | ||||
| uint8_t filter_limit_values[64]; | uint8_t filter_limit_values[64]; | ||||
| int bounding_values_array[256]; | |||||
| DECLARE_ALIGNED_8(int, bounding_values_array[256+2]); | |||||
| } Vp3DecodeContext; | } Vp3DecodeContext; | ||||
| /************************************************************************ | /************************************************************************ | ||||
| @@ -533,6 +533,7 @@ static void init_loop_filter(Vp3DecodeContext *s) | |||||
| bounding_values[x] = x; | bounding_values[x] = x; | ||||
| bounding_values[x + filter_limit] = filter_limit - x; | bounding_values[x + filter_limit] = filter_limit - x; | ||||
| } | } | ||||
| bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202; | |||||
| } | } | ||||
| /* | /* | ||||