Code mostly inspired by vp8's MC, however:
- its MMX2 horizontal filter is worse because it can't take advantage of
the coefficient redundancy
- that same coefficient redundancy allows better code for non-SSSE3 versions
Benchmark (rounded to tens of units):
V8x8 H8x8 2D8x8 V16x16 H16x16 2D16x16
C 445 358 985 1785 1559 3280
MMX* 219 271 478 714 929 1443
SSE2 131 158 294 425 515 892
SSSE3 120 122 248 387 390 763
End result is overall around a 15% speedup for SSSE3 version (on 6 sequences);
all loop filter functions now take around 55% of decoding time, while luma MC
dsp functions are around 6%, chroma ones are 1.3% and biweight around 2.3%.
Signed-off-by: Diego Biurrun <diego@biurrun.de>
tags/n0.11
| @@ -1791,6 +1791,22 @@ QPEL_2TAP(avg_, 16, 3dnow) | |||
| QPEL_2TAP(put_, 8, 3dnow) | |||
| QPEL_2TAP(avg_, 8, 3dnow) | |||
/* RV40 qpel "mc33" put function for the 8-pixel block size: forwards
 * dst/src/stride unchanged to put_pixels8_xy2_mmx() with 8 as the last
 * argument (presumably the row count -- confirm against that helper). */
| void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) | |||
| { | |||
| put_pixels8_xy2_mmx(dst, src, stride, 8); | |||
| } | |||
/* RV40 qpel "mc33" put function for the 16-pixel block size: forwards
 * dst/src/stride unchanged to put_pixels16_xy2_mmx() with 16 as the last
 * argument (presumably the row count -- confirm against that helper). */
| void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) | |||
| { | |||
| put_pixels16_xy2_mmx(dst, src, stride, 16); | |||
| } | |||
/* RV40 qpel "mc33" avg function for the 8-pixel block size: forwards
 * dst/src/stride unchanged to avg_pixels8_xy2_mmx() with 8 as the last
 * argument (presumably the row count -- confirm against that helper). */
| void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) | |||
| { | |||
| avg_pixels8_xy2_mmx(dst, src, stride, 8); | |||
| } | |||
/* RV40 qpel "mc33" avg function for the 16-pixel block size: forwards
 * dst/src/stride unchanged to avg_pixels16_xy2_mmx() with 16 as the last
 * argument (presumably the row count -- confirm against that helper). */
| void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) | |||
| { | |||
| avg_pixels16_xy2_mmx(dst, src, stride, 16); | |||
| } | |||
| #if HAVE_YASM | |||
| typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, | |||
| @@ -199,6 +199,11 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); | |||
| void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |||
| void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |||
/* RV40 qpel "mc33" put/avg functions for the 8- and 16-pixel block sizes. */
| void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size); | |||
| void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size); | |||
| void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size); | |||
| void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size); | |||
| void ff_mmx_idct(DCTELEM *block); | |||
| void ff_mmxext_idct(DCTELEM *block); | |||
| @@ -1,5 +1,7 @@ | |||
| ;****************************************************************************** | |||
| ;* MMX/SSE2-optimized functions for the RV40 decoder | |||
| ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |||
| ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |||
| ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |||
| ;* | |||
| ;* This file is part of Libav. | |||
| @@ -25,11 +27,319 @@ | |||
| SECTION_RODATA | |||
| align 16 | |||
| shift_round: times 8 dw 1 << (16 - 6) | |||
| cextern pw_16 | |||
| pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024 | |||
; Byte-pair coefficient table read by the SSSE3 filters (pmaddubsw).
; Each qpel position 1, 2, 3 owns two 16-byte rows (32 bytes in total):
; row [+0] is the pair applied to the two outer tap pairs, row [+16] is the
; pair applied to the two centre taps.
| sixtap_filter_hb_m: times 8 db 1, -5 | |||
| times 8 db 52, 20 | |||
| ; multiplied by 2 to have the same shift | |||
| times 8 db 2, -10 | |||
| times 8 db 40, 40 | |||
| ; back to normal | |||
| times 8 db 1, -5 | |||
| times 8 db 20, 52 | |||
; Word coefficient table read by the MMX/SSE2 filters (pmullw).
; Each qpel position 1, 2, 3 owns four 16-byte rows (64 bytes in total),
; loaded by the filters from offsets [+0], [+16], [+32] and [+48].
| sixtap_filter_v_m: times 8 dw 1 | |||
| times 8 dw -5 | |||
| times 8 dw 52 | |||
| times 8 dw 20 | |||
| ; multiplied by 2 to have the same shift | |||
| times 8 dw 2 | |||
| times 8 dw -10 | |||
| times 8 dw 40 | |||
| times 8 dw 40 | |||
| ; back to normal | |||
| times 8 dw 1 | |||
| times 8 dw -5 | |||
| times 8 dw 20 | |||
| times 8 dw 52 | |||
; Table base names handed to the LOAD macro. Under PIC every alias resolves
; to picregq (one extra GPR, npicregs = 1), which each function points at the
; table it needs with lea; otherwise the aliases resolve to the table symbols.
; NOTE(review): sixtap_filter_hw is not referenced, and sixtap_filter_hw_m is
; not defined, anywhere in the visible code -- possibly a leftover; confirm.
| %ifdef PIC | |||
| %define sixtap_filter_hw picregq | |||
| %define sixtap_filter_hb picregq | |||
| %define sixtap_filter_v picregq | |||
| %define npicregs 1 | |||
| %else | |||
| %define sixtap_filter_hw sixtap_filter_hw_m | |||
| %define sixtap_filter_hb sixtap_filter_hb_m | |||
| %define sixtap_filter_v sixtap_filter_v_m | |||
| %define npicregs 0 | |||
| %endif | |||
; pshufb masks for the SSSE3 horizontal filter. They are applied to 16 bytes
; loaded from src-2, so byte index 2 is the pixel at the output position x:
;   shuf1 -> pairs (x-2, x-1), shuf2 -> pairs (x, x+1), shuf3 -> pairs (x+3, x+2)
| filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | |||
| filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | |||
| filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 | |||
| cextern pw_32 | |||
| cextern pw_16 | |||
| cextern pw_512 | |||
| SECTION .text | |||
| ;----------------------------------------------------------------------------- | |||
| ; subpel MC functions: | |||
| ; | |||
| ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride, | |||
| ; uint8_t *src, int srcstride, | |||
| ; int len, int m); | |||
| ;---------------------------------------------------------------------- | |||
; LOAD reg, table
; Turns the coefficient offset held in %1 into a pointer into table %2.
; On WIN64 the 32-bit offset is sign-extended to 64 bits first. Under PIC the
; base comes from picregq (set up by the caller with lea) and %2 is not used.
| %macro LOAD 2 | |||
| %if WIN64 | |||
| movsxd %1q, %1d | |||
| %endif | |||
| %ifdef PIC | |||
| add %1q, picregq | |||
| %else | |||
| add %1q, %2 | |||
| %endif | |||
| %endmacro | |||
; STORE words, tmp, op
; Packs the words in %1 to bytes with unsigned saturation and writes them to
; [dstq] with movh. When %3 is "avg", the current [dstq] contents are loaded
; into %2 beforehand and averaged into the packed result (pavgusb when the
; 3dnow cpuflag is set, pavgb otherwise). %2 is untouched for other ops.
| %macro STORE 3 | |||
| %ifidn %3, avg | |||
| movh %2, [dstq] | |||
| %endif | |||
| packuswb %1, %1 | |||
| %ifidn %3, avg | |||
| %if cpuflag(3dnow) | |||
| pavgusb %1, %2 | |||
| %else | |||
| pavgb %1, %2 | |||
| %endif | |||
| %endif | |||
| movh [dstq], %1 | |||
| %endmacro | |||
; FILTER_V op   (%1 = put or avg)
; Emits %1_rv40_qpel_v: vertical 6-tap filter using 16-bit arithmetic.
; With r0..r5 being six consecutive source rows starting two rows above the
; output row, and c = the word table entry selected by my, each output row is
;   ((r0 + r5) * c[+0] + (r1 + r4) * c[+16] + r2 * c[+32] + r3 * c[+48]
;    + pw_32) >> 6
; r0..r4 stay unpacked in m0..m4 across iterations; one new row (m5) is
; loaded per iteration and the registers are rotated down by one row.
| %macro FILTER_V 1 | |||
| cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg | |||
| %ifdef PIC | |||
| lea picregq, [sixtap_filter_v_m] | |||
| %endif | |||
| pxor m7, m7 | |||
| LOAD my, sixtap_filter_v | |||
| ; read 5 lines | |||
| sub srcq, srcstrideq | |||
| sub srcq, srcstrideq | |||
| movh m0, [srcq] | |||
| movh m1, [srcq+srcstrideq] | |||
| movh m2, [srcq+srcstrideq*2] | |||
| lea srcq, [srcq+srcstrideq*2] | |||
| add srcq, srcstrideq | |||
| movh m3, [srcq] | |||
| movh m4, [srcq+srcstrideq] | |||
| punpcklbw m0, m7 | |||
| punpcklbw m1, m7 | |||
| punpcklbw m2, m7 | |||
| punpcklbw m3, m7 | |||
| punpcklbw m4, m7 | |||
; Keep the four coefficient rows in registers when m8 and up exist;
; otherwise use them as memory operands.
| %ifdef m8 | |||
| mova m8, [myq+ 0] | |||
| mova m9, [myq+16] | |||
| mova m10, [myq+32] | |||
| mova m11, [myq+48] | |||
| %define COEFF05 m8 | |||
| %define COEFF14 m9 | |||
| %define COEFF2 m10 | |||
| %define COEFF3 m11 | |||
| %else | |||
| %define COEFF05 [myq+ 0] | |||
| %define COEFF14 [myq+16] | |||
| %define COEFF2 [myq+32] | |||
| %define COEFF3 [myq+48] | |||
| %endif | |||
; Per row: m6 accumulates the sum, while m0..m4 are shifted to the next
; row set in between the multiplies (the statement order matters here).
| .nextrow: | |||
| mova m6, m1 | |||
| movh m5, [srcq+2*srcstrideq] ; read new row | |||
| paddw m6, m4 | |||
| punpcklbw m5, m7 | |||
| pmullw m6, COEFF14 | |||
| paddw m0, m5 | |||
| pmullw m0, COEFF05 | |||
| paddw m6, m0 | |||
| mova m0, m1 | |||
| paddw m6, [pw_32] | |||
| mova m1, m2 | |||
| pmullw m2, COEFF2 | |||
| paddw m6, m2 | |||
| mova m2, m3 | |||
| pmullw m3, COEFF3 | |||
| paddw m6, m3 | |||
| ; round/clip/store | |||
| mova m3, m4 | |||
| psraw m6, 6 | |||
| mova m4, m5 | |||
| STORE m6, m5, %1 | |||
| ; go to next line | |||
| add dstq, dststrideq | |||
| add srcq, srcstrideq | |||
| dec heightd ; next row | |||
| jg .nextrow | |||
| REP_RET | |||
| %endmacro | |||
; FILTER_H op   (%1 = put or avg)
; Emits %1_rv40_qpel_h: horizontal 6-tap filter using 16-bit arithmetic. It
; reads the same word coefficient table as the vertical filter
; (sixtap_filter_v). With c = the table entry selected by mx, each output
; pixel x is
;   ((p[x-2] + p[x+3]) * c[+0] + (p[x-1] + p[x+2]) * c[+16]
;    + p[x] * c[+32] + p[x+1] * c[+48] + pw_32) >> 6
; m7 is kept zero for byte->word unpacking and m6 holds the pw_32 constant.
| %macro FILTER_H 1 | |||
| cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg | |||
| %ifdef PIC | |||
| lea picregq, [sixtap_filter_v_m] | |||
| %endif | |||
| pxor m7, m7 | |||
| LOAD mx, sixtap_filter_v | |||
| mova m6, [pw_32] | |||
; Keep the four coefficient rows in registers when m8 and up exist;
; otherwise use them as memory operands.
| %ifdef m8 | |||
| mova m8, [mxq+ 0] | |||
| mova m9, [mxq+16] | |||
| mova m10, [mxq+32] | |||
| mova m11, [mxq+48] | |||
| %define COEFF05 m8 | |||
| %define COEFF14 m9 | |||
| %define COEFF2 m10 | |||
| %define COEFF3 m11 | |||
| %else | |||
| %define COEFF05 [mxq+ 0] | |||
| %define COEFF14 [mxq+16] | |||
| %define COEFF2 [mxq+32] | |||
| %define COEFF3 [mxq+48] | |||
| %endif | |||
| .nextrow: | |||
| movq m0, [srcq-2] | |||
| movq m5, [srcq+3] | |||
| movq m1, [srcq-1] | |||
| movq m4, [srcq+2] | |||
| punpcklbw m0, m7 | |||
| punpcklbw m5, m7 | |||
| punpcklbw m1, m7 | |||
| punpcklbw m4, m7 | |||
| movq m2, [srcq-0] | |||
| movq m3, [srcq+1] | |||
| paddw m0, m5 | |||
| paddw m1, m4 | |||
| punpcklbw m2, m7 | |||
| punpcklbw m3, m7 | |||
| pmullw m0, COEFF05 | |||
| pmullw m1, COEFF14 | |||
| pmullw m2, COEFF2 | |||
| pmullw m3, COEFF3 | |||
| paddw m0, m6 | |||
| paddw m1, m2 | |||
| paddw m0, m3 | |||
| paddw m0, m1 | |||
| psraw m0, 6 | |||
| STORE m0, m1, %1 | |||
| ; go to next line | |||
| add dstq, dststrideq | |||
| add srcq, srcstrideq | |||
| dec heightd ; next row | |||
| jg .nextrow | |||
| REP_RET | |||
| %endmacro | |||
; Instantiate the 16-bit filters. The MMX-register versions are assembled
; only when ARCH_X86_32 is set: "put" once for mmx, "avg" once for mmx2 and
; once for 3dnow (STORE selects pavgb or pavgusb from the cpuflag).
; The SSE2 versions are assembled unconditionally.
| %if ARCH_X86_32 | |||
| INIT_MMX mmx | |||
| FILTER_V put | |||
| FILTER_H put | |||
| INIT_MMX mmx2 | |||
| FILTER_V avg | |||
| FILTER_H avg | |||
| INIT_MMX 3dnow | |||
| FILTER_V avg | |||
| FILTER_H avg | |||
| %endif | |||
| INIT_XMM sse2 | |||
| FILTER_H put | |||
| FILTER_H avg | |||
| FILTER_V put | |||
| FILTER_V avg | |||
; FILTER_SSSE3 op   (%1 = put or avg)
; Emits %1_rv40_qpel_v and %1_rv40_qpel_h built on pmaddubsw, which
; multiplies interleaved pixel byte pairs by coefficient byte pairs from the
; sixtap_filter_hb table: [+0] is the pair used for both outer tap pairs,
; [+16] is the pair used for the two centre taps.
; Rounding and the final shift are done by pmulhrsw with pw_512
; (presumably 512 in every word, i.e. (x + 32) >> 6 -- confirm).
| %macro FILTER_SSSE3 1 | |||
| cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg | |||
| %ifdef PIC | |||
| lea picregq, [sixtap_filter_hb_m] | |||
| %endif | |||
| ; read 5 lines | |||
| sub srcq, srcstrideq | |||
| LOAD my, sixtap_filter_hb | |||
| sub srcq, srcstrideq | |||
| movh m0, [srcq] | |||
| movh m1, [srcq+srcstrideq] | |||
| movh m2, [srcq+srcstrideq*2] | |||
| lea srcq, [srcq+srcstrideq*2] | |||
| add srcq, srcstrideq | |||
| mova m5, [myq] | |||
| movh m3, [srcq] | |||
| movh m4, [srcq+srcstrideq] | |||
| lea srcq, [srcq+2*srcstrideq] | |||
; Per row: rows (0,1) and (2,3) are interleaved and multiplied, the rows
; are rotated down by one, then the new row is interleaved with the
; previous last row for the remaining tap pair.
| .nextrow: | |||
| mova m6, m2 | |||
| punpcklbw m0, m1 | |||
| punpcklbw m6, m3 | |||
| pmaddubsw m0, m5 | |||
| pmaddubsw m6, [myq+16] | |||
| movh m7, [srcq] ; read new row | |||
| paddw m6, m0 | |||
| mova m0, m1 | |||
| mova m1, m2 | |||
| mova m2, m3 | |||
| mova m3, m4 | |||
| mova m4, m7 | |||
| punpcklbw m7, m3 | |||
| pmaddubsw m7, m5 | |||
| paddw m6, m7 | |||
| pmulhrsw m6, [pw_512] | |||
| STORE m6, m7, %1 | |||
| ; go to next line | |||
| add dstq, dststrideq | |||
| add srcq, srcstrideq | |||
| dec heightd ; next row | |||
| jg .nextrow | |||
| REP_RET | |||
; Horizontal version: one unaligned 16-byte load from src-2 per row is
; shuffled three ways (filter_h6_shuf1/2/3) into the three tap pairs.
| cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg | |||
| %ifdef PIC | |||
| lea picregq, [sixtap_filter_hb_m] | |||
| %endif | |||
| mova m3, [filter_h6_shuf2] | |||
| mova m4, [filter_h6_shuf3] | |||
| LOAD mx, sixtap_filter_hb | |||
| mova m5, [mxq] ; set up 6tap filter in bytes | |||
| mova m6, [mxq+16] | |||
| mova m7, [filter_h6_shuf1] | |||
| .nextrow: | |||
| movu m0, [srcq-2] | |||
| mova m1, m0 | |||
| mova m2, m0 | |||
| pshufb m0, m7 | |||
| pshufb m1, m3 | |||
| pshufb m2, m4 | |||
| pmaddubsw m0, m5 | |||
| pmaddubsw m1, m6 | |||
| pmaddubsw m2, m5 | |||
| paddw m0, m1 | |||
| paddw m0, m2 | |||
| pmulhrsw m0, [pw_512] | |||
| STORE m0, m1, %1 | |||
| ; go to next line | |||
| add dstq, dststrideq | |||
| add srcq, srcstrideq | |||
| dec heightd ; next row | |||
| jg .nextrow | |||
| REP_RET | |||
| %endmacro | |||
; Instantiate the SSSE3 put and avg filters (vertical and horizontal each).
| INIT_XMM ssse3 | |||
| FILTER_SSSE3 put | |||
| FILTER_SSSE3 avg | |||
| ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2 | |||
| %macro RV40_WCORE 4-5 | |||
| movh m4, [%3 + r6 + 0] | |||
| @@ -143,7 +453,7 @@ SECTION .text | |||
| %macro RV40_WEIGHT 3 | |||
| cglobal rv40_weight_func_%1_%2, 6, 7, 8 | |||
| %if cpuflag(ssse3) | |||
| mova m1, [shift_round] | |||
| mova m1, [pw_1024] | |||
| %else | |||
| mova m1, [pw_16] | |||
| %endif | |||
| @@ -22,8 +22,11 @@ | |||
| /** | |||
| * @file | |||
| * RV40 decoder motion compensation functions x86-optimised | |||
| * 2,0 and 0,2 have h264 equivalents. | |||
| * 3,3 is bugged in the rv40 format and maps to _xy2 version | |||
| */ | |||
| #include "libavcodec/x86/dsputil_mmx.h" | |||
| #include "libavcodec/rv34dsp.h" | |||
| void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, | |||
| @@ -53,6 +56,132 @@ DECLARE_WEIGHT(mmx) | |||
| DECLARE_WEIGHT(sse2) | |||
| DECLARE_WEIGHT(ssse3) | |||
| /** @{ */ | |||
| /** | |||
| * Define one qpel function. | |||
| * LOOPSIZE must be already set to the number of pixels processed per | |||
| * iteration in the inner loop of the called functions. | |||
| * HCOFF(x) and VCOFF(x) must be already defined so as to provide the | |||
| * offset into the array of coeffs used by the called horizontal and | |||
| * vertical functions for the qpel position x. | |||
| * | |||
| * When both PH and PV are non-zero, the block is first filtered | |||
| * horizontally (always with the put variant) into a SIZE-wide temporary | |||
| * holding SIZE + 5 rows starting two rows above the block; that | |||
| * temporary is then filtered vertically into dst with the OP variant. | |||
| * Otherwise a single vertical (PV) or horizontal (PH) pass is done. | |||
| * | |||
| * The expansion deliberately does not end with a semicolon: it is a | |||
| * function definition, and a stray ';' at file scope is not valid ISO C. | |||
| */ | |||
| #define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \ | |||
| static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \ | |||
| uint8_t *src, \ | |||
| int stride) \ | |||
| { \ | |||
| int i; \ | |||
| if (PH && PV) { \ | |||
| DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \ | |||
| uint8_t *tmpptr = tmp + SIZE * 2; \ | |||
| src -= stride * 2; \ | |||
| \ | |||
| for (i = 0; i < SIZE; i += LOOPSIZE) \ | |||
| ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \ | |||
| SIZE + 5, HCOFF(PH)); \ | |||
| for (i = 0; i < SIZE; i += LOOPSIZE) \ | |||
| ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \ | |||
| SIZE, SIZE, VCOFF(PV)); \ | |||
| } else if (PV) { \ | |||
| for (i = 0; i < SIZE; i += LOOPSIZE) \ | |||
| ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \ | |||
| stride, SIZE, VCOFF(PV)); \ | |||
| } else { \ | |||
| for (i = 0; i < SIZE; i += LOOPSIZE) \ | |||
| ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \ | |||
| stride, SIZE, HCOFF(PH)); \ | |||
| } \ | |||
| } | |||
| /** Define the qpel functions for sizes 8 and 16 for a given operation | |||
| * and qpel position. */ | |||
| #define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \ | |||
| QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \ | |||
| QPEL_FUNC_DECL(OP, 16, PH, PV, OPT) | |||
| /** Declare the h and v filter prototypes for OP/OPT and define the qpel | |||
| * functions for both sizes at the twelve (PH, PV) positions listed below; | |||
| * positions (0,0), (0,2), (2,0) and (3,3) are not generated here. */ | |||
| #define QPEL_MC_DECL(OP, OPT) \ | |||
| void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ | |||
| const uint8_t *src, \ | |||
| ptrdiff_t srcStride, \ | |||
| int len, int m); \ | |||
| void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ | |||
| const uint8_t *src, \ | |||
| ptrdiff_t srcStride, \ | |||
| int len, int m); \ | |||
| QPEL_FUNCS_DECL(OP, 0, 1, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 0, 3, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 1, 0, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 1, 1, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 1, 2, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 1, 3, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 2, 1, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 2, 2, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 2, 3, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 3, 0, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 3, 1, OPT) \ | |||
| QPEL_FUNCS_DECL(OP, 3, 2, OPT) | |||
| /** @} */ | |||
/* SSSE3: 8 pixels per call; the byte-pair coefficient table uses 32 bytes
 * per qpel position, so position x (1..3) starts at 32 * (x - 1). */
| #define LOOPSIZE 8 | |||
| #define HCOFF(x) (32 * ((x) - 1)) | |||
| #define VCOFF(x) (32 * ((x) - 1)) | |||
| QPEL_MC_DECL(put_, _ssse3) | |||
| QPEL_MC_DECL(avg_, _ssse3) | |||
| #undef LOOPSIZE | |||
| #undef HCOFF | |||
| #undef VCOFF | |||
/* SSE2: 8 pixels per call; the word coefficient table uses 64 bytes per
 * qpel position. */
| #define LOOPSIZE 8 | |||
| #define HCOFF(x) (64 * ((x) - 1)) | |||
| #define VCOFF(x) (64 * ((x) - 1)) | |||
| QPEL_MC_DECL(put_, _sse2) | |||
| QPEL_MC_DECL(avg_, _sse2) | |||
| #if ARCH_X86_32 | |||
| #undef LOOPSIZE | |||
| #undef HCOFF | |||
| #undef VCOFF | |||
/* MMX-register versions: 4 pixels per call, same word coefficient table. */
| #define LOOPSIZE 4 | |||
| #define HCOFF(x) (64 * ((x) - 1)) | |||
| #define VCOFF(x) (64 * ((x) - 1)) | |||
| QPEL_MC_DECL(put_, _mmx) | |||
/* The 2D avg wrappers call ff_put_rv40_qpel_h<OPT> for their first pass;
 * alias the mmx2/3dnow names to the mmx put filters. */
| #define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx | |||
| #define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx | |||
| QPEL_MC_DECL(avg_, _mmx2) | |||
| #define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx | |||
| #define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx | |||
| QPEL_MC_DECL(avg_, _3dnow) | |||
| #endif | |||
| /** @{ */ | |||
| /** Set one function: stores the wrapper generated for (SIZE, PH, PV) in | |||
| * row 2 - SIZE / 8 and column 4 * PV + PH of the OP pixels_tab of c. */ | |||
| #define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \ | |||
| c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT; | |||
| /** Set the OP functions for sizes 8 and 16 at a given qpel position */ | |||
| #define QPEL_FUNCS_SET(OP, PH, PV, OPT) \ | |||
| QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \ | |||
| QPEL_FUNC_SET(OP, 16, PH, PV, OPT) | |||
| /** Set the OP functions for both sizes at the twelve (PH, PV) qpel | |||
| * positions listed below (the same list as QPEL_MC_DECL generates) */ | |||
| #define QPEL_MC_SET(OP, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 0, 1, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 0, 3, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 1, 0, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 1, 1, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 1, 2, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 1, 3, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 2, 1, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 2, 2, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 2, 3, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 3, 0, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 3, 1, OPT) \ | |||
| QPEL_FUNCS_SET (OP, 3, 2, OPT) | |||
| /** @} */ | |||
| void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) | |||
| { | |||
| #if HAVE_YASM | |||
| @@ -65,25 +194,42 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx; | |||
| c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx; | |||
| c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx; | |||
| c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx; | |||
| c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx; | |||
| #if ARCH_X86_32 | |||
| QPEL_MC_SET(put_, _mmx) | |||
| #endif | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_MMX2) { | |||
| c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2; | |||
| c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2; | |||
| #if ARCH_X86_32 | |||
| QPEL_MC_SET(avg_, _mmx2) | |||
| #endif | |||
| } else if (mm_flags & AV_CPU_FLAG_3DNOW) { | |||
| c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; | |||
| c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; | |||
| #if ARCH_X86_32 | |||
| QPEL_MC_SET(avg_, _3dnow) | |||
| #endif | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSE2) { | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; | |||
| QPEL_MC_SET(put_, _sse2) | |||
| QPEL_MC_SET(avg_, _sse2) | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSSE3) { | |||
| c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; | |||
| c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; | |||
| c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; | |||
| c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; | |||
| QPEL_MC_SET(put_, _ssse3) | |||
| QPEL_MC_SET(avg_, _ssse3) | |||
| } | |||
| #endif | |||
| } | |||