* qatar/master:
  x86: mmx2 ---> mmxext in asm constructs

Conflicts:
	libavcodec/x86/h264_chromamc_10bit.asm
	libavcodec/x86/h264_deblock.asm
	libavcodec/x86/h264dsp_init.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
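For context on the rename below: in x86inc.asm-based assembly, the cpu-flag name given to INIT_MMX both drives the cpuflag() conditionals and becomes the suffix that cglobal appends to every symbol, which is why each "INIT_MMX mmx2 -> INIT_MMX mmxext" hunk pairs with an ff_*_mmx2 -> ff_*_mmxext rename on the C side. A minimal sketch of the mechanism, with a hypothetical function name and assuming the x86inc.asm macro package is on the include path:

%include "libavutil/x86/x86inc.asm"

SECTION .text

INIT_MMX mmxext              ; sets the cpuflags and the _mmxext symbol suffix
cglobal splat_w0, 0, 0, 0    ; assembles as ff_splat_w0_mmxext
%if cpuflag(mmxext)          ; true here because of INIT_MMX mmxext
    pshufw m0, m0, 0         ; MMXEXT-only instructions are allowed in this arm
%endif
    RET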
@@ -97,7 +97,7 @@ AC3_EXPONENT_MIN
     por      %1, %2
     pshuflw  %2, %1, q0001
     por      %1, %2
-%elif cpuflag(mmx2)
+%elif cpuflag(mmxext)
     pshufw   %2, %1, q0032
     por      %1, %2
     pshufw   %2, %1, q0001
@@ -153,7 +153,7 @@ cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
 INIT_MMX mmx
 %define ABS2 ABS2_MMX
 AC3_MAX_MSB_ABS_INT16 or_abs
-INIT_MMX mmx2
+INIT_MMX mmxext
 %define ABS2 ABS2_MMXEXT
 AC3_MAX_MSB_ABS_INT16 min_max
 INIT_XMM sse2
@@ -31,7 +31,7 @@ extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int n
 extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 extern int ff_ac3_max_msb_abs_int16_mmx  (const int16_t *src, int len);
-extern int ff_ac3_max_msb_abs_int16_mmx2 (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
 extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
 extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
@@ -182,7 +182,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
     }
     if (EXTERNAL_MMXEXT(mm_flags)) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
-        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx2;
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
     if (EXTERNAL_SSE(mm_flags)) {
         c->float_to_fixed24 = ff_float_to_fixed24_sse;
@@ -108,7 +108,7 @@ cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
 %endmacro
 INIT_MMX
-SCALARPRODUCT mmx2
+SCALARPRODUCT mmxext
 INIT_XMM
 SCALARPRODUCT sse2
@@ -327,8 +327,8 @@ APPLY_WINDOW_INT16 ssse3_atom, 0, 1
 APPLY_WINDOW_INT16 ssse3, 0, 1
-; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
-cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
+cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
     movq    mm2, mm0
     movd    mm4, [left_topq]
@@ -804,7 +804,7 @@ ALIGN 128
     mov       valh, vall
 %if %1 >= 8
     movd      mm0, vald
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     pshufw    mm0, mm0, 0
 %else ; mmx
     punpcklwd mm0, mm0
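The guard in the hunk above is the recurring reason for the cpuflag checks touched by this merge: pshufw exists only from MMXEXT on, so broadcasting a 16-bit value across an MMX register needs a two-instruction plain-MMX fallback. The same idiom reappears in the pred16x16_plane and pred4x4_tm_vp8_8 hunks further down. In isolation (a sketch, assuming the value already sits in the low word of mm0):

%if cpuflag(mmxext)
    pshufw    mm0, mm0, 0   ; copy word 0 into all four word lanes
%else ; mmx
    punpcklwd mm0, mm0      ; w3 w2 w1 w0 -> w1 w1 w0 w0
    punpckldq mm0, mm0      ; duplicate the low dword -> w0 w0 w0 w0
%endif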
@@ -2109,21 +2109,21 @@ PREFETCH(prefetch_3dnow, prefetch)
 void ff_put_h264_chroma_mc8_rnd_mmx  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc8_rnd_mmx2 (uint8_t *dst, uint8_t *src,
-                                      int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
+                                       int stride, int h, int x, int y);
 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
 void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
+void ff_avg_h264_chroma_mc4_mmxext   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
 void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
-void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
+void ff_put_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
+void ff_avg_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
@@ -2141,10 +2141,10 @@ void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
                             (uint8_t *dst, uint8_t *src,                      \
                              int stride, int h, int x, int y);
-CHROMA_MC(put, 2, 10, mmx2)
-CHROMA_MC(avg, 2, 10, mmx2)
-CHROMA_MC(put, 4, 10, mmx2)
-CHROMA_MC(avg, 4, 10, mmx2)
+CHROMA_MC(put, 2, 10, mmxext)
+CHROMA_MC(avg, 2, 10, mmxext)
+CHROMA_MC(put, 4, 10, mmxext)
+CHROMA_MC(avg, 4, 10, mmxext)
 CHROMA_MC(put, 8, 10, sse2)
 CHROMA_MC(avg, 8, 10, sse2)
 CHROMA_MC(put, 8, 10, avx)
@@ -2457,13 +2457,13 @@ static void vector_clipf_sse(float *dst, const float *src,
 #endif /* HAVE_INLINE_ASM */
-int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
-                                    int order);
+int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
+                                      int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
@@ -2487,9 +2487,9 @@ void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
-void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
-                                        const uint8_t *diff, int w,
-                                        int *left, int *left_top);
+void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
+                                          const uint8_t *diff, int w,
+                                          int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
@@ -2706,24 +2706,24 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
 #if HAVE_YASM
     if (!high_bit_depth && CONFIG_H264CHROMA) {
-        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmx2;
-        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
-        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
-        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
     }
     if (bit_depth == 10 && CONFIG_H264CHROMA) {
-        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
-        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
-        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
-        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
     }
     /* slower than cmov version on AMD */
     if (!(mm_flags & AV_CPU_FLAG_3DNOW))
-        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
+        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
-    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
+    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
     if (avctx->flags & CODEC_FLAG_BITEXACT) {
         c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
@@ -265,7 +265,7 @@ HADAMARD8_DIFF_MMX mmx
 %define ABS1 ABS1_MMXEXT
 %define HSUM HSUM_MMXEXT
-HADAMARD8_DIFF_MMX mmx2
+HADAMARD8_DIFF_MMX mmxext
 INIT_XMM
 %define ABS2 ABS2_MMXEXT
@@ -1105,7 +1105,7 @@ int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                               int stride, int h);
 hadamard_func(mmx)
-hadamard_func(mmx2)
+hadamard_func(mmxext)
 hadamard_func(sse2)
 hadamard_func(ssse3)
@@ -1196,8 +1196,8 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
     if (EXTERNAL_MMXEXT(mm_flags)) {
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
     }
     if (EXTERNAL_SSE2(mm_flags)) {
@@ -442,17 +442,17 @@ chroma_mc8_mmx_func put, vc1, nornd_mmx
 chroma_mc8_mmx_func put, rv40, mmx
 chroma_mc4_mmx_func put, h264, mmx
 chroma_mc4_mmx_func put, rv40, mmx
-chroma_mc2_mmx_func put, h264, mmx2
+chroma_mc2_mmx_func put, h264, mmxext
 %define CHROMAMC_AVG DIRECT_AVG
 %define CHROMAMC_AVG4 COPY_AVG
 %define PAVG pavgb
-chroma_mc8_mmx_func avg, h264, rnd_mmx2
-chroma_mc8_mmx_func avg, vc1, nornd_mmx2
-chroma_mc8_mmx_func avg, rv40, mmx2
-chroma_mc4_mmx_func avg, h264, mmx2
-chroma_mc4_mmx_func avg, rv40, mmx2
-chroma_mc2_mmx_func avg, h264, mmx2
+chroma_mc8_mmx_func avg, h264, rnd_mmxext
+chroma_mc8_mmx_func avg, vc1, nornd_mmxext
+chroma_mc8_mmx_func avg, rv40, mmxext
+chroma_mc4_mmx_func avg, h264, mmxext
+chroma_mc4_mmx_func avg, rv40, mmxext
+chroma_mc2_mmx_func avg, h264, mmxext
 %define PAVG pavgusb
 chroma_mc8_mmx_func avg, h264, rnd_3dnow
@@ -255,7 +255,7 @@ CHROMA_MC8 put
 INIT_XMM avx
 CHROMA_MC8 put
 %endif
-INIT_MMX mmx2
+INIT_MMX mmxext
 CHROMA_MC4 put
 CHROMA_MC2 put
@@ -266,6 +266,6 @@ CHROMA_MC8 avg
 INIT_XMM avx
 CHROMA_MC8 avg
 %endif
-INIT_MMX mmx2
+INIT_MMX mmxext
 CHROMA_MC4 avg
 CHROMA_MC2 avg
@@ -506,7 +506,7 @@ cglobal deblock_h_luma_8, 0,5
     RET
 %endmacro ; DEBLOCK_LUMA
-INIT_MMX mmx2
+INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
@@ -789,11 +789,11 @@ INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
 %endif
 %if ARCH_X86_64 == 0
-INIT_MMX mmx2
+INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
 %endif
-INIT_MMX mmx2
+INIT_MMX mmxext
 %macro CHROMA_V_START 0
     dec    r2d      ; alpha-1
@@ -824,7 +824,7 @@ cglobal deblock_v_chroma_8, 5,6
     movq  m1, [t5+r1]
     movq  m2, [r0]
    movq  m3, [r0+r1]
-    call ff_chroma_inter_body_mmx2
+    call ff_chroma_inter_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
@@ -862,7 +862,7 @@ cglobal deblock_h_chroma_8, 5,7
     RET
 ALIGN 16
-ff_chroma_inter_body_mmx2:
+ff_chroma_inter_body_mmxext:
     LOAD_MASK  r2d, r3d
     movd       m6, [r4] ; tc0
     punpcklbw  m6, m6
@@ -895,7 +895,7 @@ cglobal deblock_v_chroma_intra_8, 4,5
     movq  m1, [t5+r1]
     movq  m2, [r0]
     movq  m3, [r0+r1]
-    call ff_chroma_intra_body_mmx2
+    call ff_chroma_intra_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
@@ -906,12 +906,12 @@ cglobal deblock_v_chroma_intra_8, 4,5
 cglobal deblock_h_chroma_intra_8, 4,6
     CHROMA_H_START
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
-    call ff_chroma_intra_body_mmx2
+    call ff_chroma_intra_body_mmxext
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 ALIGN 16
-ff_chroma_intra_body_mmx2:
+ff_chroma_intra_body_mmxext:
     LOAD_MASK r2d, r3d
     movq m5, m1
     movq m6, m2
@@ -1035,7 +1035,7 @@ ff_chroma_intra_body_mmx2:
     jl %%.b_idx_loop
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                             step, mask_mv0, mask_mv1, field
 %define b_idxq bidirq
@@ -795,7 +795,7 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
 %endmacro
 %if ARCH_X86_64 == 0
-INIT_MMX mmx2
+INIT_MMX mmxext
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
@@ -912,7 +912,7 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
 %endmacro
 %if ARCH_X86_64 == 0
-INIT_MMX mmx2
+INIT_MMX mmxext
 DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
@@ -286,14 +286,14 @@ cglobal h264_idct8_add_8_sse2, 3, 4, 10
 %endmacro
 INIT_MMX
-; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
+; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct_dc_add_8_mmxext, 3, 3, 0
     DC_ADD_MMXEXT_INIT r1, r2
     DC_ADD_MMXEXT_OP movh, r0, r2, r1
     RET
-; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
+; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_dc_add_8_mmxext, 3, 3, 0
     DC_ADD_MMXEXT_INIT r1, r2
     DC_ADD_MMXEXT_OP mova, r0, r2, r1
     lea  r0, [r0+r2*4]
@@ -354,9 +354,9 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
     ADD rsp, pad
     RET
-; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
-;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
+;                           DCTELEM *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
     xor r5, r5
 %ifdef PIC
     lea picregq, [scan8_mem]
@@ -421,9 +421,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
     jl .nextblock
     REP_RET
-; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
-;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
+;                                DCTELEM *block, int stride,
+;                                const uint8_t nnzc[6*8])
+cglobal h264_idct_add16intra_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
     xor r5, r5
 %ifdef PIC
     lea picregq, [scan8_mem]
@@ -463,9 +464,10 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
     jl .nextblock
     REP_RET
-; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
-;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
+;                           DCTELEM *block, int stride,
+;                           const uint8_t nnzc[6*8])
+cglobal h264_idct8_add4_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
 %assign pad 128+4-(stack_offset&7)
     SUB rsp, pad
@@ -620,7 +622,7 @@ cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, str
     call h264_idct_add8_mmx_plane
     RET
-h264_idct_add8_mmx2_plane:
+h264_idct_add8_mmxext_plane:
 .nextblock:
     movzx r6, byte [scan8+r5]
     movzx r6, byte [r4+r6]
@@ -661,9 +663,9 @@ h264_idct_add8_mmx2_plane:
     jnz .nextblock
     rep ret
-; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
-;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
+;                          DCTELEM *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add8_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
     mov r5, 16
     add r2, 512
 %if ARCH_X86_64
@@ -672,7 +674,7 @@ cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, st
 %ifdef PIC
     lea picregq, [scan8_mem]
 %endif
-    call h264_idct_add8_mmx2_plane
+    call h264_idct_add8_mmxext_plane
     mov r5, 32
     add r2, 384
 %if ARCH_X86_64
@@ -680,12 +682,12 @@ cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, st
 %else
     add r0mp, gprsize
 %endif
-    call h264_idct_add8_mmx2_plane
+    call h264_idct_add8_mmxext_plane
     RET
 INIT_MMX
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
-h264_idct_dc_add8_mmx2:
+h264_idct_dc_add8_mmxext:
     movd      m0, [r2   ]   ; 0 0 X D
     punpcklwd m0, [r2+32]   ; x X d D
     paddsw    m0, [pw_32]
@@ -779,7 +781,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
 %else
     add r0, r0m
 %endif
-    call h264_idct_dc_add8_mmx2
+    call h264_idct_dc_add8_mmxext
 .cycle%1end:
 %if %1 < 7
     add r2, 64
@@ -828,7 +830,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
     mov r0, [r0]
     add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
-    call h264_idct_dc_add8_mmx2
+    call h264_idct_dc_add8_mmxext
 .cycle%1end:
 %if %1 == 1
     add r2, 384+64
@@ -184,7 +184,7 @@ IDCT_ADD16_10
     mova [%1+%3 ], m4
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal h264_idct_dc_add_10,3,3
     movd  m0, [r1]
     paddd m0, [pd_32]
@@ -120,7 +120,7 @@ cglobal pred16x16_horizontal_8, 2,3
 INIT_MMX mmx
 PRED16x16_H
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_H
 INIT_XMM ssse3
 PRED16x16_H
@@ -180,7 +180,7 @@ cglobal pred16x16_dc_8, 2,7
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_DC
 INIT_XMM sse2
 PRED16x16_DC
@@ -229,7 +229,7 @@ cglobal pred16x16_tm_vp8_8, 2,5
 INIT_MMX mmx
 PRED16x16_TM
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_TM
 INIT_XMM sse2
@@ -309,14 +309,14 @@ cglobal pred16x16_plane_%1_8, 2,9,7
     movhlps m1, m0
 %endif
     paddw   m0, m1
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     PSHUFLW m1, m0, 0xE
 %elif cpuflag(mmx)
     mova    m1, m0
     psrlq   m1, 32
 %endif
     paddw   m0, m1
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     PSHUFLW m1, m0, 0x1
 %elif cpuflag(mmx)
     mova    m1, m0
@@ -536,7 +536,7 @@ INIT_MMX mmx
 H264_PRED16x16_PLANE h264
 H264_PRED16x16_PLANE rv40
 H264_PRED16x16_PLANE svq3
-INIT_MMX mmx2
+INIT_MMX mmxext
 H264_PRED16x16_PLANE h264
 H264_PRED16x16_PLANE rv40
 H264_PRED16x16_PLANE svq3
@@ -582,7 +582,7 @@ cglobal pred8x8_plane_8, 2,9,7
     paddw   m0, m1
 %if notcpuflag(ssse3)
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     PSHUFLW m1, m0, 0xE
 %elif cpuflag(mmx)
     mova    m1, m0
@@ -591,7 +591,7 @@ cglobal pred8x8_plane_8, 2,9,7
     paddw   m0, m1
 %endif ; !ssse3
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     PSHUFLW m1, m0, 0x1
 %elif cpuflag(mmx)
     mova    m1, m0
@@ -716,7 +716,7 @@ ALIGN 16
 INIT_MMX mmx
 H264_PRED8x8_PLANE
-INIT_MMX mmx2
+INIT_MMX mmxext
 H264_PRED8x8_PLANE
 INIT_XMM sse2
 H264_PRED8x8_PLANE
@@ -763,7 +763,7 @@ cglobal pred8x8_horizontal_8, 2,3
 INIT_MMX mmx
 PRED8x8_H
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED8x8_H
 INIT_MMX ssse3
 PRED8x8_H
@@ -941,7 +941,7 @@ cglobal pred8x8_tm_vp8_8, 2,6
 INIT_MMX mmx
 PRED8x8_TM
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED8x8_TM
 INIT_XMM sse2
@@ -2442,7 +2442,7 @@ cglobal pred4x4_tm_vp8_8, 3,6
     sub    r3d, r4d
     movd   mm2, r1d
     movd   mm4, r3d
-%if cpuflag(mmx2)
+%if cpuflag(mmxext)
     pshufw mm2, mm2, 0
     pshufw mm4, mm4, 0
 %else
@@ -2465,7 +2465,7 @@ cglobal pred4x4_tm_vp8_8, 3,6
 INIT_MMX mmx
 PRED4x4_TM
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED4x4_TM
 INIT_XMM ssse3
@@ -188,7 +188,7 @@ PRED4x4_HD
     HADDD   %1, %2
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal pred4x4_dc_10, 3, 3
     sub   r0, r2
     lea   r1, [r0+r2*2]
@@ -271,7 +271,7 @@ PRED4x4_VL
 ;-----------------------------------------------------------------------------
 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
 ;-----------------------------------------------------------------------------
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal pred4x4_horizontal_up_10, 3, 3
     sub   r0, r2
     lea   r1, [r0+r2*2]
@@ -420,7 +420,7 @@ cglobal pred8x8_dc_10, 2, 6
     RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED8x8_DC pshufw
 INIT_XMM sse2
 PRED8x8_DC pshuflw
@@ -534,7 +534,7 @@ cglobal pred8x8l_128_dc_10, 4, 4
     RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED8x8L_128_DC
 INIT_XMM sse2
 PRED8x8L_128_DC
@@ -1033,7 +1033,7 @@ cglobal pred16x16_vertical_10, 2, 3
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_VERTICAL
 INIT_XMM sse2
 PRED16x16_VERTICAL
@@ -1057,7 +1057,7 @@ cglobal pred16x16_horizontal_10, 2, 3
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_HORIZONTAL
 INIT_XMM sse2
 PRED16x16_HORIZONTAL
@@ -1103,7 +1103,7 @@ cglobal pred16x16_dc_10, 2, 6
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_DC
 INIT_XMM sse2
 PRED16x16_DC
@@ -1135,7 +1135,7 @@ cglobal pred16x16_top_dc_10, 2, 3
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_TOP_DC
 INIT_XMM sse2
 PRED16x16_TOP_DC
@@ -1172,7 +1172,7 @@ cglobal pred16x16_left_dc_10, 2, 6
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_LEFT_DC
 INIT_XMM sse2
 PRED16x16_LEFT_DC
@@ -1193,7 +1193,7 @@ cglobal pred16x16_128_dc_10, 2,3
     REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 PRED16x16_128_DC
 INIT_XMM sse2
 PRED16x16_128_DC
@@ -27,7 +27,7 @@ void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                     const uint8_t *topright, \
                                                     ptrdiff_t stride);
-PRED4x4(dc, 10, mmx2)
+PRED4x4(dc, 10, mmxext)
 PRED4x4(down_left, 10, sse2)
 PRED4x4(down_left, 10, avx)
 PRED4x4(down_right, 10, sse2)
@@ -38,7 +38,7 @@ PRED4x4(vertical_left, 10, avx)
 PRED4x4(vertical_right, 10, sse2)
 PRED4x4(vertical_right, 10, ssse3)
 PRED4x4(vertical_right, 10, avx)
-PRED4x4(horizontal_up, 10, mmx2)
+PRED4x4(horizontal_up, 10, mmxext)
 PRED4x4(horizontal_down, 10, sse2)
 PRED4x4(horizontal_down, 10, ssse3)
 PRED4x4(horizontal_down, 10, avx)
@@ -47,7 +47,7 @@ PRED4x4(horizontal_down, 10, avx)
 void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                     ptrdiff_t stride);
-PRED8x8(dc, 10, mmx2)
+PRED8x8(dc, 10, mmxext)
 PRED8x8(dc, 10, sse2)
 PRED8x8(top_dc, 10, sse2)
 PRED8x8(plane, 10, sse2)
@@ -62,7 +62,7 @@ void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
 PRED8x8L(dc, 10, sse2)
 PRED8x8L(dc, 10, avx)
-PRED8x8L(128_dc, 10, mmx2)
+PRED8x8L(128_dc, 10, mmxext)
 PRED8x8L(128_dc, 10, sse2)
 PRED8x8L(top_dc, 10, sse2)
 PRED8x8L(top_dc, 10, avx)
@@ -88,42 +88,42 @@ PRED8x8L(horizontal_up, 10, avx)
 void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                       ptrdiff_t stride);
-PRED16x16(dc, 10, mmx2)
+PRED16x16(dc, 10, mmxext)
 PRED16x16(dc, 10, sse2)
-PRED16x16(top_dc, 10, mmx2)
+PRED16x16(top_dc, 10, mmxext)
 PRED16x16(top_dc, 10, sse2)
-PRED16x16(128_dc, 10, mmx2)
+PRED16x16(128_dc, 10, mmxext)
 PRED16x16(128_dc, 10, sse2)
-PRED16x16(left_dc, 10, mmx2)
+PRED16x16(left_dc, 10, mmxext)
 PRED16x16(left_dc, 10, sse2)
-PRED16x16(vertical, 10, mmx2)
+PRED16x16(vertical, 10, mmxext)
 PRED16x16(vertical, 10, sse2)
-PRED16x16(horizontal, 10, mmx2)
+PRED16x16(horizontal, 10, mmxext)
 PRED16x16(horizontal, 10, sse2)
 /* 8-bit versions */
 PRED16x16(vertical, 8, mmx)
 PRED16x16(vertical, 8, sse)
 PRED16x16(horizontal, 8, mmx)
-PRED16x16(horizontal, 8, mmx2)
+PRED16x16(horizontal, 8, mmxext)
 PRED16x16(horizontal, 8, ssse3)
-PRED16x16(dc, 8, mmx2)
+PRED16x16(dc, 8, mmxext)
 PRED16x16(dc, 8, sse2)
 PRED16x16(dc, 8, ssse3)
 PRED16x16(plane_h264, 8, mmx)
-PRED16x16(plane_h264, 8, mmx2)
+PRED16x16(plane_h264, 8, mmxext)
 PRED16x16(plane_h264, 8, sse2)
 PRED16x16(plane_h264, 8, ssse3)
 PRED16x16(plane_rv40, 8, mmx)
-PRED16x16(plane_rv40, 8, mmx2)
+PRED16x16(plane_rv40, 8, mmxext)
 PRED16x16(plane_rv40, 8, sse2)
 PRED16x16(plane_rv40, 8, ssse3)
 PRED16x16(plane_svq3, 8, mmx)
-PRED16x16(plane_svq3, 8, mmx2)
+PRED16x16(plane_svq3, 8, mmxext)
 PRED16x16(plane_svq3, 8, sse2)
 PRED16x16(plane_svq3, 8, ssse3)
 PRED16x16(tm_vp8, 8, mmx)
-PRED16x16(tm_vp8, 8, mmx2)
+PRED16x16(tm_vp8, 8, mmxext)
 PRED16x16(tm_vp8, 8, sse2)
 PRED8x8(top_dc, 8, mmxext)
@@ -131,14 +131,14 @@ PRED8x8(dc_rv40, 8, mmxext)
 PRED8x8(dc, 8, mmxext)
 PRED8x8(vertical, 8, mmx)
 PRED8x8(horizontal, 8, mmx)
-PRED8x8(horizontal, 8, mmx2)
+PRED8x8(horizontal, 8, mmxext)
 PRED8x8(horizontal, 8, ssse3)
 PRED8x8(plane, 8, mmx)
-PRED8x8(plane, 8, mmx2)
+PRED8x8(plane, 8, mmxext)
 PRED8x8(plane, 8, sse2)
 PRED8x8(plane, 8, ssse3)
 PRED8x8(tm_vp8, 8, mmx)
-PRED8x8(tm_vp8, 8, mmx2)
+PRED8x8(tm_vp8, 8, mmxext)
 PRED8x8(tm_vp8, 8, sse2)
 PRED8x8(tm_vp8, 8, ssse3)
@@ -175,7 +175,7 @@ PRED4x4(vertical_right, 8, mmxext)
 PRED4x4(horizontal_up, 8, mmxext)
 PRED4x4(horizontal_down, 8, mmxext)
 PRED4x4(tm_vp8, 8, mmx)
-PRED4x4(tm_vp8, 8, mmx2)
+PRED4x4(tm_vp8, 8, mmxext)
 PRED4x4(tm_vp8, 8, ssse3)
 PRED4x4(vertical_vp8, 8, mmxext)
@@ -210,10 +210,10 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
         }
         if (EXTERNAL_MMXEXT(mm_flags)) {
-            h->pred16x16[HOR_PRED8x8         ] = ff_pred16x16_horizontal_8_mmx2;
-            h->pred16x16[DC_PRED8x8          ] = ff_pred16x16_dc_8_mmx2;
+            h->pred16x16[HOR_PRED8x8         ] = ff_pred16x16_horizontal_8_mmxext;
+            h->pred16x16[DC_PRED8x8          ] = ff_pred16x16_dc_8_mmxext;
             if (chroma_format_idc == 1)
-                h->pred8x8[HOR_PRED8x8       ] = ff_pred8x8_horizontal_8_mmx2;
+                h->pred8x8[HOR_PRED8x8       ] = ff_pred8x8_horizontal_8_mmxext;
             h->pred8x8l [TOP_DC_PRED         ] = ff_pred8x8l_top_dc_8_mmxext;
             h->pred8x8l [DC_PRED             ] = ff_pred8x8l_dc_8_mmxext;
             h->pred8x8l [HOR_PRED            ] = ff_pred8x8l_horizontal_8_mmxext;
@@ -243,20 +243,20 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             }
         }
         if (codec_id == AV_CODEC_ID_VP8) {
-            h->pred16x16[PLANE_PRED8x8       ] = ff_pred16x16_tm_vp8_8_mmx2;
+            h->pred16x16[PLANE_PRED8x8       ] = ff_pred16x16_tm_vp8_8_mmxext;
             h->pred8x8  [DC_PRED8x8          ] = ff_pred8x8_dc_rv40_8_mmxext;
-            h->pred8x8  [PLANE_PRED8x8       ] = ff_pred8x8_tm_vp8_8_mmx2;
-            h->pred4x4  [TM_VP8_PRED         ] = ff_pred4x4_tm_vp8_8_mmx2;
+            h->pred8x8  [PLANE_PRED8x8       ] = ff_pred8x8_tm_vp8_8_mmxext;
+            h->pred4x4  [TM_VP8_PRED         ] = ff_pred4x4_tm_vp8_8_mmxext;
             h->pred4x4  [VERT_PRED           ] = ff_pred4x4_vertical_vp8_8_mmxext;
         } else {
             if (chroma_format_idc == 1)
-                h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx2;
+                h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
             if (codec_id == AV_CODEC_ID_SVQ3) {
-                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmx2;
+                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
             } else if (codec_id == AV_CODEC_ID_RV40) {
-                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmx2;
+                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
             } else {
-                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmx2;
+                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
             }
         }
     }
@@ -320,20 +320,20 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
         }
     } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
-            h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmx2;
-            h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmx2;
+            h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
+            h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
             if (chroma_format_idc == 1)
-                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmx2;
+                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmxext;
-            h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmx2;
+            h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;
-            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_mmx2;
-            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_mmx2;
-            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_mmx2;
-            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmx2;
-            h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmx2;
-            h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmx2;
+            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_mmxext;
+            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_mmxext;
+            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_mmxext;
+            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmxext;
+            h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmxext;
+            h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmxext;
         }
         if (EXTERNAL_SSE2(mm_flags)) {
             h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
@@ -71,7 +71,7 @@ SECTION .text
 %endmacro
 INIT_MMX
-cglobal h264_weight_16_mmx2, 6, 6, 0
+cglobal h264_weight_16_mmxext, 6, 6, 0
     WEIGHT_SETUP
 .nextrow:
     WEIGHT_OP 0, 4
@@ -96,7 +96,7 @@ cglobal h264_weight_%1_%3, 6, 6, %2
 %endmacro
 INIT_MMX
-WEIGHT_FUNC_MM 8, 0, mmx2
+WEIGHT_FUNC_MM 8, 0, mmxext
 INIT_XMM
 WEIGHT_FUNC_MM 16, 8, sse2
@@ -121,7 +121,7 @@ cglobal h264_weight_%1_%3, 6, 6, %2
 %endmacro
 INIT_MMX
-WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmxext
 INIT_XMM
 WEIGHT_FUNC_HALF_MM 8, 8, sse2
@@ -175,7 +175,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
 %endmacro
 INIT_MMX
-cglobal h264_biweight_16_mmx2, 7, 8, 0
+cglobal h264_biweight_16_mmxext, 7, 8, 0
     BIWEIGHT_SETUP
     movifnidn r3d, r3m
 .nextrow:
@@ -210,7 +210,7 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
 %endmacro
 INIT_MMX
-BIWEIGHT_FUNC_MM 8, 0, mmx2
+BIWEIGHT_FUNC_MM 8, 0, mmxext
 INIT_XMM
 BIWEIGHT_FUNC_MM 16, 8, sse2
@@ -239,7 +239,7 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
 %endmacro
 INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmxext
 INIT_XMM
 BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
@@ -33,9 +33,9 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
 IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 10, sse2)
-IDCT_ADD_FUNC(_dc, 8, mmx2)
-IDCT_ADD_FUNC(_dc, 10, mmx2)
-IDCT_ADD_FUNC(8_dc, 8, mmx2)
+IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 10, mmxext)
+IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
 IDCT_ADD_FUNC(8, 8, mmx)
 IDCT_ADD_FUNC(8, 8, sse2)
@@ -51,16 +51,16 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                    DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
-IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
+IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
 IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
 IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
 IDCT_ADD_REP_FUNC(8, 4, 10, avx)
 IDCT_ADD_REP_FUNC(, 16, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16, 8, mmx2)
+IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
 IDCT_ADD_REP_FUNC(, 16, 8, sse2)
 IDCT_ADD_REP_FUNC(, 16, 10, sse2)
 IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2)
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
 IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
 IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
 IDCT_ADD_REP_FUNC(, 16, 10, avx)
@@ -73,7 +73,7 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                    DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
-IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
+IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
 IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
@@ -84,10 +84,11 @@ void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul
 /***********************************/
 /* deblocking */
-void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
-                                       int8_t ref[2][40], int16_t mv[2][40][2],
-                                       int bidir, int edges, int step,
-                                       int mask_mv0, int mask_mv1, int field);
+void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
+                                         int8_t ref[2][40],
+                                         int16_t mv[2][40][2],
+                                         int bidir, int edges, int step,
+                                         int mask_mv0, int mask_mv1, int field);
 #define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
 void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
@@ -102,12 +103,12 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                                int beta);
 #define LF_FUNCS(type, depth)                   \
-LF_FUNC(h,  chroma,       depth, mmx2)          \
-LF_IFUNC(h, chroma_intra, depth, mmx2)          \
-LF_FUNC(v,  chroma,       depth, mmx2)          \
-LF_IFUNC(v, chroma_intra, depth, mmx2)          \
-LF_FUNC(h,  luma,         depth, mmx2)          \
-LF_IFUNC(h, luma_intra,   depth, mmx2)          \
+LF_FUNC(h,  chroma,       depth, mmxext)        \
+LF_IFUNC(h, chroma_intra, depth, mmxext)        \
+LF_FUNC(v,  chroma,       depth, mmxext)        \
+LF_IFUNC(v, chroma_intra, depth, mmxext)        \
+LF_FUNC(h,  luma,         depth, mmxext)        \
+LF_IFUNC(h, luma_intra,   depth, mmxext)        \
 LF_FUNC(h,  luma,         depth, sse2)          \
 LF_IFUNC(h, luma_intra,   depth, sse2)          \
 LF_FUNC(v,  luma,         depth, sse2)          \
@@ -129,26 +130,26 @@ LF_FUNCS(uint8_t, 8)
 LF_FUNCS(uint16_t, 10)
 #if ARCH_X86_32 && HAVE_YASM
-LF_FUNC(v8, luma, 8, mmx2)
+LF_FUNC(v8, luma, 8, mmxext)
 static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0)
 {
     if ((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
     if ((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2);
+        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
 }
-LF_IFUNC(v8, luma_intra, 8, mmx2)
+LF_IFUNC(v8, luma_intra, 8, mmxext)
 static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
                                              int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
 }
 #endif /* ARCH_X86_32 */
-LF_FUNC(v,  luma, 10, mmx2)
-LF_IFUNC(v, luma_intra, 10, mmx2)
+LF_FUNC(v,  luma, 10, mmxext)
+LF_IFUNC(v, luma_intra, 10, mmxext)
 /***********************************/
 /* weighted prediction */
@@ -165,8 +166,8 @@ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
                                         int weights, int offset);
 #define H264_BIWEIGHT_MMX(W)                    \
-    H264_WEIGHT(W, mmx2)                        \
-    H264_BIWEIGHT(W, mmx2)
+    H264_WEIGHT(W, mmxext)                      \
+    H264_BIWEIGHT(W, mmxext)
 #define H264_BIWEIGHT_MMX_SSE(W)                \
     H264_BIWEIGHT_MMX(W)                        \
@@ -213,7 +214,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
     int mm_flags = av_get_cpu_flags();
     if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(mm_flags))
-        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
+        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
     if (bit_depth == 8) {
         if (EXTERNAL_MMX(mm_flags)) {
@@ -231,33 +232,33 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
             if (EXTERNAL_MMXEXT(mm_flags)) {
-                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
-                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
-                c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
-                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmx2;
+                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
+                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+                c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
+                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
                 if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
-                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2;
+                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
+                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
-                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmx2;
-                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2;
+                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
                 if (chroma_format_idc == 1) {
-                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmx2;
-                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2;
+                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
+                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
                 }
 #if ARCH_X86_32
                 c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmxext;
-                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmx2;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
                 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif /* ARCH_X86_32 */
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2;
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
                 if (EXTERNAL_SSE2(mm_flags)) {
                     c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
@@ -298,14 +299,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
         if (EXTERNAL_MMX(mm_flags)) {
             if (EXTERNAL_MMXEXT(mm_flags)) {
 #if ARCH_X86_32
-                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
-                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
-                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmx2;
-                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmx2;
-                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmx2;
-                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmx2;
+                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
+                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
+                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
+                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmxext;
 #endif /* ARCH_X86_32 */
-                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2;
+                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
                 if (EXTERNAL_SSE2(mm_flags)) {
                     c->h264_idct_add     = ff_h264_idct_add_10_sse2;
                     c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
@@ -166,7 +166,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
 RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 ADD_PAETH_PRED_FN 0
 INIT_MMX ssse3
@@ -23,8 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/pngdsp.h"
-void ff_add_png_paeth_prediction_mmx2 (uint8_t *dst, uint8_t *src,
-uint8_t *top, int w, int bpp);
+void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
+uint8_t *top, int w, int bpp);
 void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
 uint8_t *top, int w, int bpp);
 void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
@@ -41,7 +41,7 @@ void ff_pngdsp_init_x86(PNGDSPContext *dsp)
 dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
 #endif
 if (EXTERNAL_MMXEXT(flags))
-dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmx2;
+dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
 if (EXTERNAL_SSE2(flags))
 dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
 if (EXTERNAL_SSSE3(flags))
@@ -57,7 +57,7 @@ cglobal rv34_idct_%1, 1, 2, 0
 REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 %define IDCT_DC IDCT_DC_ROUND
 rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
@@ -133,7 +133,7 @@ cglobal rv34_idct_dc_add, 3, 3
 mova mm5, [pd_512] ; 0x200
 %endmacro
-; ff_rv34_idct_add_mmx2(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
+; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
 %macro COL_TRANSFORM 4
 pshufw mm3, %2, 0xDD ; col. 1,3,1,3
 pshufw %2, %2, 0x88 ; col. 0,2,0,2
@@ -154,7 +154,7 @@ cglobal rv34_idct_dc_add, 3, 3
 packuswb %2, %2
 movd %1, %2
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal rv34_idct_add, 3,3,0, d, s, b
 ROW_TRANSFORM bq
 COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
@@ -25,11 +25,11 @@
 #include "libavcodec/dsputil.h"
 #include "libavcodec/rv34dsp.h"
-void ff_rv34_idct_dc_mmx2(DCTELEM *block);
-void ff_rv34_idct_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_mmxext(DCTELEM *block);
+void ff_rv34_idct_dc_noround_mmxext(DCTELEM *block);
 void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
 void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
-void ff_rv34_idct_add_mmx2(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
+void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
 av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
 {
@@ -38,8 +38,8 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
 if (EXTERNAL_MMX(mm_flags))
 c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
 if (EXTERNAL_MMXEXT(mm_flags)) {
-c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmx2;
-c->rv34_idct_add = ff_rv34_idct_add_mmx2;
+c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
+c->rv34_idct_add = ff_rv34_idct_add_mmxext;
 }
 if (EXTERNAL_SSE4(mm_flags))
 c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
@@ -240,7 +240,7 @@ INIT_MMX mmx
 FILTER_V put
 FILTER_H put
-INIT_MMX mmx2
+INIT_MMX mmxext
 FILTER_V avg
 FILTER_H avg
@@ -486,7 +486,7 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8
 REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 RV40_WEIGHT rnd, 8, 3
 RV40_WEIGHT rnd, 16, 4
 RV40_WEIGHT nornd, 8, 3
@@ -34,15 +34,15 @@
 #if HAVE_YASM
 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
-int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
+int stride, int h, int x, int y);
 void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
-int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
+int stride, int h, int x, int y);
 void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
@@ -55,7 +55,7 @@ void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *sr
 int w1, int w2, ptrdiff_t stride); \
 void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
 int w1, int w2, ptrdiff_t stride);
-DECLARE_WEIGHT(mmx2)
+DECLARE_WEIGHT(mmxext)
 DECLARE_WEIGHT(sse2)
 DECLARE_WEIGHT(ssse3)
@@ -150,9 +150,9 @@ QPEL_MC_DECL(avg_, _sse2)
 QPEL_MC_DECL(put_, _mmx)
-#define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx
-#define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx
-QPEL_MC_DECL(avg_, _mmx2)
+#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _mmxext)
 #define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
 #define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
@@ -206,14 +206,14 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
 #endif
 }
 if (EXTERNAL_MMXEXT(mm_flags)) {
-c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
-c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
-c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx2;
-c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx2;
-c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx2;
-c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx2;
+c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
+c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
+c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
+c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
+c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
+c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
 #if ARCH_X86_32
-QPEL_MC_SET(avg_, _mmx2)
+QPEL_MC_SET(avg_, _mmxext)
 #endif
 } else if (EXTERNAL_AMD3DNOW(mm_flags)) {
 c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
@@ -64,8 +64,8 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
 void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_nornd_mmx2 (uint8_t *dst, uint8_t *src,
-int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
+int stride, int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
 int stride, int h, int x, int y);
 void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
@@ -99,7 +99,7 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
 ASSIGN_LF(mmxext);
-dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmx2;
+dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
 }
@@ -101,7 +101,7 @@ SECTION .text
 mov [r0+r3 -1], r2w
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal vp3_v_loop_filter, 3, 4
 %if ARCH_X86_64
 movsxd r1, r1d
@@ -633,7 +633,7 @@ vp3_idct_funcs
 movq [r0+r3 ], m5
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
 movsxd r1, r1d
@@ -31,11 +31,13 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
-const DCTELEM *block);
+void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
+const DCTELEM *block);
-void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
-void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
+int *bounding_values);
+void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
+int *bounding_values);
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
@@ -50,11 +52,11 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 #endif
 if (EXTERNAL_MMXEXT(cpuflags)) {
-c->idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
 if (!(flags & CODEC_FLAG_BITEXACT)) {
-c->v_loop_filter = ff_vp3_v_loop_filter_mmx2;
-c->h_loop_filter = ff_vp3_h_loop_filter_mmx2;
+c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
+c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
 }
 }
@@ -338,7 +338,7 @@ INIT_XMM ssse3
 FILTER_SSSE3 8
 ; 4x4 block, H-only 4-tap filter
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
 shl mxd, 4
 %ifdef PIC
@@ -386,7 +386,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
 REP_RET
 ; 4x4 block, H-only 6-tap filter
-INIT_MMX mmx2
+INIT_MMX mmxext
 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
 lea mxd, [mxq*3]
 %ifdef PIC
@@ -673,7 +673,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
 REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 FILTER_V 4
 INIT_XMM sse2
 FILTER_V 8
@@ -769,7 +769,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
 REP_RET
 %endmacro
-INIT_MMX mmx2
+INIT_MMX mmxext
 FILTER_BILINEAR 4
 INIT_XMM sse2
 FILTER_BILINEAR 8
@@ -1611,7 +1611,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 INIT_MMX mmx
 SIMPLE_LOOPFILTER v, 4
 SIMPLE_LOOPFILTER h, 5
-INIT_MMX mmx2
+INIT_MMX mmxext
 SIMPLE_LOOPFILTER v, 4
 SIMPLE_LOOPFILTER h, 5
 %endif
@@ -1835,7 +1835,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
 psubusb m6, m5 ; q2-q1
 por m6, m4 ; abs(q2-q1)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m4, m_flimI
 pxor m3, m3
 psubusb m0, m4
@@ -1875,7 +1875,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
 psubusb m1, m3 ; p1-p0
 psubusb m6, m2 ; p0-p1
 por m1, m6 ; abs(p1-p0)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m6, m1
 psubusb m1, m4
 psubusb m6, m_hevthr
@@ -1906,7 +1906,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
 psubusb m1, m5 ; q0-q1
 psubusb m7, m4 ; q1-q0
 por m1, m7 ; abs(q1-q0)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m7, m1
 psubusb m1, m6
 psubusb m7, m_hevthr
@@ -2014,14 +2014,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
 %else
 mova m6, m_maskres
 %endif
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m7, [pb_1]
 %else ; mmxext/sse2
 pxor m7, m7
 %endif
 pand m0, m6
 pand m1, m6
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 paddusb m0, m7
 pand m1, [pb_FE]
 pandn m7, m0
@@ -2097,7 +2097,7 @@ INNER_LOOPFILTER h, 16
 INNER_LOOPFILTER v, 8
 INNER_LOOPFILTER h, 8
-INIT_MMX mmx2
+INIT_MMX mmxext
 INNER_LOOPFILTER v, 16
 INNER_LOOPFILTER h, 16
 INNER_LOOPFILTER v, 8
@@ -2343,7 +2343,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
 psubusb m6, m5 ; q2-q1
 por m6, m4 ; abs(q2-q1)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m4, m_flimI
 pxor m3, m3
 psubusb m0, m4
@@ -2383,7 +2383,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
 psubusb m1, m3 ; p1-p0
 psubusb m6, m2 ; p0-p1
 por m1, m6 ; abs(p1-p0)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m6, m1
 psubusb m1, m4
 psubusb m6, m_hevthr
@@ -2414,7 +2414,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
 psubusb m1, m5 ; q0-q1
 psubusb m7, m4 ; q1-q0
 por m1, m7 ; abs(q1-q0)
-%if notcpuflag(mmx2)
+%if notcpuflag(mmxext)
 mova m7, m1
 psubusb m1, m6
 psubusb m7, m_hevthr
@@ -2755,7 +2755,7 @@ MBEDGE_LOOPFILTER h, 16
 MBEDGE_LOOPFILTER v, 8
 MBEDGE_LOOPFILTER h, 8
-INIT_MMX mmx2
+INIT_MMX mmxext
 MBEDGE_LOOPFILTER v, 16
 MBEDGE_LOOPFILTER h, 16
 MBEDGE_LOOPFILTER v, 8
@@ -30,16 +30,16 @@
 /*
 * MC functions
 */
-extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
-extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
-extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
-extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
@@ -81,7 +81,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
-extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
 extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
@@ -94,7 +94,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
-extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride,
+extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
 uint8_t *src, ptrdiff_t srcstride,
 int height, int mx, int my);
 extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
@@ -140,16 +140,16 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
 }
 #if ARCH_X86_32
-TAP_W8 (mmx2, epel, h4)
-TAP_W8 (mmx2, epel, h6)
-TAP_W16(mmx2, epel, h6)
-TAP_W8 (mmx2, epel, v4)
-TAP_W8 (mmx2, epel, v6)
-TAP_W16(mmx2, epel, v6)
-TAP_W8 (mmx2, bilinear, h)
-TAP_W16(mmx2, bilinear, h)
-TAP_W8 (mmx2, bilinear, v)
-TAP_W16(mmx2, bilinear, v)
+TAP_W8 (mmxext, epel, h4)
+TAP_W8 (mmxext, epel, h6)
+TAP_W16(mmxext, epel, h6)
+TAP_W8 (mmxext, epel, v4)
+TAP_W8 (mmxext, epel, v6)
+TAP_W16(mmxext, epel, v6)
+TAP_W8 (mmxext, bilinear, h)
+TAP_W16(mmxext, bilinear, h)
+TAP_W8 (mmxext, bilinear, v)
+TAP_W16(mmxext, bilinear, v)
 #endif
 TAP_W16(sse2, epel, h6)
@@ -178,13 +178,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
 #if ARCH_X86_32
 #define HVTAPMMX(x, y) \
-HVTAP(mmx2, 8, x, y, 4, 8) \
-HVTAP(mmx2, 8, x, y, 8, 16)
+HVTAP(mmxext, 8, x, y, 4, 8) \
+HVTAP(mmxext, 8, x, y, 8, 16)
-HVTAP(mmx2, 8, 6, 6, 16, 16)
+HVTAP(mmxext, 8, 6, 6, 16, 16)
 #else
 #define HVTAPMMX(x, y) \
-HVTAP(mmx2, 8, x, y, 4, 8)
+HVTAP(mmxext, 8, x, y, 4, 8)
 #endif
 HVTAPMMX(4, 4)
@@ -219,10 +219,10 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
 dst, dststride, tmp, SIZE, height, mx, my); \
 }
-HVBILIN(mmx2, 8, 4, 8)
+HVBILIN(mmxext, 8, 4, 8)
 #if ARCH_X86_32
-HVBILIN(mmx2, 8, 8, 16)
-HVBILIN(mmx2, 8, 16, 16)
+HVBILIN(mmxext, 8, 8, 16)
+HVBILIN(mmxext, 8, 16, 16)
 #endif
 HVBILIN(sse2, 8, 8, 16)
 HVBILIN(sse2, 8, 16, 16)
@@ -284,7 +284,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
 int e, int i, int hvt);
 DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmx2)
+DECLARE_LOOP_FILTER(mmxext)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
 DECLARE_LOOP_FILTER(sse4)
@@ -352,26 +352,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 /* note that 4-tap width=16 functions are missing because w=16
 * is only used for luma, and luma is always a copy or sixtap. */
 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
-VP8_MC_FUNC(2, 4, mmx2);
-VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
+VP8_MC_FUNC(2, 4, mmxext);
+VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
 #if ARCH_X86_32
-VP8_LUMA_MC_FUNC(0, 16, mmx2);
-VP8_MC_FUNC(1, 8, mmx2);
-VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
-VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
-c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
-c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
-c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
-c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
-c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
-c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
-c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
-c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
-c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
-c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
+VP8_LUMA_MC_FUNC(0, 16, mmxext);
+VP8_MC_FUNC(1, 8, mmxext);
+VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
+VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
+c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
+c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
+c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
+c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
+c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
+c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
+c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
+c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
+c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
+c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
 #endif
 }
@@ -555,7 +555,7 @@
 %if mmsize == 16
 pshuflw %1, %2, (%3)*0x55
 punpcklqdq %1, %1
-%elif cpuflag(mmx2)
+%elif cpuflag(mmxext)
 pshufw %1, %2, (%3)*0x55
 %else
 %ifnidn %1, %2
@@ -247,7 +247,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
 %endmacro
 %if ARCH_X86_32
-INIT_MMX mmx2
+INIT_MMX mmxext
 yuv2planeX_fn 8, 0, 7
 yuv2planeX_fn 9, 0, 5
 yuv2planeX_fn 10, 0, 5
@@ -390,7 +390,7 @@ INIT_MMX mmx
 yuv2plane1_fn 8, 0, 5
 yuv2plane1_fn 16, 0, 3
-INIT_MMX mmx2
+INIT_MMX mmxext
 yuv2plane1_fn 9, 0, 3
 yuv2plane1_fn 10, 0, 3
 %endif
@@ -317,7 +317,7 @@ extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filter
 VSCALEX_FUNC(10, opt)
 #if ARCH_X86_32
-VSCALEX_FUNCS(mmx2);
+VSCALEX_FUNCS(mmxext);
 #endif
 VSCALEX_FUNCS(sse2);
 VSCALEX_FUNCS(sse4);
@@ -334,7 +334,7 @@ extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst,
 VSCALE_FUNC(16, opt1)
 #if ARCH_X86_32
-VSCALE_FUNCS(mmx, mmx2);
+VSCALE_FUNCS(mmx, mmxext);
 #endif
 VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
@@ -442,7 +442,7 @@ switch(c->dstBpc){ \
 if (EXTERNAL_MMX(cpu_flags)) {
 ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
 ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
-ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMXEXT);
+ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
 switch (c->srcFormat) {
 case AV_PIX_FMT_Y400A:
@@ -475,7 +475,7 @@ switch(c->dstBpc){ \
 }
 }
 if (EXTERNAL_MMXEXT(cpu_flags)) {
-ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2, , 1);
+ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
 }
 #endif /* ARCH_X86_32 */
 #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \