| @@ -53,8 +53,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext* c, int flags) | |||
| hpel_funcs(avg, [3], 2); | |||
| hpel_funcs(avg_no_rnd,, 16); | |||
| #if 0 | |||
| if (ARCH_X86) ff_hpeldsp_init_x86 (c, flags); | |||
| #if 0 | |||
| if (ARCH_ARM) ff_hpeldsp_init_arm (c, flags); | |||
| if (HAVE_VIS) ff_hpeldsp_init_vis (c, flags); | |||
| if (ARCH_ALPHA) ff_hpeldsp_init_alpha (c, flags); | |||
| @@ -10,6 +10,7 @@ OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o | |||
| OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o | |||
| OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o | |||
| OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o | |||
| OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o | |||
| OBJS-$(CONFIG_LPC) += x86/lpc.o | |||
| OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o | |||
| OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o | |||
| @@ -66,7 +67,10 @@ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ | |||
| x86/h264_intrapred_10bit.o | |||
| YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ | |||
| x86/h264_qpel_10bit.o \ | |||
| x86/qpelbase.o | |||
| x86/qpelbase.o \ | |||
| x86/fpelbase.o | |||
| YASM-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp.o \ | |||
| x86/fpelbase.o | |||
| YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o | |||
| YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o | |||
| YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o | |||
| @@ -83,9 +87,9 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o | |||
| YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o | |||
| YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ | |||
| x86/hpeldsp.o \ | |||
| x86/mpeg4qpel.o \ | |||
| x86/qpelbase.o | |||
| x86/qpelbase.o \ | |||
| x86/fpelbase.o | |||
| YASM-OBJS += x86/deinterlace.o \ | |||
| x86/fmtconvert.o | |||
| @@ -60,10 +60,6 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; | |||
| #if HAVE_YASM | |||
| void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, | |||
| int dstStride, int src1Stride, int h); | |||
| void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, | |||
| @@ -71,54 +67,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, | |||
| int src1Stride, int h); | |||
| void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, | |||
| int dstStride, int src1Stride, int h); | |||
| void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, | |||
| int dstStride, int src1Stride, int h); | |||
| void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, | |||
| int dstStride, int src1Stride, int h); | |||
| void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, | |||
| int dstStride, int src1Stride, int h); | |||
| void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); | |||
| static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| @@ -192,14 +148,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, | |||
| // using regr as temporary and for the output result | |||
| // first argument is unmodified and second is trashed | |||
| // regfe is supposed to contain 0xfefefefefefefefe | |||
| #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "pand "#regb", "#regr" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pand "#regfe", "#regb" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "paddb "#regb", "#regr" \n\t" | |||
| #define PAVGB_MMX(rega, regb, regr, regfe) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "por "#regb", "#regr" \n\t" \ | |||
| @@ -209,20 +157,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, | |||
| "psubb "#regb", "#regr" \n\t" | |||
| // mm6 is supposed to contain 0xfefefefefefefefe | |||
| #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "movq "#regc", "#regp" \n\t" \ | |||
| "pand "#regb", "#regr" \n\t" \ | |||
| "pand "#regd", "#regp" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pxor "#regc", "#regd" \n\t" \ | |||
| "pand %%mm6, "#regb" \n\t" \ | |||
| "pand %%mm6, "#regd" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "psrlq $1, "#regd" \n\t" \ | |||
| "paddb "#regb", "#regr" \n\t" \ | |||
| "paddb "#regd", "#regp" \n\t" | |||
| #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "movq "#regc", "#regp" \n\t" \ | |||
| @@ -237,22 +171,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, | |||
| "psubb "#regb", "#regr" \n\t" \ | |||
| "psubb "#regd", "#regp" \n\t" | |||
| /***********************************/ | |||
| /* MMX no rounding */ | |||
| #define NO_RND 1 | |||
| #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx | |||
| #define SET_RND MOVQ_WONE | |||
| #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |||
| #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) | |||
| #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) | |||
| #include "dsputil_rnd_template.c" | |||
| #undef DEF | |||
| #undef SET_RND | |||
| #undef PAVGBP | |||
| #undef PAVGB | |||
| #undef NO_RND | |||
| /***********************************/ | |||
| /* MMX rounding */ | |||
| @@ -260,6 +178,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, | |||
| #define SET_RND MOVQ_WTWO | |||
| #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |||
| #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) | |||
| #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) | |||
| #include "dsputil_rnd_template.c" | |||
| @@ -274,31 +193,21 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, | |||
| #if HAVE_YASM | |||
| /***********************************/ | |||
| /* 3Dnow specific */ | |||
| #define DEF(x) x ## _3dnow | |||
| #include "dsputil_avg_template.c" | |||
| #undef DEF | |||
| /***********************************/ | |||
| /* MMXEXT specific */ | |||
| #define DEF(x) x ## _mmxext | |||
| #include "dsputil_avg_template.c" | |||
| #undef DEF | |||
| //FIXME the following could be optimized too ... | |||
| static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| int line_size, int h) | |||
| { | |||
| ff_avg_pixels8_mmxext(block, pixels, line_size, h); | |||
| ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| #if HAVE_INLINE_ASM | |||
| #define put_no_rnd_pixels16_mmx put_pixels16_mmx | |||
| #define put_no_rnd_pixels8_mmx put_pixels8_mmx | |||
| /***********************************/ | |||
| /* standard MMX */ | |||
| @@ -1520,14 +1429,6 @@ void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, | |||
| c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ | |||
| } while (0) | |||
| #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |||
| do { \ | |||
| c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ | |||
| } while (0) | |||
| static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||
| int mm_flags) | |||
| { | |||
| @@ -1542,14 +1443,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||
| c->clear_block = clear_block_mmx; | |||
| c->clear_blocks = clear_blocks_mmx; | |||
| c->draw_edges = draw_edges_mmx; | |||
| SET_HPEL_FUNCS(put, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(avg, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); | |||
| SET_HPEL_FUNCS(put, [1], 8, mmx); | |||
| SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); | |||
| SET_HPEL_FUNCS(avg, [1], 8, mmx); | |||
| } | |||
| #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) | |||
| @@ -1584,43 +1477,9 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, | |||
| SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); | |||
| SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); | |||
| SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); | |||
| if (!high_bit_depth) { | |||
| c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; | |||
| c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; | |||
| c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext; | |||
| c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext; | |||
| c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; | |||
| c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; | |||
| c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; | |||
| c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; | |||
| c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; | |||
| } | |||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
| if (!high_bit_depth) { | |||
| c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext; | |||
| c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; | |||
| c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext; | |||
| c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; | |||
| } | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| #if HAVE_MMXEXT_EXTERNAL | |||
| if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 || | |||
| avctx->codec_id == AV_CODEC_ID_THEORA)) { | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; | |||
| } | |||
| /* slower than cmov version on AMD */ | |||
| if (!(mm_flags & AV_CPU_FLAG_3DNOW)) | |||
| c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext; | |||
| @@ -1636,46 +1495,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, | |||
| #endif /* HAVE_MMXEXT_EXTERNAL */ | |||
| } | |||
| static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, | |||
| int mm_flags) | |||
| { | |||
| const int high_bit_depth = avctx->bits_per_raw_sample > 8; | |||
| #if HAVE_YASM | |||
| if (!high_bit_depth) { | |||
| c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; | |||
| c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow; | |||
| c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow; | |||
| c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow; | |||
| c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; | |||
| c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; | |||
| c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; | |||
| c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; | |||
| c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; | |||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
| c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow; | |||
| c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; | |||
| c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow; | |||
| c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; | |||
| } | |||
| } | |||
| if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 || | |||
| avctx->codec_id == AV_CODEC_ID_THEORA)) { | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| } | |||
| static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, | |||
| int mm_flags) | |||
| { | |||
| @@ -1716,15 +1535,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, | |||
| #endif /* HAVE_SSE2_INLINE */ | |||
| #if HAVE_SSE2_EXTERNAL | |||
| if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { | |||
| // these functions are slower than mmx on AMD, but faster on Intel | |||
| if (!high_bit_depth) { | |||
| c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; | |||
| c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; | |||
| } | |||
| } | |||
| c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |||
| c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | |||
| if (mm_flags & AV_CPU_FLAG_ATOM) { | |||
| @@ -1811,9 +1621,6 @@ av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) | |||
| if (mm_flags & AV_CPU_FLAG_MMXEXT) | |||
| dsputil_init_mmxext(c, avctx, mm_flags); | |||
| if (mm_flags & AV_CPU_FLAG_3DNOW) | |||
| dsputil_init_3dnow(c, avctx, mm_flags); | |||
| if (mm_flags & AV_CPU_FLAG_SSE) | |||
| dsputil_init_sse(c, avctx, mm_flags); | |||
| @@ -25,212 +25,6 @@ | |||
| */ | |||
| // put_pixels | |||
| static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "testl $1, %0 \n\t" | |||
| " jz 1f \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq (%2), %%mm1 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "add $8, %2 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "decl %0 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq (%2), %%mm1 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "movq (%1), %%mm2 \n\t" | |||
| "movq 8(%2), %%mm3 \n\t" | |||
| "add %4, %1 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "movq %%mm5, (%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 16(%2), %%mm1 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "movq (%1), %%mm2 \n\t" | |||
| "movq 24(%2), %%mm3 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "add $32, %2 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "movq %%mm5, (%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used | |||
| :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |||
| #else | |||
| :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |||
| #endif | |||
| :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |||
| :"memory"); | |||
| } | |||
| static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "movq 8(%1), %%mm0 \n\t" | |||
| "movq 9(%1), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm2 \n\t" | |||
| "movq 9(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "movq 8(%1), %%mm0 \n\t" | |||
| "movq 9(%1), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm2 \n\t" | |||
| "movq 9(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "testl $1, %0 \n\t" | |||
| " jz 1f \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq (%2), %%mm1 \n\t" | |||
| "movq 8(%1), %%mm2 \n\t" | |||
| "movq 8(%2), %%mm3 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "add $16, %2 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "movq %%mm5, 8(%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "decl %0 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq (%2), %%mm1 \n\t" | |||
| "movq 8(%1), %%mm2 \n\t" | |||
| "movq 8(%2), %%mm3 \n\t" | |||
| "add %4, %1 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "movq %%mm5, 8(%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 16(%2), %%mm1 \n\t" | |||
| "movq 8(%1), %%mm2 \n\t" | |||
| "movq 24(%2), %%mm3 \n\t" | |||
| "add %4, %1 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%3) \n\t" | |||
| "movq %%mm5, 8(%3) \n\t" | |||
| "add %5, %3 \n\t" | |||
| "add $32, %2 \n\t" | |||
| "subl $2, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used | |||
| :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |||
| #else | |||
| :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |||
| #endif | |||
| :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |||
| :"memory"); | |||
| } | |||
| static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"),%%mm2 \n\t" | |||
| PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"),%%mm0 \n\t" | |||
| PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_ZERO(mm7); | |||
| @@ -297,27 +91,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff | |||
| :REG_a, "memory"); | |||
| } | |||
| // avg_pixels | |||
| static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movd %0, %%mm0 \n\t" | |||
| "movd %1, %%mm1 \n\t" | |||
| OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| "movd %%mm2, %0 \n\t" | |||
| :"+m"(*block) | |||
| :"m"(*pixels) | |||
| :"memory"); | |||
| pixels += line_size; | |||
| block += line_size; | |||
| } | |||
| while (--h); | |||
| } | |||
| #ifndef NO_RND | |||
| // in case more speed is needed - unrolling would certainly help | |||
| static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| @@ -337,7 +110,6 @@ static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t l | |||
| } | |||
| while (--h); | |||
| } | |||
| #endif // NO_RND | |||
| static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| @@ -362,141 +134,6 @@ static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t | |||
| while (--h); | |||
| } | |||
| #ifndef NO_RND | |||
| static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movq %1, %%mm0 \n\t" | |||
| "movq 1%1, %%mm1 \n\t" | |||
| "movq %0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, %0 \n\t" | |||
| :"+m"(*block) | |||
| :"m"(*pixels) | |||
| :"memory"); | |||
| pixels += line_size; | |||
| block += line_size; | |||
| } while (--h); | |||
| } | |||
| #endif // NO_RND | |||
| static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movq %1, %%mm0 \n\t" | |||
| "movq %2, %%mm1 \n\t" | |||
| "movq %0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, %0 \n\t" | |||
| :"+m"(*dst) | |||
| :"m"(*src1), "m"(*src2) | |||
| :"memory"); | |||
| dst += dstStride; | |||
| src1 += src1Stride; | |||
| src2 += 8; | |||
| } while (--h); | |||
| } | |||
| static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movq %1, %%mm0 \n\t" | |||
| "movq 1%1, %%mm1 \n\t" | |||
| "movq %0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, %0 \n\t" | |||
| "movq 8%1, %%mm0 \n\t" | |||
| "movq 9%1, %%mm1 \n\t" | |||
| "movq 8%0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, 8%0 \n\t" | |||
| :"+m"(*block) | |||
| :"m"(*pixels) | |||
| :"memory"); | |||
| pixels += line_size; | |||
| block += line_size; | |||
| } while (--h); | |||
| } | |||
| static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movq %1, %%mm0 \n\t" | |||
| "movq %2, %%mm1 \n\t" | |||
| "movq %0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, %0 \n\t" | |||
| "movq 8%1, %%mm0 \n\t" | |||
| "movq 8%2, %%mm1 \n\t" | |||
| "movq 8%0, %%mm3 \n\t" | |||
| PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) | |||
| "movq %%mm0, 8%0 \n\t" | |||
| :"+m"(*dst) | |||
| :"m"(*src1), "m"(*src2) | |||
| :"memory"); | |||
| dst += dstStride; | |||
| src1 += src1Stride; | |||
| src2 += 16; | |||
| } while (--h); | |||
| } | |||
| static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm2 \n\t" | |||
| PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |||
| "movq (%2), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) | |||
| "movq (%2, %3), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
| PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
| "movq (%2), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) | |||
| "movq (%2, %3), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) | |||
| "movq %%mm2, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| // this routine is 'slightly' suboptimal but mostly unused | |||
| static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| @@ -573,21 +210,11 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff | |||
| } | |||
| //FIXME optimize | |||
| static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ | |||
| DEF(put, pixels8_y2)(block , pixels , line_size, h); | |||
| DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); | |||
| } | |||
| static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ | |||
| DEF(put, pixels8_xy2)(block , pixels , line_size, h); | |||
| DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); | |||
| } | |||
| static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ | |||
| DEF(avg, pixels8_y2)(block , pixels , line_size, h); | |||
| DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); | |||
| } | |||
| static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ | |||
| DEF(avg, pixels8_xy2)(block , pixels , line_size, h); | |||
| DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); | |||
| @@ -0,0 +1,106 @@ | |||
| ;****************************************************************************** | |||
| ;* MMX optimized DSP utils | |||
| ;* Copyright (c) 2008 Loren Merritt | |||
| ;* Copyright (c) 2003-2013 Michael Niedermayer | |||
| ;* Copyright (c) 2013 Daniel Kang | |||
| ;* | |||
| ;* This file is part of FFmpeg. | |||
| ;* | |||
| ;* FFmpeg is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with FFmpeg; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %include "libavutil/x86/x86util.asm" | |||
| SECTION .text | |||
| INIT_MMX mmxext | |||
| ; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
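; %1 = put or avg, %2 = block width: 4-wide blocks use movh, 8-wide use
; mova; four rows are processed per loop iteration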
| %macro PIXELS48 2 | |||
| %if %2 == 4 | |||
| %define OP movh | |||
| %else | |||
| %define OP mova | |||
| %endif | |||
| cglobal %1_pixels%2, 4,5 | |||
| movsxdifnidn r2, r2d | |||
| lea r4, [r2*3] | |||
| .loop: | |||
| OP m0, [r1] | |||
| OP m1, [r1+r2] | |||
| OP m2, [r1+r2*2] | |||
| OP m3, [r1+r4] | |||
| lea r1, [r1+r2*4] | |||
| %ifidn %1, avg | |||
| pavgb m0, [r0] | |||
| pavgb m1, [r0+r2] | |||
| pavgb m2, [r0+r2*2] | |||
| pavgb m3, [r0+r4] | |||
| %endif | |||
| OP [r0], m0 | |||
| OP [r0+r2], m1 | |||
| OP [r0+r2*2], m2 | |||
| OP [r0+r4], m3 | |||
| sub r3d, 4 | |||
| lea r0, [r0+r2*4] | |||
| jne .loop | |||
| RET | |||
| %endmacro | |||
| PIXELS48 put, 4 | |||
| PIXELS48 avg, 4 | |||
| PIXELS48 put, 8 | |||
| PIXELS48 avg, 8 | |||
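In scalar terms, the PIXELS48 instantiations above implement a plain block copy and a rounded average against the destination. A minimal C sketch (illustration only; the helper names are hypothetical):

#include <stddef.h>
#include <stdint.h>

/* put: copy a w x h block; avg: round-to-nearest against the bytes already
 * in the destination, matching the pavgb instruction used above. */
static void put_pixels_c(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            block[x] = pixels[x];
        block  += line_size;
        pixels += line_size;
    }
}

static void avg_pixels_c(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            block[x] = (block[x] + pixels[x] + 1) >> 1; /* pavgb rounding */
        block  += line_size;
        pixels += line_size;
    }
}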
| INIT_XMM sse2 | |||
| ; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
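; pixels is not guaranteed to be aligned, hence the movu loads; the stores
; use mova, assuming block is 16-byte aligned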
| cglobal put_pixels16, 4,5,4 | |||
| lea r4, [r2*3] | |||
| .loop: | |||
| movu m0, [r1] | |||
| movu m1, [r1+r2] | |||
| movu m2, [r1+r2*2] | |||
| movu m3, [r1+r4] | |||
| lea r1, [r1+r2*4] | |||
| mova [r0], m0 | |||
| mova [r0+r2], m1 | |||
| mova [r0+r2*2], m2 | |||
| mova [r0+r4], m3 | |||
| sub r3d, 4 | |||
| lea r0, [r0+r2*4] | |||
| jnz .loop | |||
| REP_RET | |||
| ; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| cglobal avg_pixels16, 4,5,4 | |||
| lea r4, [r2*3] | |||
| .loop: | |||
| movu m0, [r1] | |||
| movu m1, [r1+r2] | |||
| movu m2, [r1+r2*2] | |||
| movu m3, [r1+r4] | |||
| lea r1, [r1+r2*4] | |||
| pavgb m0, [r0] | |||
| pavgb m1, [r0+r2] | |||
| pavgb m2, [r0+r2*2] | |||
| pavgb m3, [r0+r4] | |||
| mova [r0], m0 | |||
| mova [r0+r2], m1 | |||
| mova [r0+r2*2], m2 | |||
| mova [r0+r4], m3 | |||
| sub r3d, 4 | |||
| lea r0, [r0+r2*4] | |||
| jnz .loop | |||
| REP_RET | |||
| @@ -0,0 +1,415 @@ | |||
| /* | |||
| * MMX optimized DSP utils | |||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||
| * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| * | |||
| * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |||
| */ | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/x86/asm.h" | |||
| #include "libavcodec/hpeldsp.h" | |||
| #include "dsputil_mmx.h" | |||
| //#undef NDEBUG | |||
| //#include <assert.h> | |||
| #if HAVE_YASM | |||
| void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, | |||
| const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| #endif /* HAVE_YASM */ | |||
| #if HAVE_INLINE_ASM | |||
| #define JUMPALIGN() __asm__ volatile (".p2align 3"::) | |||
| #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) | |||
| #define MOVQ_BFE(regd) \ | |||
| __asm__ volatile ( \ | |||
| "pcmpeqd %%"#regd", %%"#regd" \n\t" \ | |||
| "paddb %%"#regd", %%"#regd" \n\t" ::) | |||
| #ifndef PIC | |||
| #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) | |||
| #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) | |||
| #else | |||
| // for shared libraries it is better to generate constants this way than to load them from memory | |||
| // pcmpeqd -> -1 | |||
| #define MOVQ_BONE(regd) \ | |||
| __asm__ volatile ( \ | |||
| "pcmpeqd %%"#regd", %%"#regd" \n\t" \ | |||
| "psrlw $15, %%"#regd" \n\t" \ | |||
| "packuswb %%"#regd", %%"#regd" \n\t" ::) | |||
| #define MOVQ_WTWO(regd) \ | |||
| __asm__ volatile ( \ | |||
| "pcmpeqd %%"#regd", %%"#regd" \n\t" \ | |||
| "psrlw $15, %%"#regd" \n\t" \ | |||
| "psllw $1, %%"#regd" \n\t"::) | |||
| #endif | |||
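The PIC variants synthesize the constants in registers instead of loading them from memory, which would need a relocation in position-independent code. What the instruction sequences compute, per 16-bit lane (a scalar sketch, illustration only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint16_t word = 0xffff;        /* pcmpeqd: all bits set            */
    uint16_t wone = word >> 15;    /* psrlw $15 -> 0x0001 per word     */
    uint16_t wtwo = wone << 1;     /* psllw $1  -> 0x0002 (MOVQ_WTWO)  */
    uint8_t  bone = (uint8_t)wone; /* packuswb saturates 0x0001 words
                                      to 0x01 bytes (MOVQ_BONE)        */
    assert(wone == 0x0001 && wtwo == 0x0002 && bone == 0x01);
    return 0;
}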
| // using regr as temporary and for the output result | |||
| // first argument is unmodified and second is trashed | |||
| // regfe is supposed to contain 0xfefefefefefefefe | |||
| #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "pand "#regb", "#regr" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pand "#regfe", "#regb" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "paddb "#regb", "#regr" \n\t" | |||
| #define PAVGB_MMX(rega, regb, regr, regfe) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "por "#regb", "#regr" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pand "#regfe", "#regb" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "psubb "#regb", "#regr" \n\t" | |||
| // mm6 is supposed to contain 0xfefefefefefefefe | |||
| #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "movq "#regc", "#regp" \n\t" \ | |||
| "pand "#regb", "#regr" \n\t" \ | |||
| "pand "#regd", "#regp" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pxor "#regc", "#regd" \n\t" \ | |||
| "pand %%mm6, "#regb" \n\t" \ | |||
| "pand %%mm6, "#regd" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "psrlq $1, "#regd" \n\t" \ | |||
| "paddb "#regb", "#regr" \n\t" \ | |||
| "paddb "#regd", "#regp" \n\t" | |||
| #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |||
| "movq "#rega", "#regr" \n\t" \ | |||
| "movq "#regc", "#regp" \n\t" \ | |||
| "por "#regb", "#regr" \n\t" \ | |||
| "por "#regd", "#regp" \n\t" \ | |||
| "pxor "#rega", "#regb" \n\t" \ | |||
| "pxor "#regc", "#regd" \n\t" \ | |||
| "pand %%mm6, "#regb" \n\t" \ | |||
| "pand %%mm6, "#regd" \n\t" \ | |||
| "psrlq $1, "#regd" \n\t" \ | |||
| "psrlq $1, "#regb" \n\t" \ | |||
| "psubb "#regb", "#regr" \n\t" \ | |||
| "psubb "#regd", "#regp" \n\t" | |||
| /***********************************/ | |||
| /* MMX no rounding */ | |||
| #define NO_RND 1 | |||
| #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx | |||
| #define SET_RND MOVQ_WONE | |||
| #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |||
| #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) | |||
| #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) | |||
| #include "hpeldsp_rnd_template.c" | |||
| #undef DEF | |||
| #undef SET_RND | |||
| #undef PAVGBP | |||
| #undef PAVGB | |||
| #undef NO_RND | |||
| /***********************************/ | |||
| /* MMX rounding */ | |||
| #define DEF(x, y) x ## _ ## y ## _mmx | |||
| #define SET_RND MOVQ_WTWO | |||
| #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |||
| #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) | |||
| #include "hpeldsp_rnd_template.c" | |||
| #undef DEF | |||
| #undef SET_RND | |||
| #undef PAVGBP | |||
| #undef PAVGB | |||
| #undef OP_AVG | |||
| #endif /* HAVE_INLINE_ASM */ | |||
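The template is compiled twice with different DEF/SET_RND/PAVGB bindings, so one source file yields both the _no_rnd_ and the rounding function sets. A stand-alone illustration of the naming mechanics (hypothetical demo names, not part of the patch):

#include <stdio.h>

#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
static void DEF(put, pixels_demo)(void) { puts("no-rounding flavour"); }
#undef DEF

#define DEF(x, y) x ## _ ## y ## _mmx
static void DEF(put, pixels_demo)(void) { puts("rounding flavour"); }
#undef DEF

int main(void)
{
    put_no_rnd_pixels_demo_mmx(); /* from the first expansion  */
    put_pixels_demo_mmx();        /* from the second expansion */
    return 0;
}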
| #if HAVE_YASM | |||
| #define ff_put_pixels8_mmx ff_put_pixels8_mmxext | |||
| /***********************************/ | |||
| /* 3Dnow specific */ | |||
| #define DEF(x) x ## _3dnow | |||
| #include "hpeldsp_avg_template.c" | |||
| #undef DEF | |||
| /***********************************/ | |||
| /* MMXEXT specific */ | |||
| #define DEF(x) x ## _mmxext | |||
| #include "hpeldsp_avg_template.c" | |||
| #undef DEF | |||
| #endif /* HAVE_YASM */ | |||
| #if HAVE_INLINE_ASM | |||
| #define put_no_rnd_pixels16_mmx put_pixels16_mmx | |||
| #define put_no_rnd_pixels8_mmx put_pixels8_mmx | |||
| #define put_pixels16_mmxext put_pixels16_mmx | |||
| #define put_pixels8_mmxext put_pixels8_mmx | |||
| #define put_pixels4_mmxext put_pixels4_mmx | |||
| #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx | |||
| #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx | |||
| static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h) | |||
| { | |||
| __asm__ volatile ( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1 ), %%mm0 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1 ), %%mm0 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+g"(h), "+r"(pixels), "+r"(block) | |||
| : "r"((x86_reg)line_size) | |||
| : "%"REG_a, "memory" | |||
| ); | |||
| } | |||
| static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h) | |||
| { | |||
| __asm__ volatile ( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1 ), %%mm0 \n\t" | |||
| "movq 8(%1 ), %%mm4 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm5 \n\t" | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1 ), %%mm0 \n\t" | |||
| "movq 8(%1 ), %%mm4 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm5 \n\t" | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+g"(h), "+r"(pixels), "+r"(block) | |||
| : "r"((x86_reg)line_size) | |||
| : "%"REG_a, "memory" | |||
| ); | |||
| } | |||
| #endif /* HAVE_INLINE_ASM */ | |||
| void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, | |||
| ptrdiff_t line_size, int h); | |||
| #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |||
| do { \ | |||
| c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | |||
| c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ | |||
| } while (0) | |||
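For illustration, SET_HPEL_FUNCS(put, [0], 16, mmx) pastes tokens into:

c->put_pixels_tab[0][0] = put_pixels16_mmx;     /* full-pel    */
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;  /* half-pel x  */
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;  /* half-pel y  */
c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; /* half-pel xy */

The empty IDX argument in SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx) likewise targets the one-dimensional avg_no_rnd_pixels_tab.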
| static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags) | |||
| { | |||
| #if HAVE_INLINE_ASM | |||
| SET_HPEL_FUNCS(put, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(avg, [0], 16, mmx); | |||
| SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); | |||
| SET_HPEL_FUNCS(put, [1], 8, mmx); | |||
| SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); | |||
| SET_HPEL_FUNCS(avg, [1], 8, mmx); | |||
| #endif /* HAVE_INLINE_ASM */ | |||
| } | |||
| static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags) | |||
| { | |||
| #if HAVE_YASM | |||
| c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; | |||
| c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; | |||
| c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext; | |||
| c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext; | |||
| c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; | |||
| c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; | |||
| c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; | |||
| c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; | |||
| c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; | |||
| if (!(flags & CODEC_FLAG_BITEXACT)) { | |||
| c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext; | |||
| c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; | |||
| c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext; | |||
| c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| #if HAVE_MMXEXT_EXTERNAL | |||
| if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; | |||
| } | |||
| #endif /* HAVE_MMXEXT_EXTERNAL */ | |||
| } | |||
| static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags) | |||
| { | |||
| #if HAVE_YASM | |||
| c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; | |||
| c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow; | |||
| c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow; | |||
| c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow; | |||
| c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; | |||
| c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; | |||
| c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; | |||
| c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; | |||
| c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; | |||
| if (!(flags & CODEC_FLAG_BITEXACT)){ | |||
| c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow; | |||
| c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; | |||
| c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow; | |||
| c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; | |||
| } | |||
| if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { | |||
| c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; | |||
| c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| } | |||
| static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags) | |||
| { | |||
| #if HAVE_SSE2_EXTERNAL | |||
| if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { | |||
| // these functions are slower than mmx on AMD, but faster on Intel | |||
| c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; | |||
| c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; | |||
| c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; | |||
| } | |||
| #endif /* HAVE_SSE2_EXTERNAL */ | |||
| } | |||
| void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) | |||
| { | |||
| int mm_flags = av_get_cpu_flags(); | |||
| if (mm_flags & AV_CPU_FLAG_MMX) | |||
| hpeldsp_init_mmx(c, flags, mm_flags); | |||
| if (mm_flags & AV_CPU_FLAG_MMXEXT) | |||
| hpeldsp_init_mmxext(c, flags, mm_flags); | |||
| if (mm_flags & AV_CPU_FLAG_3DNOW) | |||
| hpeldsp_init_3dnow(c, flags, mm_flags); | |||
| if (mm_flags & AV_CPU_FLAG_SSE2) | |||
| hpeldsp_init_sse2(c, flags, mm_flags); | |||
| } | |||
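A minimal usage sketch (hypothetical caller, not part of this patch): a decoder initializes the context once and dispatches through the tables with the half-pel position encoded as dxy = 2*dy + dx:

#include "libavcodec/hpeldsp.h"

static void copy_block16(HpelDSPContext *h, uint8_t *dst, const uint8_t *src,
                         ptrdiff_t stride, int dx, int dy, int height)
{
    int dxy = 2 * dy + dx;  /* 0: full-pel, 1: x2, 2: y2, 3: xy2 */
    h->put_pixels_tab[0][dxy](dst, src, stride, height);
}

static void init_example(HpelDSPContext *h, int codec_flags)
{
    /* ff_hpeldsp_init() picks up ff_hpeldsp_init_x86() on x86 builds */
    ff_hpeldsp_init(h, codec_flags);
}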
| @@ -0,0 +1,428 @@ | |||
| /* | |||
| * DSP utils mmx functions are compiled twice for rnd/no_rnd | |||
| * Copyright (c) 2000, 2001 Fabrice Bellard | |||
| * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> | |||
| * | |||
| * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |||
| * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |||
| * and improved by Zdenek Kabelac <kabi@users.sf.net> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| // put_pixels | |||
| static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "movq 8(%1), %%mm0 \n\t" | |||
| "movq 9(%1), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm2 \n\t" | |||
| "movq 9(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm1 \n\t" | |||
| "movq (%1, %3), %%mm2 \n\t" | |||
| "movq 1(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "movq 8(%1), %%mm0 \n\t" | |||
| "movq 9(%1), %%mm1 \n\t" | |||
| "movq 8(%1, %3), %%mm2 \n\t" | |||
| "movq 9(%1, %3), %%mm3 \n\t" | |||
| PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
| "movq %%mm4, 8(%2) \n\t" | |||
| "movq %%mm5, 8(%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| __asm__ volatile( | |||
| "lea (%3, %3), %%"REG_a" \n\t" | |||
| "movq (%1), %%mm0 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"),%%mm2 \n\t" | |||
| PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"),%%mm0 \n\t" | |||
| PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
| "movq %%mm4, (%2) \n\t" | |||
| "movq %%mm5, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_ZERO(mm7); | |||
| SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |||
| __asm__ volatile( | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm4 \n\t" | |||
| "movq %%mm0, %%mm1 \n\t" | |||
| "movq %%mm4, %%mm5 \n\t" | |||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||
| "punpcklbw %%mm7, %%mm4 \n\t" | |||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||
| "punpckhbw %%mm7, %%mm5 \n\t" | |||
| "paddusw %%mm0, %%mm4 \n\t" | |||
| "paddusw %%mm1, %%mm5 \n\t" | |||
| "xor %%"REG_a", %%"REG_a" \n\t" | |||
| "add %3, %1 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
| "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |||
| "movq %%mm0, %%mm1 \n\t" | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||
| "punpckhbw %%mm7, %%mm3 \n\t" | |||
| "paddusw %%mm2, %%mm0 \n\t" | |||
| "paddusw %%mm3, %%mm1 \n\t" | |||
| "paddusw %%mm6, %%mm4 \n\t" | |||
| "paddusw %%mm6, %%mm5 \n\t" | |||
| "paddusw %%mm0, %%mm4 \n\t" | |||
| "paddusw %%mm1, %%mm5 \n\t" | |||
| "psrlw $2, %%mm4 \n\t" | |||
| "psrlw $2, %%mm5 \n\t" | |||
| "packuswb %%mm5, %%mm4 \n\t" | |||
| "movq %%mm4, (%2, %%"REG_a") \n\t" | |||
| "add %3, %%"REG_a" \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |||
| "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "movq %%mm4, %%mm5 \n\t" | |||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||
| "punpcklbw %%mm7, %%mm4 \n\t" | |||
| "punpckhbw %%mm7, %%mm3 \n\t" | |||
| "punpckhbw %%mm7, %%mm5 \n\t" | |||
| "paddusw %%mm2, %%mm4 \n\t" | |||
| "paddusw %%mm3, %%mm5 \n\t" | |||
| "paddusw %%mm6, %%mm0 \n\t" | |||
| "paddusw %%mm6, %%mm1 \n\t" | |||
| "paddusw %%mm4, %%mm0 \n\t" | |||
| "paddusw %%mm5, %%mm1 \n\t" | |||
| "psrlw $2, %%mm0 \n\t" | |||
| "psrlw $2, %%mm1 \n\t" | |||
| "packuswb %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%2, %%"REG_a") \n\t" | |||
| "add %3, %%"REG_a" \n\t" | |||
| "subl $2, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels) | |||
| :"D"(block), "r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
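In scalar terms, the xy2 kernel above averages a 2x2 source neighbourhood per output byte; SET_RND selects the bias in mm6, MOVQ_WTWO (+2) for the rounding build of this template and MOVQ_WONE (+1) for the no_rnd build. A reference sketch (illustration only; the bias parameter stands in for SET_RND):

static void put_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h, int bias)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x]             + pixels[x + 1] +
                        pixels[x + line_size] + pixels[x + line_size + 1] +
                        bias) >> 2;
        block  += line_size;
        pixels += line_size;
    }
}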
| // avg_pixels | |||
| #ifndef NO_RND | |||
| // in case more speed is needed - unrolling would certainly help | |||
| static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_BFE(mm6); | |||
| JUMPALIGN(); | |||
| do { | |||
| __asm__ volatile( | |||
| "movq %0, %%mm0 \n\t" | |||
| "movq %1, %%mm1 \n\t" | |||
| OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) | |||
| "movq %%mm2, %0 \n\t" | |||
| :"+m"(*block) | |||
| :"m"(*pixels) | |||
| :"memory"); | |||
| pixels += line_size; | |||
| block += line_size; | |||
| } | |||
| while (--h); | |||
| } | |||
| #endif // NO_RND | |||

static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            "movq 8%0, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}

#ifndef NO_RND
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
#endif // NO_RND

static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 9%1, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block  += line_size;
    } while (--h);
}

static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
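        // PAVGBP does two rounded byte-averages at once: mm4 = avg(mm1, mm0), mm5 = avg(mm2, mm1)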
| "movq (%2), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) | |||
| "movq (%2, %3), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) | |||
| "movq %%mm0, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "movq (%1, %3), %%mm1 \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
| PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
| "movq (%2), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) | |||
| "movq (%2, %3), %%mm3 \n\t" | |||
| OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) | |||
| "movq %%mm2, (%2) \n\t" | |||
| "movq %%mm1, (%2, %3) \n\t" | |||
| "add %%"REG_a", %1 \n\t" | |||
| "add %%"REG_a", %2 \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| :"+g"(h), "+S"(pixels), "+D"(block) | |||
| :"r"((x86_reg)line_size) | |||
| :REG_a, "memory"); | |||
| } | |||
| // this routine is 'slightly' suboptimal but mostly unused | |||
| static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
| { | |||
| MOVQ_ZERO(mm7); | |||
| SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |||
| __asm__ volatile( | |||
| "movq (%1), %%mm0 \n\t" | |||
| "movq 1(%1), %%mm4 \n\t" | |||
| "movq %%mm0, %%mm1 \n\t" | |||
| "movq %%mm4, %%mm5 \n\t" | |||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||
| "punpcklbw %%mm7, %%mm4 \n\t" | |||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||
| "punpckhbw %%mm7, %%mm5 \n\t" | |||
| "paddusw %%mm0, %%mm4 \n\t" | |||
| "paddusw %%mm1, %%mm5 \n\t" | |||
| "xor %%"REG_a", %%"REG_a" \n\t" | |||
| "add %3, %1 \n\t" | |||
| ".p2align 3 \n\t" | |||
| "1: \n\t" | |||
| "movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
| "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |||
| "movq %%mm0, %%mm1 \n\t" | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||
| "punpckhbw %%mm7, %%mm3 \n\t" | |||
| "paddusw %%mm2, %%mm0 \n\t" | |||
| "paddusw %%mm3, %%mm1 \n\t" | |||
| "paddusw %%mm6, %%mm4 \n\t" | |||
| "paddusw %%mm6, %%mm5 \n\t" | |||
| "paddusw %%mm0, %%mm4 \n\t" | |||
| "paddusw %%mm1, %%mm5 \n\t" | |||
| "psrlw $2, %%mm4 \n\t" | |||
| "psrlw $2, %%mm5 \n\t" | |||
| "movq (%2, %%"REG_a"), %%mm3 \n\t" | |||
| "packuswb %%mm5, %%mm4 \n\t" | |||
| "pcmpeqd %%mm2, %%mm2 \n\t" | |||
| "paddb %%mm2, %%mm2 \n\t" | |||
        OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t" // second pass of the unrolled loop: register roles swap, 0 <-> 2 and 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"
        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

//FIXME optimize
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

@@ -174,83 +174,3 @@ cglobal %1_pixels16_l2, 6,6
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg

INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea          r4, [r2*3]
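    ; r4 = 3 * line_size, so [r1], [r1+r2], [r1+r2*2] and [r1+r4] address four consecutive rows per iteration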
.loop:
    OP    m0, [r1]
    OP    m1, [r1+r2]
    OP    m2, [r1+r2*2]
    OP    m3, [r1+r4]
    lea   r1, [r1+r2*4]
%ifidn %1, avg
    pavgb m0, [r0]
    pavgb m1, [r0+r2]
    pavgb m2, [r0+r2*2]
    pavgb m3, [r0+r4]
%endif
    OP    [r0],      m0
    OP    [r0+r2],   m1
    OP    [r0+r2*2], m2
    OP    [r0+r4],   m3
    sub   r3d, 4
    lea   r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
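
For reference (not part of the patch), a scalar model of what PIXELS48 expands to: OP is movh (4 bytes) for the width-4 variant and mova (8 bytes) for width 8, four rows are handled per iteration, so h is assumed to be a multiple of 4. The function name and the do_avg switch are ours, for exposition only:

static void pixels48_scalar(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h, int w, int do_avg)
{
    for (; h > 0; h--) {
        for (int i = 0; i < w; i++)                          /* w = 4 or 8 */
            block[i] = do_avg ? (block[i] + pixels[i] + 1) >> 1  /* pavgb */
                              : pixels[i];
        block  += line_size;
        pixels += line_size;
    }
}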

INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal put_pixels16, 4,5,4
    lea   r4, [r2*3]
.loop:
    movu  m0, [r1]
    movu  m1, [r1+r2]
    movu  m2, [r1+r2*2]
    movu  m3, [r1+r4]
    lea   r1, [r1+r2*4]
    mova  [r0],      m0
    mova  [r0+r2],   m1
    mova  [r0+r2*2], m2
    mova  [r0+r4],   m3
    sub   r3d, 4
    lea   r0, [r0+r2*4]
    jnz .loop
    REP_RET
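    ; (REP_RET emits "rep ret", avoiding the AMD K8/K10 branch-predictor penalty for a ret that directly follows a jump)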

; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal avg_pixels16, 4,5,4
    lea   r4, [r2*3]
.loop:
    movu  m0, [r1]
    movu  m1, [r1+r2]
    movu  m2, [r1+r2*2]
    movu  m3, [r1+r4]
    lea   r1, [r1+r2*4]
    pavgb m0, [r0]
    pavgb m1, [r0+r2]
    pavgb m2, [r0+r2*2]
    pavgb m3, [r0+r4]
    mova  [r0],      m0
    mova  [r0+r2],   m1
    mova  [r0+r2*2], m2
    mova  [r0+r4],   m3
    sub   r3d, 4
    lea   r0, [r0+r2*4]
    jnz .loop
    REP_RET
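
For context (not part of the patch): a minimal sketch of how these SSE2 entry points could be wired up, assuming the usual hpeldsp tab layout where tab[0] holds the 16-pixel-wide functions and slot 0 is the no-half-pel-offset case; the init-function name is ours and the cpu-flag check is elided:

static av_cold void hpeldsp_init_sse2_sketch(HpelDSPContext *c, int flags)
{
    /* tab[width index][half-pel offset]: assumed layout, illustrative only */
    c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
}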