The function is only instantiated once, so there is no point in keeping it in a template file.
tags/n2.0
| @@ -1018,7 +1018,7 @@ void ff_put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) | |||||
| void ff_avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) | void ff_avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) | ||||
| { | { | ||||
| avg_pixels8_mmx(dst, src, stride, 8); | |||||
| ff_avg_pixels8_mmx(dst, src, stride, 8); | |||||
| } | } | ||||
| void ff_put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) | void ff_put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) | ||||
| @@ -156,6 +156,8 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_s | |||||
| void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); | void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); | ||||
| void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | |||||
| ptrdiff_t line_size, int h); | |||||
| void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | ||||
| ptrdiff_t line_size, int h); | ptrdiff_t line_size, int h); | ||||
| void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, | void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, | ||||
| @@ -29,6 +29,27 @@ | |||||
| #if HAVE_MMX_INLINE | #if HAVE_MMX_INLINE | ||||
| // in case more speed is needed - unrolling would certainly help | |||||
| void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | |||||
| ptrdiff_t line_size, int h) | |||||
| { | |||||
| MOVQ_BFE(mm6); | |||||
| JUMPALIGN(); | |||||
| do { | |||||
| __asm__ volatile( | |||||
| "movq %0, %%mm0 \n\t" | |||||
| "movq %1, %%mm1 \n\t" | |||||
| PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) | |||||
| "movq %%mm2, %0 \n\t" | |||||
| :"+m"(*block) | |||||
| :"m"(*pixels) | |||||
| :"memory"); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } | |||||
| while (--h); | |||||
| } | |||||
| void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, | ||||
| ptrdiff_t line_size, int h) | ptrdiff_t line_size, int h) | ||||
| { | { | ||||
| @@ -74,6 +74,7 @@ void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, | |||||
| void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, | void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, | ||||
| ptrdiff_t line_size, int h); | ptrdiff_t line_size, int h); | ||||
| #define avg_pixels8_mmx ff_avg_pixels8_mmx | |||||
| #define put_pixels8_mmx ff_put_pixels8_mmx | #define put_pixels8_mmx ff_put_pixels8_mmx | ||||
| #define put_pixels16_mmx ff_put_pixels16_mmx | #define put_pixels16_mmx ff_put_pixels16_mmx | ||||
| #define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx | #define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx | ||||
| @@ -92,28 +92,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff | |||||
| } | } | ||||
| // avg_pixels | // avg_pixels | ||||
| #ifndef NO_RND | |||||
| // in case more speed is needed - unrolling would certainly help | |||||
| static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||||
| { | |||||
| MOVQ_BFE(mm6); | |||||
| JUMPALIGN(); | |||||
| do { | |||||
| __asm__ volatile( | |||||
| "movq %0, %%mm0 \n\t" | |||||
| "movq %1, %%mm1 \n\t" | |||||
| OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) | |||||
| "movq %%mm2, %0 \n\t" | |||||
| :"+m"(*block) | |||||
| :"m"(*pixels) | |||||
| :"memory"); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } | |||||
| while (--h); | |||||
| } | |||||
| #endif /* NO_RND */ | |||||
| static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | ||||
| { | { | ||||
| MOVQ_BFE(mm6); | MOVQ_BFE(mm6); | ||||