Signed-off-by: James Almer <jamrial@gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
@@ -513,3 +513,63 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF

;-----------------------------------------
; void ff_clear_block(int16_t *blocks);
;-----------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline store loops
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
    ZERO  m0, m0
%assign %%i 0
%rep %2
    mova  [blocksq+mmsize*(0+%%i)], m0
    mova  [blocksq+mmsize*(1+%%i)], m0
    mova  [blocksq+mmsize*(2+%%i)], m0
    mova  [blocksq+mmsize*(3+%%i)], m0
    mova  [blocksq+mmsize*(4+%%i)], m0
    mova  [blocksq+mmsize*(5+%%i)], m0
    mova  [blocksq+mmsize*(6+%%i)], m0
    mova  [blocksq+mmsize*(7+%%i)], m0
%assign %%i %%i+8
%endrep
    RET
%endmacro

INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 2
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 1

;-----------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;-----------------------------------------
; %1 = number of xmm registers used
%macro CLEAR_BLOCKS 1
cglobal clear_blocks, 1, 2, %1, blocks, len
    add   blocksq, 768
    mov   lenq, -768
    ZERO  m0, m0
.loop:
    mova  [blocksq+lenq+mmsize*0], m0
    mova  [blocksq+lenq+mmsize*1], m0
    mova  [blocksq+lenq+mmsize*2], m0
    mova  [blocksq+lenq+mmsize*3], m0
    mova  [blocksq+lenq+mmsize*4], m0
    mova  [blocksq+lenq+mmsize*5], m0
    mova  [blocksq+lenq+mmsize*6], m0
    mova  [blocksq+lenq+mmsize*7], m0
    add   lenq, mmsize*8
    js .loop
    RET
%endmacro

INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
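
For reference, the new routines have the same effect as the generic C fallback: clear_block() zeroes a single 8x8 block of int16_t coefficients (128 bytes), and clear_blocks() zeroes the usual group of six blocks (768 bytes, the constant the loop above counts down through). A minimal C sketch of that behaviour, not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: clear one 8x8 block of DCT coefficients,
     * i.e. 64 * sizeof(int16_t) = 128 bytes. */
    static void clear_block_c(int16_t *block)
    {
        memset(block, 0, 64 * sizeof(int16_t));
    }

    /* Sketch only: clear the usual group of six blocks,
     * i.e. 6 * 128 = 768 bytes, matching the -768 counter above. */
    static void clear_blocks_c(int16_t *blocks)
    {
        memset(blocks, 0, 6 * 64 * sizeof(int16_t));
    }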
@@ -534,8 +534,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
    c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
    if (!high_bit_depth) {
        c->clear_block = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
        c->draw_edges = ff_draw_edges_mmx;
    }
@@ -547,6 +545,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMX_EXTERNAL
    if (!high_bit_depth) {
        c->clear_block = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }
    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}

@@ -585,7 +587,10 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
{
#if HAVE_SSE_INLINE
    c->vector_clipf = ff_vector_clipf_sse;
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#if HAVE_SSE_EXTERNAL
    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;
@@ -594,9 +599,7 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
        c->clear_block = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#endif
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = ff_gmc_sse;
#endif
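
The XvMC check above exists because the SSE versions store with aligned moves (movaps/mova), which fault on buffers that are not 16-byte aligned; decoder-owned coefficient buffers carry that alignment, but XvMCCreateBlocks() gives no such guarantee, so the SSE pointers are not installed in that case. A hypothetical standalone illustration of the requirement (alignas stands in for FFmpeg's DECLARE_ALIGNED; all names here are assumptions, not part of the patch):

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Decoder-style buffer: six 8x8 coefficient blocks, 16-byte aligned,
     * so aligned SSE stores are safe on it. */
    static alignas(16) int16_t blocks[6][64];

    int main(void)
    {
        /* An externally allocated (e.g. XvMC) buffer might not satisfy
         * this check, which is why the init code bails out early there. */
        printf("16-byte aligned: %s\n",
               (uintptr_t)blocks % 16 == 0 ? "yes" : "no");
        return 0;
    }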
@@ -172,61 +172,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
    } while (--i);
}

#define CLEAR_BLOCKS(name, n)                       \
void name(int16_t *blocks)                          \
{                                                   \
    __asm__ volatile (                              \
        "pxor %%mm7, %%mm7              \n\t"       \
        "mov $-"#n", %%"REG_a"          \n\t"       \
        "1:                             \n\t"       \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"       \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"       \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"       \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"       \
        "add $32, %%"REG_a"             \n\t"       \
        "js 1b                          \n\t"       \
        :: "r"(((uint8_t *) blocks) + n)            \
        : "%"REG_a);                                \
}
CLEAR_BLOCKS(ff_clear_blocks_mmx, 768)
CLEAR_BLOCKS(ff_clear_block_mmx, 128)

void ff_clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0  \n"
        "movaps %%xmm0,    (%0) \n"
        "movaps %%xmm0,  16(%0) \n"
        "movaps %%xmm0,  32(%0) \n"
        "movaps %%xmm0,  48(%0) \n"
        "movaps %%xmm0,  64(%0) \n"
        "movaps %%xmm0,  80(%0) \n"
        "movaps %%xmm0,  96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r" (block)
        : "memory");
}

void ff_clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov $-768, %%"REG_a"               \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add $128, %%"REG_a"                \n"
        "js 1b                              \n"
        :: "r"(((uint8_t *) blocks) + 128 * 6)
        : "%"REG_a);
}

void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;