Also add an SSE2 version.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

tags/n2.3
| @@ -31,6 +31,8 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 | |||||
| pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | ||||
| pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | ||||
| cextern pb_80 | |||||
| SECTION_TEXT | SECTION_TEXT | ||||
| %macro SCALARPRODUCT 0 | %macro SCALARPRODUCT 0 | ||||
| @@ -573,3 +575,53 @@ CLEAR_BLOCKS 0 | |||||
| INIT_XMM sse | INIT_XMM sse | ||||
| %define ZERO xorps | %define ZERO xorps | ||||
| CLEAR_BLOCKS 1 | CLEAR_BLOCKS 1 | ||||
| ;-------------------------------------------------------------------------- | |||||
| ;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels, | |||||
| ; int line_size) | |||||
| ;-------------------------------------------------------------------------- | |||||
; PUT_SIGNED_PIXELS_CLAMPED_HALF byte_offset
;
; Converts 64 bytes of 16-bit input starting at blockq+byte_offset into four
; 8-byte output rows stored at pixelsq, pixelsq+lsizeq, pixelsq+lsizeq*2 and
; pixelsq+lsize3q.
;
; %1 is the byte offset added to every blockq load. The macro reads m0 and
; lsize3q without setting them, so the invoking code must initialise both
; beforehand.
;
; mmsize == 8 : four registers (m1-m4) are used, one output row each.
; otherwise   : two registers (m1, m2) are used, each holding two output rows;
;               the low 8 bytes are stored with movq, the high 8 with movhps.
| %macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 | |||||
; Load the even-numbered mmsize-wide chunks of 16-bit words.
| mova m1, [blockq+mmsize*0+%1] | |||||
| mova m2, [blockq+mmsize*2+%1] | |||||
| %if mmsize == 8 | |||||
| mova m3, [blockq+mmsize*4+%1] | |||||
| mova m4, [blockq+mmsize*6+%1] | |||||
| %endif | |||||
; packsswb narrows signed words to signed bytes with saturation; the register
; supplies the low half of the result, the odd-numbered memory chunk the high.
| packsswb m1, [blockq+mmsize*1+%1] | |||||
| packsswb m2, [blockq+mmsize*3+%1] | |||||
| %if mmsize == 8 | |||||
| packsswb m3, [blockq+mmsize*5+%1] | |||||
| packsswb m4, [blockq+mmsize*7+%1] | |||||
| %endif | |||||
; Add the per-byte constant held in m0 (wrapping byte add).
; NOTE(review): m0 is loaded from pb_80 by the caller; presumably 0x80 in every
; byte, which would rebias signed bytes into the unsigned range - confirm
; against the definition of pb_80.
| paddb m1, m0 | |||||
| paddb m2, m0 | |||||
| %if mmsize == 8 | |||||
| paddb m3, m0 | |||||
| paddb m4, m0 | |||||
; 8-byte registers: one register per output row.
| movq [pixelsq+lsizeq*0], m1 | |||||
| movq [pixelsq+lsizeq*1], m2 | |||||
| movq [pixelsq+lsizeq*2], m3 | |||||
| movq [pixelsq+lsize3q ], m4 | |||||
| %else | |||||
; Wider registers: low half is one row, high half is the following row.
| movq [pixelsq+lsizeq*0], m1 | |||||
| movhps [pixelsq+lsizeq*1], m1 | |||||
| movq [pixelsq+lsizeq*2], m2 | |||||
| movhps [pixelsq+lsize3q ], m2 | |||||
| %endif | |||||
| %endmacro | |||||
; PUT_SIGNED_PIXELS_CLAMPED n
;
; Emits the function put_signed_pixels_clamped for the currently selected
; instruction set (see the INIT_MMX / INIT_XMM lines below). %1 is forwarded
; as the fourth cglobal argument; it is 0 for the mmx build and 3 for the sse2
; build, which matches the m0-m2 registers used on the non-MMX path.
; NOTE(review): presumably this is x86inc's "number of XMM registers used"
; parameter - confirm against x86inc.asm.
;
; The function processes the block in two halves of 64 bytes each (offsets 0
; and 64), advancing pixelsq by four lines in between, i.e. 128 bytes of input
; and eight output rows in total.
;
; NOTE(review): the prototype comment above declares line_size as int, but the
; full-width lsizeq is used for addressing and no sign extension is visible
; here. On x86-64, confirm the upper 32 bits are well defined (x86inc offers
; movsxdifnidn for this purpose).
| %macro PUT_SIGNED_PIXELS_CLAMPED 1 | |||||
| cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3 | |||||
; m0 = byte constant added to every packed byte by the HALF macro.
| mova m0, [pb_80] | |||||
; lsize3 = 3 * lsize, used to address the fourth row of each half.
| lea lsize3q, [lsizeq*3] | |||||
| PUT_SIGNED_PIXELS_CLAMPED_HALF 0 | |||||
; Step the destination down four rows before handling the second half.
| lea pixelsq, [pixelsq+lsizeq*4] | |||||
| PUT_SIGNED_PIXELS_CLAMPED_HALF 64 | |||||
| RET | |||||
| %endmacro | |||||
; Instantiate one version per instruction set.
| INIT_MMX mmx | |||||
| PUT_SIGNED_PIXELS_CLAMPED 0 | |||||
| INIT_XMM sse2 | |||||
| PUT_SIGNED_PIXELS_CLAMPED 3 | |||||
| @@ -530,7 +530,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||||
| { | { | ||||
| #if HAVE_MMX_INLINE | #if HAVE_MMX_INLINE | ||||
| c->put_pixels_clamped = ff_put_pixels_clamped_mmx; | c->put_pixels_clamped = ff_put_pixels_clamped_mmx; | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; | |||||
| c->add_pixels_clamped = ff_add_pixels_clamped_mmx; | c->add_pixels_clamped = ff_add_pixels_clamped_mmx; | ||||
| if (!high_bit_depth) { | if (!high_bit_depth) { | ||||
| @@ -550,6 +549,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||||
| c->clear_blocks = ff_clear_blocks_mmx; | c->clear_blocks = ff_clear_blocks_mmx; | ||||
| } | } | ||||
| c->vector_clip_int32 = ff_vector_clip_int32_mmx; | c->vector_clip_int32 = ff_vector_clip_int32_mmx; | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; | |||||
| #endif /* HAVE_MMX_EXTERNAL */ | #endif /* HAVE_MMX_EXTERNAL */ | ||||
| } | } | ||||
| @@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, | |||||
| c->vector_clip_int32 = ff_vector_clip_int32_sse2; | c->vector_clip_int32 = ff_vector_clip_int32_sse2; | ||||
| } | } | ||||
| c->bswap_buf = ff_bswap32_buf_sse2; | c->bswap_buf = ff_bswap32_buf_sse2; | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; | |||||
| #endif /* HAVE_SSE2_EXTERNAL */ | #endif /* HAVE_SSE2_EXTERNAL */ | ||||
| } | } | ||||
| @@ -94,42 +94,6 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | |||||
| : "memory"); | : "memory"); | ||||
| } | } | ||||
/*
 * Inline-asm fragment that converts 64 bytes of 16-bit input, starting at
 * byte offset `off` from operand %2, into four 8-byte rows.
 *
 * Operand numbering as used by ff_put_signed_pixels_clamped_mmx():
 *   %0 = pixels (destination), %1 = 3 * line_skip,
 *   %2 = block (source),       %3 = line_skip.
 *
 * Steps: load four 8-byte groups into mm1-mm4, packsswb each with the
 * following 8 bytes (signed words -> signed bytes with saturation), add mm0
 * to every byte, then store the rows at %0, %0+%3, %0+2*%3 and %0+%1.
 * mm0 must already hold the byte constant; this fragment does not load it.
 */
| #define put_signed_pixels_clamped_mmx_half(off) \ | |||||
| "movq "#off"(%2), %%mm1 \n\t" \ | |||||
| "movq 16 + "#off"(%2), %%mm2 \n\t" \ | |||||
| "movq 32 + "#off"(%2), %%mm3 \n\t" \ | |||||
| "movq 48 + "#off"(%2), %%mm4 \n\t" \ | |||||
| "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ | |||||
| "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ | |||||
| "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ | |||||
| "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ | |||||
| "paddb %%mm0, %%mm1 \n\t" \ | |||||
| "paddb %%mm0, %%mm2 \n\t" \ | |||||
| "paddb %%mm0, %%mm3 \n\t" \ | |||||
| "paddb %%mm0, %%mm4 \n\t" \ | |||||
| "movq %%mm1, (%0) \n\t" \ | |||||
| "movq %%mm2, (%0, %3) \n\t" \ | |||||
| "movq %%mm3, (%0, %3, 2) \n\t" \ | |||||
| "movq %%mm4, (%0, %1) \n\t" | |||||
/**
 * Inline-MMX implementation: narrows 128 bytes of int16_t input from `block`
 * to bytes (with signed saturation), adds the ff_pb_80 constant to each byte
 * and writes eight rows of 8 bytes to `pixels`, `line_size` bytes apart.
 *
 * @param block     source of the 16-bit values; 128 bytes are read
 * @param pixels    destination of the first output row
 * @param line_size distance in bytes between consecutive output rows
 *
 * NOTE(review): ff_pb_80 is presumably 0x80 in every byte, turning the signed
 * bytes into unsigned ones - confirm against its definition.
 * NOTE(review): the clobber list names only "memory"; mm0-mm4 are modified
 * but not declared - confirm this matches the project's inline-asm policy.
 */
| void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | |||||
| int line_size) | |||||
| { | |||||
/* Widen line_size to register width so it can be used in address operands. */
| x86_reg line_skip = line_size; | |||||
/* Written by the asm (lea below) as 3 * line_skip; no C-side initial value. */
| x86_reg line_skip3; | |||||
| __asm__ volatile ( | |||||
| "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" | |||||
| "lea (%3, %3, 2), %1 \n\t" | |||||
| put_signed_pixels_clamped_mmx_half(0) | |||||
/* Advance the destination by four rows before the second half. */
| "lea (%0, %3, 4), %0 \n\t" | |||||
| put_signed_pixels_clamped_mmx_half(64) | |||||
/* pixels is read-write, line_skip3 write-only; both are early-clobber. */
| : "+&r" (pixels), "=&r" (line_skip3) | |||||
| : "r" (block), "r" (line_skip) | |||||
| NAMED_CONSTRAINTS_ADD(ff_pb_80) | |||||
| : "memory"); | |||||
| } | |||||
| void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | ||||
| int line_size) | int line_size) | ||||
| { | { | ||||
| @@ -37,6 +37,8 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | |||||
| int line_size); | int line_size); | ||||
| void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, | ||||
| int line_size); | int line_size); | ||||
| void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, | |||||
| int line_size); | |||||
| void ff_clear_block_mmx(int16_t *block); | void ff_clear_block_mmx(int16_t *block); | ||||
| void ff_clear_block_sse(int16_t *block); | void ff_clear_block_sse(int16_t *block); | ||||