~15% faster than sse2

Reviewed-by: Mickaël Raulet <mraulet@gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
tags/n2.4
@@ -156,8 +156,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6 | |||||
%endmacro | %endmacro | ||||
INIT_XMM sse2 | |||||
; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
%macro TRANSFORM_ADD_8 0 | |||||
; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
cglobal hevc_transform_add8_8, 3, 4, 8 | cglobal hevc_transform_add8_8, 3, 4, 8 | ||||
lea r3, [r2*3] | lea r3, [r2*3] | ||||
TR_ADD_SSE_8_8 | TR_ADD_SSE_8_8 | ||||
@@ -167,7 +167,7 @@ cglobal hevc_transform_add8_8, 3, 4, 8 | |||||
RET | RET | ||||
%if ARCH_X86_64 | %if ARCH_X86_64 | ||||
; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
cglobal hevc_transform_add16_8, 3, 4, 12 | cglobal hevc_transform_add16_8, 3, 4, 12 | ||||
lea r3, [r2*3] | lea r3, [r2*3] | ||||
TR_ADD_SSE_16_8 | TR_ADD_SSE_16_8 | ||||
@@ -178,7 +178,7 @@ cglobal hevc_transform_add16_8, 3, 4, 12 | |||||
%endrep | %endrep | ||||
RET | RET | ||||
; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) | |||||
cglobal hevc_transform_add32_8, 3, 4, 12 | cglobal hevc_transform_add32_8, 3, 4, 12 | ||||
TR_ADD_SSE_32_8 | TR_ADD_SSE_32_8 | ||||
@@ -190,6 +190,13 @@ cglobal hevc_transform_add32_8, 3, 4, 12 | |||||
RET | RET | ||||
%endif ;ARCH_X86_64 | %endif ;ARCH_X86_64 | ||||
%endmacro | |||||
INIT_XMM sse2 | |||||
TRANSFORM_ADD_8 | |||||
INIT_XMM avx | |||||
TRANSFORM_ADD_8 | |||||
;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) | ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) | ||||
;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
@@ -139,6 +139,10 @@ void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stri | |||||
void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | ||||
void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | ||||
void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | |||||
void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | |||||
void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | |||||
void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | ||||
void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | ||||
void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); | ||||
@@ -509,7 +509,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) | |||||
if (ARCH_X86_64) { | if (ARCH_X86_64) { | ||||
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; | ||||
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; | ||||
c->transform_add[2] = ff_hevc_transform_add16_8_avx; | |||||
c->transform_add[3] = ff_hevc_transform_add32_8_avx; | |||||
} | } | ||||
c->transform_add[1] = ff_hevc_transform_add8_8_avx; | |||||
} | } | ||||
if (EXTERNAL_AVX2(cpu_flags)) { | if (EXTERNAL_AVX2(cpu_flags)) { | ||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; | c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; | ||||