About 2x faster than the C version. (tags/n3.4)
@@ -166,6 +166,112 @@ align 16 | |||||
jl .loop | jl .loop | ||||
REP_RET | REP_RET | ||||
;********************************************************** | |||||
;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2], | |||||
; float L[2][38][64], | |||||
; int i, int len) | |||||
;********************************************************** | |||||
; ps_hybrid_analysis_ileave (SSE): interleave two float planes column-wise. | |||||
; For each column c from i up to 63 and each of 32 rows j, one pair | |||||
; { plane0[j][c], plane1[j][c] } is written to out at byte offset | |||||
; c*256 + j*8. plane0 starts at `in`, plane1 38*64*4 bytes after it; | |||||
; consecutive rows of a plane are 256 bytes (64 floats) apart. | |||||
; Register roles: | |||||
;   out  - write cursor, pre-offset by i*256 bytes | |||||
;   in   - read cursor, pre-offset by i*4 bytes (column i) | |||||
;   i    - reused as the count of columns still to process (64 - i) | |||||
;   len  - overwritten with the constant 256 before it is ever read, so | |||||
;          the caller's len value is not used by this code | |||||
;   in0/in1 - per-column row cursors into plane0 / plane1 | |||||
;   tmp  - scratch, then inner-loop byte counter | |||||
; NOTE(review): all loops are bottom-tested. With a remaining count of 0 | |||||
; (i == 64) neither bit test below fires and one full .loop16 pass would | |||||
; still run -- confirm callers always pass i < 64. | |||||
; NOTE(review): movaps requires 16-byte-aligned addresses; this presumably | |||||
; relies on `in` and `out` themselves being 16-byte aligned -- TODO confirm. | |||||
INIT_XMM sse | |||||
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp | |||||
; x86inc helper; presumably sign-extends i to pointer width where needed. | |||||
movsxdifnidn iq, id | |||||
; lenq = 256: used below both as the plane row stride (64 floats) and as | |||||
; the out column stride (32 pairs of floats). | |||||
mov lend, 32 << 3 | |||||
; in += i floats: start reading at column i. | |||||
lea inq, [inq+iq*4] | |||||
; out += i * 256 bytes: start writing at column i. | |||||
mov tmpd, id | |||||
shl tmpd, 8 | |||||
add outq, tmpq | |||||
; i = 64 - i: number of columns left. | |||||
mov tmpd, 64 | |||||
sub tmpd, id | |||||
mov id, tmpd | |||||
; Peel an odd column first, then a leftover pair, so that the main loop | |||||
; only ever sees a count that is a multiple of 4. | |||||
test id, 1 | |||||
jne .loop4 | |||||
test id, 2 | |||||
jne .loop8 | |||||
align 16 | |||||
; ---- four columns per outer pass ---- | |||||
.loop16: | |||||
; in0 = plane0 cursor, in1 = plane1 cursor (in0 + 38*64*4 bytes). | |||||
mov in0q, inq | |||||
mov in1q, 38*64*4 | |||||
add in1q, in0q | |||||
; tmp counts down 256 bytes in mmsize steps (mmsize presumably 16 for XMM, | |||||
; giving 16 iterations of two rows each). | |||||
mov tmpd, lend | |||||
.inner_loop16: | |||||
; Load four adjacent columns of rows j and j+1 from both planes. | |||||
movaps m0, [in0q] | |||||
movaps m1, [in1q] | |||||
movaps m2, [in0q+lenq] | |||||
movaps m3, [in1q+lenq] | |||||
; External macro; presumably a 4x4 float transpose of m0..m3 with m4 as | |||||
; scratch, so each register ends up holding one column's two pairs. | |||||
TRANSPOSE4x4PS 0, 1, 2, 3, 4 | |||||
; Store to four consecutive out columns (3*32*2*4 = 3 * lenq). | |||||
movaps [outq], m0 | |||||
movaps [outq+lenq], m1 | |||||
movaps [outq+lenq*2], m2 | |||||
movaps [outq+3*32*2*4], m3 | |||||
; Advance two rows in each plane and one vector in out. | |||||
lea in0q, [in0q+lenq*2] | |||||
lea in1q, [in1q+lenq*2] | |||||
add outq, mmsize | |||||
sub tmpd, mmsize | |||||
jg .inner_loop16 | |||||
; Next group: 4 floats further in `in`; out has already moved one column | |||||
; stride in the inner loop, so skip the other three. | |||||
add inq, 16 | |||||
add outq, 3*32*2*4 | |||||
sub id, 4 | |||||
jg .loop16 | |||||
RET | |||||
align 16 | |||||
; ---- two columns, run at most once ---- | |||||
.loop8: | |||||
mov in0q, inq | |||||
mov in1q, 38*64*4 | |||||
add in1q, in0q | |||||
mov tmpd, lend | |||||
.inner_loop8: | |||||
; Low halves take two columns of row j, high halves the same two columns | |||||
; of row j+1; m0 from plane0, m1 from plane1. | |||||
movlps m0, [in0q] | |||||
movlps m1, [in1q] | |||||
movhps m0, [in0q+lenq] | |||||
movhps m1, [in1q+lenq] | |||||
; External macros; presumably interleave m0/m1 (m2 as scratch) so that m0 | |||||
; holds the first column's pairs and m1 the second column's. | |||||
SBUTTERFLYPS 0, 1, 2 | |||||
SBUTTERFLYPD 0, 1, 2 | |||||
movaps [outq], m0 | |||||
movaps [outq+lenq], m1 | |||||
lea in0q, [in0q+lenq*2] | |||||
lea in1q, [in1q+lenq*2] | |||||
add outq, mmsize | |||||
sub tmpd, mmsize | |||||
jg .inner_loop8 | |||||
; Two columns consumed: skip the second out column as well. | |||||
add inq, 8 | |||||
add outq, lenq | |||||
sub id, 2 | |||||
; The remaining count is now a multiple of 4; continue in the main loop | |||||
; if anything is left. | |||||
jg .loop16 | |||||
RET | |||||
align 16 | |||||
; ---- single column, run at most once ---- | |||||
.loop4: | |||||
mov in0q, inq | |||||
mov in1q, 38*64*4 | |||||
add in1q, in0q | |||||
mov tmpd, lend | |||||
.inner_loop4: | |||||
; One float from each plane for rows j and j+1. | |||||
movss m0, [in0q] | |||||
movss m1, [in1q] | |||||
movss m2, [in0q+lenq] | |||||
movss m3, [in1q+lenq] | |||||
; Pack them as { p0[j], p1[j], p0[j+1], p1[j+1] } in m0. | |||||
movlhps m0, m1 | |||||
movlhps m2, m3 | |||||
shufps m0, m2, q2020 | |||||
movaps [outq], m0 | |||||
lea in0q, [in0q+lenq*2] | |||||
lea in1q, [in1q+lenq*2] | |||||
add outq, mmsize | |||||
sub tmpd, mmsize | |||||
jg .inner_loop4 | |||||
; out already advanced exactly one column stride inside the inner loop. | |||||
add inq, 4 | |||||
sub id, 1 | |||||
; The count is now even: handle a leftover pair, else enter the main loop | |||||
; only if at least four columns remain. | |||||
test id, 2 | |||||
jne .loop8 | |||||
cmp id, 4 | |||||
jge .loop16 | |||||
RET | |||||
;*********************************************************** | ;*********************************************************** | ||||
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64], | ;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64], | ||||
; float (*in)[32][2], | ; float (*in)[32][2], | ||||
@@ -44,6 +44,8 @@ void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2], | |||||
int i, int len); | int i, int len); | ||||
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2], | void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2], | ||||
int i, int len); | int i, int len); | ||||
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64], | |||||
int i, int len); | |||||
av_cold void ff_psdsp_init_x86(PSDSPContext *s) | av_cold void ff_psdsp_init_x86(PSDSPContext *s) | ||||
{ | { | ||||
@@ -52,6 +54,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s) | |||||
if (EXTERNAL_SSE(cpu_flags)) { | if (EXTERNAL_SSE(cpu_flags)) { | ||||
s->add_squares = ff_ps_add_squares_sse; | s->add_squares = ff_ps_add_squares_sse; | ||||
s->mul_pair_single = ff_ps_mul_pair_single_sse; | s->mul_pair_single = ff_ps_mul_pair_single_sse; | ||||
s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse; | |||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse; | s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse; | ||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse; | s->hybrid_analysis = ff_ps_hybrid_analysis_sse; | ||||
} | } | ||||