Partially based on patches by clsid2 in ffdshow-tryout.
ff_float_interleave6() x86 improvements by Loren Merritt.
@@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
     }
 }

void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels)
{
    int j, c;
    unsigned int i;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i]   = src[0][i];
            dst[2*i+1] = src[1][i];
        }
    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
        memcpy(dst, src[0], len * sizeof(float));
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = src[c][i];
    }
}
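/* Hypothetical standalone sketch (not part of the patch): a self-contained
 * demo of the interleaving semantics above, reusing the generic loop.
 * Two 4-sample planar channels become one interleaved L R L R ... buffer. */
#include <stdio.h>

static void interleave_demo(float *dst, const float **src, unsigned int len,
                            int channels)
{
    int c;
    unsigned int i, j;
    for (c = 0; c < channels; c++)
        for (i = 0, j = c; i < len; i++, j += channels)
            dst[j] = src[c][i];
}

int main(void)
{
    const float l[4] = { 0, 1, 2, 3 }, r[4] = { 10, 11, 12, 13 };
    const float *src[2] = { l, r };
    float dst[8];
    unsigned int i;

    interleave_demo(dst, src, 4, 2);
    for (i = 0; i < 8; i++)
        printf("%g ", dst[i]);   /* prints: 0 10 1 11 2 12 3 13 */
    printf("\n");
    return 0;
}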
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16             = float_to_int16_c;
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_interleave           = ff_float_interleave_c;

    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
    if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
@@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
     */
    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                      long len, int channels);

    /**
     * Convert multiple arrays of float to an array of interleaved float.
     */
    void (*float_interleave)(float *dst, const float **src, unsigned int len,
                             int channels);
} FmtConvertContext;
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels);

void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
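/* A hedged usage sketch (illustrative fragment only; 'avctx', 'out', 'planes',
 * 'nb_samples' and 'channels' are assumed to exist in the calling decoder,
 * and the libavcodec-internal headers are assumed to be available): */
FmtConvertContext fmt;
ff_fmt_convert_init(&fmt, avctx);   /* selects C, ARM, AltiVec or x86 code */

/* per frame: interleave 'channels' planar float buffers into 'out' */
fmt.float_interleave(out, (const float **)planes, nb_samples, channels);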
@@ -20,6 +20,7 @@
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

section .text align=16

@@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro BUTTERFLYPS 3
    movaps   m%3, m%1
    unpcklps m%1, m%2
    unpckhps m%3, m%2
    SWAP %2, %3
%endmacro
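; Given m%1 = {a0,a1,a2,a3} and m%2 = {b0,b1,b2,b3}, BUTTERFLYPS leaves
;     m%1 = {a0,b0,a1,b1}
;     m%2 = {a2,b2,a3,b3}
; i.e. it interleaves two channel registers pairwise (m%3 is scratch;
; SWAP only renames registers at assembly time).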
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov      lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
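; Channels 1-5 are converted to byte offsets relative to channel 0, so the
; loop below only advances a single pointer (srcq) and reaches each of the
; remaining channels as [srcq+srcNq].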
.loop:
%ifidn %1, sse
    movaps   m0, [srcq]
    movaps   m1, [srcq+src1q]
    movaps   m2, [srcq+src2q]
    movaps   m3, [srcq+src3q]
    movaps   m4, [srcq+src4q]
    movaps   m5, [srcq+src5q]

    BUTTERFLYPS 0, 1, 6
    BUTTERFLYPS 2, 3, 6
    BUTTERFLYPS 4, 5, 6
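; After the butterflies, with cN[i] = sample i of channel N:
;     m0 = {c0[0],c1[0],c0[1],c1[1]}   m1 = {c0[2],c1[2],c0[3],c1[3]}
;     m2 = {c2[0],c3[0],c2[1],c3[1]}   m3 = {c2[2],c3[2],c2[3],c3[3]}
;     m4 = {c4[0],c5[0],c4[1],c5[1]}   m5 = {c4[2],c5[2],c4[3],c5[3]}
; The shuffles below gather these into six interleaved 4-float output rows.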
    movaps   m6, m4
    shufps   m4, m0, 0xe4
    movlhps  m0, m2
    movhlps  m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps   m6, m5
    shufps   m5, m1, 0xe4
    movlhps  m1, m3
    movhlps  m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq m0, [srcq]
    movq m1, [srcq+src1q]
    movq m2, [srcq+src2q]
    movq m3, [srcq+src3q]
    movq m4, [srcq+src4q]
    movq m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
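; SBUTTERFLY dq interleaves the two floats of each register pair, so
; m0/m2/m4 now hold sample 0 of channel pairs (0,1)/(2,3)/(4,5) and
; m1/m3/m5 hold sample 1; the stores below write them in output order.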
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif

    add srcq, mmsize
    add dstq, mmsize*6
    sub lend, mmsize/4
    jg .loop
%ifidn %1, mmx
    emms
%endif
    REP_RET
%endmacro

INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE2 2
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
    mov   src1q, [srcq+gprsize]
    mov    srcq, [srcq        ]
    sub   src1q, srcq
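; Same base+offset trick as float_interleave6 above: src1q becomes the byte
; offset of channel 1 relative to channel 0, so only srcq is advanced.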
.loop:
    MOVPS     m0, [srcq             ]
    MOVPS     m1, [srcq+src1q       ]
    MOVPS     m3, [srcq       +mmsize]
    MOVPS     m4, [srcq+src1q +mmsize]

    MOVPS     m2, m0
    PUNPCKLDQ m0, m1
    PUNPCKHDQ m2, m1
    MOVPS     m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    MOVPS [dstq         ], m0
    MOVPS [dstq+1*mmsize], m2
    MOVPS [dstq+2*mmsize], m3
    MOVPS [dstq+3*mmsize], m1
    add srcq, mmsize*2
    add dstq, mmsize*4
    sub lend, mmsize/2
    jg .loop
%ifidn %1, mmx
    emms
%endif
    REP_RET
%endmacro

INIT_MMX
%define MOVPS     movq
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 mmx, 0

INIT_XMM
%define MOVPS     movaps
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 sse, 5
@@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
    float_to_int16_interleave_3dnow(dst, src, len, channels);
}

void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);

static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_mmx(dst, src, len);
    } else if (channels == 6) {
        ff_float_interleave6_mmx(dst, src, len);
    } else {
        ff_float_interleave_c(dst, src, len, channels);
    }
}

static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_sse(dst, src, len);
    } else if (channels == 6) {
        ff_float_interleave6_sse(dst, src, len);
    } else {
        ff_float_interleave_c(dst, src, len, channels);
    }
}
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->float_interleave = float_interleave_mmx;

        if (mm_flags & AV_CPU_FLAG_3DNOW) {
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {

@@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->float_to_int16             = float_to_int16_sse;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse;
            c->float_interleave           = float_interleave_sse;
        }
        if (mm_flags & AV_CPU_FLAG_SSE2) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;