sse & sse2 implementations of vorbis channel coupling.

9% faster vorbis (on a K8). Originally committed as revision 5898 to svn://svn.ffmpeg.org/ffmpeg/trunk
19 years ago · 2dac4acfc0
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -35,6 +35,9 @@
 /* snow.c */
 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

 /* vorbis.c */
 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
 uint32_t squareTbl[512] = {0, };

@@ -4090,6 +4093,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->inner_add_yblock = ff_snow_inner_add_yblock;
 #endif

 #ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
 #endif

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -307,6 +307,8 @@ typedef struct DSPContext {

    void (*h261_loop_filter)(uint8_t *src, int stride);

    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);

    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2711,6 +2711,59 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
 }
 #endif

 /**
  * Vorbis inverse channel coupling, SSE version, two floats per iteration.
  *
  * Only SSE1 packed-single instructions are used (no 3DNow!/MMX and no SSE2
  * integer operations), so the routine is safe on every CPU for which the
  * MM_SSE flag is set.
  *
  * For each element, with m = mag[i], a = ang[i] and
  * s = (0.0 <= m) ? -a : a, the vectors are updated in place as:
  *   0.0 <= a :  ang[i] = m + s;  mag[i] = m
  *   otherwise:  ang[i] = m;      mag[i] = m - s
  * Elements with m == 0.0 take the "0.0 <= m" path.
  * NOTE(review): the C vorbis_inverse_coupling() tests mag > 0.0 instead, so
  * results differ when mag is exactly zero and ang is not -- confirm whether
  * that matters to callers.
  *
  * @param mag       magnitude vector, updated in place (no alignment needed,
  *                  the data is moved with movlps)
  * @param ang       angle vector, updated in place (no alignment needed)
  * @param blocksize number of floats to process; elements are consumed in
  *                  pairs, so this should be a multiple of 2
  */
 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
 {
    /* IEEE-754 sign bit for each of the two floats handled per iteration */
    static const uint64_t sign_bits = 0x8000000080000000ULL;
    int i;
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            /* Clear whole registers first: movlps only writes the low
             * 64 bits, and the unused upper lanes must hold harmless 0.0 */
            "xorps   %%xmm0, %%xmm0 \n\t"
            "xorps   %%xmm1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "xorps   %%xmm5, %%xmm5 \n\t"
            "movlps  %0,     %%xmm0 \n\t" // m
            "movlps  %1,     %%xmm1 \n\t" // a
            "movlps  %2,     %%xmm5 \n\t" // sign-bit mask
            "cmpleps %%xmm0, %%xmm2 \n\t" // all ones where 0.0 <= m
            "cmpleps %%xmm1, %%xmm3 \n\t" // all ones where 0.0 <= a
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t" // s = (0.0 <= m) ? -a : a
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t" // s where 0.0 <= a, else 0
            "andnps  %%xmm1, %%xmm4 \n\t" // s where a < 0.0,  else 0
            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + ((0.0 <= a) & s)
            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - ((a < 0.0)  & s)
            "movlps  %%xmm3, %1     \n\t"
            "movlps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            :"m"(sign_bits)
            /* "memory": each operand names one float but 8 bytes are moved */
            :"memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
        );
    }
    /* No MMX state is touched, so no emms is required here. */
 }
 /**
  * Vorbis inverse channel coupling, SSE2 version, four floats per iteration.
  *
  * For each element, with m = mag[i], a = ang[i] and
  * s = (0.0 <= m) ? -a : a, the vectors are updated in place as:
  *   0.0 <= a :  ang[i] = m + s;  mag[i] = m
  *   otherwise:  ang[i] = m;      mag[i] = m - s
  * Elements with m == 0.0 take the "0.0 <= m" path.
  *
  * SSE2 is needed for the integer operations on xmm registers
  * (pxor/pslld/pand/pandn).
  *
  * @param mag       magnitude vector, updated in place; accessed with movaps,
  *                  which requires 16-byte aligned addresses
  * @param ang       angle vector, updated in place; same alignment requirement
  * @param blocksize number of floats to process; elements are consumed in
  *                  groups of 4, so this should be a multiple of 4
  */
 static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
 {
    int i;
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps  %0,     %%xmm0 \n\t" // m
            "movaps  %1,     %%xmm1 \n\t" // a
            "pxor    %%xmm2, %%xmm2 \n\t" // 0.0 in every lane
            "pxor    %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // all ones where 0.0 <= m
            "cmpleps %%xmm1, %%xmm3 \n\t" // all ones where 0.0 <= a
            "pslld   $31,    %%xmm2 \n\t" // keep only the sign bit
            "pxor    %%xmm2, %%xmm1 \n\t" // s = (0.0 <= m) ? -a : a
            "movaps  %%xmm3, %%xmm4 \n\t"
            "pand    %%xmm1, %%xmm3 \n\t" // s where 0.0 <= a, else 0
            "pandn   %%xmm1, %%xmm4 \n\t" // s where a < 0.0,  else 0
            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + ((0.0 <= a) & s)
            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - ((a < 0.0)  & s)
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            // "memory": each operand names one float but 16 bytes are moved
            ::"memory"
        );
    }
 }

 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3137,6 +3190,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
 #endif

        if(mm_flags & MM_SSE2)
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2;
        else if(mm_flags & MM_SSE)
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
    }

 #ifdef CONFIG_ENCODERS
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -929,6 +929,7 @@ static int vorbis_decode_init(AVCodecContext *avccontext) {
    int i, j, hdr_type;

    vc->avccontext = avccontext;
    dsputil_init(&vc->dsp, avccontext);

    if (!headers_len) {
        av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n");
@@ -1443,6 +1444,31 @@ static int vorbis_residue_decode(vorbis_context *vc, vorbis_residue *vr, uint_fa
    return 0;
 }

 /**
  * Portable C implementation of Vorbis inverse channel coupling.
  *
  * Rewrites each (mag[k], ang[k]) pair in place:
  *   ang > 0, mag > 0 :  ang = mag - ang
  *   ang > 0, else    :  ang = ang + mag
  *   else,    mag > 0 :  ang = mag;  mag = mag + old ang
  *   else,    else    :  ang = mag;  mag = mag - old ang
  * mag is left untouched whenever ang > 0.
  *
  * @param mag       magnitude vector, updated in place
  * @param ang       angle vector, updated in place
  * @param blocksize number of elements to process
  */
 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize)
 {
    int k;

    for (k = 0; k < blocksize; k++) {
        /* Snapshot both inputs before either slot is overwritten. */
        const float m = mag[k];
        const float a = ang[k];

        if (a > 0.0) {
            /* Positive angle: only the angle slot changes. */
            ang[k] = (m > 0.0) ? m - a : a + m;
        } else {
            /* Otherwise the angle slot takes the magnitude, and the
             * magnitude slot moves by the old angle. */
            ang[k] = m;
            mag[k] = (m > 0.0) ? m + a : m - a;
        }
    }
 }

 // Decode the audio packet using the functions above
 #define BIAS 385

@@ -1541,26 +1567,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {

        mag=vc->channel_residues+res_chan[mapping->magnitude[i]]*blocksize/2;
        ang=vc->channel_residues+res_chan[mapping->angle[i]]*blocksize/2;
        for(j=0;j<blocksize/2;++j) {
            float temp;
            if (mag[j]>0.0) {
                if (ang[j]>0.0) {
                    ang[j]=mag[j]-ang[j];
                } else {
                    temp=ang[j];
                    ang[j]=mag[j];
                    mag[j]+=temp;
                }
            } else {
                if (ang[j]>0.0) {
                    ang[j]+=mag[j];
                } else {
                    temp=ang[j];
                    ang[j]=mag[j];
                    mag[j]-=temp;
                }
            }
        }
        vc->dsp.vorbis_inverse_coupling(mag, ang, blocksize/2);
    }

 // Dotproduct
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -87,6 +87,7 @@ typedef struct {
 typedef struct vorbis_context_s {
    AVCodecContext *avccontext;
    GetBitContext gb;
    DSPContext dsp;

    MDCTContext mdct0;
    MDCTContext mdct1;