new generic FFT/MDCT code for audio codecs

Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk
23 years ago · bb6f569072
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -221,5 +221,52 @@ struct unaligned_32 { uint32_t l; } __attribute__((packed));
 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
              int orig_linesize[3], int coded_linesize,
              AVCodecContext *avctx);
              

 /* FFT computation */

 /* NOTE: soon integer code will be added, so you must use the
   FFTSample type */
 typedef float FFTSample;

 typedef struct FFTComplex {
    FFTSample re, im;
 } FFTComplex;

 typedef struct FFTContext {
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *exptab;
    FFTComplex *exptab1; /* only used by SSE code */
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
 } FFTContext;

 int fft_init(FFTContext *s, int nbits, int inverse);
 void fft_permute(FFTContext *s, FFTComplex *z);
 void fft_calc_c(FFTContext *s, FFTComplex *z);
 void fft_calc_sse(FFTContext *s, FFTComplex *z);
 static inline void fft_calc(FFTContext *s, FFTComplex *z)
 {
    s->fft_calc(s, z);
 }
 void fft_end(FFTContext *s);

 /* MDCT computation */

 typedef struct MDCTContext {
    int n;  /* size of MDCT (i.e. number of input data * 2) */
    int nbits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    FFTContext fft;
 } MDCTContext;

 int mdct_init(MDCTContext *s, int nbits, int inverse);
 void imdct_calc(MDCTContext *s, FFTSample *output, 
                const FFTSample *input, FFTSample *tmp);
 void mdct_calc(MDCTContext *s, FFTSample *out, 
               const FFTSample *input, FFTSample *tmp);
 void mdct_end(MDCTContext *s);

 #endif
--- a/libavcodec/fft-test.c
+++ b/libavcodec/fft-test.c
@@ -0,0 +1,294 @@
 /* FFT and MDCT tests */
 #include "dsputil.h"
 #include <math.h>
 #include <getopt.h>
 #include <sys/time.h>

 int mm_flags;

 void *av_malloc(int size)
 {
    void *ptr;
    ptr = malloc(size);
    return ptr;
 }

 void av_free(void *ptr)
 {
    /* XXX: this test should not be needed on most libcs */
    if (ptr)
        free(ptr);
 }

 /* cannot call it directly because of 'void **' casting is not automatic */
 void __av_freep(void **ptr)
 {
    av_free(*ptr);
    *ptr = NULL;
 }

 /* reference fft */

 #define MUL16(a,b) ((a) * (b))

 #define CMAC(pre, pim, are, aim, bre, bim) \
 {\
   pre += (MUL16(are, bre) - MUL16(aim, bim));\
   pim += (MUL16(are, bim) + MUL16(bre, aim));\
 }

 FFTComplex *exptab;

 void fft_ref_init(int nbits, int inverse)
 {
    int n, i;
    float c1, s1, alpha;

    n = 1 << nbits;
    exptab = av_malloc((n / 2) * sizeof(FFTComplex));

    for(i=0;i<(n/2);i++) {
        alpha = 2 * M_PI * (float)i / (float)n;
        c1 = cos(alpha);
        s1 = sin(alpha);
        if (!inverse)
            s1 = -s1;
        exptab[i].re = c1;
        exptab[i].im = s1;
    }
 }

 void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits)
 {
    int n, i, j, k, n2;
    float tmp_re, tmp_im, s, c;
    FFTComplex *q;

    n = 1 << nbits;
    n2 = n >> 1;
    for(i=0;i<n;i++) {
        tmp_re = 0;
        tmp_im = 0;
        q = tab;
        for(j=0;j<n;j++) {
            k = (i * j) & (n - 1);
            if (k >= n2) {
                c = -exptab[k - n2].re;
                s = -exptab[k - n2].im;
            } else {
                c = exptab[k].re;
                s = exptab[k].im;
            }
            CMAC(tmp_re, tmp_im, c, s, q->re, q->im);
            q++;
        }
        tabr[i].re = tmp_re;
        tabr[i].im = tmp_im;
    }
 }

 void imdct_ref(float *out, float *in, int n)
 {
    int k, i, a;
    float sum, f;

    for(i=0;i<n;i++) {
        sum = 0;
        for(k=0;k<n/2;k++) {
            a = (2 * i + 1 + (n / 2)) * (2 * k + 1);
            f = cos(M_PI * a / (double)(2 * n));
            sum += f * in[k];
        }
        out[i] = -sum;
    }
 }

 /* NOTE: no normalisation by 1 / N is done */
 void mdct_ref(float *output, float *input, int n)
 {
    int k, i;
    float a, s;

    /* do it by hand */
    for(k=0;k<n/2;k++) {
        s = 0;
        for(i=0;i<n;i++) {
            a = (2*M_PI*(2*i+1+n/2)*(2*k+1) / (4 * n));
            s += input[i] * cos(a);
        }
        output[k] = s;
    }
 }


 float frandom(void)
 {
    return (float)((random() & 0xffff) - 32768) / 32768.0;
 }

 INT64 gettime(void)
 {
    struct timeval tv;
    gettimeofday(&tv,NULL);
    return (INT64)tv.tv_sec * 1000000 + tv.tv_usec;
 }

 void check_diff(float *tab1, float *tab2, int n)
 {
    int i;

    for(i=0;i<n;i++) {
        if (fabsf(tab1[i] - tab2[i]) >= 1e-3) {
            printf("ERROR %d: %f %f\n", 
                   i, tab1[i], tab2[i]);
        }
    }
 }


 void help(void)
 {
    printf("usage: fft-test [-h] [-s] [-i] [-n b]\n"
           "-h     print this help\n"
           "-s     speed test\n"
           "-m     (I)MDCT test\n"
           "-i     inverse transform test\n"
           "-n b   set the transform size to 2^b\n"
           );
    exit(1);
 }



 int main(int argc, char **argv)
 {
    FFTComplex *tab, *tab1, *tab_ref;
    FFTSample *tabtmp, *tab2;
    int it, i, c;
    int do_speed = 0;
    int do_mdct = 0;
    int do_inverse = 0;
    FFTContext s1, *s = &s1;
    MDCTContext m1, *m = &m1;
    int fft_nbits, fft_size;

    mm_flags = 0;
    fft_nbits = 9;
    for(;;) {
        c = getopt(argc, argv, "hsimn:");
        if (c == -1)
            break;
        switch(c) {
        case 'h':
            help();
            break;
        case 's':
            do_speed = 1;
            break;
        case 'i':
            do_inverse = 1;
            break;
        case 'm':
            do_mdct = 1;
            break;
        case 'n':
            fft_nbits = atoi(optarg);
            break;
        }
    }

    fft_size = 1 << fft_nbits;
    tab = av_malloc(fft_size * sizeof(FFTComplex));
    tab1 = av_malloc(fft_size * sizeof(FFTComplex));
    tab_ref = av_malloc(fft_size * sizeof(FFTComplex));
    tabtmp = av_malloc(fft_size / 2 * sizeof(FFTSample));
    tab2 = av_malloc(fft_size * sizeof(FFTSample));

    if (do_mdct) {
        if (do_inverse)
            printf("IMDCT");
        else
            printf("MDCT");
        mdct_init(m, fft_nbits, do_inverse);
    } else {
        if (do_inverse)
            printf("IFFT");
        else
            printf("FFT");
        fft_init(s, fft_nbits, do_inverse);
        fft_ref_init(fft_nbits, do_inverse);
    }
    printf(" %d test\n", fft_size);

    /* generate random data */

    for(i=0;i<fft_size;i++) {
        tab1[i].re = frandom();
        tab1[i].im = frandom();
    }

    /* checking result */
    printf("Checking...\n");

    if (do_mdct) {
        if (do_inverse) {
            imdct_ref((float *)tab_ref, (float *)tab1, fft_size);
            imdct_calc(m, tab2, (float *)tab1, tabtmp);
            check_diff((float *)tab_ref, tab2, fft_size);
        } else {
            mdct_ref((float *)tab_ref, (float *)tab1, fft_size);
            
            mdct_calc(m, tab2, (float *)tab1, tabtmp);

            check_diff((float *)tab_ref, tab2, fft_size / 2);
        }
    } else {
        memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
        fft_permute(s, tab);
        fft_calc(s, tab);
        
        fft_ref(tab_ref, tab1, fft_nbits);
        check_diff((float *)tab_ref, (float *)tab, fft_size * 2);
    }

    /* do a speed test */

    if (do_speed) {
        INT64 time_start, duration;
        int nb_its;

        printf("Speed test...\n");
        /* we measure during about 1 seconds */
        nb_its = 1;
        for(;;) {
            time_start = gettime();
            for(it=0;it<nb_its;it++) {
                if (do_mdct) {
                    if (do_inverse) {
                        imdct_calc(m, (float *)tab, (float *)tab1, tabtmp);
                    } else {
                        mdct_calc(m, (float *)tab, (float *)tab1, tabtmp);
                    }
                } else {
                    memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
                    fft_calc(s, tab);
                }
            }
            duration = gettime() - time_start;
            if (duration >= 1000000)
                break;
            nb_its *= 2;
        }
        printf("time: %0.1f us/transform [total time=%0.2f s its=%d]\n", 
               (double)duration / nb_its, 
               (double)duration / 1000000.0,
               nb_its);
    }
    
    if (do_mdct) {
        mdct_end(m);
    } else {
        fft_end(s);
    }
    return 0;
 }
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -0,0 +1,229 @@
 /*
 * FFT/IFFT transforms
 * Copyright (c) 2002 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include "dsputil.h"

 /**
 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
 * done 
 */
 int fft_init(FFTContext *s, int nbits, int inverse)
 {
    int i, j, m, n;
    float alpha, c1, s1, s2;
    
    s->nbits = nbits;
    n = 1 << nbits;

    s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
    if (!s->exptab)
        goto fail;
    s->revtab = av_malloc(n * sizeof(uint16_t));
    if (!s->revtab)
        goto fail;
    s->inverse = inverse;

    s2 = inverse ? 1.0 : -1.0;
        
    for(i=0;i<(n/2);i++) {
        alpha = 2 * M_PI * (float)i / (float)n;
        c1 = cos(alpha);
        s1 = sin(alpha) * s2;
        s->exptab[i].re = c1;
        s->exptab[i].im = s1;
    }
    s->fft_calc = fft_calc_c;
    s->exptab1 = NULL;

    /* compute constant table for HAVE_SSE version */
 #if defined(HAVE_MMX) && 0
    if (mm_flags & MM_SSE) {
        int np, nblocks, np2, l;
        FFTComplex *q;

        np = 1 << nbits;
        nblocks = np >> 3;
        np2 = np >> 1;
        s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
        if (!s->exptab1)
            goto fail;
        q = s->exptab1;
        do {
            for(l = 0; l < np2; l += 2 * nblocks) {
                *q++ = s->exptab[l];
                *q++ = s->exptab[l + nblocks];

                q->re = -s->exptab[l].im;
                q->im = s->exptab[l].re;
                q++;
                q->re = -s->exptab[l + nblocks].im;
                q->im = s->exptab[l + nblocks].re;
                q++;
            }
            nblocks = nblocks >> 1;
        } while (nblocks != 0);
        av_freep(&s->exptab);
    }
 #endif

    /* compute bit reverse table */

    for(i=0;i<n;i++) {
        m=0;
        for(j=0;j<nbits;j++) {
            m |= ((i >> j) & 1) << (nbits-j-1);
        }
        s->revtab[i]=m;
    }
    return 0;
 fail:
    av_freep(&s->revtab);
    av_freep(&s->exptab);
    av_freep(&s->exptab1);
    return -1;
 }

 /* butter fly op */
 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
 {\
  FFTSample ax, ay, bx, by;\
  bx=pre1;\
  by=pim1;\
  ax=qre1;\
  ay=qim1;\
  pre = (bx + ax);\
  pim = (by + ay);\
  qre = (bx - ax);\
  qim = (by - ay);\
 }

 #define MUL16(a,b) ((a) * (b))

 #define CMUL(pre, pim, are, aim, bre, bim) \
 {\
   pre = (MUL16(are, bre) - MUL16(aim, bim));\
   pim = (MUL16(are, bim) + MUL16(bre, aim));\
 }

 /**
 * Do a complex FFT with the parameters defined in fft_init(). The
 * input data must be permuted before with s->revtab table. No
 * 1.0/sqrt(n) normalization is done.  
 */
 void fft_calc_c(FFTContext *s, FFTComplex *z)
 {
    int ln = s->nbits;
    int	j, np, np2;
    int	nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *exptab = s->exptab;
    int l;
    FFTSample tmp_re, tmp_im;

    np = 1 << ln;

    /* pass 0 */

    p=&z[0];
    j=(np >> 1);
    do {
        BF(p[0].re, p[0].im, p[1].re, p[1].im, 
           p[0].re, p[0].im, p[1].re, p[1].im);
        p+=2;
    } while (--j != 0);

    /* pass 1 */

    
    p=&z[0];
    j=np >> 2;
    if (s->inverse) {
        do {
            BF(p[0].re, p[0].im, p[2].re, p[2].im, 
               p[0].re, p[0].im, p[2].re, p[2].im);
            BF(p[1].re, p[1].im, p[3].re, p[3].im, 
               p[1].re, p[1].im, -p[3].im, p[3].re);
            p+=4;
        } while (--j != 0);
    } else {
        do {
            BF(p[0].re, p[0].im, p[2].re, p[2].im, 
               p[0].re, p[0].im, p[2].re, p[2].im);
            BF(p[1].re, p[1].im, p[3].re, p[3].im, 
               p[1].re, p[1].im, p[3].im, -p[3].re);
            p+=4;
        } while (--j != 0);
    }
    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;
    do {
        p = z;
        q = z + nloops;
        for (j = 0; j < nblocks; ++j) {
            BF(p->re, p->im, q->re, q->im,
               p->re, p->im, q->re, q->im);
            
            p++;
            q++;
            for(l = nblocks; l < np2; l += nblocks) {
                CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
                BF(p->re, p->im, q->re, q->im,
                   p->re, p->im, tmp_re, tmp_im);
                p++;
                q++;
            }

            p += nloops;
            q += nloops;
        }
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks != 0);
 }

 /**
 * Do the permutation needed BEFORE calling fft_calc()
 */
 void fft_permute(FFTContext *s, FFTComplex *z)
 {
    int j, k, np;
    FFTComplex tmp;
    const uint16_t *revtab = s->revtab;
    
    /* reverse */
    np = 1 << s->nbits;
    for(j=0;j<np;j++) {
        k = revtab[j];
        if (k < j) {
            tmp = z[k];
            z[k] = z[j];
            z[j] = tmp;
        }
    }
 }

 void fft_end(FFTContext *s)
 {
    av_freep(&s->revtab);
    av_freep(&s->exptab);
    av_freep(&s->exptab1);
 }

--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -0,0 +1,128 @@
 /*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2002 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include "../dsputil.h"
 #include <math.h>

 #include <xmmintrin.h>

 static const float p1p1p1m1[4] __attribute__((aligned(16))) = 
    { 1.0, 1.0, 1.0, -1.0 };

 static const float p1p1m1m1[4] __attribute__((aligned(16))) = 
    { 1.0, 1.0, -1.0, -1.0 };

 #if 0
 static void print_v4sf(const char *str, __m128 a)
 {
    float *p = (float *)&a;
    printf("%s: %f %f %f %f\n",
           str, p[0], p[1], p[2], p[3]);
 }
 #endif

 /* XXX: handle reverse case */
 void fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
    int ln = s->nbits;
    int	j, np, np2;
    int	nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

    np = 1 << ln;

    {
        __m128 *r, a, b, a1, c1, c2;

        r = (__m128 *)&z[0];
        c1 = *(__m128 *)p1p1m1m1;
        c2 = *(__m128 *)p1p1p1m1;
        j = (np >> 2);
        do {
            a = r[0];
            b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
            a = _mm_mul_ps(a, c1);
            /* do the pass 0 butterfly */
            a = _mm_add_ps(a, b);

            a1 = r[1];
            b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
            a1 = _mm_mul_ps(a1, c1);
            /* do the pass 0 butterfly */
            b = _mm_add_ps(a1, b);

            /* multiply third by -i */
            b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
            b = _mm_mul_ps(b, c2);

            /* do the pass 1 butterfly */
            r[0] = _mm_add_ps(a, b);
            r[1] = _mm_sub_ps(a, b);
            r += 2;
        } while (--j != 0);
    }
    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;

    cptr1 = s->exptab1;
    do {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            cptr = cptr1;
            k = nloops >> 1;
            do {
                __m128 a, b, c, t1, t2;

                a = *(__m128 *)p;
                b = *(__m128 *)q;
                
                /* complex mul */
                c = *(__m128 *)cptr;
                /*  cre*re cim*re */
                t1 = _mm_mul_ps(c, 
                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); 
                c = *(__m128 *)(cptr + 2);
                /*  -cim*im cre*im */
                t2 = _mm_mul_ps(c,
                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); 
                b = _mm_add_ps(t1, t2);
                
                /* butterfly */
                *(__m128 *)p = _mm_add_ps(a, b);
                *(__m128 *)q = _mm_sub_ps(a, b);
                
                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);
        
            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks != 0);
 }
--- a/libavcodec/mdct.c
+++ b/libavcodec/mdct.c
@@ -0,0 +1,170 @@
 /*
 * MDCT/IMDCT transforms
 * Copyright (c) 2002 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include "dsputil.h"

 /*
 * init MDCT or IMDCT computation
 */
 int mdct_init(MDCTContext *s, int nbits, int inverse)
 {
    int n, n4, i;
    float alpha;

    memset(s, 0, sizeof(*s));
    n = 1 << nbits;
    s->nbits = nbits;
    s->n = n;
    n4 = n >> 2;
    s->tcos = malloc(n4 * sizeof(FFTSample));
    if (!s->tcos)
        goto fail;
    s->tsin = malloc(n4 * sizeof(FFTSample));
    if (!s->tsin)
        goto fail;

    for(i=0;i<n4;i++) {
        alpha = 2 * M_PI * (i + 1.0 / 8.0) / n;
        s->tcos[i] = -cos(alpha);
        s->tsin[i] = -sin(alpha);
    }
    if (fft_init(&s->fft, s->nbits - 2, inverse) < 0)
        goto fail;
    return 0;
 fail:
    av_freep(&s->tcos);
    av_freep(&s->tsin);
    return -1;
 }

 /* complex multiplication: p = a * b */
 #define CMUL(pre, pim, are, aim, bre, bim) \
 {\
    float _are = (are);\
    float _aim = (aim);\
    float _bre = (bre);\
    float _bim = (bim);\
    (pre) = _are * _bre - _aim * _bim;\
    (pim) = _are * _bim + _aim * _bre;\
 }

 /**
 * Compute inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 * @param tmp N/2 samples
 */
 void imdct_calc(MDCTContext *s, FFTSample *output, 
                const FFTSample *input, FFTSample *tmp)
 {
    int k, n8, n4, n2, n, j;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)tmp;

    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    for(k = 0; k < n4; k++) {
        j=revtab[k];
        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
    }
    fft_calc(&s->fft, z);

    /* post rotation + reordering */
    /* XXX: optimize */
    for(k = 0; k < n4; k++) {
        CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]);
    }
    for(k = 0; k < n8; k++) {
        output[2*k] = -z[n8 + k].im;
        output[n2-1-2*k] = z[n8 + k].im;

        output[2*k+1] = z[n8-1-k].re;
        output[n2-1-2*k-1] = -z[n8-1-k].re;

        output[n2 + 2*k]=-z[k+n8].re;
        output[n-1- 2*k]=-z[k+n8].re;

        output[n2 + 2*k+1]=z[n8-k-1].im;
        output[n-2 - 2 * k] = z[n8-k-1].im;
    }
 }

 /**
 * Compute MDCT of size N = 2^nbits
 * @param input N samples
 * @param out N/2 samples
 * @param tmp temporary storage of N/2 samples
 */
 void mdct_calc(MDCTContext *s, FFTSample *out, 
               const FFTSample *input, FFTSample *tmp)
 {
    int i, j, n, n8, n4, n2, n3;
    FFTSample re, im, re1, im1;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *x = (FFTComplex *)tmp;

    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    n3 = 3 * n4;

    /* pre rotation */
    for(i=0;i<n8;i++) {
        re = -input[2*i+3*n4] - input[n3-1-2*i];
        im = -input[n4+2*i] + input[n4-1-2*i];
        j = revtab[i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);

        re = input[2*i] - input[n2-1-2*i];
        im = -(input[n2+2*i] + input[n-1-2*i]);
        j = revtab[n8 + i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
    }

    fft_calc(&s->fft, x);
  
    /* post rotation */
    for(i=0;i<n4;i++) {
        re = x[i].re;
        im = x[i].im;
        CMUL(re1, im1, re, im, -tsin[i], -tcos[i]);
        out[2*i] = im1;
        out[n2-1-2*i] = re1;
    }
 }

 void mdct_end(MDCTContext *s)
 {
    av_freep(&s->tcos);
    av_freep(&s->tsin);
    fft_end(&s->fft);
 }