Merge FFTContext and MDCTContext

Originally committed as revision 19931 to svn://svn.ffmpeg.org/ffmpeg/trunk
16 years ago · 01b2214758
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -257,8 +257,8 @@ typedef struct {
     * @defgroup tables   Computed / set up during initialization.
     * @{
     */
    MDCTContext mdct;
    MDCTContext mdct_small;
    FFTContext mdct;
    FFTContext mdct_small;
    DSPContext dsp;
    int random_state;
    /** @} */
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -49,8 +49,8 @@ extern AACCoefficientsEncoder ff_aac_coders[];
 */
 typedef struct AACEncContext {
    PutBitContext pb;
    MDCTContext mdct1024;                        ///< long (1024 samples) frame transform context
    MDCTContext mdct128;                         ///< short (128 samples) frame transform context
    FFTContext mdct1024;                         ///< long (1024 samples) frame transform context
    FFTContext mdct128;                          ///< short (128 samples) frame transform context
    DSPContext  dsp;
    DECLARE_ALIGNED_16(FFTSample, output[2048]); ///< temporary buffer for MDCT input coefficients
    int16_t* samples;                            ///< saved preprocessed input
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -146,8 +146,8 @@ typedef struct {

 ///@defgroup imdct IMDCT
    int block_switch[AC3_MAX_CHANNELS];     ///< block switch flags                     (blksw)
    MDCTContext imdct_512;                  ///< for 512 sample IMDCT
    MDCTContext imdct_256;                  ///< for 256 sample IMDCT
    FFTContext imdct_512;                   ///< for 512 sample IMDCT
    FFTContext imdct_256;                   ///< for 256 sample IMDCT
 ///@}

 ///@defgroup opt optimization
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,9 +23,9 @@
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);

 void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);

 av_cold void ff_fft_init_arm(FFTContext *s)
 {
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -28,10 +28,10 @@ function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        ldr             r4,  [r0, #8]           @ tcos
        ldr             r5,  [r0, #12]          @ tsin
        ldr             r3,  [r0, #24]          @ revtab
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r5,  [r0, #36]          @ tsin
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12,  lsl #1
@@ -73,13 +73,12 @@ function ff_imdct_half_neon, export=1

        mov             r4,  r0
        mov             r6,  r1
        add             r0,  r0,  #16
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #4]           @ nbits
        ldr             r5,  [r4, #12]          @ tsin
        ldr             r4,  [r4, #8]           @ tcos
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r5,  [r4, #36]          @ tsin
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

@@ -134,7 +133,7 @@ function ff_imdct_half_neon, export=1
 function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #4]
        ldr             r3,  [r0, #28]
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3
@@ -171,10 +170,10 @@ function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        ldr             r4,  [r0, #8]           @ tcos
        ldr             r5,  [r0, #12]          @ tsin
        ldr             r3,  [r0, #24]          @ revtab
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r5,  [r0, #36]          @ tsin
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n  = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
@@ -224,7 +223,7 @@ function ff_mdct_calc_neon, export=1
        vst2.32         {d6[1],d7[1]}, [r10,:64]

        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        ldr             lr,  [r0, #28]          @ mdct_bits
        lsl             lr,  r12, lr            @ n  = 1 << nbits
        sub             r8,  r2,  #16           @ in1d
        add             r2,  r9,  #16           @ in0u
@@ -272,13 +271,12 @@ function ff_mdct_calc_neon, export=1

        mov             r4,  r0
        mov             r6,  r1
        add             r0,  r0,  #16
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #4]           @ nbits
        ldr             r5,  [r4, #12]          @ tsin
        ldr             r4,  [r4, #8]           @ tcos
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r5,  [r4, #36]          @ tsin
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -79,7 +79,7 @@ typedef struct {
    DECLARE_ALIGNED_16(float, high[512]);
    float*              bands[3];
    DECLARE_ALIGNED_16(float, out_samples[AT1_MAX_CHANNELS][AT1_SU_SAMPLES]);
    MDCTContext         mdct_ctx[3];
    FFTContext          mdct_ctx[3];
    int                 channels;
    DSPContext          dsp;
 } AT1Ctx;
@@ -94,7 +94,7 @@ static const uint8_t   mdct_long_nbits[3] = {7, 7, 8};
 static void at1_imdct(AT1Ctx *q, float *spec, float *out, int nbits,
                      int rev_spec)
 {
    MDCTContext* mdct_context;
    FFTContext* mdct_context;
    int transf_size = 1 << nbits;

    mdct_context = &q->mdct_ctx[nbits - 5 - (nbits > 6)];
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -123,7 +123,7 @@ static DECLARE_ALIGNED_16(float,mdct_window[512]);
 static VLC              spectral_coeff_tab[7];
 static float            gain_tab1[16];
 static float            gain_tab2[31];
 static MDCTContext      mdct_ctx;
 static FFTContext       mdct_ctx;
 static DSPContext       dsp;


--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -136,7 +136,7 @@ typedef struct cook {
    AVLFG               random_state;

    /* transform data */
    MDCTContext         mdct_ctx;
    FFTContext          mdct_ctx;
    float*              mlt_window;

    /* VLC data */
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -250,7 +250,7 @@ typedef struct {

    int debug_flag;             ///< used for suppressing repeated error messages output
    DSPContext dsp;
    MDCTContext imdct;
    FFTContext imdct;
 } DCAContext;

 static const uint16_t dca_vlc_offs[] = {
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -665,8 +665,6 @@ void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
   FFTSample type */
 typedef float FFTSample;

 struct MDCTContext;

 typedef struct FFTComplex {
    FFTSample re, im;
 } FFTComplex;
@@ -678,11 +676,16 @@ typedef struct FFTContext {
    FFTComplex *exptab;
    FFTComplex *exptab1; /* only used by SSE code */
    FFTComplex *tmp_buf;
    int mdct_size; /* size of MDCT (i.e. number of input data * 2) */
    int mdct_bits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
    void (*mdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    int split_radix;
 } FFTContext;

@@ -720,28 +723,19 @@ void ff_fft_end(FFTContext *s);

 /* MDCT computation */

 typedef struct MDCTContext {
    int n;  /* size of MDCT (i.e. number of input data * 2) */
    int nbits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    FFTContext fft;
 } MDCTContext;

 static inline void ff_imdct_calc(MDCTContext *s, FFTSample *output, const FFTSample *input)
 static inline void ff_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    s->fft.imdct_calc(s, output, input);
    s->imdct_calc(s, output, input);
 }
 static inline void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
 static inline void ff_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    s->fft.imdct_half(s, output, input);
    s->imdct_half(s, output, input);
 }

 static inline void ff_mdct_calc(MDCTContext *s, FFTSample *output,
 static inline void ff_mdct_calc(FFTContext *s, FFTSample *output,
                                const FFTSample *input)
 {
    s->fft.mdct_calc(s, output, input);
    s->mdct_calc(s, output, input);
 }

 /**
@@ -768,11 +762,11 @@ extern float ff_sine_2048[2048];
 extern float ff_sine_4096[4096];
 extern float * const ff_sine_windows[13];

 int ff_mdct_init(MDCTContext *s, int nbits, int inverse, double scale);
 void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_end(MDCTContext *s);
 int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
 void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_end(FFTContext *s);

 /* Real Discrete Fourier Transform */

--- a/libavcodec/fft-test.c
+++ b/libavcodec/fft-test.c
@@ -184,7 +184,7 @@ int main(int argc, char **argv)
    int do_mdct = 0;
    int do_inverse = 0;
    FFTContext s1, *s = &s1;
    MDCTContext m1, *m = &m1;
    FFTContext m1, *m = &m1;
    int fft_nbits, fft_size;
    double scale = 1.0;
    AVLFG prng;
--- a/libavcodec/mdct.c
+++ b/libavcodec/mdct.c
@@ -72,15 +72,15 @@ av_cold void ff_sine_window_init(float *window, int n) {
 /**
 * init MDCT or IMDCT computation.
 */
 av_cold int ff_mdct_init(MDCTContext *s, int nbits, int inverse, double scale)
 av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
 {
    int n, n4, i;
    double alpha, theta;

    memset(s, 0, sizeof(*s));
    n = 1 << nbits;
    s->nbits = nbits;
    s->n = n;
    s->mdct_bits = nbits;
    s->mdct_size = n;
    n4 = n >> 2;
    s->tcos = av_malloc(n4 * sizeof(FFTSample));
    if (!s->tcos)
@@ -96,7 +96,7 @@ av_cold int ff_mdct_init(MDCTContext *s, int nbits, int inverse, double scale)
        s->tcos[i] = -cos(alpha) * scale;
        s->tsin[i] = -sin(alpha) * scale;
    }
    if (ff_fft_init(&s->fft, s->nbits - 2, inverse) < 0)
    if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
        goto fail;
    return 0;
 fail:
@@ -122,16 +122,16 @@ av_cold int ff_mdct_init(MDCTContext *s, int nbits, int inverse, double scale)
 * @param output N/2 samples
 * @param input N/2 samples
 */
 void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    int k, n8, n4, n2, n, j;
    const uint16_t *revtab = s->fft.revtab;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)output;

    n = 1 << s->nbits;
    n = 1 << s->mdct_bits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
@@ -145,7 +145,7 @@ void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
        in1 += 2;
        in2 -= 2;
    }
    ff_fft_calc(&s->fft, z);
    ff_fft_calc(s, z);

    /* post rotation + reordering */
    for(k = 0; k < n8; k++) {
@@ -164,10 +164,10 @@ void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 * @param output N samples
 * @param input N/2 samples
 */
 void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    int k;
    int n = 1 << s->nbits;
    int n = 1 << s->mdct_bits;
    int n2 = n >> 1;
    int n4 = n >> 2;

@@ -184,16 +184,16 @@ void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 * @param input N samples
 * @param out N/2 samples
 */
 void ff_mdct_calc_c(MDCTContext *s, FFTSample *out, const FFTSample *input)
 void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
 {
    int i, j, n, n8, n4, n2, n3;
    FFTSample re, im;
    const uint16_t *revtab = s->fft.revtab;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *x = (FFTComplex *)out;

    n = 1 << s->nbits;
    n = 1 << s->mdct_bits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
@@ -212,7 +212,7 @@ void ff_mdct_calc_c(MDCTContext *s, FFTSample *out, const FFTSample *input)
        CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
    }

    ff_fft_calc(&s->fft, x);
    ff_fft_calc(s, x);

    /* post rotation */
    for(i=0;i<n8;i++) {
@@ -226,9 +226,9 @@ void ff_mdct_calc_c(MDCTContext *s, FFTSample *out, const FFTSample *input)
    }
 }

 av_cold void ff_mdct_end(MDCTContext *s)
 av_cold void ff_mdct_end(FFTContext *s)
 {
    av_freep(&s->tcos);
    av_freep(&s->tsin);
    ff_fft_end(&s->fft);
    ff_fft_end(s);
 }
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -50,7 +50,7 @@ typedef struct NellyMoserDecodeContext {
    int             add_bias;
    float           scale_bias;
    DSPContext      dsp;
    MDCTContext     imdct_ctx;
    FFTContext      imdct_ctx;
    DECLARE_ALIGNED_16(float,imdct_out[NELLY_BUF_LEN * 2]);
 } NellyMoserDecodeContext;

--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -52,7 +52,7 @@ typedef struct NellyMoserEncodeContext {
    int             bufsel;
    int             have_saved;
    DSPContext      dsp;
    MDCTContext     mdct_ctx;
    FFTContext      mdct_ctx;
    DECLARE_ALIGNED_16(float, mdct_out[NELLY_SAMPLES]);
    DECLARE_ALIGNED_16(float, in_buff[NELLY_SAMPLES]);
    DECLARE_ALIGNED_16(float, buf[2][3 * NELLY_BUF_LEN]);     ///< sample buffer
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -172,7 +172,7 @@ static const ModeTab mode_44_48 = {
 typedef struct TwinContext {
    AVCodecContext *avctx;
    DSPContext      dsp;
    MDCTContext mdct_ctx[3];
    FFTContext mdct_ctx[3];

    const ModeTab *mtab;

--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -128,7 +128,7 @@ typedef struct vorbis_context_s {
    GetBitContext gb;
    DSPContext dsp;

    MDCTContext mdct[2];
    FFTContext mdct[2];
    uint_fast8_t first_frame;
    uint_fast32_t version;
    uint_fast8_t audio_channels;
--- a/libavcodec/vorbis_enc.c
+++ b/libavcodec/vorbis_enc.c
@@ -98,7 +98,7 @@ typedef struct {
    int channels;
    int sample_rate;
    int log2_blocksize[2];
    MDCTContext mdct[2];
    FFTContext mdct[2];
    const float * win[2];
    int have_saved;
    float * saved;
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -116,7 +116,7 @@ typedef struct WMACodecContext {
    WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
    DECLARE_ALIGNED_16(float, coefs[MAX_CHANNELS][BLOCK_MAX_SIZE]);
    DECLARE_ALIGNED_16(FFTSample, output[BLOCK_MAX_SIZE * 2]);
    MDCTContext mdct_ctx[BLOCK_NB_SIZES];
    FFTContext mdct_ctx[BLOCK_NB_SIZES];
    float *windows[BLOCK_NB_SIZES];
    /* output buffer for one frame and the last for IMDCT windowing */
    DECLARE_ALIGNED_16(float, frame_out[MAX_CHANNELS][BLOCK_MAX_SIZE * 2]);
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -166,7 +166,7 @@ typedef struct WMAProDecodeCtx {
    uint8_t          frame_data[MAX_FRAMESIZE +
                      FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
    PutBitContext    pb;                            ///< context for filling the frame_data buffer
    MDCTContext      mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
    FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
    DECLARE_ALIGNED_16(float, tmp[WMAPRO_BLOCK_MAX_SIZE]); ///< IMDCT output buffer
    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes

--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -26,11 +26,11 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);

 void ff_imdct_calc_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);

 #endif
--- a/libavcodec/x86/fft_3dn2.c
+++ b/libavcodec/x86/fft_3dn2.c
@@ -53,14 +53,14 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
            FFSWAP(FFTSample, z[i].im, z[i+1].re);
 }

 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    x86_reg j, k;
    long n = 1 << s->nbits;
    long n = 1 << s->mdct_bits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    const uint16_t *revtab = s->fft.revtab;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
@@ -101,7 +101,7 @@ void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *inpu
        );
    }

    ff_fft_dispatch_3dn2(z, s->fft.nbits);
    ff_fft_dispatch_3dn2(z, s->nbits);

 #define CMUL(j,mm0,mm1)\
        "movq  (%2,"#j",2), %%mm6 \n"\
@@ -144,10 +144,10 @@ void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *inpu
    __asm__ volatile("femms");
 }

 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    x86_reg j, k;
    long n = 1 << s->nbits;
    long n = 1 << s->mdct_bits;
    long n4 = n >> 2;

    ff_imdct_half_3dn2(s, output+n4, input);
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@@ -71,14 +71,14 @@ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
 }

 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    av_unused x86_reg i, j, k, l;
    long n = 1 << s->nbits;
    long n = 1 << s->mdct_bits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    const uint16_t *revtab = s->fft.revtab + n8;
    const uint16_t *revtab = s->revtab + n8;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *z = (FFTComplex *)output;
@@ -129,7 +129,7 @@ void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
 #endif
    }

    ff_fft_dispatch_sse(z, s->fft.nbits);
    ff_fft_dispatch_sse(z, s->nbits);

    /* post rotation + reinterleave + reorder */

@@ -172,10 +172,10 @@ void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
    );
 }

 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    x86_reg j, k;
    long n = 1 << s->nbits;
    long n = 1 << s->mdct_bits;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);