Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunktags/v0.5
| @@ -221,5 +221,52 @@ struct unaligned_32 { uint32_t l; } __attribute__((packed)); | |||
| void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3], | |||
| int orig_linesize[3], int coded_linesize, | |||
| AVCodecContext *avctx); | |||
| /* FFT computation */ | |||
| /* NOTE: soon integer code will be added, so you must use the | |||
| FFTSample type */ | |||
| typedef float FFTSample; | |||
| typedef struct FFTComplex { | |||
| FFTSample re, im; | |||
| } FFTComplex; | |||
| typedef struct FFTContext { | |||
| int nbits; | |||
| int inverse; | |||
| uint16_t *revtab; | |||
| FFTComplex *exptab; | |||
| FFTComplex *exptab1; /* only used by SSE code */ | |||
| void (*fft_calc)(struct FFTContext *s, FFTComplex *z); | |||
| } FFTContext; | |||
| int fft_init(FFTContext *s, int nbits, int inverse); | |||
| void fft_permute(FFTContext *s, FFTComplex *z); | |||
| void fft_calc_c(FFTContext *s, FFTComplex *z); | |||
| void fft_calc_sse(FFTContext *s, FFTComplex *z); | |||
| static inline void fft_calc(FFTContext *s, FFTComplex *z) | |||
| { | |||
| s->fft_calc(s, z); | |||
| } | |||
| void fft_end(FFTContext *s); | |||
| /* MDCT computation */ | |||
| typedef struct MDCTContext { | |||
| int n; /* size of MDCT (i.e. number of input data * 2) */ | |||
| int nbits; /* n = 2^nbits */ | |||
| /* pre/post rotation tables */ | |||
| FFTSample *tcos; | |||
| FFTSample *tsin; | |||
| FFTContext fft; | |||
| } MDCTContext; | |||
| int mdct_init(MDCTContext *s, int nbits, int inverse); | |||
| void imdct_calc(MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| void mdct_calc(MDCTContext *s, FFTSample *out, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| void mdct_end(MDCTContext *s); | |||
| #endif | |||
| @@ -0,0 +1,294 @@ | |||
| /* FFT and MDCT tests */ | |||
| #include "dsputil.h" | |||
| #include <math.h> | |||
| #include <getopt.h> | |||
| #include <sys/time.h> | |||
| int mm_flags; | |||
| void *av_malloc(int size) | |||
| { | |||
| void *ptr; | |||
| ptr = malloc(size); | |||
| return ptr; | |||
| } | |||
| void av_free(void *ptr) | |||
| { | |||
| /* XXX: this test should not be needed on most libcs */ | |||
| if (ptr) | |||
| free(ptr); | |||
| } | |||
| /* cannot call it directly because of 'void **' casting is not automatic */ | |||
| void __av_freep(void **ptr) | |||
| { | |||
| av_free(*ptr); | |||
| *ptr = NULL; | |||
| } | |||
| /* reference fft */ | |||
| #define MUL16(a,b) ((a) * (b)) | |||
| #define CMAC(pre, pim, are, aim, bre, bim) \ | |||
| {\ | |||
| pre += (MUL16(are, bre) - MUL16(aim, bim));\ | |||
| pim += (MUL16(are, bim) + MUL16(bre, aim));\ | |||
| } | |||
| FFTComplex *exptab; | |||
| void fft_ref_init(int nbits, int inverse) | |||
| { | |||
| int n, i; | |||
| float c1, s1, alpha; | |||
| n = 1 << nbits; | |||
| exptab = av_malloc((n / 2) * sizeof(FFTComplex)); | |||
| for(i=0;i<(n/2);i++) { | |||
| alpha = 2 * M_PI * (float)i / (float)n; | |||
| c1 = cos(alpha); | |||
| s1 = sin(alpha); | |||
| if (!inverse) | |||
| s1 = -s1; | |||
| exptab[i].re = c1; | |||
| exptab[i].im = s1; | |||
| } | |||
| } | |||
| void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits) | |||
| { | |||
| int n, i, j, k, n2; | |||
| float tmp_re, tmp_im, s, c; | |||
| FFTComplex *q; | |||
| n = 1 << nbits; | |||
| n2 = n >> 1; | |||
| for(i=0;i<n;i++) { | |||
| tmp_re = 0; | |||
| tmp_im = 0; | |||
| q = tab; | |||
| for(j=0;j<n;j++) { | |||
| k = (i * j) & (n - 1); | |||
| if (k >= n2) { | |||
| c = -exptab[k - n2].re; | |||
| s = -exptab[k - n2].im; | |||
| } else { | |||
| c = exptab[k].re; | |||
| s = exptab[k].im; | |||
| } | |||
| CMAC(tmp_re, tmp_im, c, s, q->re, q->im); | |||
| q++; | |||
| } | |||
| tabr[i].re = tmp_re; | |||
| tabr[i].im = tmp_im; | |||
| } | |||
| } | |||
| void imdct_ref(float *out, float *in, int n) | |||
| { | |||
| int k, i, a; | |||
| float sum, f; | |||
| for(i=0;i<n;i++) { | |||
| sum = 0; | |||
| for(k=0;k<n/2;k++) { | |||
| a = (2 * i + 1 + (n / 2)) * (2 * k + 1); | |||
| f = cos(M_PI * a / (double)(2 * n)); | |||
| sum += f * in[k]; | |||
| } | |||
| out[i] = -sum; | |||
| } | |||
| } | |||
| /* NOTE: no normalisation by 1 / N is done */ | |||
| void mdct_ref(float *output, float *input, int n) | |||
| { | |||
| int k, i; | |||
| float a, s; | |||
| /* do it by hand */ | |||
| for(k=0;k<n/2;k++) { | |||
| s = 0; | |||
| for(i=0;i<n;i++) { | |||
| a = (2*M_PI*(2*i+1+n/2)*(2*k+1) / (4 * n)); | |||
| s += input[i] * cos(a); | |||
| } | |||
| output[k] = s; | |||
| } | |||
| } | |||
| float frandom(void) | |||
| { | |||
| return (float)((random() & 0xffff) - 32768) / 32768.0; | |||
| } | |||
| INT64 gettime(void) | |||
| { | |||
| struct timeval tv; | |||
| gettimeofday(&tv,NULL); | |||
| return (INT64)tv.tv_sec * 1000000 + tv.tv_usec; | |||
| } | |||
| void check_diff(float *tab1, float *tab2, int n) | |||
| { | |||
| int i; | |||
| for(i=0;i<n;i++) { | |||
| if (fabsf(tab1[i] - tab2[i]) >= 1e-3) { | |||
| printf("ERROR %d: %f %f\n", | |||
| i, tab1[i], tab2[i]); | |||
| } | |||
| } | |||
| } | |||
| void help(void) | |||
| { | |||
| printf("usage: fft-test [-h] [-s] [-i] [-n b]\n" | |||
| "-h print this help\n" | |||
| "-s speed test\n" | |||
| "-m (I)MDCT test\n" | |||
| "-i inverse transform test\n" | |||
| "-n b set the transform size to 2^b\n" | |||
| ); | |||
| exit(1); | |||
| } | |||
| int main(int argc, char **argv) | |||
| { | |||
| FFTComplex *tab, *tab1, *tab_ref; | |||
| FFTSample *tabtmp, *tab2; | |||
| int it, i, c; | |||
| int do_speed = 0; | |||
| int do_mdct = 0; | |||
| int do_inverse = 0; | |||
| FFTContext s1, *s = &s1; | |||
| MDCTContext m1, *m = &m1; | |||
| int fft_nbits, fft_size; | |||
| mm_flags = 0; | |||
| fft_nbits = 9; | |||
| for(;;) { | |||
| c = getopt(argc, argv, "hsimn:"); | |||
| if (c == -1) | |||
| break; | |||
| switch(c) { | |||
| case 'h': | |||
| help(); | |||
| break; | |||
| case 's': | |||
| do_speed = 1; | |||
| break; | |||
| case 'i': | |||
| do_inverse = 1; | |||
| break; | |||
| case 'm': | |||
| do_mdct = 1; | |||
| break; | |||
| case 'n': | |||
| fft_nbits = atoi(optarg); | |||
| break; | |||
| } | |||
| } | |||
| fft_size = 1 << fft_nbits; | |||
| tab = av_malloc(fft_size * sizeof(FFTComplex)); | |||
| tab1 = av_malloc(fft_size * sizeof(FFTComplex)); | |||
| tab_ref = av_malloc(fft_size * sizeof(FFTComplex)); | |||
| tabtmp = av_malloc(fft_size / 2 * sizeof(FFTSample)); | |||
| tab2 = av_malloc(fft_size * sizeof(FFTSample)); | |||
| if (do_mdct) { | |||
| if (do_inverse) | |||
| printf("IMDCT"); | |||
| else | |||
| printf("MDCT"); | |||
| mdct_init(m, fft_nbits, do_inverse); | |||
| } else { | |||
| if (do_inverse) | |||
| printf("IFFT"); | |||
| else | |||
| printf("FFT"); | |||
| fft_init(s, fft_nbits, do_inverse); | |||
| fft_ref_init(fft_nbits, do_inverse); | |||
| } | |||
| printf(" %d test\n", fft_size); | |||
| /* generate random data */ | |||
| for(i=0;i<fft_size;i++) { | |||
| tab1[i].re = frandom(); | |||
| tab1[i].im = frandom(); | |||
| } | |||
| /* checking result */ | |||
| printf("Checking...\n"); | |||
| if (do_mdct) { | |||
| if (do_inverse) { | |||
| imdct_ref((float *)tab_ref, (float *)tab1, fft_size); | |||
| imdct_calc(m, tab2, (float *)tab1, tabtmp); | |||
| check_diff((float *)tab_ref, tab2, fft_size); | |||
| } else { | |||
| mdct_ref((float *)tab_ref, (float *)tab1, fft_size); | |||
| mdct_calc(m, tab2, (float *)tab1, tabtmp); | |||
| check_diff((float *)tab_ref, tab2, fft_size / 2); | |||
| } | |||
| } else { | |||
| memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); | |||
| fft_permute(s, tab); | |||
| fft_calc(s, tab); | |||
| fft_ref(tab_ref, tab1, fft_nbits); | |||
| check_diff((float *)tab_ref, (float *)tab, fft_size * 2); | |||
| } | |||
| /* do a speed test */ | |||
| if (do_speed) { | |||
| INT64 time_start, duration; | |||
| int nb_its; | |||
| printf("Speed test...\n"); | |||
| /* we measure during about 1 seconds */ | |||
| nb_its = 1; | |||
| for(;;) { | |||
| time_start = gettime(); | |||
| for(it=0;it<nb_its;it++) { | |||
| if (do_mdct) { | |||
| if (do_inverse) { | |||
| imdct_calc(m, (float *)tab, (float *)tab1, tabtmp); | |||
| } else { | |||
| mdct_calc(m, (float *)tab, (float *)tab1, tabtmp); | |||
| } | |||
| } else { | |||
| memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); | |||
| fft_calc(s, tab); | |||
| } | |||
| } | |||
| duration = gettime() - time_start; | |||
| if (duration >= 1000000) | |||
| break; | |||
| nb_its *= 2; | |||
| } | |||
| printf("time: %0.1f us/transform [total time=%0.2f s its=%d]\n", | |||
| (double)duration / nb_its, | |||
| (double)duration / 1000000.0, | |||
| nb_its); | |||
| } | |||
| if (do_mdct) { | |||
| mdct_end(m); | |||
| } else { | |||
| fft_end(s); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,229 @@ | |||
| /* | |||
| * FFT/IFFT transforms | |||
| * Copyright (c) 2002 Fabrice Bellard. | |||
| * | |||
| * This library is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2 of the License, or (at your option) any later version. | |||
| * | |||
| * This library is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with this library; if not, write to the Free Software | |||
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
| */ | |||
| #include "dsputil.h" | |||
| /** | |||
| * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is | |||
| * done | |||
| */ | |||
| int fft_init(FFTContext *s, int nbits, int inverse) | |||
| { | |||
| int i, j, m, n; | |||
| float alpha, c1, s1, s2; | |||
| s->nbits = nbits; | |||
| n = 1 << nbits; | |||
| s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); | |||
| if (!s->exptab) | |||
| goto fail; | |||
| s->revtab = av_malloc(n * sizeof(uint16_t)); | |||
| if (!s->revtab) | |||
| goto fail; | |||
| s->inverse = inverse; | |||
| s2 = inverse ? 1.0 : -1.0; | |||
| for(i=0;i<(n/2);i++) { | |||
| alpha = 2 * M_PI * (float)i / (float)n; | |||
| c1 = cos(alpha); | |||
| s1 = sin(alpha) * s2; | |||
| s->exptab[i].re = c1; | |||
| s->exptab[i].im = s1; | |||
| } | |||
| s->fft_calc = fft_calc_c; | |||
| s->exptab1 = NULL; | |||
| /* compute constant table for HAVE_SSE version */ | |||
| #if defined(HAVE_MMX) && 0 | |||
| if (mm_flags & MM_SSE) { | |||
| int np, nblocks, np2, l; | |||
| FFTComplex *q; | |||
| np = 1 << nbits; | |||
| nblocks = np >> 3; | |||
| np2 = np >> 1; | |||
| s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); | |||
| if (!s->exptab1) | |||
| goto fail; | |||
| q = s->exptab1; | |||
| do { | |||
| for(l = 0; l < np2; l += 2 * nblocks) { | |||
| *q++ = s->exptab[l]; | |||
| *q++ = s->exptab[l + nblocks]; | |||
| q->re = -s->exptab[l].im; | |||
| q->im = s->exptab[l].re; | |||
| q++; | |||
| q->re = -s->exptab[l + nblocks].im; | |||
| q->im = s->exptab[l + nblocks].re; | |||
| q++; | |||
| } | |||
| nblocks = nblocks >> 1; | |||
| } while (nblocks != 0); | |||
| av_freep(&s->exptab); | |||
| } | |||
| #endif | |||
| /* compute bit reverse table */ | |||
| for(i=0;i<n;i++) { | |||
| m=0; | |||
| for(j=0;j<nbits;j++) { | |||
| m |= ((i >> j) & 1) << (nbits-j-1); | |||
| } | |||
| s->revtab[i]=m; | |||
| } | |||
| return 0; | |||
| fail: | |||
| av_freep(&s->revtab); | |||
| av_freep(&s->exptab); | |||
| av_freep(&s->exptab1); | |||
| return -1; | |||
| } | |||
| /* butter fly op */ | |||
| #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ | |||
| {\ | |||
| FFTSample ax, ay, bx, by;\ | |||
| bx=pre1;\ | |||
| by=pim1;\ | |||
| ax=qre1;\ | |||
| ay=qim1;\ | |||
| pre = (bx + ax);\ | |||
| pim = (by + ay);\ | |||
| qre = (bx - ax);\ | |||
| qim = (by - ay);\ | |||
| } | |||
| #define MUL16(a,b) ((a) * (b)) | |||
| #define CMUL(pre, pim, are, aim, bre, bim) \ | |||
| {\ | |||
| pre = (MUL16(are, bre) - MUL16(aim, bim));\ | |||
| pim = (MUL16(are, bim) + MUL16(bre, aim));\ | |||
| } | |||
| /** | |||
| * Do a complex FFT with the parameters defined in fft_init(). The | |||
| * input data must be permuted before with s->revtab table. No | |||
| * 1.0/sqrt(n) normalization is done. | |||
| */ | |||
| void fft_calc_c(FFTContext *s, FFTComplex *z) | |||
| { | |||
| int ln = s->nbits; | |||
| int j, np, np2; | |||
| int nblocks, nloops; | |||
| register FFTComplex *p, *q; | |||
| FFTComplex *exptab = s->exptab; | |||
| int l; | |||
| FFTSample tmp_re, tmp_im; | |||
| np = 1 << ln; | |||
| /* pass 0 */ | |||
| p=&z[0]; | |||
| j=(np >> 1); | |||
| do { | |||
| BF(p[0].re, p[0].im, p[1].re, p[1].im, | |||
| p[0].re, p[0].im, p[1].re, p[1].im); | |||
| p+=2; | |||
| } while (--j != 0); | |||
| /* pass 1 */ | |||
| p=&z[0]; | |||
| j=np >> 2; | |||
| if (s->inverse) { | |||
| do { | |||
| BF(p[0].re, p[0].im, p[2].re, p[2].im, | |||
| p[0].re, p[0].im, p[2].re, p[2].im); | |||
| BF(p[1].re, p[1].im, p[3].re, p[3].im, | |||
| p[1].re, p[1].im, -p[3].im, p[3].re); | |||
| p+=4; | |||
| } while (--j != 0); | |||
| } else { | |||
| do { | |||
| BF(p[0].re, p[0].im, p[2].re, p[2].im, | |||
| p[0].re, p[0].im, p[2].re, p[2].im); | |||
| BF(p[1].re, p[1].im, p[3].re, p[3].im, | |||
| p[1].re, p[1].im, p[3].im, -p[3].re); | |||
| p+=4; | |||
| } while (--j != 0); | |||
| } | |||
| /* pass 2 .. ln-1 */ | |||
| nblocks = np >> 3; | |||
| nloops = 1 << 2; | |||
| np2 = np >> 1; | |||
| do { | |||
| p = z; | |||
| q = z + nloops; | |||
| for (j = 0; j < nblocks; ++j) { | |||
| BF(p->re, p->im, q->re, q->im, | |||
| p->re, p->im, q->re, q->im); | |||
| p++; | |||
| q++; | |||
| for(l = nblocks; l < np2; l += nblocks) { | |||
| CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); | |||
| BF(p->re, p->im, q->re, q->im, | |||
| p->re, p->im, tmp_re, tmp_im); | |||
| p++; | |||
| q++; | |||
| } | |||
| p += nloops; | |||
| q += nloops; | |||
| } | |||
| nblocks = nblocks >> 1; | |||
| nloops = nloops << 1; | |||
| } while (nblocks != 0); | |||
| } | |||
| /** | |||
| * Do the permutation needed BEFORE calling fft_calc() | |||
| */ | |||
| void fft_permute(FFTContext *s, FFTComplex *z) | |||
| { | |||
| int j, k, np; | |||
| FFTComplex tmp; | |||
| const uint16_t *revtab = s->revtab; | |||
| /* reverse */ | |||
| np = 1 << s->nbits; | |||
| for(j=0;j<np;j++) { | |||
| k = revtab[j]; | |||
| if (k < j) { | |||
| tmp = z[k]; | |||
| z[k] = z[j]; | |||
| z[j] = tmp; | |||
| } | |||
| } | |||
| } | |||
| void fft_end(FFTContext *s) | |||
| { | |||
| av_freep(&s->revtab); | |||
| av_freep(&s->exptab); | |||
| av_freep(&s->exptab1); | |||
| } | |||
| @@ -0,0 +1,128 @@ | |||
| /* | |||
| * FFT/MDCT transform with SSE optimizations | |||
| * Copyright (c) 2002 Fabrice Bellard. | |||
| * | |||
| * This library is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2 of the License, or (at your option) any later version. | |||
| * | |||
| * This library is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with this library; if not, write to the Free Software | |||
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
| */ | |||
| #include "../dsputil.h" | |||
| #include <math.h> | |||
| #include <xmmintrin.h> | |||
| static const float p1p1p1m1[4] __attribute__((aligned(16))) = | |||
| { 1.0, 1.0, 1.0, -1.0 }; | |||
| static const float p1p1m1m1[4] __attribute__((aligned(16))) = | |||
| { 1.0, 1.0, -1.0, -1.0 }; | |||
| #if 0 | |||
| static void print_v4sf(const char *str, __m128 a) | |||
| { | |||
| float *p = (float *)&a; | |||
| printf("%s: %f %f %f %f\n", | |||
| str, p[0], p[1], p[2], p[3]); | |||
| } | |||
| #endif | |||
| /* XXX: handle reverse case */ | |||
| void fft_calc_sse(FFTContext *s, FFTComplex *z) | |||
| { | |||
| int ln = s->nbits; | |||
| int j, np, np2; | |||
| int nblocks, nloops; | |||
| register FFTComplex *p, *q; | |||
| FFTComplex *cptr, *cptr1; | |||
| int k; | |||
| np = 1 << ln; | |||
| { | |||
| __m128 *r, a, b, a1, c1, c2; | |||
| r = (__m128 *)&z[0]; | |||
| c1 = *(__m128 *)p1p1m1m1; | |||
| c2 = *(__m128 *)p1p1p1m1; | |||
| j = (np >> 2); | |||
| do { | |||
| a = r[0]; | |||
| b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); | |||
| a = _mm_mul_ps(a, c1); | |||
| /* do the pass 0 butterfly */ | |||
| a = _mm_add_ps(a, b); | |||
| a1 = r[1]; | |||
| b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); | |||
| a1 = _mm_mul_ps(a1, c1); | |||
| /* do the pass 0 butterfly */ | |||
| b = _mm_add_ps(a1, b); | |||
| /* multiply third by -i */ | |||
| b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); | |||
| b = _mm_mul_ps(b, c2); | |||
| /* do the pass 1 butterfly */ | |||
| r[0] = _mm_add_ps(a, b); | |||
| r[1] = _mm_sub_ps(a, b); | |||
| r += 2; | |||
| } while (--j != 0); | |||
| } | |||
| /* pass 2 .. ln-1 */ | |||
| nblocks = np >> 3; | |||
| nloops = 1 << 2; | |||
| np2 = np >> 1; | |||
| cptr1 = s->exptab1; | |||
| do { | |||
| p = z; | |||
| q = z + nloops; | |||
| j = nblocks; | |||
| do { | |||
| cptr = cptr1; | |||
| k = nloops >> 1; | |||
| do { | |||
| __m128 a, b, c, t1, t2; | |||
| a = *(__m128 *)p; | |||
| b = *(__m128 *)q; | |||
| /* complex mul */ | |||
| c = *(__m128 *)cptr; | |||
| /* cre*re cim*re */ | |||
| t1 = _mm_mul_ps(c, | |||
| _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); | |||
| c = *(__m128 *)(cptr + 2); | |||
| /* -cim*im cre*im */ | |||
| t2 = _mm_mul_ps(c, | |||
| _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); | |||
| b = _mm_add_ps(t1, t2); | |||
| /* butterfly */ | |||
| *(__m128 *)p = _mm_add_ps(a, b); | |||
| *(__m128 *)q = _mm_sub_ps(a, b); | |||
| p += 2; | |||
| q += 2; | |||
| cptr += 4; | |||
| } while (--k); | |||
| p += nloops; | |||
| q += nloops; | |||
| } while (--j); | |||
| cptr1 += nloops * 2; | |||
| nblocks = nblocks >> 1; | |||
| nloops = nloops << 1; | |||
| } while (nblocks != 0); | |||
| } | |||
| @@ -0,0 +1,170 @@ | |||
| /* | |||
| * MDCT/IMDCT transforms | |||
| * Copyright (c) 2002 Fabrice Bellard. | |||
| * | |||
| * This library is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2 of the License, or (at your option) any later version. | |||
| * | |||
| * This library is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with this library; if not, write to the Free Software | |||
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
| */ | |||
| #include "dsputil.h" | |||
| /* | |||
| * init MDCT or IMDCT computation | |||
| */ | |||
| int mdct_init(MDCTContext *s, int nbits, int inverse) | |||
| { | |||
| int n, n4, i; | |||
| float alpha; | |||
| memset(s, 0, sizeof(*s)); | |||
| n = 1 << nbits; | |||
| s->nbits = nbits; | |||
| s->n = n; | |||
| n4 = n >> 2; | |||
| s->tcos = malloc(n4 * sizeof(FFTSample)); | |||
| if (!s->tcos) | |||
| goto fail; | |||
| s->tsin = malloc(n4 * sizeof(FFTSample)); | |||
| if (!s->tsin) | |||
| goto fail; | |||
| for(i=0;i<n4;i++) { | |||
| alpha = 2 * M_PI * (i + 1.0 / 8.0) / n; | |||
| s->tcos[i] = -cos(alpha); | |||
| s->tsin[i] = -sin(alpha); | |||
| } | |||
| if (fft_init(&s->fft, s->nbits - 2, inverse) < 0) | |||
| goto fail; | |||
| return 0; | |||
| fail: | |||
| av_freep(&s->tcos); | |||
| av_freep(&s->tsin); | |||
| return -1; | |||
| } | |||
| /* complex multiplication: p = a * b */ | |||
| #define CMUL(pre, pim, are, aim, bre, bim) \ | |||
| {\ | |||
| float _are = (are);\ | |||
| float _aim = (aim);\ | |||
| float _bre = (bre);\ | |||
| float _bim = (bim);\ | |||
| (pre) = _are * _bre - _aim * _bim;\ | |||
| (pim) = _are * _bim + _aim * _bre;\ | |||
| } | |||
| /** | |||
| * Compute inverse MDCT of size N = 2^nbits | |||
| * @param output N samples | |||
| * @param input N/2 samples | |||
| * @param tmp N/2 samples | |||
| */ | |||
| void imdct_calc(MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp) | |||
| { | |||
| int k, n8, n4, n2, n, j; | |||
| const uint16_t *revtab = s->fft.revtab; | |||
| const FFTSample *tcos = s->tcos; | |||
| const FFTSample *tsin = s->tsin; | |||
| const FFTSample *in1, *in2; | |||
| FFTComplex *z = (FFTComplex *)tmp; | |||
| n = 1 << s->nbits; | |||
| n2 = n >> 1; | |||
| n4 = n >> 2; | |||
| n8 = n >> 3; | |||
| /* pre rotation */ | |||
| in1 = input; | |||
| in2 = input + n2 - 1; | |||
| for(k = 0; k < n4; k++) { | |||
| j=revtab[k]; | |||
| CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]); | |||
| in1 += 2; | |||
| in2 -= 2; | |||
| } | |||
| fft_calc(&s->fft, z); | |||
| /* post rotation + reordering */ | |||
| /* XXX: optimize */ | |||
| for(k = 0; k < n4; k++) { | |||
| CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]); | |||
| } | |||
| for(k = 0; k < n8; k++) { | |||
| output[2*k] = -z[n8 + k].im; | |||
| output[n2-1-2*k] = z[n8 + k].im; | |||
| output[2*k+1] = z[n8-1-k].re; | |||
| output[n2-1-2*k-1] = -z[n8-1-k].re; | |||
| output[n2 + 2*k]=-z[k+n8].re; | |||
| output[n-1- 2*k]=-z[k+n8].re; | |||
| output[n2 + 2*k+1]=z[n8-k-1].im; | |||
| output[n-2 - 2 * k] = z[n8-k-1].im; | |||
| } | |||
| } | |||
| /** | |||
| * Compute MDCT of size N = 2^nbits | |||
| * @param input N samples | |||
| * @param out N/2 samples | |||
| * @param tmp temporary storage of N/2 samples | |||
| */ | |||
| void mdct_calc(MDCTContext *s, FFTSample *out, | |||
| const FFTSample *input, FFTSample *tmp) | |||
| { | |||
| int i, j, n, n8, n4, n2, n3; | |||
| FFTSample re, im, re1, im1; | |||
| const uint16_t *revtab = s->fft.revtab; | |||
| const FFTSample *tcos = s->tcos; | |||
| const FFTSample *tsin = s->tsin; | |||
| FFTComplex *x = (FFTComplex *)tmp; | |||
| n = 1 << s->nbits; | |||
| n2 = n >> 1; | |||
| n4 = n >> 2; | |||
| n8 = n >> 3; | |||
| n3 = 3 * n4; | |||
| /* pre rotation */ | |||
| for(i=0;i<n8;i++) { | |||
| re = -input[2*i+3*n4] - input[n3-1-2*i]; | |||
| im = -input[n4+2*i] + input[n4-1-2*i]; | |||
| j = revtab[i]; | |||
| CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]); | |||
| re = input[2*i] - input[n2-1-2*i]; | |||
| im = -(input[n2+2*i] + input[n-1-2*i]); | |||
| j = revtab[n8 + i]; | |||
| CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]); | |||
| } | |||
| fft_calc(&s->fft, x); | |||
| /* post rotation */ | |||
| for(i=0;i<n4;i++) { | |||
| re = x[i].re; | |||
| im = x[i].im; | |||
| CMUL(re1, im1, re, im, -tsin[i], -tcos[i]); | |||
| out[2*i] = im1; | |||
| out[n2-1-2*i] = re1; | |||
| } | |||
| } | |||
| void mdct_end(MDCTContext *s) | |||
| { | |||
| av_freep(&s->tcos); | |||
| av_freep(&s->tsin); | |||
| fft_end(&s->fft); | |||
| } | |||