You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

137 lines
3.9KB

  1. /*
  2. * FFT/MDCT transform with 3DNow! optimizations
  3. * Copyright (c) 2006 Zuxy MENG Jie.
  4. * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "../dsputil.h"
  21. #include <math.h>
  22. #ifdef HAVE_MM3DNOW
  23. #include <mm3dnow.h>
  24. static const int p1m1[2] __attribute__((aligned(8))) =
  25. { 0, 1 << 31 };
  26. static const int m1p1[2] __attribute__((aligned(8))) =
  27. { 1 << 31, 0 };
  28. void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
  29. {
  30. int ln = s->nbits;
  31. int j, np, np2;
  32. int nblocks, nloops;
  33. register FFTComplex *p, *q;
  34. FFTComplex *cptr, *cptr1;
  35. int k;
  36. np = 1 << ln;
  37. /* FEMMS not a must here but recommended by AMD */
  38. _m_femms();
  39. {
  40. __m64 *r, a0, a1, b0, b1, tmp, c;
  41. r = (__m64 *)&z[0];
  42. if (s->inverse)
  43. c = *(__m64 *)m1p1;
  44. else
  45. c = *(__m64 *)p1m1;
  46. j = (np >> 2);
  47. do {
  48. /* do the pass 0 butterfly */
  49. a0 = _m_pfadd(r[0], r[1]);
  50. a1 = _m_pfsub(r[0], r[1]);
  51. /* do the pass 0 butterfly */
  52. b0 = _m_pfadd(r[2], r[3]);
  53. b1 = _m_pfsub(r[2], r[3]);
  54. /* multiply third by -i */
  55. tmp = _m_punpckhdq(b1, b1);
  56. b1 = _m_punpckldq(b1, b1);
  57. b1 = _m_punpckldq(tmp, b1);
  58. b1 = _m_pxor(b1, c);
  59. /* do the pass 1 butterfly */
  60. r[0] = _m_pfadd(a0, b0);
  61. r[1] = _m_pfadd(a1, b1);
  62. r[2] = _m_pfsub(a0, b0);
  63. r[3] = _m_pfsub(a1, b1);
  64. r += 4;
  65. } while (--j != 0);
  66. }
  67. /* pass 2 .. ln-1 */
  68. nblocks = np >> 3;
  69. nloops = 1 << 2;
  70. np2 = np >> 1;
  71. cptr1 = s->exptab1;
  72. do {
  73. p = z;
  74. q = z + nloops;
  75. j = nblocks;
  76. do {
  77. cptr = cptr1;
  78. k = nloops >> 1;
  79. do {
  80. __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;
  81. a0 = *(__m64 *)&p[0];
  82. a1 = *(__m64 *)&p[1];
  83. b0 = *(__m64 *)&q[0];
  84. b1 = *(__m64 *)&q[1];
  85. /* complex mul */
  86. c0 = *(__m64 *)&cptr[0];
  87. c1 = *(__m64 *)&cptr[1];
  88. /* cre*re cim*re */
  89. t10 = _m_pfmul(c0, _m_punpckldq(b0, b0));
  90. t11 = _m_pfmul(c1, _m_punpckldq(b1, b1));
  91. c0 = *(__m64 *)&cptr[2];
  92. c1 = *(__m64 *)&cptr[3];
  93. /* -cim*im cre*im */
  94. t20 = _m_pfmul(c0, _m_punpckhdq(b0, b0));
  95. t21 = _m_pfmul(c1, _m_punpckhdq(b1, b1));
  96. b0 = _m_pfadd(t10, t20);
  97. b1 = _m_pfadd(t11, t21);
  98. /* butterfly */
  99. *(__m64 *)&p[0] = _m_pfadd(a0, b0);
  100. *(__m64 *)&p[1] = _m_pfadd(a1, b1);
  101. *(__m64 *)&q[0] = _m_pfsub(a0, b0);
  102. *(__m64 *)&q[1] = _m_pfsub(a1, b1);
  103. p += 2;
  104. q += 2;
  105. cptr += 4;
  106. } while (--k);
  107. p += nloops;
  108. q += nloops;
  109. } while (--j);
  110. cptr1 += nloops * 2;
  111. nblocks = nblocks >> 1;
  112. nloops = nloops << 1;
  113. } while (nblocks != 0);
  114. _m_femms();
  115. }
  116. #endif