You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

137 lines
3.8KB

  1. /*
  2. * FFT/MDCT transform with Extended 3DNow! optimizations
  3. * Copyright (c) 2006 Zuxy MENG Jie.
  4. * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "../dsputil.h"
  21. #include <math.h>
  22. #ifdef HAVE_MM3DNOW
  23. #include <mm3dnow.h>
  24. static const int p1m1[2] __attribute__((aligned(8))) =
  25. { 0, 1 << 31 };
  26. static const int m1p1[2] __attribute__((aligned(8))) =
  27. { 1 << 31, 0 };
  28. void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
  29. {
  30. int ln = s->nbits;
  31. int j, np, np2;
  32. int nblocks, nloops;
  33. register FFTComplex *p, *q;
  34. FFTComplex *cptr, *cptr1;
  35. int k;
  36. np = 1 << ln;
  37. /* FEMMS is not a must here but recommended by AMD */
  38. _m_femms();
  39. {
  40. __m64 *r, a0, a1, b0, b1, c;
  41. r = (__m64 *)&z[0];
  42. if (s->inverse)
  43. c = *(__m64 *)m1p1;
  44. else
  45. c = *(__m64 *)p1m1;
  46. j = (np >> 2);
  47. do {
  48. /* do the pass 0 butterfly */
  49. a0 = _m_pfadd(r[0], r[1]);
  50. a1 = _m_pfsub(r[0], r[1]);
  51. /* do the pass 0 butterfly */
  52. b0 = _m_pfadd(r[2], r[3]);
  53. b1 = _m_pfsub(r[2], r[3]);
  54. /* multiply third by -i */
  55. b1 = _m_pswapd(b1);
  56. b1 = _m_pxor(b1, c);
  57. r[0] = _m_pfadd(a0, b0);
  58. r[1] = _m_pfadd(a1, b1);
  59. r[2] = _m_pfsub(a0, b0);
  60. r[3] = _m_pfsub(a1, b1);
  61. r += 4;
  62. } while (--j != 0);
  63. }
  64. /* pass 2 .. ln-1 */
  65. nblocks = np >> 3;
  66. nloops = 1 << 2;
  67. np2 = np >> 1;
  68. cptr1 = s->exptab1;
  69. do {
  70. p = z;
  71. q = z + nloops;
  72. j = nblocks;
  73. do {
  74. cptr = cptr1;
  75. k = nloops >> 1;
  76. do {
  77. __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;
  78. a0 = *(__m64 *)&p[0];
  79. a1 = *(__m64 *)&p[1];
  80. b0 = *(__m64 *)&q[0];
  81. b1 = *(__m64 *)&q[1];
  82. /* complex mul */
  83. c0 = *(__m64 *)&cptr[0];
  84. c1 = *(__m64 *)&cptr[1];
  85. /* cre*re cim*im */
  86. t10 = _m_pfmul(c0, b0);
  87. t11 = _m_pfmul(c1, b1);
  88. /* no need to access cptr[2] & cptr[3] */
  89. c0 = _m_pswapd(c0);
  90. c1 = _m_pswapd(c1);
  91. /* cim*re cre*im */
  92. t20 = _m_pfmul(c0, b0);
  93. t21 = _m_pfmul(c1, b1);
  94. /* cre*re-cim*im cim*re+cre*im */
  95. b0 = _m_pfpnacc(t10, t20);
  96. b1 = _m_pfpnacc(t11, t21);
  97. /* butterfly */
  98. *(__m64 *)&p[0] = _m_pfadd(a0, b0);
  99. *(__m64 *)&p[1] = _m_pfadd(a1, b1);
  100. *(__m64 *)&q[0] = _m_pfsub(a0, b0);
  101. *(__m64 *)&q[1] = _m_pfsub(a1, b1);
  102. p += 2;
  103. q += 2;
  104. cptr += 4;
  105. } while (--k);
  106. p += nloops;
  107. q += nloops;
  108. } while (--j);
  109. cptr1 += nloops * 2;
  110. nblocks = nblocks >> 1;
  111. nloops = nloops << 1;
  112. } while (nblocks != 0);
  113. _m_femms();
  114. }
  115. #endif