You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

129 lines
3.6KB

  1. /*
  2. * FFT/MDCT transform with SSE optimizations
  3. * Copyright (c) 2002 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include "../dsputil.h"
  20. #include <math.h>
  21. #include <xmmintrin.h>
  22. static const float p1p1p1m1[4] __attribute__((aligned(16))) =
  23. { 1.0, 1.0, 1.0, -1.0 };
  24. static const float p1p1m1m1[4] __attribute__((aligned(16))) =
  25. { 1.0, 1.0, -1.0, -1.0 };
  26. #if 0
  27. static void print_v4sf(const char *str, __m128 a)
  28. {
  29. float *p = (float *)&a;
  30. printf("%s: %f %f %f %f\n",
  31. str, p[0], p[1], p[2], p[3]);
  32. }
  33. #endif
  34. /* XXX: handle reverse case */
  35. void fft_calc_sse(FFTContext *s, FFTComplex *z)
  36. {
  37. int ln = s->nbits;
  38. int j, np, np2;
  39. int nblocks, nloops;
  40. register FFTComplex *p, *q;
  41. FFTComplex *cptr, *cptr1;
  42. int k;
  43. np = 1 << ln;
  44. {
  45. __m128 *r, a, b, a1, c1, c2;
  46. r = (__m128 *)&z[0];
  47. c1 = *(__m128 *)p1p1m1m1;
  48. c2 = *(__m128 *)p1p1p1m1;
  49. j = (np >> 2);
  50. do {
  51. a = r[0];
  52. b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
  53. a = _mm_mul_ps(a, c1);
  54. /* do the pass 0 butterfly */
  55. a = _mm_add_ps(a, b);
  56. a1 = r[1];
  57. b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
  58. a1 = _mm_mul_ps(a1, c1);
  59. /* do the pass 0 butterfly */
  60. b = _mm_add_ps(a1, b);
  61. /* multiply third by -i */
  62. b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
  63. b = _mm_mul_ps(b, c2);
  64. /* do the pass 1 butterfly */
  65. r[0] = _mm_add_ps(a, b);
  66. r[1] = _mm_sub_ps(a, b);
  67. r += 2;
  68. } while (--j != 0);
  69. }
  70. /* pass 2 .. ln-1 */
  71. nblocks = np >> 3;
  72. nloops = 1 << 2;
  73. np2 = np >> 1;
  74. cptr1 = s->exptab1;
  75. do {
  76. p = z;
  77. q = z + nloops;
  78. j = nblocks;
  79. do {
  80. cptr = cptr1;
  81. k = nloops >> 1;
  82. do {
  83. __m128 a, b, c, t1, t2;
  84. a = *(__m128 *)p;
  85. b = *(__m128 *)q;
  86. /* complex mul */
  87. c = *(__m128 *)cptr;
  88. /* cre*re cim*re */
  89. t1 = _mm_mul_ps(c,
  90. _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0)));
  91. c = *(__m128 *)(cptr + 2);
  92. /* -cim*im cre*im */
  93. t2 = _mm_mul_ps(c,
  94. _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1)));
  95. b = _mm_add_ps(t1, t2);
  96. /* butterfly */
  97. *(__m128 *)p = _mm_add_ps(a, b);
  98. *(__m128 *)q = _mm_sub_ps(a, b);
  99. p += 2;
  100. q += 2;
  101. cptr += 4;
  102. } while (--k);
  103. p += nloops;
  104. q += nloops;
  105. } while (--j);
  106. cptr1 += nloops * 2;
  107. nblocks = nblocks >> 1;
  108. nloops = nloops << 1;
  109. } while (nblocks != 0);
  110. }