You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

141 lines
3.9KB

  1. /*
  2. * FFT/MDCT transform with SSE optimizations
  3. * Copyright (c) 2002 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include "../dsputil.h"
  20. #include <math.h>
  21. #ifdef HAVE_BUILTIN_VECTOR
  22. #include <xmmintrin.h>
  23. static const float p1p1p1m1[4] __attribute__((aligned(16))) =
  24. { 1.0, 1.0, 1.0, -1.0 };
  25. static const float p1p1m1p1[4] __attribute__((aligned(16))) =
  26. { 1.0, 1.0, -1.0, 1.0 };
  27. static const float p1p1m1m1[4] __attribute__((aligned(16))) =
  28. { 1.0, 1.0, -1.0, -1.0 };
  29. #if 0
  30. static void print_v4sf(const char *str, __m128 a)
  31. {
  32. float *p = (float *)&a;
  33. printf("%s: %f %f %f %f\n",
  34. str, p[0], p[1], p[2], p[3]);
  35. }
  36. #endif
  37. /* XXX: handle reverse case */
  38. void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
  39. {
  40. int ln = s->nbits;
  41. int j, np, np2;
  42. int nblocks, nloops;
  43. register FFTComplex *p, *q;
  44. FFTComplex *cptr, *cptr1;
  45. int k;
  46. np = 1 << ln;
  47. {
  48. __m128 *r, a, b, a1, c1, c2;
  49. r = (__m128 *)&z[0];
  50. c1 = *(__m128 *)p1p1m1m1;
  51. c2 = *(__m128 *)p1p1p1m1;
  52. if (s->inverse)
  53. c2 = *(__m128 *)p1p1m1p1;
  54. else
  55. c2 = *(__m128 *)p1p1p1m1;
  56. j = (np >> 2);
  57. do {
  58. a = r[0];
  59. b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
  60. a = _mm_mul_ps(a, c1);
  61. /* do the pass 0 butterfly */
  62. a = _mm_add_ps(a, b);
  63. a1 = r[1];
  64. b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
  65. a1 = _mm_mul_ps(a1, c1);
  66. /* do the pass 0 butterfly */
  67. b = _mm_add_ps(a1, b);
  68. /* multiply third by -i */
  69. b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
  70. b = _mm_mul_ps(b, c2);
  71. /* do the pass 1 butterfly */
  72. r[0] = _mm_add_ps(a, b);
  73. r[1] = _mm_sub_ps(a, b);
  74. r += 2;
  75. } while (--j != 0);
  76. }
  77. /* pass 2 .. ln-1 */
  78. nblocks = np >> 3;
  79. nloops = 1 << 2;
  80. np2 = np >> 1;
  81. cptr1 = s->exptab1;
  82. do {
  83. p = z;
  84. q = z + nloops;
  85. j = nblocks;
  86. do {
  87. cptr = cptr1;
  88. k = nloops >> 1;
  89. do {
  90. __m128 a, b, c, t1, t2;
  91. a = *(__m128 *)p;
  92. b = *(__m128 *)q;
  93. /* complex mul */
  94. c = *(__m128 *)cptr;
  95. /* cre*re cim*re */
  96. t1 = _mm_mul_ps(c,
  97. _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0)));
  98. c = *(__m128 *)(cptr + 2);
  99. /* -cim*im cre*im */
  100. t2 = _mm_mul_ps(c,
  101. _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1)));
  102. b = _mm_add_ps(t1, t2);
  103. /* butterfly */
  104. *(__m128 *)p = _mm_add_ps(a, b);
  105. *(__m128 *)q = _mm_sub_ps(a, b);
  106. p += 2;
  107. q += 2;
  108. cptr += 4;
  109. } while (--k);
  110. p += nloops;
  111. q += nloops;
  112. } while (--j);
  113. cptr1 += nloops * 2;
  114. nblocks = nblocks >> 1;
  115. nloops = nloops << 1;
  116. } while (nblocks != 0);
  117. }
  118. #endif