You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

420 lines
12KB

  1. /*
  2. Copyright (c) 2013 Julien Pommier.
  3. Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
  4. How to build:
  5. on linux, with fftw3:
  6. gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
  7. on macos, without fftw3:
  8. clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
  9. on macos, with fftw3:
  10. clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
  11. on windows, with visual c++:
  12. cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
  13. build without SIMD instructions:
  14. gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
  15. */
  16. #include "pffft.h"
  17. #include "fftpack.h"
  18. #include <math.h>
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <time.h>
  22. #include <assert.h>
  23. #include <string.h>
  24. #ifdef HAVE_SYS_TIMES
  25. # include <sys/times.h>
  26. # include <unistd.h>
  27. #endif
  28. #ifdef HAVE_VECLIB
  29. # include <Accelerate/Accelerate.h>
  30. #endif
  31. #ifdef HAVE_FFTW
  32. # include <fftw3.h>
  33. #endif
  34. #define MAX(x,y) ((x)>(y)?(x):(y))
  35. double frand() {
  36. return rand()/(double)RAND_MAX;
  37. }
  38. #if defined(HAVE_SYS_TIMES)
  39. inline double uclock_sec(void) {
  40. static double ttclk = 0.;
  41. if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
  42. struct tms t; return ((double)times(&t)) / ttclk;
  43. }
  44. # else
  45. double uclock_sec(void)
  46. { return (double)clock()/(double)CLOCKS_PER_SEC; }
  47. #endif
  48. /* compare results with the regular fftpack */
  49. void pffft_validate_N(int N, int cplx) {
  50. int Nfloat = N*(cplx?2:1);
  51. int Nbytes = Nfloat * sizeof(float);
  52. float *ref, *in, *out, *tmp, *tmp2;
  53. PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
  54. int pass;
  55. if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
  56. ref = pffft_aligned_malloc(Nbytes);
  57. in = pffft_aligned_malloc(Nbytes);
  58. out = pffft_aligned_malloc(Nbytes);
  59. tmp = pffft_aligned_malloc(Nbytes);
  60. tmp2 = pffft_aligned_malloc(Nbytes);
  61. for (pass=0; pass < 2; ++pass) {
  62. float ref_max = 0;
  63. int k;
  64. //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
  65. // compute reference solution with FFTPACK
  66. if (pass == 0) {
  67. float *wrk = malloc(2*Nbytes+15*sizeof(float));
  68. for (k=0; k < Nfloat; ++k) {
  69. ref[k] = in[k] = frand()*2-1;
  70. out[k] = 1e30;
  71. }
  72. if (!cplx) {
  73. rffti(N, wrk);
  74. rfftf(N, ref, wrk);
  75. // use our ordering for real ffts instead of the one of fftpack
  76. {
  77. float refN=ref[N-1];
  78. for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
  79. ref[1] = refN;
  80. }
  81. } else {
  82. cffti(N, wrk);
  83. cfftf(N, ref, wrk);
  84. }
  85. free(wrk);
  86. }
  87. for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
  88. // pass 0 : non canonical ordering of transform coefficients
  89. if (pass == 0) {
  90. // test forward transform, with different input / output
  91. pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
  92. memcpy(tmp2, tmp, Nbytes);
  93. memcpy(tmp, in, Nbytes);
  94. pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
  95. for (k = 0; k < Nfloat; ++k) {
  96. assert(tmp2[k] == tmp[k]);
  97. }
  98. // test reordering
  99. pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
  100. pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
  101. for (k = 0; k < Nfloat; ++k) {
  102. assert(tmp2[k] == tmp[k]);
  103. }
  104. pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
  105. } else {
  106. // pass 1 : canonical ordering of transform coeffs.
  107. pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
  108. memcpy(tmp2, tmp, Nbytes);
  109. memcpy(tmp, in, Nbytes);
  110. pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
  111. for (k = 0; k < Nfloat; ++k) {
  112. assert(tmp2[k] == tmp[k]);
  113. }
  114. memcpy(out, tmp, Nbytes);
  115. }
  116. {
  117. for (k=0; k < Nfloat; ++k) {
  118. if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
  119. printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
  120. exit(1);
  121. }
  122. }
  123. if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
  124. else pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
  125. memcpy(tmp2, out, Nbytes);
  126. memcpy(out, tmp, Nbytes);
  127. if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
  128. else pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
  129. for (k = 0; k < Nfloat; ++k) {
  130. assert(tmp2[k] == out[k]);
  131. out[k] *= 1.f/N;
  132. }
  133. for (k = 0; k < Nfloat; ++k) {
  134. if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
  135. printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
  136. exit(1);
  137. }
  138. }
  139. }
  140. // quick test of the circular convolution in fft domain
  141. {
  142. float conv_err = 0, conv_max = 0;
  143. pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
  144. memset(out, 0, Nbytes);
  145. pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
  146. pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
  147. for (k=0; k < Nfloat; k += 2) {
  148. float ar = tmp[k], ai=tmp[k+1];
  149. if (cplx || k > 0) {
  150. tmp[k] = ar*ar - ai*ai;
  151. tmp[k+1] = 2*ar*ai;
  152. } else {
  153. tmp[0] = ar*ar;
  154. tmp[1] = ai*ai;
  155. }
  156. }
  157. for (k=0; k < Nfloat; ++k) {
  158. float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
  159. if (d > conv_err) conv_err = d;
  160. if (e > conv_max) conv_max = e;
  161. }
  162. if (conv_err > 1e-5*conv_max) {
  163. printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
  164. }
  165. }
  166. }
  167. printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
  168. pffft_destroy_setup(s);
  169. pffft_aligned_free(ref);
  170. pffft_aligned_free(in);
  171. pffft_aligned_free(out);
  172. pffft_aligned_free(tmp);
  173. pffft_aligned_free(tmp2);
  174. }
  175. void pffft_validate(int cplx) {
  176. static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
  177. int k;
  178. for (k = 0; Ntest[k]; ++k) {
  179. int N = Ntest[k];
  180. if (N == 16 && !cplx) continue;
  181. pffft_validate_N(N, cplx);
  182. }
  183. }
  184. int array_output_format = 0;
  185. void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
  186. float mflops = flops/1e6/(t1 - t0 + 1e-16);
  187. if (array_output_format) {
  188. if (flops != -1) {
  189. printf("|%9.0f ", mflops);
  190. } else printf("| n/a ");
  191. } else {
  192. if (flops != -1) {
  193. printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
  194. }
  195. }
  196. fflush(stdout);
  197. }
  198. void benchmark_ffts(int N, int cplx) {
  199. int Nfloat = (cplx ? N*2 : N);
  200. int Nbytes = Nfloat * sizeof(float);
  201. float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
  202. double t0, t1, flops;
  203. int k;
  204. int max_iter = 5120000/N*4;
  205. #ifdef __arm__
  206. max_iter /= 4;
  207. #endif
  208. int iter;
  209. for (k = 0; k < Nfloat; ++k) {
  210. X[k] = 0; //sqrtf(k+1);
  211. }
  212. // FFTPack benchmark
  213. {
  214. float *wrk = malloc(2*Nbytes + 15*sizeof(float));
  215. int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
  216. if (cplx) cffti(N, wrk);
  217. else rffti(N, wrk);
  218. t0 = uclock_sec();
  219. for (iter = 0; iter < max_iter_; ++iter) {
  220. if (cplx) {
  221. cfftf(N, X, wrk);
  222. cfftb(N, X, wrk);
  223. } else {
  224. rfftf(N, X, wrk);
  225. rfftb(N, X, wrk);
  226. }
  227. }
  228. t1 = uclock_sec();
  229. free(wrk);
  230. flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
  231. show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
  232. }
  233. #ifdef HAVE_VECLIB
  234. int log2N = (int)(log(N)/log(2) + 0.5f);
  235. if (N == (1<<log2N)) {
  236. FFTSetup setup;
  237. setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
  238. DSPSplitComplex zsamples;
  239. zsamples.realp = &X[0];
  240. zsamples.imagp = &X[Nfloat/2];
  241. t0 = uclock_sec();
  242. for (iter = 0; iter < max_iter; ++iter) {
  243. if (cplx) {
  244. vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
  245. vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
  246. } else {
  247. vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
  248. vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
  249. }
  250. }
  251. t1 = uclock_sec();
  252. vDSP_destroy_fftsetup(setup);
  253. flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
  254. show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
  255. } else {
  256. show_output("vDSP", N, cplx, -1, -1, -1, -1);
  257. }
  258. #endif
  259. #ifdef HAVE_FFTW
  260. {
  261. fftwf_plan planf, planb;
  262. fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
  263. fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
  264. memset(in, 0, sizeof(fftw_complex) * N);
  265. int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE); // measure takes a lot of time on largest ffts
  266. //int flags = FFTW_ESTIMATE;
  267. if (cplx) {
  268. planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
  269. planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
  270. } else {
  271. planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
  272. planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
  273. }
  274. t0 = uclock_sec();
  275. for (iter = 0; iter < max_iter; ++iter) {
  276. fftwf_execute(planf);
  277. fftwf_execute(planb);
  278. }
  279. t1 = uclock_sec();
  280. fftwf_destroy_plan(planf);
  281. fftwf_destroy_plan(planb);
  282. fftwf_free(in); fftwf_free(out);
  283. flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
  284. show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
  285. }
  286. #endif
  287. // PFFFT benchmark
  288. {
  289. PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
  290. if (s) {
  291. t0 = uclock_sec();
  292. for (iter = 0; iter < max_iter; ++iter) {
  293. pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
  294. pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
  295. }
  296. t1 = uclock_sec();
  297. pffft_destroy_setup(s);
  298. flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
  299. show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
  300. }
  301. }
  302. if (!array_output_format) {
  303. printf("--\n");
  304. }
  305. pffft_aligned_free(X);
  306. pffft_aligned_free(Y);
  307. pffft_aligned_free(Z);
  308. }
  309. #ifndef PFFFT_SIMD_DISABLE
  310. void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction
  311. #endif
  312. int main(int argc, char **argv) {
  313. int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
  314. int i;
  315. if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {
  316. array_output_format = 1;
  317. }
  318. #ifndef PFFFT_SIMD_DISABLE
  319. validate_pffft_simd();
  320. #endif
  321. pffft_validate(1);
  322. pffft_validate(0);
  323. if (!array_output_format) {
  324. for (i=0; Nvalues[i] > 0; ++i) {
  325. benchmark_ffts(Nvalues[i], 0 /* real fft */);
  326. }
  327. for (i=0; Nvalues[i] > 0; ++i) {
  328. benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
  329. }
  330. } else {
  331. printf("| input len ");
  332. printf("|real FFTPack");
  333. #ifdef HAVE_VECLIB
  334. printf("| real vDSP ");
  335. #endif
  336. #ifdef HAVE_FFTW
  337. printf("| real FFTW ");
  338. #endif
  339. printf("| real PFFFT | ");
  340. printf("|cplx FFTPack");
  341. #ifdef HAVE_VECLIB
  342. printf("| cplx vDSP ");
  343. #endif
  344. #ifdef HAVE_FFTW
  345. printf("| cplx FFTW ");
  346. #endif
  347. printf("| cplx PFFFT |\n");
  348. for (i=0; Nvalues[i] > 0; ++i) {
  349. printf("|%9d ", Nvalues[i]);
  350. benchmark_ffts(Nvalues[i], 0);
  351. printf("| ");
  352. benchmark_ffts(Nvalues[i], 1);
  353. printf("|\n");
  354. }
  355. printf(" (numbers are given in MFlops)\n");
  356. }
  357. return 0;
  358. }