out[lut[i]] = in[i] lookups were 4.04 times(!) slower than out[i] = in[lut[i]] lookups for an out-of-place FFT of length 4096. The permutes remain unchanged for everything except the out-of-place monolithic FFT, because the other transforms benefit quite a lot from the current order (it means only one lookup is needed, which is added to an offset, rather than a full gather). The code was originally built around non-power-of-two FFTs, so this case wasn't benchmarked early on.
tags/n4.4
@@ -91,7 +91,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s) | |||||
return 0; | return 0; | ||||
} | } | ||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s) | |||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup) | |||||
{ | { | ||||
const int m = s->m, inv = s->inv; | const int m = s->m, inv = s->inv; | ||||
@@ -101,7 +101,10 @@ int ff_tx_gen_ptwo_revtab(AVTXContext *s) | |||||
/* Default */ | /* Default */ | ||||
for (int i = 0; i < m; i++) { | for (int i = 0; i < m; i++) { | ||||
int k = -split_radix_permutation(i, m, inv) & (m - 1); | int k = -split_radix_permutation(i, m, inv) & (m - 1); | ||||
s->revtab[k] = i; | |||||
if (invert_lookup) | |||||
s->revtab[i] = k; | |||||
else | |||||
s->revtab[k] = i; | |||||
} | } | ||||
return 0; | return 0; | ||||
@@ -123,7 +123,7 @@ struct AVTXContext { | |||||
/* Shared functions */ | /* Shared functions */ | ||||
int ff_tx_type_is_mdct(enum AVTXType type); | int ff_tx_type_is_mdct(enum AVTXType type); | ||||
int ff_tx_gen_compound_mapping(AVTXContext *s); | int ff_tx_gen_compound_mapping(AVTXContext *s); | ||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s); | |||||
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup); | |||||
int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s); | int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s); | ||||
/* Also used by SIMD init */ | /* Also used by SIMD init */ | ||||
@@ -410,7 +410,7 @@ static void monolithic_fft(AVTXContext *s, void *_out, void *_in, | |||||
} while ((src = *inplace_idx++)); | } while ((src = *inplace_idx++)); | ||||
} else { | } else { | ||||
for (int i = 0; i < m; i++) | for (int i = 0; i < m; i++) | ||||
out[s->revtab[i]] = in[i]; | |||||
out[i] = in[s->revtab[i]]; | |||||
} | } | ||||
fft_dispatch[mb](out); | fft_dispatch[mb](out); | ||||
@@ -738,7 +738,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx, | |||||
if (n != 1) | if (n != 1) | ||||
init_cos_tabs(0); | init_cos_tabs(0); | ||||
if (m != 1) { | if (m != 1) { | ||||
if ((err = ff_tx_gen_ptwo_revtab(s))) | |||||
if ((err = ff_tx_gen_ptwo_revtab(s, n == 1 && !(flags & AV_TX_INPLACE)))) | |||||
return err; | return err; | ||||
if (flags & AV_TX_INPLACE) { | if (flags & AV_TX_INPLACE) { | ||||
if (is_mdct) /* In-place MDCTs are not supported yet */ | if (is_mdct) /* In-place MDCTs are not supported yet */ | ||||