`out[lut[i]] = in[i]` lookups were 4.04 times(!) slower than `out[i] = in[lut[i]]` lookups for an out-of-place FFT of length 4096. The permutes remain unchanged for anything but the out-of-place monolithic FFT, as the other transforms benefit quite a lot from the current order (it means only one lookup is needed to add to an offset, rather than a full gather). The code was originally based around non-power-of-two FFTs, so this wasn't benchmarked early on. (Tag: tags/n4.4)
| @@ -91,7 +91,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s) | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| int ff_tx_gen_ptwo_revtab(AVTXContext *s) | |||||
| int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup) | |||||
| { | { | ||||
| const int m = s->m, inv = s->inv; | const int m = s->m, inv = s->inv; | ||||
| @@ -101,7 +101,10 @@ int ff_tx_gen_ptwo_revtab(AVTXContext *s) | |||||
| /* Default */ | /* Default */ | ||||
| for (int i = 0; i < m; i++) { | for (int i = 0; i < m; i++) { | ||||
| int k = -split_radix_permutation(i, m, inv) & (m - 1); | int k = -split_radix_permutation(i, m, inv) & (m - 1); | ||||
| s->revtab[k] = i; | |||||
| if (invert_lookup) | |||||
| s->revtab[i] = k; | |||||
| else | |||||
| s->revtab[k] = i; | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| @@ -123,7 +123,7 @@ struct AVTXContext { | |||||
| /* Shared functions */ | /* Shared functions */ | ||||
| int ff_tx_type_is_mdct(enum AVTXType type); | int ff_tx_type_is_mdct(enum AVTXType type); | ||||
| int ff_tx_gen_compound_mapping(AVTXContext *s); | int ff_tx_gen_compound_mapping(AVTXContext *s); | ||||
| int ff_tx_gen_ptwo_revtab(AVTXContext *s); | |||||
| int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup); | |||||
| int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s); | int ff_tx_gen_ptwo_inplace_revtab_idx(AVTXContext *s); | ||||
| /* Also used by SIMD init */ | /* Also used by SIMD init */ | ||||
| @@ -410,7 +410,7 @@ static void monolithic_fft(AVTXContext *s, void *_out, void *_in, | |||||
| } while ((src = *inplace_idx++)); | } while ((src = *inplace_idx++)); | ||||
| } else { | } else { | ||||
| for (int i = 0; i < m; i++) | for (int i = 0; i < m; i++) | ||||
| out[s->revtab[i]] = in[i]; | |||||
| out[i] = in[s->revtab[i]]; | |||||
| } | } | ||||
| fft_dispatch[mb](out); | fft_dispatch[mb](out); | ||||
| @@ -738,7 +738,7 @@ int TX_NAME(ff_tx_init_mdct_fft)(AVTXContext *s, av_tx_fn *tx, | |||||
| if (n != 1) | if (n != 1) | ||||
| init_cos_tabs(0); | init_cos_tabs(0); | ||||
| if (m != 1) { | if (m != 1) { | ||||
| if ((err = ff_tx_gen_ptwo_revtab(s))) | |||||
| if ((err = ff_tx_gen_ptwo_revtab(s, n == 1 && !(flags & AV_TX_INPLACE)))) | |||||
| return err; | return err; | ||||
| if (flags & AV_TX_INPLACE) { | if (flags & AV_TX_INPLACE) { | ||||
| if (is_mdct) /* In-place MDCTs are not supported yet */ | if (is_mdct) /* In-place MDCTs are not supported yet */ | ||||