AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> tags/n0.8
| @@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs) | |||||
| } | } | ||||
| } | } | ||||
/**
 * Portable C version of ac3_max_msb_abs_int16.
 *
 * ORs together the absolute value of every element. The result is not the
 * maximum itself, but its highest set bit is the same as that of the
 * largest absolute value in the array.
 *
 * @param src input array
 * @param len number of values in the array
 * @return bitwise OR of abs(src[k]) for all k in [0, len); 0 if len <= 0
 */
static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
{
    int acc = 0;
    int pos = 0;

    while (pos < len) {
        acc |= abs(src[pos]);
        pos++;
    }

    return acc;
}
| av_cold void ff_ac3dsp_init(AC3DSPContext *c) | av_cold void ff_ac3dsp_init(AC3DSPContext *c) | ||||
| { | { | ||||
| c->ac3_exponent_min = ac3_exponent_min_c; | c->ac3_exponent_min = ac3_exponent_min_c; | ||||
| c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c; | |||||
| if (HAVE_MMX) | if (HAVE_MMX) | ||||
| ff_ac3dsp_init_x86(c); | ff_ac3dsp_init_x86(c); | ||||
| @@ -35,6 +35,17 @@ typedef struct AC3DSPContext { | |||||
| * @param nb_coefs number of frequency coefficients. | * @param nb_coefs number of frequency coefficients. | ||||
| */ | */ | ||||
| void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs); | void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs); | ||||
| /** | |||||
| * Calculate the maximum MSB of the absolute value of each element in an | |||||
| * array of int16_t. | |||||
| * @param src input array | |||||
| * constraints: align 16. values must be in range [-32767,32767] | |||||
| * @param len number of values in the array | |||||
| * constraints: multiple of 16 greater than 0 | |||||
| * @return a value with the same MSB as max(abs(src[])) | |||||
| */ | |||||
| int (*ac3_max_msb_abs_int16)(const int16_t *src, int len); | |||||
| } AC3DSPContext; | } AC3DSPContext; | ||||
| void ff_ac3dsp_init (AC3DSPContext *c); | void ff_ac3dsp_init (AC3DSPContext *c); | ||||
| @@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input, | |||||
| * @param n number of values in the array | * @param n number of values in the array | ||||
| * @return log2(max(abs(tab[]))) | * @return log2(max(abs(tab[]))) | ||||
| */ | */ | ||||
| static int log2_tab(int16_t *tab, int n) | |||||
| static int log2_tab(AC3EncodeContext *s, int16_t *src, int len) | |||||
| { | { | ||||
| int i, v; | |||||
| v = 0; | |||||
| for (i = 0; i < n; i++) | |||||
| v |= abs(tab[i]); | |||||
| int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len); | |||||
| return av_log2(v); | return av_log2(v); | ||||
| } | } | ||||
| @@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift) | |||||
| */ | */ | ||||
| static int normalize_samples(AC3EncodeContext *s) | static int normalize_samples(AC3EncodeContext *s) | ||||
| { | { | ||||
| int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE); | |||||
| int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE); | |||||
| lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v); | lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v); | ||||
| return v - 9; | return v - 9; | ||||
| } | } | ||||
| @@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2 | |||||
| %endif | %endif | ||||
| %undef PMINUB | %undef PMINUB | ||||
| %undef LOOP_ALIGN | %undef LOOP_ALIGN | ||||
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; Returns (in the low 16 bits of eax) a value whose most significant set bit
; matches that of max(abs(src[])).
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;
; Macro arguments:
;   %1  cpu suffix appended to the function name (mmx, mmxext, sse2, ssse3)
;   %2  accumulation method (or_abs or min_max)
;
; Each loop iteration reads 2*mmsize bytes, i.e. mmsize int16_t values, which
; is why len is decremented by mmsize per iteration.
;-----------------------------------------------------------------------------
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2      ; m2: 'or' accumulator (or_abs) / running min (min_max)
    pxor        m3, m3      ; m3: running max (min_max); ABS2 scratch in mmx or_abs
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %2, min_max
    ; fold the per-lane min and max into a single 'or' of absolute values
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
%ifidn mmsize, 16
    ; fold the high 64 bits onto the low 64 bits
    mova        m0, m2
    punpckhqdq  m0, m0
    por         m2, m0
%endif
    ; horizontal 'or' of the 4 low words into word 0
%ifidn %1, mmx
    ; pshufw is an SSE/MMXEXT instruction and must not appear in the plain
    ; mmx variant; shifting the quadword right gives the same low-word result
    mova        m0, m2
    psrlq       m0, 32
    por         m2, m0
    mova        m0, m2
    psrlq       m0, 16
    por         m2, m0
%else
    PSHUFLW     m0, m2, 0xe
    por         m2, m0
    PSHUFLW     m0, m2, 0x1
    por         m2, m0
%endif
    movd       eax, m2
    and        eax, 0xFFFF  ; only word 0 holds the fully reduced value
    RET
%endmacro

INIT_MMX
%define ABS2 ABS2_MMX
; PSHUFLW is only expanded by the mmxext instantiation below; the mmx
; instantiation takes the psrlq path inside the macro
%define PSHUFLW pshufw
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
| @@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int n | |||||
| extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); | extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); | ||||
| extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); | extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); | ||||
| extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); | |||||
| extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); | |||||
| extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); | |||||
| extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len); | |||||
| av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) | av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) | ||||
| { | { | ||||
| int mm_flags = av_get_cpu_flags(); | int mm_flags = av_get_cpu_flags(); | ||||
| @@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) | |||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| if (mm_flags & AV_CPU_FLAG_MMX) { | if (mm_flags & AV_CPU_FLAG_MMX) { | ||||
| c->ac3_exponent_min = ff_ac3_exponent_min_mmx; | c->ac3_exponent_min = ff_ac3_exponent_min_mmx; | ||||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; | |||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) { | if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) { | ||||
| c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; | c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; | ||||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; | |||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { | if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { | ||||
| c->ac3_exponent_min = ff_ac3_exponent_min_sse2; | c->ac3_exponent_min = ff_ac3_exponent_min_sse2; | ||||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; | |||||
| } | |||||
| if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) { | |||||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||