* commit '12004a9a7f20e44f4da2ee6c372d5e1794c8d6c5':
audiodsp/x86: yasmify vector_clipf_sse
audiodsp: reorder arguments for vector_clipf
Merged the Libav version of the yasmified vector_clipf_sse after a
discussion with James Almer on IRC:
19:22 <ubitux> jamrial: opinion on 12004a9a7f20e44f4da2ee6c372d5e1794c8d6c5?
19:23 <ubitux> it was apparently yasmified differently
19:23 <ubitux> (it depends on the previous commit arg shuffle)
19:24 <ubitux> i don't see the magic movsxdifnidn in your port btw
19:24 <ubitux> it's a port from 1d36defe94
19:25 <jamrial> seems better thanks to said arg shuffle
19:25 <jamrial> the loop is the same, but init is simpler
19:25 <jamrial> probably worth merging
19:25 <ubitux> OK
19:25 <ubitux> thanks
19:26 <jamrial> curious they didn't make len ptrdiff_t after the previous bunch of commits, heh
19:26 <ubitux> yeah indeed
Both commits are merged at the same time to prevent a conflict with our
existing yasmified ff_vector_clipf_sse.
Merged-by: Clément Bœsch <u@pkh.me>
tags/n3.3
| @@ -121,7 +121,7 @@ static void sum_square_butterfly(AC3EncodeContext *s, float sum[4], | |||||
| static void clip_coefficients(AudioDSPContext *adsp, float *coef, | static void clip_coefficients(AudioDSPContext *adsp, float *coef, | ||||
| unsigned int len) | unsigned int len) | ||||
| { | { | ||||
| adsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len); | |||||
| adsp->vector_clipf(coef, coef, len, COEF_MIN, COEF_MAX); | |||||
| } | } | ||||
| @@ -25,8 +25,7 @@ | |||||
| #include "libavcodec/audiodsp.h" | #include "libavcodec/audiodsp.h" | ||||
| #include "audiodsp_arm.h" | #include "audiodsp_arm.h" | ||||
| void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, | |||||
| int len); | |||||
| void ff_vector_clipf_neon(float *dst, const float *src, int len, float min, float max); | |||||
| void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, | void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, | ||||
| int32_t max, unsigned int len); | int32_t max, unsigned int len); | ||||
| @@ -24,9 +24,8 @@ | |||||
| function ff_vector_clipf_neon, export=1 | function ff_vector_clipf_neon, export=1 | ||||
| VFP vdup.32 q1, d0[1] | VFP vdup.32 q1, d0[1] | ||||
| VFP vdup.32 q0, d0[0] | VFP vdup.32 q0, d0[0] | ||||
| NOVFP vdup.32 q0, r2 | |||||
| NOVFP vdup.32 q1, r3 | |||||
| NOVFP ldr r2, [sp] | |||||
| NOVFP vdup.32 q0, r3 | |||||
| NOVFP vld1.32 {d2[],d3[]}, [sp] | |||||
| vld1.f32 {q2},[r1,:128]! | vld1.f32 {q2},[r1,:128]! | ||||
| vmin.f32 q10, q2, q1 | vmin.f32 q10, q2, q1 | ||||
| vld1.f32 {q3},[r1,:128]! | vld1.f32 {q3},[r1,:128]! | ||||
| @@ -55,8 +55,8 @@ static void vector_clipf_c_opposite_sign(float *dst, const float *src, | |||||
| } | } | ||||
| } | } | ||||
| static void vector_clipf_c(float *dst, const float *src, | |||||
| float min, float max, int len) | |||||
| static void vector_clipf_c(float *dst, const float *src, int len, | |||||
| float min, float max) | |||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -48,7 +48,8 @@ typedef struct AudioDSPContext { | |||||
| /* assume len is a multiple of 16, and arrays are 16-byte aligned */ | /* assume len is a multiple of 16, and arrays are 16-byte aligned */ | ||||
| void (*vector_clipf)(float *dst /* align 16 */, | void (*vector_clipf)(float *dst /* align 16 */, | ||||
| const float *src /* align 16 */, | const float *src /* align 16 */, | ||||
| float min, float max, int len /* align 16 */); | |||||
| int len /* align 16 */, | |||||
| float min, float max); | |||||
| } AudioDSPContext; | } AudioDSPContext; | ||||
| void ff_audiodsp_init(AudioDSPContext *c); | void ff_audiodsp_init(AudioDSPContext *c); | ||||
| @@ -882,7 +882,7 @@ static inline void decode_bytes_and_gain(COOKContext *q, COOKSubpacket *p, | |||||
| static void saturate_output_float(COOKContext *q, float *out) | static void saturate_output_float(COOKContext *q, float *out) | ||||
| { | { | ||||
| q->adsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel, | q->adsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel, | ||||
| -1.0f, 1.0f, FFALIGN(q->samples_per_channel, 8)); | |||||
| FFALIGN(q->samples_per_channel, 8), -1.0f, 1.0f); | |||||
| } | } | ||||
| @@ -132,46 +132,45 @@ VECTOR_CLIP_INT32 11, 1, 1, 0 | |||||
| VECTOR_CLIP_INT32 6, 1, 0, 0 | VECTOR_CLIP_INT32 6, 1, 0, 0 | ||||
| %endif | %endif | ||||
| ;----------------------------------------------------- | |||||
| ;void ff_vector_clipf(float *dst, const float *src, | |||||
| ; float min, float max, int len) | |||||
| ;----------------------------------------------------- | |||||
| ; void ff_vector_clipf_sse(float *dst, const float *src, | |||||
| ; int len, float min, float max) | |||||
| INIT_XMM sse | INIT_XMM sse | ||||
| %if UNIX64 | |||||
| cglobal vector_clipf, 3,3,6, dst, src, len | |||||
| %else | |||||
| cglobal vector_clipf, 5,5,6, dst, src, min, max, len | |||||
| %endif | |||||
| %if WIN64 | |||||
| SWAP 0, 2 | |||||
| SWAP 1, 3 | |||||
| %elif ARCH_X86_32 | |||||
| movss m0, minm | |||||
| movss m1, maxm | |||||
| cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max | |||||
| %if ARCH_X86_32 | |||||
| VBROADCASTSS m0, minm | |||||
| VBROADCASTSS m1, maxm | |||||
| %elif WIN64 | |||||
| VBROADCASTSS m0, m3 | |||||
| VBROADCASTSS m1, maxm | |||||
| %else ; 64bit sysv | |||||
| VBROADCASTSS m0, m0 | |||||
| VBROADCASTSS m1, m1 | |||||
| %endif | %endif | ||||
| SPLATD m0 | |||||
| SPLATD m1 | |||||
| shl lend, 2 | |||||
| add srcq, lenq | |||||
| add dstq, lenq | |||||
| neg lenq | |||||
| .loop: | |||||
| mova m2, [srcq+lenq+mmsize*0] | |||||
| mova m3, [srcq+lenq+mmsize*1] | |||||
| mova m4, [srcq+lenq+mmsize*2] | |||||
| mova m5, [srcq+lenq+mmsize*3] | |||||
| maxps m2, m0 | |||||
| maxps m3, m0 | |||||
| maxps m4, m0 | |||||
| maxps m5, m0 | |||||
| minps m2, m1 | |||||
| minps m3, m1 | |||||
| minps m4, m1 | |||||
| minps m5, m1 | |||||
| mova [dstq+lenq+mmsize*0], m2 | |||||
| mova [dstq+lenq+mmsize*1], m3 | |||||
| mova [dstq+lenq+mmsize*2], m4 | |||||
| mova [dstq+lenq+mmsize*3], m5 | |||||
| add lenq, mmsize*4 | |||||
| jl .loop | |||||
| REP_RET | |||||
| movsxdifnidn lenq, lend | |||||
| .loop | |||||
| mova m2, [srcq + 4 * lenq - 4 * mmsize] | |||||
| mova m3, [srcq + 4 * lenq - 3 * mmsize] | |||||
| mova m4, [srcq + 4 * lenq - 2 * mmsize] | |||||
| mova m5, [srcq + 4 * lenq - 1 * mmsize] | |||||
| maxps m2, m0 | |||||
| maxps m3, m0 | |||||
| maxps m4, m0 | |||||
| maxps m5, m0 | |||||
| minps m2, m1 | |||||
| minps m3, m1 | |||||
| minps m4, m1 | |||||
| minps m5, m1 | |||||
| mova [dstq + 4 * lenq - 4 * mmsize], m2 | |||||
| mova [dstq + 4 * lenq - 3 * mmsize], m3 | |||||
| mova [dstq + 4 * lenq - 2 * mmsize], m4 | |||||
| mova [dstq + 4 * lenq - 1 * mmsize], m5 | |||||
| sub lenq, mmsize | |||||
| jg .loop | |||||
| RET | |||||
| @@ -38,7 +38,7 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, | |||||
| void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, | void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, | ||||
| int32_t min, int32_t max, unsigned int len); | int32_t min, int32_t max, unsigned int len); | ||||
| void ff_vector_clipf_sse(float *dst, const float *src, | void ff_vector_clipf_sse(float *dst, const float *src, | ||||
| float min, float max, int len); | |||||
| int len, float min, float max); | |||||
| av_cold void ff_audiodsp_init_x86(AudioDSPContext *c) | av_cold void ff_audiodsp_init_x86(AudioDSPContext *c) | ||||
| { | { | ||||
| @@ -120,7 +120,7 @@ void checkasm_check_audiodsp(void) | |||||
| int i, len; | int i, len; | ||||
| declare_func_emms(AV_CPU_FLAG_MMX, void, float *dst, const float *src, | declare_func_emms(AV_CPU_FLAG_MMX, void, float *dst, const float *src, | ||||
| float min, float max, unsigned int len); | |||||
| int len, float min, float max); | |||||
| val1 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | val1 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | ||||
| val2 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | val2 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | ||||
| @@ -133,13 +133,13 @@ void checkasm_check_audiodsp(void) | |||||
| len = rnd() % 128; | len = rnd() % 128; | ||||
| len = 16 * FFMAX(len, 1); | len = 16 * FFMAX(len, 1); | ||||
| call_ref(dst0, src, min, max, len); | |||||
| call_new(dst1, src, min, max, len); | |||||
| call_ref(dst0, src, len, min, max); | |||||
| call_new(dst1, src, len, min, max); | |||||
| for (i = 0; i < len; i++) { | for (i = 0; i < len; i++) { | ||||
| if (!float_near_ulp_array(dst0, dst1, 3, len)) | if (!float_near_ulp_array(dst0, dst1, 3, len)) | ||||
| fail(); | fail(); | ||||
| } | } | ||||
| bench_new(dst1, src, min, max, MAX_SIZE); | |||||
| bench_new(dst1, src, MAX_SIZE, min, max); | |||||
| } | } | ||||
| report("audiodsp"); | report("audiodsp"); | ||||