* commit '12004a9a7f20e44f4da2ee6c372d5e1794c8d6c5':
  audiodsp/x86: yasmify vector_clipf_sse
  audiodsp: reorder arguments for vector_clipf

Merged the version from Libav after a discussion with James Almer on
IRC:
19:22 <ubitux> jamrial: opinion on 12004a9a7f20e44f4da2ee6c372d5e1794c8d6c5?
19:23 <ubitux> it was apparently yasmified differently
19:23 <ubitux> (it depends on the previous commit arg shuffle)
19:24 <ubitux> i don't see the magic movsxdifnidn in your port btw
19:24 <ubitux> it's a port from 1d36defe94
19:25 <jamrial> seems better thanks to said arg shuffle
19:25 <jamrial> the loop is the same, but init is simpler
19:25 <jamrial> probably worth merging
19:25 <ubitux> OK
19:25 <ubitux> thanks
19:26 <jamrial> curious they didn't make len ptrdiff_t after the previous bunch of commits, heh
19:26 <ubitux> yeah indeed
Both commits are merged at the same time to prevent a conflict with our
existing yasmified ff_vector_clipf_sse.
Merged-by: Clément Bœsch <u@pkh.me>
tags/n3.3
@@ -121,7 +121,7 @@ static void sum_square_butterfly(AC3EncodeContext *s, float sum[4], | |||||
static void clip_coefficients(AudioDSPContext *adsp, float *coef, | static void clip_coefficients(AudioDSPContext *adsp, float *coef, | ||||
unsigned int len) | unsigned int len) | ||||
{ | { | ||||
adsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len); | |||||
adsp->vector_clipf(coef, coef, len, COEF_MIN, COEF_MAX); | |||||
} | } | ||||
@@ -25,8 +25,7 @@ | |||||
#include "libavcodec/audiodsp.h" | #include "libavcodec/audiodsp.h" | ||||
#include "audiodsp_arm.h" | #include "audiodsp_arm.h" | ||||
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, | |||||
int len); | |||||
void ff_vector_clipf_neon(float *dst, const float *src, int len, float min, float max); | |||||
void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, | void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, | ||||
int32_t max, unsigned int len); | int32_t max, unsigned int len); | ||||
@@ -24,9 +24,8 @@ | |||||
function ff_vector_clipf_neon, export=1 | function ff_vector_clipf_neon, export=1 | ||||
VFP vdup.32 q1, d0[1] | VFP vdup.32 q1, d0[1] | ||||
VFP vdup.32 q0, d0[0] | VFP vdup.32 q0, d0[0] | ||||
NOVFP vdup.32 q0, r2 | |||||
NOVFP vdup.32 q1, r3 | |||||
NOVFP ldr r2, [sp] | |||||
NOVFP vdup.32 q0, r3 | |||||
NOVFP vld1.32 {d2[],d3[]}, [sp] | |||||
vld1.f32 {q2},[r1,:128]! | vld1.f32 {q2},[r1,:128]! | ||||
vmin.f32 q10, q2, q1 | vmin.f32 q10, q2, q1 | ||||
vld1.f32 {q3},[r1,:128]! | vld1.f32 {q3},[r1,:128]! | ||||
@@ -55,8 +55,8 @@ static void vector_clipf_c_opposite_sign(float *dst, const float *src, | |||||
} | } | ||||
} | } | ||||
static void vector_clipf_c(float *dst, const float *src, | |||||
float min, float max, int len) | |||||
static void vector_clipf_c(float *dst, const float *src, int len, | |||||
float min, float max) | |||||
{ | { | ||||
int i; | int i; | ||||
@@ -48,7 +48,8 @@ typedef struct AudioDSPContext { | |||||
/* assume len is a multiple of 16, and arrays are 16-byte aligned */ | /* assume len is a multiple of 16, and arrays are 16-byte aligned */ | ||||
void (*vector_clipf)(float *dst /* align 16 */, | void (*vector_clipf)(float *dst /* align 16 */, | ||||
const float *src /* align 16 */, | const float *src /* align 16 */, | ||||
float min, float max, int len /* align 16 */); | |||||
int len /* align 16 */, | |||||
float min, float max); | |||||
} AudioDSPContext; | } AudioDSPContext; | ||||
void ff_audiodsp_init(AudioDSPContext *c); | void ff_audiodsp_init(AudioDSPContext *c); | ||||
@@ -882,7 +882,7 @@ static inline void decode_bytes_and_gain(COOKContext *q, COOKSubpacket *p, | |||||
static void saturate_output_float(COOKContext *q, float *out) | static void saturate_output_float(COOKContext *q, float *out) | ||||
{ | { | ||||
q->adsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel, | q->adsp.vector_clipf(out, q->mono_mdct_output + q->samples_per_channel, | ||||
-1.0f, 1.0f, FFALIGN(q->samples_per_channel, 8)); | |||||
FFALIGN(q->samples_per_channel, 8), -1.0f, 1.0f); | |||||
} | } | ||||
@@ -132,46 +132,45 @@ VECTOR_CLIP_INT32 11, 1, 1, 0 | |||||
VECTOR_CLIP_INT32 6, 1, 0, 0 | VECTOR_CLIP_INT32 6, 1, 0, 0 | ||||
%endif | %endif | ||||
;----------------------------------------------------- | |||||
;void ff_vector_clipf(float *dst, const float *src, | |||||
; float min, float max, int len) | |||||
;----------------------------------------------------- | |||||
; void ff_vector_clipf_sse(float *dst, const float *src, | |||||
; int len, float min, float max) | |||||
INIT_XMM sse | INIT_XMM sse | ||||
%if UNIX64 | |||||
cglobal vector_clipf, 3,3,6, dst, src, len | |||||
%else | |||||
cglobal vector_clipf, 5,5,6, dst, src, min, max, len | |||||
%endif | |||||
%if WIN64 | |||||
SWAP 0, 2 | |||||
SWAP 1, 3 | |||||
%elif ARCH_X86_32 | |||||
movss m0, minm | |||||
movss m1, maxm | |||||
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max | |||||
%if ARCH_X86_32 | |||||
VBROADCASTSS m0, minm | |||||
VBROADCASTSS m1, maxm | |||||
%elif WIN64 | |||||
VBROADCASTSS m0, m3 | |||||
VBROADCASTSS m1, maxm | |||||
%else ; 64bit sysv | |||||
VBROADCASTSS m0, m0 | |||||
VBROADCASTSS m1, m1 | |||||
%endif | %endif | ||||
SPLATD m0 | |||||
SPLATD m1 | |||||
shl lend, 2 | |||||
add srcq, lenq | |||||
add dstq, lenq | |||||
neg lenq | |||||
.loop: | |||||
mova m2, [srcq+lenq+mmsize*0] | |||||
mova m3, [srcq+lenq+mmsize*1] | |||||
mova m4, [srcq+lenq+mmsize*2] | |||||
mova m5, [srcq+lenq+mmsize*3] | |||||
maxps m2, m0 | |||||
maxps m3, m0 | |||||
maxps m4, m0 | |||||
maxps m5, m0 | |||||
minps m2, m1 | |||||
minps m3, m1 | |||||
minps m4, m1 | |||||
minps m5, m1 | |||||
mova [dstq+lenq+mmsize*0], m2 | |||||
mova [dstq+lenq+mmsize*1], m3 | |||||
mova [dstq+lenq+mmsize*2], m4 | |||||
mova [dstq+lenq+mmsize*3], m5 | |||||
add lenq, mmsize*4 | |||||
jl .loop | |||||
REP_RET | |||||
movsxdifnidn lenq, lend | |||||
.loop | |||||
mova m2, [srcq + 4 * lenq - 4 * mmsize] | |||||
mova m3, [srcq + 4 * lenq - 3 * mmsize] | |||||
mova m4, [srcq + 4 * lenq - 2 * mmsize] | |||||
mova m5, [srcq + 4 * lenq - 1 * mmsize] | |||||
maxps m2, m0 | |||||
maxps m3, m0 | |||||
maxps m4, m0 | |||||
maxps m5, m0 | |||||
minps m2, m1 | |||||
minps m3, m1 | |||||
minps m4, m1 | |||||
minps m5, m1 | |||||
mova [dstq + 4 * lenq - 4 * mmsize], m2 | |||||
mova [dstq + 4 * lenq - 3 * mmsize], m3 | |||||
mova [dstq + 4 * lenq - 2 * mmsize], m4 | |||||
mova [dstq + 4 * lenq - 1 * mmsize], m5 | |||||
sub lenq, mmsize | |||||
jg .loop | |||||
RET |
@@ -38,7 +38,7 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, | |||||
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, | void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, | ||||
int32_t min, int32_t max, unsigned int len); | int32_t min, int32_t max, unsigned int len); | ||||
void ff_vector_clipf_sse(float *dst, const float *src, | void ff_vector_clipf_sse(float *dst, const float *src, | ||||
float min, float max, int len); | |||||
int len, float min, float max); | |||||
av_cold void ff_audiodsp_init_x86(AudioDSPContext *c) | av_cold void ff_audiodsp_init_x86(AudioDSPContext *c) | ||||
{ | { | ||||
@@ -120,7 +120,7 @@ void checkasm_check_audiodsp(void) | |||||
int i, len; | int i, len; | ||||
declare_func_emms(AV_CPU_FLAG_MMX, void, float *dst, const float *src, | declare_func_emms(AV_CPU_FLAG_MMX, void, float *dst, const float *src, | ||||
float min, float max, unsigned int len); | |||||
int len, float min, float max); | |||||
val1 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | val1 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | ||||
val2 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | val2 = (float)rnd() / (UINT_MAX >> 1) - 1.0f; | ||||
@@ -133,13 +133,13 @@ void checkasm_check_audiodsp(void) | |||||
len = rnd() % 128; | len = rnd() % 128; | ||||
len = 16 * FFMAX(len, 1); | len = 16 * FFMAX(len, 1); | ||||
call_ref(dst0, src, min, max, len); | |||||
call_new(dst1, src, min, max, len); | |||||
call_ref(dst0, src, len, min, max); | |||||
call_new(dst1, src, len, min, max); | |||||
for (i = 0; i < len; i++) { | for (i = 0; i < len; i++) { | ||||
if (!float_near_ulp_array(dst0, dst1, 3, len)) | if (!float_near_ulp_array(dst0, dst1, 3, len)) | ||||
fail(); | fail(); | ||||
} | } | ||||
bench_new(dst1, src, min, max, MAX_SIZE); | |||||
bench_new(dst1, src, MAX_SIZE, min, max); | |||||
} | } | ||||
report("audiodsp"); | report("audiodsp"); | ||||