Signed-off-by: Mans Rullgard <mans@mansr.com>tags/n0.9
| @@ -143,14 +143,6 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, | |||||
| const float *src1, const float *win, int len); | const float *src1, const float *win, int len); | ||||
| void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, | void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, | ||||
| int len); | int len); | ||||
| void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, | |||||
| const float **vp, float mul, int len); | |||||
| void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, | |||||
| const float **vp, float mul, int len); | |||||
| void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, | |||||
| int len); | |||||
| void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, | |||||
| int len); | |||||
| void ff_butterflies_float_neon(float *v1, float *v2, int len); | void ff_butterflies_float_neon(float *v1, float *v2, int len); | ||||
| float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); | float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); | ||||
| void ff_vector_fmul_reverse_neon(float *dst, const float *src0, | void ff_vector_fmul_reverse_neon(float *dst, const float *src0, | ||||
| @@ -320,12 +312,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) | |||||
| c->vector_clipf = ff_vector_clipf_neon; | c->vector_clipf = ff_vector_clipf_neon; | ||||
| c->vector_clip_int32 = ff_vector_clip_int32_neon; | c->vector_clip_int32 = ff_vector_clip_int32_neon; | ||||
| c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; | |||||
| c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; | |||||
| c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; | |||||
| c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; | |||||
| if (CONFIG_VORBIS_DECODER) | if (CONFIG_VORBIS_DECODER) | ||||
| c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; | c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; | ||||
| @@ -587,110 +587,6 @@ NOVFP vdup.32 q8, r2 | |||||
| .unreq len | .unreq len | ||||
| endfunc | endfunc | ||||
| function ff_vector_fmul_sv_scalar_2_neon, export=1 | |||||
| VFP vdup.32 d16, d0[0] | |||||
| NOVFP vdup.32 d16, r3 | |||||
| NOVFP ldr r3, [sp] | |||||
| vld1.32 {d0},[r1,:64]! | |||||
| vld1.32 {d1},[r1,:64]! | |||||
| 1: subs r3, r3, #4 | |||||
| vmul.f32 d4, d0, d16 | |||||
| vmul.f32 d5, d1, d16 | |||||
| ldr r12, [r2], #4 | |||||
| vld1.32 {d2},[r12,:64] | |||||
| ldr r12, [r2], #4 | |||||
| vld1.32 {d3},[r12,:64] | |||||
| vmul.f32 d4, d4, d2 | |||||
| vmul.f32 d5, d5, d3 | |||||
| beq 2f | |||||
| vld1.32 {d0},[r1,:64]! | |||||
| vld1.32 {d1},[r1,:64]! | |||||
| vst1.32 {d4},[r0,:64]! | |||||
| vst1.32 {d5},[r0,:64]! | |||||
| b 1b | |||||
| 2: vst1.32 {d4},[r0,:64]! | |||||
| vst1.32 {d5},[r0,:64]! | |||||
| bx lr | |||||
| endfunc | |||||
| function ff_vector_fmul_sv_scalar_4_neon, export=1 | |||||
| VFP vdup.32 q10, d0[0] | |||||
| NOVFP vdup.32 q10, r3 | |||||
| NOVFP ldr r3, [sp] | |||||
| push {lr} | |||||
| bics lr, r3, #7 | |||||
| beq 3f | |||||
| vld1.32 {q0},[r1,:128]! | |||||
| vld1.32 {q2},[r1,:128]! | |||||
| 1: ldr r12, [r2], #4 | |||||
| vld1.32 {q1},[r12,:128] | |||||
| ldr r12, [r2], #4 | |||||
| vld1.32 {q3},[r12,:128] | |||||
| vmul.f32 q8, q0, q10 | |||||
| vmul.f32 q8, q8, q1 | |||||
| vmul.f32 q9, q2, q10 | |||||
| vmul.f32 q9, q9, q3 | |||||
| subs lr, lr, #8 | |||||
| beq 2f | |||||
| vld1.32 {q0},[r1,:128]! | |||||
| vld1.32 {q2},[r1,:128]! | |||||
| vst1.32 {q8},[r0,:128]! | |||||
| vst1.32 {q9},[r0,:128]! | |||||
| b 1b | |||||
| 2: vst1.32 {q8},[r0,:128]! | |||||
| vst1.32 {q9},[r0,:128]! | |||||
| ands r3, r3, #7 | |||||
| it eq | |||||
| popeq {pc} | |||||
| 3: vld1.32 {q0},[r1,:128]! | |||||
| ldr r12, [r2], #4 | |||||
| vld1.32 {q1},[r12,:128] | |||||
| vmul.f32 q0, q0, q10 | |||||
| vmul.f32 q0, q0, q1 | |||||
| vst1.32 {q0},[r0,:128]! | |||||
| subs r3, r3, #4 | |||||
| bgt 3b | |||||
| pop {pc} | |||||
| endfunc | |||||
| function ff_sv_fmul_scalar_2_neon, export=1 | |||||
| VFP len .req r2 | |||||
| NOVFP len .req r3 | |||||
| VFP vdup.32 q8, d0[0] | |||||
| NOVFP vdup.32 q8, r2 | |||||
| ldr r12, [r1], #4 | |||||
| vld1.32 {d0},[r12,:64] | |||||
| ldr r12, [r1], #4 | |||||
| vld1.32 {d1},[r12,:64] | |||||
| 1: vmul.f32 q1, q0, q8 | |||||
| subs len, len, #4 | |||||
| beq 2f | |||||
| ldr r12, [r1], #4 | |||||
| vld1.32 {d0},[r12,:64] | |||||
| ldr r12, [r1], #4 | |||||
| vld1.32 {d1},[r12,:64] | |||||
| vst1.32 {q1},[r0,:128]! | |||||
| b 1b | |||||
| 2: vst1.32 {q1},[r0,:128]! | |||||
| bx lr | |||||
| .unreq len | |||||
| endfunc | |||||
| function ff_sv_fmul_scalar_4_neon, export=1 | |||||
| VFP len .req r2 | |||||
| NOVFP len .req r3 | |||||
| VFP vdup.32 q8, d0[0] | |||||
| NOVFP vdup.32 q8, r2 | |||||
| 1: ldr r12, [r1], #4 | |||||
| vld1.32 {q0},[r12,:128] | |||||
| vmul.f32 q0, q0, q8 | |||||
| vst1.32 {q0},[r0,:128]! | |||||
| subs len, len, #4 | |||||
| bgt 1b | |||||
| bx lr | |||||
| .unreq len | |||||
| endfunc | |||||
| function ff_butterflies_float_neon, export=1 | function ff_butterflies_float_neon, export=1 | ||||
| 1: vld1.32 {q0},[r0,:128] | 1: vld1.32 {q0},[r0,:128] | ||||
| vld1.32 {q1},[r1,:128] | vld1.32 {q1},[r1,:128] | ||||
| @@ -405,27 +405,6 @@ void ff_put_signed_pixels_clamped_c(const DCTELEM *block, | |||||
| } | } | ||||
| } | } | ||||
| static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels, | |||||
| int line_size) | |||||
| { | |||||
| int i; | |||||
| /* read the pixels */ | |||||
| for(i=0;i<8;i++) { | |||||
| pixels[0] = block[0]; | |||||
| pixels[1] = block[1]; | |||||
| pixels[2] = block[2]; | |||||
| pixels[3] = block[3]; | |||||
| pixels[4] = block[4]; | |||||
| pixels[5] = block[5]; | |||||
| pixels[6] = block[6]; | |||||
| pixels[7] = block[7]; | |||||
| pixels += line_size; | |||||
| block += 8; | |||||
| } | |||||
| } | |||||
| void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, | void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, | ||||
| int line_size) | int line_size) | ||||
| { | { | ||||
| @@ -2492,50 +2471,6 @@ static void vector_fmul_scalar_c(float *dst, const float *src, float mul, | |||||
| dst[i] = src[i] * mul; | dst[i] = src[i] * mul; | ||||
| } | } | ||||
| static void vector_fmul_sv_scalar_2_c(float *dst, const float *src, | |||||
| const float **sv, float mul, int len) | |||||
| { | |||||
| int i; | |||||
| for (i = 0; i < len; i += 2, sv++) { | |||||
| dst[i ] = src[i ] * sv[0][0] * mul; | |||||
| dst[i+1] = src[i+1] * sv[0][1] * mul; | |||||
| } | |||||
| } | |||||
| static void vector_fmul_sv_scalar_4_c(float *dst, const float *src, | |||||
| const float **sv, float mul, int len) | |||||
| { | |||||
| int i; | |||||
| for (i = 0; i < len; i += 4, sv++) { | |||||
| dst[i ] = src[i ] * sv[0][0] * mul; | |||||
| dst[i+1] = src[i+1] * sv[0][1] * mul; | |||||
| dst[i+2] = src[i+2] * sv[0][2] * mul; | |||||
| dst[i+3] = src[i+3] * sv[0][3] * mul; | |||||
| } | |||||
| } | |||||
| static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul, | |||||
| int len) | |||||
| { | |||||
| int i; | |||||
| for (i = 0; i < len; i += 2, sv++) { | |||||
| dst[i ] = sv[0][0] * mul; | |||||
| dst[i+1] = sv[0][1] * mul; | |||||
| } | |||||
| } | |||||
| static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul, | |||||
| int len) | |||||
| { | |||||
| int i; | |||||
| for (i = 0; i < len; i += 4, sv++) { | |||||
| dst[i ] = sv[0][0] * mul; | |||||
| dst[i+1] = sv[0][1] * mul; | |||||
| dst[i+2] = sv[0][2] * mul; | |||||
| dst[i+3] = sv[0][3] * mul; | |||||
| } | |||||
| } | |||||
| static void butterflies_float_c(float *restrict v1, float *restrict v2, | static void butterflies_float_c(float *restrict v1, float *restrict v2, | ||||
| int len) | int len) | ||||
| { | { | ||||
| @@ -2906,7 +2841,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->diff_pixels = diff_pixels_c; | c->diff_pixels = diff_pixels_c; | ||||
| c->put_pixels_clamped = ff_put_pixels_clamped_c; | c->put_pixels_clamped = ff_put_pixels_clamped_c; | ||||
| c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c; | c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c; | ||||
| c->put_pixels_nonclamped = put_pixels_nonclamped_c; | |||||
| c->add_pixels_clamped = ff_add_pixels_clamped_c; | c->add_pixels_clamped = ff_add_pixels_clamped_c; | ||||
| c->sum_abs_dctelem = sum_abs_dctelem_c; | c->sum_abs_dctelem = sum_abs_dctelem_c; | ||||
| c->gmc1 = gmc1_c; | c->gmc1 = gmc1_c; | ||||
| @@ -3088,12 +3022,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->butterflies_float = butterflies_float_c; | c->butterflies_float = butterflies_float_c; | ||||
| c->vector_fmul_scalar = vector_fmul_scalar_c; | c->vector_fmul_scalar = vector_fmul_scalar_c; | ||||
| c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; | |||||
| c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c; | |||||
| c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c; | |||||
| c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c; | |||||
| c->shrink[0]= av_image_copy_plane; | c->shrink[0]= av_image_copy_plane; | ||||
| c->shrink[1]= ff_shrink22; | c->shrink[1]= ff_shrink22; | ||||
| c->shrink[2]= ff_shrink44; | c->shrink[2]= ff_shrink44; | ||||
| @@ -229,7 +229,6 @@ typedef struct DSPContext { | |||||
| void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); | void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); | ||||
| void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | ||||
| void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | ||||
| void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | |||||
| void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | ||||
| void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); | void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); | ||||
| void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); | void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); | ||||
| @@ -423,32 +422,6 @@ typedef struct DSPContext { | |||||
| */ | */ | ||||
| void (*vector_fmul_scalar)(float *dst, const float *src, float mul, | void (*vector_fmul_scalar)(float *dst, const float *src, float mul, | ||||
| int len); | int len); | ||||
| /** | |||||
| * Multiply a vector of floats by concatenated short vectors of | |||||
| * floats and by a scalar float. Source and destination vectors | |||||
| * must overlap exactly or not at all. | |||||
| * [0]: short vectors of length 2, 8-byte aligned | |||||
| * [1]: short vectors of length 4, 16-byte aligned | |||||
| * @param dst output vector, 16-byte aligned | |||||
| * @param src input vector, 16-byte aligned | |||||
| * @param sv array of pointers to short vectors | |||||
| * @param mul scalar value | |||||
| * @param len number of elements in src and dst, multiple of 4 | |||||
| */ | |||||
| void (*vector_fmul_sv_scalar[2])(float *dst, const float *src, | |||||
| const float **sv, float mul, int len); | |||||
| /** | |||||
| * Multiply short vectors of floats by a scalar float, store | |||||
| * concatenated result. | |||||
| * [0]: short vectors of length 2, 8-byte aligned | |||||
| * [1]: short vectors of length 4, 16-byte aligned | |||||
| * @param dst output vector, 16-byte aligned | |||||
| * @param sv array of pointers to short vectors | |||||
| * @param mul scalar value | |||||
| * @param len number of output elements, multiple of 4 | |||||
| */ | |||||
| void (*sv_fmul_scalar[2])(float *dst, const float **sv, | |||||
| float mul, int len); | |||||
| /** | /** | ||||
| * Calculate the scalar product of two vectors of floats. | * Calculate the scalar product of two vectors of floats. | ||||
| * @param v1 first vector, 16-byte aligned | * @param v1 first vector, 16-byte aligned | ||||