Before After
Mean StdDev Mean StdDev Change
This function 1175.0 4.4 366.2 18.3 +220.8%
Overall 19285.5 292.0 18420.5 489.1 +4.7%
Signed-off-by: Martin Storsjö <martin@martin.st>
tags/n2.1
| @@ -28,6 +28,9 @@ | |||||
| void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, | void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, | ||||
| float mul, int len); | float mul, int len); | ||||
| void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, | |||||
| float mul, int len); | |||||
| void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); | void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); | ||||
| void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); | ||||
| @@ -38,6 +41,13 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx | |||||
| int cpu_flags = av_get_cpu_flags(); | int cpu_flags = av_get_cpu_flags(); | ||||
| if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) { | if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) { | ||||
| if (!have_vfpv3(cpu_flags)) { | |||||
| // This function doesn't use anything armv6 specific in itself, | |||||
| // but ff_float_to_int16_vfp which is in the same assembly source | |||||
| // file does, thus the whole file requires armv6 to be built. | |||||
| c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp; | |||||
| } | |||||
| c->float_to_int16 = ff_float_to_int16_vfp; | c->float_to_int16 = ff_float_to_int16_vfp; | ||||
| } | } | ||||
| @@ -1,5 +1,6 @@ | |||||
| /* | /* | ||||
| * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | ||||
| * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org> | |||||
| * | * | ||||
| * This file is part of Libav. | * This file is part of Libav. | ||||
| * | * | ||||
| @@ -76,3 +77,40 @@ function ff_float_to_int16_vfp, export=1 | |||||
| vpop {d8-d11} | vpop {d8-d11} | ||||
| pop {r4-r8,pc} | pop {r4-r8,pc} | ||||
| endfunc | endfunc | ||||
| /** | |||||
| * ARM VFP optimised int32 to float conversion. | |||||
| * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned | |||||
| * (16 bytes alignment is best for BCM2835), little-endian. | |||||
| */ | |||||
| @ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len) | |||||
| function ff_int32_to_float_fmul_scalar_vfp, export=1 | |||||
| VFP tmp .req a4 | |||||
| VFP len .req a3 | |||||
| NOVFP tmp .req a3 | |||||
| NOVFP len .req a4 | |||||
| NOVFP vmov s0, a3 | |||||
| ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1 | |||||
| fmrx ip, FPSCR | |||||
| fmxr FPSCR, tmp | |||||
| 1: | |||||
| vldmia a2!, {s8-s15} | |||||
| vcvt.f32.s32 s8, s8 | |||||
| vcvt.f32.s32 s9, s9 | |||||
| vcvt.f32.s32 s10, s10 | |||||
| vcvt.f32.s32 s11, s11 | |||||
| vcvt.f32.s32 s12, s12 | |||||
| vcvt.f32.s32 s13, s13 | |||||
| vcvt.f32.s32 s14, s14 | |||||
| vcvt.f32.s32 s15, s15 | |||||
| vmul.f32 s8, s8, s0 | |||||
| subs len, len, #8 | |||||
| vstmia a1!, {s8-s11} | |||||
| vstmia a1!, {s12-s15} | |||||
| bne 1b | |||||
| fmxr FPSCR, ip | |||||
| bx lr | |||||
| endfunc | |||||
| .unreq tmp | |||||
| .unreq len | |||||