Signed-off-by: Mans Rullgard <mans@mansr.com>
tags/n0.9
@@ -967,6 +967,7 @@ CONFIG_LIST=" | |||
static | |||
swscale | |||
swscale_alpha | |||
thumb | |||
vaapi | |||
vdpau | |||
version3 | |||
@@ -2607,7 +2608,7 @@ if enabled alpha; then | |||
elif enabled arm; then | |||
check_cflags -marm | |||
enabled thumb && check_cflags -mthumb || check_cflags -marm | |||
nogas=die | |||
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then | |||
@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx, | |||
"vmov d1, %2, %3 \n\t" | |||
"lsls %6, %6, #1 \n\t" | |||
"and %0, %5, #1<<31 \n\t" | |||
"it cs \n\t" | |||
"lslcs %5, %5, #1 \n\t" | |||
"lsls %6, %6, #1 \n\t" | |||
"and %1, %5, #1<<31 \n\t" | |||
"it cs \n\t" | |||
"lslcs %5, %5, #1 \n\t" | |||
"lsls %6, %6, #1 \n\t" | |||
"and %2, %5, #1<<31 \n\t" | |||
"it cs \n\t" | |||
"lslcs %5, %5, #1 \n\t" | |||
"vmov d4, %0, %1 \n\t" | |||
"and %3, %5, #1<<31 \n\t" | |||
@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1 | |||
lsl r3, lr, #1 | |||
ldrh r12, [r0, r3] | |||
subs r2, r2, #1 | |||
it gt | |||
ldrbgt lr, [r1], #1 | |||
add r12, r12, #1 | |||
strh r12, [r0, r3] | |||
@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 | |||
mov r11, r10 | |||
ldrb r10, [r4], #1 @ band_start_tab[band++] | |||
subs r9, r9, r5 @ - floor | |||
it lt | |||
movlt r9, #0 | |||
cmp r10, r3 @ - end | |||
and r9, r9, r8 @ & 0x1fe0 | |||
ite gt | |||
subgt r8, r3, r11 | |||
suble r8, r10, r11 | |||
add r9, r9, r5 @ + floor => m | |||
@@ -41,6 +41,7 @@ endfunc | |||
function ff_ac3_exponent_min_neon, export=1 | |||
cmp r1, #0 | |||
it eq | |||
bxeq lr | |||
push {lr} | |||
mov r12, #256 | |||
@@ -24,9 +24,18 @@ | |||
# define ELF | |||
#else | |||
# define ELF @ | |||
#endif | |||
#if CONFIG_THUMB | |||
# define A @ | |||
# define T | |||
#else | |||
# define A | |||
# define T @ | |||
#endif | |||
.syntax unified | |||
T .thumb | |||
.macro require8 val=1 | |||
ELF .eabi_attribute 24, \val | |||
@@ -82,6 +91,90 @@ ELF .size \name, . - \name | |||
#endif | |||
.endm | |||
/*
 * ldr_pre rt, rn, rm...
 * Load the word at [rn + rm] into rt and leave rn holding that address.
 * The A-prefixed line is kept for ARM builds and is a single pre-indexed
 * load with writeback; the T-prefixed lines are kept for Thumb builds and
 * perform the address update and the load as two instructions (see the
 * CONFIG_THUMB defines of A and T).
 * rm is declared vararg, so it receives the rest of the argument list.
 * The T-side add has no s suffix, i.e. it is not a flag-setting form.
 * NOTE(review): presumably split because Thumb lacks this register-offset
 * writeback form - confirm against the architecture manual.
 */
.macro ldr_pre rt, rn, rm:vararg | |||
A ldr \rt, [\rn, \rm]! | |||
T add \rn, \rn, \rm | |||
T ldr \rt, [\rn] | |||
.endm | |||
/*
 * ldr_post rt, rn, rm...
 * Load the word at [rn] into rt, then advance rn by rm.
 * ARM builds (A line) use one post-indexed load; Thumb builds (T lines)
 * load first and then add rm to rn with a non-flag-setting add.
 * rm is declared vararg, so it receives the rest of the argument list.
 */
.macro ldr_post rt, rn, rm:vararg | |||
A ldr \rt, [\rn], \rm | |||
T ldr \rt, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * ldrd_reg rt, rt2, rn, rm
 * Load the doubleword at [rn + rm] into rt/rt2. No writeback is requested:
 * rn is not written by the macro itself.
 * ARM builds (A line) use the register-offset form directly. Thumb builds
 * (T lines) first form the address in rt, which is about to be overwritten
 * by the load anyway, and then load through rt, so no extra scratch
 * register is needed.
 */
.macro ldrd_reg rt, rt2, rn, rm | |||
A ldrd \rt, \rt2, [\rn, \rm] | |||
T add \rt, \rn, \rm | |||
T ldrd \rt, \rt2, [\rt] | |||
.endm | |||
/*
 * ldrd_post rt, rt2, rn, rm
 * Load the doubleword at [rn] into rt/rt2, then advance rn by rm.
 * ARM builds (A line) use one post-indexed ldrd; Thumb builds (T lines)
 * load first and then add rm to rn with a non-flag-setting add.
 */
.macro ldrd_post rt, rt2, rn, rm | |||
A ldrd \rt, \rt2, [\rn], \rm | |||
T ldrd \rt, \rt2, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * ldrh_pre rt, rn, rm
 * Advance rn by rm, then load the halfword at the new [rn] into rt.
 * ARM builds (A line) use one pre-indexed load with writeback; Thumb
 * builds (T lines) do the add (non-flag-setting) and the load separately.
 */
.macro ldrh_pre rt, rn, rm | |||
A ldrh \rt, [\rn, \rm]! | |||
T add \rn, \rn, \rm | |||
T ldrh \rt, [\rn] | |||
.endm | |||
/*
 * ldrh_dpre rt, rn, rm
 * Decrementing variant of ldrh_pre: step rn back by rm, then load the
 * halfword at the new [rn] into rt.
 * ARM builds (A line) use a negative register offset with writeback;
 * Thumb builds (T lines) use a non-flag-setting sub followed by the load.
 */
.macro ldrh_dpre rt, rn, rm | |||
A ldrh \rt, [\rn, -\rm]! | |||
T sub \rn, \rn, \rm | |||
T ldrh \rt, [\rn] | |||
.endm | |||
/*
 * ldrh_post rt, rn, rm
 * Load the halfword at [rn] into rt, then advance rn by rm.
 * ARM builds (A line) use one post-indexed load; Thumb builds (T lines)
 * load first and then add rm to rn with a non-flag-setting add.
 */
.macro ldrh_post rt, rn, rm | |||
A ldrh \rt, [\rn], \rm | |||
T ldrh \rt, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * str_post rt, rn, rm...
 * Store the word in rt to [rn], then advance rn by rm.
 * ARM builds (A line) use one post-indexed store; Thumb builds (T lines)
 * store first and then add rm to rn with a non-flag-setting add.
 * rm is declared vararg, so it receives the rest of the argument list.
 */
.macro str_post rt, rn, rm:vararg | |||
A str \rt, [\rn], \rm | |||
T str \rt, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * strb_post rt, rn, rm...
 * Store the low byte of rt to [rn], then advance rn by rm.
 * ARM builds (A line) use one post-indexed store; Thumb builds (T lines)
 * store first and then add rm to rn with a non-flag-setting add.
 * rm is declared vararg, so it receives the rest of the argument list.
 */
.macro strb_post rt, rn, rm:vararg | |||
A strb \rt, [\rn], \rm | |||
T strb \rt, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * strd_post rt, rt2, rn, rm
 * Store the doubleword in rt/rt2 to [rn], then advance rn by rm.
 * ARM builds (A line) use one post-indexed strd; Thumb builds (T lines)
 * store first and then add rm to rn. The add is not a flag-setting form,
 * which matters to callers that place this macro between a subs and the
 * conditional branch that consumes its flags.
 */
.macro strd_post rt, rt2, rn, rm | |||
A strd \rt, \rt2, [\rn], \rm | |||
T strd \rt, \rt2, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * strh_pre rt, rn, rm
 * Advance rn by rm, then store the low halfword of rt to the new [rn].
 * ARM builds (A line) use one pre-indexed store with writeback; Thumb
 * builds (T lines) do the add (non-flag-setting) and the store separately.
 */
.macro strh_pre rt, rn, rm | |||
A strh \rt, [\rn, \rm]! | |||
T add \rn, \rn, \rm | |||
T strh \rt, [\rn] | |||
.endm | |||
/*
 * strh_dpre rt, rn, rm
 * Decrementing variant of strh_pre: step rn back by rm, then store the
 * low halfword of rt to the new [rn].
 * ARM builds (A line) use a negative register offset with writeback;
 * Thumb builds (T lines) use a non-flag-setting sub followed by the store.
 */
.macro strh_dpre rt, rn, rm | |||
A strh \rt, [\rn, -\rm]! | |||
T sub \rn, \rn, \rm | |||
T strh \rt, [\rn] | |||
.endm | |||
/*
 * strh_post rt, rn, rm
 * Store the low halfword of rt to [rn], then advance rn by rm.
 * ARM builds (A line) use one post-indexed store; Thumb builds (T lines)
 * store first and then add rm to rn with a non-flag-setting add.
 */
.macro strh_post rt, rn, rm | |||
A strh \rt, [\rn], \rm | |||
T strh \rt, [\rn] | |||
T add \rn, \rn, \rm | |||
.endm | |||
/*
 * strh_dpost rt, rn, rm
 * Decrementing variant of strh_post: store the low halfword of rt to
 * [rn], then step rn back by rm.
 * ARM builds (A line) use a post-indexed store with a negative register
 * offset; Thumb builds (T lines) store first and then subtract rm from rn
 * with a non-flag-setting sub.
 */
.macro strh_dpost rt, rn, rm | |||
A strh \rt, [\rn], -\rm | |||
T strh \rt, [\rn] | |||
T sub \rn, \rn, \rm | |||
.endm | |||
#if HAVE_VFP_ARGS | |||
.eabi_attribute 28, 1 | |||
# define VFP | |||
@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1 | |||
add r5, r2, #256*4-16 @ cf1 | |||
sub r1, r1, #12 | |||
cmp r3, #32 | |||
ite eq | |||
moveq r6, #256/32 | |||
movne r6, #256/64 | |||
NOVFP vldr s0, [sp, #16] @ scale | |||
@@ -554,10 +554,12 @@ endfunc | |||
and r9, r5, r14 | |||
and r10, r6, r14 | |||
and r11, r7, r14 | |||
it eq | |||
andeq r14, r14, r14, \rnd #1 | |||
add r8, r8, r10 | |||
add r9, r9, r11 | |||
ldr r12, =0xfcfcfcfc >> 2 | |||
itt eq | |||
addeq r8, r8, r14 | |||
addeq r9, r9, r14 | |||
and r4, r12, r4, lsr #2 | |||
@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
mvn r5, r5 | |||
mvn r7, r7 | |||
tst r6, #0x100 | |||
it ne | |||
movne r6, r5, lsr #24 | |||
tst r8, #0x100 | |||
it ne | |||
movne r8, r7, lsr #24 | |||
mov r9, r6 | |||
ldrsh r5, [r0, #4] /* moved from [A] */ | |||
@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
mvn r5, r5 | |||
mvn r7, r7 | |||
tst r6, #0x100 | |||
it ne | |||
movne r6, r5, lsr #24 | |||
tst r8, #0x100 | |||
it ne | |||
movne r8, r7, lsr #24 | |||
orr r9, r9, r6, lsl #16 | |||
ldr r4, [r1, #4] /* moved from [B] */ | |||
@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
mvn r5, r5 | |||
mvn r7, r7 | |||
tst r6, #0x100 | |||
it ne | |||
movne r6, r5, lsr #24 | |||
tst r8, #0x100 | |||
it ne | |||
movne r8, r7, lsr #24 | |||
mov r9, r6 | |||
ldrsh r5, [r0, #12] /* moved from [D] */ | |||
@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
mvn r5, r5 | |||
mvn r7, r7 | |||
tst r6, #0x100 | |||
it ne | |||
movne r6, r5, lsr #24 | |||
tst r8, #0x100 | |||
it ne | |||
movne r8, r7, lsr #24 | |||
orr r9, r9, r6, lsl #16 | |||
add r0, r0, #16 /* moved from [E] */ | |||
@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1 | |||
ldr r5, [r1, #4] | |||
ldr r6, [r1, #8] | |||
ldr r7, [r1, #12] | |||
ldr r4, [r1], r2 | |||
ldr_post r4, r1, r2 | |||
strd r6, r7, [r0, #8] | |||
ldr r9, [r1, #4] | |||
strd r4, r5, [r0], r2 | |||
strd_post r4, r5, r0, r2 | |||
ldr r10, [r1, #8] | |||
ldr r11, [r1, #12] | |||
ldr r8, [r1], r2 | |||
ldr_post r8, r1, r2 | |||
strd r10, r11, [r0, #8] | |||
subs r3, r3, #2 | |||
strd r8, r9, [r0], r2 | |||
strd_post r8, r9, r0, r2 | |||
bne 1b | |||
pop {r4-r11} | |||
@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1 | |||
push {r4-r7} | |||
1: | |||
ldr r5, [r1, #4] | |||
ldr r4, [r1], r2 | |||
ldr_post r4, r1, r2 | |||
ldr r7, [r1, #4] | |||
strd r4, r5, [r0], r2 | |||
ldr r6, [r1], r2 | |||
strd_post r4, r5, r0, r2 | |||
ldr_post r6, r1, r2 | |||
subs r3, r3, #2 | |||
strd r6, r7, [r0], r2 | |||
strd_post r6, r7, r0, r2 | |||
bne 1b | |||
pop {r4-r7} | |||
@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1 | |||
ldr r5, [r1, #4] | |||
ldr r7, [r1, #5] | |||
lsr r6, r4, #8 | |||
ldr r8, [r1, r2]! | |||
ldr_pre r8, r1, r2 | |||
orr r6, r6, r5, lsl #24 | |||
ldr r9, [r1, #4] | |||
ldr r11, [r1, #5] | |||
@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1 | |||
uhadd8 r9, r9, r11 | |||
and r6, r6, r12 | |||
uadd8 r8, r8, r14 | |||
strd r4, r5, [r0], r2 | |||
strd_post r4, r5, r0, r2 | |||
uadd8 r9, r9, r6 | |||
strd r8, r9, [r0], r2 | |||
strd_post r8, r9, r0, r2 | |||
bne 1b | |||
pop {r4-r11, pc} | |||
@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
orr r12, r12, r12, lsl #16 | |||
ldr r4, [r1] | |||
ldr r5, [r1, #4] | |||
ldr r6, [r1, r2]! | |||
ldr_pre r6, r1, r2 | |||
ldr r7, [r1, #4] | |||
1: | |||
subs r3, r3, #2 | |||
@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
uhadd8 r9, r5, r7 | |||
eor r11, r5, r7 | |||
and r10, r10, r12 | |||
ldr r4, [r1, r2]! | |||
ldr_pre r4, r1, r2 | |||
uadd8 r8, r8, r10 | |||
and r11, r11, r12 | |||
uadd8 r9, r9, r11 | |||
@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
eor r7, r5, r7 | |||
uadd8 r10, r10, r6 | |||
and r7, r7, r12 | |||
ldr r6, [r1, r2]! | |||
ldr_pre r6, r1, r2 | |||
uadd8 r11, r11, r7 | |||
strd r8, r9, [r0], r2 | |||
strd_post r8, r9, r0, r2 | |||
ldr r7, [r1, #4] | |||
strd r10, r11, [r0], r2 | |||
strd_post r10, r11, r0, r2 | |||
bne 1b | |||
pop {r4-r11} | |||
@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1 | |||
ldr r4, [r1] | |||
ldr r5, [r1, #4] | |||
ldr r7, [r1, #5] | |||
ldr r8, [r1, r2]! | |||
ldr_pre r8, r1, r2 | |||
ldr r9, [r1, #4] | |||
ldr r14, [r1, #5] | |||
add r1, r1, r2 | |||
@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1 | |||
push {r4-r9, lr} | |||
ldr r4, [r1] | |||
ldr r5, [r1, #4] | |||
ldr r6, [r1, r2]! | |||
ldr_pre r6, r1, r2 | |||
ldr r7, [r1, #4] | |||
1: | |||
subs r3, r3, #2 | |||
uhadd8 r8, r4, r6 | |||
ldr r4, [r1, r2]! | |||
ldr_pre r4, r1, r2 | |||
uhadd8 r9, r5, r7 | |||
ldr r5, [r1, #4] | |||
uhadd8 r12, r4, r6 | |||
ldr r6, [r1, r2]! | |||
ldr_pre r6, r1, r2 | |||
uhadd8 r14, r5, r7 | |||
ldr r7, [r1, #4] | |||
stm r0, {r8,r9} | |||
@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1 | |||
orr lr, lr, lr, lsl #16 | |||
ldrd r4, r5, [r0] | |||
ldr r10, [r1, #4] | |||
ldr r9, [r1], r2 | |||
ldr_post r9, r1, r2 | |||
subs r3, r3, #2 | |||
1: | |||
pld [r1, r2] | |||
eor r8, r4, r9 | |||
uhadd8 r4, r4, r9 | |||
eor r12, r5, r10 | |||
ldrd r6, r7, [r0, r2] | |||
ldrd_reg r6, r7, r0, r2 | |||
uhadd8 r5, r5, r10 | |||
and r8, r8, lr | |||
ldr r10, [r1, #4] | |||
and r12, r12, lr | |||
uadd8 r4, r4, r8 | |||
ldr r9, [r1], r2 | |||
ldr_post r9, r1, r2 | |||
eor r8, r6, r9 | |||
uadd8 r5, r5, r12 | |||
pld [r1, r2, lsl #1] | |||
eor r12, r7, r10 | |||
uhadd8 r6, r6, r9 | |||
strd r4, r5, [r0], r2 | |||
strd_post r4, r5, r0, r2 | |||
uhadd8 r7, r7, r10 | |||
beq 2f | |||
and r8, r8, lr | |||
ldrd r4, r5, [r0, r2] | |||
ldrd_reg r4, r5, r0, r2 | |||
uadd8 r6, r6, r8 | |||
ldr r10, [r1, #4] | |||
and r12, r12, lr | |||
subs r3, r3, #2 | |||
uadd8 r7, r7, r12 | |||
ldr r9, [r1], r2 | |||
strd r6, r7, [r0], r2 | |||
ldr_post r9, r1, r2 | |||
strd_post r6, r7, r0, r2 | |||
b 1b | |||
2: | |||
and r8, r8, lr | |||
and r12, r12, lr | |||
uadd8 r6, r6, r8 | |||
uadd8 r7, r7, r12 | |||
strd r6, r7, [r0], r2 | |||
strd_post r6, r7, r0, r2 | |||
pop {r4-r10, pc} | |||
endfunc | |||
@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1 | |||
orr r6, r8, r5, lsl #8 | |||
orr r7, r4, lr, lsl #8 | |||
subs r3, r3, #1 | |||
strd r6, r7, [r1], r2 | |||
strd_post r6, r7, r1, r2 | |||
bgt 1b | |||
pop {r4-r8,pc} | |||
endfunc | |||
@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1 | |||
push {r4-r8, lr} | |||
mov lr, #8 | |||
1: | |||
ldrd r4, r5, [r1], r2 | |||
ldrd_post r4, r5, r1, r2 | |||
subs lr, lr, #1 | |||
uxtb16 r6, r4 | |||
uxtb16 r4, r4, ror #8 | |||
@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1 | |||
push {r4-r9, lr} | |||
mov lr, #8 | |||
1: | |||
ldrd r4, r5, [r1], r3 | |||
ldrd r6, r7, [r2], r3 | |||
ldrd_post r4, r5, r1, r3 | |||
ldrd_post r6, r7, r2, r3 | |||
uxtb16 r8, r4 | |||
uxtb16 r4, r4, ror #8 | |||
uxtb16 r9, r6 | |||
@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1 | |||
push {r4-r9, lr} | |||
mov r0, #0 | |||
mov lr, #0 | |||
ldrd r4, r5, [r1], r3 | |||
ldrd_post r4, r5, r1, r3 | |||
1: | |||
subs r12, r12, #2 | |||
ldr r7, [r2, #4] | |||
ldr r6, [r2], r3 | |||
ldrd r8, r9, [r1], r3 | |||
ldr_post r6, r2, r3 | |||
ldrd_post r8, r9, r1, r3 | |||
usada8 r0, r4, r6, r0 | |||
pld [r2, r3] | |||
usada8 lr, r5, r7, lr | |||
ldr r7, [r2, #4] | |||
ldr r6, [r2], r3 | |||
ldr_post r6, r2, r3 | |||
beq 2f | |||
ldrd r4, r5, [r1], r3 | |||
ldrd_post r4, r5, r1, r3 | |||
usada8 r0, r8, r6, r0 | |||
pld [r2, r3] | |||
usada8 lr, r9, r7, lr | |||
@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1 | |||
ldr r7, [r0, #12] | |||
usada8 r2, r6, lr, r2 | |||
beq 2f | |||
ldr r4, [r0, r1]! | |||
ldr_pre r4, r0, r1 | |||
usada8 r3, r7, lr, r3 | |||
bgt 1b | |||
2: | |||
@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1 | |||
2: vst1.32 {d2-d3}, [r3, :128]! | |||
vst1.32 {d0-d1}, [r12,:128]! | |||
it lt | |||
bxlt lr | |||
3: vld1.32 {d2-d3}, [r1,:128] | |||
@@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2 | |||
2: vst1.32 {q2},[r0,:128]! | |||
vst1.32 {q3},[r0,:128]! | |||
ands len, len, #15 | |||
it eq | |||
bxeq lr | |||
3: vld1.32 {q0},[r1,:128]! | |||
vmul.f32 q0, q0, q8 | |||
@@ -638,6 +640,7 @@ NOVFP ldr r3, [sp] | |||
2: vst1.32 {q8},[r0,:128]! | |||
vst1.32 {q9},[r0,:128]! | |||
ands r3, r3, #7 | |||
it eq | |||
popeq {pc} | |||
3: vld1.32 {q0},[r1,:128]! | |||
ldr r12, [r2], #4 | |||
@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1 | |||
1: | |||
subs r3, r3, #16 | |||
vmul.f32 s12, s4, s12 | |||
itttt ge | |||
vldmiage r1!, {s16-s19} | |||
vldmiage r2!, {s24-s27} | |||
vldmiage r1!, {s20-s23} | |||
vldmiage r2!, {s28-s31} | |||
it ge | |||
vmulge.f32 s24, s16, s24 | |||
vstmia r0!, {s8-s11} | |||
vstmia r0!, {s12-s15} | |||
it ge | |||
vmulge.f32 s28, s20, s28 | |||
itttt gt | |||
vldmiagt r1!, {s0-s3} | |||
vldmiagt r2!, {s8-s11} | |||
vldmiagt r1!, {s4-s7} | |||
vldmiagt r2!, {s12-s15} | |||
ittt ge | |||
vmulge.f32 s8, s0, s8 | |||
vstmiage r0!, {s24-s27} | |||
vstmiage r0!, {s28-s31} | |||
@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1 | |||
vmul.f32 s11, s0, s11 | |||
1: | |||
subs r3, r3, #16 | |||
it ge | |||
vldmdbge r2!, {s16-s19} | |||
vmul.f32 s12, s7, s12 | |||
it ge | |||
vldmiage r1!, {s24-s27} | |||
vmul.f32 s13, s6, s13 | |||
it ge | |||
vldmdbge r2!, {s20-s23} | |||
vmul.f32 s14, s5, s14 | |||
it ge | |||
vldmiage r1!, {s28-s31} | |||
vmul.f32 s15, s4, s15 | |||
it ge | |||
vmulge.f32 s24, s19, s24 | |||
it gt | |||
vldmdbgt r2!, {s0-s3} | |||
it ge | |||
vmulge.f32 s25, s18, s25 | |||
vstmia r0!, {s8-s13} | |||
it ge | |||
vmulge.f32 s26, s17, s26 | |||
it gt | |||
vldmiagt r1!, {s8-s11} | |||
itt ge | |||
vmulge.f32 s27, s16, s27 | |||
vmulge.f32 s28, s23, s28 | |||
it gt | |||
vldmdbgt r2!, {s4-s7} | |||
it ge | |||
vmulge.f32 s29, s22, s29 | |||
vstmia r0!, {s14-s15} | |||
ittt ge | |||
vmulge.f32 s30, s21, s30 | |||
vmulge.f32 s31, s20, s31 | |||
vmulge.f32 s8, s3, s8 | |||
it gt | |||
vldmiagt r1!, {s12-s15} | |||
itttt ge | |||
vmulge.f32 s9, s2, s9 | |||
vmulge.f32 s10, s1, s10 | |||
vstmiage r0!, {s24-s27} | |||
vmulge.f32 s11, s0, s11 | |||
it ge | |||
vstmiage r0!, {s28-s31} | |||
bgt 1b | |||
@@ -71,6 +71,7 @@ endfunc | |||
function ff_float_to_int16_interleave_neon, export=1 | |||
cmp r3, #2 | |||
itt lt | |||
ldrlt r1, [r1] | |||
blt ff_float_to_int16_neon | |||
bne 4f | |||
@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
vst1.64 {d3}, [r8], ip | |||
vst1.64 {d7}, [r8], ip | |||
subs r3, r3, #4 | |||
it eq | |||
popeq {r4-r8,pc} | |||
cmp r3, #4 | |||
add r0, r0, #8 | |||
@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
vst1.32 {d23[1]}, [r8], ip | |||
8: subs r3, r3, #2 | |||
add r0, r0, #4 | |||
it eq | |||
popeq {r4-r8,pc} | |||
@ 1 channel | |||
@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
vst1.16 {d2[3]}, [r5,:16], ip | |||
vst1.16 {d3[1]}, [r5,:16], ip | |||
vst1.16 {d3[3]}, [r5,:16], ip | |||
it eq | |||
popeq {r4-r8,pc} | |||
vld1.64 {d0-d1}, [r4,:128]! | |||
vcvt.s32.f32 q0, q0, #16 | |||
@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1 | |||
vmov r5, r6, s2, s3 | |||
vmov r7, r8, s4, s5 | |||
vmov ip, lr, s6, s7 | |||
it gt | |||
vldmiagt r1!, {s16-s23} | |||
ssat r4, #16, r4 | |||
ssat r3, #16, r3 | |||
@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1 | |||
ssat r5, #16, r5 | |||
pkhbt r3, r3, r4, lsl #16 | |||
pkhbt r4, r5, r6, lsl #16 | |||
itttt gt | |||
vcvtgt.s32.f32 s0, s16 | |||
vcvtgt.s32.f32 s1, s17 | |||
vcvtgt.s32.f32 s2, s18 | |||
vcvtgt.s32.f32 s3, s19 | |||
itttt gt | |||
vcvtgt.s32.f32 s4, s20 | |||
vcvtgt.s32.f32 s5, s21 | |||
vcvtgt.s32.f32 s6, s22 | |||
@@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1 | |||
pld [r1] | |||
pld [r1, r2] | |||
muls r7, r4, r5 | |||
A muls r7, r4, r5 | |||
T mul r7, r4, r5 | |||
T cmp r7, #0 | |||
rsb r6, r7, r5, lsl #3 | |||
rsb ip, r7, r4, lsl #3 | |||
sub r4, r7, r4, lsl #3 | |||
@@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1 | |||
pld [r1] | |||
pld [r1, r2] | |||
muls r7, r4, r5 | |||
A muls r7, r4, r5 | |||
T mul r7, r4, r5 | |||
T cmp r7, #0 | |||
rsb r6, r7, r5, lsl #3 | |||
rsb ip, r7, r4, lsl #3 | |||
sub r4, r7, r4, lsl #3 | |||
@@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 | |||
pop {r4-r6, pc} | |||
2: | |||
.ifc \type,put | |||
ldrh r5, [r1], r2 | |||
strh r5, [r0], r2 | |||
ldrh r6, [r1], r2 | |||
strh r6, [r0], r2 | |||
ldrh_post r5, r1, r2 | |||
strh_post r5, r0, r2 | |||
ldrh_post r6, r1, r2 | |||
strh_post r6, r0, r2 | |||
.else | |||
vld1.16 {d16[0]}, [r1], r2 | |||
vld1.16 {d16[1]}, [r1], r2 | |||
@@ -404,28 +408,17 @@ endfunc | |||
ldr ip, [sp] | |||
tst r2, r2 | |||
ldr ip, [ip] | |||
it ne | |||
tstne r3, r3 | |||
vmov.32 d24[0], ip | |||
and ip, ip, ip, lsl #16 | |||
it eq | |||
bxeq lr | |||
ands ip, ip, ip, lsl #8 | |||
it lt | |||
bxlt lr | |||
.endm | |||
@ align_push_regs: save d8-d15 below the current stack pointer at a
@ 16-byte aligned address.
@ ip = (sp & 15) + 32 is subtracted from sp, which clears the low four
@ bits of sp and reserves 32 bytes for d12-d15; a further 32 bytes are
@ then reserved for d8-d11. Both stores carry the :128 alignment hint.
@ ip is left holding the first adjustment and is consumed by
@ align_pop_regs, so it has to survive until that macro runs.
.macro align_push_regs | |||
and ip, sp, #15 | |||
add ip, ip, #32 | |||
sub sp, sp, ip | |||
vst1.64 {d12-d15}, [sp,:128] | |||
sub sp, sp, #32 | |||
vst1.64 {d8-d11}, [sp,:128] | |||
.endm | |||
@ align_pop_regs: counterpart of align_push_regs.
@ Reloads d8-d11 with writeback (sp moves past the 32 bytes just read),
@ then reloads d12-d15 and post-increments sp by ip, undoing the
@ adjustment made in align_push_regs. ip must still hold the value
@ computed there.
.macro align_pop_regs | |||
vld1.64 {d8-d11}, [sp,:128]! | |||
vld1.64 {d12-d15}, [sp,:128], ip | |||
.endm | |||
.macro h264_loop_filter_luma | |||
vdup.8 q11, r2 @ alpha | |||
vmovl.u8 q12, d24 | |||
@@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 | |||
vld1.64 {d18,d19}, [r0,:128], r1 | |||
vld1.64 {d16,d17}, [r0,:128], r1 | |||
align_push_regs | |||
vpush {d8-d15} | |||
h264_loop_filter_luma | |||
@@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 | |||
vst1.64 {d0, d1}, [r0,:128], r1 | |||
vst1.64 {d10,d11}, [r0,:128] | |||
align_pop_regs | |||
vpop {d8-d15} | |||
bx lr | |||
endfunc | |||
@@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 | |||
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 | |||
align_push_regs | |||
vpush {d8-d15} | |||
h264_loop_filter_luma | |||
@@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 | |||
vst1.32 {d1[1]}, [r0], r1 | |||
vst1.32 {d11[1]}, [r0], r1 | |||
align_pop_regs | |||
vpop {d8-d15} | |||
bx lr | |||
endfunc | |||
@@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon | |||
vrhadd.u8 d11, d11, d7 | |||
sub r0, r0, r2, lsl #3 | |||
.endif | |||
vst1.64 {d12}, [r0,:64], r2 | |||
vst1.64 {d13}, [r0,:64], r2 | |||
vst1.64 {d14}, [r0,:64], r2 | |||
@@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |||
\type\()_h264_qpel8_mc11: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #64 | |||
mov r0, sp | |||
sub r1, r1, #2 | |||
@@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |||
mov ip, #8 | |||
vpush {d8-d15} | |||
bl put_h264_qpel8_h_lowpass_neon | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
mov r3, r2 | |||
add ip, sp, #64 | |||
sub r1, r1, r2, lsl #1 | |||
mov r2, #8 | |||
bl \type\()_h264_qpel8_v_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r11, pc} | |||
endfunc | |||
@@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |||
\type\()_h264_qpel8_mc21: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #(8*8+16*12) | |||
sub r1, r1, #2 | |||
mov r3, #8 | |||
@@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |||
vpush {d8-d15} | |||
bl put_h264_qpel8_h_lowpass_neon | |||
mov r4, r0 | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
sub r1, r1, r2, lsl #1 | |||
sub r1, r1, #2 | |||
mov r3, r2 | |||
sub r2, r4, #64 | |||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r4, r10, r11, pc} | |||
endfunc | |||
@@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |||
\type\()_h264_qpel8_mc12: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #(8*8+16*12) | |||
sub r1, r1, r2, lsl #1 | |||
mov r3, r2 | |||
@@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |||
vpush {d8-d15} | |||
bl put_h264_qpel8_v_lowpass_neon | |||
mov r4, r0 | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
sub r1, r1, r3, lsl #1 | |||
sub r1, r1, #2 | |||
sub r2, r4, #64 | |||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r4, r10, r11, pc} | |||
endfunc | |||
function ff_\type\()_h264_qpel8_mc22_neon, export=1 | |||
push {r4, r10, r11, lr} | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r4, r11, #15 | |||
T mov sp, r4 | |||
sub r1, r1, r2, lsl #1 | |||
sub r1, r1, #2 | |||
mov r3, r2 | |||
@@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1 | |||
\type\()_h264_qpel16_mc11: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #256 | |||
mov r0, sp | |||
sub r1, r1, #2 | |||
mov r3, #16 | |||
vpush {d8-d15} | |||
bl put_h264_qpel16_h_lowpass_neon | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
mov r3, r2 | |||
add ip, sp, #64 | |||
sub r1, r1, r2, lsl #1 | |||
mov r2, #16 | |||
bl \type\()_h264_qpel16_v_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r4, r11, pc} | |||
endfunc | |||
@@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1 | |||
\type\()_h264_qpel16_mc21: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #(16*16+16*12) | |||
sub r1, r1, #2 | |||
mov r0, sp | |||
vpush {d8-d15} | |||
bl put_h264_qpel16_h_lowpass_neon_packed | |||
mov r4, r0 | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
sub r1, r1, r2, lsl #1 | |||
sub r1, r1, #2 | |||
mov r3, r2 | |||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r4-r5, r9-r11, pc} | |||
endfunc | |||
@@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |||
\type\()_h264_qpel16_mc12: | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r0, r11, #15 | |||
T mov sp, r0 | |||
sub sp, sp, #(16*16+16*12) | |||
sub r1, r1, r2, lsl #1 | |||
mov r0, sp | |||
@@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |||
vpush {d8-d15} | |||
bl put_h264_qpel16_v_lowpass_neon_packed | |||
mov r4, r0 | |||
ldrd r0, [r11] | |||
ldrd r0, [r11], #8 | |||
sub r1, r1, r3, lsl #1 | |||
sub r1, r1, #2 | |||
mov r2, r3 | |||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |||
vpop {d8-d15} | |||
add sp, r11, #8 | |||
mov sp, r11 | |||
pop {r4-r5, r9-r11, pc} | |||
endfunc | |||
@@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1 | |||
push {r4, r9-r11, lr} | |||
lowpass_const r3 | |||
mov r11, sp | |||
bic sp, sp, #15 | |||
A bic sp, sp, #15 | |||
T bic r4, r11, #15 | |||
T mov sp, r4 | |||
sub r1, r1, r2, lsl #1 | |||
sub r1, r1, #2 | |||
mov r3, r2 | |||
@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1 | |||
blt 2f | |||
ldrsh lr, [r1] | |||
add r0, r0, r4 | |||
it ne | |||
movne lr, #0 | |||
cmp lr, #0 | |||
adrne lr, ff_h264_idct_dc_add_neon | |||
adreq lr, ff_h264_idct_add_neon | |||
ite ne | |||
adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
blx lr | |||
2: subs ip, ip, #1 | |||
add r1, r1, #32 | |||
@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1 | |||
add r0, r0, r4 | |||
cmp r8, #0 | |||
ldrsh r8, [r1] | |||
adrne lr, ff_h264_idct_add_neon | |||
adreq lr, ff_h264_idct_dc_add_neon | |||
iteet ne | |||
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
cmpeq r8, #0 | |||
blxne lr | |||
subs ip, ip, #1 | |||
@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1 | |||
add r1, r3, r12, lsl #5 | |||
cmp r8, #0 | |||
ldrsh r8, [r1] | |||
adrne lr, ff_h264_idct_add_neon | |||
adreq lr, ff_h264_idct_dc_add_neon | |||
iteet ne | |||
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
cmpeq r8, #0 | |||
blxne lr | |||
add r12, r12, #1 | |||
cmp r12, #4 | |||
itt eq | |||
moveq r12, #16 | |||
moveq r4, r9 | |||
cmp r12, #20 | |||
@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1 | |||
blt 2f | |||
ldrsh lr, [r1] | |||
add r0, r0, r4 | |||
it ne | |||
movne lr, #0 | |||
cmp lr, #0 | |||
adrne lr, ff_h264_idct8_dc_add_neon | |||
adreq lr, ff_h264_idct8_add_neon | |||
ite ne | |||
adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB | |||
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB | |||
blx lr | |||
2: subs r12, r12, #4 | |||
add r1, r1, #128 | |||
@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c) | |||
__asm__ ( | |||
"mov %0, %2 \n\t" | |||
"cmp %1, %2 \n\t" | |||
"itt gt \n\t" | |||
"movgt %0, %1 \n\t" | |||
"movgt %1, %2 \n\t" | |||
"cmp %1, %3 \n\t" | |||
"it le \n\t" | |||
"movle %1, %3 \n\t" | |||
"cmp %0, %1 \n\t" | |||
"it gt \n\t" | |||
"movgt %0, %1 \n\t" | |||
: "=&r"(m), "+r"(a) | |||
: "r"(b), "r"(c) | |||
@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1 | |||
vadd.f32 d17, d17, d3 @ in2u+in1d -I | |||
1: | |||
vmul.f32 d7, d0, d21 @ I*s | |||
ldr r10, [r3, lr, lsr #1] | |||
A ldr r10, [r3, lr, lsr #1] | |||
T lsr r10, lr, #1 | |||
T ldr r10, [r3, r10] | |||
vmul.f32 d6, d1, d20 @ -R*c | |||
ldr r6, [r3, #4]! | |||
vmul.f32 d4, d1, d21 @ -R*s | |||
@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 | |||
sum8 r8, r9, r1, r0, r10, r11, r12, lr | |||
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 | |||
round r10, r8, r9 | |||
strh r10, [r3], r4 | |||
strh_post r10, r3, r4 | |||
mov lr, #15 | |||
1: | |||
@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 | |||
round r10, r8, r9 | |||
adds r8, r8, r4 | |||
adc r9, r9, r7 | |||
strh r10, [r3], r12 | |||
strh_post r10, r3, r12 | |||
round r11, r8, r9 | |||
subs lr, lr, #1 | |||
strh r11, [r5], -r12 | |||
strh_dpost r11, r5, r12 | |||
bgt 1b | |||
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 | |||
@@ -38,15 +38,21 @@ | |||
/*
 * dequant_t dst, src, mul, add, tmp
 * Works on the top halfword of src.
 *   tmp = (src asr 16) - ip, setting the flags
 *   if the result is > 0:  tmp = add
 *   if the result is < 0:  tmp = -add
 *   if the result is != 0: dst = top16(src) * bottom16(mul) + tmp
 * When the result is zero dst is left unwritten. tmp is clobbered.
 * Every conditional instruction is preceded by its own IT instruction.
 * NOTE(review): ip is not set here; presumably the caller keeps it at
 * zero so the flags reflect the sign of the coefficient - confirm.
 */
.macro dequant_t dst, src, mul, add, tmp | |||
rsbs \tmp, ip, \src, asr #16 | |||
it gt | |||
addgt \tmp, \add, #0 | |||
it lt | |||
rsblt \tmp, \add, #0 | |||
it ne | |||
smlatbne \dst, \src, \mul, \tmp | |||
.endm | |||
/*
 * dequant_b dst, src, mul, add, tmp
 * Works on the bottom halfword of src.
 *   tmp = (src lsl 16) - ip, setting the flags; the shift moves the
 *         bottom halfword into the sign position
 *   if the result is > 0:  tmp = add
 *   if the result is < 0:  tmp = -add
 *   if the result is != 0: dst = bottom16(src) * bottom16(mul) + tmp
 * When the result is zero dst is left unwritten. tmp is clobbered.
 * Every conditional instruction is preceded by its own IT instruction.
 * NOTE(review): ip is not set here; presumably the caller keeps it at
 * zero so the flags reflect the sign of the coefficient - confirm.
 */
.macro dequant_b dst, src, mul, add, tmp | |||
rsbs \tmp, ip, \src, lsl #16 | |||
it gt | |||
addgt \tmp, \add, #0 | |||
it lt | |||
rsblt \tmp, \add, #0 | |||
it ne | |||
smlabbne \dst, \src, \mul, \tmp | |||
.endm | |||
@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1 | |||
strh lr, [r0], #2 | |||
subs r3, r3, #8 | |||
it gt | |||
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ | |||
bgt 1b | |||
adds r3, r3, #2 | |||
it le | |||
pople {r4-r9,pc} | |||
2: | |||
ldrsh r9, [r0, #0] | |||
ldrsh lr, [r0, #2] | |||
mov r8, r2 | |||
cmp r9, #0 | |||
it lt | |||
rsblt r8, r2, #0 | |||
it ne | |||
smlabbne r9, r9, r1, r8 | |||
mov r8, r2 | |||
cmp lr, #0 | |||
it lt | |||
rsblt r8, r2, #0 | |||
it ne | |||
smlabbne lr, lr, r1, r8 | |||
strh r9, [r0], #2 | |||
strh lr, [r0], #2 | |||
@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1 | |||
subs r3, r3, #16 | |||
vst1.16 {q0}, [r1,:128]! | |||
vst1.16 {q8}, [r1,:128]! | |||
it le | |||
bxle lr | |||
cmp r3, #8 | |||
bgt 1b | |||
@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1 | |||
ldr r6, [r0, #AC_PRED] | |||
add lr, r0, #INTER_SCANTAB_RASTER_END | |||
cmp r6, #0 | |||
it ne | |||
movne r12, #63 | |||
bne 1f | |||
ldr r12, [r12, r2, lsl #2] | |||
@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1 | |||
ldrsh r4, [r1] | |||
cmp r5, #0 | |||
mov r5, r1 | |||
it ne | |||
movne r2, #0 | |||
bne 2f | |||
cmp r2, #4 | |||
it ge | |||
addge r0, r0, #4 | |||
sub r2, r3, #1 | |||
ldr r6, [r0, #Y_DC_SCALE] | |||
@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1 | |||
vst1.32 {d22}, [r5,:64] | |||
cmp r6, #0 | |||
it eq | |||
popeq {r4-r8,pc} | |||
vmul.f32 d22, d22, d18 | |||
@@ -121,11 +121,13 @@ __b_evaluation: | |||
ldr r11, [r12, #offW7] @ R11=W7 | |||
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
teq r2, #0 @ if null avoid muls | |||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
teq r2, #0 @ if null avoid muls | |||
itttt ne | |||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
rsbne r2, r2, #0 @ R2=-ROWr16[3] | |||
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
it ne | |||
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], | |||
@@ -148,19 +150,23 @@ __b_evaluation: | |||
@@ MAC16(b3, -W1, row[7]); | |||
@@ MAC16(b1, -W5, row[7]); | |||
mov r3, r3, asr #16 @ R3=ROWr16[5] | |||
teq r3, #0 @ if null avoid muls | |||
teq r3, #0 @ if null avoid muls | |||
it ne | |||
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 | |||
mov r4, r4, asr #16 @ R4=ROWr16[7] | |||
itttt ne | |||
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 | |||
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 | |||
rsbne r3, r3, #0 @ R3=-ROWr16[5] | |||
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 | |||
@@ R3 is free now | |||
teq r4, #0 @ if null avoid muls | |||
teq r4, #0 @ if null avoid muls | |||
itttt ne | |||
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 | |||
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 | |||
rsbne r4, r4, #0 @ R4=-ROWr16[7] | |||
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 | |||
it ne | |||
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 | |||
@@ R4 is free now | |||
__end_b_evaluation: | |||
@@ -204,16 +210,19 @@ __a_evaluation: | |||
@@ a2 -= W4*row[4] | |||
@@ a3 += W4*row[4] | |||
ldrsh r11, [r14, #8] @ R11=ROWr16[4] | |||
teq r11, #0 @ if null avoid muls | |||
teq r11, #0 @ if null avoid muls | |||
it ne | |||
mulne r11, r9, r11 @ R11=W4*ROWr16[4] | |||
@@ R9 is free now | |||
ldrsh r9, [r14, #12] @ R9=ROWr16[6] | |||
itttt ne | |||
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | |||
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | |||
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | |||
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | |||
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead | |||
teq r9, #0 @ if null avoid muls | |||
teq r9, #0 @ if null avoid muls | |||
itttt ne | |||
mulne r11, r10, r9 @ R11=W6*ROWr16[6] | |||
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | |||
mulne r10, r8, r9 @ R10=W2*ROWr16[6] | |||
@@ -222,6 +231,7 @@ __a_evaluation: | |||
@@ a1 -= W2*row[6]; | |||
@@ a2 += W2*row[6]; | |||
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) | |||
itt ne | |||
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) | |||
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) | |||
@@ -323,10 +333,12 @@ __b_evaluation2: | |||
ldrsh r2, [r14, #48] | |||
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
teq r2, #0 @ if 0, then avoid muls | |||
itttt ne | |||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
rsbne r2, r2, #0 @ R2=-ROWr16[3] | |||
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
it ne | |||
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), | |||
@@ -342,18 +354,22 @@ __b_evaluation2: | |||
@@ MAC16(b1, -W5, col[7x8]); | |||
ldrsh r3, [r14, #80] @ R3=COLr16[5x8] | |||
teq r3, #0 @ if 0 then avoid muls | |||
itttt ne | |||
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 | |||
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 | |||
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 | |||
rsbne r3, r3, #0 @ R3=-ROWr16[5x8] | |||
ldrsh r4, [r14, #112] @ R4=COLr16[7x8] | |||
it ne | |||
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 | |||
@@ R3 is free now | |||
teq r4, #0 @ if 0 then avoid muls | |||
itttt ne | |||
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 | |||
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 | |||
rsbne r4, r4, #0 @ R4=-ROWr16[7x8] | |||
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 | |||
it ne | |||
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 | |||
@@ R4 is free now | |||
__end_b_evaluation2: | |||
@@ -390,15 +406,18 @@ __a_evaluation2: | |||
@@ a3 += W4*row[4] | |||
ldrsh r11, [r14, #64] @ R11=ROWr16[4] | |||
teq r11, #0 @ if null avoid muls | |||
itttt ne | |||
mulne r11, r9, r11 @ R11=W4*ROWr16[4] | |||
@@ R9 is free now | |||
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | |||
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | |||
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | |||
ldrsh r9, [r14, #96] @ R9=ROWr16[6] | |||
it ne | |||
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | |||
@@ W6 alone is no longer needed; keep W2*ROWr16[6] in its register (R10) instead | |||
teq r9, #0 @ if null avoid muls | |||
itttt ne | |||
mulne r11, r10, r9 @ R11=W6*ROWr16[6] | |||
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | |||
mulne r10, r8, r9 @ R10=W2*ROWr16[6] | |||
@@ -407,6 +426,7 @@ __a_evaluation2: | |||
@@ a1 -= W2*row[6]; | |||
@@ a2 += W2*row[6]; | |||
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) | |||
itt ne | |||
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) | |||
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) | |||
__end_a_evaluation2: | |||
@@ -49,6 +49,7 @@ function idct_row_armv5te | |||
ldrd v1, [a1, #8] | |||
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ | |||
orrs v1, v1, v2 | |||
itt eq | |||
cmpeq v1, a4 | |||
cmpeq v1, a3, lsr #16 | |||
beq row_dc_only | |||
@@ -269,6 +270,7 @@ function idct_col_armv5te | |||
ldmfd sp!, {a3, a4} | |||
adds a2, a3, v1 | |||
mov a2, a2, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
add ip, a4, v2 | |||
mov ip, ip, asr #20 | |||
@@ -276,6 +278,7 @@ function idct_col_armv5te | |||
str a2, [a1] | |||
subs a3, a3, v1 | |||
mov a2, a3, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
sub a4, a4, v2 | |||
mov a4, a4, asr #20 | |||
@@ -285,6 +288,7 @@ function idct_col_armv5te | |||
subs a2, a3, v3 | |||
mov a2, a2, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
sub ip, a4, v4 | |||
mov ip, ip, asr #20 | |||
@@ -292,6 +296,7 @@ function idct_col_armv5te | |||
str a2, [a1, #(16*1)] | |||
adds a3, a3, v3 | |||
mov a2, a3, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
add a4, a4, v4 | |||
mov a4, a4, asr #20 | |||
@@ -301,6 +306,7 @@ function idct_col_armv5te | |||
adds a2, a3, v5 | |||
mov a2, a2, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
add ip, a4, v6 | |||
mov ip, ip, asr #20 | |||
@@ -308,6 +314,7 @@ function idct_col_armv5te | |||
str a2, [a1, #(16*2)] | |||
subs a3, a3, v5 | |||
mov a2, a3, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
sub a4, a4, v6 | |||
mov a4, a4, asr #20 | |||
@@ -317,6 +324,7 @@ function idct_col_armv5te | |||
adds a2, a3, v7 | |||
mov a2, a2, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
add ip, a4, fp | |||
mov ip, ip, asr #20 | |||
@@ -324,6 +332,7 @@ function idct_col_armv5te | |||
str a2, [a1, #(16*3)] | |||
subs a3, a3, v7 | |||
mov a2, a3, lsr #20 | |||
it mi | |||
orrmi a2, a2, #0xf000 | |||
sub a4, a4, fp | |||
mov a4, a4, asr #20 | |||
@@ -335,15 +344,19 @@ endfunc | |||
/*
 * clip dst, src...
 *
 * Copy the flexible operand given in \src (a register, optionally with a
 * shift, e.g. "a2, asr #20") into \dst and clamp the result to 0..255.
 * Clobbers the condition flags (movs and cmp both set them).
 * Each conditional move is preceded by a matching "it" so the sequence is
 * also valid when the file is assembled as Thumb-2 under unified syntax.
 */
.macro clip dst, src:vararg | |||
movs \dst, \src | |||
/* negative result (N set by movs): clamp to the lower bound */
it mi | |||
movmi \dst, #0 | |||
/* signed compare against the upper bound */
cmp \dst, #255 | |||
it gt | |||
movgt \dst, #255 | |||
.endm | |||
/*
 * aclip dst, src...
 *
 * Add the operands given in \src (e.g. "a2, a3, asr #20", which expands to
 * "adds \dst, a2, a3, asr #20"), store the sum in \dst and clamp it to
 * 0..255.
 * Clobbers the condition flags (adds and cmp both set them).
 * Each conditional move is preceded by a matching "it" so the sequence is
 * also valid when the file is assembled as Thumb-2 under unified syntax.
 */
.macro aclip dst, src:vararg | |||
adds \dst, \src | |||
/* negative sum (N set by adds): clamp to the lower bound */
it mi | |||
movmi \dst, #0 | |||
/* signed compare against the upper bound */
cmp \dst, #255 | |||
it gt | |||
movgt \dst, #255 | |||
.endm | |||
@@ -370,35 +383,35 @@ function idct_col_put_armv5te | |||
orr a2, a3, a4, lsl #8 | |||
rsb v2, lr, lr, lsl #3 | |||
ldmfd sp!, {a3, a4} | |||
strh a2, [v2, v1]! | |||
strh_pre a2, v2, v1 | |||
sub a2, a3, v3 | |||
clip a2, a2, asr #20 | |||
sub ip, a4, v4 | |||
clip ip, ip, asr #20 | |||
orr a2, a2, ip, lsl #8 | |||
strh a2, [v1, lr]! | |||
strh_pre a2, v1, lr | |||
add a3, a3, v3 | |||
clip a2, a3, asr #20 | |||
add a4, a4, v4 | |||
clip a4, a4, asr #20 | |||
orr a2, a2, a4, lsl #8 | |||
ldmfd sp!, {a3, a4} | |||
strh a2, [v2, -lr]! | |||
strh_dpre a2, v2, lr | |||
add a2, a3, v5 | |||
clip a2, a2, asr #20 | |||
add ip, a4, v6 | |||
clip ip, ip, asr #20 | |||
orr a2, a2, ip, lsl #8 | |||
strh a2, [v1, lr]! | |||
strh_pre a2, v1, lr | |||
sub a3, a3, v5 | |||
clip a2, a3, asr #20 | |||
sub a4, a4, v6 | |||
clip a4, a4, asr #20 | |||
orr a2, a2, a4, lsl #8 | |||
ldmfd sp!, {a3, a4} | |||
strh a2, [v2, -lr]! | |||
strh_dpre a2, v2, lr | |||
add a2, a3, v7 | |||
clip a2, a2, asr #20 | |||
@@ -411,7 +424,7 @@ function idct_col_put_armv5te | |||
sub a4, a4, fp | |||
clip a4, a4, asr #20 | |||
orr a2, a2, a4, lsl #8 | |||
strh a2, [v2, -lr] | |||
strh_dpre a2, v2, lr | |||
ldr pc, [sp], #4 | |||
endfunc | |||
@@ -436,7 +449,7 @@ function idct_col_add_armv5te | |||
ldr v1, [sp, #32] | |||
sub a4, a4, v2 | |||
rsb v2, v1, v1, lsl #3 | |||
ldrh ip, [v2, lr]! | |||
ldrh_pre ip, v2, lr | |||
strh a2, [lr] | |||
and a2, ip, #255 | |||
aclip a3, a2, a3, asr #20 | |||
@@ -448,7 +461,7 @@ function idct_col_add_armv5te | |||
strh a2, [v2] | |||
ldmfd sp!, {a3, a4} | |||
ldrh ip, [lr, v1]! | |||
ldrh_pre ip, lr, v1 | |||
sub a2, a3, v3 | |||
add a3, a3, v3 | |||
and v3, ip, #255 | |||
@@ -458,7 +471,7 @@ function idct_col_add_armv5te | |||
aclip v3, v3, ip, lsr #8 | |||
orr a2, a2, v3, lsl #8 | |||
add a4, a4, v4 | |||
ldrh ip, [v2, -v1]! | |||
ldrh_dpre ip, v2, v1 | |||
strh a2, [lr] | |||
and a2, ip, #255 | |||
aclip a3, a2, a3, asr #20 | |||
@@ -468,7 +481,7 @@ function idct_col_add_armv5te | |||
strh a2, [v2] | |||
ldmfd sp!, {a3, a4} | |||
ldrh ip, [lr, v1]! | |||
ldrh_pre ip, lr, v1 | |||
add a2, a3, v5 | |||
sub a3, a3, v5 | |||
and v3, ip, #255 | |||
@@ -478,7 +491,7 @@ function idct_col_add_armv5te | |||
aclip v3, v3, ip, lsr #8 | |||
orr a2, a2, v3, lsl #8 | |||
sub a4, a4, v6 | |||
ldrh ip, [v2, -v1]! | |||
ldrh_dpre ip, v2, v1 | |||
strh a2, [lr] | |||
and a2, ip, #255 | |||
aclip a3, a2, a3, asr #20 | |||
@@ -488,7 +501,7 @@ function idct_col_add_armv5te | |||
strh a2, [v2] | |||
ldmfd sp!, {a3, a4} | |||
ldrh ip, [lr, v1]! | |||
ldrh_pre ip, lr, v1 | |||
add a2, a3, v7 | |||
sub a3, a3, v7 | |||
and v3, ip, #255 | |||
@@ -498,7 +511,7 @@ function idct_col_add_armv5te | |||
aclip v3, v3, ip, lsr #8 | |||
orr a2, a2, v3, lsl #8 | |||
sub a4, a4, fp | |||
ldrh ip, [v2, -v1]! | |||
ldrh_dpre ip, v2, v1 | |||
strh a2, [lr] | |||
and a2, ip, #255 | |||
aclip a3, a2, a3, asr #20 | |||
@@ -200,6 +200,7 @@ function idct_row_armv6 | |||
ldr r3, [r0, #8] /* r3 = row[3,1] */ | |||
ldr r2, [r0] /* r2 = row[2,0] */ | |||
orrs lr, lr, ip | |||
itt eq | |||
cmpeq lr, r3 | |||
cmpeq lr, r2, lsr #16 | |||
beq 1f | |||
@@ -282,14 +283,14 @@ function idct_col_put_armv6 | |||
pop {r1, r2} | |||
idct_finish_shift_sat COL_SHIFT | |||
strb r4, [r1], r2 | |||
strb r5, [r1], r2 | |||
strb r6, [r1], r2 | |||
strb r7, [r1], r2 | |||
strb r11,[r1], r2 | |||
strb r10,[r1], r2 | |||
strb r9, [r1], r2 | |||
strb r8, [r1], r2 | |||
strb_post r4, r1, r2 | |||
strb_post r5, r1, r2 | |||
strb_post r6, r1, r2 | |||
strb_post r7, r1, r2 | |||
strb_post r11,r1, r2 | |||
strb_post r10,r1, r2 | |||
strb_post r9, r1, r2 | |||
strb_post r8, r1, r2 | |||
sub r1, r1, r2, lsl #3 | |||
@@ -318,16 +319,16 @@ function idct_col_add_armv6 | |||
add ip, r3, ip, asr #COL_SHIFT | |||
usat ip, #8, ip | |||
add r4, r7, r4, asr #COL_SHIFT | |||
strb ip, [r1], r2 | |||
strb_post ip, r1, r2 | |||
ldrb ip, [r1, r2] | |||
usat r4, #8, r4 | |||
ldrb r11,[r1, r2, lsl #2] | |||
add r5, ip, r5, asr #COL_SHIFT | |||
usat r5, #8, r5 | |||
strb r4, [r1], r2 | |||
strb_post r4, r1, r2 | |||
ldrb r3, [r1, r2] | |||
ldrb ip, [r1, r2, lsl #2] | |||
strb r5, [r1], r2 | |||
strb_post r5, r1, r2 | |||
ldrb r7, [r1, r2] | |||
ldrb r4, [r1, r2, lsl #2] | |||
add r6, r3, r6, asr #COL_SHIFT | |||
@@ -340,11 +341,11 @@ function idct_col_add_armv6 | |||
usat r8, #8, r8 | |||
add lr, r4, lr, asr #COL_SHIFT | |||
usat lr, #8, lr | |||
strb r6, [r1], r2 | |||
strb r10,[r1], r2 | |||
strb r9, [r1], r2 | |||
strb r8, [r1], r2 | |||
strb lr, [r1], r2 | |||
strb_post r6, r1, r2 | |||
strb_post r10,r1, r2 | |||
strb_post r9, r1, r2 | |||
strb_post r8, r1, r2 | |||
strb_post lr, r1, r2 | |||
sub r1, r1, r2, lsl #3 | |||
@@ -71,7 +71,7 @@ function idct_row4_pld_neon | |||
add r3, r0, r1, lsl #2 | |||
pld [r0, r1] | |||
pld [r0, r1, lsl #1] | |||
pld [r3, -r1] | |||
A pld [r3, -r1] | |||
pld [r3] | |||
pld [r3, r1] | |||
add r3, r3, r1, lsl #1 | |||
@@ -164,6 +164,7 @@ function idct_col4_neon | |||
orrs r4, r4, r5 | |||
idct_col4_top | |||
it eq | |||
addeq r2, r2, #16 | |||
beq 1f | |||
@@ -176,6 +177,7 @@ function idct_col4_neon | |||
1: orrs r6, r6, r7 | |||
ldrd r4, [r2, #16] | |||
it eq | |||
addeq r2, r2, #16 | |||
beq 2f | |||
@@ -187,6 +189,7 @@ function idct_col4_neon | |||
2: orrs r4, r4, r5 | |||
ldrd r4, [r2, #16] | |||
it eq | |||
addeq r2, r2, #16 | |||
beq 3f | |||
@@ -199,6 +202,7 @@ function idct_col4_neon | |||
vadd.i32 q13, q13, q8 | |||
3: orrs r4, r4, r5 | |||
it eq | |||
addeq r2, r2, #16 | |||
beq 4f | |||
@@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale | |||
vst1.32 {q9}, [r2,:128] | |||
subs r1, r1, #1 | |||
it eq | |||
popeq {r4-r11,pc} | |||
cmp r4, #0 | |||
itt eq | |||
subeq r8, r8, #512*4 | |||
subeq r9, r9, #512*4 | |||
sub r5, r5, #512*4 | |||
@@ -21,6 +21,14 @@ | |||
#ifndef AVCODEC_ARM_VP56_ARITH_H | |||
#define AVCODEC_ARM_VP56_ARITH_H | |||
/*
 * Mode selectors for the inline asm strings below:
 *   A(x) expands to x only when CONFIG_THUMB is not set (ARM encoding),
 *   T(x) expands to x only when CONFIG_THUMB is set (Thumb encoding).
 * In the other mode each expands to nothing, so the wrapped instruction
 * string simply disappears from the asm template.
 */
#if CONFIG_THUMB | |||
# define A(x) | |||
# define T(x) x | |||
#else | |||
# define A(x) x | |||
# define T(x) | |||
#endif | |||
#if HAVE_ARMV6 && HAVE_INLINE_ASM | |||
#define vp56_rac_get_prob vp56_rac_get_prob_armv6 | |||
@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) | |||
unsigned bit; | |||
__asm__ ("adds %3, %3, %0 \n" | |||
"itt cs \n" | |||
"cmpcs %7, %4 \n" | |||
"ldrcsh %2, [%4], #2 \n" | |||
A("ldrcsh %2, [%4], #2 \n") | |||
T("ldrhcs %2, [%4], #2 \n") | |||
"rsb %0, %6, #256 \n" | |||
"smlabb %0, %5, %6, %0 \n" | |||
T("itttt cs \n") | |||
"rev16cs %2, %2 \n" | |||
"orrcs %1, %1, %2, lsl %3 \n" | |||
T("lslcs %2, %2, %3 \n") | |||
T("orrcs %1, %1, %2 \n") | |||
A("orrcs %1, %1, %2, lsl %3 \n") | |||
"subcs %3, %3, #16 \n" | |||
"lsr %0, %0, #8 \n" | |||
"cmp %1, %0, lsl #16 \n" | |||
"ittte ge \n" | |||
"subge %1, %1, %0, lsl #16 \n" | |||
"subge %0, %5, %0 \n" | |||
"movge %2, #1 \n" | |||
@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) | |||
unsigned tmp; | |||
__asm__ ("adds %3, %3, %0 \n" | |||
"itt cs \n" | |||
"cmpcs %7, %4 \n" | |||
"ldrcsh %2, [%4], #2 \n" | |||
A("ldrcsh %2, [%4], #2 \n") | |||
T("ldrhcs %2, [%4], #2 \n") | |||
"rsb %0, %6, #256 \n" | |||
"smlabb %0, %5, %6, %0 \n" | |||
T("itttt cs \n") | |||
"rev16cs %2, %2 \n" | |||
"orrcs %1, %1, %2, lsl %3 \n" | |||
T("lslcs %2, %2, %3 \n") | |||
T("orrcs %1, %1, %2 \n") | |||
A("orrcs %1, %1, %2, lsl %3 \n") | |||
"subcs %3, %3, #16 \n" | |||
"lsr %0, %0, #8 \n" | |||
"lsl %2, %0, #16 \n" | |||
@@ -25,13 +25,18 @@ | |||
lsl \cw, \cw, \t0 | |||
lsl \t0, \h, \t0 | |||
rsb \h, \pr, #256 | |||
it cs | |||
ldrhcs \t1, [\buf], #2 | |||
smlabb \h, \t0, \pr, \h | |||
T itttt cs | |||
rev16cs \t1, \t1 | |||
orrcs \cw, \cw, \t1, lsl \bs | |||
A orrcs \cw, \cw, \t1, lsl \bs | |||
T lslcs \t1, \t1, \bs | |||
T orrcs \cw, \cw, \t1 | |||
subcs \bs, \bs, #16 | |||
lsr \h, \h, #8 | |||
cmp \cw, \h, lsl #16 | |||
itt ge | |||
subge \cw, \cw, \h, lsl #16 | |||
subge \h, \t0, \h | |||
.endm | |||
@@ -40,14 +45,20 @@ | |||
adds \bs, \bs, \t0 | |||
lsl \cw, \cw, \t0 | |||
lsl \t0, \h, \t0 | |||
it cs | |||
ldrhcs \t1, [\buf], #2 | |||
mov \h, #128 | |||
it cs | |||
rev16cs \t1, \t1 | |||
add \h, \h, \t0, lsl #7 | |||
orrcs \cw, \cw, \t1, lsl \bs | |||
A orrcs \cw, \cw, \t1, lsl \bs | |||
T ittt cs | |||
T lslcs \t1, \t1, \bs | |||
T orrcs \cw, \cw, \t1 | |||
subcs \bs, \bs, #16 | |||
lsr \h, \h, #8 | |||
cmp \cw, \h, lsl #16 | |||
itt ge | |||
subge \cw, \cw, \h, lsl #16 | |||
subge \h, \t0, \h | |||
.endm | |||
@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
cmp r3, #0 | |||
ldr r11, [r5] | |||
ldm r0, {r5-r7} @ high, bits, buf | |||
it ne | |||
pkhtbne r11, r11, r11, asr #16 | |||
ldr r8, [r0, #16] @ code_word | |||
0: | |||
@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
adds r6, r6, r9 | |||
add r4, r4, #11 | |||
lsl r8, r8, r9 | |||
it cs | |||
ldrhcs r10, [r7], #2 | |||
lsl r9, r5, r9 | |||
mov r5, #128 | |||
it cs | |||
rev16cs r10, r10 | |||
add r5, r5, r9, lsl #7 | |||
orrcs r8, r8, r10, lsl r6 | |||
T ittt cs | |||
T lslcs r10, r10, r6 | |||
T orrcs r8, r8, r10 | |||
A orrcs r8, r8, r10, lsl r6 | |||
subcs r6, r6, #16 | |||
lsr r5, r5, #8 | |||
cmp r8, r5, lsl #16 | |||
movrel r10, zigzag_scan-1 | |||
itt ge | |||
subge r8, r8, r5, lsl #16 | |||
subge r5, r9, r5 | |||
ldrb r10, [r10, r3] | |||
it ge | |||
rsbge r12, r12, #0 | |||
cmp r3, #16 | |||
strh r12, [r1, r10] | |||
@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
ldr r0, [sp] | |||
ldr r9, [r0, #12] | |||
cmp r7, r9 | |||
it hi | |||
movhi r7, r9 | |||
stm r0, {r5-r7} @ high, bits, buf | |||
str r8, [r0, #16] @ code_word | |||
@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
mov r12, #2 | |||
ldrb r0, [r4, #4] | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r12, #1 | |||
ldrb r9, [lr, r5] | |||
blt 4f | |||
ldrb r0, [r4, #5] | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r12, #1 | |||
ldrb r9, [lr, r5] | |||
b 4f | |||
@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
mov r12, #5 | |||
mov r0, #159 | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r12, r12, #1 | |||
ldrb r9, [lr, r5] | |||
b 4f | |||
@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
mov r12, #7 | |||
mov r0, #165 | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r12, r12, #2 | |||
ldrb r9, [lr, r5] | |||
mov r0, #145 | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r12, r12, #1 | |||
ldrb r9, [lr, r5] | |||
b 4f | |||
3: | |||
ldrb r0, [r4, #8] | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
it ge | |||
addge r4, r4, #1 | |||
ldrb r9, [lr, r5] | |||
ite ge | |||
movge r12, #2 | |||
movlt r12, #0 | |||
ldrb r0, [r4, #9] | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
mov r9, #8 | |||
it ge | |||
addge r12, r12, #1 | |||
movrel r4, X(ff_vp8_dct_cat_prob) | |||
lsl r9, r9, r12 | |||
@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
lsl r1, r1, #1 | |||
rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
ldrb r0, [r4], #1 | |||
it ge | |||
addge r1, r1, #1 | |||
cmp r0, #0 | |||
bne 1b | |||
@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
add r4, r2, r4 | |||
add r4, r4, #22 | |||
rac_get_128 r5, r6, r7, r8, r9, r10 | |||
it ge | |||
rsbge r12, r12, #0 | |||
smulbb r12, r12, r11 | |||
movrel r9, zigzag_scan-1 | |||
@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1 | |||
push {r4-r6,lr} | |||
1: | |||
subs r12, r12, #4 | |||
ldr r4, [r2], r3 | |||
ldr r5, [r2], r3 | |||
ldr r6, [r2], r3 | |||
ldr lr, [r2], r3 | |||
str r4, [r0], r1 | |||
str r5, [r0], r1 | |||
str r6, [r0], r1 | |||
str lr, [r0], r1 | |||
ldr_post r4, r2, r3 | |||
ldr_post r5, r2, r3 | |||
ldr_post r6, r2, r3 | |||
ldr_post lr, r2, r3 | |||
str_post r4, r0, r1 | |||
str_post r5, r0, r1 | |||
str_post r6, r0, r1 | |||
str_post lr, r0, r1 | |||
bgt 1b | |||
pop {r4-r6,pc} | |||
endfunc | |||
@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b) | |||
int r; | |||
__asm__ ("cmp %2, #2 \n\t" | |||
"ldr %0, [%3, %2, lsl #2] \n\t" | |||
"ite le \n\t" | |||
"lsrle %0, %1, #1 \n\t" | |||
"smmulgt %0, %0, %1 \n\t" | |||
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); | |||
@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a) | |||
{ | |||
int x, y; | |||
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t" | |||
"itet ne \n\t" | |||
"mvnne %1, #1<<31 \n\t" | |||
"moveq %0, %Q2 \n\t" | |||
"eorne %0, %1, %R2, asr #31 \n\t" | |||