Browse Source

rv40: NEON optimised loop filter strength selection

Signed-off-by: Mans Rullgard <mans@mansr.com>
tags/n0.10
Mans Rullgard 14 years ago
parent
commit
71ce76027d
3 changed files with 102 additions and 0 deletions
  1. +6
    -0
      libavcodec/arm/asm.S
  2. +10
    -0
      libavcodec/arm/rv40dsp_init_neon.c
  3. +86
    -0
      libavcodec/arm/rv40dsp_neon.S

+ 6
- 0
libavcodec/arm/asm.S View File

@@ -113,6 +113,12 @@ T add \rn, \rn, \rm
T ldr \rt, [\rn]
.endm

@ ldr_dpre rt, rn, rm
@ Load with register pre-decrement: subtract rm from rn, write the result
@ back to rn, then load rt from the updated address.
@ Two encodings are provided: a single pre-indexed load with writeback on
@ the line prefixed A, and an explicit sub followed by a plain load on the
@ lines prefixed T.
@ NOTE(review): A and T are defined elsewhere in this file; presumably they
@ select the ARM-mode and Thumb-mode variants respectively - confirm.
@ rm is declared vararg, so it receives the whole remainder of the
@ argument list, commas included.
.macro ldr_dpre rt, rn, rm:vararg
A ldr \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T ldr \rt, [\rn]
.endm

.macro ldr_post rt, rn, rm:vararg
A ldr \rt, [\rn], \rm
T ldr \rt, [\rn]


+ 10
- 0
libavcodec/arm/rv40dsp_init_neon.c View File

@@ -54,6 +54,13 @@ void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);
void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);

/*
 * NEON loop filter strength selection, implemented in rv40dsp_neon.S.
 *
 * Both functions store 0 or 1 through p1 and q1 (result of comparing
 * pixel-sum differences on either side of the edge against beta << 2).
 * They return 0 when edge is 0; otherwise they return a nonzero value only
 * if both of those tests and two further tests against beta2 pass.
 * Note that the nonzero return value is 0xffff, not 1, so callers should
 * test it for truth rather than compare it with 1.
 *
 * The "h" variant samples six rows of four bytes around src (rows are
 * stride bytes apart); the "v" variant samples eight bytes starting at
 * src - 3 on each of four consecutive rows.
 */
int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride,
int beta, int beta2, int edge,
int *p1, int *q1);
int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride,
int beta, int beta2, int edge,
int *p1, int *q1);

void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
@@ -116,4 +123,7 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)

c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;

c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
}

+ 86
- 0
libavcodec/arm/rv40dsp_neon.S View File

@@ -722,3 +722,89 @@ function ff_rv40_weight_func_8_neon, export=1
bne 1b
bx lr
endfunc

@ int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@
@ Argument locations, assuming the standard ARM calling convention for the
@ C prototype: r0 = src, r1 = stride, r2 = beta, r3 = beta2,
@ [sp] = edge, [sp, #4] = p1, [sp, #8] = q1.
@
@ Reads six rows of four bytes (rows -3 .. 2 relative to src, stride bytes
@ apart) and forms the sum of the four bytes of each row. With sum(n)
@ denoting the sum for row n, all compares done on unsigned 16-bit values:
@   *p1 = |sum(-1) - sum(-2)| < (beta << 2)     (stored as 0 or 1)
@   *q1 = |sum(0)  - sum(1)|  < (beta << 2)     (stored as 0 or 1)
@ Return value:
@   0       if edge == 0
@   0xffff  if both tests above pass and also
@           |sum(-3) - sum(-2)| < beta2 and |sum(2) - sum(1)| < beta2
@   0       otherwise
@ Shortcut: when the four bytes at src equal the four bytes one row above,
@ *p1 and *q1 are set to 0 and 0 is returned without any further test.
@
@ The :32 qualifiers on the NEON loads and stores assert 32-bit alignment
@ of every row address and of p1 and q1.
@ Writes r0-r3, r12, flags, q0, q2, q8, q9, q12 and d30 only.
function ff_rv40_h_loop_filter_strength_neon, export=1
@ r2 = beta2 in the low halfword, (beta << 2) in the high halfword
pkhbt r2, r3, r2, lsl #18

@ Early out when row 0 and row -1 hold identical 32-bit words.
@ ldr_dpre also leaves r0 pointing at row -1.
ldr r3, [r0]
ldr_dpre r12, r0, r1
teq r3, r12
beq 1f

@ r0: row -1 -> row -3
sub r0, r0, r1, lsl #1

@ Gather the rows so that d0/d1 hold the reference rows (-2 and 1) and
@ d4/d5 hold the rows they are compared with. The trailing number on each
@ line is the row being loaded.
vld1.32 {d4[]}, [r0,:32], r1 @ -3
vld1.32 {d0[]}, [r0,:32], r1 @ -2
vld1.32 {d4[1]}, [r0,:32], r1 @ -1
vld1.32 {d5[]}, [r0,:32], r1 @ 0
vld1.32 {d1[]}, [r0,:32], r1 @ 1
vld1.32 {d5[0]}, [r0,:32], r1 @ 2

@ Two rounds of pairwise adds reduce each 4-byte row to one 16-bit sum.
@ After vclt the 16-bit lanes of d16 are all-ones/zero masks for:
@   lane 0: |sum(-3) - sum(-2)| < beta2
@   lane 1: |sum(-1) - sum(-2)| < beta << 2
@   lane 2: |sum(2)  - sum(1)|  < beta2
@   lane 3: |sum(0)  - sum(1)|  < beta << 2
vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0
vdup.32 d30, r2 @ beta2, beta << 2
vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
vabd.u16 d16, d18, d16
vclt.u16 d16, d16, d30

@ Widen the masks to 32 bits and shift 0xffff down to 1 to get the 0/1
@ values for *p1 (mask lane 1) and *q1 (mask lane 3). Scalar loads of the
@ stack arguments are interleaved with the NEON work.
@ vtrn.16 moves mask lanes 1 and 3 into lanes 0 and 2 of d17; the other
@ lanes of d17 are leftovers from the first vpaddl and are never used.
ldrd r2, r3, [sp, #4]
vmovl.u16 q12, d16
vtrn.16 d16, d17
vshr.u32 q12, q12, #15
ldr r0, [sp]
vst1.32 {d24[1]}, [r2,:32]
vst1.32 {d25[1]}, [r3,:32]

@ edge == 0: return 0 (r0 already holds edge)
cmp r0, #0
it eq
bxeq lr

@ AND all four masks together into 16-bit lane 0 of d18 and return it:
@ first vand pairs each beta2 test with its beta << 2 test, vtrn.32 brings
@ the q-side result next to the p-side one, second vand combines them.
vand d18, d16, d17
vtrn.32 d18, d19
vand d18, d18, d19
vmov.u16 r0, d18[0]
bx lr
@ Early-out path: *p1 = *q1 = 0, return 0.
1:
ldrd r2, r3, [sp, #4]
mov r0, #0
str r0, [r2]
str r0, [r3]
bx lr
endfunc

@ int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@
@ Argument locations, assuming the standard ARM calling convention for the
@ C prototype: r0 = src, r1 = stride, r2 = beta, r3 = beta2,
@ [sp] = edge, [sp, #4] = p1, [sp, #8] = q1.
@
@ Reads eight bytes starting at src - 3 on each of four consecutive rows
@ (stride bytes apart) and sums the four rows per column. With col(n)
@ denoting the sum for the column at byte offset n from src, all compares
@ done on unsigned 16-bit values:
@   *p1 = |col(-1) - col(-2)| < (beta << 2)     (stored as 0 or 1)
@   *q1 = |col(1)  - col(0)|  < (beta << 2)     (stored as 0 or 1)
@ Return value:
@   0       if edge == 0
@   0xffff  if both tests above pass and also
@           |col(-2) - col(-3)| < beta2 and |col(2) - col(1)| < beta2
@   0       otherwise
@
@ The pixel loads carry no alignment qualifier; the :32 qualifiers on the
@ stores assert 32-bit alignment of p1 and q1.
@ Writes r0, r2, r3, flags, q0, q1 and q15 only.
function ff_rv40_v_loop_filter_strength_neon, export=1
@ Start three bytes before src so lane 0 of each load is column -3.
sub r0, r0, #3
@ r2 = beta2 in the low halfword, (beta << 2) in the high halfword
pkhbt r2, r3, r2, lsl #18

@ One 8-byte load per row, four rows.
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d3}, [r0], r1

@ Column sums as 16-bit lanes in q0, then absolute difference between each
@ column and its right-hand neighbour (vext rotates q0 by one lane).
@ After vclt the 16-bit lanes of q0 are all-ones/zero masks; those used are:
@   lane 0: |col(-2) - col(-3)| < beta2
@   lane 1: |col(-1) - col(-2)| < beta << 2
@   lane 3: |col(1)  - col(0)|  < beta << 2
@   lane 4: |col(2)  - col(1)|  < beta2
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vdup.32 q15, r2
vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
vabd.u16 q0, q1, q0
vclt.u16 q0, q0, q15

@ Widen mask lanes 0-3 to 32 bits and shift 0xffff down to 1 to get the
@ 0/1 values for *p1 (mask lane 1) and *q1 (mask lane 3). Scalar loads of
@ the stack arguments are interleaved with the NEON work.
@ vext.16 lines mask lanes 3 and 4 up with lanes 0 and 1 of d0.
ldrd r2, r3, [sp, #4]
vmovl.u16 q1, d0
vext.16 d1, d0, d1, #3
vshr.u32 q1, q1, #15
ldr r0, [sp]
vst1.32 {d2[1]}, [r2,:32]
vst1.32 {d3[1]}, [r3,:32]

@ edge == 0: return 0 (r0 already holds edge)
cmp r0, #0
it eq
bxeq lr

@ AND the four masks together into 16-bit lane 0 of d0 and return it:
@ first vand gives lane0 & lane3 and lane1 & lane4, vtrn.16 moves the
@ second product into lane 0 of d1, second vand combines both.
vand d0, d0, d1
vtrn.16 d0, d1
vand d0, d0, d1
vmov.u16 r0, d0[0]
bx lr
endfunc

Loading…
Cancel
Save