Browse Source

rv40dsp x86: use only one register, for both increment and loop counter

Around 10 cycles faster for luma.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
tags/n0.11
Christophe GISQUET Ronald S. Bultje 13 years ago
parent
commit
2130bd8f5b
1 changed file with 20 additions and 23 deletions
  1. +20
    -23
      libavcodec/x86/rv40dsp.asm

+ 20
- 23
libavcodec/x86/rv40dsp.asm View File

@@ -32,13 +32,14 @@ SECTION .text


; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2 ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
%macro RV40_WCORE 4-5 %macro RV40_WCORE 4-5
movh m4, [%3 + 0]
movh m5, [%4 + 0]
movh m4, [%3 + r6 + 0]
movh m5, [%4 + r6 + 0]
%if %0 == 4 %if %0 == 4
%define OFFSET mmsize / 2
%define OFFSET r6 + mmsize / 2
%else %else
; 8x8 block and sse2, stride was provided ; 8x8 block and sse2, stride was provided
%define OFFSET %5
%define OFFSET r6
add r6, r5
%endif %endif
movh m6, [%3 + OFFSET] movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET] movh m7, [%4 + OFFSET]
@@ -99,10 +100,12 @@ SECTION .text
packuswb m4, m6 packuswb m4, m6
%if %0 == 5 %if %0 == 5
; Only called for 8x8 blocks and sse2 ; Only called for 8x8 blocks and sse2
movh [%2 + 0], m4
movhps [%2 + %5], m4
sub r6, r5
movh [%2 + r6], m4
add r6, r5
movhps [%2 + r6], m4
%else %else
mova [%2], m4
mova [%2 + r6], m4
%endif %endif
%endmacro %endmacro


@@ -115,26 +118,19 @@ SECTION .text
%endif %endif


; Prepare for next loop ; Prepare for next loop
add r0, r5
add r1, r5
add r2, r5
add r6, r5
%else %else
%ifidn %1, 8 %ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5 RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines ; Prepare 2 next lines
lea r0, [r0 + 2 * r5]
lea r1, [r1 + 2 * r5]
lea r2, [r2 + 2 * r5]
add r6, r5
%else %else
RV40_WCORE %2, r0, r1, r2 RV40_WCORE %2, r0, r1, r2
; Prepare single next line ; Prepare single next line
add r0, r5
add r1, r5
add r2, r5
add r6, r5
%endif %endif
%endif %endif


dec r6
%endmacro %endmacro


; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
@@ -145,7 +141,7 @@ SECTION .text
; Therefore, we check here whether they are multiples of 2^9 for ; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur. ; those simplifications to occur.
%macro RV40_WEIGHT 3 %macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, %3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3) %if cpuflag(ssse3)
mova m1, [shift_round] mova m1, [shift_round]
%else %else
@@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
%endif %endif
pxor m0, m0 pxor m0, m0
; Set loop counter and increments ; Set loop counter and increments
%if mmsize == 8
mov r6, %2
%else
mov r6, (%2 * %2) / mmsize
%endif
mov r6, r5
shl r6, %3
add r0, r6
add r1, r6
add r2, r6
neg r6


movd m2, r3 movd m2, r3
movd m3, r4 movd m3, r4


Loading…
Cancel
Save