Browse Source

pngdsp x86: use unaligned access

Timing results for test images manually generated to contain only up
prediction:
         8380x3032    255x185
before:   138635       1992
after:    139232       1996

When instead jumping to the aligned or unaligned version depending on the alignment:
8380x3032: 138767

A 0.5% speed improvement for gigantic images is not worth the code
duplication.

Fixes ticket #4148

Signed-off-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Tested-by: Benoit Fouet <benoit.fouet@free.fr>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.5
Christophe Gisquet Michael Niedermayer 11 years ago
parent
commit
9fa056ba75
2 changed files with 8 additions and 8 deletions
  1. +2
    -2
      libavcodec/pngdsp.h
  2. +6
    -6
      libavcodec/x86/pngdsp.asm

+ 2
- 2
libavcodec/pngdsp.h View File

@@ -25,9 +25,9 @@
#include <stdint.h>

typedef struct PNGDSPContext {
void (*add_bytes_l2)(uint8_t *dst /* align 16 */,
void (*add_bytes_l2)(uint8_t *dst,
uint8_t *src1 /* align 16 */,
uint8_t *src2 /* align 16 */, int w);
uint8_t *src2, int w);

/* this might write to dst[w] */
void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,


+ 6
- 6
libavcodec/x86/pngdsp.asm View File

@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
mova m0, [src1q+iq]
mova m1, [src1q+iq+mmsize]
paddb m0, [src2q+iq]
paddb m1, [src2q+iq+mmsize]
mova [dstq+iq ], m0
mova [dstq+iq+mmsize], m1
movu m0, [src2q+iq]
movu m1, [src2q+iq+mmsize]
paddb m0, [src1q+iq]
paddb m1, [src1q+iq+mmsize]
movu [dstq+iq ], m0
movu [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq


Loading…
Cancel
Save