While playing with some new hardware, I found that it is running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer
Unfortunately it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us with ARM CPUs).
The patches I've attached are the patches from atty.jp with a couple of modifications of my own:
- ported to current CVS
- reverted their removal of SNOW support from ffmpeg
- cleaned up their bswap mess
- removed DOS-style line breaks from various files
Patch by Bernhard Rosenkraenzer (bero, arklinux org).
Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.5)
@@ -316,8 +316,11 @@ endif | |||||
# armv4l specific stuff | # armv4l specific stuff | ||||
ifeq ($(TARGET_ARCH_ARMV4L),yes) | ifeq ($(TARGET_ARCH_ARMV4L),yes) | ||||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o | |||||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o | |||||
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o | OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o | ||||
ifeq ($(TARGET_IWMMXT),yes) | |||||
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o | |||||
endif | |||||
endif | endif | ||||
# sun mediaLib specific stuff | # sun mediaLib specific stuff | ||||
@@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o | |||||
CFLAGS += $(MLIB_INC) | CFLAGS += $(MLIB_INC) | ||||
endif | endif | ||||
# Intel IPP specific stuff | |||||
# currently only works when libavcodec is used in mplayer | |||||
ifeq ($(HAVE_IPP),yes) | |||||
CFLAGS += $(IPP_INC) | |||||
endif | |||||
# alpha specific stuff | # alpha specific stuff | ||||
ifeq ($(TARGET_ARCH_ALPHA),yes) | ifeq ($(TARGET_ARCH_ALPHA),yes) | ||||
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \ | OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \ | ||||
@@ -18,6 +18,13 @@ | |||||
*/ | */ | ||||
#include "../dsputil.h" | #include "../dsputil.h" | ||||
#ifdef HAVE_IPP | |||||
#include "ipp.h" | |||||
#endif | |||||
#ifdef HAVE_IWMMXT | |||||
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); | |||||
#endif | |||||
extern void j_rev_dct_ARM(DCTELEM *data); | extern void j_rev_dct_ARM(DCTELEM *data); | ||||
extern void simple_idct_ARM(DCTELEM *data); | extern void simple_idct_ARM(DCTELEM *data); | ||||
@@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data); | |||||
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | ||||
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | ||||
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||||
/*
 * 16-pixel-wide variant of put_pixels8_x2_arm: the 8-wide routine is run
 * once on the left half and once on the right half (byte offset 8) of
 * both the destination and the source.
 */
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_pixels8_x2_arm(block, pixels, line_size, h);
    put_pixels8_x2_arm(right_dst, right_src, line_size, h);
}
/*
 * 16-pixel-wide variant of put_pixels8_y2_arm: the 8-wide routine is run
 * once on the left half and once on the right half (byte offset 8) of
 * both the destination and the source.
 */
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_pixels8_y2_arm(block, pixels, line_size, h);
    put_pixels8_y2_arm(right_dst, right_src, line_size, h);
}
/*
 * 16-pixel-wide variant of put_pixels8_xy2_arm: the 8-wide routine is run
 * once on the left half and once on the right half (byte offset 8) of
 * both the destination and the source.
 */
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_pixels8_xy2_arm(block, pixels, line_size, h);
    put_pixels8_xy2_arm(right_dst, right_src, line_size, h);
}
/*
 * 16-pixel-wide variant of put_no_rnd_pixels8_x2_arm: the 8-wide routine
 * is run once on the left half and once on the right half (byte offset 8)
 * of both the destination and the source.
 */
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_no_rnd_pixels8_x2_arm(block, pixels, line_size, h);
    put_no_rnd_pixels8_x2_arm(right_dst, right_src, line_size, h);
}
/*
 * 16-pixel-wide variant of put_no_rnd_pixels8_y2_arm: the 8-wide routine
 * is run once on the left half and once on the right half (byte offset 8)
 * of both the destination and the source.
 */
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_no_rnd_pixels8_y2_arm(block, pixels, line_size, h);
    put_no_rnd_pixels8_y2_arm(right_dst, right_src, line_size, h);
}
/*
 * 16-pixel-wide variant of put_no_rnd_pixels8_xy2_arm: the 8-wide routine
 * is run once on the left half and once on the right half (byte offset 8)
 * of both the destination and the source.
 */
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    uint8_t *const right_dst = block + 8;
    const uint8_t *const right_src = pixels + 8;

    put_no_rnd_pixels8_xy2_arm(block, pixels, line_size, h);
    put_no_rnd_pixels8_xy2_arm(right_dst, right_src, line_size, h);
}
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size) | |||||
{ | |||||
asm volatile ( | |||||
"mov r10, #8 \n\t" | |||||
"1: \n\t" | |||||
/* load dest */ | |||||
"ldr r4, [%1] \n\t" | |||||
/* block[0] and block[1]*/ | |||||
"ldrsh r5, [%0] \n\t" | |||||
"ldrsh r7, [%0, #2] \n\t" | |||||
"and r6, r4, #0xFF \n\t" | |||||
"and r8, r4, #0xFF00 \n\t" | |||||
"add r6, r5, r6 \n\t" | |||||
"add r8, r7, r8, lsr #8 \n\t" | |||||
"mvn r5, r5 \n\t" | |||||
"mvn r7, r7 \n\t" | |||||
"tst r6, #0x100 \n\t" | |||||
"movne r6, r5, lsr #24 \n\t" | |||||
"tst r8, #0x100 \n\t" | |||||
"movne r8, r7, lsr #24 \n\t" | |||||
"mov r9, r6 \n\t" | |||||
"ldrsh r5, [%0, #4] \n\t" /* moved form [A] */ | |||||
"orr r9, r9, r8, lsl #8 \n\t" | |||||
/* block[2] and block[3] */ | |||||
/* [A] */ | |||||
"ldrsh r7, [%0, #6] \n\t" | |||||
"and r6, r4, #0xFF0000 \n\t" | |||||
"and r8, r4, #0xFF000000 \n\t" | |||||
"add r6, r5, r6, lsr #16 \n\t" | |||||
"add r8, r7, r8, lsr #24 \n\t" | |||||
"mvn r5, r5 \n\t" | |||||
"mvn r7, r7 \n\t" | |||||
"tst r6, #0x100 \n\t" | |||||
"movne r6, r5, lsr #24 \n\t" | |||||
"tst r8, #0x100 \n\t" | |||||
"movne r8, r7, lsr #24 \n\t" | |||||
"orr r9, r9, r6, lsl #16 \n\t" | |||||
"ldr r4, [%1, #4] \n\t" /* moved form [B] */ | |||||
"orr r9, r9, r8, lsl #24 \n\t" | |||||
/* store dest */ | |||||
"ldrsh r5, [%0, #8] \n\t" /* moved form [C] */ | |||||
"str r9, [%1] \n\t" | |||||
/* load dest */ | |||||
/* [B] */ | |||||
/* block[4] and block[5] */ | |||||
/* [C] */ | |||||
"ldrsh r7, [%0, #10] \n\t" | |||||
"and r6, r4, #0xFF \n\t" | |||||
"and r8, r4, #0xFF00 \n\t" | |||||
"add r6, r5, r6 \n\t" | |||||
"add r8, r7, r8, lsr #8 \n\t" | |||||
"mvn r5, r5 \n\t" | |||||
"mvn r7, r7 \n\t" | |||||
"tst r6, #0x100 \n\t" | |||||
"movne r6, r5, lsr #24 \n\t" | |||||
"tst r8, #0x100 \n\t" | |||||
"movne r8, r7, lsr #24 \n\t" | |||||
"mov r9, r6 \n\t" | |||||
"ldrsh r5, [%0, #12] \n\t" /* moved from [D] */ | |||||
"orr r9, r9, r8, lsl #8 \n\t" | |||||
/* block[6] and block[7] */ | |||||
/* [D] */ | |||||
"ldrsh r7, [%0, #14] \n\t" | |||||
"and r6, r4, #0xFF0000 \n\t" | |||||
"and r8, r4, #0xFF000000 \n\t" | |||||
"add r6, r5, r6, lsr #16 \n\t" | |||||
"add r8, r7, r8, lsr #24 \n\t" | |||||
"mvn r5, r5 \n\t" | |||||
"mvn r7, r7 \n\t" | |||||
"tst r6, #0x100 \n\t" | |||||
"movne r6, r5, lsr #24 \n\t" | |||||
"tst r8, #0x100 \n\t" | |||||
"movne r8, r7, lsr #24 \n\t" | |||||
"orr r9, r9, r6, lsl #16 \n\t" | |||||
"add %0, %0, #16 \n\t" /* moved from [E] */ | |||||
"orr r9, r9, r8, lsl #24 \n\t" | |||||
"subs r10, r10, #1 \n\t" /* moved from [F] */ | |||||
/* store dest */ | |||||
"str r9, [%1, #4] \n\t" | |||||
/* [E] */ | |||||
/* [F] */ | |||||
"add %1, %1, %2 \n\t" | |||||
"bne 1b \n\t" | |||||
: | |||||
: "r"(block), | |||||
"r"(dest), | |||||
"r"(line_size) | |||||
: "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" ); | |||||
} | |||||
/* XXX: those functions should be suppressed ASAP when all IDCTs are | /* XXX: those functions should be suppressed ASAP when all IDCTs are | ||||
converted */ | converted */ | ||||
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) | static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) | ||||
@@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) | |||||
simple_idct_ARM (block); | simple_idct_ARM (block); | ||||
ff_add_pixels_clamped(block, dest, line_size); | ff_add_pixels_clamped(block, dest, line_size); | ||||
} | } | ||||
/*
 * IDCT entry point backed by Intel IPP: hands the coefficient block to
 * ippiDCT8x8Inv_Video_16s_C1I. When HAVE_IPP is not defined the body is
 * empty and the block is left untouched.
 */
static void simple_idct_ipp(DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#endif
}
/*
 * IDCT-and-store entry point backed by Intel IPP: passes block, dest and
 * line_size to ippiDCT8x8Inv_Video_16s8u_C1R. When HAVE_IPP is not defined
 * the body is empty and dest is not written.
 */
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#endif
}
#ifdef HAVE_IWMMXT | |||||
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); | |||||
#endif | |||||
/*
 * IDCT-and-add entry point backed by Intel IPP: runs
 * ippiDCT8x8Inv_Video_16s_C1I on block, then adds the result to dest with
 * clamping -- through add_pixels_clamped_iwmmxt when HAVE_IWMMXT is defined,
 * otherwise through add_pixels_clamped_ARM. When HAVE_IPP is not defined the
 * body is empty.
 */
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP) && defined(HAVE_IWMMXT)
    ippiDCT8x8Inv_Video_16s_C1I(block);
    add_pixels_clamped_iwmmxt(block, dest, line_size);
#elif defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
    add_pixels_clamped_ARM(block, dest, line_size);
#endif
}
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | ||||
{ | { | ||||
@@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||||
ff_put_pixels_clamped = c->put_pixels_clamped; | ff_put_pixels_clamped = c->put_pixels_clamped; | ||||
ff_add_pixels_clamped = c->add_pixels_clamped; | ff_add_pixels_clamped = c->add_pixels_clamped; | ||||
#ifdef HAVE_IPP | |||||
if(idct_algo==FF_IDCT_ARM){ | |||||
#else | |||||
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){ | if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){ | ||||
#endif | |||||
c->idct_put= j_rev_dct_ARM_put; | c->idct_put= j_rev_dct_ARM_put; | ||||
c->idct_add= j_rev_dct_ARM_add; | c->idct_add= j_rev_dct_ARM_add; | ||||
c->idct = j_rev_dct_ARM; | c->idct = j_rev_dct_ARM; | ||||
@@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||||
c->idct_add= simple_idct_ARM_add; | c->idct_add= simple_idct_ARM_add; | ||||
c->idct = simple_idct_ARM; | c->idct = simple_idct_ARM; | ||||
c->idct_permutation_type= FF_NO_IDCT_PERM; | c->idct_permutation_type= FF_NO_IDCT_PERM; | ||||
#ifdef HAVE_IPP | |||||
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){ | |||||
#else | |||||
} else if (idct_algo==FF_IDCT_IPP){ | |||||
#endif | |||||
c->idct_put= simple_idct_ipp_put; | |||||
c->idct_add= simple_idct_ipp_add; | |||||
c->idct = simple_idct_ipp; | |||||
c->idct_permutation_type= FF_NO_IDCT_PERM; | |||||
} | } | ||||
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG! | |||||
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK! | |||||
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK! | |||||
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */ | |||||
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(not used) */ | |||||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK | |||||
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK | |||||
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */ | |||||
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK | |||||
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK | |||||
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */ | |||||
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */ | |||||
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK | |||||
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK | |||||
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK | |||||
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */ | |||||
#if 1 | |||||
#ifdef HAVE_IWMMXT | |||||
dsputil_init_iwmmxt(c, avctx); | |||||
#endif | |||||
#endif | |||||
} | } |
@@ -0,0 +1,694 @@ | |||||
@ | |||||
@ ARMv4L optimized DSP utils | |||||
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> | |||||
@ | |||||
@ This library is free software; you can redistribute it and/or | |||||
@ modify it under the terms of the GNU Lesser General Public | |||||
@ License as published by the Free Software Foundation; either | |||||
@ version 2 of the License, or (at your option) any later version. | |||||
@ | |||||
@ This library is distributed in the hope that it will be useful, | |||||
@ but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
@ Lesser General Public License for more details. | |||||
@ | |||||
@ You should have received a copy of the GNU Lesser General Public | |||||
@ License along with this library; if not, write to the Free Software | |||||
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||||
@ | |||||
@ ADJ_ALIGN_QUADWORD_D shift, Rd0-Rd3, Rn0-Rn4
@ Builds four output words from five consecutive input words:
@   Rd[i] = (Rn[i] >> (shift * 8)) | (Rn[i+1] << (32 - shift * 8))
@ i.e. the 16 bytes that start "shift" bytes into Rn0..Rn4 when the words are
@ viewed low byte first. shift is an assembly-time constant; the callers in
@ this file pass 1, 2 or 3. The Rn registers are only read.
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        mov \Rd0, \Rn0, lsr #(\shift * 8)
        mov \Rd1, \Rn1, lsr #(\shift * 8)
        mov \Rd2, \Rn2, lsr #(\shift * 8)
        mov \Rd3, \Rn3, lsr #(\shift * 8)
        orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
@ ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
@ In-place two-word version of the realignment:
@   R0 = (R0 >> (shift * 8)) | (R1 << (32 - shift * 8))
@   R1 = (R1 >> (shift * 8)) | (R2 << (32 - shift * 8))
@ R0 is finished before R1 is overwritten; R2 is only read.
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
        mov \R0, \R0, lsr #(\shift * 8)
        orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
        mov \R1, \R1, lsr #(\shift * 8)
        orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
@ ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
@ Two-word realignment into separate destination registers:
@   Rdst0 = (Rsrc0 >> (shift * 8)) | (Rsrc1 << (32 - shift * 8))
@   Rdst1 = (Rsrc1 >> (shift * 8)) | (Rsrc2 << (32 - shift * 8))
@ Rsrc0 is read only by the first instruction, before Rdst1 is written, so
@ Rdst1 may name the same register as Rsrc0.
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
@ RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Byte-wise average, rounding up, of two pairs of 32-bit words
@ (four packed bytes per word), computed without carries between bytes.
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroyed (overwritten with Rn | Rm); Rm is only read
        eor \Rd0, \Rn0, \Rm0
        eor \Rd1, \Rn1, \Rm1
        orr \Rn0, \Rn0, \Rm0
        orr \Rn1, \Rn1, \Rm1
        and \Rd0, \Rd0, \Rmask
        and \Rd1, \Rd1, \Rmask
        sub \Rd0, \Rn0, \Rd0, lsr #1
        sub \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Byte-wise average, rounding down, of two pairs of 32-bit words
@ (four packed bytes per word), computed without carries between bytes.
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroyed (overwritten with Rn & Rm); Rm is only read
        eor \Rd0, \Rn0, \Rm0
        eor \Rd1, \Rn1, \Rm1
        and \Rn0, \Rn0, \Rm0
        and \Rn1, \Rn1, \Rm1
        and \Rd0, \Rd0, \Rmask
        and \Rd1, \Rd1, \Rmask
        add \Rd0, \Rn0, \Rd0, lsr #1
        add \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ ---------------------------------------------------------------- | |||||
@ put_pixels16_arm: copy h rows of 16 bytes from pixels to block.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ The low two bits of pixels select one of four loops through the table at
@ label 5; the source pointer is then rounded down to a word boundary and the
@ unaligned cases rebuild each row from five words with ADJ_ALIGN_QUADWORD_D.
        .align 8
        .global put_pixels16_arm
put_pixels16_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r11, lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        bic r1, r1, #3                  @ word-align the source pointer
        add r5, r5, r4, lsl #2          @ r5 = &table[misalignment]
        ldrne pc, [r5]                  @ misaligned: jump to loop 2, 3 or 4
1:
        @ misalignment 0: straight four-word copy per row
        ldmia r1, {r4-r7}
        add r1, r1, r2
        stmia r0, {r4-r7}
        pld [r1]
        subs r3, r3, #1
        add r0, r0, r2
        bne 1b
        ldmfd sp!, {r4-r11, pc}
        .align 8
2:
        @ misalignment 1
        ldmia r1, {r4-r8}
        add r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r9-r12}
        add r0, r0, r2
        bne 2b
        ldmfd sp!, {r4-r11, pc}
        .align 8
3:
        @ misalignment 2
        ldmia r1, {r4-r8}
        add r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r9-r12}
        add r0, r0, r2
        bne 3b
        ldmfd sp!, {r4-r11, pc}
        .align 8
4:
        @ misalignment 3
        ldmia r1, {r4-r8}
        add r1, r1, r2
        ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r9-r12}
        add r0, r0, r2
        bne 4b
        ldmfd sp!, {r4-r11,pc}
        .align 8
5:
        @ jump table indexed by (pixels & 3)
        .word 1b
        .word 2b
        .word 3b
        .word 4b
@ ---------------------------------------------------------------- | |||||
@ put_pixels8_arm: copy h rows of 8 bytes from pixels to block.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ The low two bits of pixels select one of four loops through the table at
@ label 5; the unaligned cases load three words per row and realign them in
@ place with ADJ_ALIGN_DOUBLEWORD. r12 is used as scratch without being saved.
        .align 8
        .global put_pixels8_arm
put_pixels8_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r5,lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        bic r1, r1, #3                  @ word-align the source pointer
        add r5, r5, r4, lsl #2          @ r5 = &table[misalignment]
        ldrne pc, [r5]                  @ misaligned: jump to loop 2, 3 or 4
1:
        @ misalignment 0: straight two-word copy per row
        ldmia r1, {r4-r5}
        add r1, r1, r2
        subs r3, r3, #1
        pld [r1]
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 1b
        ldmfd sp!, {r4-r5,pc}
        .align 8
2:
        @ misalignment 1
        ldmia r1, {r4-r5, r12}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        ldmfd sp!, {r4-r5,pc}
        .align 8
3:
        @ misalignment 2
        ldmia r1, {r4-r5, r12}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        ldmfd sp!, {r4-r5,pc}
        .align 8
4:
        @ misalignment 3
        ldmia r1, {r4-r5, r12}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
        pld [r1]
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 4b
        ldmfd sp!, {r4-r5,pc}
        .align 8
5:
        @ jump table indexed by (pixels & 3)
        .word 1b
        .word 2b
        .word 3b
        .word 4b
@ ---------------------------------------------------------------- | |||||
@ put_pixels8_x2_arm: for h rows, write to block the rounded-up byte average
@ (RND_AVG32) of the 8 bytes at pixels and the 8 bytes at pixels + 1.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r12 holds the 0xFEFEFEFE mask, loaded from the first word of the table at
@ label 5. That slot can hold data because misalignment 0 falls through to
@ loop 1 and never jumps through the table.
        .align 8
        .global put_pixels8_x2_arm
put_pixels8_x2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        ldr r12, [r5]                   @ r12 = 0xFEFEFEFE
        add r5, r5, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to loop 2, 3 or 4
1:
        @ misalignment 0: r4,r5 = bytes 0..7, r6,r7 = bytes 1..8
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        pld [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 1b
        ldmfd sp!, {r4-r10,pc}
        .align 8
2:
        @ misalignment 1: average the views at byte offsets 1 and 2
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
        pld [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        ldmfd sp!, {r4-r10,pc}
        .align 8
3:
        @ misalignment 2: average the views at byte offsets 2 and 3
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
        pld [r1]
        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        ldmfd sp!, {r4-r10,pc}
        .align 8
4:
        @ misalignment 3: the offset-4 view is simply the words r5 and r10
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
        pld [r1]
        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 4b
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
        .align 8
5:
        @ word 0 is the averaging mask; words 1..3 are jump targets for
        @ (pixels & 3) == 1..3
        .word 0xFEFEFEFE
        .word 2b
        .word 3b
        .word 4b
@ put_no_rnd_pixels8_x2_arm: for h rows, write to block the rounded-down byte
@ average (NO_RND_AVG32) of the 8 bytes at pixels and the 8 bytes at
@ pixels + 1.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r12 holds the 0xFEFEFEFE mask, loaded from the first word of the table at
@ label 5. That slot can hold data because misalignment 0 falls through to
@ loop 1 and never jumps through the table.
        .align 8
        .global put_no_rnd_pixels8_x2_arm
put_no_rnd_pixels8_x2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r10,lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        ldr r12, [r5]                   @ r12 = 0xFEFEFEFE
        add r5, r5, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to loop 2, 3 or 4
1:
        @ misalignment 0: r4,r5 = bytes 0..7, r6,r7 = bytes 1..8
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 1b
        ldmfd sp!, {r4-r10,pc}
        .align 8
2:
        @ misalignment 1: average the views at byte offsets 1 and 2
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 2b
        ldmfd sp!, {r4-r10,pc}
        .align 8
3:
        @ misalignment 2: average the views at byte offsets 2 and 3
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 3b
        ldmfd sp!, {r4-r10,pc}
        .align 8
4:
        @ misalignment 3: the offset-4 view is simply the words r5 and r10
        ldmia r1, {r4-r5, r10}
        add r1, r1, r2
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
        pld [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 4b
        ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
        .align 8
5:
        @ word 0 is the averaging mask; words 1..3 are jump targets for
        @ (pixels & 3) == 1..3
        .word 0xFEFEFEFE
        .word 2b
        .word 3b
        .word 4b
@ ---------------------------------------------------------------- | |||||
@ put_pixels8_y2_arm: write to block the rounded-up byte average (RND_AVG32)
@ of each 8-byte source row and the row below it.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r3 is halved on entry and every loop pass emits two output rows, so
@ 2 * (h >> 1) rows are written. The first source row is loaded before the
@ loop (label 6 in each case) and each pass loads two more.
@ r12 holds the 0xFEFEFEFE mask, taken from the first word of the table at
@ label 5; misalignment 0 falls through and never jumps through that slot.
        .align 8
        .global put_pixels8_y2_arm
put_pixels8_y2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        mov r3, r3, lsr #1              @ loop count = h / 2 (flags untouched)
        ldr r12, [r5]                   @ r12 = 0xFEFEFEFE
        add r5, r5, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to case 2, 3 or 4
1:
        @ misalignment 0
        ldmia r1, {r4-r5}
        add r1, r1, r2
6:      ldmia r1, {r6-r7}
        add r1, r1, r2
        pld [r1]
        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldmia r1, {r4-r5}
        add r1, r1, r2
        stmia r0, {r8-r9}
        add r0, r0, r2
        pld [r1]
        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
2:
        @ misalignment 1
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
3:
        @ misalignment 2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
4:
        @ misalignment 3
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
        subs r3, r3, #1
        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
5:
        @ word 0 is the averaging mask; words 1..3 are jump targets for
        @ (pixels & 3) == 1..3
        .word 0xFEFEFEFE
        .word 2b
        .word 3b
        .word 4b
@ put_no_rnd_pixels8_y2_arm: write to block the rounded-down byte average
@ (NO_RND_AVG32) of each 8-byte source row and the row below it.
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r3 is halved on entry and every loop pass emits two output rows, so
@ 2 * (h >> 1) rows are written. The first source row is loaded before the
@ loop (label 6 in each case) and each pass loads two more.
@ r12 holds the 0xFEFEFEFE mask, taken from the first word of the table at
@ label 5; misalignment 0 falls through and never jumps through that slot.
        .align 8
        .global put_no_rnd_pixels8_y2_arm
put_no_rnd_pixels8_y2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
        adr r5, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        mov r3, r3, lsr #1              @ loop count = h / 2 (flags untouched)
        ldr r12, [r5]                   @ r12 = 0xFEFEFEFE
        add r5, r5, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to case 2, 3 or 4
1:
        @ misalignment 0
        ldmia r1, {r4-r5}
        add r1, r1, r2
6:      ldmia r1, {r6-r7}
        add r1, r1, r2
        pld [r1]
        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
        ldmia r1, {r4-r5}
        add r1, r1, r2
        stmia r0, {r8-r9}
        add r0, r0, r2
        pld [r1]
        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
        subs r3, r3, #1
        stmia r0, {r8-r9}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
2:
        @ misalignment 1
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
3:
        @ misalignment 2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
4:
        @ misalignment 3
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6:      ldmia r1, {r7-r9}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        ldmia r1, {r4-r6}
        add r1, r1, r2
        pld [r1]
        ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
        subs r3, r3, #1
        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
        stmia r0, {r10-r11}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
        .align 8
5:
        @ word 0 is the averaging mask; words 1..3 are jump targets for
        @ (pixels & 3) == 1..3
        .word 0xFEFEFEFE
        .word 2b
        .word 3b
        .word 4b
@ ---------------------------------------------------------------- | |||||
@ RND_XY2_IT align, rnd
@ One source row of the xy2 (2x2 average) filter.
@ Loads three words from r1 (already word-aligned), advances r1 by r2, and
@ leaves in r4,r5 and r6,r7 the two 8-byte views of the row that are one byte
@ apart ("a" and "b" below; which pair is which depends on align, but the sums
@ are symmetric). It then produces the horizontal partial sums:
@   r8,r9   = (a & 0x03030303) + (b & 0x03030303), plus the rounding constant
@             when bit 0 of r3 is clear
@   r10,r11 = ((a >> 2) & 0x3F3F3F3F) + ((b >> 2) & 0x3F3F3F3F)
@ Expects r12 = address of the constant table (label 5 of the caller) and uses
@ r14 as scratch. rnd == 1 selects the 0x02020202 constant, otherwise
@ 0x01010101.
.macro RND_XY2_IT align, rnd
        @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
        ldmia r1, {r6-r8}
.elseif \align == 3
        ldmia r1, {r5-r7}
.else
        ldmia r1, {r8-r10}
.endif
        add r1, r1, r2
        pld [r1]
.if \align == 0
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
        ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
        ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
        ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
        ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
        ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
.endif
        ldr r14, [r12, #0] @ 0x03030303
        tst r3, #1
        and r8, r4, r14
        and r9, r5, r14
        and r10, r6, r14
        and r11, r7, r14
.if \rnd == 1
        ldreq r14, [r12, #16] @ 0x02020202
.else
        ldreq r14, [r12, #28] @ 0x01010101
.endif
        add r8, r8, r10
        add r9, r9, r11
        addeq r8, r8, r14
        addeq r9, r9, r14
        ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
        and r4, r14, r4, lsr #2
        and r5, r14, r5, lsr #2
        and r6, r14, r6, lsr #2
        and r7, r14, r7, lsr #2
        add r10, r4, r6
        add r11, r5, r7
.endm
@ RND_XY2_EXPAND align, rnd
@ Complete loop body and epilogue of an xy2 routine for one alignment case.
@ Runs RND_XY2_IT once for the first source row, then per output row:
@ pushes the previous row's partial sums, runs RND_XY2_IT for the next source
@ row, pops the previous sums into r4-r7, and stores
@   ((low_prev + low_cur) >> 2 & 0x0F0F0F0F) + (high_prev + high_cur)
@ to r0. r3 counts output rows; the macro ends by popping r4-r11 and pc, so
@ the caller must have pushed {r4-r11, lr}.
@ NOTE(review): r3 is not decremented between the initial RND_XY2_IT and the
@ first pass of the loop, so both see the same parity in "tst r3, #1" and the
@ first output row appears to receive the rounding constant twice (or not at
@ all for odd h) -- confirm; this may be why the C init code marks the xy2
@ routines "NG".
.macro RND_XY2_EXPAND align, rnd
        RND_XY2_IT \align, \rnd
6:      stmfd sp!, {r8-r11}
        RND_XY2_IT \align, \rnd
        ldmfd sp!, {r4-r7}
        add r4, r4, r8
        add r5, r5, r9
        add r6, r6, r10
        add r7, r7, r11
        ldr r14, [r12, #24] @ 0x0F0F0F0F
        and r4, r14, r4, lsr #2
        and r5, r14, r5, lsr #2
        add r4, r4, r6
        add r5, r5, r7
        subs r3, r3, #1
        stmia r0, {r4-r5}
        add r0, r0, r2
        bne 6b
        ldmfd sp!, {r4-r11,pc}
.endm
@ put_pixels8_xy2_arm: 8-byte-wide xy2 routine, rounding variant
@ (RND_XY2_EXPAND with rnd = 1).
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r12 is set to the table at label 5 and stays live for the macros. The table
@ mixes constants and jump targets: word 0 is a constant, words 1..3 are the
@ targets for (pixels & 3) == 1..3, words 4..7 are further constants read at
@ byte offsets 16, 20, 24 and 28.
        .align 8
        .global put_pixels8_xy2_arm
put_pixels8_xy2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
        adrl r12, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        add r5, r12, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to case 2, 3 or 4
1:
        RND_XY2_EXPAND 0, 1
        .align 8
2:
        RND_XY2_EXPAND 1, 1
        .align 8
3:
        RND_XY2_EXPAND 2, 1
        .align 8
4:
        RND_XY2_EXPAND 3, 1
5:
        .word 0x03030303
        .word 2b
        .word 3b
        .word 4b
        .word 0x02020202
        .word 0xFCFCFCFC >> 2
        .word 0x0F0F0F0F
        .word 0x01010101
@ put_no_rnd_pixels8_xy2_arm: 8-byte-wide xy2 routine, no-rounding variant
@ (RND_XY2_EXPAND with rnd = 0, which selects the 0x01010101 constant).
@ Registers on entry: r0 = block, r1 = pixels, r2 = line_size, r3 = h.
@ r12 is set to the table at label 5 and stays live for the macros. The table
@ mixes constants and jump targets: word 0 is a constant, words 1..3 are the
@ targets for (pixels & 3) == 1..3, words 4..7 are further constants read at
@ byte offsets 16, 20, 24 and 28.
        .align 8
        .global put_no_rnd_pixels8_xy2_arm
put_no_rnd_pixels8_xy2_arm:
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld [r1]
        stmfd sp!, {r4-r11,lr} @ R14 is also called LR
        adrl r12, 5f
        ands r4, r1, #3                 @ r4 = byte misalignment of pixels, sets Z
        add r5, r12, r4, lsl #2
        bic r1, r1, #3                  @ word-align the source pointer
        ldrne pc, [r5]                  @ misaligned: jump to case 2, 3 or 4
1:
        RND_XY2_EXPAND 0, 0
        .align 8
2:
        RND_XY2_EXPAND 1, 0
        .align 8
3:
        RND_XY2_EXPAND 2, 0
        .align 8
4:
        RND_XY2_EXPAND 3, 0
5:
        .word 0x03030303
        .word 2b
        .word 3b
        .word 4b
        .word 0x02020202
        .word 0xFCFCFCFC >> 2
        .word 0x0F0F0F0F
        .word 0x01010101
@@ -0,0 +1,168 @@ | |||||
/* | |||||
* iWMMXt optimized DSP utils | |||||
* Copyright (c) 2004 AGAWA Koji | |||||
* | |||||
* This library is free software; you can redistribute it and/or | |||||
* modify it under the terms of the GNU Lesser General Public | |||||
* License as published by the Free Software Foundation; either | |||||
* version 2 of the License, or (at your option) any later version. | |||||
* | |||||
* This library is distributed in the hope that it will be useful, | |||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
* Lesser General Public License for more details. | |||||
* | |||||
* You should have received a copy of the GNU Lesser General Public | |||||
* License along with this library; if not, write to the Free Software | |||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||||
*/ | |||||
#include "../dsputil.h" | |||||
/*
 * dsputil_iwmmxt_rnd.h is a template that is included twice. Before each
 * inclusion three macros configure it:
 *   DEF(x, y)     - builds the name of each generated function
 *   SET_RND(regd) - broadcasts the rounding constant into every halfword of
 *                   iWMMXt register regd (uses r12 as scratch)
 *   WAVG2B        - mnemonic of the byte-average instruction to use
 * All three are undefined again after each inclusion so they cannot leak
 * into the code that follows.
 */

/* First pass: the "_no_rnd_" function family (constant 1, "wavg2b"). */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B

/* Second pass: the rounding function family (constant 2, "wavg2br"). */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B; the original undefined the
 * non-existent name WAVG2BR and left WAVG2B defined. */
#undef WAVG2B
/*
 * OP(AVG): body shared by the two 8-pixel-wide "y2" put functions below.
 *
 * Each output row is the byte-wise average (instruction AVG) of two
 * vertically adjacent source rows. The source pointer may be unaligned:
 * its low three bits are placed in wcgr1 and every row is fetched as two
 * aligned 8-byte loads combined with walignr1.
 *
 * The loop produces two rows per iteration and ends when h, decremented by
 * 2 each time, reaches exactly zero, so h must be a positive even number.
 *
 * Expects the enclosing function to provide block, pixels, line_size and h.
 * r12 is used as scratch. "cc" is listed as clobbered because "subs"
 * rewrites the condition flags.
 *
 * TODO: the instruction order has not been scheduled ("need scheduling").
 */
#define OP(AVG)                                                 \
    __asm__ __volatile__ (                                      \
        /* alignment */                                         \
        "and r12, %[pixels], #7 \n\t"                           \
        "bic %[pixels], %[pixels], #7 \n\t"                     \
        "tmcr wcgr1, r12 \n\t"                                  \
                                                                \
        /* first source row -> wr4 */                           \
        "wldrd wr0, [%[pixels]] \n\t"                           \
        "wldrd wr1, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "walignr1 wr4, wr0, wr1 \n\t"                           \
                                                                \
        "1: \n\t"                                               \
                                                                \
        /* next row -> wr5, output = AVG(wr4, wr5) */           \
        "wldrd wr2, [%[pixels]] \n\t"                           \
        "wldrd wr3, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "pld [%[pixels]] \n\t"                                  \
        "walignr1 wr5, wr2, wr3 \n\t"                           \
        AVG " wr6, wr4, wr5 \n\t"                               \
        "wstrd wr6, [%[block]] \n\t"                            \
        "add %[block], %[block], %[line_size] \n\t"             \
                                                                \
        /* next row -> wr4, output = AVG(wr4, wr5) */           \
        "wldrd wr0, [%[pixels]] \n\t"                           \
        "wldrd wr1, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "walignr1 wr4, wr0, wr1 \n\t"                           \
        "pld [%[pixels]] \n\t"                                  \
        AVG " wr6, wr4, wr5 \n\t"                               \
        "wstrd wr6, [%[block]] \n\t"                            \
        "add %[block], %[block], %[line_size] \n\t"             \
                                                                \
        "subs %[h], %[h], #2 \n\t"                              \
        "bne 1b \n\t"                                           \
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
        : [line_size]"r"(line_size)                             \
        : "cc", "memory", "r12");

/**
 * Vertical half-pel interpolation of an 8-pixel-wide block using the
 * rounding average instruction (wavg2br).
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, may be unaligned
 * @param line_size distance in bytes between rows of both buffers
 * @param h         number of rows to write; must be even and > 0
 */
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2br");
}

/**
 * Same as put_pixels8_y2_iwmmxt() but uses the non-rounding average
 * instruction (wavg2b).
 */
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2b");
}
#undef OP
/**
 * Add a block of 16-bit coefficients to an 8x8 area of pixels, clamping each
 * result to the unsigned byte range, and store the result back in place.
 *
 * The loop runs four times and handles two pixel rows per pass (pixels and
 * pixels2), i.e. eight rows of eight pixels. Each pass consumes 32 bytes of
 * block (two rows of eight halfword coefficients).
 *
 * @param block     coefficients, read sequentially
 * @param pixels    top-left pixel of the area to update
 * @param line_size distance in bytes between consecutive pixel rows
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) | |||||
{ | |||||
/* pixels2 walks the odd rows while pixels walks the even rows. */
uint8_t *pixels2 = pixels + line_size; | |||||
__asm__ __volatile__ ( | |||||
/* r12 = loop counter: 4 passes of 2 rows each. */
"mov r12, #4 \n\t" | |||||
"1: \n\t" | |||||
/* Prefetch the pixel rows of the next pass. */
"pld [%[pixels], %[line_size2]] \n\t" | |||||
"pld [%[pixels2], %[line_size2]] \n\t" | |||||
/* wr4 / wr5 = 8 pixels of the even / odd row. */
"wldrd wr4, [%[pixels]] \n\t" | |||||
"wldrd wr5, [%[pixels2]] \n\t" | |||||
"pld [%[block], #32] \n\t" | |||||
/* Zero-extend the pixel bytes to halfwords (low and high half separately),
 * interleaved with the loads of the 16 coefficients into wr0..wr3. */
"wunpckelub wr6, wr4 \n\t" | |||||
"wldrd wr0, [%[block]] \n\t" | |||||
"wunpckehub wr7, wr4 \n\t" | |||||
"wldrd wr1, [%[block], #8] \n\t" | |||||
"wunpckelub wr8, wr5 \n\t" | |||||
"wldrd wr2, [%[block], #16] \n\t" | |||||
"wunpckehub wr9, wr5 \n\t" | |||||
"wldrd wr3, [%[block], #24] \n\t" | |||||
"add %[block], %[block], #32 \n\t" | |||||
/* coefficient + pixel, halfword-wise with signed saturation. */
"waddhss wr10, wr0, wr6 \n\t" | |||||
"waddhss wr11, wr1, wr7 \n\t" | |||||
"waddhss wr12, wr2, wr8 \n\t" | |||||
"waddhss wr13, wr3, wr9 \n\t" | |||||
/* Pack the halfwords back to bytes with unsigned saturation (the clamp). */
"wpackhus wr14, wr10, wr11 \n\t" | |||||
"wpackhus wr15, wr12, wr13 \n\t" | |||||
/* Store both rows and advance each row pointer by two lines. */
"wstrd wr14, [%[pixels]] \n\t" | |||||
"add %[pixels], %[pixels], %[line_size2] \n\t" | |||||
"subs r12, r12, #1 \n\t" | |||||
"wstrd wr15, [%[pixels2]] \n\t" | |||||
"add %[pixels2], %[pixels2], %[line_size2] \n\t" | |||||
"bne 1b \n\t" | |||||
: [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) | |||||
: [line_size2]"r"(line_size << 1) | |||||
: "cc", "memory", "r12"); | |||||
} | |||||
/*
 * Do-nothing stand-in with the same signature as the pixel put/avg
 * routines. Every argument is ignored and no memory is touched.
 */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
/**
 * Install the iWMMXt implementations into a DSPContext.
 *
 * Overwrites add_pixels_clamped and all four entries (index 0..3, using the
 * plain, _x2, _y2 and _xy2 functions respectively) of the put, put_no_rnd,
 * avg and avg_no_rnd pixel tables for both table rows: row 0 receives the
 * pixels16 functions, row 1 the pixels8 functions.
 *
 * @param c     context whose function pointers are replaced
 * @param avctx not used by this function
 */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;

    /* ---- row 0: pixels16 functions ---- */
    c->put_pixels_tab[0][0]        = put_pixels16_iwmmxt;
    c->put_pixels_tab[0][1]        = put_pixels16_x2_iwmmxt;
    c->put_pixels_tab[0][2]        = put_pixels16_y2_iwmmxt;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_iwmmxt;

    /* Entry 0 of the no_rnd table reuses the plain put function. */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;

    c->avg_pixels_tab[0][0]        = avg_pixels16_iwmmxt;
    c->avg_pixels_tab[0][1]        = avg_pixels16_x2_iwmmxt;
    c->avg_pixels_tab[0][2]        = avg_pixels16_y2_iwmmxt;
    c->avg_pixels_tab[0][3]        = avg_pixels16_xy2_iwmmxt;

    /* NOTE(review): row 0 uses avg_pixels16 for entry 0 while row 1 below
     * uses avg_no_rnd_pixels8 -- confirm that this asymmetry is intended. */
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;

    /* ---- row 1: pixels8 functions ---- */
    c->put_pixels_tab[1][0]        = put_pixels8_iwmmxt;
    c->put_pixels_tab[1][1]        = put_pixels8_x2_iwmmxt;
    c->put_pixels_tab[1][2]        = put_pixels8_y2_iwmmxt;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_iwmmxt;

    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;

    c->avg_pixels_tab[1][0]        = avg_pixels8_iwmmxt;
    c->avg_pixels_tab[1][1]        = avg_pixels8_x2_iwmmxt;
    c->avg_pixels_tab[1][2]        = avg_pixels8_y2_iwmmxt;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_iwmmxt;

    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}
@@ -21,6 +21,13 @@ | |||||
#include "../mpegvideo.h" | #include "../mpegvideo.h" | ||||
#include "../avcodec.h" | #include "../avcodec.h" | ||||
/* The iWMMXt initialiser is only declared when the build defines
 * HAVE_IWMMXT. */
#ifdef HAVE_IWMMXT | |||||
extern void MPV_common_init_iwmmxt(MpegEncContext *s); | |||||
#endif | |||||
/**
 * ARM-specific MpegEncContext initialisation hook.
 * Forwards to MPV_common_init_iwmmxt() when HAVE_IWMMXT is defined and does
 * nothing otherwise.
 */
void MPV_common_init_armv4l(MpegEncContext *s) | void MPV_common_init_armv4l(MpegEncContext *s) | ||||
{ | { | ||||
#ifdef HAVE_IWMMXT | |||||
MPV_common_init_iwmmxt(s); | |||||
#endif | |||||
} | } |
@@ -0,0 +1,97 @@ | |||||
#include "../dsputil.h" | |||||
#include "../mpegvideo.h" | |||||
#include "../avcodec.h" | |||||
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, | |||||
DCTELEM *block, int n, int qscale) | |||||
{ | |||||
int level, qmul, qadd; | |||||
int nCoeffs; | |||||
DCTELEM *block_orig = block; | |||||
assert(s->block_last_index[n]>=0); | |||||
qmul = qscale << 1; | |||||
if (!s->h263_aic) { | |||||
if (n < 4) | |||||
level = block[0] * s->y_dc_scale; | |||||
else | |||||
level = block[0] * s->c_dc_scale; | |||||
qadd = (qscale - 1) | 1; | |||||
}else{ | |||||
qadd = 0; | |||||
level = block[0]; | |||||
} | |||||
if(s->ac_pred) | |||||
nCoeffs=63; | |||||
else | |||||
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |||||
__asm__ __volatile__ ( | |||||
/* "movd %1, %%mm6 \n\t" //qmul */ | |||||
/* "packssdw %%mm6, %%mm6 \n\t" */ | |||||
/* "packssdw %%mm6, %%mm6 \n\t" */ | |||||
"tbcsth wr6, %[qmul] \n\t" | |||||
/* "movd %2, %%mm5 \n\t" //qadd */ | |||||
/* "packssdw %%mm5, %%mm5 \n\t" */ | |||||
/* "packssdw %%mm5, %%mm5 \n\t" */ | |||||
"tbcsth wr5, %[qadd] \n\t" | |||||
"wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ | |||||
"wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ | |||||
"wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ | |||||
"1: \n\t" | |||||
"wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ | |||||
"wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ | |||||
"wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ | |||||
"wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ | |||||
/* "movq (%0, %3), %%mm2 \n\t" */ | |||||
/* "movq 8(%0, %3), %%mm3 \n\t" */ | |||||
"wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ | |||||
"wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ | |||||
"wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ | |||||
"wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ | |||||
"waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ | |||||
"waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ | |||||
"wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ | |||||
"wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ | |||||
"wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ | |||||
"wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ | |||||
"wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ | |||||
"wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ | |||||
"wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ | |||||
"wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ | |||||
"add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ | |||||
"subs %[i], %[i], #1 \n\t" | |||||
"bne 1b \n\t" /* "jng 1b \n\t" */ | |||||
:[block]"+r"(block) | |||||
:[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) | |||||
:"memory"); | |||||
block_orig[0] = level; | |||||
} | |||||
/* Disabled: inter-block dequantiser that hands the work to
 * ippiQuantInvInter_Compact_H263_16s_I (presumably an Intel IPP routine --
 * confirm before re-enabling). Passes nCoeffs+1 as the coefficient count. */
#if 0 | |||||
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, | |||||
DCTELEM *block, int n, int qscale) | |||||
{ | |||||
int nCoeffs; | |||||
assert(s->block_last_index[n]>=0); | |||||
/* With AC prediction the whole block (index 63) is processed; otherwise only
 * up to the raster end of the last coded coefficient. */
if(s->ac_pred) | |||||
nCoeffs=63; | |||||
else | |||||
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |||||
ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); | |||||
} | |||||
#endif | |||||
/**
 * Install the iWMMXt dequantiser into an MpegEncContext.
 * Only the H.263 intra hook is replaced; the inter hook assignment is
 * compiled out with #if 0.
 */
void MPV_common_init_iwmmxt(MpegEncContext *s) | |||||
{ | |||||
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; | |||||
#if 0 | |||||
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; | |||||
#endif | |||||
} |
@@ -1180,6 +1180,7 @@ typedef struct AVCodecContext { | |||||
#define FF_IDCT_SIMPLEARM 10 | #define FF_IDCT_SIMPLEARM 10 | ||||
#define FF_IDCT_H264 11 | #define FF_IDCT_H264 11 | ||||
#define FF_IDCT_VP3 12 | #define FF_IDCT_VP3 12 | ||||
/* IDCT id 13 (presumably the Intel IPP backend -- confirm). Named with the
 * FF_IDCT_ prefix used by every other id in this list. */
#define FF_IDCT_IPP 13
/* Original misspelled name, kept so existing users keep compiling. */
#define FP_IDCT_IPP FF_IDCT_IPP
/** | /** | ||||
* slice count. | * slice count. | ||||
@@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){ | |||||
return (x>>8) | (x<<8); | return (x>>8) | (x<<8); | ||||
} | } | ||||
#ifdef ARCH_ARM
/**
 * Reverse the byte order of a 32-bit value (ARM inline-asm version).
 *
 * Four-instruction sequence:
 *   t = x ^ ror(x, 16);  t &= ~0x00FF0000;
 *   x = ror(x, 8);       x ^= t >> 8;
 *
 * @param x value to swap
 * @return  x with its four bytes in reverse order
 */
static always_inline uint32_t bswap_32(uint32_t x){
    uint32_t t;
    __asm__ (
      "eor %1, %0, %0, ror #16 \n\t"
      "bic %1, %1, #0xFF0000   \n\t"
      "mov %0, %0, ror #8      \n\t"
      "eor %0, %0, %1, lsr #8  \n\t"
      /* t is pure scratch: it is written before it is ever read, and it is
       * written while x is still needed, hence the early-clobber output
       * "=&r" instead of "+r" (which would read the uninitialised t). */
      : "+r"(x), "=&r"(t));
    return x;
}
#else
/**
 * Reverse the byte order of a 32-bit value (portable C version).
 *
 * @param x value to swap
 * @return  x with its four bytes in reverse order
 */
static always_inline uint32_t bswap_32(uint32_t x){
    /* Swap the bytes inside each 16-bit half, then swap the two halves. */
    x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
    return (x>>16) | (x<<16);
}
#endif
static inline uint64_t bswap_64(uint64_t x) | static inline uint64_t bswap_64(uint64_t x) | ||||
{ | { | ||||