While playing with some new hardware, I found it's running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer Unfortunately it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us with ARM CPUs). The patches I've attached are the patches from atty.jp with a couple of modifications of my own: - ported to current CVS - reverted their change of removing SNOW support from ffmpeg - cleaned up their bswap mess - removed DOS-style linebreaks from various files. Patch by Bernhard Rosenkraenzer (bero, arklinux org). Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk tags/v0.5
@@ -316,8 +316,11 @@ endif | |||
# armv4l specific stuff | |||
ifeq ($(TARGET_ARCH_ARMV4L),yes) | |||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o | |||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o | |||
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o | |||
ifeq ($(TARGET_IWMMXT),yes) | |||
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o | |||
endif | |||
endif | |||
# sun mediaLib specific stuff | |||
@@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o | |||
CFLAGS += $(MLIB_INC) | |||
endif | |||
# Intel IPP specific stuff | |||
# currently only works when libavcodec is used in mplayer | |||
ifeq ($(HAVE_IPP),yes) | |||
CFLAGS += $(IPP_INC) | |||
endif | |||
# alpha specific stuff | |||
ifeq ($(TARGET_ARCH_ALPHA),yes) | |||
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \ | |||
@@ -18,6 +18,13 @@ | |||
*/ | |||
#include "../dsputil.h" | |||
#ifdef HAVE_IPP | |||
#include "ipp.h" | |||
#endif | |||
#ifdef HAVE_IWMMXT | |||
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); | |||
#endif | |||
extern void j_rev_dct_ARM(DCTELEM *data); | |||
extern void simple_idct_ARM(DCTELEM *data); | |||
@@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data); | |||
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_x2_arm is
 * run at byte offset 0 and then at byte offset 8 of both buffers, with the
 * same line_size and h. */
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_pixels8_x2_arm(block + half, pixels + half, line_size, h);
    }
}
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_y2_arm is
 * run at byte offset 0 and then at byte offset 8 of both buffers, with the
 * same line_size and h. */
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_pixels8_y2_arm(block + half, pixels + half, line_size, h);
    }
}
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_xy2_arm is
 * run at byte offset 0 and then at byte offset 8 of both buffers, with the
 * same line_size and h. */
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_pixels8_xy2_arm(block + half, pixels + half, line_size, h);
    }
}
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_x2_arm is run at byte offset 0 and then at byte
 * offset 8 of both buffers, with the same line_size and h. */
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_no_rnd_pixels8_x2_arm(block + half, pixels + half, line_size, h);
    }
}
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_y2_arm is run at byte offset 0 and then at byte
 * offset 8 of both buffers, with the same line_size and h. */
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_no_rnd_pixels8_y2_arm(block + half, pixels + half, line_size, h);
    }
}
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_xy2_arm is run at byte offset 0 and then at byte
 * offset 8 of both buffers, with the same line_size and h. */
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_no_rnd_pixels8_xy2_arm(block + half, pixels + half, line_size, h);
    }
}
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size) | |||
{ | |||
asm volatile ( | |||
"mov r10, #8 \n\t" | |||
"1: \n\t" | |||
/* load dest */ | |||
"ldr r4, [%1] \n\t" | |||
/* block[0] and block[1]*/ | |||
"ldrsh r5, [%0] \n\t" | |||
"ldrsh r7, [%0, #2] \n\t" | |||
"and r6, r4, #0xFF \n\t" | |||
"and r8, r4, #0xFF00 \n\t" | |||
"add r6, r5, r6 \n\t" | |||
"add r8, r7, r8, lsr #8 \n\t" | |||
"mvn r5, r5 \n\t" | |||
"mvn r7, r7 \n\t" | |||
"tst r6, #0x100 \n\t" | |||
"movne r6, r5, lsr #24 \n\t" | |||
"tst r8, #0x100 \n\t" | |||
"movne r8, r7, lsr #24 \n\t" | |||
"mov r9, r6 \n\t" | |||
"ldrsh r5, [%0, #4] \n\t" /* moved form [A] */ | |||
"orr r9, r9, r8, lsl #8 \n\t" | |||
/* block[2] and block[3] */ | |||
/* [A] */ | |||
"ldrsh r7, [%0, #6] \n\t" | |||
"and r6, r4, #0xFF0000 \n\t" | |||
"and r8, r4, #0xFF000000 \n\t" | |||
"add r6, r5, r6, lsr #16 \n\t" | |||
"add r8, r7, r8, lsr #24 \n\t" | |||
"mvn r5, r5 \n\t" | |||
"mvn r7, r7 \n\t" | |||
"tst r6, #0x100 \n\t" | |||
"movne r6, r5, lsr #24 \n\t" | |||
"tst r8, #0x100 \n\t" | |||
"movne r8, r7, lsr #24 \n\t" | |||
"orr r9, r9, r6, lsl #16 \n\t" | |||
"ldr r4, [%1, #4] \n\t" /* moved form [B] */ | |||
"orr r9, r9, r8, lsl #24 \n\t" | |||
/* store dest */ | |||
"ldrsh r5, [%0, #8] \n\t" /* moved form [C] */ | |||
"str r9, [%1] \n\t" | |||
/* load dest */ | |||
/* [B] */ | |||
/* block[4] and block[5] */ | |||
/* [C] */ | |||
"ldrsh r7, [%0, #10] \n\t" | |||
"and r6, r4, #0xFF \n\t" | |||
"and r8, r4, #0xFF00 \n\t" | |||
"add r6, r5, r6 \n\t" | |||
"add r8, r7, r8, lsr #8 \n\t" | |||
"mvn r5, r5 \n\t" | |||
"mvn r7, r7 \n\t" | |||
"tst r6, #0x100 \n\t" | |||
"movne r6, r5, lsr #24 \n\t" | |||
"tst r8, #0x100 \n\t" | |||
"movne r8, r7, lsr #24 \n\t" | |||
"mov r9, r6 \n\t" | |||
"ldrsh r5, [%0, #12] \n\t" /* moved from [D] */ | |||
"orr r9, r9, r8, lsl #8 \n\t" | |||
/* block[6] and block[7] */ | |||
/* [D] */ | |||
"ldrsh r7, [%0, #14] \n\t" | |||
"and r6, r4, #0xFF0000 \n\t" | |||
"and r8, r4, #0xFF000000 \n\t" | |||
"add r6, r5, r6, lsr #16 \n\t" | |||
"add r8, r7, r8, lsr #24 \n\t" | |||
"mvn r5, r5 \n\t" | |||
"mvn r7, r7 \n\t" | |||
"tst r6, #0x100 \n\t" | |||
"movne r6, r5, lsr #24 \n\t" | |||
"tst r8, #0x100 \n\t" | |||
"movne r8, r7, lsr #24 \n\t" | |||
"orr r9, r9, r6, lsl #16 \n\t" | |||
"add %0, %0, #16 \n\t" /* moved from [E] */ | |||
"orr r9, r9, r8, lsl #24 \n\t" | |||
"subs r10, r10, #1 \n\t" /* moved from [F] */ | |||
/* store dest */ | |||
"str r9, [%1, #4] \n\t" | |||
/* [E] */ | |||
/* [F] */ | |||
"add %1, %1, %2 \n\t" | |||
"bne 1b \n\t" | |||
: | |||
: "r"(block), | |||
"r"(dest), | |||
"r"(line_size) | |||
: "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" ); | |||
} | |||
/* XXX: those functions should be suppressed ASAP when all IDCTs are | |||
converted */ | |||
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) | |||
@@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) | |||
simple_idct_ARM (block); | |||
ff_add_pixels_clamped(block, dest, line_size); | |||
} | |||
/* In-place inverse DCT of one block through ippiDCT8x8Inv_Video_16s_C1I.
 * When HAVE_IPP is not defined the function compiles to an empty body and
 * leaves block untouched. */
static void simple_idct_ipp(DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#else
    (void)block;
#endif
}
/* Hands block, dest and line_size to ippiDCT8x8Inv_Video_16s8u_C1R.
 * When HAVE_IPP is not defined the function compiles to an empty body and
 * writes nothing. */
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#else
    (void)dest;
    (void)line_size;
    (void)block;
#endif
}
#ifdef HAVE_IWMMXT | |||
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
#endif | |||
/* Runs ippiDCT8x8Inv_Video_16s_C1I on block in place, then adds the result
 * to dest with clamping: through add_pixels_clamped_iwmmxt when HAVE_IWMMXT
 * is defined, otherwise through add_pixels_clamped_ARM.
 * When HAVE_IPP is not defined the function compiles to an empty body. */
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#  if defined(HAVE_IWMMXT)
    add_pixels_clamped_iwmmxt(block, dest, line_size);
#  else
    add_pixels_clamped_ARM(block, dest, line_size);
#  endif
#else
    (void)dest;
    (void)line_size;
    (void)block;
#endif
}
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
{ | |||
@@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
ff_put_pixels_clamped = c->put_pixels_clamped; | |||
ff_add_pixels_clamped = c->add_pixels_clamped; | |||
#ifdef HAVE_IPP | |||
if(idct_algo==FF_IDCT_ARM){ | |||
#else | |||
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){ | |||
#endif | |||
c->idct_put= j_rev_dct_ARM_put; | |||
c->idct_add= j_rev_dct_ARM_add; | |||
c->idct = j_rev_dct_ARM; | |||
@@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
c->idct_add= simple_idct_ARM_add; | |||
c->idct = simple_idct_ARM; | |||
c->idct_permutation_type= FF_NO_IDCT_PERM; | |||
#ifdef HAVE_IPP | |||
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){ | |||
#else | |||
} else if (idct_algo==FF_IDCT_IPP){ | |||
#endif | |||
c->idct_put= simple_idct_ipp_put; | |||
c->idct_add= simple_idct_ipp_add; | |||
c->idct = simple_idct_ipp; | |||
c->idct_permutation_type= FF_NO_IDCT_PERM; | |||
} | |||
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG! | |||
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK! | |||
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK! | |||
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */ | |||
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ? (not used) */ | |||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK | |||
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK | |||
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */ | |||
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK | |||
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK | |||
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */ | |||
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */ | |||
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK | |||
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK | |||
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK | |||
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */ | |||
#if 1 | |||
#ifdef HAVE_IWMMXT | |||
dsputil_init_iwmmxt(c, avctx); | |||
#endif | |||
#endif | |||
} |
@@ -0,0 +1,694 @@ | |||
@ | |||
@ ARMv4L optimized DSP utils | |||
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> | |||
@ | |||
@ This library is free software; you can redistribute it and/or | |||
@ modify it under the terms of the GNU Lesser General Public | |||
@ License as published by the Free Software Foundation; either | |||
@ version 2 of the License, or (at your option) any later version. | |||
@ | |||
@ This library is distributed in the hope that it will be useful, | |||
@ but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
@ Lesser General Public License for more details. | |||
@ | |||
@ You should have received a copy of the GNU Lesser General Public | |||
@ License along with this library; if not, write to the Free Software | |||
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
@ | |||
@ ADJ_ALIGN_QUADWORD_D shift, Rd0..Rd3, Rn0..Rn4
@ For i = 0..3:  Rd<i> = (Rn<i> >> (shift*8)) | (Rn<i+1> << (32 - shift*8))
@ i.e. four destination words are assembled from five source words shifted
@ by `shift` bytes.  The source registers are left unchanged.
@ shift is used with the values 1, 2 and 3 in this file.
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 | |||
mov \Rd0, \Rn0, lsr #(\shift * 8) | |||
mov \Rd1, \Rn1, lsr #(\shift * 8) | |||
mov \Rd2, \Rn2, lsr #(\shift * 8) | |||
mov \Rd3, \Rn3, lsr #(\shift * 8) | |||
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) | |||
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) | |||
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) | |||
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) | |||
.endm | |||
@ ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2   (in-place variant)
@   R0 = (R0 >> (shift*8)) | (R1 << (32 - shift*8))
@   R1 = (R1 >> (shift*8)) | (R2 << (32 - shift*8))
@ R0 and R1 are overwritten with the result; R2 is only read.
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 | |||
mov \R0, \R0, lsr #(\shift * 8) | |||
orr \R0, \R0, \R1, lsl #(32 - \shift * 8) | |||
mov \R1, \R1, lsr #(\shift * 8) | |||
orr \R1, \R1, \R2, lsl #(32 - \shift * 8) | |||
.endm | |||
@ ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
@   Rdst0 = (Rsrc0 >> (shift*8)) | (Rsrc1 << (32 - shift*8))
@   Rdst1 = (Rsrc1 >> (shift*8)) | (Rsrc2 << (32 - shift*8))
@ Both mov instructions run before the orr instructions, so Rsrc0 is read
@ before Rdst1 is written; Rdst1 may therefore be the same register as Rsrc0.
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 | |||
mov \Rdst0, \Rsrc0, lsr #(\shift * 8) | |||
mov \Rdst1, \Rsrc1, lsr #(\shift * 8) | |||
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) | |||
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) | |||
.endm | |||
@ RND_AVG32: byte-wise average of two register pairs, rounding up.
@ Operates on (Rn0, Rm0) -> Rd0 and (Rn1, Rm1) -> Rd1.
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask | |||
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) | |||
@ Rmask = 0xFEFEFEFE | |||
@ Rn is destroyed (overwritten with Rn | Rm); Rm is preserved | |||
eor \Rd0, \Rn0, \Rm0 | |||
eor \Rd1, \Rn1, \Rm1 | |||
orr \Rn0, \Rn0, \Rm0 | |||
orr \Rn1, \Rn1, \Rm1 | |||
and \Rd0, \Rd0, \Rmask | |||
and \Rd1, \Rd1, \Rmask | |||
sub \Rd0, \Rn0, \Rd0, lsr #1 | |||
sub \Rd1, \Rn1, \Rd1, lsr #1 | |||
.endm | |||
@ NO_RND_AVG32: byte-wise average of two register pairs, rounding down.
@ Operates on (Rn0, Rm0) -> Rd0 and (Rn1, Rm1) -> Rd1.
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask | |||
@ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1) | |||
@ Rmask = 0xFEFEFEFE | |||
@ Rn is destroyed (overwritten with Rn & Rm); Rm is preserved | |||
eor \Rd0, \Rn0, \Rm0 | |||
eor \Rd1, \Rn1, \Rm1 | |||
and \Rn0, \Rn0, \Rm0 | |||
and \Rn1, \Rn1, \Rm1 | |||
and \Rd0, \Rd0, \Rmask | |||
and \Rd1, \Rd1, \Rmask | |||
add \Rd0, \Rn0, \Rd0, lsr #1 | |||
add \Rd1, \Rn1, \Rd1, lsr #1 | |||
.endm | |||
@ ---------------------------------------------------------------- | |||
@ put_pixels16_arm: copy h rows of 16 bytes from pixels to block; both
@ pointers advance by line_size (r2) per row.
@ The low two bits of pixels select one of four loops through the table at
@ label 5: loop 1 copies directly, loops 2/3/4 load five words from the
@ word-aligned-down address and realign them by 1/2/3 bytes.
@ NOTE(review): on ARM, GNU as treats the .align operand as a power of two,
@ so ".align 8" asks for 256-byte alignment -- confirm this is intended.
.align 8 | |||
.global put_pixels16_arm | |||
put_pixels16_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r11, lr} @ R14 is also called LR | |||
adr r5, 5f | |||
@ r4 = pixels & 3; the Z flag set here drives the conditional jump below
ands r4, r1, #3 | |||
bic r1, r1, #3 | |||
add r5, r5, r4, lsl #2 | |||
@ misaligned source: jump through the table; aligned source falls into 1
ldrne pc, [r5] | |||
1: | |||
ldmia r1, {r4-r7} | |||
add r1, r1, r2 | |||
stmia r0, {r4-r7} | |||
pld [r1] | |||
subs r3, r3, #1 | |||
add r0, r0, r2 | |||
bne 1b | |||
ldmfd sp!, {r4-r11, pc} | |||
.align 8 | |||
@ source misaligned by 1 byte
2: | |||
ldmia r1, {r4-r8} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r9-r12} | |||
add r0, r0, r2 | |||
bne 2b | |||
ldmfd sp!, {r4-r11, pc} | |||
.align 8 | |||
@ source misaligned by 2 bytes
3: | |||
ldmia r1, {r4-r8} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r9-r12} | |||
add r0, r0, r2 | |||
bne 3b | |||
ldmfd sp!, {r4-r11, pc} | |||
.align 8 | |||
@ source misaligned by 3 bytes
4: | |||
ldmia r1, {r4-r8} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r9-r12} | |||
add r0, r0, r2 | |||
bne 4b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ jump table indexed by (pixels & 3); entry 0 is never used as a jump target
5: | |||
.word 1b | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ ---------------------------------------------------------------- | |||
@ put_pixels8_arm: copy h rows of 8 bytes from pixels to block; both
@ pointers advance by line_size (r2) per row.
@ The low two bits of pixels select one of four loops through the table at
@ label 5: loop 1 copies directly, loops 2/3/4 load three words from the
@ word-aligned-down address and realign them in place by 1/2/3 bytes.
@ r12 is used as scratch and is not saved.
.align 8 | |||
.global put_pixels8_arm | |||
put_pixels8_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r5,lr} @ R14 is also called LR | |||
adr r5, 5f | |||
@ r4 = pixels & 3; the Z flag set here drives the conditional jump below
ands r4, r1, #3 | |||
bic r1, r1, #3 | |||
add r5, r5, r4, lsl #2 | |||
@ misaligned source: jump through the table; aligned source falls into 1
ldrne pc, [r5] | |||
1: | |||
ldmia r1, {r4-r5} | |||
add r1, r1, r2 | |||
subs r3, r3, #1 | |||
pld [r1] | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 1b | |||
ldmfd sp!, {r4-r5,pc} | |||
.align 8 | |||
@ source misaligned by 1 byte
2: | |||
ldmia r1, {r4-r5, r12} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 2b | |||
ldmfd sp!, {r4-r5,pc} | |||
.align 8 | |||
@ source misaligned by 2 bytes
3: | |||
ldmia r1, {r4-r5, r12} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 3b | |||
ldmfd sp!, {r4-r5,pc} | |||
.align 8 | |||
@ source misaligned by 3 bytes
4: | |||
ldmia r1, {r4-r5, r12} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 | |||
pld [r1] | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 4b | |||
ldmfd sp!, {r4-r5,pc} | |||
.align 8 | |||
@ jump table indexed by (pixels & 3); entry 0 is never used as a jump target
5: | |||
.word 1b | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ ---------------------------------------------------------------- | |||
@ put_pixels8_x2_arm: for each of h rows, store 8 bytes to block, each being
@ the RND_AVG32 (rounding-up) average of a source byte and the byte that
@ follows it in the same row.
@ r12 holds the 0xFEFEFEFE mask, loaded from slot 0 of the table at label 5;
@ slots 1..3 are the jump targets for source misalignment 1..3.
.align 8 | |||
.global put_pixels8_x2_arm | |||
put_pixels8_x2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r10,lr} @ R14 is also called LR | |||
adr r5, 5f | |||
ands r4, r1, #3 | |||
@ mask is read before r5 is advanced to the selected table slot
ldr r12, [r5] | |||
add r5, r5, r4, lsl #2 | |||
bic r1, r1, #3 | |||
@ flags still come from the ands above (ldr/add/bic do not set them)
ldrne pc, [r5] | |||
@ aligned source: r4/r5 = bytes at x, r6/r7 = bytes at x+1
1: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
pld [r1] | |||
RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 1b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 1: r6/r7 = shift 1 (x), r8/r9 = shift 2 (x+1)
2: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 | |||
pld [r1] | |||
RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 2b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 2: r6/r7 = shift 2 (x), r8/r9 = shift 3 (x+1)
3: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 | |||
pld [r1] | |||
RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 3b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 3: r6/r7 = shift 3 (x); x+1 is the untouched r5/r10
4: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 | |||
pld [r1] | |||
RND_AVG32 r8, r9, r6, r7, r5, r10, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 4b | |||
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. | |||
.align 8 | |||
@ slot 0: averaging mask; slots 1..3: loops for misalignment 1..3
5: | |||
.word 0xFEFEFEFE | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ put_no_rnd_pixels8_x2_arm: for each of h rows, store 8 bytes to block,
@ each being the NO_RND_AVG32 (rounding-down) average of a source byte and
@ the byte that follows it in the same row.
@ r12 holds the 0xFEFEFEFE mask, loaded from slot 0 of the table at label 5;
@ slots 1..3 are the jump targets for source misalignment 1..3.
.align 8 | |||
.global put_no_rnd_pixels8_x2_arm | |||
put_no_rnd_pixels8_x2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r10,lr} @ R14 is also called LR | |||
adr r5, 5f | |||
ands r4, r1, #3 | |||
@ mask is read before r5 is advanced to the selected table slot
ldr r12, [r5] | |||
add r5, r5, r4, lsl #2 | |||
bic r1, r1, #3 | |||
@ flags still come from the ands above (ldr/add/bic do not set them)
ldrne pc, [r5] | |||
@ aligned source: r4/r5 = bytes at x, r6/r7 = bytes at x+1
1: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
pld [r1] | |||
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 1b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 1: r6/r7 = shift 1 (x), r8/r9 = shift 2 (x+1)
2: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 | |||
pld [r1] | |||
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 2b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 2: r6/r7 = shift 2 (x), r8/r9 = shift 3 (x+1)
3: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 | |||
pld [r1] | |||
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 3b | |||
ldmfd sp!, {r4-r10,pc} | |||
.align 8 | |||
@ source misaligned by 3: r6/r7 = shift 3 (x); x+1 is the untouched r5/r10
4: | |||
ldmia r1, {r4-r5, r10} | |||
add r1, r1, r2 | |||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 | |||
pld [r1] | |||
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 4b | |||
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. | |||
.align 8 | |||
@ slot 0: averaging mask; slots 1..3: loops for misalignment 1..3
5: | |||
.word 0xFEFEFEFE | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ ---------------------------------------------------------------- | |||
@ put_pixels8_y2_arm: store rows of 8 bytes to block, each being the
@ RND_AVG32 (rounding-up) average of a source row and the row below it.
@ h (r3) is halved on entry and every loop iteration (label 6) emits two
@ output rows, so h/2 iterations run; h == 0 or 1 would make the counter
@ start at 0 and wrap -- presumably callers pass an even h >= 2, TODO confirm.
@ r12 holds the 0xFEFEFEFE mask from slot 0 of the table at label 5.
.align 8 | |||
.global put_pixels8_y2_arm | |||
put_pixels8_y2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
adr r5, 5f | |||
ands r4, r1, #3 | |||
@ mov without the S suffix: the flags from ands stay valid for ldrne below
mov r3, r3, lsr #1 | |||
ldr r12, [r5] | |||
add r5, r5, r4, lsl #2 | |||
bic r1, r1, #3 | |||
ldrne pc, [r5] | |||
@ aligned source; the two row buffers r4/r5 and r6/r7 alternate roles
1: | |||
ldmia r1, {r4-r5} | |||
add r1, r1, r2 | |||
6: ldmia r1, {r6-r7} | |||
add r1, r1, r2 | |||
pld [r1] | |||
RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
ldmia r1, {r4-r5} | |||
add r1, r1, r2 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
pld [r1] | |||
RND_AVG32 r8, r9, r6, r7, r4, r5, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 1 byte; rows are realigned into r4/r5 and r7/r8
2: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 | |||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
@ flags set here survive RND_AVG32 (it uses no flag-setting instructions)
subs r3, r3, #1 | |||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 2 bytes
3: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 | |||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
subs r3, r3, #1 | |||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 3 bytes
4: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 | |||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
subs r3, r3, #1 | |||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ slot 0: averaging mask; slots 1..3: loops for misalignment 1..3
5: | |||
.word 0xFEFEFEFE | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ put_no_rnd_pixels8_y2_arm: store rows of 8 bytes to block, each being the
@ NO_RND_AVG32 (rounding-down) average of a source row and the row below it.
@ h (r3) is halved on entry and every loop iteration (label 6) emits two
@ output rows, so h/2 iterations run; h == 0 or 1 would make the counter
@ start at 0 and wrap -- presumably callers pass an even h >= 2, TODO confirm.
@ r12 holds the 0xFEFEFEFE mask from slot 0 of the table at label 5.
.align 8 | |||
.global put_no_rnd_pixels8_y2_arm | |||
put_no_rnd_pixels8_y2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
adr r5, 5f | |||
ands r4, r1, #3 | |||
@ mov without the S suffix: the flags from ands stay valid for ldrne below
mov r3, r3, lsr #1 | |||
ldr r12, [r5] | |||
add r5, r5, r4, lsl #2 | |||
bic r1, r1, #3 | |||
ldrne pc, [r5] | |||
@ aligned source; the two row buffers r4/r5 and r6/r7 alternate roles
1: | |||
ldmia r1, {r4-r5} | |||
add r1, r1, r2 | |||
6: ldmia r1, {r6-r7} | |||
add r1, r1, r2 | |||
pld [r1] | |||
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
ldmia r1, {r4-r5} | |||
add r1, r1, r2 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
pld [r1] | |||
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 | |||
subs r3, r3, #1 | |||
stmia r0, {r8-r9} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 1 byte; rows are realigned into r4/r5 and r7/r8
2: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 | |||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
@ flags set here survive NO_RND_AVG32 (it uses no flag-setting instructions)
subs r3, r3, #1 | |||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 2 bytes
3: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 | |||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
subs r3, r3, #1 | |||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ source misaligned by 3 bytes
4: | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
6: ldmia r1, {r7-r9} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 | |||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
ldmia r1, {r4-r6} | |||
add r1, r1, r2 | |||
pld [r1] | |||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
subs r3, r3, #1 | |||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
stmia r0, {r10-r11} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.align 8 | |||
@ slot 0: averaging mask; slots 1..3: loops for misalignment 1..3
5: | |||
.word 0xFEFEFEFE | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
@ ---------------------------------------------------------------- | |||
@ RND_XY2_IT align, rnd: process one source row for the xy2 routines.
@ Inputs:  r1 = word-aligned source pointer (advanced by r2 here),
@          r12 = base of the constant table that follows the function,
@          r3 = row counter (only its lowest bit is tested).
@ Steps:   load three words, build two byte-shifted views of the row in
@          r4/r5 and r6/r7 (one byte apart; which shift goes where depends on
@          `align`), then split them into
@            r8/r9   = sum of the low 2 bits of both views, plus the rounding
@                      constant when bit 0 of r3 is clear,
@            r10/r11 = sum of the upper 6 bits (pre-shifted right by 2).
@ r14 (lr) is used as scratch; the callers have saved it on the stack.
.macro RND_XY2_IT align, rnd | |||
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) | |||
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) | |||
.if \align == 0 | |||
ldmia r1, {r6-r8} | |||
.elseif \align == 3 | |||
ldmia r1, {r5-r7} | |||
.else | |||
ldmia r1, {r8-r10} | |||
.endif | |||
add r1, r1, r2 | |||
pld [r1] | |||
.if \align == 0 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 | |||
.elseif \align == 1 | |||
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 | |||
.elseif \align == 2 | |||
ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 | |||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 | |||
.elseif \align == 3 | |||
@ r5 is both a source and a destination here; the macro reads it first
ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 | |||
.endif | |||
ldr r14, [r12, #0] @ 0x03030303 | |||
@ Z is set when bit 0 of r3 is clear; it gates the ldreq/addeq below
tst r3, #1 | |||
and r8, r4, r14 | |||
and r9, r5, r14 | |||
and r10, r6, r14 | |||
and r11, r7, r14 | |||
.if \rnd == 1 | |||
ldreq r14, [r12, #16] @ 0x02020202 | |||
.else | |||
ldreq r14, [r12, #28] @ 0x01010101 | |||
.endif | |||
add r8, r8, r10 | |||
add r9, r9, r11 | |||
addeq r8, r8, r14 | |||
addeq r9, r9, r14 | |||
ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 | |||
and r4, r14, r4, lsr #2 | |||
and r5, r14, r5, lsr #2 | |||
and r6, r14, r6, lsr #2 | |||
and r7, r14, r7, lsr #2 | |||
add r10, r4, r6 | |||
add r11, r5, r7 | |||
.endm | |||
@ RND_XY2_EXPAND align, rnd: complete loop body and epilogue of an xy2
@ routine for one source alignment.
@ The first RND_XY2_IT handles source row 0.  Each iteration at label 6 then
@ pushes the previous row's partial sums (r8-r11), runs RND_XY2_IT on the
@ next row, pops the previous sums into r4-r7 and combines both rows:
@   out = (((low_prev + low_cur) >> 2) & 0x0F0F0F0F) + (high_prev + high_cur)
@ One 8-byte output row is stored per iteration and r3 counts the rows.
@ The macro ends with the function return (pops r4-r11 and pc).
@ NOTE(review): rows 0 and 1 are both processed with the same r3 value, so
@ the rounding term gated on bit 0 of r3 seems to be added twice (or not at
@ all) for the first output row -- verify against the C reference.
.macro RND_XY2_EXPAND align, rnd | |||
RND_XY2_IT \align, \rnd | |||
6: stmfd sp!, {r8-r11} | |||
RND_XY2_IT \align, \rnd | |||
ldmfd sp!, {r4-r7} | |||
add r4, r4, r8 | |||
add r5, r5, r9 | |||
add r6, r6, r10 | |||
add r7, r7, r11 | |||
ldr r14, [r12, #24] @ 0x0F0F0F0F | |||
and r4, r14, r4, lsr #2 | |||
and r5, r14, r5, lsr #2 | |||
add r4, r4, r6 | |||
add r5, r5, r7 | |||
subs r3, r3, #1 | |||
stmia r0, {r4-r5} | |||
add r0, r0, r2 | |||
bne 6b | |||
ldmfd sp!, {r4-r11,pc} | |||
.endm | |||
@ put_pixels8_xy2_arm: entry point that dispatches on (pixels & 3) to one of
@ four RND_XY2_EXPAND instances with rnd = 1; each instance contains its own
@ loop and return.  r12 points at the table at label 5, which holds both
@ the jump targets (offsets 4..12) and the constants RND_XY2_IT and
@ RND_XY2_EXPAND read at offsets 0, 16, 20, 24 and 28.
@ The result appears to be the average of each byte with its right, lower
@ and lower-right neighbours -- TODO confirm against the C reference.
.align 8 | |||
.global put_pixels8_xy2_arm | |||
put_pixels8_xy2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
adrl r12, 5f | |||
ands r4, r1, #3 | |||
add r5, r12, r4, lsl #2 | |||
bic r1, r1, #3 | |||
@ misaligned source: jump through the table; aligned source falls into 1
ldrne pc, [r5] | |||
1: | |||
RND_XY2_EXPAND 0, 1 | |||
.align 8 | |||
2: | |||
RND_XY2_EXPAND 1, 1 | |||
.align 8 | |||
3: | |||
RND_XY2_EXPAND 2, 1 | |||
.align 8 | |||
4: | |||
RND_XY2_EXPAND 3, 1 | |||
@ offset 0: low-bit mask; 4/8/12: jump targets; 16..28: constants
5: | |||
.word 0x03030303 | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
.word 0x02020202 | |||
.word 0xFCFCFCFC >> 2 | |||
.word 0x0F0F0F0F | |||
.word 0x01010101 | |||
@ put_no_rnd_pixels8_xy2_arm: entry point that dispatches on (pixels & 3) to
@ one of four RND_XY2_EXPAND instances with rnd = 0 (rounding constant
@ 0x01010101 instead of 0x02020202); each instance contains its own loop and
@ return.  r12 points at the table at label 5, which holds both the jump
@ targets (offsets 4..12) and the constants read at offsets 0, 16, 20, 24
@ and 28.
.align 8 | |||
.global put_no_rnd_pixels8_xy2_arm | |||
put_no_rnd_pixels8_xy2_arm: | |||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
@ block = word aligned, pixels = unaligned | |||
pld [r1] | |||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
adrl r12, 5f | |||
ands r4, r1, #3 | |||
add r5, r12, r4, lsl #2 | |||
bic r1, r1, #3 | |||
@ misaligned source: jump through the table; aligned source falls into 1
ldrne pc, [r5] | |||
1: | |||
RND_XY2_EXPAND 0, 0 | |||
.align 8 | |||
2: | |||
RND_XY2_EXPAND 1, 0 | |||
.align 8 | |||
3: | |||
RND_XY2_EXPAND 2, 0 | |||
.align 8 | |||
4: | |||
RND_XY2_EXPAND 3, 0 | |||
@ offset 0: low-bit mask; 4/8/12: jump targets; 16..28: constants
5: | |||
.word 0x03030303 | |||
.word 2b | |||
.word 3b | |||
.word 4b | |||
.word 0x02020202 | |||
.word 0xFCFCFCFC >> 2 | |||
.word 0x0F0F0F0F | |||
.word 0x01010101 |
@@ -0,0 +1,168 @@ | |||
/* | |||
* iWMMXt optimized DSP utils | |||
* Copyright (c) 2004 AGAWA Koji | |||
* | |||
* This library is free software; you can redistribute it and/or | |||
* modify it under the terms of the GNU Lesser General Public | |||
* License as published by the Free Software Foundation; either | |||
* version 2 of the License, or (at your option) any later version. | |||
* | |||
* This library is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
* Lesser General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU Lesser General Public | |||
* License along with this library; if not, write to the Free Software | |||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
*/ | |||
#include "../dsputil.h" | |||
/*
 * dsputil_iwmmxt_rnd.h is a template header that is included twice.
 * DEF(x, y) builds the function names, SET_RND(regd) broadcasts a constant
 * into the given register via r12, and WAVG2B names the averaging
 * instruction.
 *
 * First pass: "<x>_no_rnd_<y>_iwmmxt" names, constant 1, "wavg2b".
 */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B

/* Second pass: "<x>_<y>_iwmmxt" names, constant 2, "wavg2br". */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B; undefine that name so it does not
 * leak into the rest of the file. */
#undef WAVG2B
// need scheduling | |||
#define OP(AVG) \ | |||
asm volatile ( \ | |||
/* alignment */ \ | |||
"and r12, %[pixels], #7 \n\t" \ | |||
"bic %[pixels], %[pixels], #7 \n\t" \ | |||
"tmcr wcgr1, r12 \n\t" \ | |||
\ | |||
"wldrd wr0, [%[pixels]] \n\t" \ | |||
"wldrd wr1, [%[pixels], #8] \n\t" \ | |||
"add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
"walignr1 wr4, wr0, wr1 \n\t" \ | |||
\ | |||
"1: \n\t" \ | |||
\ | |||
"wldrd wr2, [%[pixels]] \n\t" \ | |||
"wldrd wr3, [%[pixels], #8] \n\t" \ | |||
"add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
"pld [%[pixels]] \n\t" \ | |||
"walignr1 wr5, wr2, wr3 \n\t" \ | |||
AVG " wr6, wr4, wr5 \n\t" \ | |||
"wstrd wr6, [%[block]] \n\t" \ | |||
"add %[block], %[block], %[line_size] \n\t" \ | |||
\ | |||
"wldrd wr0, [%[pixels]] \n\t" \ | |||
"wldrd wr1, [%[pixels], #8] \n\t" \ | |||
"add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
"walignr1 wr4, wr0, wr1 \n\t" \ | |||
"pld [%[pixels]] \n\t" \ | |||
AVG " wr6, wr4, wr5 \n\t" \ | |||
"wstrd wr6, [%[block]] \n\t" \ | |||
"add %[block], %[block], %[line_size] \n\t" \ | |||
\ | |||
"subs %[h], %[h], #2 \n\t" \ | |||
"bne 1b \n\t" \ | |||
: [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ | |||
: [line_size]"r"(line_size) \ | |||
: "memory", "r12"); | |||
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) | |||
{ | |||
OP("wavg2br"); | |||
} | |||
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) | |||
{ | |||
OP("wavg2b"); | |||
} | |||
#undef OP | |||
/*
 * Add a block of 16-bit values to 8 rows of 8 pixels, in place.
 *
 * Four loop iterations each handle two pixel rows (pixels and the row
 * line_size bytes further on) and consume 32 bytes of block.  Pixel bytes
 * are widened to halfwords (wunpckelub/wunpckehub), added to the block
 * values with waddhss and packed back to bytes with wpackhus before being
 * stored over the original pixels.
 *
 * NOTE(review): wr0-wr15 are written but not listed as clobbers - confirm
 * this is acceptable for the compilers in use.
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    uint8_t *row1 = pixels + line_size;   /* second row of each row pair */
    const int stride2 = line_size << 1;   /* step from one row pair to the next */

    __asm__ __volatile__ (
        "mov r12, #4                            \n\t"   /* row-pair counter */
        "1:                                     \n\t"
        "pld [%[row0], %[stride2]]              \n\t"
        "pld [%[row1], %[stride2]]              \n\t"
        "wldrd wr4, [%[row0]]                   \n\t"
        "wldrd wr5, [%[row1]]                   \n\t"
        "pld [%[coef], #32]                     \n\t"
        "wunpckelub wr6, wr4                    \n\t"
        "wldrd wr0, [%[coef]]                   \n\t"
        "wunpckehub wr7, wr4                    \n\t"
        "wldrd wr1, [%[coef], #8]               \n\t"
        "wunpckelub wr8, wr5                    \n\t"
        "wldrd wr2, [%[coef], #16]              \n\t"
        "wunpckehub wr9, wr5                    \n\t"
        "wldrd wr3, [%[coef], #24]              \n\t"
        "add %[coef], %[coef], #32              \n\t"
        "waddhss wr10, wr0, wr6                 \n\t"
        "waddhss wr11, wr1, wr7                 \n\t"
        "waddhss wr12, wr2, wr8                 \n\t"
        "waddhss wr13, wr3, wr9                 \n\t"
        "wpackhus wr14, wr10, wr11              \n\t"
        "wpackhus wr15, wr12, wr13              \n\t"
        "wstrd wr14, [%[row0]]                  \n\t"
        "add %[row0], %[row0], %[stride2]       \n\t"
        "subs r12, r12, #1                      \n\t"
        "wstrd wr15, [%[row1]]                  \n\t"
        "add %[row1], %[row1], %[stride2]       \n\t"
        "bne 1b                                 \n\t"
        : [coef]"+r"(block), [row0]"+r"(pixels), [row1]"+r"(row1)
        : [stride2]"r"(stride2)
        : "cc", "memory", "r12");
}
/*
 * Placeholder with the same parameter list as the pixel routines in this
 * file; it ignores every argument and touches no memory.
 */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
/*
 * Store the iWMMXt routines <stem>_x2_iwmmxt, <stem>_y2_iwmmxt and
 * <stem>_xy2_iwmmxt in slots 1, 2 and 3 of row idx of the DSPContext
 * table tab.
 */
#define IWMMXT_SET_HPEL(tab, idx, stem)                 \
    do {                                                \
        c->tab[idx][1] = stem ## _x2_iwmmxt;            \
        c->tab[idx][2] = stem ## _y2_iwmmxt;            \
        c->tab[idx][3] = stem ## _xy2_iwmmxt;           \
    } while (0)

/*
 * Point the DSPContext function tables at the iWMMXt implementations.
 *
 * Row 0 of each table receives the 16-pixel routines, row 1 the 8-pixel
 * ones.  avctx is not used.
 *
 * NOTE(review): slot [x][0] of the no_rnd tables reuses the rounding
 * routine everywhere except avg_no_rnd_pixels_tab[1][0] - kept exactly as
 * found; confirm whether the asymmetry is intended.
 */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;

    /* put, 16 pixels wide */
    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
    IWMMXT_SET_HPEL(put_pixels_tab, 0, put_pixels16);
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
    IWMMXT_SET_HPEL(put_no_rnd_pixels_tab, 0, put_no_rnd_pixels16);

    /* put, 8 pixels wide */
    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
    IWMMXT_SET_HPEL(put_pixels_tab, 1, put_pixels8);
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
    IWMMXT_SET_HPEL(put_no_rnd_pixels_tab, 1, put_no_rnd_pixels8);

    /* avg, 16 pixels wide */
    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    IWMMXT_SET_HPEL(avg_pixels_tab, 0, avg_pixels16);
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    IWMMXT_SET_HPEL(avg_no_rnd_pixels_tab, 0, avg_no_rnd_pixels16);

    /* avg, 8 pixels wide */
    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
    IWMMXT_SET_HPEL(avg_pixels_tab, 1, avg_pixels8);
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
    IWMMXT_SET_HPEL(avg_no_rnd_pixels_tab, 1, avg_no_rnd_pixels8);
}
#undef IWMMXT_SET_HPEL
@@ -21,6 +21,13 @@ | |||
#include "../mpegvideo.h" | |||
#include "../avcodec.h" | |||
/* iWMMXt setup routine; only declared when HAVE_IWMMXT is defined. */
#if defined(HAVE_IWMMXT)
void MPV_common_init_iwmmxt(MpegEncContext *s);
#endif

/**
 * ARMv4L-specific MpegEncContext initialisation.
 *
 * Forwards to MPV_common_init_iwmmxt() when HAVE_IWMMXT is defined and
 * does nothing otherwise.
 *
 * @param s context handed on unchanged to the iWMMXt initialiser
 */
void MPV_common_init_armv4l(MpegEncContext *s)
{
#if defined(HAVE_IWMMXT)
    MPV_common_init_iwmmxt(s);
#endif
}
@@ -0,0 +1,97 @@ | |||
#include "../dsputil.h" | |||
#include "../mpegvideo.h" | |||
#include "../avcodec.h" | |||
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, | |||
DCTELEM *block, int n, int qscale) | |||
{ | |||
int level, qmul, qadd; | |||
int nCoeffs; | |||
DCTELEM *block_orig = block; | |||
assert(s->block_last_index[n]>=0); | |||
qmul = qscale << 1; | |||
if (!s->h263_aic) { | |||
if (n < 4) | |||
level = block[0] * s->y_dc_scale; | |||
else | |||
level = block[0] * s->c_dc_scale; | |||
qadd = (qscale - 1) | 1; | |||
}else{ | |||
qadd = 0; | |||
level = block[0]; | |||
} | |||
if(s->ac_pred) | |||
nCoeffs=63; | |||
else | |||
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |||
__asm__ __volatile__ ( | |||
/* "movd %1, %%mm6 \n\t" //qmul */ | |||
/* "packssdw %%mm6, %%mm6 \n\t" */ | |||
/* "packssdw %%mm6, %%mm6 \n\t" */ | |||
"tbcsth wr6, %[qmul] \n\t" | |||
/* "movd %2, %%mm5 \n\t" //qadd */ | |||
/* "packssdw %%mm5, %%mm5 \n\t" */ | |||
/* "packssdw %%mm5, %%mm5 \n\t" */ | |||
"tbcsth wr5, %[qadd] \n\t" | |||
"wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ | |||
"wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ | |||
"wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ | |||
"1: \n\t" | |||
"wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ | |||
"wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ | |||
"wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ | |||
"wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ | |||
/* "movq (%0, %3), %%mm2 \n\t" */ | |||
/* "movq 8(%0, %3), %%mm3 \n\t" */ | |||
"wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ | |||
"wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ | |||
"wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ | |||
"wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ | |||
"waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ | |||
"waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ | |||
"wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ | |||
"wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ | |||
"wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ | |||
"wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ | |||
"wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ | |||
"wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ | |||
"wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ | |||
"wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ | |||
"add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ | |||
"subs %[i], %[i], #1 \n\t" | |||
"bne 1b \n\t" /* "jng 1b \n\t" */ | |||
:[block]"+r"(block) | |||
:[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) | |||
:"memory"); | |||
block_orig[0] = level; | |||
} | |||
#if 0
/*
 * Disabled: H.263 inter-block inverse quantisation that hands the work to
 * ippiQuantInvInter_Compact_H263_16s_I().
 *
 * The number of coefficients passed on is nCoeffs + 1, where nCoeffs is 63
 * when s->ac_pred is set and otherwise comes from
 * s->inter_scantable.raster_end[] indexed by the block's last index.
 */
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
                                             DCTELEM *block, int n, int qscale)
{
    int nCoeffs;

    assert(s->block_last_index[n]>=0);

    nCoeffs = s->ac_pred
            ? 63
            : s->inter_scantable.raster_end[ s->block_last_index[n] ];

    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs + 1, qscale);
}
#endif
/**
 * Install the iWMMXt routines into an MpegEncContext.
 *
 * Only the H.263 intra dequantiser is replaced; the assignment of the
 * inter dequantiser is compiled out together with the routine itself.
 *
 * @param s context whose dct_unquantize_h263_intra pointer is overwritten
 */
void MPV_common_init_iwmmxt(MpegEncContext *s)
{
    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;

#if 0
    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
#endif
}
@@ -1180,6 +1180,7 @@ typedef struct AVCodecContext { | |||
#define FF_IDCT_SIMPLEARM 10
#define FF_IDCT_H264 11
#define FF_IDCT_VP3 12
#define FF_IDCT_IPP 13
/* Misspelled original name (FP_ instead of FF_), kept so existing users still compile. */
#define FP_IDCT_IPP FF_IDCT_IPP
/** | |||
* slice count. | |||
@@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){ | |||
return (x>>8) | (x<<8); | |||
} | |||
#ifdef ARCH_ARM
/**
 * Reverse the byte order of a 32-bit value (ARM inline-asm variant).
 *
 * t is a pure scratch register: the first instruction writes it before it
 * is ever read, so it is declared as an early-clobber output ("=&r")
 * instead of a read-write operand fed from an uninitialised variable.
 */
static always_inline uint32_t bswap_32(uint32_t x){
    uint32_t t;
    __asm__ (
        "eor %1, %0, %0, ror #16 \n\t"
        "bic %1, %1, #0xFF0000   \n\t"
        "mov %0, %0, ror #8      \n\t"
        "eor %0, %0, %1, lsr #8  \n\t"
        : "+r"(x), "=&r"(t));
    return x;
}
#else
/**
 * Reverse the byte order of a 32-bit value (portable variant): swap the
 * bytes inside each 16-bit half, then swap the two halves.
 */
static always_inline uint32_t bswap_32(uint32_t x){
    x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
    return (x>>16) | (x<<16);
}
#endif
static inline uint64_t bswap_64(uint64_t x) | |||
{ | |||