While playing with some new hardware, I found it's running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer Unfortunately it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us with ARM CPUs). The patches I've attached are the patches from atty.jp with a couple of modifications of my own: - ported to current CVS - reverted their removal of SNOW support from ffmpeg - cleaned up their bswap mess - removed DOS-style line breaks from various files. Patch by Bernhard Rosenkraenzer (bero, arklinux org). Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.5)
| @@ -316,8 +316,11 @@ endif | |||
| # armv4l specific stuff | |||
| # NOTE(review): the two ASM_OBJS lines below look like the removed/added pair of this diff hunk; | |||
| # presumably only the second one (which also lists dsputil_arm_s.o) is meant to remain -- confirm. | |||
| ifeq ($(TARGET_ARCH_ARMV4L),yes) | |||
| ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o | |||
| ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o | |||
| OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o | |||
| # The iWMMXt objects are added only when TARGET_IWMMXT is also set to yes. | |||
| ifeq ($(TARGET_IWMMXT),yes) | |||
| OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o | |||
| endif | |||
| endif | |||
| # sun mediaLib specific stuff | |||
| @@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o | |||
| CFLAGS += $(MLIB_INC) | |||
| endif | |||
| # Intel IPP specific stuff | |||
| # currently only works when libavcodec is used in mplayer | |||
| # When HAVE_IPP is yes, append the IPP include flags ($(IPP_INC)) to CFLAGS. | |||
| ifeq ($(HAVE_IPP),yes) | |||
| CFLAGS += $(IPP_INC) | |||
| endif | |||
| # alpha specific stuff | |||
| ifeq ($(TARGET_ARCH_ALPHA),yes) | |||
| OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \ | |||
| @@ -18,6 +18,13 @@ | |||
| */ | |||
| #include "../dsputil.h" | |||
| #ifdef HAVE_IPP | |||
| #include "ipp.h" | |||
| #endif | |||
| #ifdef HAVE_IWMMXT | |||
| extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); | |||
| #endif | |||
| extern void j_rev_dct_ARM(DCTELEM *data); | |||
| extern void simple_idct_ARM(DCTELEM *data); | |||
| @@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data); | |||
| /* Clamped store/add routines; dsputil_init_armv4l() fills these from the DSPContext and the ARM IDCT wrappers call them. */ | |||
| static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
| static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
| /* Pixel copy/averaging primitives. The ARM assembly added by this change exports symbols with these names (.global). */ | |||
| void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); | |||
| /* 16-pixel-wide variant of put_pixels8_x2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_pixels8_x2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* 16-pixel-wide variant of put_pixels8_y2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_pixels8_y2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* 16-pixel-wide variant of put_pixels8_xy2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_pixels8_xy2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* 16-pixel-wide variant of put_no_rnd_pixels8_x2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_no_rnd_pixels8_x2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* 16-pixel-wide variant of put_no_rnd_pixels8_y2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_no_rnd_pixels8_y2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* 16-pixel-wide variant of put_no_rnd_pixels8_xy2_arm: applies it to columns 0-7, then to columns 8-15. */ | |||
| static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| int half; | |||
| for (half = 0; half < 16; half += 8) { | |||
| put_no_rnd_pixels8_xy2_arm(block + half, pixels + half, line_size, h); | |||
| } | |||
| } | |||
| /* | |||
| * Add an 8x8 block of 16-bit coefficients to 8 rows of 8 pixels and store the saturated result. | |||
| * | |||
| * block:     64 coefficients, read 8 per row (16 bytes per row). | |||
| * dest:      first pixel row; accessed with 32-bit loads/stores, so it presumably has to be | |||
| *            4-byte aligned -- confirm with callers. | |||
| * line_size: byte distance between consecutive pixel rows. | |||
| * | |||
| * Each pixel is extracted from the loaded word by masking/shifting (byte 0 = low 8 bits, i.e. | |||
| * little-endian layout), added to its coefficient, and replaced by 0 or 255 when bit 8 of the | |||
| * sum is set: (~coef) >> 24 is 0 for a negative coefficient and 0xFF for a non-negative one. | |||
| * NOTE(review): only bit 8 is tested, so sums outside -256..511 are not caught -- confirm that | |||
| * the IDCT output range guarantees this. | |||
| * | |||
| * The asm advances both pointers, so they are declared as read-write ("+r") operands; GCC | |||
| * requires that input-only operands are never modified by the asm. | |||
| */ | |||
| static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size) | |||
| { | |||
| asm volatile ( | |||
| "mov r10, #8 \n\t" /* r10 = row counter */ | |||
| "1: \n\t" | |||
| /* load dest */ | |||
| "ldr r4, [%1] \n\t" | |||
| /* block[0] and block[1]*/ | |||
| "ldrsh r5, [%0] \n\t" | |||
| "ldrsh r7, [%0, #2] \n\t" | |||
| "and r6, r4, #0xFF \n\t" | |||
| "and r8, r4, #0xFF00 \n\t" | |||
| "add r6, r5, r6 \n\t" | |||
| "add r8, r7, r8, lsr #8 \n\t" | |||
| "mvn r5, r5 \n\t" | |||
| "mvn r7, r7 \n\t" | |||
| "tst r6, #0x100 \n\t" | |||
| "movne r6, r5, lsr #24 \n\t" | |||
| "tst r8, #0x100 \n\t" | |||
| "movne r8, r7, lsr #24 \n\t" | |||
| "mov r9, r6 \n\t" | |||
| "ldrsh r5, [%0, #4] \n\t" /* moved from [A] */ | |||
| "orr r9, r9, r8, lsl #8 \n\t" | |||
| /* block[2] and block[3] */ | |||
| /* [A] */ | |||
| "ldrsh r7, [%0, #6] \n\t" | |||
| "and r6, r4, #0xFF0000 \n\t" | |||
| "and r8, r4, #0xFF000000 \n\t" | |||
| "add r6, r5, r6, lsr #16 \n\t" | |||
| "add r8, r7, r8, lsr #24 \n\t" | |||
| "mvn r5, r5 \n\t" | |||
| "mvn r7, r7 \n\t" | |||
| "tst r6, #0x100 \n\t" | |||
| "movne r6, r5, lsr #24 \n\t" | |||
| "tst r8, #0x100 \n\t" | |||
| "movne r8, r7, lsr #24 \n\t" | |||
| "orr r9, r9, r6, lsl #16 \n\t" | |||
| "ldr r4, [%1, #4] \n\t" /* moved from [B] */ | |||
| "orr r9, r9, r8, lsl #24 \n\t" | |||
| /* store dest */ | |||
| "ldrsh r5, [%0, #8] \n\t" /* moved from [C] */ | |||
| "str r9, [%1] \n\t" | |||
| /* load dest */ | |||
| /* [B] */ | |||
| /* block[4] and block[5] */ | |||
| /* [C] */ | |||
| "ldrsh r7, [%0, #10] \n\t" | |||
| "and r6, r4, #0xFF \n\t" | |||
| "and r8, r4, #0xFF00 \n\t" | |||
| "add r6, r5, r6 \n\t" | |||
| "add r8, r7, r8, lsr #8 \n\t" | |||
| "mvn r5, r5 \n\t" | |||
| "mvn r7, r7 \n\t" | |||
| "tst r6, #0x100 \n\t" | |||
| "movne r6, r5, lsr #24 \n\t" | |||
| "tst r8, #0x100 \n\t" | |||
| "movne r8, r7, lsr #24 \n\t" | |||
| "mov r9, r6 \n\t" | |||
| "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */ | |||
| "orr r9, r9, r8, lsl #8 \n\t" | |||
| /* block[6] and block[7] */ | |||
| /* [D] */ | |||
| "ldrsh r7, [%0, #14] \n\t" | |||
| "and r6, r4, #0xFF0000 \n\t" | |||
| "and r8, r4, #0xFF000000 \n\t" | |||
| "add r6, r5, r6, lsr #16 \n\t" | |||
| "add r8, r7, r8, lsr #24 \n\t" | |||
| "mvn r5, r5 \n\t" | |||
| "mvn r7, r7 \n\t" | |||
| "tst r6, #0x100 \n\t" | |||
| "movne r6, r5, lsr #24 \n\t" | |||
| "tst r8, #0x100 \n\t" | |||
| "movne r8, r7, lsr #24 \n\t" | |||
| "orr r9, r9, r6, lsl #16 \n\t" | |||
| "add %0, %0, #16 \n\t" /* moved from [E] */ | |||
| "orr r9, r9, r8, lsl #24 \n\t" | |||
| "subs r10, r10, #1 \n\t" /* moved from [F] */ | |||
| /* store dest */ | |||
| "str r9, [%1, #4] \n\t" | |||
| /* [E] */ | |||
| /* [F] */ | |||
| "add %1, %1, %2 \n\t" | |||
| "bne 1b \n\t" | |||
| : "+r"(block), | |||
| "+r"(dest) | |||
| : "r"(line_size) | |||
| : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" ); | |||
| } | |||
| /* XXX: those functions should be suppressed ASAP when all IDCTs are | |||
| converted */ | |||
| static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) | |||
| @@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) | |||
| simple_idct_ARM (block); | |||
| ff_add_pixels_clamped(block, dest, line_size); | |||
| } | |||
| /* Runs IPP's ippiDCT8x8Inv_Video_16s_C1I on block; the body is empty when HAVE_IPP is not defined. */ | |||
| static void simple_idct_ipp(DCTELEM *block) | |||
| { | |||
| #if defined(HAVE_IPP) | |||
| ippiDCT8x8Inv_Video_16s_C1I(block); | |||
| #endif /* HAVE_IPP */ | |||
| } | |||
| /* Hands block, dest and line_size to IPP's ippiDCT8x8Inv_Video_16s8u_C1R; the body is empty when HAVE_IPP is not defined. */ | |||
| static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block) | |||
| { | |||
| #if defined(HAVE_IPP) | |||
| ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size); | |||
| #endif /* HAVE_IPP */ | |||
| } | |||
| #if defined(HAVE_IWMMXT) | |||
| void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); | |||
| #endif /* HAVE_IWMMXT */ | |||
| /* | |||
| * Runs IPP's ippiDCT8x8Inv_Video_16s_C1I on block, then adds the result to dest with | |||
| * add_pixels_clamped_iwmmxt (HAVE_IWMMXT builds) or add_pixels_clamped_ARM (otherwise). | |||
| * The body is empty when HAVE_IPP is not defined. | |||
| */ | |||
| static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block) | |||
| { | |||
| #if defined(HAVE_IPP) | |||
| ippiDCT8x8Inv_Video_16s_C1I(block); | |||
| #ifndef HAVE_IWMMXT | |||
| add_pixels_clamped_ARM(block, dest, line_size); | |||
| #else | |||
| add_pixels_clamped_iwmmxt(block, dest, line_size); | |||
| #endif /* HAVE_IWMMXT */ | |||
| #endif /* HAVE_IPP */ | |||
| } | |||
| void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| @@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
| ff_put_pixels_clamped = c->put_pixels_clamped; | |||
| ff_add_pixels_clamped = c->add_pixels_clamped; | |||
| #ifdef HAVE_IPP | |||
| if(idct_algo==FF_IDCT_ARM){ | |||
| #else | |||
| if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){ | |||
| #endif | |||
| c->idct_put= j_rev_dct_ARM_put; | |||
| c->idct_add= j_rev_dct_ARM_add; | |||
| c->idct = j_rev_dct_ARM; | |||
| @@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) | |||
| c->idct_add= simple_idct_ARM_add; | |||
| c->idct = simple_idct_ARM; | |||
| c->idct_permutation_type= FF_NO_IDCT_PERM; | |||
| #ifdef HAVE_IPP | |||
| } else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){ | |||
| #else | |||
| } else if (idct_algo==FF_IDCT_IPP){ | |||
| #endif | |||
| c->idct_put= simple_idct_ipp_put; | |||
| c->idct_add= simple_idct_ipp_add; | |||
| c->idct = simple_idct_ipp; | |||
| c->idct_permutation_type= FF_NO_IDCT_PERM; | |||
| } | |||
| /* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG! | |||
| c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK! | |||
| c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK! | |||
| /* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */ | |||
| /* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(unused) */ | |||
| c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK | |||
| c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK | |||
| /* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */ | |||
| c->put_pixels_tab[1][0] = put_pixels8_arm; //OK | |||
| c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK | |||
| /* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */ | |||
| /* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */ | |||
| c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK | |||
| c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK | |||
| c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK | |||
| /* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */ | |||
| #if 1 | |||
| #ifdef HAVE_IWMMXT | |||
| dsputil_init_iwmmxt(c, avctx); | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,694 @@ | |||
| @ | |||
| @ ARMv4L optimized DSP utils | |||
| @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> | |||
| @ | |||
| @ This library is free software; you can redistribute it and/or | |||
| @ modify it under the terms of the GNU Lesser General Public | |||
| @ License as published by the Free Software Foundation; either | |||
| @ version 2 of the License, or (at your option) any later version. | |||
| @ | |||
| @ This library is distributed in the hope that it will be useful, | |||
| @ but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| @ Lesser General Public License for more details. | |||
| @ | |||
| @ You should have received a copy of the GNU Lesser General Public | |||
| @ License along with this library; if not, write to the Free Software | |||
| @ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
| @ | |||
| @ Extract four words that start \shift bytes into the five-word window Rn0..Rn4: | |||
| @ Rd[i] = (Rn[i] >> 8*shift) | (Rn[i+1] << (32 - 8*shift)). The Rn registers are only read. | |||
| @ The uses in this file pass shift = 1, 2 or 3. | |||
| .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 | |||
| mov \Rd0, \Rn0, lsr #(\shift * 8) | |||
| mov \Rd1, \Rn1, lsr #(\shift * 8) | |||
| mov \Rd2, \Rn2, lsr #(\shift * 8) | |||
| mov \Rd3, \Rn3, lsr #(\shift * 8) | |||
| orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) | |||
| orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) | |||
| orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) | |||
| orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) | |||
| .endm | |||
| @ In-place variant for two words: R0 and R1 are replaced by the two words that start \shift | |||
| @ bytes into the window R0, R1, R2. R2 is only read. | |||
| .macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 | |||
| mov \R0, \R0, lsr #(\shift * 8) | |||
| orr \R0, \R0, \R1, lsl #(32 - \shift * 8) | |||
| mov \R1, \R1, lsr #(\shift * 8) | |||
| orr \R1, \R1, \R2, lsl #(32 - \shift * 8) | |||
| .endm | |||
| @ Two-word extract into separate destinations: Rdst0/Rdst1 receive the two words that start | |||
| @ \shift bytes into the window Rsrc0, Rsrc1, Rsrc2. Both mov instructions run before the orr | |||
| @ instructions, so Rdst1 may be the same register as Rsrc0 (RND_XY2_IT relies on this). | |||
| .macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 | |||
| mov \Rdst0, \Rsrc0, lsr #(\shift * 8) | |||
| mov \Rdst1, \Rsrc1, lsr #(\shift * 8) | |||
| orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) | |||
| orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) | |||
| .endm | |||
| @ Per-byte average of two register pairs, rounding halves up. | |||
| @ Rd0/Rd1 receive the result, Rn0/Rn1 are overwritten with Rn | Rm, Rm0/Rm1 are only read. | |||
| .macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask | |||
| @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) | |||
| @ Rmask = 0xFEFEFEFE | |||
| @ Rn = destroy | |||
| eor \Rd0, \Rn0, \Rm0 | |||
| eor \Rd1, \Rn1, \Rm1 | |||
| orr \Rn0, \Rn0, \Rm0 | |||
| orr \Rn1, \Rn1, \Rm1 | |||
| and \Rd0, \Rd0, \Rmask | |||
| and \Rd1, \Rd1, \Rmask | |||
| sub \Rd0, \Rn0, \Rd0, lsr #1 | |||
| sub \Rd1, \Rn1, \Rd1, lsr #1 | |||
| .endm | |||
| @ Per-byte average of two register pairs, rounding halves down. | |||
| @ Rd0/Rd1 receive the result, Rn0/Rn1 are overwritten with Rn & Rm, Rm0/Rm1 are only read. | |||
| .macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask | |||
| @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1) | |||
| @ Rmask = 0xFEFEFEFE | |||
| @ Rn = destroy | |||
| eor \Rd0, \Rn0, \Rm0 | |||
| eor \Rd1, \Rn1, \Rm1 | |||
| and \Rn0, \Rn0, \Rm0 | |||
| and \Rn1, \Rn1, \Rm1 | |||
| and \Rd0, \Rd0, \Rmask | |||
| and \Rd1, \Rd1, \Rmask | |||
| add \Rd0, \Rn0, \Rd0, lsr #1 | |||
| add \Rd1, \Rn1, \Rd1, lsr #1 | |||
| .endm | |||
| @ ---------------------------------------------------------------- | |||
| .align 8 | |||
| .global put_pixels16_arm | |||
| put_pixels16_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Copies 16 bytes per row for h rows. r1 is rounded down to a word boundary and one of four | |||
| @ loops, chosen through the table at 5: from the low two bits of the original r1, rebuilds | |||
| @ the requested bytes from aligned word loads. The lsr/lsl directions used by | |||
| @ ADJ_ALIGN_QUADWORD_D imply little-endian byte order. | |||
| @ NOTE(review): GNU as on ARM takes the .align argument as a power of two, so .align 8 | |||
| @ presumably requests 256-byte alignment -- confirm this is intended. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r11, lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| bic r1, r1, #3 | |||
| add r5, r5, r4, lsl #2 | |||
| ldrne pc, [r5] @ misaligned: jump to loop 2, 3 or 4; aligned: fall through to loop 1 | |||
| 1: | |||
| ldmia r1, {r4-r7} | |||
| add r1, r1, r2 | |||
| stmia r0, {r4-r7} | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| add r0, r0, r2 | |||
| bne 1b | |||
| ldmfd sp!, {r4-r11, pc} | |||
| .align 8 | |||
| 2: | |||
| @ source is 1 byte past a word boundary: load five words and shift by one byte | |||
| ldmia r1, {r4-r8} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r9-r12} | |||
| add r0, r0, r2 | |||
| bne 2b | |||
| ldmfd sp!, {r4-r11, pc} | |||
| .align 8 | |||
| 3: | |||
| @ source is 2 bytes past a word boundary | |||
| ldmia r1, {r4-r8} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r9-r12} | |||
| add r0, r0, r2 | |||
| bne 3b | |||
| ldmfd sp!, {r4-r11, pc} | |||
| .align 8 | |||
| 4: | |||
| @ source is 3 bytes past a word boundary | |||
| ldmia r1, {r4-r8} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r9-r12} | |||
| add r0, r0, r2 | |||
| bne 4b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 5: | |||
| @ jump table indexed by source misalignment (entry 0 is never loaded: ldrne is skipped) | |||
| .word 1b | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| @ ---------------------------------------------------------------- | |||
| .align 8 | |||
| .global put_pixels8_arm | |||
| put_pixels8_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Copies 8 bytes per row for h rows. Same scheme as the 16-byte copy: r1 is rounded down to | |||
| @ a word boundary and the table at 5: selects the loop matching the original misalignment. | |||
| @ r12 is used as the third load register in the misaligned loops. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r5,lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| bic r1, r1, #3 | |||
| add r5, r5, r4, lsl #2 | |||
| ldrne pc, [r5] @ misaligned: jump to loop 2, 3 or 4; aligned: fall through to loop 1 | |||
| 1: | |||
| ldmia r1, {r4-r5} | |||
| add r1, r1, r2 | |||
| subs r3, r3, #1 | |||
| pld [r1] | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 1b | |||
| ldmfd sp!, {r4-r5,pc} | |||
| .align 8 | |||
| 2: | |||
| @ source is 1 byte past a word boundary: load three words and shift by one byte | |||
| ldmia r1, {r4-r5, r12} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 2b | |||
| ldmfd sp!, {r4-r5,pc} | |||
| .align 8 | |||
| 3: | |||
| @ source is 2 bytes past a word boundary | |||
| ldmia r1, {r4-r5, r12} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 3b | |||
| ldmfd sp!, {r4-r5,pc} | |||
| .align 8 | |||
| 4: | |||
| @ source is 3 bytes past a word boundary | |||
| ldmia r1, {r4-r5, r12} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 | |||
| pld [r1] | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 4b | |||
| ldmfd sp!, {r4-r5,pc} | |||
| .align 8 | |||
| 5: | |||
| @ jump table indexed by source misalignment (entry 0 is never loaded: ldrne is skipped) | |||
| .word 1b | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| @ ---------------------------------------------------------------- | |||
| .align 8 | |||
| .global put_pixels8_x2_arm | |||
| put_pixels8_x2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ For each of h rows, output byte i is the average of source bytes i and i+1, with halves | |||
| @ rounded up (RND_AVG32). Each loop builds the 8-byte group at the requested offset and the | |||
| @ group one byte further from three aligned words, then averages the two groups. | |||
| @ Slot 0 of the table at 5: holds the 0xFEFEFEFE mask (loaded into r12) instead of a label, | |||
| @ because the aligned case falls through to 1: without using the table. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r10,lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| ldr r12, [r5] @ r12 = 0xFEFEFEFE | |||
| add r5, r5, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| @ aligned: group A = r4,r5 (offset 0), group B = r6,r7 (offset 1) | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
| pld [r1] | |||
| RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 1b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 2: | |||
| @ misaligned by 1: group A = offset 1, group B = offset 2 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 | |||
| pld [r1] | |||
| RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 2b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 3: | |||
| @ misaligned by 2: group A = offset 2, group B = offset 3 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 | |||
| pld [r1] | |||
| RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 3b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 4: | |||
| @ misaligned by 3: group A = offset 3; group B (offset 4) is simply the loaded words r5, r10 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 | |||
| pld [r1] | |||
| RND_AVG32 r8, r9, r6, r7, r5, r10, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 4b | |||
| ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. | |||
| .align 8 | |||
| 5: | |||
| .word 0xFEFEFEFE | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| .align 8 | |||
| .global put_no_rnd_pixels8_x2_arm | |||
| put_no_rnd_pixels8_x2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ For each of h rows, output byte i is the average of source bytes i and i+1, with halves | |||
| @ rounded down (NO_RND_AVG32). Each loop builds the 8-byte group at the requested offset and | |||
| @ the group one byte further from three aligned words, then averages the two groups. | |||
| @ Slot 0 of the table at 5: holds the 0xFEFEFEFE mask (loaded into r12) instead of a label, | |||
| @ because the aligned case falls through to 1: without using the table. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r10,lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| ldr r12, [r5] @ r12 = 0xFEFEFEFE | |||
| add r5, r5, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| @ aligned: group A = r4,r5 (offset 0), group B = r6,r7 (offset 1) | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
| pld [r1] | |||
| NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 1b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 2: | |||
| @ misaligned by 1: group A = offset 1, group B = offset 2 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 | |||
| pld [r1] | |||
| NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 2b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 3: | |||
| @ misaligned by 2: group A = offset 2, group B = offset 3 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 | |||
| pld [r1] | |||
| NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 3b | |||
| ldmfd sp!, {r4-r10,pc} | |||
| .align 8 | |||
| 4: | |||
| @ misaligned by 3: group A = offset 3; group B (offset 4) is simply the loaded words r5, r10 | |||
| ldmia r1, {r4-r5, r10} | |||
| add r1, r1, r2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 | |||
| pld [r1] | |||
| NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 4b | |||
| ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. | |||
| .align 8 | |||
| 5: | |||
| .word 0xFEFEFEFE | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| @ ---------------------------------------------------------------- | |||
| .align 8 | |||
| .global put_pixels8_y2_arm | |||
| put_pixels8_y2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Each output row is the per-byte average of a source row and the row below it, with halves | |||
| @ rounded up (RND_AVG32). The first source row is loaded before the loop at 6:, and every | |||
| @ loop pass loads two more rows and stores two output rows, so the counter is h / 2. | |||
| @ NOTE(review): h is halved with a plain shift, so this presumably expects an even, non-zero | |||
| @ h -- confirm with callers. | |||
| @ Slot 0 of the table at 5: holds the 0xFEFEFEFE mask (loaded into r12) instead of a label. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| mov r3, r3, lsr #1 @ two rows per loop pass (flags from ands are not changed) | |||
| ldr r12, [r5] @ r12 = 0xFEFEFEFE | |||
| add r5, r5, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| @ aligned source | |||
| ldmia r1, {r4-r5} | |||
| add r1, r1, r2 | |||
| 6: ldmia r1, {r6-r7} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
| ldmia r1, {r4-r5} | |||
| add r1, r1, r2 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| pld [r1] | |||
| RND_AVG32 r8, r9, r6, r7, r4, r5, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 2: | |||
| @ source is 1 byte past a word boundary: every row is realigned after loading | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 | |||
| RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 3: | |||
| @ source is 2 bytes past a word boundary | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 | |||
| RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 4: | |||
| @ source is 3 bytes past a word boundary | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 | |||
| RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 5: | |||
| .word 0xFEFEFEFE | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| .align 8 | |||
| .global put_no_rnd_pixels8_y2_arm | |||
| put_no_rnd_pixels8_y2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Each output row is the per-byte average of a source row and the row below it, with halves | |||
| @ rounded down (NO_RND_AVG32). The first source row is loaded before the loop at 6:, and | |||
| @ every loop pass loads two more rows and stores two output rows, so the counter is h / 2. | |||
| @ NOTE(review): h is halved with a plain shift, so this presumably expects an even, non-zero | |||
| @ h -- confirm with callers. | |||
| @ Slot 0 of the table at 5: holds the 0xFEFEFEFE mask (loaded into r12) instead of a label. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
| adr r5, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| mov r3, r3, lsr #1 @ two rows per loop pass (flags from ands are not changed) | |||
| ldr r12, [r5] @ r12 = 0xFEFEFEFE | |||
| add r5, r5, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| @ aligned source | |||
| ldmia r1, {r4-r5} | |||
| add r1, r1, r2 | |||
| 6: ldmia r1, {r6-r7} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 | |||
| ldmia r1, {r4-r5} | |||
| add r1, r1, r2 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| pld [r1] | |||
| NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r8-r9} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 2: | |||
| @ source is 1 byte past a word boundary: every row is realigned after loading | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 | |||
| NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 3: | |||
| @ source is 2 bytes past a word boundary | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 | |||
| NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 4: | |||
| @ source is 3 bytes past a word boundary | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
| 6: ldmia r1, {r7-r9} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 | |||
| NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| ldmia r1, {r4-r6} | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 | |||
| subs r3, r3, #1 | |||
| NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 | |||
| stmia r0, {r10-r11} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .align 8 | |||
| 5: | |||
| .word 0xFEFEFEFE | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| @ ---------------------------------------------------------------- | |||
| @ One source row of the xy2 (four-pixel) average. | |||
| @ Expects: r1 = word-aligned source row, r2 = line_size, r3 = loop counter, r12 = constant | |||
| @ table (see the 5: tables of the xy2 functions). \align is the original source misalignment. | |||
| @ Loads three words, advances r1 by one row, and forms two 8-byte groups: r4,r5 and r6,r7 | |||
| @ (the bytes at offsets \align and \align + 1, in either order). | |||
| @ Results: r8,r9 = sum of the low two bits of each byte of both groups, plus the rounding | |||
| @ constant (0x02020202 for \rnd == 1, 0x01010101 otherwise) only when r3 is even; | |||
| @ r10,r11 = sum of the upper six bits (each byte >> 2) of both groups. | |||
| @ Clobbers r4-r11 and r14. | |||
| .macro RND_XY2_IT align, rnd | |||
| @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) | |||
| @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) | |||
| .if \align == 0 | |||
| ldmia r1, {r6-r8} | |||
| .elseif \align == 3 | |||
| ldmia r1, {r5-r7} | |||
| .else | |||
| ldmia r1, {r8-r10} | |||
| .endif | |||
| add r1, r1, r2 | |||
| pld [r1] | |||
| .if \align == 0 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 | |||
| .elseif \align == 1 | |||
| ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 | |||
| .elseif \align == 2 | |||
| ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 | |||
| .elseif \align == 3 | |||
| ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 | |||
| .endif | |||
| ldr r14, [r12, #0] @ 0x03030303 | |||
| tst r3, #1 @ Z set when r3 is even; selects the rounding add below | |||
| and r8, r4, r14 | |||
| and r9, r5, r14 | |||
| and r10, r6, r14 | |||
| and r11, r7, r14 | |||
| .if \rnd == 1 | |||
| ldreq r14, [r12, #16] @ 0x02020202 | |||
| .else | |||
| ldreq r14, [r12, #28] @ 0x01010101 | |||
| .endif | |||
| add r8, r8, r10 | |||
| add r9, r9, r11 | |||
| addeq r8, r8, r14 | |||
| addeq r9, r9, r14 | |||
| ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 | |||
| and r4, r14, r4, lsr #2 | |||
| and r5, r14, r5, lsr #2 | |||
| and r6, r14, r6, lsr #2 | |||
| and r7, r14, r7, lsr #2 | |||
| add r10, r4, r6 | |||
| add r11, r5, r7 | |||
| .endm | |||
| @ Complete loop body plus function return for one misalignment case of the xy2 functions. | |||
| @ Runs RND_XY2_IT once for the first source row, then per output row: saves the previous | |||
| @ row's partial sums on the stack, runs RND_XY2_IT for the next row, and stores | |||
| @ (((l_prev + l_cur) >> 2) & 0x0F0F0F0F) + h_prev + h_cur to r0. r3 counts output rows. | |||
| @ NOTE(review): r3 is not changed between the pre-loop RND_XY2_IT and the first one inside | |||
| @ the loop, so both test the same parity and rows 0 and 1 get the same rounding choice. | |||
| @ This is possibly why dsputil_init_armv4l marks the xy2 routines "NG" -- confirm. | |||
| .macro RND_XY2_EXPAND align, rnd | |||
| RND_XY2_IT \align, \rnd | |||
| 6: stmfd sp!, {r8-r11} | |||
| RND_XY2_IT \align, \rnd | |||
| ldmfd sp!, {r4-r7} | |||
| add r4, r4, r8 | |||
| add r5, r5, r9 | |||
| add r6, r6, r10 | |||
| add r7, r7, r11 | |||
| ldr r14, [r12, #24] @ 0x0F0F0F0F | |||
| and r4, r14, r4, lsr #2 | |||
| and r5, r14, r5, lsr #2 | |||
| add r4, r4, r6 | |||
| add r5, r5, r7 | |||
| subs r3, r3, #1 | |||
| stmia r0, {r4-r5} | |||
| add r0, r0, r2 | |||
| bne 6b | |||
| ldmfd sp!, {r4-r11,pc} | |||
| .endm | |||
| .align 8 | |||
| .global put_pixels8_xy2_arm | |||
| put_pixels8_xy2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Entry point for the rounding xy2 average (\rnd = 1): dispatches on the source misalignment | |||
| @ to one RND_XY2_EXPAND instance, each of which contains its own loop and return. | |||
| @ r12 stays pointed at the table at 5:, which mixes the jump targets (slots 1-3) with the | |||
| @ constants RND_XY2_IT / RND_XY2_EXPAND load at offsets 0, 16, 20, 24 and 28. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
| adrl r12, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| add r5, r12, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| RND_XY2_EXPAND 0, 1 | |||
| .align 8 | |||
| 2: | |||
| RND_XY2_EXPAND 1, 1 | |||
| .align 8 | |||
| 3: | |||
| RND_XY2_EXPAND 2, 1 | |||
| .align 8 | |||
| 4: | |||
| RND_XY2_EXPAND 3, 1 | |||
| 5: | |||
| .word 0x03030303 | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| .word 0x02020202 | |||
| .word 0xFCFCFCFC >> 2 | |||
| .word 0x0F0F0F0F | |||
| .word 0x01010101 | |||
| .align 8 | |||
| .global put_no_rnd_pixels8_xy2_arm | |||
| put_no_rnd_pixels8_xy2_arm: | |||
| @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| @ block = word aligned, pixels = unaligned | |||
| @ Register use: r0 = block (destination), r1 = pixels (source), r2 = line_size, r3 = h. | |||
| @ Entry point for the xy2 average with \rnd = 0 (rounding constant 0x01010101): dispatches | |||
| @ on the source misalignment to one RND_XY2_EXPAND instance, each of which contains its own | |||
| @ loop and return. | |||
| @ r12 stays pointed at the table at 5:, which mixes the jump targets (slots 1-3) with the | |||
| @ constants RND_XY2_IT / RND_XY2_EXPAND load at offsets 0, 16, 20, 24 and 28. | |||
| pld [r1] | |||
| stmfd sp!, {r4-r11,lr} @ R14 is also called LR | |||
| adrl r12, 5f | |||
| ands r4, r1, #3 @ r4 = source misalignment; Z is set when the source is word aligned | |||
| add r5, r12, r4, lsl #2 | |||
| bic r1, r1, #3 | |||
| ldrne pc, [r5] | |||
| 1: | |||
| RND_XY2_EXPAND 0, 0 | |||
| .align 8 | |||
| 2: | |||
| RND_XY2_EXPAND 1, 0 | |||
| .align 8 | |||
| 3: | |||
| RND_XY2_EXPAND 2, 0 | |||
| .align 8 | |||
| 4: | |||
| RND_XY2_EXPAND 3, 0 | |||
| 5: | |||
| .word 0x03030303 | |||
| .word 2b | |||
| .word 3b | |||
| .word 4b | |||
| .word 0x02020202 | |||
| .word 0xFCFCFCFC >> 2 | |||
| .word 0x0F0F0F0F | |||
| .word 0x01010101 | |||
| @@ -0,0 +1,168 @@ | |||
| /* | |||
| * iWMMXt optimized DSP utils | |||
| * Copyright (c) 2004 AGAWA Koji | |||
| * | |||
| * This library is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2 of the License, or (at your option) any later version. | |||
| * | |||
| * This library is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with this library; if not, write to the Free Software | |||
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
| */ | |||
| #include "../dsputil.h" | |||
| /* | |||
| * First inclusion of the dsputil_iwmmxt_rnd.h template: DEF() builds names of the form | |||
| * <x>_no_rnd_<y>_iwmmxt, SET_RND() puts the constant 1 in r12 and issues tbcsth on the given | |||
| * register (presumably broadcasting it to every halfword -- confirm against the iWMMXt | |||
| * manual), and WAVG2B names the wavg2b instruction. | |||
| */ | |||
| #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt | |||
| #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); | |||
| #define WAVG2B "wavg2b" | |||
| #include "dsputil_iwmmxt_rnd.h" | |||
| #undef DEF | |||
| #undef SET_RND | |||
| #undef WAVG2B | |||
| /* | |||
| * Second inclusion of the dsputil_iwmmxt_rnd.h template: DEF() builds names of the form | |||
| * <x>_<y>_iwmmxt, SET_RND() puts the constant 2 in r12 and issues tbcsth on the given | |||
| * register, and WAVG2B names the wavg2br instruction. | |||
| * The macro defined here is WAVG2B, so that is the name that has to be undefined afterwards | |||
| * (WAVG2BR was never defined, which left WAVG2B visible to the rest of the file). | |||
| */ | |||
| #define DEF(x, y) x ## _ ## y ##_iwmmxt | |||
| #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); | |||
| #define WAVG2B "wavg2br" | |||
| #include "dsputil_iwmmxt_rnd.h" | |||
| #undef DEF | |||
| #undef SET_RND | |||
| #undef WAVG2B | |||
| // need scheduling | |||
| /* | |||
| * OP(AVG): shared body of the two 8-pixel-wide vertical half-pel "put" | |||
| * functions below. AVG is the mnemonic of the byte-average instruction. | |||
| * | |||
| * The source pointer is rounded down to a multiple of 8 and its low three | |||
| * bits are placed in wcgr1; every source row is then fetched as two 64-bit | |||
| * loads and realigned with walignr1. Each output row is AVG(row, next row). | |||
| * One loop pass produces two output rows, reusing the previously loaded row, | |||
| * and stores 8 bytes per row to block, advancing by line_size. | |||
| * | |||
| * h must be a positive even number: it is decremented by 2 per pass and the | |||
| * loop only exits when it reaches exactly 0. | |||
| * | |||
| * "cc" is listed as clobbered because subs updates the condition flags. | |||
| */ | |||
| #define OP(AVG) \ | |||
| asm volatile ( \ | |||
| /* alignment */ \ | |||
| "and r12, %[pixels], #7 \n\t" \ | |||
| "bic %[pixels], %[pixels], #7 \n\t" \ | |||
| "tmcr wcgr1, r12 \n\t" \ | |||
| \ | |||
| "wldrd wr0, [%[pixels]] \n\t" \ | |||
| "wldrd wr1, [%[pixels], #8] \n\t" \ | |||
| "add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
| "walignr1 wr4, wr0, wr1 \n\t" \ | |||
| \ | |||
| "1: \n\t" \ | |||
| \ | |||
| "wldrd wr2, [%[pixels]] \n\t" \ | |||
| "wldrd wr3, [%[pixels], #8] \n\t" \ | |||
| "add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
| "pld [%[pixels]] \n\t" \ | |||
| "walignr1 wr5, wr2, wr3 \n\t" \ | |||
| AVG " wr6, wr4, wr5 \n\t" \ | |||
| "wstrd wr6, [%[block]] \n\t" \ | |||
| "add %[block], %[block], %[line_size] \n\t" \ | |||
| \ | |||
| "wldrd wr0, [%[pixels]] \n\t" \ | |||
| "wldrd wr1, [%[pixels], #8] \n\t" \ | |||
| "add %[pixels], %[pixels], %[line_size] \n\t" \ | |||
| "walignr1 wr4, wr0, wr1 \n\t" \ | |||
| "pld [%[pixels]] \n\t" \ | |||
| AVG " wr6, wr4, wr5 \n\t" \ | |||
| "wstrd wr6, [%[block]] \n\t" \ | |||
| "add %[block], %[block], %[line_size] \n\t" \ | |||
| \ | |||
| "subs %[h], %[h], #2 \n\t" \ | |||
| "bne 1b \n\t" \ | |||
| : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ | |||
| : [line_size]"r"(line_size) \ | |||
| : "cc", "memory", "r12"); | |||
| /* Vertical half-pel put, 8 pixels wide, averaging with wavg2br (rounding variant). */ | |||
| void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) | |||
| { | |||
| OP("wavg2br"); | |||
| } | |||
| /* Vertical half-pel put, 8 pixels wide, averaging with wavg2b (no-rounding variant). */ | |||
| void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) | |||
| { | |||
| OP("wavg2b"); | |||
| } | |||
| #undef OP | |||
| /* | |||
| * Adds coefficients from block to the pixels at pixels and writes the sums | |||
| * back in place, saturated to unsigned bytes. | |||
| * | |||
| * The loop runs 4 times and handles two rows per pass (pixels and | |||
| * pixels2 = pixels + line_size, both advanced by 2 * line_size), so 8 rows of | |||
| * 8 bytes are updated in total. Each pass consumes 32 bytes of block, which | |||
| * the halfword arithmetic treats as 16-bit values. | |||
| * | |||
| * block: coefficients to add (read only) | |||
| * pixels: destination rows, read and overwritten | |||
| * line_size: byte distance between consecutive rows | |||
| */ | |||
| void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) | |||
| { | |||
| uint8_t *pixels2 = pixels + line_size; | |||
| __asm__ __volatile__ ( | |||
| "mov r12, #4 \n\t" | |||
| "1: \n\t" | |||
| /* prefetch the two rows that the next pass will touch */ | |||
| "pld [%[pixels], %[line_size2]] \n\t" | |||
| "pld [%[pixels2], %[line_size2]] \n\t" | |||
| "wldrd wr4, [%[pixels]] \n\t" | |||
| "wldrd wr5, [%[pixels2]] \n\t" | |||
| "pld [%[block], #32] \n\t" | |||
| /* widen the pixel bytes to halfwords, interleaved with the coefficient loads */ | |||
| "wunpckelub wr6, wr4 \n\t" | |||
| "wldrd wr0, [%[block]] \n\t" | |||
| "wunpckehub wr7, wr4 \n\t" | |||
| "wldrd wr1, [%[block], #8] \n\t" | |||
| "wunpckelub wr8, wr5 \n\t" | |||
| "wldrd wr2, [%[block], #16] \n\t" | |||
| "wunpckehub wr9, wr5 \n\t" | |||
| "wldrd wr3, [%[block], #24] \n\t" | |||
| "add %[block], %[block], #32 \n\t" | |||
| /* saturating halfword add, then pack back to bytes with unsigned saturation */ | |||
| "waddhss wr10, wr0, wr6 \n\t" | |||
| "waddhss wr11, wr1, wr7 \n\t" | |||
| "waddhss wr12, wr2, wr8 \n\t" | |||
| "waddhss wr13, wr3, wr9 \n\t" | |||
| "wpackhus wr14, wr10, wr11 \n\t" | |||
| "wpackhus wr15, wr12, wr13 \n\t" | |||
| "wstrd wr14, [%[pixels]] \n\t" | |||
| "add %[pixels], %[pixels], %[line_size2] \n\t" | |||
| "subs r12, r12, #1 \n\t" | |||
| "wstrd wr15, [%[pixels2]] \n\t" | |||
| "add %[pixels2], %[pixels2], %[line_size2] \n\t" | |||
| "bne 1b \n\t" | |||
| : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) | |||
| : [line_size2]"r"(line_size << 1) | |||
| : "cc", "memory", "r12"); | |||
| } | |||
| /* Placeholder with the pixel-operation signature: ignores all arguments and does nothing. */ | |||
| static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |||
| { | |||
| } | |||
| /* | |||
| * Installs the iWMMXt routines into the DSP context c. | |||
| * In every table, first index 0 receives the pixels16 routines and first | |||
| * index 1 the pixels8 routines; second index 0..3 receives the plain, _x2, | |||
| * _y2 and _xy2 variant respectively. avctx is not used. | |||
| */ | |||
| void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| c->add_pixels_clamped = add_pixels_clamped_iwmmxt; | |||
| /* 16-pixel-wide routines */ | |||
| c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; | |||
| c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; | |||
| c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; | |||
| c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; | |||
| /* the plain put routine is shared between the rounding and no-rounding tables */ | |||
| c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; | |||
| c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; | |||
| c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; | |||
| c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; | |||
| c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; | |||
| /* NOTE(review): this slot uses the rounding avg routine while the 8-wide slot below uses a no_rnd one - confirm this is intended */ | |||
| c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; | |||
| /* 8-pixel-wide routines */ | |||
| c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; | |||
| c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; | |||
| c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; | |||
| c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; | |||
| c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; | |||
| c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; | |||
| c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; | |||
| c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; | |||
| c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; | |||
| c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; | |||
| } | |||
| @@ -21,6 +21,13 @@ | |||
| #include "../mpegvideo.h" | |||
| #include "../avcodec.h" | |||
| /* Prototype of the iWMMXt initializer, only needed when HAVE_IWMMXT is defined. */ | |||
| #if defined(HAVE_IWMMXT) | |||
| void MPV_common_init_iwmmxt(MpegEncContext *s); | |||
| #endif | |||
| /* | |||
| * ARMv4L initialization hook for MpegEncContext: forwards to | |||
| * MPV_common_init_iwmmxt() when HAVE_IWMMXT is defined and is otherwise empty. | |||
| */ | |||
| void MPV_common_init_armv4l(MpegEncContext *s) | |||
| { | |||
| #if defined(HAVE_IWMMXT) | |||
| MPV_common_init_iwmmxt(s); | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,97 @@ | |||
| #include "../dsputil.h" | |||
| #include "../mpegvideo.h" | |||
| #include "../avcodec.h" | |||
| /* | |||
| * H.263 intra-block dequantization with iWMMXt, ported from the MMX version | |||
| * (the original MMX instructions are kept as comments next to each line). | |||
| * | |||
| * For every coefficient c in the processed range: | |||
| * c == 0 -> 0 | |||
| * c > 0 -> c * qmul + qadd | |||
| * c < 0 -> c * qmul - qadd | |||
| * with qmul = 2 * qscale. The DC coefficient is computed separately in C | |||
| * (scaled by y_dc_scale for n < 4, c_dc_scale otherwise, or left untouched | |||
| * when h263_aic is set) and written to block[0] after the loop, overriding | |||
| * whatever the vector loop stored there. | |||
| * | |||
| * s: codec context (reads h263_aic, ac_pred, the dc scales, scantable, block_last_index) | |||
| * block: coefficients, modified in place | |||
| * n: block number within the macroblock | |||
| * qscale: quantizer scale | |||
| */ | |||
| static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, | |||
| DCTELEM *block, int n, int qscale) | |||
| { | |||
| int level, qmul, qadd; | |||
| int nCoeffs; | |||
| int count; | |||
| DCTELEM *block_orig = block; | |||
| assert(s->block_last_index[n]>=0); | |||
| qmul = qscale << 1; | |||
| if (!s->h263_aic) { | |||
| if (n < 4) | |||
| level = block[0] * s->y_dc_scale; | |||
| else | |||
| level = block[0] * s->c_dc_scale; | |||
| qadd = (qscale - 1) | 1; | |||
| }else{ | |||
| qadd = 0; | |||
| level = block[0]; | |||
| } | |||
| if(s->ac_pred) | |||
| nCoeffs=63; | |||
| else | |||
| nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |||
| /* Each loop pass handles two 64-bit loads (16 bytes); round the coefficient count up accordingly. */ | |||
| count = (nCoeffs + 8) / 8; | |||
| __asm__ __volatile__ ( | |||
| /* "movd %1, %%mm6 \n\t" //qmul */ | |||
| /* "packssdw %%mm6, %%mm6 \n\t" */ | |||
| /* "packssdw %%mm6, %%mm6 \n\t" */ | |||
| "tbcsth wr6, %[qmul] \n\t" | |||
| /* "movd %2, %%mm5 \n\t" //qadd */ | |||
| /* "packssdw %%mm5, %%mm5 \n\t" */ | |||
| /* "packssdw %%mm5, %%mm5 \n\t" */ | |||
| "tbcsth wr5, %[qadd] \n\t" | |||
| "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ | |||
| "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ | |||
| "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ | |||
| "1: \n\t" | |||
| "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ | |||
| "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ | |||
| "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ | |||
| "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ | |||
| /* "movq (%0, %3), %%mm2 \n\t" */ | |||
| /* "movq 8(%0, %3), %%mm3 \n\t" */ | |||
| "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ | |||
| /* The second sign mask must come from wr3 (the second load); wr2 already holds the first mask at this point. */ | |||
| "wcmpgtsh wr3, wr4, wr3 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ | |||
| "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ | |||
| "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ | |||
| "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ | |||
| "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ | |||
| "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ | |||
| "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ | |||
| "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ | |||
| "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ | |||
| "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ | |||
| "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ | |||
| "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ | |||
| "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ | |||
| "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ | |||
| "subs %[i], %[i], #1 \n\t" | |||
| "bne 1b \n\t" /* "jng 1b \n\t" */ | |||
| /* The loop counter is modified by the asm, so it is an in/out operand; subs changes the flags, hence "cc". */ | |||
| :[block]"+r"(block), [i]"+r"(count) | |||
| :[qmul]"r"(qmul), [qadd]"r"(qadd) | |||
| :"cc", "memory"); | |||
| block_orig[0] = level; | |||
| } | |||
| /* | |||
| * Disabled (never compiled): inter-block H.263 dequantization that hands the | |||
| * whole job to ippiQuantInvInter_Compact_H263_16s_I - presumably an Intel IPP | |||
| * primitive; confirm before enabling. It is passed nCoeffs+1 and qscale, | |||
| * where nCoeffs is 63 when ac_pred is set and otherwise taken from the | |||
| * inter scantable's raster_end for the block's last index. | |||
| */ | |||
| #if 0 | |||
| static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, | |||
| DCTELEM *block, int n, int qscale) | |||
| { | |||
| int nCoeffs; | |||
| assert(s->block_last_index[n]>=0); | |||
| if(s->ac_pred) | |||
| nCoeffs=63; | |||
| else | |||
| nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |||
| ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); | |||
| } | |||
| #endif | |||
| /* | |||
| * Installs the iWMMXt dequantizer into s. Only the H.263 intra routine is | |||
| * hooked up; the inter assignment is compiled out with #if 0. | |||
| */ | |||
| void MPV_common_init_iwmmxt(MpegEncContext *s) | |||
| { | |||
| #if 0 | |||
| s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; | |||
| #endif | |||
| s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; | |||
| } | |||
| @@ -1180,6 +1180,7 @@ typedef struct AVCodecContext { | |||
| #define FF_IDCT_SIMPLEARM 10 | |||
| #define FF_IDCT_H264 11 | |||
| #define FF_IDCT_VP3 12 | |||
| #define FF_IDCT_IPP 13 | |||
| /* Original misspelling (FP_ instead of the FF_ prefix used by every other IDCT id), kept as an alias for compatibility. */ | |||
| #define FP_IDCT_IPP FF_IDCT_IPP | |||
| /** | |||
| * slice count. | |||
| @@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){ | |||
| return (x>>8) | (x<<8); | |||
| } | |||
| /* | |||
| * bswap_32: returns x with its four bytes in reverse order. | |||
| * On ARM a four-instruction sequence is used; elsewhere portable C. | |||
| */ | |||
| #ifdef ARCH_ARM | |||
| static always_inline uint32_t bswap_32(uint32_t x){ | |||
| uint32_t t; | |||
| /* | |||
| * t = x ^ ror(x, 16); t &= ~0x00FF0000; x = ror(x, 8); x ^= t >> 8 | |||
| * t is a pure scratch register: it is written before it is read and while | |||
| * x is still live, so it is an early-clobber output ("=&r"). Declaring it | |||
| * "+r" would make the compiler read the uninitialized variable. | |||
| */ | |||
| __asm__ ( | |||
| "eor %1, %0, %0, ror #16 \n\t" | |||
| "bic %1, %1, #0xFF0000 \n\t" | |||
| "mov %0, %0, ror #8 \n\t" | |||
| "eor %0, %0, %1, lsr #8 \n\t" | |||
| : "+r"(x), "=&r"(t)); | |||
| return x; | |||
| } | |||
| #else | |||
| static always_inline uint32_t bswap_32(uint32_t x){ | |||
| /* swap the bytes inside each 16-bit half, then swap the two halves */ | |||
| x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); | |||
| return (x>>16) | (x<<16); | |||
| } | |||
| #endif | |||
| static inline uint64_t bswap_64(uint64_t x) | |||
| { | |||