Originally committed as revision 274 to svn://svn.ffmpeg.org/ffmpeg/trunktags/v0.5
| @@ -37,6 +37,12 @@ OBJS += mlib/dsputil_mlib.o | |||||
| CFLAGS += $(MLIB_INC) | CFLAGS += $(MLIB_INC) | ||||
| endif | endif | ||||
| # alpha specific stuff | |||||
| ifeq ($(TARGET_ARCH_ALPHA),yes) | |||||
| OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o | |||||
| CFLAGS += -Wa,-mpca56 | |||||
| endif | |||||
| SRCS = $(OBJS:.o=.c) $(ASM_OBJS:.o=.s) | SRCS = $(OBJS:.o=.c) $(ASM_OBJS:.o=.s) | ||||
| LIB= libavcodec.a | LIB= libavcodec.a | ||||
| @@ -74,6 +80,7 @@ clean: | |||||
| rm -f *.o *~ $(LIB) $(SLIB) *.so i386/*.o i386/*~ \ | rm -f *.o *~ $(LIB) $(SLIB) *.so i386/*.o i386/*~ \ | ||||
| armv4l/*.o armv4l/*~ \ | armv4l/*.o armv4l/*~ \ | ||||
| mlib/*.o mlib/*~ \ | mlib/*.o mlib/*~ \ | ||||
| alpha/*.o alpha/*~ \ | |||||
| libac3/*.o libac3/*~ \ | libac3/*.o libac3/*~ \ | ||||
| apiexample $(TESTS) | apiexample $(TESTS) | ||||
| @@ -0,0 +1,141 @@ | |||||
| /* | |||||
| * Alpha optimized DSP utils | |||||
| * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 2 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, write to the Free Software | |||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |||||
| */ | |||||
| #ifndef LIBAVCODEC_ALPHA_ASM_H | |||||
| #define LIBAVCODEC_ALPHA_ASM_H | |||||
| #include <stdint.h> | |||||
| #define AMASK_BWX (1 << 0) | |||||
| #define AMASK_FIX (1 << 1) | |||||
| #define AMASK_MVI (1 << 8) | |||||
/* Replicate a byte value into all eight byte lanes of a 64-bit word.
   The replication is done purely with shifts and ORs, so any bits of x
   above the low byte are smeared as well; pass a value in 0..0xff. */
static inline uint64_t BYTE_VEC(uint64_t x)
{
    const uint64_t two_lanes  = x | (x << 8);
    const uint64_t four_lanes = two_lanes | (two_lanes << 16);

    return four_lanes | (four_lanes << 32);
}
/* Replicate a 16-bit value into all four word lanes of a 64-bit word.
   Bits of x above the low 16 are OR-ed through as well; pass a value
   in 0..0xffff. */
static inline uint64_t WORD_VEC(uint64_t x)
{
    const uint64_t two_lanes = x | (x << 16);

    return two_lanes | (two_lanes << 32);
}
/* Load a signed 32-bit longword from p by plain dereference; p must be
   suitably aligned for int32_t. */
static inline int32_t ldl(const void* p)
{
    const int32_t *src = p;

    return *src;
}
/* Load an unsigned 64-bit quadword from p by plain dereference; p must
   be suitably aligned for uint64_t. */
static inline uint64_t ldq(const void* p)
{
    const uint64_t *src = p;

    return *src;
}
/* Load the aligned quadword that contains the byte at p: the low three
   address bits are cleared before the load, so p itself may be
   unaligned.
   The mask is built in uintptr_t so that no upper address bits are lost
   on ABIs where unsigned long is narrower than a pointer.
   FIXME ccc doesn't seem to get it? Use inline asm? */
static inline uint64_t ldq_u(const void* p)
{
    const uintptr_t aligned = (uintptr_t) p & ~(uintptr_t) 7;

    return *(const uint64_t*) aligned;
}
/* Store the 32-bit longword l at p by plain assignment; p must be
   suitably aligned for uint32_t. */
static inline void stl(uint32_t l, void* p)
{
    uint32_t *dst = p;

    *dst = l;
}
/* Store the 64-bit quadword l at p by plain assignment; p must be
   suitably aligned for uint64_t. */
static inline void stq(uint64_t l, void* p)
{
    uint64_t *dst = p;

    *dst = l;
}
| #ifdef __GNUC__ | |||||
/* Define a function called `name` that wraps the one-operand machine
   instruction of the same name: the argument is passed in a register
   and the instruction's result register is returned.
   __asm__ is used instead of asm so the header also compiles when GCC
   runs in a strict ISO mode, where the plain keyword is disabled. */
#define OPCODE1(name)                                           \
static inline uint64_t name(uint64_t l)                         \
{                                                               \
    uint64_t r;                                                 \
    __asm__ (#name " %1, %0" : "=r" (r) : "r" (l));             \
    return r;                                                   \
}
/* Define a function called `name` that wraps the two-operand machine
   instruction of the same name.  The first operand must be in a
   register; the second may be a register or a small immediate ("rI").
   __asm__ is used instead of asm so the header also compiles when GCC
   runs in a strict ISO mode, where the plain keyword is disabled. */
#define OPCODE2(name)                                                   \
static inline uint64_t name(uint64_t l1, uint64_t l2)                   \
{                                                                       \
    uint64_t r;                                                         \
    __asm__ (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2));     \
    return r;                                                           \
}
/* Execute the rpcc instruction and return the value it leaves in its
   result register.
   We don't want gcc to move this around or combine it with another
   rpcc, so mark it volatile.  The __asm__/__volatile__ spellings keep
   this usable when GCC runs in a strict ISO mode. */
static inline uint64_t rpcc(void)
{
    uint64_t r;

    __asm__ __volatile__ ("rpcc %0" : "=r" (r));
    return r;
}
/* Load a 64-bit quadword from a possibly unaligned address.
   Reading through a packed struct tells gcc the access may be
   unaligned.  The member is declared uint64_t (not unsigned long) so
   the load is always exactly 64 bits wide, matching the return type
   regardless of the ABI's long size. */
static inline uint64_t uldq(const void* v)
{
    struct unaligned_quad {
        uint64_t q;
    } __attribute__((packed));

    return ((const struct unaligned_quad*) v)->q;
}
| #elif defined(__DECC) /* Compaq "ccc" compiler */ | |||||
| #include <c_asm.h> | |||||
| #define OPCODE1(name) /* ccc variant: define name() around the one-operand instruction `name` */ \ | |||||
| static inline uint64_t name(uint64_t l) \ | |||||
| { \ | |||||
| return asm (#name " %a0, %v0", l); /* asm() here is the <c_asm.h> intrinsic; l is its only argument */ \ | |||||
| } | |||||
| #define OPCODE2(name) /* ccc variant: define name() around the two-operand instruction `name` */ \ | |||||
| static inline uint64_t name(uint64_t l1, uint64_t l2) \ | |||||
| { \ | |||||
| return asm (#name " %a0, %a1, %v0", l1, l2); /* l1 and l2 are handed to the <c_asm.h> asm() intrinsic in order */ \ | |||||
| } | |||||
| static inline uint64_t rpcc(void) /* ccc variant: return the result of the rpcc instruction */ | |||||
| { | |||||
| return asm ("rpcc %v0"); /* no inputs; the asm() intrinsic's value is returned as-is */ | |||||
| } | |||||
| static inline uint64_t uldq(const void* v) /* ccc variant: 64-bit load from a possibly unaligned address */ | |||||
| { | |||||
| return *(const __unaligned uint64_t *) v; /* __unaligned is a compiler-specific pointer qualifier */ | |||||
| } | |||||
| #endif | |||||
| OPCODE1(amask); | |||||
| OPCODE1(unpkbw); | |||||
| OPCODE1(pkwb); | |||||
| OPCODE2(extql); | |||||
| OPCODE2(extqh); | |||||
| OPCODE2(zap); | |||||
| OPCODE2(cmpbge); | |||||
| OPCODE2(minsw4); | |||||
| OPCODE2(minuw4); | |||||
| OPCODE2(minub8); | |||||
| OPCODE2(maxsw4); | |||||
| OPCODE2(maxuw4); | |||||
| OPCODE2(perr); | |||||
| #endif /* LIBAVCODEC_ALPHA_ASM_H */ | |||||
| @@ -0,0 +1,223 @@ | |||||
| /* | |||||
| * Alpha optimized DSP utils | |||||
| * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 2 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, write to the Free Software | |||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |||||
| */ | |||||
| #include "asm.h" | |||||
| #include "../dsputil.h" | |||||
| void simple_idct_axp(DCTELEM *block); | |||||
| static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, | |||||
| int line_size) | |||||
| { | |||||
| int i = 8; | |||||
| do { | |||||
| UINT64 shorts; | |||||
| shorts = ldq(block); | |||||
| shorts = maxsw4(shorts, 0); | |||||
| shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |||||
| stl(pkwb(shorts), pixels); | |||||
| shorts = ldq(block + 4); | |||||
| shorts = maxsw4(shorts, 0); | |||||
| shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |||||
| stl(pkwb(shorts), pixels + 4); | |||||
| pixels += line_size; | |||||
| block += 8; | |||||
| } while (--i); | |||||
| } | |||||
| static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, | |||||
| int line_size) | |||||
| { | |||||
| int i = 8; | |||||
| do { | |||||
| UINT64 shorts; | |||||
| shorts = ldq(block); | |||||
| shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */ | |||||
| shorts += unpkbw(ldl(pixels)); | |||||
| shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */ | |||||
| shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */ | |||||
| shorts &= ~WORD_VEC(0x4000); /* ...and zap them */ | |||||
| shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */ | |||||
| stl(pkwb(shorts), pixels); | |||||
| /* next 4 */ | |||||
| shorts = ldq(block + 4); | |||||
| shorts &= ~WORD_VEC(0x8000); | |||||
| shorts += unpkbw(ldl(pixels + 4)); | |||||
| shorts &= ~WORD_VEC(0x8000); | |||||
| shorts = minuw4(shorts, WORD_VEC(0x4000)); | |||||
| shorts &= ~WORD_VEC(0x4000); | |||||
| shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |||||
| stl(pkwb(shorts), pixels + 4); | |||||
| pixels += line_size; | |||||
| block += 8; | |||||
| } while (--i); | |||||
| } | |||||
| /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1 | |||||
| Since the immediate result could be greater than 255, we do the | |||||
| shift first. The result is too low by one if the bytes were both | |||||
| odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */ | |||||
| static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2) | |||||
| { | |||||
| UINT64 correction = (l1 & l2) & BYTE_VEC(0x01); | |||||
| l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; | |||||
| l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; | |||||
| return l1 + l2 + correction; | |||||
| } | |||||
| /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1 | |||||
| The '1' only has an effect when one byte is even and the other odd, | |||||
| i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01). | |||||
| Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */ | |||||
| static inline UINT64 avg2(UINT64 l1, UINT64 l2) | |||||
| { | |||||
| UINT64 correction = (l1 | l2) & BYTE_VEC(0x01); | |||||
| l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; | |||||
| l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; | |||||
| return l1 + l2 + correction; | |||||
| } | |||||
| static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |||||
| { | |||||
| UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l2 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l3 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l4 & ~BYTE_VEC(0x03)) >> 2); | |||||
| UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) | |||||
| + (l2 & BYTE_VEC(0x03)) | |||||
| + (l3 & BYTE_VEC(0x03)) | |||||
| + (l4 & BYTE_VEC(0x03)) | |||||
| + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); | |||||
| return r1 + r2; | |||||
| } | |||||
| static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |||||
| { | |||||
| UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l2 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l3 & ~BYTE_VEC(0x03)) >> 2) | |||||
| + ((l4 & ~BYTE_VEC(0x03)) >> 2); | |||||
| UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) | |||||
| + (l2 & BYTE_VEC(0x03)) | |||||
| + (l3 & BYTE_VEC(0x03)) | |||||
| + (l4 & BYTE_VEC(0x03)) | |||||
| + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03); | |||||
| return r1 + r2; | |||||
| } | |||||
| #define PIXOPNAME(suffix) put ## suffix /* generated functions are named put<suffix> */ | |||||
| #define BTYPE UINT8 /* destination element type */ | |||||
| #define AVG2 avg2 /* two-input average: rounding variant */ | |||||
| #define AVG4 avg4 /* four-input average: rounding variant */ | |||||
| #define STORE(l, b) stq(l, b) /* result simply overwrites the destination quadword */ | |||||
| #include "pixops.h" /* expands the function templates with the definitions above */ | |||||
| #undef PIXOPNAME | |||||
| #undef BTYPE | |||||
| #undef AVG2 | |||||
| #undef AVG4 | |||||
| #undef STORE | |||||
| #define PIXOPNAME(suffix) put_no_rnd ## suffix /* generated functions are named put_no_rnd<suffix> */ | |||||
| #define BTYPE UINT8 /* destination element type */ | |||||
| #define AVG2 avg2_no_rnd /* two-input average: truncating variant */ | |||||
| #define AVG4 avg4_no_rnd /* four-input average: no-rounding variant */ | |||||
| #define STORE(l, b) stq(l, b) /* result simply overwrites the destination quadword */ | |||||
| #include "pixops.h" /* expands the function templates with the definitions above */ | |||||
| #undef PIXOPNAME | |||||
| #undef BTYPE | |||||
| #undef AVG2 | |||||
| #undef AVG4 | |||||
| #undef STORE | |||||
| /* The following functions are untested. */ | |||||
| #if 0 | |||||
| #define PIXOPNAME(suffix) avg ## suffix | |||||
| #define BTYPE UINT8 | |||||
| #define AVG2 avg2 | |||||
| #define AVG4 avg4 | |||||
| #define STORE(l, b) stq(AVG2(l, ldq(b)), b); | |||||
| #include "pixops.h" | |||||
| #undef PIXOPNAME | |||||
| #undef BTYPE | |||||
| #undef AVG2 | |||||
| #undef AVG4 | |||||
| #undef STORE | |||||
| #define PIXOPNAME(suffix) avg_no_rnd ## suffix | |||||
| #define BTYPE UINT8 | |||||
| #define AVG2 avg2_no_rnd | |||||
| #define AVG4 avg4_no_rnd | |||||
| #define STORE(l, b) stq(AVG2(l, ldq(b)), b); | |||||
| #include "pixops.h" | |||||
| #undef PIXOPNAME | |||||
| #undef BTYPE | |||||
| #undef AVG2 | |||||
| #undef AVG4 | |||||
| #undef STORE | |||||
| #define PIXOPNAME(suffix) sub ## suffix /* generated functions are named sub<suffix> */ | |||||
| #define BTYPE DCTELEM /* destination holds coefficients here, not bytes */ | |||||
| #define AVG2 avg2 | |||||
| #define AVG4 avg4 | |||||
| #define STORE(l, block) do { /* subtract each of the 8 bytes of l from block[0..7] */ \ | |||||
| UINT64 xxx = l; /* evaluate l exactly once */ \ | |||||
| (block)[0] -= (xxx >> 0) & 0xff; \ | |||||
| (block)[1] -= (xxx >> 8) & 0xff; \ | |||||
| (block)[2] -= (xxx >> 16) & 0xff; \ | |||||
| (block)[3] -= (xxx >> 24) & 0xff; \ | |||||
| (block)[4] -= (xxx >> 32) & 0xff; \ | |||||
| (block)[5] -= (xxx >> 40) & 0xff; \ | |||||
| (block)[6] -= (xxx >> 48) & 0xff; \ | |||||
| (block)[7] -= (xxx >> 56) & 0xff; \ | |||||
| } while (0) | |||||
| #include "pixops.h" /* NOTE(review): the templates advance block by line_size elements - confirm that suits a DCTELEM destination */ | |||||
| #undef PIXOPNAME | |||||
| #undef BTYPE | |||||
| #undef AVG2 | |||||
| #undef AVG4 | |||||
| #undef STORE | |||||
| #endif | |||||
| void dsputil_init_alpha(void) | |||||
| { | |||||
| put_pixels_tab[0] = put_pixels_axp; | |||||
| put_pixels_tab[1] = put_pixels_x2_axp; | |||||
| put_pixels_tab[2] = put_pixels_y2_axp; | |||||
| put_pixels_tab[3] = put_pixels_xy2_axp; | |||||
| put_no_rnd_pixels_tab[0] = put_pixels_axp; | |||||
| put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; | |||||
| put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; | |||||
| put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; | |||||
| /* amask clears all bits that correspond to present features. */ | |||||
| if (amask(AMASK_MVI) == 0) { | |||||
| fprintf(stderr, "MVI extension detected\n"); | |||||
| put_pixels_clamped = put_pixels_clamped_axp; | |||||
| add_pixels_clamped = add_pixels_clamped_axp; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,88 @@ | |||||
| /* | |||||
| * Alpha optimized DSP utils | |||||
| * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 2 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, write to the Free Software | |||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |||||
| */ | |||||
| #include "asm.h" | |||||
| #include "../dsputil.h" | |||||
| #include "../mpegvideo.h" | |||||
| extern UINT8 zigzag_end[64]; | |||||
| static void dct_unquantize_h263_axp(MpegEncContext *s, /* dequantize `block` in place, four coefficients per 64-bit word */ | |||||
| DCTELEM *block, int n, int qscale) | |||||
| { | |||||
| int i, level; | |||||
| UINT64 qmul, qadd; | |||||
| if (s->mb_intra) { | |||||
| if (n < 4) /* blocks 0..3 use the y DC scale, the others the c DC scale */ | |||||
| block[0] = block[0] * s->y_dc_scale; | |||||
| else | |||||
| block[0] = block[0] * s->c_dc_scale; | |||||
| /* Catch up to aligned point. */ | |||||
| qmul = s->qscale << 1; | |||||
| qadd = (s->qscale - 1) | 1; | |||||
| for (i = 1; i < 4; ++i) { /* coefficients 1..3 are handled one at a time (0 was the DC term) */ | |||||
| level = block[i]; | |||||
| if (level) { /* zero coefficients stay zero */ | |||||
| if (level < 0) { | |||||
| level = level * qmul - qadd; | |||||
| } else { | |||||
| level = level * qmul + qadd; | |||||
| } | |||||
| block[i] = level; | |||||
| } | |||||
| } | |||||
| block += 4; | |||||
| i = 60 / 4; /* the remaining 60 coefficients, as groups of four */ | |||||
| } else { | |||||
| i = zigzag_end[s->block_last_index[n]] / 4; /* group count from the zigzag_end table; NOTE(review): the do-while below assumes this is >= 1 - confirm */ | |||||
| } | |||||
| qmul = s->qscale << 1; | |||||
| qadd = WORD_VEC((qscale - 1) | 1); /* NOTE(review): built from the qscale argument while qmul uses s->qscale - presumably equal; confirm */ | |||||
| do { | |||||
| UINT64 levels, negmask, zeromask, corr; | |||||
| levels = ldq(block); | |||||
| if (levels == 0) | |||||
| continue; /* jumps to the loop condition, so block still advances */ | |||||
| zeromask = cmpbge(0, levels); /* one bit per byte of levels that is zero */ | |||||
| zeromask &= zeromask >> 1; /* keep a word's low-byte bit only if its high byte is zero too */ | |||||
| /* Negate all negative words. */ | |||||
| negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */ | |||||
| negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */ | |||||
| corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */ | |||||
| levels ^= negmask; | |||||
| levels += corr; | |||||
| levels = levels * qmul; /* one multiply scales all four words; relies on no product overflowing 16 bits */ | |||||
| levels += zap(qadd, zeromask); /* add qadd, with the bytes selected by zeromask cleared */ | |||||
| /* Re-negate negative words. */ | |||||
| levels -= corr; | |||||
| levels ^= negmask; | |||||
| stq(levels, block); | |||||
| } while (block += 4, --i); | |||||
| } | |||||
| void MPV_common_init_axp(MpegEncContext *s) | |||||
| { | |||||
| if (amask(AMASK_MVI) == 0) { | |||||
| if (s->out_format == FMT_H263) | |||||
| s->dct_unquantize = dct_unquantize_h263_axp; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,135 @@ | |||||
| /* | |||||
| * Alpha optimized DSP utils | |||||
| * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 2 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, write to the Free Software | |||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |||||
| */ | |||||
| /* This file is intended to be #included with proper definitions of | |||||
| * PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */ | |||||
| static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels, | |||||
| int line_size, int h) | |||||
| { | |||||
| if ((size_t) pixels & 0x7) { | |||||
| do { | |||||
| STORE(uldq(pixels), block); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } while (--h); | |||||
| } else { | |||||
| do { | |||||
| STORE(ldq(pixels), block); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } while (--h); | |||||
| } | |||||
| } | |||||
| static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels, | |||||
| int line_size, int h) | |||||
| { | |||||
| if ((size_t) pixels & 0x7) { | |||||
| do { | |||||
| UINT64 pix1, pix2; | |||||
| pix1 = uldq(pixels); | |||||
| pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| STORE(AVG2(pix1, pix2), block); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } while (--h); | |||||
| } else { | |||||
| do { | |||||
| UINT64 pix1, pix2; | |||||
| pix1 = ldq(pixels); | |||||
| pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| STORE(AVG2(pix1, pix2), block); | |||||
| pixels += line_size; | |||||
| block += line_size; | |||||
| } while (--h); | |||||
| } | |||||
| } | |||||
| static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels, | |||||
| int line_size, int h) | |||||
| { | |||||
| if ((size_t) pixels & 0x7) { | |||||
| UINT64 pix = uldq(pixels); | |||||
| do { | |||||
| UINT64 next_pix; | |||||
| pixels += line_size; | |||||
| next_pix = uldq(pixels); | |||||
| STORE(AVG2(pix, next_pix), block); | |||||
| block += line_size; | |||||
| pix = next_pix; | |||||
| } while (--h); | |||||
| } else { | |||||
| UINT64 pix = ldq(pixels); | |||||
| do { | |||||
| UINT64 next_pix; | |||||
| pixels += line_size; | |||||
| next_pix = ldq(pixels); | |||||
| STORE(AVG2(pix, next_pix), block); | |||||
| block += line_size; | |||||
| pix = next_pix; | |||||
| } while (--h); | |||||
| } | |||||
| } | |||||
| /* This could be further sped up by recycling AVG4 intermediate | |||||
| results from the previous loop pass. */ | |||||
| static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels, | |||||
| int line_size, int h) | |||||
| { | |||||
| if ((size_t) pixels & 0x7) { | |||||
| UINT64 pix1 = uldq(pixels); | |||||
| UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| do { | |||||
| UINT64 next_pix1, next_pix2; | |||||
| pixels += line_size; | |||||
| next_pix1 = uldq(pixels); | |||||
| next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); | |||||
| block += line_size; | |||||
| pix1 = next_pix1; | |||||
| pix2 = next_pix2; | |||||
| } while (--h); | |||||
| } else { | |||||
| UINT64 pix1 = ldq(pixels); | |||||
| UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| do { | |||||
| UINT64 next_pix1, next_pix2; | |||||
| pixels += line_size; | |||||
| next_pix1 = ldq(pixels); | |||||
| next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); | |||||
| STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block); | |||||
| block += line_size; | |||||
| pix1 = next_pix1; | |||||
| pix2 = next_pix2; | |||||
| } while (--h); | |||||
| } | |||||
| } | |||||
| @@ -497,6 +497,10 @@ void dsputil_init(void) | |||||
| dsputil_init_mlib(); | dsputil_init_mlib(); | ||||
| use_permuted_idct = 0; | use_permuted_idct = 0; | ||||
| #endif | #endif | ||||
| #ifdef ARCH_ALPHA | |||||
| dsputil_init_alpha(); | |||||
| use_permuted_idct = 0; | |||||
| #endif | |||||
| #ifdef SIMPLE_IDCT | #ifdef SIMPLE_IDCT | ||||
| if(ff_idct == simple_idct) use_permuted_idct=0; | if(ff_idct == simple_idct) use_permuted_idct=0; | ||||
| @@ -123,6 +123,13 @@ void dsputil_init_armv4l(void); | |||||
| void dsputil_init_mlib(void); | void dsputil_init_mlib(void); | ||||
| #elif defined(ARCH_ALPHA) | |||||
| #define emms_c() | |||||
| #define __align8 __attribute__ ((aligned (8))) | |||||
| void dsputil_init_alpha(void); | |||||
| #else | #else | ||||
| #define emms_c() | #define emms_c() | ||||
| @@ -460,7 +460,19 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n, | |||||
| : "r" (scale) | : "r" (scale) | ||||
| : "%eax", "%edx" | : "%eax", "%edx" | ||||
| ); | ); | ||||
| #else | |||||
| #elif defined (ARCH_ALPHA) | |||||
| /* Divisions are extremely costly on Alpha; optimize the most | |||||
| common case. */ | |||||
| if (scale == 8) { | |||||
| a = (a + (8 >> 1)) / 8; | |||||
| b = (b + (8 >> 1)) / 8; | |||||
| c = (c + (8 >> 1)) / 8; | |||||
| } else { | |||||
| a = (a + (scale >> 1)) / scale; | |||||
| b = (b + (scale >> 1)) / scale; | |||||
| c = (c + (scale >> 1)) / scale; | |||||
| } | |||||
| #else | |||||
| a = (a + (scale >> 1)) / scale; | a = (a + (scale >> 1)) / scale; | ||||
| b = (b + (scale >> 1)) / scale; | b = (b + (scale >> 1)) / scale; | ||||
| c = (c + (scale >> 1)) / scale; | c = (c + (scale >> 1)) / scale; | ||||
| @@ -23,6 +23,7 @@ | |||||
| #include <inttypes.h> | #include <inttypes.h> | ||||
| #include "simple_idct.h" | #include "simple_idct.h" | ||||
| #include "../config.h" | |||||
| #if 0 | #if 0 | ||||
| #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | ||||
| @@ -102,6 +103,107 @@ static int inline idctRowCondZ (int16_t * row) | |||||
| return 1; | return 1; | ||||
| } | } | ||||
| #ifdef ARCH_ALPHA | |||||
| static int inline idctRowCondDC(int16_t *row) /* row pass of the IDCT on 8 coefficients, in place; returns 0 only when the whole row is zero */ | |||||
| { | |||||
| int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; | |||||
| uint64_t *lrow = (uint64_t *) row; /* view the row as two 64-bit words; assumes row is 8-byte aligned */ | |||||
| if (lrow[1] == 0) { /* coefficients 4..7 are all zero */ | |||||
| if (lrow[0] == 0) | |||||
| return 0; /* nothing to do, row stays zero */ | |||||
| if ((lrow[0] & ~0xffffULL) == 0) { /* only the low 16 bits are set; NOTE(review): that is row[0] only on little-endian */ | |||||
| uint64_t v; | |||||
| a0 = W4 * row[0]; | |||||
| a0 += 1 << (ROW_SHIFT - 1); /* rounding term for the shift below */ | |||||
| a0 >>= ROW_SHIFT; | |||||
| v = (uint16_t) a0; | |||||
| v += v << 16; /* replicate the 16-bit result into all four lanes */ | |||||
| v += v << 32; | |||||
| lrow[0] = v; | |||||
| lrow[1] = v; | |||||
| return 1; | |||||
| } | |||||
| } | |||||
| a0 = W4 * row[0]; /* even part: a0..a3 accumulate the terms from row[0,2,4,6] */ | |||||
| a1 = W4 * row[0]; | |||||
| a2 = W4 * row[0]; | |||||
| a3 = W4 * row[0]; | |||||
| if (row[2]) { /* zero inputs are skipped to save the multiplies */ | |||||
| a0 += W2 * row[2]; | |||||
| a1 += W6 * row[2]; | |||||
| a2 -= W6 * row[2]; | |||||
| a3 -= W2 * row[2]; | |||||
| } | |||||
| if (row[4]) { | |||||
| a0 += W4 * row[4]; | |||||
| a1 -= W4 * row[4]; | |||||
| a2 -= W4 * row[4]; | |||||
| a3 += W4 * row[4]; | |||||
| } | |||||
| if (row[6]) { | |||||
| a0 += W6 * row[6]; | |||||
| a1 -= W2 * row[6]; | |||||
| a2 += W2 * row[6]; | |||||
| a3 -= W6 * row[6]; | |||||
| } | |||||
| a0 += 1 << (ROW_SHIFT - 1); /* rounding term, added once to each even sum */ | |||||
| a1 += 1 << (ROW_SHIFT - 1); | |||||
| a2 += 1 << (ROW_SHIFT - 1); | |||||
| a3 += 1 << (ROW_SHIFT - 1); | |||||
| if (row[1]) { /* odd part: b0..b3 accumulate the terms from row[1,3,5,7] */ | |||||
| b0 = W1 * row[1]; | |||||
| b1 = W3 * row[1]; | |||||
| b2 = W5 * row[1]; | |||||
| b3 = W7 * row[1]; | |||||
| } else { | |||||
| b0 = 0; | |||||
| b1 = 0; | |||||
| b2 = 0; | |||||
| b3 = 0; | |||||
| } | |||||
| if (row[3]) { | |||||
| b0 += W3 * row[3]; | |||||
| b1 -= W7 * row[3]; | |||||
| b2 -= W1 * row[3]; | |||||
| b3 -= W5 * row[3]; | |||||
| } | |||||
| if (row[5]) { | |||||
| b0 += W5 * row[5]; | |||||
| b1 -= W1 * row[5]; | |||||
| b2 += W7 * row[5]; | |||||
| b3 += W3 * row[5]; | |||||
| } | |||||
| if (row[7]) { | |||||
| b0 += W7 * row[7]; | |||||
| b1 -= W5 * row[7]; | |||||
| b2 += W3 * row[7]; | |||||
| b3 -= W1 * row[7]; | |||||
| } | |||||
| row[0] = (a0 + b0) >> ROW_SHIFT; /* outputs k and 7-k share the same even/odd pair (sum and difference) */ | |||||
| row[1] = (a1 + b1) >> ROW_SHIFT; | |||||
| row[2] = (a2 + b2) >> ROW_SHIFT; | |||||
| row[3] = (a3 + b3) >> ROW_SHIFT; | |||||
| row[4] = (a3 - b3) >> ROW_SHIFT; | |||||
| row[5] = (a2 - b2) >> ROW_SHIFT; | |||||
| row[6] = (a1 - b1) >> ROW_SHIFT; | |||||
| row[7] = (a0 - b0) >> ROW_SHIFT; | |||||
| return 1; | |||||
| } | |||||
| #else /* not ARCH_ALPHA */ | |||||
| static int inline idctRowCondDC (int16_t * row) | static int inline idctRowCondDC (int16_t * row) | ||||
| { | { | ||||
| int a0, a1, a2, a3, b0, b1, b2, b3; | int a0, a1, a2, a3, b0, b1, b2, b3; | ||||
| @@ -147,6 +249,7 @@ static int inline idctRowCondDC (int16_t * row) | |||||
| return 1; | return 1; | ||||
| } | } | ||||
| #endif /* not ARCH_ALPHA */ | |||||
| static void inline idctCol (int16_t * col) | static void inline idctCol (int16_t * col) | ||||
| { | { | ||||
| @@ -243,6 +346,7 @@ static void inline idctSparseCol (int16_t * col) | |||||
| b3 += - W1*col[8*7]; | b3 += - W1*col[8*7]; | ||||
| } | } | ||||
| #ifndef ARCH_ALPHA | |||||
| if(!(b0|b1|b2|b3)){ | if(!(b0|b1|b2|b3)){ | ||||
| col[8*0] = (a0) >> COL_SHIFT; | col[8*0] = (a0) >> COL_SHIFT; | ||||
| col[8*7] = (a0) >> COL_SHIFT; | col[8*7] = (a0) >> COL_SHIFT; | ||||
| @@ -253,6 +357,7 @@ static void inline idctSparseCol (int16_t * col) | |||||
| col[8*3] = (a3) >> COL_SHIFT; | col[8*3] = (a3) >> COL_SHIFT; | ||||
| col[8*4] = (a3) >> COL_SHIFT; | col[8*4] = (a3) >> COL_SHIFT; | ||||
| }else{ | }else{ | ||||
| #endif | |||||
| col[8*0] = (a0 + b0) >> COL_SHIFT; | col[8*0] = (a0 + b0) >> COL_SHIFT; | ||||
| col[8*7] = (a0 - b0) >> COL_SHIFT; | col[8*7] = (a0 - b0) >> COL_SHIFT; | ||||
| col[8*1] = (a1 + b1) >> COL_SHIFT; | col[8*1] = (a1 + b1) >> COL_SHIFT; | ||||
| @@ -261,7 +366,9 @@ static void inline idctSparseCol (int16_t * col) | |||||
| col[8*5] = (a2 - b2) >> COL_SHIFT; | col[8*5] = (a2 - b2) >> COL_SHIFT; | ||||
| col[8*3] = (a3 + b3) >> COL_SHIFT; | col[8*3] = (a3 + b3) >> COL_SHIFT; | ||||
| col[8*4] = (a3 - b3) >> COL_SHIFT; | col[8*4] = (a3 - b3) >> COL_SHIFT; | ||||
| #ifndef ARCH_ALPHA | |||||
| } | } | ||||
| #endif | |||||
| } | } | ||||
| static void inline idctSparse2Col (int16_t * col) | static void inline idctSparse2Col (int16_t * col) | ||||
| @@ -337,6 +444,34 @@ static void inline idctSparse2Col (int16_t * col) | |||||
| col[8*4] = (a3 - b3) >> COL_SHIFT; | col[8*4] = (a3 - b3) >> COL_SHIFT; | ||||
| } | } | ||||
| #ifdef ARCH_ALPHA | |||||
| /* If all rows but the first one are zero after row transformation, | |||||
| all rows will be identical after column transformation. */ | |||||
| static inline void idctCol2(int16_t *col) | |||||
| { | |||||
| int i; | |||||
| uint64_t l, r; | |||||
| uint64_t *lcol = (uint64_t *) col; | |||||
| for (i = 0; i < 8; ++i) { | |||||
| int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4; | |||||
| a0 *= W4; | |||||
| col[0] = a0 >> COL_SHIFT; | |||||
| ++col; | |||||
| } | |||||
| l = lcol[0]; | |||||
| r = lcol[1]; | |||||
| lcol[ 2] = l; lcol[ 3] = r; | |||||
| lcol[ 4] = l; lcol[ 5] = r; | |||||
| lcol[ 6] = l; lcol[ 7] = r; | |||||
| lcol[ 8] = l; lcol[ 9] = r; | |||||
| lcol[10] = l; lcol[11] = r; | |||||
| lcol[12] = l; lcol[13] = r; | |||||
| lcol[14] = l; lcol[15] = r; | |||||
| } | |||||
| #endif | |||||
| void simple_idct (short *block) | void simple_idct (short *block) | ||||
| { | { | ||||
| @@ -411,7 +546,22 @@ void simple_idct (short *block) | |||||
| for(i=0; i<8; i++) | for(i=0; i<8; i++) | ||||
| idctSparse2Col(block + i); | idctSparse2Col(block + i); | ||||
| } | } | ||||
| #else | |||||
| #elif defined(ARCH_ALPHA) | |||||
| int shortcut = 1; | |||||
| for (i = 0; i < 8; i++) { | |||||
| int anynonzero = idctRowCondDC(block + 8 * i); | |||||
| if (i > 0 && anynonzero) | |||||
| shortcut = 0; | |||||
| } | |||||
| if (shortcut) { | |||||
| idctCol2(block); | |||||
| } else { | |||||
| for (i = 0; i < 8; i++) | |||||
| idctSparseCol(block + i); | |||||
| } | |||||
| #else | |||||
| for(i=0; i<8; i++) | for(i=0; i<8; i++) | ||||
| idctRowCondDC(block + i*8); | idctRowCondDC(block + i*8); | ||||