Originally committed as revision 14 to svn://svn.ffmpeg.org/ffmpeg/trunktags/v0.5
| @@ -0,0 +1,92 @@ | |||||
| #ifndef __BSWAP_H__ | |||||
| #define __BSWAP_H__ | |||||
| #ifdef HAVE_CONFIG_H | |||||
| #include "config.h" | |||||
| #endif | |||||
| #ifdef HAVE_BYTESWAP_H | |||||
| #include <byteswap.h> | |||||
| #else | |||||
| #include <inttypes.h> | |||||
| #ifdef ARCH_X86 | |||||
/*
 * Reverse the byte order of a 16-bit value.
 *
 * x: value whose low two bytes are exchanged.
 * Returns x with its low and high byte swapped.
 *
 * Written in plain C instead of "xchgb %b0,%h0": the high-byte register
 * modifier is only usable with the a/b/c/d registers, which the "q"
 * constraint guarantees on 32-bit x86 only.  The shift/or form gives the
 * same result everywhere and can be constant-folded by the compiler.
 */
inline static unsigned short ByteSwap16(unsigned short x)
{
    return (unsigned short)(((x & 0x00ffu) << 8) | ((x & 0xff00u) >> 8));
}
| #define bswap_16(x) ByteSwap16(x) | |||||
/*
 * Reverse the byte order of a 32-bit value.
 *
 * x: value to convert (unsigned int is treated as 32 bits wide).
 * Returns x with its four bytes in the opposite order.
 *
 * The previous body stitched one asm statement together from two
 * preprocessor arms selected by "__CPU__ > 386".  Nothing visible in this
 * header defines __CPU__, so the single-instruction bswap arm was never
 * chosen, and the remaining xchgb/rorl arm depended on high-byte registers
 * that only exist for a/b/c/d.  The mask-and-shift form below yields the
 * same value without any register-class assumptions.
 */
inline static unsigned int ByteSwap32(unsigned int x)
{
    return ((x & 0xff000000u) >> 24) |
           ((x & 0x00ff0000u) >>  8) |
           ((x & 0x0000ff00u) <<  8) |
           ((x & 0x000000ffu) << 24);
}
| #define bswap_32(x) ByteSwap32(x) | |||||
/*
 * Reverse the byte order of a 64-bit value.
 *
 * x: value to convert.
 * Returns x with its eight bytes in the opposite order.
 *
 * The previous body overlaid the result on "unsigned long __l[2]" and
 * exchanged the halves with xchgl, which is only correct when unsigned long
 * is exactly 32 bits; with a 64-bit long the union no longer overlays the
 * 64-bit member.  Swapping progressively smaller groups (32-bit halves,
 * then 16-bit pairs, then bytes) needs no type punning and no helper macro.
 */
inline static unsigned long long int ByteSwap64(unsigned long long int x)
{
    x = ((x & 0x00000000ffffffffULL) << 32) | ((x & 0xffffffff00000000ULL) >> 32);
    x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x & 0xffff0000ffff0000ULL) >> 16);
    x = ((x & 0x00ff00ff00ff00ffULL) <<  8) | ((x & 0xff00ff00ff00ff00ULL) >>  8);
    return x;
}
| #define bswap_64(x) ByteSwap64(x) | |||||
| #else | |||||
/*
 * Portable byte-swap fallbacks, used when neither <byteswap.h> nor the
 * x86 helpers are available.
 *
 * bswap_16(x) / bswap_32(x): pure expressions; x is evaluated more than
 * once, so do not pass arguments with side effects.
 * bswap_64(x): evaluates x once (GNU statement expression) and swaps the
 * two 32-bit halves after byte-swapping each of them.
 */
#define bswap_16(x) (((x) & 0x00ff) << 8 | ((x) & 0xff00) >> 8)

// code from bits/byteswap.h (C) 1997, 1998 Free Software Foundation, Inc.
#define bswap_32(x) \
    ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) | \
     (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))

/* The halves must be exactly 32 bits wide: with "unsigned long" the array
   would be 16 bytes on LP64 targets and would not overlay the 64-bit member. */
#define bswap_64(x) \
    (__extension__ \
     ({ union { __extension__ unsigned long long int __ll; \
                uint32_t __l[2]; } __w, __r; \
        __w.__ll = (x); \
        __r.__l[0] = bswap_32 (__w.__l[1]); \
        __r.__l[1] = bswap_32 (__w.__l[0]); \
        __r.__ll; }))
| #endif /* !ARCH_X86 */ | |||||
| #endif /* !HAVE_BYTESWAP_H */ | |||||
| // be2me_N(x): convert an N-bit big-endian value to machine (native) byte order | |||||
| // le2me_N(x): convert an N-bit little-endian value to machine (native) byte order | |||||
| #ifdef WORDS_BIGENDIAN | |||||
| #define be2me_16(x) (x) | |||||
| #define be2me_32(x) (x) | |||||
| #define be2me_64(x) (x) | |||||
| #define le2me_16(x) bswap_16(x) | |||||
| #define le2me_32(x) bswap_32(x) | |||||
| #define le2me_64(x) bswap_64(x) | |||||
| #else | |||||
| #define be2me_16(x) bswap_16(x) | |||||
| #define be2me_32(x) bswap_32(x) | |||||
| #define be2me_64(x) bswap_64(x) | |||||
| #define le2me_16(x) (x) | |||||
| #define le2me_32(x) (x) | |||||
| #define le2me_64(x) (x) | |||||
| #endif | |||||
| #endif | |||||
| @@ -58,10 +58,15 @@ echo "Creating config.mak and config.h" | |||||
| echo "# Automatically generated by configure - do not modify" > config.mak | echo "# Automatically generated by configure - do not modify" > config.mak | ||||
| echo "/* Automatically generated by configure - do not modify */" > config.h | echo "/* Automatically generated by configure - do not modify */" > config.h | ||||
| # Checking for CFLAGS | |||||
| if test -z "$CFLAGS"; then | |||||
| CFLAGS="-O2" | |||||
| fi | |||||
| echo "prefix=$prefix" >> config.mak | echo "prefix=$prefix" >> config.mak | ||||
| echo "CC=$cc" >> config.mak | echo "CC=$cc" >> config.mak | ||||
| echo "AR=$ar" >> config.mak | echo "AR=$ar" >> config.mak | ||||
| echo "OPTFLAGS=-O2" >> config.mak | |||||
| echo "OPTFLAGS=$CFLAGS" >> config.mak | |||||
| if [ "$cpu" = "x86" ] ; then | if [ "$cpu" = "x86" ] ; then | ||||
| echo "TARGET_ARCH_X86=yes" >> config.mak | echo "TARGET_ARCH_X86=yes" >> config.mak | ||||
| echo "#define ARCH_X86 1" >> config.h | echo "#define ARCH_X86 1" >> config.h | ||||
| @@ -74,6 +79,7 @@ if [ "$gprof" = "yes" ] ; then | |||||
| echo "TARGET_GPROF=yes" >> config.mak | echo "TARGET_GPROF=yes" >> config.mak | ||||
| echo "#define HAVE_GPROF 1" >> config.h | echo "#define HAVE_GPROF 1" >> config.h | ||||
| fi | fi | ||||
| echo "#define BIN_PORTABILITY 1 /*undefine it if you want to get maximal performance*/" >> config.h | |||||
| # if you do not want to use encoders, disable that. | # if you do not want to use encoders, disable that. | ||||
| echo "#define CONFIG_ENCODERS 1" >> config.h | echo "#define CONFIG_ENCODERS 1" >> config.h | ||||
| @@ -1,6 +1,6 @@ | |||||
| include ../config.mak | include ../config.mak | ||||
| CFLAGS= $(OPTFLAGS) -Wall -g | |||||
| CFLAGS= $(OPTFLAGS) -Wall -g -DHAVE_CONFIG_H | |||||
| LDFLAGS= -g | LDFLAGS= -g | ||||
| OBJS= common.o utils.o mpegvideo.o h263.o jrevdct.o jfdctfst.o \ | OBJS= common.o utils.o mpegvideo.o h263.o jrevdct.o jfdctfst.o \ | ||||
| @@ -29,6 +29,8 @@ | |||||
| #define NDEBUG | #define NDEBUG | ||||
| #include <assert.h> | #include <assert.h> | ||||
| #include "../bswap.h" | |||||
| void init_put_bits(PutBitContext *s, | void init_put_bits(PutBitContext *s, | ||||
| UINT8 *buffer, int buffer_size, | UINT8 *buffer, int buffer_size, | ||||
| void *opaque, | void *opaque, | ||||
| @@ -222,10 +224,14 @@ unsigned int get_bits(GetBitContext *s, int n) | |||||
| buf_ptr += 4; | buf_ptr += 4; | ||||
| /* handle common case: we can read everything */ | /* handle common case: we can read everything */ | ||||
| if (buf_ptr <= s->buf_end) { | if (buf_ptr <= s->buf_end) { | ||||
| bit_buf = (buf_ptr[-4] << 24) | | |||||
| (buf_ptr[-3] << 16) | | |||||
| #if ARCH_X86 | |||||
| bit_buf = bswap_32(*((unsigned long*)(&buf_ptr[-4]))); | |||||
| #else | |||||
| bit_buf = (buf_ptr[-4] << 24) | | |||||
| (buf_ptr[-3] << 16) | | |||||
| (buf_ptr[-2] << 8) | | (buf_ptr[-2] << 8) | | ||||
| (buf_ptr[-1]); | |||||
| (buf_ptr[-1]); | |||||
| #endif | |||||
| } else { | } else { | ||||
| buf_ptr -= 4; | buf_ptr -= 4; | ||||
| bit_buf = 0; | bit_buf = 0; | ||||
| @@ -30,8 +30,10 @@ int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | ||||
| /* pixel operations */ | /* pixel operations */ | ||||
| static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | |||||
| static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |||||
/* 64-bit MMX constants holding four packed 16-bit words of 1 and of 2.
   The ULL suffix states the intended width explicitly, so the literals do
   not depend on how the compiler types an unsuffixed 64-bit constant. */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
| //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | |||||
| //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |||||
| /***********************************/ | /***********************************/ | ||||
| /* 3Dnow specific */ | /* 3Dnow specific */ | ||||
| @@ -215,7 +217,7 @@ static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm4\n\t" | "movq %0, %%mm4\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -250,7 +252,7 @@ static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm4\n\t" | "movq %0, %%mm4\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -287,7 +289,7 @@ static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wtwo[0]):"memory"); | |||||
| ::"m"(mm_wtwo):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -399,7 +401,7 @@ static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -448,7 +450,7 @@ static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %0, %%mm0\n\t" | "movq %0, %%mm0\n\t" | ||||
| @@ -485,7 +487,7 @@ static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_si | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm1\n\t" | "movq %1, %%mm1\n\t" | ||||
| @@ -531,7 +533,7 @@ static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_si | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm1\n\t" | "movq %1, %%mm1\n\t" | ||||
| @@ -577,7 +579,7 @@ static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_s | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wtwo[0]):"memory"); | |||||
| ::"m"(mm_wtwo):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -621,7 +623,7 @@ static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_s | |||||
| "movq %%mm0, %0\n\t" | "movq %%mm0, %0\n\t" | ||||
| :"=m"(*p) | :"=m"(*p) | ||||
| :"m"(*pix), | :"m"(*pix), | ||||
| "m"(*(pix+line_size)), "m"(mm_wone[0]) | |||||
| "m"(*(pix+line_size)), "m"(mm_wone) | |||||
| :"memory"); | :"memory"); | ||||
| pix += line_size; | pix += line_size; | ||||
| p += line_size ; | p += line_size ; | ||||
| @@ -748,7 +750,7 @@ static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -832,7 +834,7 @@ static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_si | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6" | "movq %0, %%mm6" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %0, %%mm0\n\t" | "movq %0, %%mm0\n\t" | ||||
| @@ -872,7 +874,7 @@ static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_si | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6" | "movq %0, %%mm6" | ||||
| ::"m"(mm_wone[0]):"memory"); | |||||
| ::"m"(mm_wone):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %0, %%mm0\n\t" | "movq %0, %%mm0\n\t" | ||||
| @@ -912,7 +914,7 @@ static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wtwo[0]):"memory"); | |||||
| ::"m"(mm_wtwo):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -243,7 +243,7 @@ static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_si | |||||
| __asm __volatile( | __asm __volatile( | ||||
| "pxor %%mm7, %%mm7\n\t" | "pxor %%mm7, %%mm7\n\t" | ||||
| "movq %0, %%mm6\n\t" | "movq %0, %%mm6\n\t" | ||||
| ::"m"(mm_wtwo[0]):"memory"); | |||||
| ::"m"(mm_wtwo):"memory"); | |||||
| do { | do { | ||||
| __asm __volatile( | __asm __volatile( | ||||
| "movq %1, %%mm0\n\t" | "movq %1, %%mm0\n\t" | ||||
| @@ -0,0 +1,239 @@ | |||||
| /* | |||||
| * The simplest mpeg encoder (well, it was the simplest!) | |||||
| * Copyright (c) 2000,2001 Gerard Lantau. | |||||
| * | |||||
| * This program is free software; you can redistribute it and/or modify | |||||
| * it under the terms of the GNU General Public License as published by | |||||
| * the Free Software Foundation; either version 2 of the License, or | |||||
| * (at your option) any later version. | |||||
| * | |||||
| * This program is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| * GNU General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU General Public License | |||||
| * along with this program; if not, write to the Free Software | |||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |||||
| * | |||||
| * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> | |||||
| */ | |||||
| void MPV_frame_start(MpegEncContext *s) /* ia32 inline-asm frame setup: updates the current/last/next picture slots of s according to s->pict_type */ | |||||
| { /* NOTE(review): every transfer below is a 4-byte movl/xchgl, so each of the three slots is handled as a 32-bit quantity - presumably three plane pointers on a 32-bit target; confirm against MpegEncContext */ | |||||
| if (s->pict_type == B_TYPE) { /* B_TYPE: current_picture[0..2] = aux_picture[0..2] */ | |||||
| __asm __volatile( | |||||
| "movl (%1), %%eax\n\t" /* eax/edx/ecx = the three words at aux_picture (%1) */ | |||||
| "movl 4(%1), %%edx\n\t" | |||||
| "movl 8(%1), %%ecx\n\t" | |||||
| "movl %%eax, (%0)\n\t" /* store them at current_picture (%0) */ | |||||
| "movl %%edx, 4(%0)\n\t" | |||||
| "movl %%ecx, 8(%0)\n\t" | |||||
| : /* no output operands; the stores are covered by the "memory" clobber */ | |||||
| :"r"(s->current_picture), "r"(s->aux_picture) | |||||
| :"eax","edx","ecx","memory"); | |||||
| } else { | |||||
| /* swap next and last; current_picture receives the values that were in last_picture */ | |||||
| __asm __volatile( | |||||
| "movl (%1), %%eax\n\t" /* eax/edx/ecx = next_picture[0..2] (%1) */ | |||||
| "movl 4(%1), %%edx\n\t" | |||||
| "movl 8(%1), %%ecx\n\t" | |||||
| "xchgl (%0), %%eax\n\t" /* last_picture (%0) takes the next values; registers now hold the old last values */ | |||||
| "xchgl 4(%0), %%edx\n\t" | |||||
| "xchgl 8(%0), %%ecx\n\t" | |||||
| "movl %%eax, (%1)\n\t" /* next_picture = old last_picture */ | |||||
| "movl %%edx, 4(%1)\n\t" | |||||
| "movl %%ecx, 8(%1)\n\t" | |||||
| "movl %%eax, (%2)\n\t" /* current_picture (%2) = old last_picture as well */ | |||||
| "movl %%edx, 4(%2)\n\t" | |||||
| "movl %%ecx, 8(%2)\n\t" | |||||
| : /* no output operands; the stores are covered by the "memory" clobber */ | |||||
| :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) | |||||
| :"eax","edx","ecx","memory"); | |||||
| } | |||||
| } | |||||
| static void dct_unquantize(MpegEncContext *s, DCTELEM *block, int n, int qscale); | |||||
| #ifdef HAVE_MMX | |||||
| static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; | |||||
| static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |||||
| /* | |||||
| NK: | |||||
| Note: looking at PARANOID: | |||||
| "enable all paranoid tests for rounding, overflows, etc..." | |||||
| #ifdef PARANOID | |||||
| if (level < -2048 || level > 2047) | |||||
| fprintf(stderr, "unquant error %d %d\n", i, level); | |||||
| #endif | |||||
| We can assume that the result of the two multiplications cannot be greater than 0xFFFF, | |||||
| i.e. it fits in 16 bits, so only the PMULLW instruction is used here and | |||||
| a more complex multiplication can be avoided. | |||||
| ===================================================== | |||||
| Full formula for the multiplication of two integers | |||||
| which are represented as high:low words: | |||||
| input: value1 = high1:low1 | |||||
| value2 = high2:low2 | |||||
| output: value3 = value1*value2 | |||||
| value3=high3:low3 (on overflow: modulo 2^32 wrap-around) | |||||
| this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 | |||||
| but this algorithm will compute only 0x66cb0ce4; | |||||
| this is a consequence of the 16-bit size of the operands | |||||
| --------------------------------- | |||||
| tlow1 = high1*low2 | |||||
| tlow2 = high2*low1 | |||||
| tlow1 = tlow1 + tlow2 | |||||
| high3:low3 = low1*low2 | |||||
| high3 += tlow1 | |||||
| */ | |||||
| #ifdef BIN_PORTABILITY /* run-time dispatch build: keep a distinct name so MPV_common_init_mmx() can choose between this and the C dct_unquantize */ | |||||
| static void dct_unquantize_mmx | |||||
| #else /* no run-time dispatch: this function itself is dct_unquantize */ | |||||
| #define HAVE_DCT_UNQUANTIZE 1 /* tested with #ifndef around the generic C dct_unquantize */ | |||||
| static void dct_unquantize | |||||
| #endif | |||||
| (MpegEncContext *s,DCTELEM *block, int n, int qscale) /* MMX inverse quantizer: rewrites the 64 entries of block in place; n selects the DC scale (n < 4: y_dc_scale, otherwise c_dc_scale) */ | |||||
| { /* NOTE(review): the MMX loops work on 16-bit words (pmullw keeps only the low 16 bits of each product) while the scalar loops compute in int - results agree only while products fit in 16 bits, as the note preceding this function assumes */ | |||||
| int i, level; | |||||
| const UINT16 *quant_matrix; | |||||
| if (s->mb_intra) { | |||||
| if (n < 4) /* intra block: the DC coefficient is scaled separately */ | |||||
| block[0] = block[0] * s->y_dc_scale; | |||||
| else | |||||
| block[0] = block[0] * s->c_dc_scale; | |||||
| if (s->out_format == FMT_H263) { /* H.263 intra: remaining coefficients go through the non-intra path, starting at index 1 */ | |||||
| i = 1; | |||||
| goto unquant_even; | |||||
| } | |||||
| /* XXX: only mpeg1 */ | |||||
| quant_matrix = s->intra_matrix; | |||||
| i=1; | |||||
| /* Align on 4 elements boundary: scalar code handles indices until i is a multiple of 4 (here 1..3) */ | |||||
| while(i&3) | |||||
| { | |||||
| level = block[i]; | |||||
| if (level) { /* zero coefficients are left untouched */ | |||||
| if (level < 0) level = -level; | |||||
| level = (int)(level * qscale * quant_matrix[i]) >> 3; | |||||
| level = (level - 1) | 1; /* force the magnitude to an odd value */ | |||||
| if (block[i] < 0) level = -level; /* restore the original sign */ | |||||
| block[i] = level; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| __asm __volatile( /* load loop constants; mm4..mm7 are expected to survive into the asm statements inside the loop below */ | |||||
| "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ | |||||
| "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ | |||||
| "movq %2, %%mm4\n\t" /* mm4 = mm_wabs (all bits set) */ | |||||
| "movq %%mm6, %%mm7\n\t" | |||||
| "movq %1, %%mm5\n\t" /* mm5 = mm_wone (1 in every 16-bit word) */ | |||||
| "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ | |||||
| "pxor %%mm6, %%mm6\n\t" /* mm6 = 0 */ | |||||
| ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); | |||||
| for(;i<64;i+=4) { /* four 16-bit coefficients per iteration */ | |||||
| __asm __volatile( | |||||
| "movq %1, %%mm0\n\t" /* mm0 = four words starting at block[i] */ | |||||
| "movq %%mm7, %%mm1\n\t" /* mm1 = qscale in each word */ | |||||
| "movq %%mm0, %%mm2\n\t" | |||||
| "movq %%mm0, %%mm3\n\t" | |||||
| "pcmpgtw %%mm6, %%mm2\n\t" /* mm2 = 0xffff where coefficient > 0 */ | |||||
| "pmullw %2, %%mm1\n\t" /* mm1 = qscale * quant_matrix word (low 16 bits) */ | |||||
| "pandn %%mm4, %%mm2\n\t" /* mm2 = 0xffff where coefficient <= 0 */ | |||||
| "por %%mm5, %%mm2\n\t" /* mm2 = +1 where coefficient > 0, -1 otherwise */ | |||||
| "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ | |||||
| "pcmpeqw %%mm6, %%mm3\n\t" /* mm3 = 0xffff where coefficient == 0 */ | |||||
| "pmullw %%mm0, %%mm1\n\t" /* mm1 = abs(coefficient) * qscale * quant_matrix word (low 16 bits) */ | |||||
| "psraw $3, %%mm1\n\t" /* >>= 3, as in the scalar intra formula */ | |||||
| "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ | |||||
| "pandn %%mm4, %%mm3\n\t" /* emulated pcmpneqw: mm3 = 0xffff where coefficient != 0 */ | |||||
| "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ | |||||
| "pmullw %%mm2, %%mm1\n\t" /* change signs again */ | |||||
| "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ | |||||
| "movq %%mm1, %0" /* store the four results back over the inputs */ | |||||
| :"=m"(block[i]) | |||||
| :"m"(block[i]), "m"(quant_matrix[i]) | |||||
| :"memory"); | |||||
| } | |||||
| } else { | |||||
| i = 0; | |||||
| unquant_even: /* also entered from the H.263 intra case above with i == 1 */ | |||||
| quant_matrix = s->non_intra_matrix; | |||||
| /* Align on 4 elements boundary: runs only when entered with i == 1; with i == 0 the loop body is skipped */ | |||||
| while(i&3) | |||||
| { | |||||
| level = block[i]; | |||||
| if (level) { /* zero coefficients are left untouched */ | |||||
| if (level < 0) level = -level; | |||||
| level = (((level << 1) + 1) * qscale * | |||||
| ((int) quant_matrix[i])) >> 4; | |||||
| level = (level - 1) | 1; /* force the magnitude to an odd value */ | |||||
| if(block[i] < 0) level = -level; /* restore the original sign */ | |||||
| block[i] = level; | |||||
| } | |||||
| i++; | |||||
| } | |||||
| __asm __volatile( /* same constant setup as in the intra branch; mm4..mm7 are reused by the loop below */ | |||||
| "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ | |||||
| "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ | |||||
| "movq %2, %%mm4\n\t" /* mm4 = mm_wabs (all bits set) */ | |||||
| "movq %%mm6, %%mm7\n\t" | |||||
| "movq %1, %%mm5\n\t" /* mm5 = mm_wone (1 in every 16-bit word) */ | |||||
| "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ | |||||
| "pxor %%mm6, %%mm6\n\t" /* mm6 = 0 */ | |||||
| ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); | |||||
| for(;i<64;i+=4) { /* four 16-bit coefficients per iteration */ | |||||
| __asm __volatile( | |||||
| "movq %1, %%mm0\n\t" /* mm0 = four words starting at block[i] */ | |||||
| "movq %%mm7, %%mm1\n\t" /* mm1 = qscale in each word */ | |||||
| "movq %%mm0, %%mm2\n\t" | |||||
| "movq %%mm0, %%mm3\n\t" | |||||
| "pcmpgtw %%mm6, %%mm2\n\t" /* mm2 = 0xffff where coefficient > 0 */ | |||||
| "pmullw %2, %%mm1\n\t" /* mm1 = qscale * quant_matrix word (low 16 bits) */ | |||||
| "pandn %%mm4, %%mm2\n\t" /* mm2 = 0xffff where coefficient <= 0 */ | |||||
| "por %%mm5, %%mm2\n\t" /* mm2 = +1 where coefficient > 0, -1 otherwise */ | |||||
| "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ | |||||
| "psllw $1, %%mm0\n\t" /* block[i] <<= 1 */ | |||||
| "paddw %%mm5, %%mm0\n\t" /* block[i] ++ */ | |||||
| "pmullw %%mm0, %%mm1\n\t" /* mm1 = (2*abs(coefficient) + 1) * qscale * quant_matrix word (low 16 bits) */ | |||||
| "psraw $4, %%mm1\n\t" /* >>= 4, as in the scalar non-intra formula */ | |||||
| "pcmpeqw %%mm6, %%mm3\n\t" /* mm3 = 0xffff where coefficient == 0 */ | |||||
| "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ | |||||
| "pandn %%mm4, %%mm3\n\t" /* emulated pcmpneqw: mm3 = 0xffff where coefficient != 0 */ | |||||
| "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ | |||||
| "pmullw %%mm2, %%mm1\n\t" /* change signs again */ | |||||
| "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ | |||||
| "movq %%mm1, %0" /* store the four results back over the inputs */ | |||||
| :"=m"(block[i]) | |||||
| :"m"(block[i]), "m"(quant_matrix[i]) | |||||
| :"memory"); | |||||
| } | |||||
| } | |||||
| } /* NOTE(review): no emms is issued in this function; presumably the MMX state is cleared elsewhere before any FPU use - confirm */ | |||||
| #ifdef BIN_PORTABILITY | |||||
| static void (*dct_unquantize_ptr)(MpegEncContext *s, | |||||
| DCTELEM *block, int n, int qscale); | |||||
| void MPV_common_init_mmx(void) | |||||
| { | |||||
| int mm_flags; | |||||
| mm_flags = mm_support(); | |||||
| if (mm_flags & MM_MMX) { | |||||
| dct_unquantize_ptr = dct_unquantize_mmx; | |||||
| } | |||||
| else { | |||||
| dct_unquantize_ptr = dct_unquantize; | |||||
| } | |||||
| } | |||||
| #define DCT_UNQUANTIZE(a,b,c,d) (*dct_unquantize_ptr)(a,b,c,d) | |||||
| #else | |||||
| #define DCT_UNQUANTIZE(a,b,c,d) dct_unquantize(a,b,c,d) | |||||
| #endif /* BIN_PORTABILITY */ | |||||
| #endif /* HAVE_MMX */ | |||||
| @@ -24,6 +24,15 @@ | |||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "mpegvideo.h" | #include "mpegvideo.h" | ||||
| #include "../config.h" | |||||
| #ifdef ARCH_X86 | |||||
| #include "i386/mpegvideo.c" | |||||
| #endif | |||||
| #ifndef DCT_UNQUANTIZE | |||||
| #define DCT_UNQUANTIZE(a,b,c,d) dct_unquantize(a,b,c,d) | |||||
| #endif | |||||
| #define EDGE_WIDTH 16 | #define EDGE_WIDTH 16 | ||||
| /* enable all paranoid tests for rounding, overflows, etc... */ | /* enable all paranoid tests for rounding, overflows, etc... */ | ||||
| @@ -89,6 +98,9 @@ int MPV_common_init(MpegEncContext *s) | |||||
| int c_size, i; | int c_size, i; | ||||
| UINT8 *pict; | UINT8 *pict; | ||||
| #if defined ( HAVE_MMX ) && defined ( BIN_PORTABILITY ) | |||||
| MPV_common_init_mmx(); | |||||
| #endif | |||||
| s->mb_width = (s->width + 15) / 16; | s->mb_width = (s->width + 15) / 16; | ||||
| s->mb_height = (s->height + 15) / 16; | s->mb_height = (s->height + 15) / 16; | ||||
| s->linesize = s->mb_width * 16 + 2 * EDGE_WIDTH; | s->linesize = s->mb_width * 16 + 2 * EDGE_WIDTH; | ||||
| @@ -345,8 +357,8 @@ static void draw_edges(UINT8 *buf, int wrap, int width, int height, int w) | |||||
| } | } | ||||
| } | } | ||||
| /* generic function for encode/decode called before a frame is coded/decoded */ | /* generic function for encode/decode called before a frame is coded/decoded */ | ||||
| #ifndef ARCH_X86 | |||||
| void MPV_frame_start(MpegEncContext *s) | void MPV_frame_start(MpegEncContext *s) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -366,7 +378,7 @@ void MPV_frame_start(MpegEncContext *s) | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| #endif | |||||
| /* generic function for encode/decode called after a frame has been coded/decoded */ | /* generic function for encode/decode called after a frame has been coded/decoded */ | ||||
| void MPV_frame_end(MpegEncContext *s) | void MPV_frame_end(MpegEncContext *s) | ||||
| { | { | ||||
| @@ -621,7 +633,7 @@ static inline void put_dct(MpegEncContext *s, | |||||
| DCTELEM *block, int i, UINT8 *dest, int line_size) | DCTELEM *block, int i, UINT8 *dest, int line_size) | ||||
| { | { | ||||
| if (!s->mpeg2) | if (!s->mpeg2) | ||||
| dct_unquantize(s, block, i, s->qscale); | |||||
| DCT_UNQUANTIZE(s, block, i, s->qscale); | |||||
| j_rev_dct (block); | j_rev_dct (block); | ||||
| put_pixels_clamped(block, dest, line_size); | put_pixels_clamped(block, dest, line_size); | ||||
| } | } | ||||
| @@ -632,7 +644,7 @@ static inline void add_dct(MpegEncContext *s, | |||||
| { | { | ||||
| if (s->block_last_index[i] >= 0) { | if (s->block_last_index[i] >= 0) { | ||||
| if (!s->mpeg2) | if (!s->mpeg2) | ||||
| dct_unquantize(s, block, i, s->qscale); | |||||
| DCT_UNQUANTIZE(s, block, i, s->qscale); | |||||
| j_rev_dct (block); | j_rev_dct (block); | ||||
| add_pixels_clamped(block, dest, line_size); | add_pixels_clamped(block, dest, line_size); | ||||
| } | } | ||||
| @@ -1109,6 +1121,7 @@ static int dct_quantize_mmx(MpegEncContext *s, | |||||
| return last_non_zero; | return last_non_zero; | ||||
| } | } | ||||
| #ifndef HAVE_DCT_UNQUANTIZE | |||||
| static void dct_unquantize(MpegEncContext *s, | static void dct_unquantize(MpegEncContext *s, | ||||
| DCTELEM *block, int n, int qscale) | DCTELEM *block, int n, int qscale) | ||||
| { | { | ||||
| @@ -1172,7 +1185,7 @@ static void dct_unquantize(MpegEncContext *s, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| #endif | |||||
| /* rate control */ | /* rate control */ | ||||