Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk
@@ -403,7 +403,7 @@ OBJS-$(ARCH_SH4) += sh4/idct_sh4.o \
 sh4/dsputil_align.o \
 sh4/dsputil_sh4.o \
-OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
+ALTIVEC-OBJS-yes += ppc/dsputil_altivec.o \
 ppc/fdct_altivec.o \
 ppc/fft_altivec.o \
 ppc/float_altivec.o \

@@ -413,12 +413,17 @@ OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
 ppc/mpegvideo_altivec.o \
 ppc/mpegvideo_ppc.o \
-ifeq ($(HAVE_ALTIVEC),yes)
-OBJS-$(CONFIG_H264_DECODER) += ppc/h264_altivec.o
-OBJS-$(CONFIG_SNOW_DECODER) += ppc/snow_altivec.o
-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
-OBJS-$(CONFIG_WMV3_DECODER) += ppc/vc1dsp_altivec.o
-endif
+ALTIVEC-OBJS-$(CONFIG_H264_DECODER) += ppc/h264_altivec.o
+ALTIVEC-OBJS-$(CONFIG_SNOW_DECODER) += ppc/snow_altivec.o
+ALTIVEC-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
+ALTIVEC-OBJS-$(CONFIG_WMV3_DECODER) += ppc/vc1dsp_altivec.o
+# -maltivec is needed in order to build AltiVec code.
+$(ALTIVEC-OBJS-yes): CFLAGS += -maltivec -mabi=altivec
+# check_altivec must be built without -maltivec
+OBJS-$(HAVE_ALTIVEC) += $(ALTIVEC-OBJS-yes) \
+                        ppc/check_altivec.o
 OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
 bfin/mpegvideo_bfin.o \
@@ -557,12 +557,6 @@ extern int mm_flags;
 extern int mm_flags;
-#if defined(HAVE_ALTIVEC) && !defined(__APPLE_CC__)
-#define pixel altivec_pixel
-#include <altivec.h>
-#undef pixel
-#endif
 #define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
 #define STRIDE_ALIGN 16
@@ -28,6 +28,10 @@
 #include "swscale.h"
 #include "dsputil.h"
+#ifdef HAVE_ALTIVEC
+#include "ppc/imgresample_altivec.h"
+#endif

 #define NB_COMPONENTS 3
 #define PHASE_BITS 4
@@ -281,133 +285,6 @@ static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
 }
 #endif /* HAVE_MMX */
-#ifdef HAVE_ALTIVEC
-typedef union {
-    vector unsigned char v;
-    unsigned char c[16];
-} vec_uc_t;
-
-typedef union {
-    vector signed short v;
-    signed short s[8];
-} vec_ss_t;
-
-void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
-                          int wrap, int16_t *filter)
-{
-    int sum, i;
-    const uint8_t *s;
-    vector unsigned char *tv, tmp, dstv, zero;
-    vec_ss_t srchv[4], srclv[4], fv[4];
-    vector signed short zeros, sumhv, sumlv;
-    s = src;
-
-    for(i=0;i<4;i++)
-    {
-        /*
-           The vec_madds later on does an implicit >>15 on the result.
-           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
-           a signed short, we have just enough bits to pre-shift our
-           filter constants <<7 to compensate for vec_madds.
-        */
-        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
-        fv[i].v = vec_splat(fv[i].v, 0);
-    }
-
-    zero = vec_splat_u8(0);
-    zeros = vec_splat_s16(0);
-
-    /*
-       When we're resampling, we'd ideally like both our input buffers,
-       and output buffers to be 16-byte aligned, so we can do both aligned
-       reads and writes. Sadly we can't always have this at the moment, so
-       we opt for aligned writes, as unaligned writes have a huge overhead.
-       To do this, do enough scalar resamples to get dst 16-byte aligned.
-    */
-    i = (-(int)dst) & 0xf;
-    while(i>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-        i--;
-    }
-
-    /* Do our altivec resampling on 16 pixels at once. */
-    while(dst_width>=16) {
-        /*
-           Read 16 (potentially unaligned) bytes from each of
-           4 lines into 4 vectors, and split them into shorts.
-           Interleave the multipy/accumulate for the resample
-           filter with the loads to hide the 3 cycle latency
-           the vec_madds have.
-        */
-        tv = (vector unsigned char *) &s[0 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
-        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
-        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
-
-        tv = (vector unsigned char *) &s[1 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
-        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
-        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
-
-        tv = (vector unsigned char *) &s[2 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
-        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
-        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
-
-        tv = (vector unsigned char *) &s[3 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
-        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
-        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
-
-        /*
-           Pack the results into our destination vector,
-           and do an aligned write of that back to memory.
-        */
-        dstv = vec_packsu(sumhv, sumlv) ;
-        vec_st(dstv, 0, (vector unsigned char *) dst);
-
-        dst+=16;
-        s+=16;
-        dst_width-=16;
-    }
-
-    /*
-       If there are any leftover pixels, resample them
-       with the slow scalar method.
-    */
-    while(dst_width>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-    }
-}
-#endif /* HAVE_ALTIVEC */

 /* slow version to handle limit cases. Does not need optimisation */
 static void h_resample_slow(uint8_t *dst, int dst_width,
                             const uint8_t *src, int src_width,
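Note on the fixed-point trick used by the AltiVec path removed above (and re-added below in ppc/imgresample_altivec.c): vec_madds() applies an implicit >>15, so each Q8 coefficient is pre-shifted by 15-FILTER_BITS before the multiply. A minimal scalar sketch of that identity follows; FILTER_BITS == 8 and the 4-tap layout come from imgresample.c, while the standalone #define and the function name are illustrative only, and vec_madds' rounding and saturation are ignored here.

#include <stdint.h>

#define FILTER_BITS 8   /* value used by imgresample.c */

/* One output pixel of the 4-tap vertical filter, computed with the
   pre-shifted coefficients: (s * (f << 7)) >> 15 equals (s * f) >> 8,
   which is the >>FILTER_BITS of the scalar reference code. */
static uint8_t resample_pixel(const uint8_t *s, int wrap, const int16_t *filter)
{
    int sum = 0, i;
    for (i = 0; i < 4; i++) {
        int16_t pre = filter[i] << (15 - FILTER_BITS);   /* <<7 */
        sum += (s[i * wrap] * pre) >> 15;                /* implicit >>15 of vec_madds */
    }
    if (sum < 0)   sum = 0;                              /* clamp to 8-bit range */
    if (sum > 255) sum = 255;
    return (uint8_t)sum;
}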
@@ -0,0 +1,95 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file check_altivec.c
+ * Checks for AltiVec presence.
+ */
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif __AMIGAOS4__
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#else
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler (int sig)
+{
+    if (!canjump) {
+        signal (sig, SIG_DFL);
+        raise (sig);
+    }
+
+    canjump = 0;
+    siglongjmp (jmpbuf, 1);
+}
+#endif /* __APPLE__ */
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure altivec
+ * is present
+ */
+int has_altivec(void)
+{
+#ifdef __AMIGAOS4__
+    ULONG result = 0;
+    extern struct ExecIFace *IExec;
+
+    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+    if (result == VECTORTYPE_ALTIVEC) return 1;
+    return 0;
+#elif __APPLE__
+    int sels[2] = {CTL_HW, HW_VECTORUNIT};
+    int has_vu = 0;
+    size_t len = sizeof(has_vu);
+    int err;
+
+    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+    if (err == 0) return (has_vu != 0);
+    return 0;
+#else
+/* Do it the brute-force way, borrowed from the libmpeg2 library. */
+    {
+        signal (SIGILL, sigill_handler);
+        if (sigsetjmp (jmpbuf, 1)) {
+            signal (SIGILL, SIG_DFL);
+        } else {
+            canjump = 1;
+
+            asm volatile ("mtspr 256, %0\n\t"
+                          "vand %%v0, %%v0, %%v0"
+                          :
+                          : "r" (-1));
+
+            signal (SIGILL, SIG_DFL);
+            return 1;
+        }
+    }
+    return 0;
+#endif /* __AMIGAOS4__ */
+}
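has_altivec() is the only symbol the new file exports, and per the Makefile change above check_altivec.o is built without -maltivec, so calling it is safe on any PowerPC CPU. A hedged sketch of the intended call pattern follows; the init function and the function-pointer type below are illustrative, not part of this commit.

extern int has_altivec(void);

typedef void (*idct_fn)(short *block);

/* Illustrative only: install the AltiVec routine only when the runtime
   check succeeds, otherwise keep the portable C implementation. */
void example_init(idct_fn *idct, idct_fn c_version, idct_fn altivec_version)
{
    *idct = has_altivec() ? altivec_version : c_version;
}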
@@ -25,31 +25,7 @@
 #include "gcc_fixes.h"
 #include "dsputil_altivec.h"
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-#elif __AMIGAOS4__
-#include <exec/exec.h>
-#include <interfaces/exec.h>
-#include <proto/exec.h>
-#else
-#include <signal.h>
-#include <setjmp.h>
-
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler (int sig)
-{
-    if (!canjump) {
-        signal (sig, SIG_DFL);
-        raise (sig);
-    }
-
-    canjump = 0;
-    siglongjmp (jmpbuf, 1);
-}
-#endif /* __APPLE__ */
+#include "util_altivec.h"

 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
@@ -1417,47 +1393,6 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
     return score;
 }
-
-int has_altivec(void)
-{
-#ifdef __AMIGAOS4__
-    ULONG result = 0;
-    extern struct ExecIFace *IExec;
-
-    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
-    if (result == VECTORTYPE_ALTIVEC) return 1;
-    return 0;
-#elif __APPLE__
-    int sels[2] = {CTL_HW, HW_VECTORUNIT};
-    int has_vu = 0;
-    size_t len = sizeof(has_vu);
-    int err;
-
-    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-    if (err == 0) return (has_vu != 0);
-    return 0;
-#else
-/* Do it the brute-force way, borrowed from the libmpeg2 library. */
-    {
-        signal (SIGILL, sigill_handler);
-        if (sigsetjmp (jmpbuf, 1)) {
-            signal (SIGILL, SIG_DFL);
-        } else {
-            canjump = 1;
-
-            asm volatile ("mtspr 256, %0\n\t"
-                          "vand %%v0, %%v0, %%v0"
-                          :
-                          : "r" (-1));
-
-            signal (SIGILL, SIG_DFL);
-            return 1;
-        }
-    }
-    return 0;
-#endif /* __AMIGAOS4__ */
-}
-
 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                             int blocksize)
 {
@@ -31,83 +31,4 @@ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-// used to build registers permutation vectors (vcprm)
-// the 's' are for words in the _s_econd vector
-#define WORD_0 0x00,0x01,0x02,0x03
-#define WORD_1 0x04,0x05,0x06,0x07
-#define WORD_2 0x08,0x09,0x0a,0x0b
-#define WORD_3 0x0c,0x0d,0x0e,0x0f
-#define WORD_s0 0x10,0x11,0x12,0x13
-#define WORD_s1 0x14,0x15,0x16,0x17
-#define WORD_s2 0x18,0x19,0x1a,0x1b
-#define WORD_s3 0x1c,0x1d,0x1e,0x1f
-
-#ifdef __APPLE_CC__
-#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
-#else
-#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
-#endif
-
-// vcprmle is used to keep the same index as in the SSE version.
-// it's the same as vcprm, with the index inversed
-// ('le' is Little Endian)
-#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
-
-// used to build inverse/identity vectors (vcii)
-// n is _n_egative, p is _p_ositive
-#define FLOAT_n -1.
-#define FLOAT_p 1.
-#ifdef __APPLE_CC__
-#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
-#else
-#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
-#endif
-
-// Transpose 8x8 matrix of 16-bit elements (in-place)
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
-    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
-    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
-    A1 = vec_mergeh (a, e); \
-    B1 = vec_mergel (a, e); \
-    C1 = vec_mergeh (b, f); \
-    D1 = vec_mergel (b, f); \
-    E1 = vec_mergeh (c, g); \
-    F1 = vec_mergel (c, g); \
-    G1 = vec_mergeh (d, h); \
-    H1 = vec_mergel (d, h); \
- \
-    A2 = vec_mergeh (A1, E1); \
-    B2 = vec_mergel (A1, E1); \
-    C2 = vec_mergeh (B1, F1); \
-    D2 = vec_mergel (B1, F1); \
-    E2 = vec_mergeh (C1, G1); \
-    F2 = vec_mergel (C1, G1); \
-    G2 = vec_mergeh (D1, H1); \
-    H2 = vec_mergel (D1, H1); \
- \
-    a = vec_mergeh (A2, E2); \
-    b = vec_mergel (A2, E2); \
-    c = vec_mergeh (B2, F2); \
-    d = vec_mergel (B2, F2); \
-    e = vec_mergeh (C2, G2); \
-    f = vec_mergel (C2, G2); \
-    g = vec_mergeh (D2, H2); \
-    h = vec_mergel (D2, H2); \
-} while (0)
-
-/** \brief loads unaligned vector \a *src with offset \a offset
-    and returns it */
-static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
-{
-    register vector unsigned char first = vec_ld(offset, src);
-    register vector unsigned char second = vec_ld(offset+15, src);
-    register vector unsigned char mask = vec_lvsl(offset, src);
-    return vec_perm(first, second, mask);
-}
-
 #endif /* DSPUTIL_ALTIVEC_H */
@@ -24,8 +24,8 @@
 #include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"

 /*
  those three macros are from libavcodec/fft.c
  and are required for the reference C code
@@ -24,7 +24,8 @@
 #include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"

 /*
   altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,
@@ -22,7 +22,8 @@
 #include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
 #include "types_altivec.h"

 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
@@ -0,0 +1,153 @@
+/*
+ * High quality image resampling with polyphase filters
+ * Copyright (c) 2001 Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file imgresample_altivec.c
+ * High quality image resampling with polyphase filters - AltiVec bits
+ */
+
+#include "gcc_fixes.h"
+
+typedef union {
+    vector unsigned char v;
+    unsigned char c[16];
+} vec_uc_t;
+
+typedef union {
+    vector signed short v;
+    signed short s[8];
+} vec_ss_t;
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+                          int wrap, int16_t *filter)
+{
+    int sum, i;
+    const uint8_t *s;
+    vector unsigned char *tv, tmp, dstv, zero;
+    vec_ss_t srchv[4], srclv[4], fv[4];
+    vector signed short zeros, sumhv, sumlv;
+    s = src;
+
+    for(i=0;i<4;i++)
+    {
+        /*
+           The vec_madds later on does an implicit >>15 on the result.
+           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
+           a signed short, we have just enough bits to pre-shift our
+           filter constants <<7 to compensate for vec_madds.
+        */
+        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
+        fv[i].v = vec_splat(fv[i].v, 0);
+    }
+
+    zero = vec_splat_u8(0);
+    zeros = vec_splat_s16(0);
+
+    /*
+       When we're resampling, we'd ideally like both our input buffers,
+       and output buffers to be 16-byte aligned, so we can do both aligned
+       reads and writes. Sadly we can't always have this at the moment, so
+       we opt for aligned writes, as unaligned writes have a huge overhead.
+       To do this, do enough scalar resamples to get dst 16-byte aligned.
+    */
+    i = (-(int)dst) & 0xf;
+    while(i>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+        i--;
+    }
+
+    /* Do our altivec resampling on 16 pixels at once. */
+    while(dst_width>=16) {
+        /*
+           Read 16 (potentially unaligned) bytes from each of
+           4 lines into 4 vectors, and split them into shorts.
+           Interleave the multipy/accumulate for the resample
+           filter with the loads to hide the 3 cycle latency
+           the vec_madds have.
+        */
+        tv = (vector unsigned char *) &s[0 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
+        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
+        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
+
+        tv = (vector unsigned char *) &s[1 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
+        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
+        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
+
+        tv = (vector unsigned char *) &s[2 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
+        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
+        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
+
+        tv = (vector unsigned char *) &s[3 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
+        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
+        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
+
+        /*
+           Pack the results into our destination vector,
+           and do an aligned write of that back to memory.
+        */
+        dstv = vec_packsu(sumhv, sumlv) ;
+        vec_st(dstv, 0, (vector unsigned char *) dst);
+
+        dst+=16;
+        s+=16;
+        dst_width-=16;
+    }
+
+    /*
+       If there are any leftover pixels, resample them
+       with the slow scalar method.
+    */
+    while(dst_width>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+    }
+}
@@ -0,0 +1,24 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef IMGRESAMPLE_ALTIVEC_H
+#define IMGRESAMPLE_ALTIVEC_H
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+                          int wrap, int16_t *filter);
+#endif /* IMGRESAMPLE_ALTIVEC_H */
@@ -28,8 +28,8 @@
 #include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"

 // Swaps two variables (used for altivec registers)
 #define SWAP(a,b) \
 do { \
@@ -0,0 +1,106 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file util_altivec.h
+ * Contains misc utility macros and inline functions
+ */
+
+#ifndef UTIL_ALTIVEC_H
+#define UTIL_ALTIVEC_H
+
+// used to build registers permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#ifdef __APPLE_CC__
+#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+#else
+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
+#endif
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it's the same as vcprm, with the index inversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+#ifdef __APPLE_CC__
+#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+#else
+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
+#endif
+
+// Transpose 8x8 matrix of 16-bit elements (in-place)
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+    A1 = vec_mergeh (a, e); \
+    B1 = vec_mergel (a, e); \
+    C1 = vec_mergeh (b, f); \
+    D1 = vec_mergel (b, f); \
+    E1 = vec_mergeh (c, g); \
+    F1 = vec_mergel (c, g); \
+    G1 = vec_mergeh (d, h); \
+    H1 = vec_mergel (d, h); \
+ \
+    A2 = vec_mergeh (A1, E1); \
+    B2 = vec_mergel (A1, E1); \
+    C2 = vec_mergeh (B1, F1); \
+    D2 = vec_mergel (B1, F1); \
+    E2 = vec_mergeh (C1, G1); \
+    F2 = vec_mergel (C1, G1); \
+    G2 = vec_mergeh (D1, H1); \
+    H2 = vec_mergel (D1, H1); \
+ \
+    a = vec_mergeh (A2, E2); \
+    b = vec_mergel (A2, E2); \
+    c = vec_mergeh (B2, F2); \
+    d = vec_mergel (B2, F2); \
+    e = vec_mergeh (C2, G2); \
+    f = vec_mergel (C2, G2); \
+    g = vec_mergeh (D2, H2); \
+    h = vec_mergel (D2, H2); \
+} while (0)
+
+/** \brief loads unaligned vector \a *src with offset \a offset
+    and returns it */
+static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
+{
+    register vector unsigned char first = vec_ld(offset, src);
+    register vector unsigned char second = vec_ld(offset+15, src);
+    register vector unsigned char mask = vec_lvsl(offset, src);
+    return vec_perm(first, second, mask);
+}
+
+#endif /* UTIL_ALTIVEC_H */
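A minimal usage sketch for the unaligned_load() helper defined in the new header, assuming a PowerPC build with -maltivec and with util_altivec.h on the include path; the function name and the alignment contract on the destination are illustrative, not part of this commit.

#include <stdint.h>
#include <altivec.h>
#include "util_altivec.h"

/* Copy 16 bytes from a possibly unaligned src to a 16-byte aligned dst.
   unaligned_load() reads the two aligned quadwords straddling src and
   merges them with vec_perm()/vec_lvsl(), so src may sit at any offset. */
static void copy16_from_unaligned(uint8_t *dst_aligned, uint8_t *src)
{
    vector unsigned char v = unaligned_load(0, src);
    vec_st(v, 0, dst_aligned);   /* aligned store to dst */
}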
@@ -23,7 +23,7 @@
 #include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "util_altivec.h"

 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \