This functionality is better accessed through tools like oprofile. Originally committed as revision 23808 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags: n0.8)
| @@ -211,8 +211,6 @@ Advanced options (experts only): | |||||
| --arch=ARCH select architecture [$arch] | --arch=ARCH select architecture [$arch] | ||||
| --cpu=CPU select the minimum required CPU (affects | --cpu=CPU select the minimum required CPU (affects | ||||
| instruction selection, may crash on older CPUs) | instruction selection, may crash on older CPUs) | ||||
| --enable-powerpc-perf enable performance report on PPC | |||||
| (requires enabling PMC) | |||||
| --disable-asm disable all assembler optimizations | --disable-asm disable all assembler optimizations | ||||
| --disable-altivec disable AltiVec optimizations | --disable-altivec disable AltiVec optimizations | ||||
| --disable-amd3dnow disable 3DNow! optimizations | --disable-amd3dnow disable 3DNow! optimizations | ||||
| @@ -886,7 +884,6 @@ CONFIG_LIST=" | |||||
| nonfree | nonfree | ||||
| pic | pic | ||||
| postproc | postproc | ||||
| powerpc_perf | |||||
| rdft | rdft | ||||
| runtime_cpudetect | runtime_cpudetect | ||||
| shared | shared | ||||
| @@ -2772,7 +2769,6 @@ if enabled ppc; then | |||||
| echo "AltiVec enabled ${altivec-no}" | echo "AltiVec enabled ${altivec-no}" | ||||
| echo "PPC 4xx optimizations ${ppc4xx-no}" | echo "PPC 4xx optimizations ${ppc4xx-no}" | ||||
| echo "dcbzl available ${dcbzl-no}" | echo "dcbzl available ${dcbzl-no}" | ||||
| echo "performance report ${powerpc_perf-no}" | |||||
| fi | fi | ||||
| if enabled sparc; then | if enabled sparc; then | ||||
| echo "VIS enabled ${vis-no}" | echo "VIS enabled ${vis-no}" | ||||
| @@ -1,172 +0,0 @@ | |||||
| FFmpeg & evaluating performance on the PowerPC Architecture HOWTO | |||||
| (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> | |||||
| I - Introduction | |||||
| The PowerPC architecture and its SIMD extension AltiVec offer some | |||||
| interesting tools to evaluate performance and improve the code. | |||||
| This document tries to explain how to use those tools with FFmpeg. | |||||
| The architecture itself offers two ways to evaluate the performance of | |||||
| a given piece of code: | |||||
| 1) The Time Base Registers (TBL) | |||||
| 2) The Performance Monitor Counter Registers (PMC) | |||||
| The first ones are always available, always active, but they're not very | |||||
| accurate: the registers increment by one every four *bus* cycles. On | |||||
| my 667 MHz tiBook (ppc7450), this means once every twenty *processor* | |||||
| cycles. So we won't use that. | |||||
| The PMC are much more useful: not only can they report cycle-accurate | |||||
| timing, but they can also be used to monitor many other parameters, | |||||
| such as the number of AltiVec stalls for every kind of instruction, | |||||
| or instruction cache misses. The downside is that not all processors | |||||
| support the PMC (all G3, all G4 and the 970 do support them), and | |||||
| they're inactive by default - you need to activate them with a | |||||
| dedicated tool. Also, the number of available PMC depends on the | |||||
| processor: the various 604 have 2, the various 75x (aka G3) have 4, | |||||
| and the various 74xx (aka G4) have 6. | |||||
| *WARNING*: The PowerPC 970 is not very well documented, and its PMC | |||||
| registers are 64 bits wide. To properly notify the code, you *must* | |||||
| tune for the 970 (using --tune=970), or the code will assume 32 bit | |||||
| registers. | |||||
| II - Enabling FFmpeg PowerPC performance support | |||||
| This needs to be done by hand. First, you need to configure FFmpeg as | |||||
| usual, but add the "--enable-powerpc-perf" option. For instance: | |||||
| ##### | |||||
| ./configure --prefix=/usr/local/ffmpeg-svn --cc=gcc-3.3 --tune=7450 --enable-powerpc-perf | |||||
| ##### | |||||
| This will configure FFmpeg to install inside /usr/local/ffmpeg-svn, | |||||
| compiling with gcc-3.3 (you should try to use this one or a newer | |||||
| gcc), and tuning for the PowerPC 7450 (i.e. the newer G4; as a rule of | |||||
| thumb, those at 550 MHz and more). It will also enable the PMC. | |||||
| You may also edit the file "config.h" to enable the following line: | |||||
| ##### | |||||
| // #define ALTIVEC_USE_REFERENCE_C_CODE 1 | |||||
| ##### | |||||
| If you enable this line, then the code will not make use of AltiVec, | |||||
| but will use the reference C code instead. This is useful to compare | |||||
| performance between two versions of the code. | |||||
| Also, the number of enabled PMC is defined in "libavcodec/ppc/dsputil_ppc.h": | |||||
| ##### | |||||
| #define POWERPC_NUM_PMC_ENABLED 4 | |||||
| ##### | |||||
| If you have a G4 CPU, you can enable all 6 PMC. DO NOT enable more | |||||
| PMC than available on your CPU! | |||||
| Then, simply compile FFmpeg as usual (make && make install). | |||||
| III - Using FFmpeg PowerPC performance support | |||||
| This FFmpeg build can be used exactly as usual. But before exiting, FFmpeg | |||||
| will dump a per-function report that looks like this: | |||||
| ##### | |||||
| PowerPC performance report | |||||
| Values are from the PMC registers, and represent whatever the | |||||
| registers are set to record. | |||||
| Function "gmc1_altivec" (pmc1): | |||||
| min: 231 | |||||
| max: 1339867 | |||||
| avg: 558.25 (255302) | |||||
| Function "gmc1_altivec" (pmc2): | |||||
| min: 93 | |||||
| max: 2164 | |||||
| avg: 267.31 (255302) | |||||
| Function "gmc1_altivec" (pmc3): | |||||
| min: 72 | |||||
| max: 1987 | |||||
| avg: 276.20 (255302) | |||||
| (...) | |||||
| ##### | |||||
| In this example, PMC1 was set to record CPU cycles, PMC2 was set to | |||||
| record AltiVec Permute Stall Cycles, and PMC3 was set to record AltiVec | |||||
| Issue Stalls. | |||||
| The function "gmc1_altivec" was monitored 255302 times, and the | |||||
| minimum execution time was 231 processor cycles. The max and average | |||||
| aren't much use, as it's very likely the OS interrupted execution for | |||||
| reasons of its own :-( | |||||
| With the exact same settings and source file, but using the reference C | |||||
| code we get: | |||||
| ##### | |||||
| PowerPC performance report | |||||
| Values are from the PMC registers, and represent whatever the | |||||
| registers are set to record. | |||||
| Function "gmc1_altivec" (pmc1): | |||||
| min: 592 | |||||
| max: 2532235 | |||||
| avg: 962.88 (255302) | |||||
| Function "gmc1_altivec" (pmc2): | |||||
| min: 0 | |||||
| max: 33 | |||||
| avg: 0.00 (255302) | |||||
| Function "gmc1_altivec" (pmc3): | |||||
| min: 0 | |||||
| max: 350 | |||||
| avg: 0.03 (255302) | |||||
| (...) | |||||
| ##### | |||||
| 592 cycles, so the fastest AltiVec execution is about 2.5x faster than | |||||
| the fastest C execution in this example. It's not perfect but it's not | |||||
| bad (well I wrote this function so I can't say otherwise :-). | |||||
| Once you have that kind of report, you can try to improve things by | |||||
| finding what goes wrong and fixing it; in the example above, one | |||||
| should try to diminish the number of AltiVec stalls, as this *may* | |||||
| improve performance. | |||||
| IV) Enabling the PMC in Mac OS X | |||||
| This is easy. Use "MONster" and "monster". Those tools come from | |||||
| Apple's CHUD package, and can be found hidden in the developer web | |||||
| site & FTP site. "MONster" is the graphical application, use it to | |||||
| generate a config file specifying what each register should | |||||
| monitor. Then use the command-line application "monster" to use that | |||||
| config file, and enjoy the results. | |||||
| Note that "MONster" can be used for many other things, but it's | |||||
| documented by Apple, it's not my subject. | |||||
| If you are using CHUD 4.4.2 or later, you'll notice that MONster is | |||||
| no longer available. It's been superseded by Shark, where | |||||
| configuration of PMCs is available as a plugin. | |||||
| V) Enabling the PMC on Linux | |||||
| On Linux you may use oprofile from http://oprofile.sf.net. Depending on the | |||||
| version and the CPU, you may need to apply a patch[1] to access a set of the | |||||
| possible counters from the userspace application. You can always define them | |||||
| using the kernel interface /dev/oprofile/* . | |||||
| [1] http://dev.gentoo.org/~lu_zero/development/oprofile-g4-20060423.patch | |||||
| -- | |||||
| Romain Dolbeau <romain@dolbeau.org> | |||||
| Luca Barbato <lu_zero@gentoo.org> | |||||
| @@ -618,11 +618,6 @@ static int av_exit(int ret) | |||||
| av_free(video_standard); | av_free(video_standard); | ||||
| #if CONFIG_POWERPC_PERF | |||||
| void powerpc_display_perf_report(void); | |||||
| powerpc_display_perf_report(); | |||||
| #endif /* CONFIG_POWERPC_PERF */ | |||||
| for (i=0;i<AVMEDIA_TYPE_NB;i++) | for (i=0;i<AVMEDIA_TYPE_NB;i++) | ||||
| av_free(avcodec_opts[i]); | av_free(avcodec_opts[i]); | ||||
| av_free(avformat_opts); | av_free(avformat_opts); | ||||
| @@ -25,7 +25,6 @@ | |||||
| #include <altivec.h> | #include <altivec.h> | ||||
| #endif | #endif | ||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "util_altivec.h" | #include "util_altivec.h" | ||||
| #include "types_altivec.h" | #include "types_altivec.h" | ||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| @@ -610,7 +609,6 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { | |||||
| /* next one assumes that ((line_size % 16) == 0) */ | /* next one assumes that ((line_size % 16) == 0) */ | ||||
| void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); | |||||
| register vector unsigned char pixelsv1, pixelsv2; | register vector unsigned char pixelsv1, pixelsv2; | ||||
| register vector unsigned char pixelsv1B, pixelsv2B; | register vector unsigned char pixelsv1B, pixelsv2B; | ||||
| register vector unsigned char pixelsv1C, pixelsv2C; | register vector unsigned char pixelsv1C, pixelsv2C; | ||||
| @@ -622,7 +620,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); | |||||
| register int line_size_3 = line_size + line_size_2; | register int line_size_3 = line_size + line_size_2; | ||||
| register int line_size_4 = line_size << 2; | register int line_size_4 = line_size << 2; | ||||
| POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); | |||||
| // hand-unrolling the loop by 4 gains about 15% | // hand-unrolling the loop by 4 gains about 15% | ||||
| // minimum execution time goes from 74 to 60 cycles | // minimum execution time goes from 74 to 60 cycles | ||||
| // it's faster than -funroll-loops, but using | // it's faster than -funroll-loops, but using | ||||
| @@ -659,20 +656,16 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); | |||||
| block +=line_size_4; | block +=line_size_4; | ||||
| } | } | ||||
| #endif | #endif | ||||
| POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 16) == 0) */ | /* next one assumes that ((line_size % 16) == 0) */ | ||||
| #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | ||||
| void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); | |||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | ||||
| register vector unsigned char perm = vec_lvsl(0, pixels); | register vector unsigned char perm = vec_lvsl(0, pixels); | ||||
| int i; | int i; | ||||
| POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); | |||||
| for (i = 0; i < h; i++) { | for (i = 0; i < h; i++) { | ||||
| pixelsv1 = vec_ld( 0, pixels); | pixelsv1 = vec_ld( 0, pixels); | ||||
| pixelsv2 = vec_ld(16,pixels); | pixelsv2 = vec_ld(16,pixels); | ||||
| @@ -683,19 +676,14 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); | |||||
| pixels+=line_size; | pixels+=line_size; | ||||
| block +=line_size; | block +=line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 8) == 0) */ | /* next one assumes that ((line_size % 8) == 0) */ | ||||
| static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); | |||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | ||||
| int i; | int i; | ||||
| POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); | |||||
| for (i = 0; i < h; i++) { | for (i = 0; i < h; i++) { | ||||
| /* block is 8 bytes-aligned, so we're either in the | /* block is 8 bytes-aligned, so we're either in the | ||||
| left block (16 bytes-aligned) or in the right block (not) */ | left block (16 bytes-aligned) or in the right block (not) */ | ||||
| @@ -719,14 +707,11 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); | |||||
| pixels += line_size; | pixels += line_size; | ||||
| block += line_size; | block += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 8) == 0) */ | /* next one assumes that ((line_size % 8) == 0) */ | ||||
| static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); | |||||
| register int i; | register int i; | ||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | ||||
| register vector unsigned char blockv, temp1, temp2; | register vector unsigned char blockv, temp1, temp2; | ||||
| @@ -748,7 +733,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); | |||||
| (vector unsigned short)pixelsv2); | (vector unsigned short)pixelsv2); | ||||
| pixelssum1 = vec_add(pixelssum1, vctwo); | pixelssum1 = vec_add(pixelssum1, vctwo); | ||||
| POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); | |||||
| for (i = 0; i < h ; i++) { | for (i = 0; i < h ; i++) { | ||||
| int rightside = ((unsigned long)block & 0x0000000F); | int rightside = ((unsigned long)block & 0x0000000F); | ||||
| blockv = vec_ld(0, block); | blockv = vec_ld(0, block); | ||||
| @@ -782,14 +766,11 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); | |||||
| block += line_size; | block += line_size; | ||||
| pixels += line_size; | pixels += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 8) == 0) */ | /* next one assumes that ((line_size % 8) == 0) */ | ||||
| static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); | |||||
| register int i; | register int i; | ||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | ||||
| register vector unsigned char blockv, temp1, temp2; | register vector unsigned char blockv, temp1, temp2; | ||||
| @@ -812,7 +793,6 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); | |||||
| (vector unsigned short)pixelsv2); | (vector unsigned short)pixelsv2); | ||||
| pixelssum1 = vec_add(pixelssum1, vcone); | pixelssum1 = vec_add(pixelssum1, vcone); | ||||
| POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |||||
| for (i = 0; i < h ; i++) { | for (i = 0; i < h ; i++) { | ||||
| int rightside = ((unsigned long)block & 0x0000000F); | int rightside = ((unsigned long)block & 0x0000000F); | ||||
| blockv = vec_ld(0, block); | blockv = vec_ld(0, block); | ||||
| @@ -846,14 +826,11 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |||||
| block += line_size; | block += line_size; | ||||
| pixels += line_size; | pixels += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 16) == 0) */ | /* next one assumes that ((line_size % 16) == 0) */ | ||||
| static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); | |||||
| register int i; | register int i; | ||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | ||||
| register vector unsigned char blockv, temp1, temp2; | register vector unsigned char blockv, temp1, temp2; | ||||
| @@ -862,8 +839,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); | |||||
| register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | ||||
| register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | ||||
| POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |||||
| temp1 = vec_ld(0, pixels); | temp1 = vec_ld(0, pixels); | ||||
| temp2 = vec_ld(16, pixels); | temp2 = vec_ld(16, pixels); | ||||
| pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | ||||
| @@ -919,14 +894,11 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |||||
| block += line_size; | block += line_size; | ||||
| pixels += line_size; | pixels += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |||||
| } | } | ||||
| /* next one assumes that ((line_size % 16) == 0) */ | /* next one assumes that ((line_size % 16) == 0) */ | ||||
| static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); | |||||
| register int i; | register int i; | ||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | ||||
| register vector unsigned char blockv, temp1, temp2; | register vector unsigned char blockv, temp1, temp2; | ||||
| @@ -936,8 +908,6 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); | |||||
| register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | ||||
| register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | ||||
| POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |||||
| temp1 = vec_ld(0, pixels); | temp1 = vec_ld(0, pixels); | ||||
| temp2 = vec_ld(16, pixels); | temp2 = vec_ld(16, pixels); | ||||
| pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | ||||
| @@ -993,18 +963,14 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |||||
| block += line_size; | block += line_size; | ||||
| pixels += line_size; | pixels += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |||||
| } | } | ||||
| static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | ||||
| POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); | |||||
| int sum; | int sum; | ||||
| register const vector unsigned char vzero = | register const vector unsigned char vzero = | ||||
| (const vector unsigned char)vec_splat_u8(0); | (const vector unsigned char)vec_splat_u8(0); | ||||
| register vector signed short temp0, temp1, temp2, temp3, temp4, | register vector signed short temp0, temp1, temp2, temp3, temp4, | ||||
| temp5, temp6, temp7; | temp5, temp6, temp7; | ||||
| POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); | |||||
| { | { | ||||
| register const vector signed short vprod1 =(const vector signed short) | register const vector signed short vprod1 =(const vector signed short) | ||||
| { 1,-1, 1,-1, 1,-1, 1,-1 }; | { 1,-1, 1,-1, 1,-1, 1,-1 }; | ||||
| @@ -1100,7 +1066,6 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); | |||||
| vsum = vec_splat(vsum, 3); | vsum = vec_splat(vsum, 3); | ||||
| vec_ste(vsum, 0, &sum); | vec_ste(vsum, 0, &sum); | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); | |||||
| return sum; | return sum; | ||||
| } | } | ||||
| @@ -1319,16 +1284,13 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, | |||||
| } | } | ||||
| static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ | ||||
| POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); | |||||
| int score; | int score; | ||||
| POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); | |||||
| score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | ||||
| if (h==16) { | if (h==16) { | ||||
| dst += 8*stride; | dst += 8*stride; | ||||
| src += 8*stride; | src += 8*stride; | ||||
| score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); | |||||
| return score; | return score; | ||||
| } | } | ||||
| @@ -1358,7 +1320,6 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang, | |||||
| /* next one assumes that ((line_size % 8) == 0) */ | /* next one assumes that ((line_size % 8) == 0) */ | ||||
| static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); | |||||
| register int i; | register int i; | ||||
| register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | ||||
| register vector unsigned char blockv, temp1, temp2, blocktemp; | register vector unsigned char blockv, temp1, temp2, blocktemp; | ||||
| @@ -1383,7 +1344,6 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); | |||||
| (vector unsigned short)pixelsv2); | (vector unsigned short)pixelsv2); | ||||
| pixelssum1 = vec_add(pixelssum1, vctwo); | pixelssum1 = vec_add(pixelssum1, vctwo); | ||||
| POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); | |||||
| for (i = 0; i < h ; i++) { | for (i = 0; i < h ; i++) { | ||||
| int rightside = ((unsigned long)block & 0x0000000F); | int rightside = ((unsigned long)block & 0x0000000F); | ||||
| blockv = vec_ld(0, block); | blockv = vec_ld(0, block); | ||||
| @@ -1418,8 +1378,6 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); | |||||
| block += line_size; | block += line_size; | ||||
| pixels += line_size; | pixels += line_size; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); | |||||
| } | } | ||||
| void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) | void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) | ||||
| @@ -21,9 +21,6 @@ | |||||
| */ | */ | ||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| int mm_flags = 0; | int mm_flags = 0; | ||||
| @@ -39,63 +36,6 @@ int mm_support(void) | |||||
| return result; | return result; | ||||
| } | } | ||||
| #if CONFIG_POWERPC_PERF | |||||
| unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; | |||||
| /* list below must match enum in dsputil_ppc.h */ | |||||
| static unsigned char* perfname[] = { | |||||
| "ff_fft_calc_altivec", | |||||
| "gmc1_altivec", | |||||
| "dct_unquantize_h263_altivec", | |||||
| "fdct_altivec", | |||||
| "idct_add_altivec", | |||||
| "idct_put_altivec", | |||||
| "put_pixels16_altivec", | |||||
| "avg_pixels16_altivec", | |||||
| "avg_pixels8_altivec", | |||||
| "put_pixels8_xy2_altivec", | |||||
| "put_no_rnd_pixels8_xy2_altivec", | |||||
| "put_pixels16_xy2_altivec", | |||||
| "put_no_rnd_pixels16_xy2_altivec", | |||||
| "hadamard8_diff8x8_altivec", | |||||
| "hadamard8_diff16_altivec", | |||||
| "avg_pixels8_xy2_altivec", | |||||
| "clear_blocks_dcbz32_ppc", | |||||
| "clear_blocks_dcbz128_ppc", | |||||
| "put_h264_chroma_mc8_altivec", | |||||
| "avg_h264_chroma_mc8_altivec", | |||||
| "put_h264_qpel16_h_lowpass_altivec", | |||||
| "avg_h264_qpel16_h_lowpass_altivec", | |||||
| "put_h264_qpel16_v_lowpass_altivec", | |||||
| "avg_h264_qpel16_v_lowpass_altivec", | |||||
| "put_h264_qpel16_hv_lowpass_altivec", | |||||
| "avg_h264_qpel16_hv_lowpass_altivec", | |||||
| "" | |||||
| }; | |||||
| #include <stdio.h> | |||||
| #endif | |||||
| #if CONFIG_POWERPC_PERF | |||||
| void powerpc_display_perf_report(void) | |||||
| { | |||||
| int i, j; | |||||
| av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); | |||||
| for(i = 0 ; i < powerpc_perf_total ; i++) { | |||||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { | |||||
| if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) | |||||
| av_log(NULL, AV_LOG_INFO, | |||||
| " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", | |||||
| perfname[i], | |||||
| j+1, | |||||
| perfdata[j][i][powerpc_data_min], | |||||
| perfdata[j][i][powerpc_data_max], | |||||
| (double)perfdata[j][i][powerpc_data_sum] / | |||||
| (double)perfdata[j][i][powerpc_data_num], | |||||
| perfdata[j][i][powerpc_data_num]); | |||||
| } | |||||
| } | |||||
| } | |||||
| #endif /* CONFIG_POWERPC_PERF */ | |||||
| /* ***** WARNING ***** WARNING ***** WARNING ***** */ | /* ***** WARNING ***** WARNING ***** WARNING ***** */ | ||||
| /* | /* | ||||
| clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a | clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a | ||||
| @@ -118,10 +58,8 @@ and <http://developer.apple.com/technotes/tn/tn2086.html> | |||||
| */ | */ | ||||
| static void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | static void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); | |||||
| register int misal = ((unsigned long)blocks & 0x00000010); | register int misal = ((unsigned long)blocks & 0x00000010); | ||||
| register int i = 0; | register int i = 0; | ||||
| POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); | |||||
| #if 1 | #if 1 | ||||
| if (misal) { | if (misal) { | ||||
| ((unsigned long*)blocks)[0] = 0L; | ((unsigned long*)blocks)[0] = 0L; | ||||
| @@ -143,7 +81,6 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); | |||||
| #else | #else | ||||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | memset(blocks, 0, sizeof(DCTELEM)*6*64); | ||||
| #endif | #endif | ||||
| POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); | |||||
| } | } | ||||
| /* same as above, when dcbzl clear a whole 128B cache line | /* same as above, when dcbzl clear a whole 128B cache line | ||||
| @@ -151,10 +88,8 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); | |||||
| #if HAVE_DCBZL | #if HAVE_DCBZL | ||||
| static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); | |||||
| register int misal = ((unsigned long)blocks & 0x0000007f); | register int misal = ((unsigned long)blocks & 0x0000007f); | ||||
| register int i = 0; | register int i = 0; | ||||
| POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); | |||||
| #if 1 | #if 1 | ||||
| if (misal) { | if (misal) { | ||||
| // we could probably also optimize this case, | // we could probably also optimize this case, | ||||
| @@ -169,7 +104,6 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); | |||||
| #else | #else | ||||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | memset(blocks, 0, sizeof(DCTELEM)*6*64); | ||||
| #endif | #endif | ||||
| POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); | |||||
| } | } | ||||
| #else | #else | ||||
| static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | ||||
| @@ -279,19 +213,6 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) | |||||
| } | } | ||||
| } | } | ||||
| #if CONFIG_POWERPC_PERF | |||||
| { | |||||
| int i, j; | |||||
| for (i = 0 ; i < powerpc_perf_total ; i++) { | |||||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { | |||||
| perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; | |||||
| perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; | |||||
| perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; | |||||
| perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; | |||||
| } | |||||
| } | |||||
| } | |||||
| #endif /* CONFIG_POWERPC_PERF */ | |||||
| } | } | ||||
| #endif /* HAVE_ALTIVEC */ | #endif /* HAVE_ALTIVEC */ | ||||
| } | } | ||||
| @@ -1,154 +0,0 @@ | |||||
| /* | |||||
| * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> | |||||
| * | |||||
| * This file is part of FFmpeg. | |||||
| * | |||||
| * FFmpeg is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * FFmpeg is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with FFmpeg; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #ifndef AVCODEC_PPC_DSPUTIL_PPC_H | |||||
| #define AVCODEC_PPC_DSPUTIL_PPC_H | |||||
| #include "config.h" | |||||
| #if CONFIG_POWERPC_PERF | |||||
| void powerpc_display_perf_report(void); | |||||
| /* the 604* have 2, the G3* have 4, the G4s have 6, | |||||
| and the G5 are completely different (they MUST use | |||||
| ARCH_PPC64, and let's hope all future 64-bit PPC | |||||
| will use the same PMCs...) */ | |||||
| #define POWERPC_NUM_PMC_ENABLED 6 | |||||
| /* if you add to the enum below, also add to the perfname array | |||||
| in dsputil_ppc.c */ | |||||
| enum powerpc_perf_index { | |||||
| altivec_fft_num = 0, | |||||
| altivec_gmc1_num, | |||||
| altivec_dct_unquantize_h263_num, | |||||
| altivec_fdct, | |||||
| altivec_idct_add_num, | |||||
| altivec_idct_put_num, | |||||
| altivec_put_pixels16_num, | |||||
| altivec_avg_pixels16_num, | |||||
| altivec_avg_pixels8_num, | |||||
| altivec_put_pixels8_xy2_num, | |||||
| altivec_put_no_rnd_pixels8_xy2_num, | |||||
| altivec_put_pixels16_xy2_num, | |||||
| altivec_put_no_rnd_pixels16_xy2_num, | |||||
| altivec_hadamard8_diff8x8_num, | |||||
| altivec_hadamard8_diff16_num, | |||||
| altivec_avg_pixels8_xy2_num, | |||||
| powerpc_clear_blocks_dcbz32, | |||||
| powerpc_clear_blocks_dcbz128, | |||||
| altivec_put_h264_chroma_mc8_num, | |||||
| altivec_avg_h264_chroma_mc8_num, | |||||
| altivec_put_h264_qpel16_h_lowpass_num, | |||||
| altivec_avg_h264_qpel16_h_lowpass_num, | |||||
| altivec_put_h264_qpel16_v_lowpass_num, | |||||
| altivec_avg_h264_qpel16_v_lowpass_num, | |||||
| altivec_put_h264_qpel16_hv_lowpass_num, | |||||
| altivec_avg_h264_qpel16_hv_lowpass_num, | |||||
| powerpc_perf_total | |||||
| }; | |||||
| enum powerpc_data_index { | |||||
| powerpc_data_min = 0, | |||||
| powerpc_data_max, | |||||
| powerpc_data_sum, | |||||
| powerpc_data_num, | |||||
| powerpc_data_total | |||||
| }; | |||||
| extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; | |||||
| #if !ARCH_PPC64 | |||||
| #define POWERP_PMC_DATATYPE unsigned long | |||||
| #define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a)) | |||||
| #if (POWERPC_NUM_PMC_ENABLED > 2) | |||||
| #define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a)) | |||||
| #else | |||||
| #define POWERPC_GET_PMC3(a) do {} while (0) | |||||
| #define POWERPC_GET_PMC4(a) do {} while (0) | |||||
| #endif | |||||
| #if (POWERPC_NUM_PMC_ENABLED > 4) | |||||
| #define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a)) | |||||
| #else | |||||
| #define POWERPC_GET_PMC5(a) do {} while (0) | |||||
| #define POWERPC_GET_PMC6(a) do {} while (0) | |||||
| #endif | |||||
| #else /* ARCH_PPC64 */ | |||||
| #define POWERP_PMC_DATATYPE unsigned long long | |||||
| #define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a)) | |||||
| #if (POWERPC_NUM_PMC_ENABLED > 2) | |||||
| #define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a)) | |||||
| #else | |||||
| #define POWERPC_GET_PMC3(a) do {} while (0) | |||||
| #define POWERPC_GET_PMC4(a) do {} while (0) | |||||
| #endif | |||||
| #if (POWERPC_NUM_PMC_ENABLED > 4) | |||||
| #define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a)) | |||||
| #define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a)) | |||||
| #else | |||||
| #define POWERPC_GET_PMC5(a) do {} while (0) | |||||
| #define POWERPC_GET_PMC6(a) do {} while (0) | |||||
| #endif | |||||
| #endif /* ARCH_PPC64 */ | |||||
| #define POWERPC_PERF_DECLARE(a, cond) \ | |||||
| POWERP_PMC_DATATYPE \ | |||||
| pmc_start[POWERPC_NUM_PMC_ENABLED], \ | |||||
| pmc_stop[POWERPC_NUM_PMC_ENABLED], \ | |||||
| pmc_loop_index; | |||||
| #define POWERPC_PERF_START_COUNT(a, cond) do { \ | |||||
| POWERPC_GET_PMC6(pmc_start[5]); \ | |||||
| POWERPC_GET_PMC5(pmc_start[4]); \ | |||||
| POWERPC_GET_PMC4(pmc_start[3]); \ | |||||
| POWERPC_GET_PMC3(pmc_start[2]); \ | |||||
| POWERPC_GET_PMC2(pmc_start[1]); \ | |||||
| POWERPC_GET_PMC1(pmc_start[0]); \ | |||||
| } while (0) | |||||
| #define POWERPC_PERF_STOP_COUNT(a, cond) do { \ | |||||
| POWERPC_GET_PMC1(pmc_stop[0]); \ | |||||
| POWERPC_GET_PMC2(pmc_stop[1]); \ | |||||
| POWERPC_GET_PMC3(pmc_stop[2]); \ | |||||
| POWERPC_GET_PMC4(pmc_stop[3]); \ | |||||
| POWERPC_GET_PMC5(pmc_stop[4]); \ | |||||
| POWERPC_GET_PMC6(pmc_stop[5]); \ | |||||
| if (cond) { \ | |||||
| for(pmc_loop_index = 0; \ | |||||
| pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ | |||||
| pmc_loop_index++) { \ | |||||
| if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ | |||||
| POWERP_PMC_DATATYPE diff = \ | |||||
| pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ | |||||
| if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ | |||||
| perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ | |||||
| if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ | |||||
| perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ | |||||
| perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ | |||||
| perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ | |||||
| } \ | |||||
| } \ | |||||
| } \ | |||||
| } while (0) | |||||
| #else /* CONFIG_POWERPC_PERF */ | |||||
| // those are needed to avoid empty statements. | |||||
| #define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) | |||||
| #define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) | |||||
| #define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) | |||||
| #endif /* CONFIG_POWERPC_PERF */ | |||||
| #endif /* AVCODEC_PPC_DSPUTIL_PPC_H */ | |||||
| @@ -24,7 +24,6 @@ | |||||
| #endif | #endif | ||||
| #include "libavutil/common.h" | #include "libavutil/common.h" | ||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| #define vs16(v) ((vector signed short)(v)) | #define vs16(v) ((vector signed short)(v)) | ||||
| @@ -198,7 +197,6 @@ static vector float fdctconsts[3] = { | |||||
| void fdct_altivec(int16_t *block) | void fdct_altivec(int16_t *block) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_fdct, 1); | |||||
| vector signed short *bp; | vector signed short *bp; | ||||
| vector float *cp; | vector float *cp; | ||||
| vector float b00, b10, b20, b30, b40, b50, b60, b70; | vector float b00, b10, b20, b30, b40, b50, b60, b70; | ||||
| @@ -206,9 +204,6 @@ POWERPC_PERF_DECLARE(altivec_fdct, 1); | |||||
| vector float mzero, cnst, cnsts0, cnsts1, cnsts2; | vector float mzero, cnst, cnsts0, cnsts1, cnsts2; | ||||
| vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; | vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; | ||||
| POWERPC_PERF_START_COUNT(altivec_fdct, 1); | |||||
| /* setup constants {{{ */ | /* setup constants {{{ */ | ||||
| /* mzero = -0.0 */ | /* mzero = -0.0 */ | ||||
| mzero = ((vector float)vec_splat_u32(-1)); | mzero = ((vector float)vec_splat_u32(-1)); | ||||
| @@ -487,8 +482,6 @@ POWERPC_PERF_DECLARE(altivec_fdct, 1); | |||||
| #undef CTS | #undef CTS | ||||
| /* }}} */ | /* }}} */ | ||||
| POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); | |||||
| } | } | ||||
| /* vim:set foldmethod=marker foldlevel=0: */ | /* vim:set foldmethod=marker foldlevel=0: */ | ||||
| @@ -21,7 +21,6 @@ | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
| */ | */ | ||||
| #include "libavcodec/fft.h" | #include "libavcodec/fft.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "util_altivec.h" | #include "util_altivec.h" | ||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| @@ -38,7 +37,6 @@ | |||||
| */ | */ | ||||
| static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) | static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); | |||||
| register const vector float vczero = (const vector float)vec_splat_u32(0.); | register const vector float vczero = (const vector float)vec_splat_u32(0.); | ||||
| int ln = s->nbits; | int ln = s->nbits; | ||||
| @@ -48,8 +46,6 @@ POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); | |||||
| FFTComplex *cptr, *cptr1; | FFTComplex *cptr, *cptr1; | ||||
| int k; | int k; | ||||
| POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); | |||||
| np = 1 << ln; | np = 1 << ln; | ||||
| { | { | ||||
| @@ -132,8 +128,6 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); | |||||
| nblocks = nblocks >> 1; | nblocks = nblocks >> 1; | ||||
| nloops = nloops << 1; | nloops = nloops << 1; | ||||
| } while (nblocks != 0); | } while (nblocks != 0); | ||||
| POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); | |||||
| } | } | ||||
| av_cold void ff_fft_init_altivec(FFTContext *s) | av_cold void ff_fft_init_altivec(FFTContext *s) | ||||
| @@ -21,7 +21,6 @@ | |||||
| */ | */ | ||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "util_altivec.h" | #include "util_altivec.h" | ||||
| #include "types_altivec.h" | #include "types_altivec.h" | ||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| @@ -30,10 +29,8 @@ | |||||
| altivec-enhanced gmc1. ATM this code assumes stride is a multiple of 8, | altivec-enhanced gmc1. ATM this code assumes stride is a multiple of 8, | ||||
| to preserve proper dst alignment. | to preserve proper dst alignment. | ||||
| */ | */ | ||||
| #define GMC1_PERF_COND (h==8) | |||||
| void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) | void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); | |||||
| const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; | const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; | ||||
| const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = | const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = | ||||
| { | { | ||||
| @@ -51,9 +48,6 @@ POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); | |||||
| unsigned long dst_odd = (unsigned long)dst & 0x0000000F; | unsigned long dst_odd = (unsigned long)dst & 0x0000000F; | ||||
| unsigned long src_really_odd = (unsigned long)src & 0x0000000F; | unsigned long src_really_odd = (unsigned long)src & 0x0000000F; | ||||
| POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |||||
| tempA = vec_ld(0, (unsigned short*)ABCD); | tempA = vec_ld(0, (unsigned short*)ABCD); | ||||
| Av = vec_splat(tempA, 0); | Av = vec_splat(tempA, 0); | ||||
| Bv = vec_splat(tempA, 1); | Bv = vec_splat(tempA, 1); | ||||
| @@ -133,6 +127,4 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |||||
| dst += stride; | dst += stride; | ||||
| src += stride; | src += stride; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |||||
| } | } | ||||
| @@ -22,7 +22,6 @@ | |||||
| #include "libavcodec/h264data.h" | #include "libavcodec/h264data.h" | ||||
| #include "libavcodec/h264dsp.h" | #include "libavcodec/h264dsp.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| #include "util_altivec.h" | #include "util_altivec.h" | ||||
| #include "types_altivec.h" | #include "types_altivec.h" | ||||
| @@ -77,7 +77,6 @@ | |||||
| static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, | static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, | ||||
| int stride, int h, int x, int y) { | int stride, int h, int x, int y) { | ||||
| POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); | |||||
| DECLARE_ALIGNED(16, signed int, ABCD)[4] = | DECLARE_ALIGNED(16, signed int, ABCD)[4] = | ||||
| {((8 - x) * (8 - y)), | {((8 - x) * (8 - y)), | ||||
| (( x) * (8 - y)), | (( x) * (8 - y)), | ||||
| @@ -103,8 +102,6 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, | |||||
| vec_s16 vsrc2ssH, vsrc3ssH, psum; | vec_s16 vsrc2ssH, vsrc3ssH, psum; | ||||
| vec_u8 vdst, ppsum, vfdst, fsum; | vec_u8 vdst, ppsum, vfdst, fsum; | ||||
| POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); | |||||
| if (((unsigned long)dst) % 16 == 0) { | if (((unsigned long)dst) % 16 == 0) { | ||||
| fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, | fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, | ||||
| 0x14, 0x15, 0x16, 0x17, | 0x14, 0x15, 0x16, 0x17, | ||||
| @@ -203,7 +200,6 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); | |||||
| } | } | ||||
| /* this code assumes that stride % 16 == 0 */ | /* this code assumes that stride % 16 == 0 */ | ||||
| @@ -295,7 +291,6 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i | |||||
| /* this code assumes stride % 16 == 0 */ | /* this code assumes stride % 16 == 0 */ | ||||
| static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | ||||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||||
| register int i; | register int i; | ||||
| LOAD_ZERO; | LOAD_ZERO; | ||||
| @@ -323,8 +318,6 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i | |||||
| vec_u8 sum, vdst, fsum; | vec_u8 sum, vdst, fsum; | ||||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||||
| for (i = 0 ; i < 16 ; i ++) { | for (i = 0 ; i < 16 ; i ++) { | ||||
| vec_u8 srcR1 = vec_ld(-2, src); | vec_u8 srcR1 = vec_ld(-2, src); | ||||
| vec_u8 srcR2 = vec_ld(14, src); | vec_u8 srcR2 = vec_ld(14, src); | ||||
| @@ -433,13 +426,10 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i | |||||
| src += srcStride; | src += srcStride; | ||||
| dst += dstStride; | dst += dstStride; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||||
| } | } | ||||
| /* this code assumes stride % 16 == 0 */ | /* this code assumes stride % 16 == 0 */ | ||||
| static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | ||||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||||
| register int i; | register int i; | ||||
| LOAD_ZERO; | LOAD_ZERO; | ||||
| @@ -490,8 +480,6 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i | |||||
| vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; | vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; | ||||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||||
| for (i = 0 ; i < 16 ; i++) { | for (i = 0 ; i < 16 ; i++) { | ||||
| srcP3a = vec_ld(0, srcbis += srcStride); | srcP3a = vec_ld(0, srcbis += srcStride); | ||||
| srcP3b = vec_ld(16, srcbis); | srcP3b = vec_ld(16, srcbis); | ||||
| @@ -544,12 +532,10 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i | |||||
| dst += dstStride; | dst += dstStride; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||||
| } | } | ||||
| /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ | /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ | ||||
| static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | ||||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||||
| register int i; | register int i; | ||||
| LOAD_ZERO; | LOAD_ZERO; | ||||
| const vec_u8 permM2 = vec_lvsl(-2, src); | const vec_u8 permM2 = vec_lvsl(-2, src); | ||||
| @@ -589,7 +575,6 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, | |||||
| vec_u8 fsum, sumv, sum, vdst; | vec_u8 fsum, sumv, sum, vdst; | ||||
| vec_s16 ssume, ssumo; | vec_s16 ssume, ssumo; | ||||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||||
| src -= (2 * srcStride); | src -= (2 * srcStride); | ||||
| for (i = 0 ; i < 21 ; i ++) { | for (i = 0 ; i < 21 ; i ++) { | ||||
| vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | ||||
| @@ -779,5 +764,4 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, | |||||
| dst += dstStride; | dst += dstStride; | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||||
| } | } | ||||
| @@ -43,7 +43,6 @@ | |||||
| #endif | #endif | ||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "types_altivec.h" | #include "types_altivec.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| #define IDCT_HALF \ | #define IDCT_HALF \ | ||||
| @@ -161,13 +160,9 @@ static const vec_s16 constants[5] = { | |||||
| void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) | void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); | |||||
| vec_s16 *block = (vec_s16*)blk; | vec_s16 *block = (vec_s16*)blk; | ||||
| vec_u8 tmp; | vec_u8 tmp; | ||||
| #if CONFIG_POWERPC_PERF | |||||
| POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); | |||||
| #endif | |||||
| IDCT | IDCT | ||||
| #define COPY(dest,src) \ | #define COPY(dest,src) \ | ||||
| @@ -183,13 +178,10 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); | |||||
| COPY (dest, vx5) dest += stride; | COPY (dest, vx5) dest += stride; | ||||
| COPY (dest, vx6) dest += stride; | COPY (dest, vx6) dest += stride; | ||||
| COPY (dest, vx7) | COPY (dest, vx7) | ||||
| POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); | |||||
| } | } | ||||
| void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) | void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); | |||||
| vec_s16 *block = (vec_s16*)blk; | vec_s16 *block = (vec_s16*)blk; | ||||
| vec_u8 tmp; | vec_u8 tmp; | ||||
| vec_s16 tmp2, tmp3; | vec_s16 tmp2, tmp3; | ||||
| @@ -197,10 +189,6 @@ POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); | |||||
| vec_u8 perm1; | vec_u8 perm1; | ||||
| vec_u8 p0, p1, p; | vec_u8 p0, p1, p; | ||||
| #if CONFIG_POWERPC_PERF | |||||
| POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); | |||||
| #endif | |||||
| IDCT | IDCT | ||||
| p0 = vec_lvsl (0, dest); | p0 = vec_lvsl (0, dest); | ||||
| @@ -226,7 +214,5 @@ POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); | |||||
| ADD (dest, vx5, perm1) dest += stride; | ADD (dest, vx5, perm1) dest += stride; | ||||
| ADD (dest, vx6, perm0) dest += stride; | ADD (dest, vx6, perm0) dest += stride; | ||||
| ADD (dest, vx7, perm1) | ADD (dest, vx7, perm1) | ||||
| POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); | |||||
| } | } | ||||
| @@ -26,7 +26,6 @@ | |||||
| #include "libavcodec/dsputil.h" | #include "libavcodec/dsputil.h" | ||||
| #include "libavcodec/mpegvideo.h" | #include "libavcodec/mpegvideo.h" | ||||
| #include "dsputil_ppc.h" | |||||
| #include "util_altivec.h" | #include "util_altivec.h" | ||||
| #include "types_altivec.h" | #include "types_altivec.h" | ||||
| #include "dsputil_altivec.h" | #include "dsputil_altivec.h" | ||||
| @@ -479,14 +478,11 @@ static int dct_quantize_altivec(MpegEncContext* s, | |||||
| static void dct_unquantize_h263_altivec(MpegEncContext *s, | static void dct_unquantize_h263_altivec(MpegEncContext *s, | ||||
| DCTELEM *block, int n, int qscale) | DCTELEM *block, int n, int qscale) | ||||
| { | { | ||||
| POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); | |||||
| int i, level, qmul, qadd; | int i, level, qmul, qadd; | ||||
| int nCoeffs; | int nCoeffs; | ||||
| assert(s->block_last_index[n]>=0); | assert(s->block_last_index[n]>=0); | ||||
| POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); | |||||
| qadd = (qscale - 1) | 1; | qadd = (qscale - 1) | 1; | ||||
| qmul = qscale << 1; | qmul = qscale << 1; | ||||
| @@ -569,7 +565,6 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); | |||||
| block[0] = backup_0; | block[0] = backup_0; | ||||
| } | } | ||||
| } | } | ||||
| POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); | |||||
| } | } | ||||