This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk
| @@ -60,33 +60,33 @@ int mm_support(void) | |||
| unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; | |||
| /* list below must match enum in dsputil_ppc.h */ | |||
| static unsigned char* perfname[] = { | |||
| "ff_fft_calc_altivec", | |||
| "gmc1_altivec", | |||
| "dct_unquantize_h263_altivec", | |||
| "fdct_altivec", | |||
| "idct_add_altivec", | |||
| "idct_put_altivec", | |||
| "put_pixels16_altivec", | |||
| "avg_pixels16_altivec", | |||
| "avg_pixels8_altivec", | |||
| "put_pixels8_xy2_altivec", | |||
| "put_no_rnd_pixels8_xy2_altivec", | |||
| "put_pixels16_xy2_altivec", | |||
| "put_no_rnd_pixels16_xy2_altivec", | |||
| "hadamard8_diff8x8_altivec", | |||
| "hadamard8_diff16_altivec", | |||
| "avg_pixels8_xy2_altivec", | |||
| "clear_blocks_dcbz32_ppc", | |||
| "clear_blocks_dcbz128_ppc", | |||
| "put_h264_chroma_mc8_altivec", | |||
| "avg_h264_chroma_mc8_altivec", | |||
| "put_h264_qpel16_h_lowpass_altivec", | |||
| "avg_h264_qpel16_h_lowpass_altivec", | |||
| "put_h264_qpel16_v_lowpass_altivec", | |||
| "avg_h264_qpel16_v_lowpass_altivec", | |||
| "put_h264_qpel16_hv_lowpass_altivec", | |||
| "avg_h264_qpel16_hv_lowpass_altivec", | |||
| "" | |||
| "ff_fft_calc_altivec", | |||
| "gmc1_altivec", | |||
| "dct_unquantize_h263_altivec", | |||
| "fdct_altivec", | |||
| "idct_add_altivec", | |||
| "idct_put_altivec", | |||
| "put_pixels16_altivec", | |||
| "avg_pixels16_altivec", | |||
| "avg_pixels8_altivec", | |||
| "put_pixels8_xy2_altivec", | |||
| "put_no_rnd_pixels8_xy2_altivec", | |||
| "put_pixels16_xy2_altivec", | |||
| "put_no_rnd_pixels16_xy2_altivec", | |||
| "hadamard8_diff8x8_altivec", | |||
| "hadamard8_diff16_altivec", | |||
| "avg_pixels8_xy2_altivec", | |||
| "clear_blocks_dcbz32_ppc", | |||
| "clear_blocks_dcbz128_ppc", | |||
| "put_h264_chroma_mc8_altivec", | |||
| "avg_h264_chroma_mc8_altivec", | |||
| "put_h264_qpel16_h_lowpass_altivec", | |||
| "avg_h264_qpel16_h_lowpass_altivec", | |||
| "put_h264_qpel16_v_lowpass_altivec", | |||
| "avg_h264_qpel16_v_lowpass_altivec", | |||
| "put_h264_qpel16_hv_lowpass_altivec", | |||
| "avg_h264_qpel16_hv_lowpass_altivec", | |||
| "" | |||
| }; | |||
| #include <stdio.h> | |||
| #endif | |||
| @@ -94,51 +94,44 @@ static unsigned char* perfname[] = { | |||
| #ifdef CONFIG_POWERPC_PERF | |||
| void powerpc_display_perf_report(void) | |||
| { | |||
| int i, j; | |||
| av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); | |||
| for(i = 0 ; i < powerpc_perf_total ; i++) | |||
| { | |||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) | |||
| { | |||
| if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) | |||
| av_log(NULL, AV_LOG_INFO, | |||
| " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", | |||
| perfname[i], | |||
| j+1, | |||
| perfdata[j][i][powerpc_data_min], | |||
| perfdata[j][i][powerpc_data_max], | |||
| (double)perfdata[j][i][powerpc_data_sum] / | |||
| (double)perfdata[j][i][powerpc_data_num], | |||
| perfdata[j][i][powerpc_data_num]); | |||
| } | |||
| } | |||
| int i, j; | |||
| av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); | |||
| for(i = 0 ; i < powerpc_perf_total ; i++) { | |||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { | |||
| if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) | |||
| av_log(NULL, AV_LOG_INFO, | |||
| " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", | |||
| perfname[i], | |||
| j+1, | |||
| perfdata[j][i][powerpc_data_min], | |||
| perfdata[j][i][powerpc_data_max], | |||
| (double)perfdata[j][i][powerpc_data_sum] / | |||
| (double)perfdata[j][i][powerpc_data_num], | |||
| perfdata[j][i][powerpc_data_num]); | |||
| } | |||
| } | |||
| } | |||
| #endif /* CONFIG_POWERPC_PERF */ | |||
| /* ***** WARNING ***** WARNING ***** WARNING ***** */ | |||
| /* | |||
| clear_blocks_dcbz32_ppc will not work properly | |||
| on PowerPC processors with a cache line size | |||
| not equal to 32 bytes. | |||
| Fortunately all processor used by Apple up to | |||
| at least the 7450 (aka second generation G4) | |||
| use 32 bytes cache line. | |||
| This is due to the use of the 'dcbz' instruction. | |||
| It simply clear to zero a single cache line, | |||
| so you need to know the cache line size to use it ! | |||
| It's absurd, but it's fast... | |||
| clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a | |||
| cache line size not equal to 32 bytes. | |||
| Fortunately all processor used by Apple up to at least the 7450 (aka second | |||
| generation G4) use 32 bytes cache line. | |||
| This is due to the use of the 'dcbz' instruction. It simply clear to zero a | |||
| single cache line, so you need to know the cache line size to use it ! | |||
| It's absurd, but it's fast... | |||
| update 24/06/2003 : Apple released yesterday the G5, | |||
| with a PPC970. cache line size : 128 bytes. Oups. | |||
| The semantic of dcbz was changed, it always clear | |||
| 32 bytes. so the function below will work, but will | |||
| be slow. So I fixed check_dcbz_effect to use dcbzl, | |||
| which is defined to clear a cache line (as dcbz before). | |||
| So we still can distinguish, and use dcbz (32 bytes) | |||
| or dcbzl (one cache line) as required. | |||
| update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line | |||
| size: 128 bytes. Oups. | |||
| The semantic of dcbz was changed, it always clear 32 bytes. so the function | |||
| below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, | |||
| which is defined to clear a cache line (as dcbz before). So we still can | |||
| distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. | |||
| see <http://developer.apple.com/technotes/tn/tn2087.html> | |||
| and <http://developer.apple.com/technotes/tn/tn2086.html> | |||
| see <http://developer.apple.com/technotes/tn/tn2087.html> | |||
| and <http://developer.apple.com/technotes/tn/tn2086.html> | |||
| */ | |||
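For reference, the runtime detection described in the comment above is consumed at init time. A minimal sketch of the selection logic, assuming the DSPContext field is named c->clear_blocks (that part of the code is outside this excerpt), could look like:

    /* hedged sketch only -- the real selection lives in dsputil_init_ppc() */
    long dcbzl_bytes = check_dcbzl_effect();
    if (dcbzl_bytes == 32)
        c->clear_blocks = clear_blocks_dcbz32_ppc;   /* 32-byte lines: G3/G4   */
    else if (dcbzl_bytes == 128)
        c->clear_blocks = clear_blocks_dcbz128_ppc;  /* 128-byte lines: PPC970 */
    /* any other value: keep the generic memset-based implementation */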
| void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | |||
| { | |||
| @@ -148,21 +141,21 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); | |||
| POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); | |||
| #if 1 | |||
| if (misal) { | |||
| ((unsigned long*)blocks)[0] = 0L; | |||
| ((unsigned long*)blocks)[1] = 0L; | |||
| ((unsigned long*)blocks)[2] = 0L; | |||
| ((unsigned long*)blocks)[3] = 0L; | |||
| i += 16; | |||
| ((unsigned long*)blocks)[0] = 0L; | |||
| ((unsigned long*)blocks)[1] = 0L; | |||
| ((unsigned long*)blocks)[2] = 0L; | |||
| ((unsigned long*)blocks)[3] = 0L; | |||
| i += 16; | |||
| } | |||
| for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { | |||
| asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |||
| asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |||
| } | |||
| if (misal) { | |||
| ((unsigned long*)blocks)[188] = 0L; | |||
| ((unsigned long*)blocks)[189] = 0L; | |||
| ((unsigned long*)blocks)[190] = 0L; | |||
| ((unsigned long*)blocks)[191] = 0L; | |||
| i += 16; | |||
| ((unsigned long*)blocks)[188] = 0L; | |||
| ((unsigned long*)blocks)[189] = 0L; | |||
| ((unsigned long*)blocks)[190] = 0L; | |||
| ((unsigned long*)blocks)[191] = 0L; | |||
| i += 16; | |||
| } | |||
| #else | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| @@ -180,16 +173,16 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); | |||
| register int i = 0; | |||
| POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); | |||
| #if 1 | |||
| if (misal) { | |||
| // we could probably also optimize this case, | |||
| // but there's not much point as the machines | |||
| // aren't available yet (2003-06-26) | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| if (misal) { | |||
| // we could probably also optimize this case, | |||
| // but there's not much point as the machines | |||
| // aren't available yet (2003-06-26) | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| } | |||
| else | |||
| for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { | |||
| asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |||
| } | |||
| for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { | |||
| asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |||
| } | |||
| #else | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| #endif | |||
| @@ -198,7 +191,7 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); | |||
| #else | |||
| void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |||
| { | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| memset(blocks, 0, sizeof(DCTELEM)*6*64); | |||
| } | |||
| #endif | |||
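As a rough check on the loop bounds in the two functions above, assuming a 16-bit DCTELEM (an assumption, since the typedef is outside this excerpt): six 64-coefficient blocks occupy 768 bytes, so the dcbz loop issues 24 cache-line clears while the dcbzl loop issues only 6.

    /* illustrative arithmetic only; sizeof(DCTELEM) == 2 is assumed */
    enum {
        CLEAR_BLOCKS_BYTES = 2 * 6 * 64,              /* 768 bytes per call  */
        DCBZ32_CLEARS      = CLEAR_BLOCKS_BYTES / 32, /* 24 x 32-byte lines  */
        DCBZ128_CLEARS     = CLEAR_BLOCKS_BYTES / 128 /*  6 x 128-byte lines */
    };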
| @@ -210,34 +203,32 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |||
| knows about dcbzl ... */ | |||
| long check_dcbzl_effect(void) | |||
| { | |||
| register char *fakedata = av_malloc(1024); | |||
| register char *fakedata_middle; | |||
| register long zero = 0; | |||
| register long i = 0; | |||
| long count = 0; | |||
| register char *fakedata = av_malloc(1024); | |||
| register char *fakedata_middle; | |||
| register long zero = 0; | |||
| register long i = 0; | |||
| long count = 0; | |||
| if (!fakedata) | |||
| { | |||
| return 0L; | |||
| } | |||
| if (!fakedata) { | |||
| return 0L; | |||
| } | |||
| fakedata_middle = (fakedata + 512); | |||
| fakedata_middle = (fakedata + 512); | |||
| memset(fakedata, 0xFF, 1024); | |||
| memset(fakedata, 0xFF, 1024); | |||
| /* below the constraint "b" seems to mean "Address base register" | |||
| in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ | |||
| asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); | |||
| /* below the constraint "b" seems to mean "Address base register" | |||
| in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ | |||
| asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); | |||
| for (i = 0; i < 1024 ; i ++) | |||
| { | |||
| if (fakedata[i] == (char)0) | |||
| count++; | |||
| } | |||
| for (i = 0; i < 1024 ; i ++) { | |||
| if (fakedata[i] == (char)0) | |||
| count++; | |||
| } | |||
| av_free(fakedata); | |||
| av_free(fakedata); | |||
| return count; | |||
| return count; | |||
| } | |||
| #else | |||
| long check_dcbzl_effect(void) | |||
| @@ -286,36 +277,31 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) | |||
| #ifdef CONFIG_ENCODERS | |||
| if (avctx->dct_algo == FF_DCT_AUTO || | |||
| avctx->dct_algo == FF_DCT_ALTIVEC) | |||
| { | |||
| avctx->dct_algo == FF_DCT_ALTIVEC) { | |||
| c->fdct = fdct_altivec; | |||
| } | |||
| #endif //CONFIG_ENCODERS | |||
| if (avctx->lowres==0) | |||
| { | |||
| if ((avctx->idct_algo == FF_IDCT_AUTO) || | |||
| (avctx->idct_algo == FF_IDCT_ALTIVEC)) | |||
| { | |||
| c->idct_put = idct_put_altivec; | |||
| c->idct_add = idct_add_altivec; | |||
| c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |||
| } | |||
| if (avctx->lowres==0) { | |||
| if ((avctx->idct_algo == FF_IDCT_AUTO) || | |||
| (avctx->idct_algo == FF_IDCT_ALTIVEC)) { | |||
| c->idct_put = idct_put_altivec; | |||
| c->idct_add = idct_add_altivec; | |||
| c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |||
| } | |||
| } | |||
| #ifdef CONFIG_POWERPC_PERF | |||
| { | |||
| int i, j; | |||
| for (i = 0 ; i < powerpc_perf_total ; i++) | |||
| { | |||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) | |||
| { | |||
| perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; | |||
| perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; | |||
| perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; | |||
| perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; | |||
| int i, j; | |||
| for (i = 0 ; i < powerpc_perf_total ; i++) { | |||
| for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { | |||
| perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; | |||
| perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; | |||
| perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; | |||
| perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif /* CONFIG_POWERPC_PERF */ | |||
| } | |||
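The sentinel initialization above (min set to all ones, max/sum/num to zero) works because the first recorded sample always replaces both extremes; the per-sample update that POWERPC_PERF_STOP_COUNT performs later in this patch boils down to:

    /* scalar restatement of the update done inside POWERPC_PERF_STOP_COUNT */
    if (diff < perfdata[j][i][powerpc_data_min])
        perfdata[j][i][powerpc_data_min] = diff;
    if (diff > perfdata[j][i][powerpc_data_max])
        perfdata[j][i][powerpc_data_max] = diff;
    perfdata[j][i][powerpc_data_sum] += diff;
    perfdata[j][i][powerpc_data_num]++;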
| @@ -31,40 +31,40 @@ void powerpc_display_perf_report(void); | |||
| /* if you add to the enum below, also add to the perfname array | |||
| in dsputil_ppc.c */ | |||
| enum powerpc_perf_index { | |||
| altivec_fft_num = 0, | |||
| altivec_gmc1_num, | |||
| altivec_dct_unquantize_h263_num, | |||
| altivec_fdct, | |||
| altivec_idct_add_num, | |||
| altivec_idct_put_num, | |||
| altivec_put_pixels16_num, | |||
| altivec_avg_pixels16_num, | |||
| altivec_avg_pixels8_num, | |||
| altivec_put_pixels8_xy2_num, | |||
| altivec_put_no_rnd_pixels8_xy2_num, | |||
| altivec_put_pixels16_xy2_num, | |||
| altivec_put_no_rnd_pixels16_xy2_num, | |||
| altivec_hadamard8_diff8x8_num, | |||
| altivec_hadamard8_diff16_num, | |||
| altivec_avg_pixels8_xy2_num, | |||
| powerpc_clear_blocks_dcbz32, | |||
| powerpc_clear_blocks_dcbz128, | |||
| altivec_put_h264_chroma_mc8_num, | |||
| altivec_avg_h264_chroma_mc8_num, | |||
| altivec_put_h264_qpel16_h_lowpass_num, | |||
| altivec_avg_h264_qpel16_h_lowpass_num, | |||
| altivec_put_h264_qpel16_v_lowpass_num, | |||
| altivec_avg_h264_qpel16_v_lowpass_num, | |||
| altivec_put_h264_qpel16_hv_lowpass_num, | |||
| altivec_avg_h264_qpel16_hv_lowpass_num, | |||
| powerpc_perf_total | |||
| altivec_fft_num = 0, | |||
| altivec_gmc1_num, | |||
| altivec_dct_unquantize_h263_num, | |||
| altivec_fdct, | |||
| altivec_idct_add_num, | |||
| altivec_idct_put_num, | |||
| altivec_put_pixels16_num, | |||
| altivec_avg_pixels16_num, | |||
| altivec_avg_pixels8_num, | |||
| altivec_put_pixels8_xy2_num, | |||
| altivec_put_no_rnd_pixels8_xy2_num, | |||
| altivec_put_pixels16_xy2_num, | |||
| altivec_put_no_rnd_pixels16_xy2_num, | |||
| altivec_hadamard8_diff8x8_num, | |||
| altivec_hadamard8_diff16_num, | |||
| altivec_avg_pixels8_xy2_num, | |||
| powerpc_clear_blocks_dcbz32, | |||
| powerpc_clear_blocks_dcbz128, | |||
| altivec_put_h264_chroma_mc8_num, | |||
| altivec_avg_h264_chroma_mc8_num, | |||
| altivec_put_h264_qpel16_h_lowpass_num, | |||
| altivec_avg_h264_qpel16_h_lowpass_num, | |||
| altivec_put_h264_qpel16_v_lowpass_num, | |||
| altivec_avg_h264_qpel16_v_lowpass_num, | |||
| altivec_put_h264_qpel16_hv_lowpass_num, | |||
| altivec_avg_h264_qpel16_hv_lowpass_num, | |||
| powerpc_perf_total | |||
| }; | |||
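As the comment before the enum notes, this list and perfname[] in dsputil_ppc.c must stay in lockstep: a new counter is added at the same position in both, just before powerpc_perf_total and the "" sentinel respectively. A hypothetical addition (altivec_foo_num and "foo_altivec" are made-up names for illustration):

    /* dsputil_ppc.h */
        altivec_avg_h264_qpel16_hv_lowpass_num,
        altivec_foo_num,            /* new counter (hypothetical) */
        powerpc_perf_total

    /* dsputil_ppc.c */
        "avg_h264_qpel16_hv_lowpass_altivec",
        "foo_altivec",              /* matching name string (hypothetical) */
        ""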
| enum powerpc_data_index { | |||
| powerpc_data_min = 0, | |||
| powerpc_data_max, | |||
| powerpc_data_sum, | |||
| powerpc_data_num, | |||
| powerpc_data_total | |||
| powerpc_data_min = 0, | |||
| powerpc_data_max, | |||
| powerpc_data_sum, | |||
| powerpc_data_num, | |||
| powerpc_data_total | |||
| }; | |||
| extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; | |||
| @@ -105,45 +105,42 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ | |||
| #define POWERPC_GET_PMC6(a) do {} while (0) | |||
| #endif | |||
| #endif /* HAVE_PPC64 */ | |||
| #define POWERPC_PERF_DECLARE(a, cond) \ | |||
| POWERP_PMC_DATATYPE \ | |||
| pmc_start[POWERPC_NUM_PMC_ENABLED], \ | |||
| pmc_stop[POWERPC_NUM_PMC_ENABLED], \ | |||
| pmc_loop_index; | |||
| #define POWERPC_PERF_DECLARE(a, cond) \ | |||
| POWERP_PMC_DATATYPE \ | |||
| pmc_start[POWERPC_NUM_PMC_ENABLED], \ | |||
| pmc_stop[POWERPC_NUM_PMC_ENABLED], \ | |||
| pmc_loop_index; | |||
| #define POWERPC_PERF_START_COUNT(a, cond) do { \ | |||
| POWERPC_GET_PMC6(pmc_start[5]); \ | |||
| POWERPC_GET_PMC5(pmc_start[4]); \ | |||
| POWERPC_GET_PMC4(pmc_start[3]); \ | |||
| POWERPC_GET_PMC3(pmc_start[2]); \ | |||
| POWERPC_GET_PMC2(pmc_start[1]); \ | |||
| POWERPC_GET_PMC1(pmc_start[0]); \ | |||
| } while (0) | |||
| POWERPC_GET_PMC6(pmc_start[5]); \ | |||
| POWERPC_GET_PMC5(pmc_start[4]); \ | |||
| POWERPC_GET_PMC4(pmc_start[3]); \ | |||
| POWERPC_GET_PMC3(pmc_start[2]); \ | |||
| POWERPC_GET_PMC2(pmc_start[1]); \ | |||
| POWERPC_GET_PMC1(pmc_start[0]); \ | |||
| } while (0) | |||
| #define POWERPC_PERF_STOP_COUNT(a, cond) do { \ | |||
| POWERPC_GET_PMC1(pmc_stop[0]); \ | |||
| POWERPC_GET_PMC2(pmc_stop[1]); \ | |||
| POWERPC_GET_PMC3(pmc_stop[2]); \ | |||
| POWERPC_GET_PMC4(pmc_stop[3]); \ | |||
| POWERPC_GET_PMC5(pmc_stop[4]); \ | |||
| POWERPC_GET_PMC6(pmc_stop[5]); \ | |||
| if (cond) \ | |||
| { \ | |||
| for(pmc_loop_index = 0; \ | |||
| pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ | |||
| pmc_loop_index++) \ | |||
| { \ | |||
| if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ | |||
| { \ | |||
| POWERP_PMC_DATATYPE diff = \ | |||
| pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ | |||
| if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ | |||
| if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ | |||
| } \ | |||
| } \ | |||
| } \ | |||
| POWERPC_GET_PMC1(pmc_stop[0]); \ | |||
| POWERPC_GET_PMC2(pmc_stop[1]); \ | |||
| POWERPC_GET_PMC3(pmc_stop[2]); \ | |||
| POWERPC_GET_PMC4(pmc_stop[3]); \ | |||
| POWERPC_GET_PMC5(pmc_stop[4]); \ | |||
| POWERPC_GET_PMC6(pmc_stop[5]); \ | |||
| if (cond) { \ | |||
| for(pmc_loop_index = 0; \ | |||
| pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ | |||
| pmc_loop_index++) { \ | |||
| if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ | |||
| POWERP_PMC_DATATYPE diff = \ | |||
| pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ | |||
| if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ | |||
| if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ | |||
| perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ | |||
| } \ | |||
| } \ | |||
| } \ | |||
| } while (0) | |||
| #else /* CONFIG_POWERPC_PERF */ | |||
| // those are needed to avoid empty statements. | |||
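For context, the three macros above wrap a measured function in the pattern already visible in clear_blocks_dcbz32_ppc earlier in this patch: declare the PMC buffers at the top of the function, snapshot the counters around the code of interest, and let the STOP macro fold the difference into perfdata[]. A skeletal example (the function name is hypothetical):

    void some_measured_function_ppc(DCTELEM *blocks)
    {
    POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
        /* ... local declarations ... */
    POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
        /* ... code being measured ... */
    POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
    }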
| @@ -33,21 +33,21 @@ | |||
| /* butter fly op */ | |||
| #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ | |||
| {\ | |||
| FFTSample ax, ay, bx, by;\ | |||
| bx=pre1;\ | |||
| by=pim1;\ | |||
| ax=qre1;\ | |||
| ay=qim1;\ | |||
| pre = (bx + ax);\ | |||
| pim = (by + ay);\ | |||
| qre = (bx - ax);\ | |||
| qim = (by - ay);\ | |||
| FFTSample ax, ay, bx, by;\ | |||
| bx=pre1;\ | |||
| by=pim1;\ | |||
| ax=qre1;\ | |||
| ay=qim1;\ | |||
| pre = (bx + ax);\ | |||
| pim = (by + ay);\ | |||
| qre = (bx - ax);\ | |||
| qim = (by - ay);\ | |||
| } | |||
| #define MUL16(a,b) ((a) * (b)) | |||
| #define CMUL(pre, pim, are, aim, bre, bim) \ | |||
| {\ | |||
| pre = (MUL16(are, bre) - MUL16(aim, bim));\ | |||
| pim = (MUL16(are, bim) + MUL16(bre, aim));\ | |||
| pre = (MUL16(are, bre) - MUL16(aim, bim));\ | |||
| pim = (MUL16(are, bim) + MUL16(bre, aim));\ | |||
| } | |||
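CMUL is a plain complex multiply and BF a radix-2 butterfly; a scalar FFT stage combines them roughly as below, first multiplying one input by the twiddle factor (wre, wim) and then adding/subtracting (variable names here are illustrative, not taken from the patch):

    FFTSample tre, tim;
    /* t = q * w : complex multiply by the twiddle factor */
    CMUL(tre, tim, qre, qim, wre, wim);
    /* p' = p + t, q' = p - t : the radix-2 butterfly */
    BF(pre, pim, qre, qim, pre, pim, tre, tim);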
| @@ -85,14 +85,11 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); | |||
| c1 = vcii(p,p,n,n); | |||
| if (s->inverse) | |||
| { | |||
| c2 = vcii(p,p,n,p); | |||
| } | |||
| else | |||
| { | |||
| c2 = vcii(p,p,p,n); | |||
| } | |||
| if (s->inverse) { | |||
| c2 = vcii(p,p,n,p); | |||
| } else { | |||
| c2 = vcii(p,p,p,n); | |||
| } | |||
| j = (np >> 2); | |||
| do { | |||
| @@ -36,16 +36,16 @@ void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int str | |||
| { | |||
| POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); | |||
| const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = | |||
| {rounder, rounder, rounder, rounder, | |||
| rounder, rounder, rounder, rounder}; | |||
| {rounder, rounder, rounder, rounder, | |||
| rounder, rounder, rounder, rounder}; | |||
| const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = | |||
| { | |||
| (16-x16)*(16-y16), /* A */ | |||
| ( x16)*(16-y16), /* B */ | |||
| (16-x16)*( y16), /* C */ | |||
| ( x16)*( y16), /* D */ | |||
| 0, 0, 0, 0 /* padding */ | |||
| }; | |||
| { | |||
| (16-x16)*(16-y16), /* A */ | |||
| ( x16)*(16-y16), /* B */ | |||
| (16-x16)*( y16), /* C */ | |||
| ( x16)*( y16), /* D */ | |||
| 0, 0, 0, 0 /* padding */ | |||
| }; | |||
| register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |||
| register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); | |||
| register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; | |||
| @@ -74,73 +74,67 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |||
| src_1 = vec_ld(16, src); | |||
| srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); | |||
| if (src_really_odd != 0x0000000F) | |||
| { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. | |||
| srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); | |||
| } | |||
| else | |||
| { | |||
| srcvB = src_1; | |||
| if (src_really_odd != 0x0000000F) { | |||
| // if src & 0xF == 0xF, then (src+1) is properly aligned | |||
| // on the second vector. | |||
| srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); | |||
| } else { | |||
| srcvB = src_1; | |||
| } | |||
| srcvA = vec_mergeh(vczero, srcvA); | |||
| srcvB = vec_mergeh(vczero, srcvB); | |||
| for(i=0; i<h; i++) | |||
| { | |||
| dst_odd = (unsigned long)dst & 0x0000000F; | |||
| src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; | |||
| dstv = vec_ld(0, dst); | |||
| // we we'll be able to pick-up our 9 char elements | |||
| // at src + stride from those 32 bytes | |||
| // then reuse the resulting 2 vectors srvcC and srcvD | |||
| // as the next srcvA and srcvB | |||
| src_0 = vec_ld(stride + 0, src); | |||
| src_1 = vec_ld(stride + 16, src); | |||
| srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); | |||
| if (src_really_odd != 0x0000000F) | |||
| { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. | |||
| srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); | |||
| } | |||
| else | |||
| { | |||
| srcvD = src_1; | |||
| } | |||
| srcvC = vec_mergeh(vczero, srcvC); | |||
| srcvD = vec_mergeh(vczero, srcvD); | |||
| // OK, now we (finally) do the math :-) | |||
| // those four instructions replaces 32 int muls & 32 int adds. | |||
| // isn't AltiVec nice ? | |||
| tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); | |||
| tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); | |||
| tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); | |||
| tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); | |||
| srcvA = srcvC; | |||
| srcvB = srcvD; | |||
| tempD = vec_sr(tempD, vcsr8); | |||
| dstv2 = vec_pack(tempD, (vector unsigned short)vczero); | |||
| if (dst_odd) | |||
| { | |||
| dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); | |||
| } | |||
| else | |||
| { | |||
| dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); | |||
| } | |||
| vec_st(dstv2, 0, dst); | |||
| dst += stride; | |||
| src += stride; | |||
| for(i=0; i<h; i++) { | |||
| dst_odd = (unsigned long)dst & 0x0000000F; | |||
| src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; | |||
| dstv = vec_ld(0, dst); | |||
| // we we'll be able to pick-up our 9 char elements | |||
| // at src + stride from those 32 bytes | |||
| // then reuse the resulting 2 vectors srvcC and srcvD | |||
| // as the next srcvA and srcvB | |||
| src_0 = vec_ld(stride + 0, src); | |||
| src_1 = vec_ld(stride + 16, src); | |||
| srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); | |||
| if (src_really_odd != 0x0000000F) { | |||
| // if src & 0xF == 0xF, then (src+1) is properly aligned | |||
| // on the second vector. | |||
| srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); | |||
| } else { | |||
| srcvD = src_1; | |||
| } | |||
| srcvC = vec_mergeh(vczero, srcvC); | |||
| srcvD = vec_mergeh(vczero, srcvD); | |||
| // OK, now we (finally) do the math :-) | |||
| // those four instructions replaces 32 int muls & 32 int adds. | |||
| // isn't AltiVec nice ? | |||
| tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); | |||
| tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); | |||
| tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); | |||
| tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); | |||
| srcvA = srcvC; | |||
| srcvB = srcvD; | |||
| tempD = vec_sr(tempD, vcsr8); | |||
| dstv2 = vec_pack(tempD, (vector unsigned short)vczero); | |||
| if (dst_odd) { | |||
| dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); | |||
| } else { | |||
| dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); | |||
| } | |||
| vec_st(dstv2, 0, dst); | |||
| dst += stride; | |||
| src += stride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |||
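For readers tracking the vector code: the four vec_mladd calls compute, per pixel, the bilinear blend that a scalar implementation would write as below, with A..D being exactly the ABCD[] weights declared at the top of the function and the final vec_sr matching the >> 8. This is an illustrative sketch, not code from the patch:

    /* scalar equivalent of one gmc1 output pixel (8-bit samples assumed) */
    static inline uint8_t gmc1_scalar_pixel(const uint8_t *src, int stride,
                                            int x16, int y16, int rounder)
    {
        const int A = (16 - x16) * (16 - y16);
        const int B = (     x16) * (16 - y16);
        const int C = (16 - x16) * (     y16);
        const int D = (     x16) * (     y16);
        return (A * src[0]      + B * src[1] +
                C * src[stride] + D * src[stride + 1] + rounder) >> 8;
    }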
| @@ -196,7 +196,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride | |||
| const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); | |||
| LOAD_ZERO; | |||
| const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); | |||
| const vec_u16_t v6us = vec_splat_u16(6); | |||
| const vec_u16_t v6us = vec_splat_u16(6); | |||
| register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | |||
| register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | |||
| @@ -392,8 +392,8 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, | |||
| #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) | |||
| */ | |||
| H264_MC(put_, 16, altivec) | |||
| H264_MC(avg_, 16, altivec) | |||
| H264_MC(put_, 16, altivec) | |||
| H264_MC(avg_, 16, altivec) | |||
| /**************************************************************************** | |||
| @@ -685,9 +685,9 @@ static inline void write16x4(uint8_t *dst, int dst_stride, | |||
| r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ | |||
| \ | |||
| /*Third merge*/ \ | |||
| r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ | |||
| r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ | |||
| r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ | |||
| r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ | |||
| r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ | |||
| r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ | |||
| r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ | |||
| r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ | |||
| r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ | |||
| @@ -206,489 +206,489 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, | |||
| /* this code assume stride % 16 == 0 */ | |||
| static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| register int i; | |||
| LOAD_ZERO; | |||
| const vec_u8_t permM2 = vec_lvsl(-2, src); | |||
| const vec_u8_t permM1 = vec_lvsl(-1, src); | |||
| const vec_u8_t permP0 = vec_lvsl(+0, src); | |||
| const vec_u8_t permP1 = vec_lvsl(+1, src); | |||
| const vec_u8_t permP2 = vec_lvsl(+2, src); | |||
| const vec_u8_t permP3 = vec_lvsl(+3, src); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_u16_t v5us = vec_splat_u16(5); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | |||
| vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |||
| register int align = ((((unsigned long)src) - 2) % 16); | |||
| vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | |||
| srcP2A, srcP2B, srcP3A, srcP3B, | |||
| srcM1A, srcM1B, srcM2A, srcM2B, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |||
| pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |||
| psumA, psumB, sumA, sumB; | |||
| vec_u8_t sum, vdst, fsum; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| for (i = 0 ; i < 16 ; i ++) { | |||
| vec_u8_t srcR1 = vec_ld(-2, src); | |||
| vec_u8_t srcR2 = vec_ld(14, src); | |||
| switch (align) { | |||
| default: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = vec_perm(srcR1, srcR2, permP3); | |||
| } break; | |||
| case 11: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = srcR2; | |||
| } break; | |||
| case 12: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = srcR2; | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 13: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = srcR2; | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 14: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = srcR2; | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 15: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = srcR2; | |||
| srcP0 = vec_perm(srcR2, srcR3, permP0); | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| } | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| register int i; | |||
| srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| LOAD_ZERO; | |||
| const vec_u8_t permM2 = vec_lvsl(-2, src); | |||
| const vec_u8_t permM1 = vec_lvsl(-1, src); | |||
| const vec_u8_t permP0 = vec_lvsl(+0, src); | |||
| const vec_u8_t permP1 = vec_lvsl(+1, src); | |||
| const vec_u8_t permP2 = vec_lvsl(+2, src); | |||
| const vec_u8_t permP3 = vec_lvsl(+3, src); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_u16_t v5us = vec_splat_u16(5); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | |||
| srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |||
| srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| register int align = ((((unsigned long)src) - 2) % 16); | |||
| vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | |||
| srcP2A, srcP2B, srcP3A, srcP3B, | |||
| srcM1A, srcM1B, srcM2A, srcM2B, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |||
| pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |||
| psumA, psumB, sumA, sumB; | |||
| vec_u8_t sum, vdst, fsum; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| for (i = 0 ; i < 16 ; i ++) { | |||
| vec_u8_t srcR1 = vec_ld(-2, src); | |||
| vec_u8_t srcR2 = vec_ld(14, src); | |||
| switch (align) { | |||
| default: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = vec_perm(srcR1, srcR2, permP3); | |||
| } break; | |||
| case 11: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = srcR2; | |||
| } break; | |||
| case 12: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = srcR2; | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 13: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = srcR2; | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 14: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = srcR2; | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 15: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = srcR2; | |||
| srcP0 = vec_perm(srcR2, srcR3, permP0); | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| } | |||
| sum1A = vec_adds(srcP0A, srcP1A); | |||
| sum1B = vec_adds(srcP0B, srcP1B); | |||
| sum2A = vec_adds(srcM1A, srcP2A); | |||
| sum2B = vec_adds(srcM1B, srcP2B); | |||
| sum3A = vec_adds(srcM2A, srcP3A); | |||
| sum3B = vec_adds(srcM2B, srcP3B); | |||
| srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| pp1A = vec_mladd(sum1A, v20ss, v16ss); | |||
| pp1B = vec_mladd(sum1B, v20ss, v16ss); | |||
| srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| pp3A = vec_add(sum3A, pp1A); | |||
| pp3B = vec_add(sum3B, pp1B); | |||
| sum1A = vec_adds(srcP0A, srcP1A); | |||
| sum1B = vec_adds(srcP0B, srcP1B); | |||
| sum2A = vec_adds(srcM1A, srcP2A); | |||
| sum2B = vec_adds(srcM1B, srcP2B); | |||
| sum3A = vec_adds(srcM2A, srcP3A); | |||
| sum3B = vec_adds(srcM2B, srcP3B); | |||
| psumA = vec_sub(pp3A, pp2A); | |||
| psumB = vec_sub(pp3B, pp2B); | |||
| pp1A = vec_mladd(sum1A, v20ss, v16ss); | |||
| pp1B = vec_mladd(sum1B, v20ss, v16ss); | |||
| sumA = vec_sra(psumA, v5us); | |||
| sumB = vec_sra(psumB, v5us); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| sum = vec_packsu(sumA, sumB); | |||
| pp3A = vec_add(sum3A, pp1A); | |||
| pp3B = vec_add(sum3B, pp1B); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| psumA = vec_sub(pp3A, pp2A); | |||
| psumB = vec_sub(pp3B, pp2B); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| sumA = vec_sra(psumA, v5us); | |||
| sumB = vec_sra(psumB, v5us); | |||
| vec_st(fsum, 0, dst); | |||
| sum = vec_packsu(sumA, sumB); | |||
| src += srcStride; | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| vec_st(fsum, 0, dst); | |||
| src += srcStride; | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |||
| } | |||
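In scalar terms the loop above is the standard H.264 6-tap half-pel filter: v20ss, v5ss and v16ss are the tap weights and rounding term, the shift by v5us is the normalization, and vec_packsu provides the clipping; the put_/avg_ variants differ only in how OP_U8_ALTIVEC combines the result with the existing destination. An illustrative scalar sketch (not code from the patch):

    /* one horizontal half-pel output sample */
    static inline uint8_t qpel_h_lowpass_scalar(const uint8_t *src)
    {
        int v = (src[ 0] + src[1]) * 20   /* sum1 * v20ss          */
              - (src[-1] + src[2]) * 5    /* sum2 * v5ss           */
              + (src[-2] + src[3])        /* sum3                  */
              + 16;                       /* rounding (v16ss)      */
        v >>= 5;                          /* normalization (v5us)  */
        return v < 0 ? 0 : (v > 255 ? 255 : v);   /* vec_packsu clip */
    }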
| /* this code assume stride % 16 == 0 */ | |||
| static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| register int i; | |||
| LOAD_ZERO; | |||
| const vec_u8_t perm = vec_lvsl(0, src); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_u16_t v5us = vec_splat_u16(5); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | |||
| uint8_t *srcbis = src - (srcStride * 2); | |||
| const vec_u8_t srcM2a = vec_ld(0, srcbis); | |||
| const vec_u8_t srcM2b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); | |||
| // srcbis += srcStride; | |||
| const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcM1b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); | |||
| // srcbis += srcStride; | |||
| const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP0b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); | |||
| // srcbis += srcStride; | |||
| const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP1b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); | |||
| // srcbis += srcStride; | |||
| const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP2b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); | |||
| // srcbis += srcStride; | |||
| vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |||
| psumA, psumB, sumA, sumB, | |||
| srcP3ssA, srcP3ssB, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | |||
| vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| for (i = 0 ; i < 16 ; i++) { | |||
| srcP3a = vec_ld(0, srcbis += srcStride); | |||
| srcP3b = vec_ld(16, srcbis); | |||
| srcP3 = vec_perm(srcP3a, srcP3b, perm); | |||
| srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| // srcbis += srcStride; | |||
| sum1A = vec_adds(srcP0ssA, srcP1ssA); | |||
| sum1B = vec_adds(srcP0ssB, srcP1ssB); | |||
| sum2A = vec_adds(srcM1ssA, srcP2ssA); | |||
| sum2B = vec_adds(srcM1ssB, srcP2ssB); | |||
| sum3A = vec_adds(srcM2ssA, srcP3ssA); | |||
| sum3B = vec_adds(srcM2ssB, srcP3ssB); | |||
| srcM2ssA = srcM1ssA; | |||
| srcM2ssB = srcM1ssB; | |||
| srcM1ssA = srcP0ssA; | |||
| srcM1ssB = srcP0ssB; | |||
| srcP0ssA = srcP1ssA; | |||
| srcP0ssB = srcP1ssB; | |||
| srcP1ssA = srcP2ssA; | |||
| srcP1ssB = srcP2ssB; | |||
| srcP2ssA = srcP3ssA; | |||
| srcP2ssB = srcP3ssB; | |||
| pp1A = vec_mladd(sum1A, v20ss, v16ss); | |||
| pp1B = vec_mladd(sum1B, v20ss, v16ss); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| pp3A = vec_add(sum3A, pp1A); | |||
| pp3B = vec_add(sum3B, pp1B); | |||
| psumA = vec_sub(pp3A, pp2A); | |||
| psumB = vec_sub(pp3B, pp2B); | |||
| sumA = vec_sra(psumA, v5us); | |||
| sumB = vec_sra(psumB, v5us); | |||
| sum = vec_packsu(sumA, sumB); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| vec_st(fsum, 0, dst); | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| register int i; | |||
| LOAD_ZERO; | |||
| const vec_u8_t perm = vec_lvsl(0, src); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_u16_t v5us = vec_splat_u16(5); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | |||
| uint8_t *srcbis = src - (srcStride * 2); | |||
| const vec_u8_t srcM2a = vec_ld(0, srcbis); | |||
| const vec_u8_t srcM2b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); | |||
| //srcbis += srcStride; | |||
| const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcM1b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); | |||
| //srcbis += srcStride; | |||
| const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP0b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); | |||
| //srcbis += srcStride; | |||
| const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP1b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); | |||
| //srcbis += srcStride; | |||
| const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); | |||
| const vec_u8_t srcP2b = vec_ld(16, srcbis); | |||
| const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); | |||
| //srcbis += srcStride; | |||
| vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |||
| psumA, psumB, sumA, sumB, | |||
| srcP3ssA, srcP3ssB, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | |||
| vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| for (i = 0 ; i < 16 ; i++) { | |||
| srcP3a = vec_ld(0, srcbis += srcStride); | |||
| srcP3b = vec_ld(16, srcbis); | |||
| srcP3 = vec_perm(srcP3a, srcP3b, perm); | |||
| srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| //srcbis += srcStride; | |||
| sum1A = vec_adds(srcP0ssA, srcP1ssA); | |||
| sum1B = vec_adds(srcP0ssB, srcP1ssB); | |||
| sum2A = vec_adds(srcM1ssA, srcP2ssA); | |||
| sum2B = vec_adds(srcM1ssB, srcP2ssB); | |||
| sum3A = vec_adds(srcM2ssA, srcP3ssA); | |||
| sum3B = vec_adds(srcM2ssB, srcP3ssB); | |||
| srcM2ssA = srcM1ssA; | |||
| srcM2ssB = srcM1ssB; | |||
| srcM1ssA = srcP0ssA; | |||
| srcM1ssB = srcP0ssB; | |||
| srcP0ssA = srcP1ssA; | |||
| srcP0ssB = srcP1ssB; | |||
| srcP1ssA = srcP2ssA; | |||
| srcP1ssB = srcP2ssB; | |||
| srcP2ssA = srcP3ssA; | |||
| srcP2ssB = srcP3ssB; | |||
| pp1A = vec_mladd(sum1A, v20ss, v16ss); | |||
| pp1B = vec_mladd(sum1B, v20ss, v16ss); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| pp3A = vec_add(sum3A, pp1A); | |||
| pp3B = vec_add(sum3B, pp1B); | |||
| psumA = vec_sub(pp3A, pp2A); | |||
| psumB = vec_sub(pp3B, pp2B); | |||
| sumA = vec_sra(psumA, v5us); | |||
| sumB = vec_sra(psumB, v5us); | |||
| sum = vec_packsu(sumA, sumB); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| vec_st(fsum, 0, dst); | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |||
| } | |||
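The vertical filter applies the same six taps down a column; the chain of assignments srcM2ss.. = srcM1ss.. and so on is a sliding window, so each iteration loads only one new row. A scalar sketch of that structure (illustrative only, assumes 8-bit samples):

    static void qpel_v_lowpass_scalar_column(uint8_t *dst, const uint8_t *src,
                                             int dstStride, int srcStride)
    {
        int i;
        int m2 = src[-2 * srcStride], m1 = src[-srcStride];
        int p0 = src[0], p1 = src[srcStride], p2 = src[2 * srcStride];
        for (i = 0; i < 16; i++) {
            const int p3 = src[(i + 3) * srcStride];
            int v = (p0 + p1) * 20 - (m1 + p2) * 5 + (m2 + p3) + 16;
            v >>= 5;
            dst[i * dstStride] = v < 0 ? 0 : (v > 255 ? 255 : v);
            m2 = m1; m1 = p0; p0 = p1; p1 = p2; p2 = p3;  /* slide the window */
        }
    }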
| /* this code assume stride % 16 == 0 *and* tmp is properly aligned */ | |||
| static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| register int i; | |||
| LOAD_ZERO; | |||
| const vec_u8_t permM2 = vec_lvsl(-2, src); | |||
| const vec_u8_t permM1 = vec_lvsl(-1, src); | |||
| const vec_u8_t permP0 = vec_lvsl(+0, src); | |||
| const vec_u8_t permP1 = vec_lvsl(+1, src); | |||
| const vec_u8_t permP2 = vec_lvsl(+2, src); | |||
| const vec_u8_t permP3 = vec_lvsl(+3, src); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_u32_t v10ui = vec_splat_u32(10); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_s16_t v1ss = vec_splat_s16(1); | |||
| const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | |||
| const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | |||
| register int align = ((((unsigned long)src) - 2) % 16); | |||
| vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | |||
| srcP2A, srcP2B, srcP3A, srcP3B, | |||
| srcM1A, srcM1B, srcM2A, srcM2B, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |||
| pp1A, pp1B, pp2A, pp2B, psumA, psumB; | |||
| const vec_u8_t mperm = (const vec_u8_t) | |||
| AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | |||
| 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | |||
| int16_t *tmpbis = tmp; | |||
| vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, | |||
| tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, | |||
| tmpP2ssA, tmpP2ssB; | |||
| vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | |||
| pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | |||
| pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | |||
| ssumAe, ssumAo, ssumBe, ssumBo; | |||
| vec_u8_t fsum, sumv, sum, vdst; | |||
| vec_s16_t ssume, ssumo; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| src -= (2 * srcStride); | |||
| for (i = 0 ; i < 21 ; i ++) { | |||
| vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |||
| vec_u8_t srcR1 = vec_ld(-2, src); | |||
| vec_u8_t srcR2 = vec_ld(14, src); | |||
| switch (align) { | |||
| default: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = vec_perm(srcR1, srcR2, permP3); | |||
| } break; | |||
| case 11: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = srcR2; | |||
| } break; | |||
| case 12: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = srcR2; | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 13: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = srcR2; | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 14: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = srcR2; | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 15: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = srcR2; | |||
| srcP0 = vec_perm(srcR2, srcR3, permP0); | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| } | |||
| POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| register int i; | |||
| LOAD_ZERO; | |||
| const vec_u8_t permM2 = vec_lvsl(-2, src); | |||
| const vec_u8_t permM1 = vec_lvsl(-1, src); | |||
| const vec_u8_t permP0 = vec_lvsl(+0, src); | |||
| const vec_u8_t permP1 = vec_lvsl(+1, src); | |||
| const vec_u8_t permP2 = vec_lvsl(+2, src); | |||
| const vec_u8_t permP3 = vec_lvsl(+3, src); | |||
| const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | |||
| const vec_u32_t v10ui = vec_splat_u32(10); | |||
| const vec_s16_t v5ss = vec_splat_s16(5); | |||
| const vec_s16_t v1ss = vec_splat_s16(1); | |||
| const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | |||
| const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | |||
| register int align = ((((unsigned long)src) - 2) % 16); | |||
| vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | |||
| srcP2A, srcP2B, srcP3A, srcP3B, | |||
| srcM1A, srcM1B, srcM2A, srcM2B, | |||
| sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |||
| pp1A, pp1B, pp2A, pp2B, psumA, psumB; | |||
| const vec_u8_t mperm = (const vec_u8_t) | |||
| AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | |||
| 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | |||
| int16_t *tmpbis = tmp; | |||
| vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, | |||
| tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, | |||
| tmpP2ssA, tmpP2ssB; | |||
| vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | |||
| pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | |||
| pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | |||
| ssumAe, ssumAo, ssumBe, ssumBo; | |||
| vec_u8_t fsum, sumv, sum, vdst; | |||
| vec_s16_t ssume, ssumo; | |||
| POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| src -= (2 * srcStride); | |||
| for (i = 0 ; i < 21 ; i ++) { | |||
| vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |||
| vec_u8_t srcR1 = vec_ld(-2, src); | |||
| vec_u8_t srcR2 = vec_ld(14, src); | |||
| switch (align) { | |||
| default: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = vec_perm(srcR1, srcR2, permP3); | |||
| } break; | |||
| case 11: { | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = vec_perm(srcR1, srcR2, permP2); | |||
| srcP3 = srcR2; | |||
| } break; | |||
| case 12: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = vec_perm(srcR1, srcR2, permP1); | |||
| srcP2 = srcR2; | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 13: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = vec_perm(srcR1, srcR2, permP0); | |||
| srcP1 = srcR2; | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 14: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = vec_perm(srcR1, srcR2, permM1); | |||
| srcP0 = srcR2; | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| case 15: { | |||
| vec_u8_t srcR3 = vec_ld(30, src); | |||
| srcM2 = vec_perm(srcR1, srcR2, permM2); | |||
| srcM1 = srcR2; | |||
| srcP0 = vec_perm(srcR2, srcR3, permP0); | |||
| srcP1 = vec_perm(srcR2, srcR3, permP1); | |||
| srcP2 = vec_perm(srcR2, srcR3, permP2); | |||
| srcP3 = vec_perm(srcR2, srcR3, permP3); | |||
| } break; | |||
| } | |||
| srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| sum1A = vec_adds(srcP0A, srcP1A); | |||
| sum1B = vec_adds(srcP0B, srcP1B); | |||
| sum2A = vec_adds(srcM1A, srcP2A); | |||
| sum2B = vec_adds(srcM1B, srcP2B); | |||
| sum3A = vec_adds(srcM2A, srcP3A); | |||
| sum3B = vec_adds(srcM2B, srcP3B); | |||
| pp1A = vec_mladd(sum1A, v20ss, sum3A); | |||
| pp1B = vec_mladd(sum1B, v20ss, sum3B); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | |||
| srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | |||
| srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | |||
| srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | |||
| srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | |||
| srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | |||
| srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | |||
| srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | |||
| srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | |||
| srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | |||
| srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | |||
| srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | |||
| sum1A = vec_adds(srcP0A, srcP1A); | |||
| sum1B = vec_adds(srcP0B, srcP1B); | |||
| sum2A = vec_adds(srcM1A, srcP2A); | |||
| sum2B = vec_adds(srcM1B, srcP2B); | |||
| sum3A = vec_adds(srcM2A, srcP3A); | |||
| sum3B = vec_adds(srcM2B, srcP3B); | |||
| pp1A = vec_mladd(sum1A, v20ss, sum3A); | |||
| pp1B = vec_mladd(sum1B, v20ss, sum3B); | |||
| pp2A = vec_mladd(sum2A, v5ss, zero_s16v); | |||
| pp2B = vec_mladd(sum2B, v5ss, zero_s16v); | |||
| psumA = vec_sub(pp1A, pp2A); | |||
| psumB = vec_sub(pp1B, pp2B); | |||
| vec_st(psumA, 0, tmp); | |||
| vec_st(psumB, 16, tmp); | |||
| src += srcStride; | |||
| tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ | |||
| } | |||
| tmpM2ssA = vec_ld(0, tmpbis); | |||
| tmpM2ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpM1ssA = vec_ld(0, tmpbis); | |||
| tmpM1ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP0ssA = vec_ld(0, tmpbis); | |||
| tmpP0ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP1ssA = vec_ld(0, tmpbis); | |||
| tmpP1ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP2ssA = vec_ld(0, tmpbis); | |||
| tmpP2ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| for (i = 0 ; i < 16 ; i++) { | |||
| const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); | |||
| const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); | |||
| const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); | |||
| const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); | |||
| const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); | |||
| const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); | |||
| const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); | |||
| const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); | |||
| psumA = vec_sub(pp1A, pp2A); | |||
| psumB = vec_sub(pp1B, pp2B); | |||
| vec_st(psumA, 0, tmp); | |||
| vec_st(psumB, 16, tmp); | |||
| src += srcStride; | |||
| tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ | |||
| } | |||
| tmpM2ssA = vec_ld(0, tmpbis); | |||
| tmpM2ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpM1ssA = vec_ld(0, tmpbis); | |||
| tmpM1ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP0ssA = vec_ld(0, tmpbis); | |||
| tmpP0ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP1ssA = vec_ld(0, tmpbis); | |||
| tmpP1ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpP2ssA = vec_ld(0, tmpbis); | |||
| tmpP2ssB = vec_ld(16, tmpbis); | |||
| tmpbis += tmpStride; | |||
| tmpM2ssA = tmpM1ssA; | |||
| tmpM2ssB = tmpM1ssB; | |||
| tmpM1ssA = tmpP0ssA; | |||
| tmpM1ssB = tmpP0ssB; | |||
| tmpP0ssA = tmpP1ssA; | |||
| tmpP0ssB = tmpP1ssB; | |||
| tmpP1ssA = tmpP2ssA; | |||
| tmpP1ssB = tmpP2ssB; | |||
| tmpP2ssA = tmpP3ssA; | |||
| tmpP2ssB = tmpP3ssB; | |||
| pp1Ae = vec_mule(sum1A, v20ss); | |||
| pp1Ao = vec_mulo(sum1A, v20ss); | |||
| pp1Be = vec_mule(sum1B, v20ss); | |||
| pp1Bo = vec_mulo(sum1B, v20ss); | |||
| pp2Ae = vec_mule(sum2A, v5ss); | |||
| pp2Ao = vec_mulo(sum2A, v5ss); | |||
| pp2Be = vec_mule(sum2B, v5ss); | |||
| pp2Bo = vec_mulo(sum2B, v5ss); | |||
| pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); | |||
| pp3Ao = vec_mulo(sum3A, v1ss); | |||
| pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); | |||
| pp3Bo = vec_mulo(sum3B, v1ss); | |||
| pp1cAe = vec_add(pp1Ae, v512si); | |||
| pp1cAo = vec_add(pp1Ao, v512si); | |||
| pp1cBe = vec_add(pp1Be, v512si); | |||
| pp1cBo = vec_add(pp1Bo, v512si); | |||
| pp32Ae = vec_sub(pp3Ae, pp2Ae); | |||
| pp32Ao = vec_sub(pp3Ao, pp2Ao); | |||
| pp32Be = vec_sub(pp3Be, pp2Be); | |||
| pp32Bo = vec_sub(pp3Bo, pp2Bo); | |||
| sumAe = vec_add(pp1cAe, pp32Ae); | |||
| sumAo = vec_add(pp1cAo, pp32Ao); | |||
| sumBe = vec_add(pp1cBe, pp32Be); | |||
| sumBo = vec_add(pp1cBo, pp32Bo); | |||
| ssumAe = vec_sra(sumAe, v10ui); | |||
| ssumAo = vec_sra(sumAo, v10ui); | |||
| ssumBe = vec_sra(sumBe, v10ui); | |||
| ssumBo = vec_sra(sumBo, v10ui); | |||
| ssume = vec_packs(ssumAe, ssumBe); | |||
| ssumo = vec_packs(ssumAo, ssumBo); | |||
| sumv = vec_packsu(ssume, ssumo); | |||
| sum = vec_perm(sumv, sumv, mperm); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| vec_st(fsum, 0, dst); | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| for (i = 0 ; i < 16 ; i++) { | |||
| const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); | |||
| const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); | |||
| const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); | |||
| const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); | |||
| const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); | |||
| const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); | |||
| const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); | |||
| const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); | |||
| tmpbis += tmpStride; | |||
| tmpM2ssA = tmpM1ssA; | |||
| tmpM2ssB = tmpM1ssB; | |||
| tmpM1ssA = tmpP0ssA; | |||
| tmpM1ssB = tmpP0ssB; | |||
| tmpP0ssA = tmpP1ssA; | |||
| tmpP0ssB = tmpP1ssB; | |||
| tmpP1ssA = tmpP2ssA; | |||
| tmpP1ssB = tmpP2ssB; | |||
| tmpP2ssA = tmpP3ssA; | |||
| tmpP2ssB = tmpP3ssB; | |||
| pp1Ae = vec_mule(sum1A, v20ss); | |||
| pp1Ao = vec_mulo(sum1A, v20ss); | |||
| pp1Be = vec_mule(sum1B, v20ss); | |||
| pp1Bo = vec_mulo(sum1B, v20ss); | |||
| pp2Ae = vec_mule(sum2A, v5ss); | |||
| pp2Ao = vec_mulo(sum2A, v5ss); | |||
| pp2Be = vec_mule(sum2B, v5ss); | |||
| pp2Bo = vec_mulo(sum2B, v5ss); | |||
| pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); | |||
| pp3Ao = vec_mulo(sum3A, v1ss); | |||
| pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); | |||
| pp3Bo = vec_mulo(sum3B, v1ss); | |||
| pp1cAe = vec_add(pp1Ae, v512si); | |||
| pp1cAo = vec_add(pp1Ao, v512si); | |||
| pp1cBe = vec_add(pp1Be, v512si); | |||
| pp1cBo = vec_add(pp1Bo, v512si); | |||
| pp32Ae = vec_sub(pp3Ae, pp2Ae); | |||
| pp32Ao = vec_sub(pp3Ao, pp2Ao); | |||
| pp32Be = vec_sub(pp3Be, pp2Be); | |||
| pp32Bo = vec_sub(pp3Bo, pp2Bo); | |||
| sumAe = vec_add(pp1cAe, pp32Ae); | |||
| sumAo = vec_add(pp1cAo, pp32Ao); | |||
| sumBe = vec_add(pp1cBe, pp32Be); | |||
| sumBo = vec_add(pp1cBo, pp32Bo); | |||
| ssumAe = vec_sra(sumAe, v10ui); | |||
| ssumAo = vec_sra(sumAo, v10ui); | |||
| ssumBe = vec_sra(sumBe, v10ui); | |||
| ssumBo = vec_sra(sumBo, v10ui); | |||
| ssume = vec_packs(ssumAe, ssumBe); | |||
| ssumo = vec_packs(ssumAo, ssumBo); | |||
| sumv = vec_packsu(ssume, ssumo); | |||
| sum = vec_perm(sumv, sumv, mperm); | |||
| ASSERT_ALIGNED(dst); | |||
| vdst = vec_ld(0, dst); | |||
| OP_U8_ALTIVEC(fsum, sum, vdst); | |||
| vec_st(fsum, 0, dst); | |||
| dst += dstStride; | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |||
| } | |||
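For reference, the vertical pass in the routine above applies the standard H.264 six-tap half-pel filter (1, -5, 20, 20, -5, 1) to the 16-bit intermediate rows in tmp, adds the 512 rounding term, shifts right by 10 and saturates to 8 bits; the vec_mule/vec_mulo even/odd split only exists to gain 32-bit headroom for that arithmetic. A minimal scalar sketch of the "put" variant follows; the helper names, W and H are illustrative, and the OP_U8_ALTIVEC put/avg switch is ignored.

    #include <stdint.h>

    /* Scalar sketch (illustrative, "put" variant only) of the second pass the
     * AltiVec code above performs on the 16-bit intermediate rows. */
    static inline uint8_t clamp_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    static void hv_lowpass_second_pass_ref(uint8_t *dst, int dstStride,
                                           const int16_t *tmp, int tmpStride,
                                           int W, int H)
    {
        /* tmp holds H+5 filtered rows; output row y reads tmp rows y..y+5
         * (two rows of context above the co-located row, three below). */
        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) {
                const int16_t *t = tmp + y * tmpStride + x;
                int sum = 20 * (t[2 * tmpStride] + t[3 * tmpStride])
                        -  5 * (t[1 * tmpStride] + t[4 * tmpStride])
                        +      (t[0 * tmpStride] + t[5 * tmpStride]);
                dst[y * dstStride + x] = clamp_u8((sum + 512) >> 10);
            }
        }
    }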
| @@ -22,7 +22,6 @@ | |||
| * NOTE: This code is based on GPL code from the libmpeg2 project. The | |||
| * author, Michel Lespinasse, has given explicit permission to release | |||
| * under LGPL as part of ffmpeg. | |||
| * | |||
| */ | |||
| /* | |||
| @@ -46,8 +46,7 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, | |||
| vector signed short zeros, sumhv, sumlv; | |||
| s = src; | |||
| for(i=0;i<4;i++) | |||
| { | |||
| for(i=0;i<4;i++) { | |||
| /* | |||
| The vec_madds later on does an implicit >>15 on the result. | |||
| Since FILTER_BITS is 8, and we have 15 bits of magnitude in | |||
| @@ -86,13 +85,11 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, | |||
| /* Do our altivec resampling on 16 pixels at once. */ | |||
| while(dst_width>=16) { | |||
| /* | |||
| Read 16 (potentially unaligned) bytes from each of | |||
| /* Read 16 (potentially unaligned) bytes from each of | |||
| 4 lines into 4 vectors, and split them into shorts. | |||
| Interleave the multiply/accumulate for the resample | |||
| filter with the loads to hide the 3 cycle latency | |||
| the vec_madds have. | |||
| */ | |||
| the vec_madds have. */ | |||
| tv = (vector unsigned char *) &s[0 * wrap]; | |||
| tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); | |||
| srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); | |||
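The loads in this loop use the usual AltiVec idiom for reading 16 possibly unaligned bytes: two aligned vec_ld loads bracketing the address, combined with the permute vector that vec_lvsl derives from its low four bits. A minimal sketch of that idiom (the helper name is illustrative, not FFmpeg API):

    #include <altivec.h>
    #include <stdint.h>

    /* Combine the two aligned 16-byte blocks around src so the result
     * starts exactly at the (possibly unaligned) src pointer. */
    static vector unsigned char load_unaligned16(const uint8_t *src)
    {
        vector unsigned char left  = vec_ld(0, src);   /* aligned block at or below src   */
        vector unsigned char right = vec_ld(15, src);  /* aligned block containing src+15 */
        vector unsigned char align = vec_lvsl(0, src); /* shift pattern from src & 15     */
        return vec_perm(left, right, align);
    }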
| @@ -121,10 +118,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, | |||
| sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); | |||
| sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); | |||
| /* | |||
| Pack the results into our destination vector, | |||
| and do an aligned write of that back to memory. | |||
| */ | |||
| /* Pack the results into our destination vector, | |||
| and do an aligned write of that back to memory. */ | |||
| dstv = vec_packsu(sumhv, sumlv) ; | |||
| vec_st(dstv, 0, (vector unsigned char *) dst); | |||
| @@ -133,10 +128,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, | |||
| dst_width-=16; | |||
| } | |||
| /* | |||
| If there are any leftover pixels, resample them | |||
| with the slow scalar method. | |||
| */ | |||
| /* If there are any leftover pixels, resample them | |||
| with the slow scalar method. */ | |||
| while(dst_width>0) { | |||
| sum = s[0 * wrap] * filter[0] + | |||
| s[1 * wrap] * filter[1] + | |||
| @@ -38,7 +38,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, | |||
| vector signed short vpix2, vdiff, vpix1l,vpix1h; | |||
| union { vector signed int vscore; | |||
| int32_t score[4]; | |||
| } u; | |||
| } u; | |||
| u.vscore = vec_splat_s32(0); | |||
| // | |||
| //XXX lazy way, fix it later | |||
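The union declared above is the common way to hand a vector accumulator back to scalar code: once the per-lane sums are final, vec_sums folds all four 32-bit lanes into element 3, which the union then exposes as an ordinary int32_t. A self-contained sketch of that pattern (the helper name is illustrative):

    #include <altivec.h>
    #include <stdint.h>

    /* Fold the four 32-bit lanes of v into one scalar sum. */
    static int32_t horizontal_sum_s32(vector signed int v)
    {
        union { vector signed int v; int32_t s[4]; } u;
        u.v = vec_sums(v, vec_splat_s32(0)); /* saturated sum lands in element 3 */
        return u.s[3];
    }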
| @@ -25,14 +25,14 @@ | |||
| #if defined(ARCH_POWERPC_405) | |||
| /* signed 16x16 -> 32 multiply add accumulate */ | |||
| # define MAC16(rt, ra, rb) \ | |||
| asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); | |||
| #define MAC16(rt, ra, rb) \ | |||
| asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); | |||
| /* signed 16x16 -> 32 multiply */ | |||
| # define MUL16(ra, rb) \ | |||
| ({ int __rt; \ | |||
| asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ | |||
| __rt; }) | |||
| #define MUL16(ra, rb) \ | |||
| ({ int __rt; \ | |||
| asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ | |||
| __rt; }) | |||
| #endif | |||
| #endif /* FFMPEG_PPC_MATHOPS_H */ | |||
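MAC16 and MUL16 wrap the PowerPC 405 maclhw/mullhw instructions, a signed 16x16 -> 32 multiply-accumulate and multiply on the low halfwords of their operands. A plain-C sketch of the intended semantics, valid for operands that fit in 16 bits (the helper names are illustrative, not part of FFmpeg):

    #include <stdint.h>

    static inline int32_t mul16_ref(int16_t a, int16_t b)
    {
        return (int32_t)a * b;            /* what MUL16(a, b) evaluates to */
    }

    static inline void mac16_ref(int32_t *acc, int16_t a, int16_t b)
    {
        *acc += (int32_t)a * b;           /* what MAC16(*acc, a, b) does */
    }

    /* Typical use: a small dot product of 16-bit samples and coefficients. */
    static int32_t dot16_ref(const int16_t *x, const int16_t *c, int n)
    {
        int32_t acc = 0;
        for (int i = 0; i < n; i++)
            mac16_ref(&acc, x[i], c[i]);
        return acc;
    }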
| @@ -41,15 +41,15 @@ do { \ | |||
| // transposes a matrix consisting of four vectors with four elements each | |||
| #define TRANSPOSE4(a,b,c,d) \ | |||
| do { \ | |||
| __typeof__(a) _trans_ach = vec_mergeh(a, c); \ | |||
| __typeof__(a) _trans_acl = vec_mergel(a, c); \ | |||
| __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ | |||
| __typeof__(a) _trans_bdl = vec_mergel(b, d); \ | |||
| \ | |||
| a = vec_mergeh(_trans_ach, _trans_bdh); \ | |||
| b = vec_mergel(_trans_ach, _trans_bdh); \ | |||
| c = vec_mergeh(_trans_acl, _trans_bdl); \ | |||
| d = vec_mergel(_trans_acl, _trans_bdl); \ | |||
| __typeof__(a) _trans_ach = vec_mergeh(a, c); \ | |||
| __typeof__(a) _trans_acl = vec_mergel(a, c); \ | |||
| __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ | |||
| __typeof__(a) _trans_bdl = vec_mergel(b, d); \ | |||
| \ | |||
| a = vec_mergeh(_trans_ach, _trans_bdh); \ | |||
| b = vec_mergel(_trans_ach, _trans_bdh); \ | |||
| c = vec_mergeh(_trans_acl, _trans_bdl); \ | |||
| d = vec_mergel(_trans_acl, _trans_bdl); \ | |||
| } while (0) | |||
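TRANSPOSE4 transposes the 4x4 matrix of 32-bit elements held in four vectors with two rounds of vec_mergeh/vec_mergel interleaving: the first round interleaves rows 0/2 and 1/3, the second interleaves those intermediates so each output vector ends up holding one column of the input. A plain-C illustration of that element movement (not FFmpeg code):

    #include <stdint.h>

    static void transpose4_ref(int32_t m[4][4])
    {
        int32_t ach[4], acl[4], bdh[4], bdl[4], out[4][4];

        /* Round 1: mergeh/mergel of rows 0 and 2, then of rows 1 and 3. */
        for (int i = 0; i < 2; i++) {
            ach[2*i] = m[0][i];     ach[2*i + 1] = m[2][i];
            acl[2*i] = m[0][i + 2]; acl[2*i + 1] = m[2][i + 2];
            bdh[2*i] = m[1][i];     bdh[2*i + 1] = m[3][i];
            bdl[2*i] = m[1][i + 2]; bdl[2*i + 1] = m[3][i + 2];
        }
        /* Round 2: interleave the intermediates; row r of out is column r of m. */
        for (int i = 0; i < 2; i++) {
            out[0][2*i] = ach[i];     out[0][2*i + 1] = bdh[i];
            out[1][2*i] = ach[i + 2]; out[1][2*i + 1] = bdh[i + 2];
            out[2][2*i] = acl[i];     out[2][2*i + 1] = bdl[i];
            out[3][2*i] = acl[i + 2]; out[3][2*i + 1] = bdl[i + 2];
        }
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                m[r][c] = out[r][c];
    }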
| @@ -58,19 +58,19 @@ do { \ | |||
| // target address is four-byte aligned (which should always be the case). | |||
| #define LOAD4(vec, address) \ | |||
| { \ | |||
| __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ | |||
| vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ | |||
| vec = vec_ld(0, _load_addr); \ | |||
| vec = vec_perm(vec, vec, _perm_vec); \ | |||
| vec = vec_splat(vec, 0); \ | |||
| __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ | |||
| vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ | |||
| vec = vec_ld(0, _load_addr); \ | |||
| vec = vec_perm(vec, vec, _perm_vec); \ | |||
| vec = vec_splat(vec, 0); \ | |||
| } | |||
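LOAD4 fetches one element through a four-byte-aligned pointer and splats it across all lanes: vec_ld brings in the aligned 16 bytes around the address, the vec_lvsl permute rotates the wanted element down to lane 0, and vec_splat copies it everywhere. The same idea as a small helper (illustrative, not FFmpeg API):

    #include <altivec.h>

    static vector signed int splat_from_ptr(const int *addr) /* addr must be 4-byte aligned */
    {
        vector signed int    v = vec_ld(0, addr);      /* aligned 16-byte load          */
        vector unsigned char p = vec_lvsl(0, addr);    /* rotation from addr & 15       */
        v = vec_perm(v, v, p);                         /* wanted element now in lane 0  */
        return vec_splat(v, 0);                        /* broadcast lane 0 to all lanes */
    }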
| #define FOUROF(a) AVV(a,a,a,a) | |||
| int dct_quantize_altivec(MpegEncContext* s, | |||
| DCTELEM* data, int n, | |||
| int qscale, int* overflow) | |||
| DCTELEM* data, int n, | |||
| int qscale, int* overflow) | |||
| { | |||
| int lastNonZero; | |||
| vector float row0, row1, row2, row3, row4, row5, row6, row7; | |||
| @@ -137,10 +137,8 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| int whichPass, whichHalf; | |||
| for(whichPass = 1; whichPass<=2; whichPass++) | |||
| { | |||
| for(whichHalf = 1; whichHalf<=2; whichHalf++) | |||
| { | |||
| for(whichPass = 1; whichPass<=2; whichPass++) { | |||
| for(whichHalf = 1; whichHalf<=2; whichHalf++) { | |||
| vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |||
| vector float tmp10, tmp11, tmp12, tmp13; | |||
| vector float z1, z2, z3, z4, z5; | |||
| @@ -235,8 +233,7 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| SWAP(row7, alt7); | |||
| } | |||
| if (whichPass == 1) | |||
| { | |||
| if (whichPass == 1) { | |||
| // transpose the data for the second pass | |||
| // First, block transpose the upper right with lower left. | |||
| @@ -261,8 +258,7 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| const vector signed int* qmat; | |||
| vector float bias, negBias; | |||
| if (s->mb_intra) | |||
| { | |||
| if (s->mb_intra) { | |||
| vector signed int baseVector; | |||
| // We must cache element 0 in the intra case | |||
| @@ -272,9 +268,7 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| qmat = (vector signed int*)s->q_intra_matrix[qscale]; | |||
| biasAddr = &(s->intra_quant_bias); | |||
| } | |||
| else | |||
| { | |||
| } else { | |||
| qmat = (vector signed int*)s->q_inter_matrix[qscale]; | |||
| biasAddr = &(s->inter_quant_bias); | |||
| } | |||
| @@ -439,8 +433,7 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| // and handle it using the vector unit if we can. This is the permute used | |||
| // by the altivec idct, so it is common when using the altivec dct. | |||
| if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) | |||
| { | |||
| if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { | |||
| TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); | |||
| } | |||
| @@ -456,10 +449,8 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| } | |||
| // special handling of block[0] | |||
| if (s->mb_intra) | |||
| { | |||
| if (!s->h263_aic) | |||
| { | |||
| if (s->mb_intra) { | |||
| if (!s->h263_aic) { | |||
| if (n < 4) | |||
| oldBaseValue /= s->y_dc_scale; | |||
| else | |||
| @@ -474,8 +465,7 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| // need to permute the "no" permutation case. | |||
| if ((lastNonZero > 0) && | |||
| (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && | |||
| (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) | |||
| { | |||
| (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { | |||
| ff_block_permute(data, s->dsp.idct_permutation, | |||
| s->intra_scantable.scantable, lastNonZero); | |||
| } | |||
| @@ -483,10 +473,8 @@ int dct_quantize_altivec(MpegEncContext* s, | |||
| return lastNonZero; | |||
| } | |||
| /* | |||
| AltiVec version of dct_unquantize_h263 | |||
| this code assumes `block' is 16-byte aligned | |||
| */ | |||
| /* AltiVec version of dct_unquantize_h263 | |||
| this code assumes `block' is 16-byte aligned */ | |||
| void dct_unquantize_h263_altivec(MpegEncContext *s, | |||
| DCTELEM *block, int n, int qscale) | |||
| { | |||
| @@ -517,82 +505,81 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); | |||
| } | |||
| { | |||
| register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); | |||
| DECLARE_ALIGNED_16(short, qmul8[]) = | |||
| { | |||
| qmul, qmul, qmul, qmul, | |||
| qmul, qmul, qmul, qmul | |||
| }; | |||
| DECLARE_ALIGNED_16(short, qadd8[]) = | |||
| { | |||
| qadd, qadd, qadd, qadd, | |||
| qadd, qadd, qadd, qadd | |||
| }; | |||
| DECLARE_ALIGNED_16(short, nqadd8[]) = | |||
| { | |||
| -qadd, -qadd, -qadd, -qadd, | |||
| -qadd, -qadd, -qadd, -qadd | |||
| }; | |||
| register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; | |||
| register vector bool short blockv_null, blockv_neg; | |||
| register short backup_0 = block[0]; | |||
| register int j = 0; | |||
| qmulv = vec_ld(0, qmul8); | |||
| qaddv = vec_ld(0, qadd8); | |||
| nqaddv = vec_ld(0, nqadd8); | |||
| #if 0 // block *is* 16-byte aligned, it seems. | |||
| // first make sure block[j] is 16-byte aligned | |||
| for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { | |||
| level = block[j]; | |||
| if (level) { | |||
| if (level < 0) { | |||
| level = level * qmul - qadd; | |||
| } else { | |||
| level = level * qmul + qadd; | |||
| register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); | |||
| DECLARE_ALIGNED_16(short, qmul8[]) = | |||
| { | |||
| qmul, qmul, qmul, qmul, | |||
| qmul, qmul, qmul, qmul | |||
| }; | |||
| DECLARE_ALIGNED_16(short, qadd8[]) = | |||
| { | |||
| qadd, qadd, qadd, qadd, | |||
| qadd, qadd, qadd, qadd | |||
| }; | |||
| DECLARE_ALIGNED_16(short, nqadd8[]) = | |||
| { | |||
| -qadd, -qadd, -qadd, -qadd, | |||
| -qadd, -qadd, -qadd, -qadd | |||
| }; | |||
| register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; | |||
| register vector bool short blockv_null, blockv_neg; | |||
| register short backup_0 = block[0]; | |||
| register int j = 0; | |||
| qmulv = vec_ld(0, qmul8); | |||
| qaddv = vec_ld(0, qadd8); | |||
| nqaddv = vec_ld(0, nqadd8); | |||
| #if 0 // block *is* 16-byte aligned, it seems. | |||
| // first make sure block[j] is 16-byte aligned | |||
| for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { | |||
| level = block[j]; | |||
| if (level) { | |||
| if (level < 0) { | |||
| level = level * qmul - qadd; | |||
| } else { | |||
| level = level * qmul + qadd; | |||
| } | |||
| block[j] = level; | |||
| } | |||
| block[j] = level; | |||
| } | |||
| } | |||
| #endif | |||
| // vectorize all the 16-byte aligned blocks | |||
| // of 8 elements | |||
| for(; (j + 7) <= nCoeffs ; j+=8) | |||
| { | |||
| blockv = vec_ld(j << 1, block); | |||
| blockv_neg = vec_cmplt(blockv, vczero); | |||
| blockv_null = vec_cmpeq(blockv, vczero); | |||
| // choose between +qadd or -qadd as the third operand | |||
| temp1 = vec_sel(qaddv, nqaddv, blockv_neg); | |||
| // multiply & add (block{i,i+7} * qmul [+-] qadd) | |||
| temp1 = vec_mladd(blockv, qmulv, temp1); | |||
| // put 0 where block{i,i+7} used to have 0 | |||
| blockv = vec_sel(temp1, blockv, blockv_null); | |||
| vec_st(blockv, j << 1, block); | |||
| } | |||
| // if nCoeffs isn't a multiple of 8, finish the job | |||
| // using good old scalar units. | |||
| // (we could do it using a truncated vector, | |||
| // but I'm not sure it's worth the hassle) | |||
| for(; j <= nCoeffs ; j++) { | |||
| level = block[j]; | |||
| if (level) { | |||
| if (level < 0) { | |||
| level = level * qmul - qadd; | |||
| } else { | |||
| level = level * qmul + qadd; | |||
| // vectorize all the 16-byte aligned blocks | |||
| // of 8 elements | |||
| for(; (j + 7) <= nCoeffs ; j+=8) { | |||
| blockv = vec_ld(j << 1, block); | |||
| blockv_neg = vec_cmplt(blockv, vczero); | |||
| blockv_null = vec_cmpeq(blockv, vczero); | |||
| // choose between +qadd or -qadd as the third operand | |||
| temp1 = vec_sel(qaddv, nqaddv, blockv_neg); | |||
| // multiply & add (block{i,i+7} * qmul [+-] qadd) | |||
| temp1 = vec_mladd(blockv, qmulv, temp1); | |||
| // put 0 where block{i,i+7} used to have 0 | |||
| blockv = vec_sel(temp1, blockv, blockv_null); | |||
| vec_st(blockv, j << 1, block); | |||
| } | |||
| // if nCoeffs isn't a multiple of 8, finish the job | |||
| // using good old scalar units. | |||
| // (we could do it using a truncated vector, | |||
| // but I'm not sure it's worth the hassle) | |||
| for(; j <= nCoeffs ; j++) { | |||
| level = block[j]; | |||
| if (level) { | |||
| if (level < 0) { | |||
| level = level * qmul - qadd; | |||
| } else { | |||
| level = level * qmul + qadd; | |||
| } | |||
| block[j] = level; | |||
| } | |||
| block[j] = level; | |||
| } | |||
| } | |||
| if (i == 1) | |||
| { // cheat. this avoids special-casing the first iteration | |||
| block[0] = backup_0; | |||
| } | |||
| if (i == 1) { | |||
| // cheat. this avoids special-casing the first iteration | |||
| block[0] = backup_0; | |||
| } | |||
| } | |||
| POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); | |||
| } | |||
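The vectorized loop above relies on two vec_sel selections to avoid branches: one picks +qadd or -qadd per coefficient according to its sign, and one restores the coefficients that were zero after the multiply-add. A self-contained sketch of a single eight-coefficient step, with the splatted qmul/qadd/-qadd vectors passed in pre-built (the real code builds them from 16-byte aligned arrays; the function name is illustrative):

    #include <altivec.h>

    static void unquantize8_sketch(short *block8,              /* 16-byte aligned     */
                                   vector signed short qmulv,  /*  qmul in every lane */
                                   vector signed short qaddv,  /* +qadd in every lane */
                                   vector signed short nqaddv) /* -qadd in every lane */
    {
        const vector signed short vzero = vec_splat_s16(0);
        vector signed short lev  = vec_ld(0, block8);
        vector bool short   neg  = vec_cmplt(lev, vzero);
        vector bool short   zero = vec_cmpeq(lev, vzero);
        vector signed short add  = vec_sel(qaddv, nqaddv, neg); /* +qadd or -qadd    */
        vector signed short out  = vec_mladd(lev, qmulv, add);  /* lev*qmul +/- qadd */
        out = vec_sel(out, lev, zero);                          /* keep zeros at 0   */
        vec_st(out, 0, block8);
    }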
| @@ -605,11 +592,9 @@ void MPV_common_init_altivec(MpegEncContext *s) | |||
| { | |||
| if ((mm_flags & MM_ALTIVEC) == 0) return; | |||
| if (s->avctx->lowres==0) | |||
| { | |||
| if (s->avctx->lowres==0) { | |||
| if ((s->avctx->idct_algo == FF_IDCT_AUTO) || | |||
| (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) | |||
| { | |||
| (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) { | |||
| s->dsp.idct_put = idct_put_altivec; | |||
| s->dsp.idct_add = idct_add_altivec; | |||
| s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |||
| @@ -618,15 +603,13 @@ void MPV_common_init_altivec(MpegEncContext *s) | |||
| // Test to make sure that the alignments required by the DCT are met. | |||
| if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || | |||
| (((long)(s->q_inter_matrix) & 0x0f) != 0)) | |||
| { | |||
| (((long)(s->q_inter_matrix) & 0x0f) != 0)) { | |||
| av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " | |||
| "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); | |||
| return; | |||
| } | |||
| if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) | |||
| { | |||
| if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) { | |||
| av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " | |||
| "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); | |||
| return; | |||
| @@ -634,8 +617,7 @@ void MPV_common_init_altivec(MpegEncContext *s) | |||
| if ((s->avctx->dct_algo == FF_DCT_AUTO) || | |||
| (s->avctx->dct_algo == FF_DCT_ALTIVEC)) | |||
| { | |||
| (s->avctx->dct_algo == FF_DCT_ALTIVEC)) { | |||
| #if 0 /* seems to cause trouble under some circumstances */ | |||
| s->dct_quantize = dct_quantize_altivec; | |||
| #endif | |||
| @@ -379,8 +379,7 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, | |||
| v4=(vector signed int *)b4; | |||
| v5=(vector signed int *)b5; | |||
| for (i=0; i< w4;i++) | |||
| { | |||
| for (i=0; i< w4;i++) { | |||
| #if 0 | |||
| b4[i] -= (3*(b3[i] + b5[i])+4)>>3; | |||
| @@ -782,8 +781,8 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, | |||
| void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| #if 0 | |||
| c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; | |||
| c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; | |||
| c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; | |||
| c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; | |||
| c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; | |||
| c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; | |||
| #endif | |||
| } | |||