15% faster vorbis.

Originally committed as revision 5975 to svn://svn.ffmpeg.org/ffmpeg/trunk
@@ -3753,6 +3753,39 @@ WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WARPER8_16_SQ(rd8x8_c, rd16_c)
 WARPER8_16_SQ(bit8x8_c, bit16_c)
 
+static void vector_fmul_c(float *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] *= src[i];
+}
+
+static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = src0[i] * src1[-i];
+}
+
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
+}
+
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i++) {
+        int_fast32_t tmp = ((int32_t*)src)[i];
+        if(tmp & 0xf0000){
+            tmp = (0x43c0ffff - tmp)>>31;
+            // is this faster on some gcc/cpu combinations?
+            // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
+            // else                 tmp = 0;
+        }
+        dst[i] = tmp - 0x8000;
+    }
+}
+
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
    converted */
 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
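
How the constants in ff_float_to_int16_c work: the decoder biases every sample into [384.0, 386.0], and in IEEE 754 single precision one 16-bit PCM step is exactly one mantissa ulp at that magnitude (385.0 is 0x43C08000), so the sample can be read straight out of the low 16 bits of the float's bit pattern. The following standalone sketch is not part of the patch; it only checks that identity for the in-range path:

/* Standalone sketch (not part of the patch): verify that for any sample
 * v in [-32768, 32767], the bit pattern of 385.0f + v/32768.0f is exactly
 * 0x43C08000 + v, so the in-range path of ff_float_to_int16_c ("load the
 * bits, subtract 0x8000, keep the low 16 bits") recovers v exactly. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t float_bits(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof(u));  /* well-defined way to read the bit pattern */
    return u;
}

int main(void)
{
    int v;
    for (v = -32768; v <= 32767; v++) {
        float f = 385.0f + v / 32768.0f;        /* biased sample in [384.0, 386.0) */
        uint32_t bits = float_bits(f);
        int16_t out = (int16_t)(bits - 0x8000); /* what the C converter computes in range */
        if (bits != 0x43C08000u + (uint32_t)v || out != v) {
            printf("mismatch at v=%d: bits=0x%08X out=%d\n", v, (unsigned)bits, out);
            return 1;
        }
    }
    printf("all 65536 biased samples round-trip exactly\n");
    return 0;
}
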
@@ -4096,6 +4129,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
 #ifdef CONFIG_VORBIS_DECODER
     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
 #endif
+    c->vector_fmul = vector_fmul_c;
+    c->vector_fmul_reverse = vector_fmul_reverse_c;
+    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
+    c->float_to_int16 = ff_float_to_int16_c;
 
     c->shrink[0]= ff_img_copy_plane;
     c->shrink[1]= ff_shrink22;
@@ -58,6 +58,10 @@ void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
 
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
+                              const float *src2, int src3, int blocksize, int step);
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len);
+
 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
@@ -307,7 +311,17 @@ typedef struct DSPContext {
     void (*h261_loop_filter)(uint8_t *src, int stride);
 
+    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+    void (*vector_fmul)(float *dst, const float *src, int len);
+    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
+    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
+    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
+    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
+    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
+     * asm versions: convert floats from [-32768.0,32767.0] without rescaling */
+    void (*float_to_int16)(int16_t *dst, const float *src, int len);
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
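
For readers unfamiliar with dsputil: these function pointers are filled in once by dsputil_init() and then called by decoders, subject to the length and 16-byte alignment constraints noted in the comments above. A minimal, hypothetical caller could look like the sketch below; the buffer size, the GCC aligned attribute and the wrapper name are illustrative assumptions, not taken from the patch:

/* Hypothetical caller (not part of the patch): exercise the new float
 * vector ops through DSPContext. */
#include "dsputil.h"   /* libavcodec-internal: DSPContext, dsputil_init() */

#define N 256  /* a multiple of 8, as required by the comments above */

static void float_dsp_example(AVCodecContext *avctx)
{
    DSPContext dsp;
    /* the SSE/SSE2 versions require 16-byte aligned arrays */
    static float a[N]     __attribute__((aligned(16)));
    static float b[N]     __attribute__((aligned(16)));
    static float c[N]     __attribute__((aligned(16)));
    static float out[N]   __attribute__((aligned(16)));
    static int16_t pcm[N] __attribute__((aligned(16)));

    dsputil_init(&dsp, avctx);  /* selects the C, 3DNow!, SSE or SSE2 versions */

    dsp.vector_fmul(a, b, N);                        /* a[i] *= b[i]            */
    dsp.vector_fmul_reverse(out, a, b, N);           /* out[i] = a[i]*b[N-1-i]  */
    dsp.vector_fmul_add_add(out, a, b, c, 0, N, 1);  /* out[i] = a[i]*b[i]+c[i] */
    dsp.float_to_int16(pcm, out, N);  /* expected input range depends on which
                                         version was selected, see above */
}
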
@@ -2772,6 +2772,198 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
     }
 }
 
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+    long i;
+    len >>= 1;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movq  %0, %%mm0 \n\t"
+            "pfmul %1, %%mm0 \n\t"
+            "movq  %%mm0, %0 \n\t"
+            :"+m"(dst[i*2])
+            :"m"(src[i*2])
+            :"memory"
+        );
+    }
+    asm volatile("femms");
+}
+
+static void vector_fmul_sse(float *dst, const float *src, int len){
+    long i;
+    len >>= 2;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movaps %0, %%xmm0 \n\t"
+            "mulps  %1, %%xmm0 \n\t"
+            "movaps %%xmm0, %0 \n\t"
+            :"+m"(dst[i*4])
+            :"m"(src[i*4])
+            :"memory"
+        );
+    }
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-16;
+    asm volatile(
+        "1: \n\t"
+        "pswapd  8(%1), %%mm0 \n\t"
+        "pswapd   (%1), %%mm1 \n\t"
+        "pfmul  (%3,%0), %%mm0 \n\t"
+        "pfmul 8(%3,%0), %%mm1 \n\t"
+        "movq %%mm0,  (%2,%0) \n\t"
+        "movq %%mm1, 8(%2,%0) \n\t"
+        "add $16, %1 \n\t"
+        "sub $16, %0 \n\t"
+        "jge 1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+    asm volatile("femms");
+}
+
+static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-32;
+    asm volatile(
+        "1: \n\t"
+        "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
+        "pshufd $0x1b,   (%1), %%xmm1 \n\t"
+        "mulps  (%3,%0), %%xmm0 \n\t"
+        "mulps 16(%3,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%2,%0) \n\t"
+        "movaps %%xmm1, 16(%2,%0) \n\t"
+        "add $32, %1 \n\t"
+        "sub $32, %0 \n\t"
+        "jge 1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+                                      const float *src2, int src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-4)*4;
+        dst += (len-4)*2;
+        asm volatile(
+            "1: \n\t"
+            "movq   (%2,%0), %%mm0 \n\t"
+            "movq  8(%2,%0), %%mm1 \n\t"
+            "pfmul  (%3,%0), %%mm0 \n\t"
+            "pfmul 8(%3,%0), %%mm1 \n\t"
+            "pfadd  (%4,%0), %%mm0 \n\t"
+            "pfadd 8(%4,%0), %%mm1 \n\t"
+            "movd %%mm0,   (%1) \n\t"
+            "movd %%mm1, 16(%1) \n\t"
+            "psrlq $32, %%mm0 \n\t"
+            "psrlq $32, %%mm1 \n\t"
+            "movd %%mm0,  8(%1) \n\t"
+            "movd %%mm1, 24(%1) \n\t"
+            "sub $32, %1 \n\t"
+            "sub $16, %0 \n\t"
+            "jge 1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=2){
+            asm volatile(
+                "movq  %1, %%mm0 \n\t"
+                "pfmul %2, %%mm0 \n\t"
+                "pfadd %3, %%mm0 \n\t"
+                "movq  %%mm0, %0 \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+    asm volatile("femms");
+}
+
+static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1,
+                                     const float *src2, int src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-8)*4;
+        dst += (len-8)*2;
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movd %%xmm0,   (%1) \n\t"
+            "movd %%xmm1, 32(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd %%xmm0,  8(%1) \n\t"
+            "movd %%xmm1, 40(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd %%xmm0, 16(%1) \n\t"
+            "movd %%xmm1, 48(%1) \n\t"
+            "psrldq $4, %%xmm0 \n\t"
+            "psrldq $4, %%xmm1 \n\t"
+            "movd %%xmm0, 24(%1) \n\t"
+            "movd %%xmm1, 56(%1) \n\t"
+            "sub $64, %1 \n\t"
+            "sub $32, %0 \n\t"
+            "jge 1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=4){
+            asm volatile(
+                "movaps %1, %%xmm0 \n\t"
+                "mulps  %2, %%xmm0 \n\t"
+                "addps  %3, %%xmm0 \n\t"
+                "movaps %%xmm0, %0 \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "pf2id %1, %%mm0 \n\t"
+            "pf2id %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq %%mm0, %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("femms");
+}
+
+void float_to_int16_sse(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "cvtps2pi %1, %%mm0 \n\t"
+            "cvtps2pi %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq %%mm0, %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("emms");
+}
+
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
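
A plain-C sketch (not code from the patch) of what the two asm converters above compute: they expect input already scaled to [-32768.0, 32767.0] (the exp_bias path added to the Vorbis decoder below), convert using hardware rounding — cvtps2pi rounds to nearest, while pf2id truncates, which is why the 3DNow! version is only installed when CODEC_FLAG_BITEXACT is unset in the dsputil_init_mmx hunk just below — and saturate like packssdw:

/* Reference sketch (not part of the patch): scalar equivalent of
 * float_to_int16_sse, assuming samples were pre-scaled to
 * [-32768.0, 32767.0]. lrintf models cvtps2pi's round-to-nearest
 * (default MXCSR mode); the explicit clamp models packssdw saturation. */
#include <math.h>
#include <stdint.h>

static void float_to_int16_ref(int16_t *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);     /* round to nearest, like cvtps2pi */
        if (v >  32767) v =  32767;  /* saturate, like packssdw */
        if (v < -32768) v = -32768;
        dst[i] = (int16_t)v;
    }
}
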
@@ -3199,10 +3391,25 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
 #endif
 
-        if(mm_flags & MM_SSE)
-            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-        else if(mm_flags & MM_3DNOW)
+        if(mm_flags & MM_3DNOW){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+            c->vector_fmul = vector_fmul_3dnow;
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+                c->float_to_int16 = float_to_int16_3dnow;
+        }
+        if(mm_flags & MM_3DNOWEXT)
+            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+        if(mm_flags & MM_SSE){
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+            c->vector_fmul = vector_fmul_sse;
+            c->float_to_int16 = float_to_int16_sse;
+        }
+        if(mm_flags & MM_SSE2){
+            c->vector_fmul_reverse = vector_fmul_reverse_sse2;
+            c->vector_fmul_add_add = vector_fmul_add_add_sse2;
+        }
+        if(mm_flags & MM_3DNOW)
+            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2
     }
 
 #ifdef CONFIG_ENCODERS
@@ -192,6 +192,11 @@ static void vorbis_free(vorbis_context *vc) {
         av_free(vc->mappings[i].mux);
     }
     av_freep(&vc->mappings);
+
+    if(vc->exp_bias){
+        av_freep(&vc->swin);
+        av_freep(&vc->lwin);
+    }
 }
 
 // Parse setup header -------------------------------------------------
@@ -888,6 +893,19 @@ static int vorbis_parse_id_hdr(vorbis_context *vc){
     vc->swin=vwin[bl0-6];
     vc->lwin=vwin[bl1-6];
 
+    if(vc->exp_bias){
+        int i;
+        float *win;
+        win = av_malloc(vc->blocksize_0/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_0/2; i++)
+            win[i] = vc->swin[i] * (1<<15);
+        vc->swin = win;
+        win = av_malloc(vc->blocksize_1/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_1/2; i++)
+            win[i] = vc->lwin[i] * (1<<15);
+        vc->lwin = win;
+    }
+
     if ((get_bits1(gb)) == 0) {
         av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
         return 2;
@@ -931,6 +949,14 @@ static int vorbis_decode_init(AVCodecContext *avccontext) {
     vc->avccontext = avccontext;
     dsputil_init(&vc->dsp, avccontext);
 
+    if(vc->dsp.float_to_int16 == ff_float_to_int16_c) {
+        vc->add_bias = 385;
+        vc->exp_bias = 0;
+    } else {
+        vc->add_bias = 0;
+        vc->exp_bias = 15<<23;
+    }
+
     if (!headers_len) {
         av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n");
         return -1;
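
The add_bias/exp_bias pair chosen above selects between two equivalent scalings: with the C converter, samples stay near ±1.0 and an additive bias of 385.0 moves them into the [384, 386] window it expects; with the asm converters, everything is instead scaled by 2^15 — the windows are pre-multiplied by 1<<15 in vorbis_parse_id_hdr, and unwindowed samples get 15<<23 added directly to their bit pattern, which raises the float exponent by 15. A small sketch (not from the patch) confirming that exponent arithmetic:

/* Sketch (not from the patch): adding 15<<23 to the bit pattern of a
 * normalized float raises its exponent by 15, i.e. multiplies the value
 * by 2^15 = 32768 without a floating-point multiply. Exact for any
 * normalized input whose exponent does not overflow; +/-0.0 becomes a
 * negligibly tiny value, which still converts to 0 afterwards. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float scale_by_exp_bias(float x, uint32_t exp_bias)
{
    uint32_t u;
    memcpy(&u, &x, sizeof(u));  /* well-defined access to the bit pattern */
    u += exp_bias;              /* exp_bias = 15<<23 targets the exponent field */
    memcpy(&x, &u, sizeof(x));
    return x;
}

int main(void)
{
    const uint32_t exp_bias = 15 << 23;
    const float samples[3] = { 1.0f, -0.5f, 0.123f };
    int i;
    for (i = 0; i < 3; i++)
        printf("%g -> %g (expected %g)\n", samples[i],
               scale_by_exp_bias(samples[i], exp_bias),
               samples[i] * 32768.0f);
    return 0;
}
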
@@ -1472,7 +1498,6 @@ void vorbis_inverse_coupling(float *mag, float *ang, int blocksize)
     }
 }
 
-#define BIAS 385
 // Decode the audio packet using the functions above
 
 static int vorbis_parse_audio_packet(vorbis_context *vc) {
@@ -1490,6 +1515,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
     uint_fast8_t res_num=0;
     int_fast16_t retlen=0;
     uint_fast16_t saved_start=0;
+    float fadd_bias = vc->add_bias;
 
     if (get_bits1(gb)) {
         av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n");
@@ -1576,10 +1602,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
     for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) {
         ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2;
 
-        for(i=0;i<blocksize/2;++i) {
-            ch_floor_ptr[i]*=ch_res_ptr[i]; //FPMATH
-        }
-
+        vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize/2);
     }
 
 // MDCT, overlap/add, save data for next overlapping FPMATH
@@ -1600,59 +1623,54 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
         vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
 
+        //FIXME process channels together, to allow faster simd vector_fmul_add_add?
         if (vc->modes[mode_number].blockflag) {
             // -- overlap/add
             if (previous_window) {
-                for(k=j, i=0;i<vc->blocksize_1/2;++i, k+=step) {
-                    ret[k]=saved[i]+buf[i]*lwin[i]+BIAS;
-                }
+                vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize_1/2, step);
                 retlen=vc->blocksize_1/2;
             } else {
-                buf += (vc->blocksize_1-vc->blocksize_0)/4;
-                for(k=j, i=0;i<vc->blocksize_0/2;++i, k+=step) {
-                    ret[k]=saved[i]+buf[i]*swin[i]+BIAS;
-                }
+                int len = (vc->blocksize_1-vc->blocksize_0)/4;
+                buf += len;
+                vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize_0/2, step);
+                k = vc->blocksize_0/2*step + j;
                 buf += vc->blocksize_0/2;
-                for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) {
-                    ret[k]=buf[i]+BIAS;
+                if(vc->exp_bias){
+                    for(i=0; i<len; i++, k+=step)
+                        ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k]=buf[i]*(1<<bias)
+                } else {
+                    for(i=0; i<len; i++, k+=step)
+                        ret[k] = buf[i] + fadd_bias;
                 }
                 buf=vc->buf;
-                retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4;
+                retlen=vc->blocksize_0/2+len;
             }
             // -- save
             if (next_window) {
                 buf += vc->blocksize_1/2;
-                lwin += vc->blocksize_1/2-1;
-                for(i=0;i<vc->blocksize_1/2;++i) {
-                    saved[i]=buf[i]*lwin[-i];
-                }
+                vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize_1/2);
                 saved_start=0;
             } else {
                 saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
                 buf += vc->blocksize_1/2;
-                for(i=0;i<saved_start;++i) {
-                    saved[i]=buf[i];
-                }
-                swin += vc->blocksize_0/2-1;
-                for(i=0;i<vc->blocksize_0/2;++i) {
-                    saved[saved_start+i]=buf[saved_start+i]*swin[-i];
-                }
+                for(i=0; i<saved_start; i++)
+                    ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;
+                vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize_0/2);
             }
         } else {
            // --overlap/add
-            for(k=j, i=0;i<saved_start;++i, k+=step) {
-                ret[k]=saved[i]+BIAS;
-            }
-            for(i=0;i<vc->blocksize_0/2;++i, k+=step) {
-                ret[k]=saved[saved_start+i]+buf[i]*swin[i]+BIAS;
+            if(vc->add_bias) {
+                for(k=j, i=0;i<saved_start;++i, k+=step)
+                    ret[k] = saved[i] + fadd_bias;
+            } else {
+                for(k=j, i=0;i<saved_start;++i, k+=step)
+                    ret[k] = saved[i];
             }
+            vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize_0/2, step);
             retlen=saved_start+vc->blocksize_0/2;
             // -- save
             buf += vc->blocksize_0/2;
-            swin += vc->blocksize_0/2-1;
-            for(i=0;i<vc->blocksize_0/2;++i) {
-                saved[i]=buf[i]*swin[-i];
-            }
+            vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize_0/2);
             saved_start=0;
         }
     }
@@ -1695,16 +1713,7 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
     AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
 
-    for(i=0;i<len;++i) {
-        int_fast32_t tmp= ((int32_t*)vc->ret)[i];
-        if(tmp & 0xf0000){
-//          tmp= (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
-            if(tmp > 0x43c0ffff) tmp= 0xFFFF;
-            else tmp= 0;
-        }
-        ((int16_t*)data)[i]=tmp - 0x8000;
-    }
+    vc->dsp.float_to_int16(data, vc->ret, len);
 
     *data_size=len*2;
 
     return buf_size ;
@@ -120,6 +120,8 @@ typedef struct vorbis_context_s {
     float *ret;
     float *buf;
    float *buf_tmp;
+    uint_fast32_t add_bias; // for float->int conversion
+    uint_fast32_t exp_bias;
 
 } vorbis_context;