| @@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2, | |||||
| } | } | ||||
| } | } | ||||
/**
 * Scalar reference implementation of the interleaving float butterfly.
 *
 * For every index i in [0, len) the sum src0[i] + src1[i] is written to
 * dst[2*i] and the difference src0[i] - src1[i] to dst[2*i + 1], so dst
 * receives 2 * len floats in total.
 *
 * @param dst  output vector; must have room for 2 * len floats
 * @param src0 first input vector (len floats are read)
 * @param src1 second input vector (len floats are read)
 * @param len  number of elements read from each input; nothing is written
 *             when len <= 0
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int i;

    /* Walk dst with a pointer instead of indexing it with 2*i: the doubled
     * int index overflows (undefined behavior) once len exceeds INT_MAX/2. */
    for (i = 0; i < len; i++) {
        /* Read both inputs before storing, exactly as the indexed version
         * did, so the order of loads and stores is unchanged. */
        const float a = src0[i];
        const float b = src1[i];

        *dst++ = a + b;
        *dst++ = a - b;
    }
}
| static float scalarproduct_float_c(const float *v1, const float *v2, int len) | static float scalarproduct_float_c(const float *v1, const float *v2, int len) | ||||
| { | { | ||||
| float p = 0.0; | float p = 0.0; | ||||
| @@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->vector_clip_int32 = vector_clip_int32_c; | c->vector_clip_int32 = vector_clip_int32_c; | ||||
| c->scalarproduct_float = scalarproduct_float_c; | c->scalarproduct_float = scalarproduct_float_c; | ||||
| c->butterflies_float = butterflies_float_c; | c->butterflies_float = butterflies_float_c; | ||||
| c->butterflies_float_interleave = butterflies_float_interleave_c; | |||||
| c->vector_fmul_scalar = vector_fmul_scalar_c; | c->vector_fmul_scalar = vector_fmul_scalar_c; | ||||
| c->vector_fmac_scalar = vector_fmac_scalar_c; | c->vector_fmac_scalar = vector_fmac_scalar_c; | ||||
| @@ -453,6 +453,23 @@ typedef struct DSPContext { | |||||
| */ | */ | ||||
| void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); | void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); | ||||
| /** | |||||
| * Calculate the sum and difference of two vectors of floats and interleave | |||||
| * results into a separate output vector of floats, with each sum | |||||
| * positioned before the corresponding difference. | |||||
 * That is, for each i in [0, len): dst[2*i] = src0[i] + src1[i] and
 * dst[2*i + 1] = src0[i] - src1[i].
| * | |||||
| * @param dst output vector | |||||
| * constraints: 16-byte aligned | |||||
 * size: 2 * len floats are written
| * @param src0 first input vector | |||||
| * constraints: 32-byte aligned | |||||
| * @param src1 second input vector | |||||
| * constraints: 32-byte aligned | |||||
| * @param len number of elements in the input | |||||
 * (len elements are read from each of src0 and src1)
| * constraints: multiple of 8 | |||||
| */ | |||||
| void (*butterflies_float_interleave)(float *dst, const float *src0, | |||||
| const float *src1, int len); | |||||
| /* (I)DCT */ | /* (I)DCT */ | ||||
| void (*fdct)(DCTELEM *block/* align 16*/); | void (*fdct)(DCTELEM *block/* align 16*/); | ||||
| void (*fdct248)(DCTELEM *block/* align 16*/); | void (*fdct248)(DCTELEM *block/* align 16*/); | ||||
| @@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype, | |||||
| float *out) | float *out) | ||||
| { | { | ||||
| const ModeTab *mtab = tctx->mtab; | const ModeTab *mtab = tctx->mtab; | ||||
| int size1, size2; | |||||
| float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0]; | float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0]; | ||||
| int i, j; | |||||
| int i; | |||||
| for (i = 0; i < tctx->avctx->channels; i++) { | for (i = 0; i < tctx->avctx->channels; i++) { | ||||
| imdct_and_window(tctx, ftype, wtype, | imdct_and_window(tctx, ftype, wtype, | ||||
| @@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype, | |||||
| i); | i); | ||||
| } | } | ||||
| size2 = tctx->last_block_pos[0]; | |||||
| size1 = mtab->size - size2; | |||||
| if (tctx->avctx->channels == 2) { | if (tctx->avctx->channels == 2) { | ||||
| for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) { | |||||
| float f1 = prev_buf[ i]; | |||||
| float f2 = prev_buf[2*mtab->size + i]; | |||||
| out[2*i ] = f1 + f2; | |||||
| out[2*i + 1] = f1 - f2; | |||||
| } | |||||
| for (j = 0; i < mtab->size; j++,i++) { | |||||
| float f1 = tctx->curr_frame[ j]; | |||||
| float f2 = tctx->curr_frame[2*mtab->size + j]; | |||||
| out[2*i ] = f1 + f2; | |||||
| out[2*i + 1] = f1 - f2; | |||||
| } | |||||
| tctx->dsp.butterflies_float_interleave(out, prev_buf, | |||||
| &prev_buf[2*mtab->size], | |||||
| size1); | |||||
| out += 2 * size1; | |||||
| tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame, | |||||
| &tctx->curr_frame[2*mtab->size], | |||||
| size2); | |||||
| } else { | } else { | ||||
| memcpy(out, prev_buf, | |||||
| (mtab->size - tctx->last_block_pos[0]) * sizeof(*out)); | |||||
| memcpy(out, prev_buf, size1 * sizeof(*out)); | |||||
| out += mtab->size - tctx->last_block_pos[0]; | |||||
| out += size1; | |||||
| memcpy(out, tctx->curr_frame, | |||||
| (tctx->last_block_pos[0]) * sizeof(*out)); | |||||
| memcpy(out, tctx->curr_frame, size2 * sizeof(*out)); | |||||
| } | } | ||||
| } | } | ||||
| @@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min | |||||
| void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min, | void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min, | ||||
| int32_t max, unsigned int len); | int32_t max, unsigned int len); | ||||
/* SSE and AVX versions of butterflies_float_interleave; dsputil_init_mmx()
 * assigns them to c->butterflies_float_interleave. */
| extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0, | |||||
| const float *src1, int len); | |||||
| extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, | |||||
| const float *src1, int len); | |||||
| void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | ||||
| { | { | ||||
| int mm_flags = av_get_cpu_flags(); | int mm_flags = av_get_cpu_flags(); | ||||
| @@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| c->vector_clipf = vector_clipf_sse; | c->vector_clipf = vector_clipf_sse; | ||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| c->scalarproduct_float = ff_scalarproduct_float_sse; | c->scalarproduct_float = ff_scalarproduct_float_sse; | ||||
| c->butterflies_float_interleave = ff_butterflies_float_interleave_sse; | |||||
| #endif | #endif | ||||
| } | } | ||||
| if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) | if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) | ||||
| @@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; | c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; | ||||
| c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; | c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; | ||||
| } | } | ||||
| c->butterflies_float_interleave = ff_butterflies_float_interleave_avx; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0 | |||||
| %else | %else | ||||
| VECTOR_CLIP_INT32 6, 1, 0, 0 | VECTOR_CLIP_INT32 6, 1, 0, 0 | ||||
| %endif | %endif | ||||
| ;----------------------------------------------------------------------------- | |||||
| ; void ff_butterflies_float_interleave(float *dst, const float *src0, | |||||
| ; const float *src1, int len); | |||||
;
; Writes dst[2*i] = src0[i] + src1[i] and dst[2*i+1] = src0[i] - src1[i].
; Each loop iteration consumes mmsize bytes (mmsize/4 floats) from both inputs
; and stores 2*mmsize bytes; the loop only tests the offset after a full step,
; so len is expected to be a multiple of mmsize/4.
| ;----------------------------------------------------------------------------- | |||||
| %macro BUTTERFLIES_FLOAT_INTERLEAVE 0 | |||||
; Uses vector registers m0-m2 only.
| cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len | |||||
| %ifdef ARCH_X86_64 | |||||
; len is a 32-bit int argument: sign-extend it before using it as a 64-bit offset
| movsxd lenq, lend | |||||
| %endif | |||||
; len == 0: nothing to do
| test lenq, lenq | |||||
| jz .end | |||||
; convert the element count into a byte count (4 bytes per float)
| shl lenq, 2 | |||||
; point src0/src1/dst at the END of their buffers (dst is twice as long) ...
| lea src0q, [src0q + lenq] | |||||
| lea src1q, [src1q + lenq] | |||||
| lea dstq, [ dstq + 2*lenq] | |||||
; ... and index them with a negative byte offset that counts up towards zero
| neg lenq | |||||
| .loop: | |||||
| mova m0, [src0q + lenq] | |||||
| mova m1, [src1q + lenq] | |||||
; m2 = src0 - src1 (differences), m0 = src0 + src1 (sums)
| subps m2, m0, m1 | |||||
| addps m0, m0, m1 | |||||
; interleave sum/difference pairs: m1 gets the low pairs, m0 the high pairs
| unpcklps m1, m0, m2 | |||||
| unpckhps m0, m0, m2 | |||||
| %if cpuflag(avx) | |||||
; With 256-bit registers the unpacks work within each 128-bit half, so the
; four halves are stored in the order m1 low, m0 low, m1 high, m0 high to
; keep the output sequential.
| vextractf128 [dstq + 2*lenq ], m1, 0 | |||||
| vextractf128 [dstq + 2*lenq + 16], m0, 0 | |||||
| vextractf128 [dstq + 2*lenq + 32], m1, 1 | |||||
| vextractf128 [dstq + 2*lenq + 48], m0, 1 | |||||
| %else | |||||
| mova [dstq + 2*lenq ], m1 | |||||
| mova [dstq + 2*lenq + mmsize], m0 | |||||
| %endif | |||||
; advance by one vector; keep looping while the offset is still negative
| add lenq, mmsize | |||||
| jl .loop | |||||
| %if mmsize == 32 | |||||
; 256-bit (ymm) build only: clear the upper register halves before returning
| vzeroupper | |||||
| RET | |||||
| %endif | |||||
| .end: | |||||
| REP_RET | |||||
| %endmacro | |||||
; Instantiate the macro twice: a 128-bit SSE version and a 256-bit AVX version.
| INIT_XMM sse | |||||
| BUTTERFLIES_FLOAT_INTERLEAVE | |||||
| INIT_YMM avx | |||||
| BUTTERFLIES_FLOAT_INTERLEAVE | |||||