| @@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input, | |||||
| } | } | ||||
| } | } | ||||
/**
 * Portable C fallback: clip int32_t values from src into [min, max] and
 * store them in dst.
 *
 * Eight elements are handled per pass and the body always runs at least once
 * (do/while), so len must be a non-zero multiple of 8; any other value makes
 * the unsigned counter wrap around instead of reaching zero.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        /* Same read/write order as a pointer-walking version: element k is
         * read and stored before element k+1 is touched. */
        dst[0] = av_clip(src[0], min, max);
        dst[1] = av_clip(src[1], min, max);
        dst[2] = av_clip(src[2], min, max);
        dst[3] = av_clip(src[3], min, max);
        dst[4] = av_clip(src[4], min, max);
        dst[5] = av_clip(src[5], min, max);
        dst[6] = av_clip(src[6], min, max);
        dst[7] = av_clip(src[7], min, max);
        dst += 8;
        src += 8;
        len -= 8;
    } while (len != 0);
}
| #define W0 2048 | #define W0 2048 | ||||
| #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | ||||
| #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ | #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ | ||||
| @@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||||
| c->scalarproduct_int16 = scalarproduct_int16_c; | c->scalarproduct_int16 = scalarproduct_int16_c; | ||||
| c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | ||||
| c->apply_window_int16 = apply_window_int16_c; | c->apply_window_int16 = apply_window_int16_c; | ||||
| c->vector_clip_int32 = vector_clip_int32_c; | |||||
| c->scalarproduct_float = scalarproduct_float_c; | c->scalarproduct_float = scalarproduct_float_c; | ||||
| c->butterflies_float = butterflies_float_c; | c->butterflies_float = butterflies_float_c; | ||||
| c->vector_fmul_scalar = vector_fmul_scalar_c; | c->vector_fmul_scalar = vector_fmul_scalar_c; | ||||
| @@ -555,6 +555,22 @@ typedef struct DSPContext { | |||||
| void (*apply_window_int16)(int16_t *output, const int16_t *input, | void (*apply_window_int16)(int16_t *output, const int16_t *input, | ||||
| const int16_t *window, unsigned int len); | const int16_t *window, unsigned int len); | ||||
| /** | |||||
| * Clip each element in an array of int32_t to a given minimum and maximum value. | |||||
| * @param dst destination array | |||||
| * constraints: 16-byte aligned | |||||
| * @param src source array | |||||
| * constraints: 16-byte aligned | |||||
| * @param min minimum value | |||||
| * constraints: must be in the range [-(1<<24), 1<<24] | |||||
| * @param max maximum value | |||||
| * constraints: must be in the range [-(1<<24), 1<<24] | |||||
| * @param len number of elements in the array | |||||
| * constraints: multiple of 32 greater than zero | |||||
| */ | |||||
| void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min, | |||||
| int32_t max, unsigned int len); | |||||
| /* rv30 functions */ | /* rv30 functions */ | ||||
| qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; | qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; | ||||
| qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; | qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; | ||||
| @@ -2429,6 +2429,15 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i | |||||
| float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | ||||
| void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min, | |||||
| int32_t max, unsigned int len); | |||||
| void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min, | |||||
| int32_t max, unsigned int len); | |||||
| void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min, | |||||
| int32_t max, unsigned int len); | |||||
| void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min, | |||||
| int32_t max, unsigned int len); | |||||
| void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | ||||
| { | { | ||||
| int mm_flags = av_get_cpu_flags(); | int mm_flags = av_get_cpu_flags(); | ||||
| @@ -2570,6 +2579,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx; | c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx; | ||||
| c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; | c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; | ||||
| c->vector_clip_int32 = ff_vector_clip_int32_mmx; | |||||
| #endif | #endif | ||||
| if (mm_flags & AV_CPU_FLAG_MMX2) { | if (mm_flags & AV_CPU_FLAG_MMX2) { | ||||
| @@ -2855,6 +2866,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | ||||
| c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | ||||
| if (mm_flags & AV_CPU_FLAG_ATOM) { | |||||
| c->vector_clip_int32 = ff_vector_clip_int32_sse2_int; | |||||
| } else { | |||||
| c->vector_clip_int32 = ff_vector_clip_int32_sse2; | |||||
| } | |||||
| if (avctx->flags & CODEC_FLAG_BITEXACT) { | if (avctx->flags & CODEC_FLAG_BITEXACT) { | ||||
| c->apply_window_int16 = ff_apply_window_int16_sse2_ba; | c->apply_window_int16 = ff_apply_window_int16_sse2_ba; | ||||
| } else { | } else { | ||||
| @@ -2880,6 +2896,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { | |||||
| #if HAVE_YASM | |||||
| c->vector_clip_int32 = ff_vector_clip_int32_sse41; | |||||
| #endif | |||||
| } | |||||
| #if HAVE_AVX && HAVE_YASM | #if HAVE_AVX && HAVE_YASM | ||||
| if (mm_flags & AV_CPU_FLAG_AVX) { | if (mm_flags & AV_CPU_FLAG_AVX) { | ||||
| if (bit_depth == 10) { | if (bit_depth == 10) { | ||||
| @@ -1048,3 +1048,118 @@ emu_edge sse | |||||
| %ifdef ARCH_X86_32 | %ifdef ARCH_X86_32 | ||||
| emu_edge mmx | emu_edge mmx | ||||
| %endif | %endif | ||||
| ;----------------------------------------------------------------------------- | |||||
| ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, | |||||
| ; int32_t max, unsigned int len) | |||||
| ;----------------------------------------------------------------------------- | |||||
; Signed 32-bit per-lane minimum built from compare and logic ops:
; %1 = min(%1, %2). %3 is overwritten as scratch.
| %macro PMINSD_MMX 3 ; dst, src, tmp | |||||
; %3 = all-ones in lanes where src > dst (signed compare), zero elsewhere
| mova %3, %2 | |||||
| pcmpgtd %3, %1 | |||||
; dst = ((dst ^ src) & mask) ^ src: keeps dst where the mask is set, src otherwise
| pxor %1, %2 | |||||
| pand %1, %3 | |||||
| pxor %1, %2 | |||||
| %endmacro | |||||
; Signed 32-bit per-lane maximum built from compare and logic ops:
; %1 = max(%1, %2). %3 is overwritten as scratch.
| %macro PMAXSD_MMX 3 ; dst, src, tmp | |||||
; %3 = all-ones in lanes where dst > src (signed compare), zero elsewhere
| mova %3, %1 | |||||
| pcmpgtd %3, %2 | |||||
; blend: dst where the mask is set, src in the remaining lanes
| pand %1, %3 | |||||
| pandn %3, %2 | |||||
| por %1, %3 | |||||
| %endmacro | |||||
; Clip the 32-bit lanes of %1 to [%2, %3] using the emulated min/max macros:
; the upper bound is applied first, then the lower bound.
; %4 is forwarded as the scratch register; it is declared optional here, but
; both helper macros take it as their third argument.
| %macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp | |||||
| PMINSD_MMX %1, %3, %4 | |||||
| PMAXSD_MMX %1, %2, %4 | |||||
| %endmacro | |||||
; Clip the int32 lanes of %1 by converting them to single-precision float,
; applying minps against %3 and maxps against %2, then converting back to int32.
; %2 and %3 must already hold the bounds as packed floats; %4 is not used.
| %macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused | |||||
| cvtdq2ps %1, %1 | |||||
| minps %1, %3 | |||||
| maxps %1, %2 | |||||
| cvtps2dq %1, %1 | |||||
| %endmacro | |||||
; Clip the signed 32-bit lanes of %1 to [%2, %3] with pminsd/pmaxsd directly:
; upper bound first, then lower bound. %4 is not used.
| %macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused | |||||
| pminsd %1, %3 | |||||
| pmaxsd %1, %2 | |||||
| %endmacro | |||||
; Broadcast the low dword of a 64-bit MMX register to both of its dwords
; by interleaving the register's low dword with itself.
| %macro SPLATD_MMX 1 | |||||
| punpckldq %1, %1 | |||||
| %endmacro | |||||
; Broadcast dword 0 of %1 to all four dword lanes (shuffle selector 0).
| %macro SPLATD_SSE2 1 | |||||
| pshufd %1, %1, 0 | |||||
| %endmacro | |||||
;-----------------------------------------------------------------------------
; VECTOR_CLIP_INT32 suffix, xmm_regs, reps, extra
;   %1 suffix   : appended to the function name (vector_clip_int32_%1); the
;                 literal suffix "sse2" makes the bounds be loaded as floats
;   %2 xmm_regs : XMM register count handed to cglobal
;   %3 reps     : how many times the load/clip/store group is unrolled
;   %4 extra    : 1 to process four additional registers (m7-m10) per group
;
; SPLATD and CLIPD must be %define'd to the wanted implementation before the
; macro is expanded. m4/m5 hold the broadcast min/max, m6 is passed to CLIPD
; as scratch.
;
; Each group covers 4*(1+%4) consecutive registers of mmsize bytes. %%i is the
; register offset of the current group, so successive groups address
; consecutive, non-overlapping memory.
;-----------------------------------------------------------------------------
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2
    ; float variant: convert the integer bounds to single precision
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 0
%rep %3
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %4
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD     m0,  m4, m5, m6
    CLIPD     m1,  m4, m5, m6
    CLIPD     m2,  m4, m5, m6
    CLIPD     m3,  m4, m5, m6
%if %4
    CLIPD     m7,  m4, m5, m6
    CLIPD     m8,  m4, m5, m6
    CLIPD     m9,  m4, m5, m6
    CLIPD     m10, m4, m5, m6
%endif
    mova      [dstq+mmsize*(0+%%i)], m0
    mova      [dstq+mmsize*(1+%%i)], m1
    mova      [dstq+mmsize*(2+%%i)], m2
    mova      [dstq+mmsize*(3+%%i)], m3
%if %4
    mova      [dstq+mmsize*(4+%%i)], m7
    mova      [dstq+mmsize*(5+%%i)], m8
    mova      [dstq+mmsize*(6+%%i)], m9
    mova      [dstq+mmsize*(7+%%i)], m10
%endif
    ; advance to the first register of the next unrolled group
%assign %%i %%i+4*(1+%4)
%endrep
    ; bytes consumed per pass: reps * registers-per-group * mmsize;
    ; elements consumed per pass: that byte count / 4 (int32_t)
    add       srcq, mmsize*4*%3*(1+%4)
    add       dstq, mmsize*4*%3*(1+%4)
    sub       lend, mmsize*%3*(1+%4)
    jg .loop
    REP_RET
%endmacro
; Instantiate one function per instruction set. SPLATD and CLIPD are rebound
; before each expansion so the shared macro body picks the matching helpers.
INIT_MMX
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 mmx, 0, 1, 0
INIT_XMM
%define SPLATD SPLATD_SSE2
; Integer-only SSE2 version: CLIPD is still CLIPD_MMX here, which uses m6 as
; scratch, so seven XMM registers (m0-m6) are declared to let cglobal preserve
; xmm6 on ABIs where it is callee-saved.
VECTOR_CLIP_INT32 sse2_int, 7, 1, 0
; Float-based SSE2 version: CLIPD_SSE2 needs no scratch register (m0-m5 only).
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 sse2, 6, 2, 0
%define CLIPD CLIPD_SSE41
; With m8 and above available, process eight registers per group (m0-m5, m7-m10).
%ifdef m8
VECTOR_CLIP_INT32 sse41, 11, 1, 1
%else
VECTOR_CLIP_INT32 sse41, 6, 1, 0
%endif