| @@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input, | |||
| } | |||
| } | |||
| /** | |||
|  * Portable C version of vector_clip_int32: clamp every element of src to | |||
|  * the range [min, max] and write the result to dst. | |||
|  * | |||
|  * Elements are handled in groups of eight and the first group is processed | |||
|  * before len is examined, so len must be a non-zero multiple of 8. Any other | |||
|  * value makes the unsigned counter wrap and the loop run past both buffers. | |||
|  */ | |||
| static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min, | |||
|                                 int32_t max, unsigned int len) | |||
| { | |||
|     do { | |||
|         int k; | |||
|  | |||
|         /* one group of eight, in ascending order */ | |||
|         for (k = 0; k < 8; k++) { | |||
|             dst[k] = av_clip(src[k], min, max); | |||
|         } | |||
|         dst += 8; | |||
|         src += 8; | |||
|         len -= 8; | |||
|     } while (len != 0); | |||
| } | |||
| #define W0 2048 | |||
| #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | |||
| #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ | |||
| @@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||
| c->scalarproduct_int16 = scalarproduct_int16_c; | |||
| c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | |||
| c->apply_window_int16 = apply_window_int16_c; | |||
| c->vector_clip_int32 = vector_clip_int32_c; | |||
| c->scalarproduct_float = scalarproduct_float_c; | |||
| c->butterflies_float = butterflies_float_c; | |||
| c->vector_fmul_scalar = vector_fmul_scalar_c; | |||
| @@ -555,6 +555,22 @@ typedef struct DSPContext { | |||
| void (*apply_window_int16)(int16_t *output, const int16_t *input, | |||
| const int16_t *window, unsigned int len); | |||
| /** | |||
| * Clip each element in an array of int32_t to a given minimum and maximum value. | |||
| * @param dst destination array | |||
| * constraints: 16-byte aligned | |||
| * @param src source array | |||
| * constraints: 16-byte aligned | |||
| * @param min minimum value | |||
| * constraints: must be in the range [-(1<<24), 1<<24] | |||
| * @param max maximum value | |||
| * constraints: must be in the range [-(1<<24), 1<<24] | |||
| * @param len number of elements in the array | |||
| * constraints: multiple of 32 greater than zero | |||
| */ | |||
| void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min, | |||
| int32_t max, unsigned int len); | |||
| /* rv30 functions */ | |||
| qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; | |||
| qpel_mc_func avg_rv30_tpel_pixels_tab[4][16]; | |||
| @@ -2429,6 +2429,15 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i | |||
| float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); | |||
| /* Prototypes for the x86 SIMD variants of vector_clip_int32, one per | |||
|  * instruction-set flavour. These are declarations only; the definitions are | |||
|  * not in this C file (presumably generated by the yasm VECTOR_CLIP_INT32 | |||
|  * macro - confirm against the build). */ | |||
| void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min, | |||
| int32_t max, unsigned int len); | |||
| void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min, | |||
| int32_t max, unsigned int len); | |||
| void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min, | |||
| int32_t max, unsigned int len); | |||
| void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min, | |||
| int32_t max, unsigned int len); | |||
| void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| { | |||
| int mm_flags = av_get_cpu_flags(); | |||
| @@ -2570,6 +2579,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx; | |||
| c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; | |||
| c->vector_clip_int32 = ff_vector_clip_int32_mmx; | |||
| #endif | |||
| if (mm_flags & AV_CPU_FLAG_MMX2) { | |||
| @@ -2855,6 +2866,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| #if HAVE_YASM | |||
| c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |||
| c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | |||
| if (mm_flags & AV_CPU_FLAG_ATOM) { | |||
| c->vector_clip_int32 = ff_vector_clip_int32_sse2_int; | |||
| } else { | |||
| c->vector_clip_int32 = ff_vector_clip_int32_sse2; | |||
| } | |||
| if (avctx->flags & CODEC_FLAG_BITEXACT) { | |||
| c->apply_window_int16 = ff_apply_window_int16_sse2_ba; | |||
| } else { | |||
| @@ -2880,6 +2896,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| } | |||
| #endif | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { | |||
| #if HAVE_YASM | |||
| c->vector_clip_int32 = ff_vector_clip_int32_sse41; | |||
| #endif | |||
| } | |||
| #if HAVE_AVX && HAVE_YASM | |||
| if (mm_flags & AV_CPU_FLAG_AVX) { | |||
| if (bit_depth == 10) { | |||
| @@ -1048,3 +1048,118 @@ emu_edge sse | |||
| %ifdef ARCH_X86_32 | |||
| emu_edge mmx | |||
| %endif | |||
| ;----------------------------------------------------------------------------- | |||
| ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, | |||
| ; int32_t max, unsigned int len) | |||
| ;----------------------------------------------------------------------------- | |||
| ; Packed signed 32-bit minimum built from compare/xor/and: %1 = min(%1, %2). | |||
| ; %3 is overwritten with the compare mask; %2 is only read. | |||
| ; The mask is all-ones in lanes where src > dst. (dst ^ src) is kept only in | |||
| ; those lanes, so the final xor with src gives dst back there and src elsewhere. | |||
| %macro PMINSD_MMX 3 ; dst, src, tmp | |||
| mova %3, %2 | |||
| pcmpgtd %3, %1 | |||
| pxor %1, %2 | |||
| pand %1, %3 | |||
| pxor %1, %2 | |||
| %endmacro | |||
| ; Packed signed 32-bit maximum built from compare/and/andn/or: %1 = max(%1, %2). | |||
| ; %3 is overwritten; %2 is only read. | |||
| ; The mask is all-ones in lanes where dst > src. dst is kept in those lanes, | |||
| ; src is taken in the remaining lanes (pandn), and the two halves are merged. | |||
| %macro PMAXSD_MMX 3 ; dst, src, tmp | |||
| mova %3, %1 | |||
| pcmpgtd %3, %2 | |||
| pand %1, %3 | |||
| pandn %3, %2 | |||
| por %1, %3 | |||
| %endmacro | |||
| ; Clamp packed signed dwords in %1 to [%2, %3] using the emulated min/max | |||
| ; macros: first limit against max (%3), then against min (%2). | |||
| ; %4 is used as scratch by both steps, so callers must supply it even though | |||
| ; the macro is declared as taking 3-4 parameters. | |||
| %macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp | |||
| PMINSD_MMX %1, %3, %4 | |||
| PMAXSD_MMX %1, %2, %4 | |||
| %endmacro | |||
| ; Clamp packed signed dwords in %1 by going through single-precision float: | |||
| ; convert to float, limit against max (%3) and min (%2), convert back to int. | |||
| ; %2 and %3 must already hold the bounds as packed floats; %4 is ignored. | |||
| ; NOTE(review): the float round trip is presumably why min/max are documented | |||
| ; as limited to +/-(1<<24) - confirm. | |||
| %macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused | |||
| cvtdq2ps %1, %1 | |||
| minps %1, %3 | |||
| maxps %1, %2 | |||
| cvtps2dq %1, %1 | |||
| %endmacro | |||
| ; Clamp packed signed dwords in %1 to [%2, %3] with the native pminsd/pmaxsd | |||
| ; instructions: limit against max (%3) first, then against min (%2). | |||
| ; %4 is accepted only so the call shape matches the other CLIPD variants. | |||
| %macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused | |||
| pminsd %1, %3 | |||
| pmaxsd %1, %2 | |||
| %endmacro | |||
| ; Replicate the low dword of %1: punpckldq interleaves the low dword of the | |||
| ; register with itself, which fills both lanes of a 64-bit MMX register. | |||
| %macro SPLATD_MMX 1 | |||
| punpckldq %1, %1 | |||
| %endmacro | |||
| ; Broadcast dword 0 of %1 to all four dword lanes (pshufd with selector 0). | |||
| %macro SPLATD_SSE2 1 | |||
| pshufd %1, %1, 0 | |||
| %endmacro | |||
| ; Emit the function vector_clip_int32_<%1>(dst, src, min, max, len). | |||
| ;   %1 = function name suffix | |||
| ;   %2 = number of vector registers declared to cglobal | |||
| ;   %3 = how many times the load/clip/store group is repeated per loop pass | |||
| ;   %4 = 1 to use eight data registers per group (m0-m3 and m7-m10), | |||
| ;        0 to use four (m0-m3) | |||
| ; min and max are broadcast into m4 and m5 with SPLATD; the variant named | |||
| ; "sse2" converts them to float first (cvtsi2ss). m6 is passed to CLIPD as | |||
| ; scratch. SPLATD and CLIPD must be %define'd before the macro is invoked. | |||
| ; | |||
| ; One loop pass handles %3*(1+%4)*4 registers, i.e. mmsize*%3*(1+%4) int32 | |||
| ; elements. The loop is bottom-tested, so len must be a positive multiple of | |||
| ; that count. | |||
| %macro VECTOR_CLIP_INT32 4 | |||
| cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len | |||
| %ifidn %1, sse2 | |||
|     cvtsi2ss  m4, minm | |||
|     cvtsi2ss  m5, maxm | |||
| %else | |||
|     movd      m4, minm | |||
|     movd      m5, maxm | |||
| %endif | |||
|     SPLATD    m4 | |||
|     SPLATD    m5 | |||
| .loop: | |||
| ; %%i is the register-sized offset of the current group. It must be added to | |||
| ; the per-register index (not multiplied with it) so that each repetition | |||
| ; starts right after the registers handled by the previous one. | |||
| %assign %%i 0 | |||
| %rep %3 | |||
|     mova      m0,  [srcq+mmsize*(0+%%i)] | |||
|     mova      m1,  [srcq+mmsize*(1+%%i)] | |||
|     mova      m2,  [srcq+mmsize*(2+%%i)] | |||
|     mova      m3,  [srcq+mmsize*(3+%%i)] | |||
| %if %4 | |||
|     mova      m7,  [srcq+mmsize*(4+%%i)] | |||
|     mova      m8,  [srcq+mmsize*(5+%%i)] | |||
|     mova      m9,  [srcq+mmsize*(6+%%i)] | |||
|     mova      m10, [srcq+mmsize*(7+%%i)] | |||
| %endif | |||
|     CLIPD     m0,  m4, m5, m6 | |||
|     CLIPD     m1,  m4, m5, m6 | |||
|     CLIPD     m2,  m4, m5, m6 | |||
|     CLIPD     m3,  m4, m5, m6 | |||
| %if %4 | |||
|     CLIPD     m7,  m4, m5, m6 | |||
|     CLIPD     m8,  m4, m5, m6 | |||
|     CLIPD     m9,  m4, m5, m6 | |||
|     CLIPD     m10, m4, m5, m6 | |||
| %endif | |||
|     mova      [dstq+mmsize*(0+%%i)], m0 | |||
|     mova      [dstq+mmsize*(1+%%i)], m1 | |||
|     mova      [dstq+mmsize*(2+%%i)], m2 | |||
|     mova      [dstq+mmsize*(3+%%i)], m3 | |||
| %if %4 | |||
|     mova      [dstq+mmsize*(4+%%i)], m7 | |||
|     mova      [dstq+mmsize*(5+%%i)], m8 | |||
|     mova      [dstq+mmsize*(6+%%i)], m9 | |||
|     mova      [dstq+mmsize*(7+%%i)], m10 | |||
| %endif | |||
| ; advance by the number of registers this group consumed (4 or 8) | |||
| %assign %%i %%i+4*(1+%4) | |||
| %endrep | |||
| ; bytes per pass = registers per pass * mmsize; elements per pass = bytes / 4 | |||
|     add       srcq, mmsize*4*%3*(1+%4) | |||
|     add       dstq, mmsize*4*%3*(1+%4) | |||
|     sub       lend, mmsize*%3*(1+%4) | |||
|     jg .loop | |||
|     REP_RET | |||
| %endmacro | |||
| ; Instantiate the variants. Arguments to VECTOR_CLIP_INT32 are: | |||
| ; name suffix, register count for cglobal, unroll repetitions, use-8-registers flag. | |||
| ; SPLATD/CLIPD are plain %defines, so each instantiation picks up whatever | |||
| ; definition is active at that point. | |||
| INIT_MMX | |||
| %define SPLATD SPLATD_MMX | |||
| %define CLIPD CLIPD_MMX | |||
| VECTOR_CLIP_INT32 mmx, 0, 1, 0 | |||
| INIT_XMM | |||
| %define SPLATD SPLATD_SSE2 | |||
| ; sse2_int: CLIPD is still CLIPD_MMX here, i.e. the integer compare/select | |||
| ; sequence, assembled for the register set selected by INIT_XMM. | |||
| VECTOR_CLIP_INT32 sse2_int, 6, 1, 0 | |||
| ; sse2: float-based clip, load/clip/store group repeated twice per loop pass. | |||
| %define CLIPD CLIPD_SSE2 | |||
| VECTOR_CLIP_INT32 sse2, 6, 2, 0 | |||
| %define CLIPD CLIPD_SSE41 | |||
| ; When m8 is defined (presumably builds with 16 vector registers, i.e. x86-64 - | |||
| ; confirm) use the eight-register group; otherwise fall back to four. | |||
| %ifdef m8 | |||
| VECTOR_CLIP_INT32 sse41, 11, 1, 1 | |||
| %else | |||
| VECTOR_CLIP_INT32 sse41, 6, 1, 0 | |||
| %endif | |||