ac3enc: add int32_t array clipping function to DSPUtil, including x86 versions.

14 years ago · 6054cd25b4
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
    }
 }
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
    /* Portable reference implementation: clamp every element of src into
     * [min, max] and store it in dst, working in batches of eight.
     * The batch is always executed at least once and the remaining count is
     * only tested afterwards, so len is expected to be a non-zero multiple
     * of the batch size. */
    unsigned int remaining = len;

    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst       += 8;
        src       += 8;
        remaining -= 8;
    } while (remaining > 0);
 }
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -555,6 +555,22 @@ typedef struct DSPContext {
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
                               const int16_t *window, unsigned int len);
    /**
     * Clip each element in an array of int32_t to a given minimum and maximum value.
     * @param dst  destination array
     *             constraints: 16-byte aligned
     * @param src  source array
     *             constraints: 16-byte aligned
     * @param min  minimum value
     *             constraints: must be in the range [-(1<<24), 1<<24]
     * @param max  maximum value
     *             constraints: must be in the range [-(1<<24), 1<<24]
     * @param len  number of elements in the array
     *             constraints: multiple of 32 greater than zero
     *
     * NOTE(review): the 1<<24 limit on min/max presumably exists because an
     * implementation may perform the clipping in single-precision float,
     * which represents integers exactly only up to that magnitude -- confirm.
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);
    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2429,6 +2429,15 @@ int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 /* x86 SIMD variants of DSPContext.vector_clip_int32. All share the signature
  * of the C reference version; dsputil_init_mmx() installs one of them
  * according to the detected CPU flags. */
 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse41   (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = av_get_cpu_flags();
@@ -2570,6 +2579,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
        c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif
        if (mm_flags & AV_CPU_FLAG_MMX2) {
@@ -2855,6 +2866,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
            } else {
                c->vector_clip_int32 = ff_vector_clip_int32_sse2;
            }
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
            } else {
@@ -2880,6 +2896,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            }
 #endif
        }
        if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
 #if HAVE_YASM
            c->vector_clip_int32 = ff_vector_clip_int32_sse41;
 #endif
        }
 #if HAVE_AVX && HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_AVX) {
            if (bit_depth == 10) {
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1048,3 +1048,118 @@ emu_edge sse
 %ifdef ARCH_X86_32
 emu_edge mmx
 %endif
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------
 ; Packed signed 32-bit minimum without pminsd: %1 = min(%1, %2) per dword.
 ; %3 is a scratch register and is clobbered; %2 is left unchanged.
 %macro PMINSD_MMX 3 ; dst, src, tmp
    mova      %3, %2
    pcmpgtd   %3, %1        ; tmp = all-ones in lanes where src > dst
    pxor      %1, %2        ; dst = dst ^ src
    pand      %1, %3        ; keep dst ^ src only where dst is the smaller value
    pxor      %1, %2        ; -> dst where src > dst, otherwise src
 %endmacro
 ; Packed signed 32-bit maximum without pmaxsd: %1 = max(%1, %2) per dword.
 ; %3 is a scratch register and is clobbered; %2 is left unchanged.
 %macro PMAXSD_MMX 3 ; dst, src, tmp
    mova      %3, %1
    pcmpgtd   %3, %2        ; tmp = all-ones in lanes where dst > src
    pand      %1, %3        ; keep dst where it is the larger value
    pandn     %3, %2        ; tmp = src in the remaining lanes
    por       %1, %3        ; merge the two selections
 %endmacro
 ; Clip packed signed dwords in %1 to [%2, %3] using the integer min/max
 ; emulation macros. %4 is the scratch register they clobber.
 %macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
    PMINSD_MMX %1, %3, %4   ; upper bound first
    PMAXSD_MMX %1, %2, %4   ; then lower bound
 %endmacro
 ; Clip packed signed dwords in %1 to [%2, %3] by round-tripping through
 ; single-precision float. %2 and %3 must already hold the bounds as packed
 ; floats; the optional 4th argument is accepted only so that all CLIPD
 ; variants can be invoked the same way.
 %macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
    cvtdq2ps  %1, %1        ; int32 -> float
    minps     %1, %3        ; apply upper bound
    maxps     %1, %2        ; apply lower bound
    cvtps2dq  %1, %1        ; float -> int32
 %endmacro
 ; Clip packed signed dwords in %1 to [%2, %3] with the native pminsd/pmaxsd
 ; instructions. No scratch register is needed; the optional 4th argument is
 ; ignored.
 %macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused
    pminsd  %1, %3          ; apply upper bound
    pmaxsd  %1, %2          ; apply lower bound
 %endmacro
 ; Duplicate the low dword of %1 into the next dword by interleaving the
 ; register with itself (fills a whole 64-bit MMX register).
 %macro SPLATD_MMX 1
    punpckldq  %1, %1
 %endmacro
 ; Broadcast the low dword of %1 to all four dwords of the XMM register.
 %macro SPLATD_SSE2 1
    pshufd  %1, %1, 0       ; shuffle selector 0 picks dword 0 for every lane
 %endmacro
 ; Emit one vector_clip_int32_<suffix> function.
 ;   %1 = function name suffix (the "sse2" suffix selects float bounds)
 ;   %2 = number of XMM registers, forwarded to cglobal
 ;   %3 = number of unrolled load/clip/store groups per loop iteration
 ;   %4 = 0: each group handles 4 registers (m0-m3)
 ;        1: each group handles 8 registers (m0-m3, m7-m10)
 ; SPLATD and CLIPD must be %define'd to the desired variants before use.
 ; m4/m5 hold the broadcast min/max; m6 is passed to CLIPD as scratch.
 %macro VECTOR_CLIP_INT32 4
 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
 %ifidn %1, sse2
    ; the float-based CLIPD needs the bounds as floats
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
 %else
    movd      m4, minm
    movd      m5, maxm
 %endif
    SPLATD    m4
    SPLATD    m5
 .loop:
 ; %%i is the index (in registers) of the first register-sized chunk handled
 ; by the current group. It must advance additively so that consecutive
 ; groups cover consecutive, non-overlapping chunks.
 %assign %%i 0
 %rep %3
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %4
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
 %if %4
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
 %endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
 %if %4
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
 %assign %%i %%i+4*(%4+1)
 %endrep
    ; Each iteration consumes %3 groups of 4*(%4+1) registers:
    ; mmsize*4*%3*(%4+1) bytes, i.e. mmsize*%3*(%4+1) int32 elements.
    add     srcq, mmsize*4*%3*(%4+1)
    add     dstq, mmsize*4*%3*(%4+1)
    sub     lend, mmsize*%3*(%4+1)
    jg .loop
    REP_RET
 %endmacro
 ; MMX build: 64-bit registers, integer compare/select clipping.
 INIT_MMX
 %define SPLATD SPLATD_MMX
 %define CLIPD CLIPD_MMX
 VECTOR_CLIP_INT32 mmx, 0, 1, 0
 INIT_XMM
 %define SPLATD SPLATD_SSE2
 ; SSE2 integer build: CLIPD is still CLIPD_MMX here, whose scratch register
 ; is m6 (xmm6). Declare 7 XMM registers so that xmm6, which is callee-saved
 ; on Win64, is spilled and restored by the prologue/epilogue.
 VECTOR_CLIP_INT32 sse2_int, 7, 1, 0
 ; SSE2 float build: clipping via cvtdq2ps/minps/maxps, m6 is not touched.
 %define CLIPD CLIPD_SSE2
 VECTOR_CLIP_INT32 sse2, 6, 2, 0
 ; SSE4.1 build: native pminsd/pmaxsd; use 8 data registers per group when
 ; registers m8 and above are available.
 %define CLIPD CLIPD_SSE41
 %ifdef m8
 VECTOR_CLIP_INT32 sse41, 11, 1, 1
 %else
 VECTOR_CLIP_INT32 sse41, 6, 1, 0
 %endif