Remove unneeded add bias from 3 functions.

DSPContext.vector_fmul_window()
DCADSPContext.lfe_fir()
SynthFilterContext.synth_filter_float()

Signed-off-by: Mans Rullgard <mans@mansr.com>
15 years ago · 80ba1ddb58
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -1721,19 +1721,19 @@ static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
     */
    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
        ac->dsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 0, 512);
        ac->dsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
    } else {
        memcpy(                        out,               saved,            448 * sizeof(float));

        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
            ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 0, 64);
            ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      0, 64);
            ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      0, 64);
            ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      0, 64);
            ac->dsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      0, 64);
            ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
            ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
            ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
            ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
            ac->dsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
            memcpy(                    out + 448 + 4*128, temp, 64 * sizeof(float));
        } else {
            ac->dsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 0, 64);
            ac->dsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
            memcpy(                    out + 576,         buf + 64,         448 * sizeof(float));
        }
    }
@@ -1741,9 +1741,9 @@ static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
    // buffer update
    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
        memcpy(                    saved,       temp + 64,         64 * sizeof(float));
        ac->dsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 0, 64);
        ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 0, 64);
        ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 0, 64);
        ac->dsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
        ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
        ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
        memcpy(                    saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
        memcpy(                    saved,       buf + 512,        448 * sizeof(float));
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -628,13 +628,13 @@ static inline void do_imdct(AC3DecodeContext *s, int channels)
            for(i=0; i<128; i++)
                x[i] = s->transform_coeffs[ch][2*i];
            ff_imdct_half(&s->imdct_256, s->tmp_output, x);
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128);
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128);
            for(i=0; i<128; i++)
                x[i] = s->transform_coeffs[ch][2*i+1];
            ff_imdct_half(&s->imdct_256, s->delay[ch-1], x);
        } else {
            ff_imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128);
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128);
            memcpy(s->delay[ch-1], s->tmp_output+128, 128*sizeof(float));
        }
    }
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -23,7 +23,7 @@
 #include "libavcodec/dcadsp.h"

 void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                         int decifactor, float scale, float bias);
                         int decifactor, float scale);

 void av_cold ff_dcadsp_init_arm(DCADSPContext *s)
 {
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -29,7 +29,7 @@ function ff_dca_lfe_fir_neon, export=1
        cmp             r3,  #32
        moveq           r6,  #256/32
        movne           r6,  #256/64
 NOVFP   vldr            d0,  [sp, #16]          @ scale, bias
 NOVFP   vldr            s0,  [sp, #16]          @ scale
        mov             lr,  #-16
 1:
        vmov.f32        q2,  #0.0               @ v0
@@ -51,8 +51,7 @@ NOVFP   vldr            d0,  [sp, #16]          @ scale, bias
        vadd.f32        d4,  d4,  d5
        vadd.f32        d6,  d6,  d7
        vpadd.f32       d4,  d4,  d6
        vdup.32         d5,  d0[1]
        vmla.f32        d5,  d4,  d0[0]
        vmul.f32        d5,  d4,  d0[0]
        vst1.32         {d5[0]},  [r0,:32]!
        vst1.32         {d5[1]},  [r4,:32]!
        bne             1b
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -140,8 +140,7 @@ void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);

 void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
 void ff_vector_fmul_window_neon(float *dst, const float *src0,
                                const float *src1, const float *win,
                                float add_bias, int len);
                                const float *src1, const float *win, int len);
 void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
                                int len);
 void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -777,11 +777,8 @@ function ff_vector_fmul_neon, export=1
 endfunc

 function ff_vector_fmul_window_neon, export=1
 VFP     vdup.32         q8,  d0[0]
 NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
 VFP     ldr             lr,  [sp, #12]
 NOVFP   ldr             lr,  [sp, #16]
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
@@ -793,14 +790,12 @@ NOVFP   ldr             lr,  [sp, #16]
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
 1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -34,7 +34,7 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
                                float synth_buf2[32], const float window[512],
                                float out[32], const float in[32],
                                float scale, float bias);
                                float scale);

 av_cold void ff_fft_init_arm(FFTContext *s)
 {
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -42,7 +42,7 @@ VFP     vpop            {d0}

        ldr             r5,  [sp, #9*4]         @ window
        ldr             r2,  [sp, #10*4]        @ out
 NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
 NOVFP   vldr            s0,  [sp, #12*4]        @ scale
        add             r8,  r9,  #12*4

        mov             lr,  #64*4
@@ -90,10 +90,8 @@ NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
        sub             r11, r11, #512*4
        b               2b
 3:
        vdup.32         q8,  d0[1]
        vdup.32         q9,  d0[1]
        vmla.f32        q8,  q10, d0[0]
        vmla.f32        q9,  q1,  d0[0]
        vmul.f32        q8,  q10, d0[0]
        vmul.f32        q9,  q1,  d0[0]
        vst1.32         {q3},     [r3,:128]
        sub             r3,  r3,  #16*4
        vst1.32         {q2},     [r3,:128]
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -141,7 +141,7 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)

            /* overlap and window */
            q->dsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
                                      &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 0, 16);
                                      &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);

            prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
            start_pos += block_size;
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -896,7 +896,7 @@ static void qmf_32_subbands(DCAContext * s, int chans,
        s->synth.synth_filter_float(&s->imdct,
                              s->subband_fir_hist[chans], &s->hist_index[chans],
                              s->subband_fir_noidea[chans], prCoeff,
                              samples_out, s->raXin, scale, 0);
                              samples_out, s->raXin, scale);
        samples_out+= 32;

    }
@@ -929,7 +929,7 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
    /* Interpolation */
    for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
        s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor,
                          scale, 0);
                          scale);
        samples_in++;
        samples_out += 2 * decifactor;
    }
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -23,7 +23,7 @@
 #include "dcadsp.h"

 static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
                          int decifactor, float scale, float bias)
                          int decifactor, float scale)
 {
    float *out2 = out + decifactor;
    const float *cf0 = coefs;
@@ -39,8 +39,8 @@ static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
            v0 += s * *cf0++;
            v1 += s * *--cf1;
        }
        *out++  = (v0 * scale) + bias;
        *out2++ = (v1 * scale) + bias;
        *out++  = v0 * scale;
        *out2++ = v1 * scale;
    }
 }

--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -21,7 +21,7 @@

 typedef struct DCADSPContext {
    void (*lfe_fir)(float *out, const float *in, const float *coefs,
                    int decifactor, float scale, float bias);
                    int decifactor, float scale);
 } DCADSPContext;

 void ff_dcadsp_init(DCADSPContext *s);
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3776,7 +3776,9 @@ static void vector_fmul_add_c(float *dst, const float *src0, const float *src1,
        dst[i] = src0[i] * src1[i] + src2[i];
 }

 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
 static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
 {
    int i,j;
    dst += len;
    win += len;
@@ -3786,8 +3788,8 @@ void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, c
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
 }

@@ -4434,7 +4436,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -68,9 +68,6 @@ void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
 void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
 void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);

 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                             const float *win, float add_bias, int len);

 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
@@ -393,7 +390,7 @@ typedef struct DSPContext {
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -90,13 +90,9 @@ static void vector_fmul_add_altivec(float *dst, const float *src0,
    }
 }

 static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
 static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
 {
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    vector float zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

@@ -104,8 +100,6 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
@@ -117,9 +111,9 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_madd(s0, wj, zero);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s0, wi, zero);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

--- a/libavcodec/synth_filter.c
+++ b/libavcodec/synth_filter.c
@@ -24,7 +24,7 @@
 static void synth_filter_float(FFTContext *imdct,
                           float *synth_buf_ptr, int *synth_buf_offset,
                           float synth_buf2[32], const float window[512],
                           float out[32], const float in[32], float scale, float bias)
                           float out[32], const float in[32], float scale)
 {
    float *synth_buf= synth_buf_ptr + *synth_buf_offset;
    int i, j;
@@ -48,8 +48,8 @@ static void synth_filter_float(FFTContext *imdct,
            c += window[i + j + 32]*( synth_buf[16 + i + j - 512]);
            d += window[i + j + 48]*( synth_buf[31 - i + j - 512]);
        }
        out[i     ] = a*scale + bias;
        out[i + 16] = b*scale + bias;
        out[i     ] = a*scale;
        out[i + 16] = b*scale;
        synth_buf2[i     ] = c;
        synth_buf2[i + 16] = d;
    }
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@@ -28,7 +28,7 @@ typedef struct SynthFilterContext {
                               float *synth_buf_ptr, int *synth_buf_offset,
                               float synth_buf2[32], const float window[512],
                               float out[32], const float in[32],
                               float scale, float bias);
                               float scale);
 } SynthFilterContext;

 void ff_synth_filter_init(SynthFilterContext *c);
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -646,7 +646,6 @@ static void imdct_and_window(TwinContext *tctx, enum FrameType ftype, int wtype,
                                     prev_buf + (bsize-wsize)/2,
                                     buf1 + bsize*j,
                                     ff_sine_windows[av_log2(wsize)],
                                     0.0,
                                     wsize/2);
        out2 += wsize;

--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -1575,13 +1575,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc)
        const float *win  = vc->win[blockflag & previous_window];

        if (blockflag == previous_window) {
            vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, blocksize / 4);
            vc->dsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
        } else if (blockflag > previous_window) {
            vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, bs0 / 4);
            vc->dsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
            memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
        } else {
            memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
            vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, 0, bs0 / 4);
            vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
        }
        memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
    }
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -1031,7 +1031,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
        winlen >>= 1;

        s->dsp.vector_fmul_window(start, start, start + winlen,
                                  window, 0, winlen);
                                  window, winlen);

        s->channel[c].prev_block_len = s->subframe_len;
    }
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2190,10 +2190,9 @@ static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1
    );
 }

 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, float add_bias, int len){
 #if HAVE_6REGS
    if(add_bias == 0){
 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, int len){
        x86_reg i = -len*4;
        x86_reg j = len*4-8;
        __asm__ volatile(
@@ -2220,15 +2219,10 @@ static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
        );
    }else
 #endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
 }

 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len){
 #if HAVE_6REGS
    if(add_bias == 0){
                                   const float *win, int len){
        x86_reg i = -len*4;
        x86_reg j = len*4-16;
        __asm__ volatile(
@@ -2256,10 +2250,8 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
        );
    }else
 #endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
 }
 #endif /* HAVE_6REGS */

 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
 {
@@ -2882,7 +2874,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        }
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_3dnow2;
 #endif
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
@@ -2899,7 +2893,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->vector_fmul = vector_fmul_sse;
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
            c->vector_fmul_add = vector_fmul_add_sse;
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_sse;
 #endif
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->vector_clipf = vector_clipf_sse;
            c->float_to_int16 = float_to_int16_sse;