Merge remote-tracking branch 'qatar/master'

* qatar/master: (22 commits) g723.1: do not pass large structs by value g723.1: do not bounce intermediate values via memory g723.1: declare a variable in the block it is used g723.1: avoid saving/restoring excitation g723.1: avoid unnecessary memcpy() in residual_interp() g723.1: make postfilter write directly to output buffer g723.1: drop unnecessary variable buf_ptr in formant_postfilter() g723.1: make scale_vector() output to a separate buffer g723.1: make autocorr_max() work on an arbitrary buffer g723.1: do not needlessly use int64_t g723.1: use saturating addition functions g723.1: optimise scale_vector() g723.1: remove useless uses of MUL64() g723.1: remove unnecessary argument 'shift' from dot_product() g723.1: deobfuscate "(x << 4) - x" to "15 * x" celp: optimise ff_celp_lp_synthesis_filter() libavutil: add saturating addition functions cllc: Implement ARGB support cllc: Add support for QRGB cllc: Rename some funcs to represent what they actually do ... Conflicts: LICENSE libavcodec/g723_1.c libavcodec/x86/Makefile Merged-by: Michael Niedermayer <michaelni@gmx.at>
13 years ago · d8c3170c9f
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,4 @@
 FFmpeg:
 -------

 Most files in FFmpeg are under the GNU Lesser General Public License version 2.1
 or later (LGPL v2.1+). Read the file COPYING.LGPLv2.1 for details. Some other
@@ -51,18 +50,29 @@ for you. Read the file COPYING.LGPLv3 or, if you have enabled GPL parts,
 COPYING.GPLv3 to learn the exact legal terms that apply in this case.


 external libraries:
 -------------------
 external libraries
 ==================

 Some external libraries, e.g. libx264, are under GPL and can be used in
 conjunction with FFmpeg. They require --enable-gpl to be passed to configure
 as well.
 FFmpeg can be combined with a number of external libraries, which sometimes
 affect the licensing of binaries resulting from the combination.

 The OpenCORE external libraries are under the Apache License 2.0. That license
 is incompatible with the LGPL v2.1 and the GPL v2, but not with version 3 of
 those licenses. So to combine the OpenCORE libraries with FFmpeg, the license
 version needs to be upgraded by passing --enable-version3 to configure.
 compatible libraries
 --------------------

 The nonfree external libraries libfaac and libaacplus can be hooked up in FFmpeg.
 You need to pass --enable-nonfree to configure to enable it. Employ this option
 with care as FFmpeg then becomes nonfree and unredistributable.
 The libcdio, libx264, libxavs and libxvid libraries are under GPL. When
 combining them with FFmpeg, FFmpeg needs to be licensed as GPL as well by
 passing --enable-gpl to configure.

 The OpenCORE and VisualOn libraries are under the Apache License 2.0. That
 license is incompatible with the LGPL v2.1 and the GPL v2, but not with
 version 3 of those licenses. So to combine these libraries with FFmpeg, the
 license version needs to be upgraded by passing --enable-version3 to configure.

 incompatible libraries
 ----------------------

 The Fraunhofer AAC library, FAAC and aacplus are under licenses incompatible
 with all (L)GPL versions. Thus, unfortunately, since both licenses cannot be
 satisfied simultaneously, binaries resulting from the combination of FFmpeg
 with these libraries are nonfree and unredistributable. If you wish to enable
 any of these libraries nonetheless, pass --enable-nonfree to configure.
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -63,17 +63,16 @@ int ff_celp_lp_synthesis_filter(int16_t *out, const int16_t *filter_coeffs,
    int i,n;

    for (n = 0; n < buffer_length; n++) {
        int sum = rounder;
        int sum = -rounder, sum1;
        for (i = 1; i <= filter_length; i++)
            sum -= filter_coeffs[i-1] * out[n-i];
            sum += filter_coeffs[i-1] * out[n-i];

        sum = ((sum >> 12) + in[n]) >> shift;
        sum1 = ((-sum >> 12) + in[n]) >> shift;
        sum  = av_clip_int16(sum1);

        if (stop_on_overflow && sum != sum1)
            return 1;

        if (sum + 0x8000 > 0xFFFFU) {
            if (stop_on_overflow)
                return 1;
            sum = (sum >> 31) ^ 32767;
        }
        out[n] = sum;
    }

--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -65,7 +65,7 @@ typedef struct g723_1_context {
    int pf_gain;                 ///< formant postfilter
                                 ///< gain scaling unit memory
    int postfilter;
    int16_t audio[FRAME_LEN + LPC_ORDER];
    int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX];
    int16_t prev_data[HALF_FRAME_LEN];
    int16_t prev_weight_sig[PITCH_MAX];

@@ -245,32 +245,27 @@ static int normalize_bits(int num, int width)

 #define normalize_bits_int16(num) normalize_bits(num, 15)
 #define normalize_bits_int32(num) normalize_bits(num, 31)
 #define dot_product(a,b,c,d) (ff_dot_product(a,b,c)<<(d))

 /**
 * Scale vector contents based on the largest of their absolutes.
 */
 static int scale_vector(int16_t *vector, int length)
 static int scale_vector(int16_t *dst, const int16_t *vector, int length)
 {
    int bits, scale, max = 0;
    int bits, max = 0;
    int i;

    const int16_t shift_table[16] = {
        0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
        0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x7fff
    };

    for (i = 0; i < length; i++)
        max = FFMAX(max, FFABS(vector[i]));
        max |= FFABS(vector[i]);

    max   = FFMIN(max, 0x7FFF);
    bits  = normalize_bits(max, 15);
    scale = shift_table[bits];

    for (i = 0; i < length; i++) {
        av_assert2(av_clipl_int32(vector[i] * (int64_t)scale << 1) == vector[i] * (int64_t)scale << 1);
        vector[i] = (vector[i] * scale) >> 3;
    }
    if (bits == 15)
        for (i = 0; i < length; i++)
            dst[i] = vector[i] * 0x7fff >> 3;
    else
        for (i = 0; i < length; i++)
            dst[i] = vector[i] << bits >> 3;

    return bits - 3;
 }
@@ -369,11 +364,11 @@ static void lsp2lpc(int16_t *lpc)
    for (j = 0; j < LPC_ORDER; j++) {
        int index     = lpc[j] >> 7;
        int offset    = lpc[j] & 0x7f;
        int64_t temp1 = cos_tab[index] << 16;
        int temp1     = cos_tab[index] << 16;
        int temp2     = (cos_tab[index + 1] - cos_tab[index]) *
                          ((offset << 8) + 0x80) << 1;

        lpc[j] = -(av_clipl_int32(((temp1 + temp2) << 1) + (1 << 15)) >> 16);
        lpc[j] = -(av_sat_dadd32(1 << 15, temp1 + temp2) >> 16);
    }

    /*
@@ -473,7 +468,7 @@ static void gen_dirac_train(int16_t *buf, int pitch_lag)
 * @param pitch_lag closed loop pitch lag
 * @param index     current subframe index
 */
 static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
 static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe *subfrm,
                               enum Rate cur_rate, int pitch_lag, int index)
 {
    int temp, i, j;
@@ -481,34 +476,34 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
    memset(vector, 0, SUBFRAME_LEN * sizeof(*vector));

    if (cur_rate == RATE_6300) {
        if (subfrm.pulse_pos >= max_pos[index])
        if (subfrm->pulse_pos >= max_pos[index])
            return;

        /* Decode amplitudes and positions */
        j = PULSE_MAX - pulses[index];
        temp = subfrm.pulse_pos;
        temp = subfrm->pulse_pos;
        for (i = 0; i < SUBFRAME_LEN / GRID_SIZE; i++) {
            temp -= combinatorial_table[j][i];
            if (temp >= 0)
                continue;
            temp += combinatorial_table[j++][i];
            if (subfrm.pulse_sign & (1 << (PULSE_MAX - j))) {
                vector[subfrm.grid_index + GRID_SIZE * i] =
                                        -fixed_cb_gain[subfrm.amp_index];
            if (subfrm->pulse_sign & (1 << (PULSE_MAX - j))) {
                vector[subfrm->grid_index + GRID_SIZE * i] =
                                        -fixed_cb_gain[subfrm->amp_index];
            } else {
                vector[subfrm.grid_index + GRID_SIZE * i] =
                                         fixed_cb_gain[subfrm.amp_index];
                vector[subfrm->grid_index + GRID_SIZE * i] =
                                         fixed_cb_gain[subfrm->amp_index];
            }
            if (j == PULSE_MAX)
                break;
        }
        if (subfrm.dirac_train == 1)
        if (subfrm->dirac_train == 1)
            gen_dirac_train(vector, pitch_lag);
    } else { /* 5300 bps */
        int cb_gain  = fixed_cb_gain[subfrm.amp_index];
        int cb_shift = subfrm.grid_index;
        int cb_sign  = subfrm.pulse_sign;
        int cb_pos   = subfrm.pulse_pos;
        int cb_gain  = fixed_cb_gain[subfrm->amp_index];
        int cb_shift = subfrm->grid_index;
        int cb_sign  = subfrm->pulse_sign;
        int cb_pos   = subfrm->pulse_pos;
        int offset, beta, lag;

        for (i = 0; i < 8; i += 2) {
@@ -519,9 +514,9 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
        }

        /* Enhance harmonic components */
        lag  = pitch_contrib[subfrm.ad_cb_gain << 1] + pitch_lag +
               subfrm.ad_cb_lag - 1;
        beta = pitch_contrib[(subfrm.ad_cb_gain << 1) + 1];
        lag  = pitch_contrib[subfrm->ad_cb_gain << 1] + pitch_lag +
               subfrm->ad_cb_lag - 1;
        beta = pitch_contrib[(subfrm->ad_cb_gain << 1) + 1];

        if (lag < SUBFRAME_LEN - 2) {
            for (i = lag; i < SUBFRAME_LEN; i++)
@@ -546,19 +541,25 @@ static void get_residual(int16_t *residual, int16_t *prev_excitation, int lag)
        residual[i] = prev_excitation[offset + (i - 2) % lag];
 }

 /**
 * Doubled dot product of two int16_t vectors.
 *
 * Calls ff_dot_product() on the first `length` elements of a and b, then
 * adds the result to itself with av_sat_add32(), i.e. returns 2 * sum.
 * NOTE(review): av_sat_add32() is presumably the saturating 32-bit add
 * from libavutil (so the doubling clips instead of wrapping) -- confirm.
 *
 * @param a      first input vector
 * @param b      second input vector
 * @param length number of elements passed to ff_dot_product()
 * @return the dot product added to itself via av_sat_add32()
 */
 static int dot_product(const int16_t *a, const int16_t *b, int length)
 {
    int sum = ff_dot_product(a,b,length);
    return av_sat_add32(sum, sum);
 }

 /**
 * Generate adaptive codebook excitation.
 */
 static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
                               int pitch_lag, G723_1_Subframe subfrm,
                               int pitch_lag, G723_1_Subframe *subfrm,
                               enum Rate cur_rate)
 {
    int16_t residual[SUBFRAME_LEN + PITCH_ORDER - 1];
    const int16_t *cb_ptr;
    int lag = pitch_lag + subfrm.ad_cb_lag - 1;
    int lag = pitch_lag + subfrm->ad_cb_lag - 1;

    int i;
    int64_t sum;
    int sum;

    get_residual(residual, prev_excitation, lag);

@@ -569,28 +570,27 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
        cb_ptr = adaptive_cb_gain170;

    /* Calculate adaptive vector */
    cb_ptr += subfrm.ad_cb_gain * 20;
    cb_ptr += subfrm->ad_cb_gain * 20;
    for (i = 0; i < SUBFRAME_LEN; i++) {
        sum = ff_dot_product(residual + i, cb_ptr, PITCH_ORDER);
        vector[i] = av_clipl_int32((sum << 2) + (1 << 15)) >> 16;
        vector[i] = av_sat_dadd32(1 << 15, av_sat_add32(sum, sum)) >> 16;
    }
 }

 /**
 * Estimate maximum auto-correlation around pitch lag.
 *
 * @param p         the context
 * @param buf       buffer with offset applied
 * @param offset    offset of the excitation vector
 * @param ccr_max   pointer to the maximum auto-correlation
 * @param pitch_lag decoded pitch lag
 * @param length    length of autocorrelation
 * @param dir       forward lag(1) / backward lag(-1)
 */
 static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
 static int autocorr_max(const int16_t *buf, int offset, int *ccr_max,
                        int pitch_lag, int length, int dir)
 {
    int limit, ccr, lag = 0;
    int16_t *buf = p->excitation + offset;
    int i;

    pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag);
@@ -600,7 +600,7 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
        limit = pitch_lag + 3;

    for (i = pitch_lag - 3; i <= limit; i++) {
        ccr = ff_dot_product(buf, buf + dir * i, length)<<1;
        ccr = dot_product(buf, buf + dir * i, length);

        if (ccr > *ccr_max) {
            *ccr_max = ccr;
@@ -624,7 +624,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate,
                           int tgt_eng, int ccr, int res_eng)
 {
    int pf_residual;     /* square of postfiltered residual */
    int64_t temp1, temp2;
    int temp1, temp2;

    ppf->index = lag;

@@ -641,7 +641,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate,
        /* pf_res^2 = tgt_eng + 2*ccr*gain + res_eng*gain^2 */
        temp1       = (tgt_eng << 15) + (ccr * ppf->opt_gain << 1);
        temp2       = (ppf->opt_gain * ppf->opt_gain >> 15) * res_eng;
        pf_residual = av_clipl_int32(temp1 + temp2 + (1 << 15)) >> 16;
        pf_residual = av_sat_add32(temp1, temp2 + (1 << 15)) >> 16;

        if (tgt_eng >= pf_residual << 1) {
            temp1 = 0x7fff;
@@ -674,7 +674,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,

    int16_t scale;
    int i;
    int64_t temp1, temp2;
    int temp1, temp2;

    /*
     * 0 - target energy
@@ -684,10 +684,10 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
     * 4 - backward residual energy
     */
    int energy[5] = {0, 0, 0, 0, 0};
    int16_t *buf  = p->excitation + offset;
    int fwd_lag   = autocorr_max(p, offset, &energy[1], pitch_lag,
    int16_t *buf  = p->audio + LPC_ORDER + offset;
    int fwd_lag   = autocorr_max(buf, offset, &energy[1], pitch_lag,
                                 SUBFRAME_LEN, 1);
    int back_lag  = autocorr_max(p, offset, &energy[3], pitch_lag,
    int back_lag  = autocorr_max(buf, offset, &energy[3], pitch_lag,
                                 SUBFRAME_LEN, -1);

    ppf->index    = 0;
@@ -699,17 +699,15 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
        return;

    /* Compute target energy */
    energy[0] = ff_dot_product(buf, buf, SUBFRAME_LEN)<<1;
    energy[0] = dot_product(buf, buf, SUBFRAME_LEN);

    /* Compute forward residual energy */
    if (fwd_lag)
        energy[2] = ff_dot_product(buf + fwd_lag, buf + fwd_lag,
                                   SUBFRAME_LEN)<<1;
        energy[2] = dot_product(buf + fwd_lag, buf + fwd_lag, SUBFRAME_LEN);

    /* Compute backward residual energy */
    if (back_lag)
        energy[4] = ff_dot_product(buf - back_lag, buf - back_lag,
                                   SUBFRAME_LEN)<<1;
        energy[4] = dot_product(buf - back_lag, buf - back_lag, SUBFRAME_LEN);

    /* Normalize and shorten */
    temp1 = 0;
@@ -758,28 +756,28 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
                             int *exc_eng, int *scale)
 {
    int offset = PITCH_MAX + 2 * SUBFRAME_LEN;
    int16_t *buf = p->excitation + offset;
    int16_t *buf = p->audio + LPC_ORDER;

    int index, ccr, tgt_eng, best_eng, temp;

    *scale = scale_vector(p->excitation, FRAME_LEN + PITCH_MAX);
    *scale = scale_vector(buf, p->excitation, FRAME_LEN + PITCH_MAX);
    buf   += offset;

    /* Compute maximum backward cross-correlation */
    ccr   = 0;
    index = autocorr_max(p, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1);
    ccr   = av_clipl_int32((int64_t)ccr + (1 << 15)) >> 16;
    index = autocorr_max(buf, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1);
    ccr   = av_sat_add32(ccr, 1 << 15) >> 16;

    /* Compute target energy */
    tgt_eng  = ff_dot_product(buf, buf, SUBFRAME_LEN * 2)<<1;
    *exc_eng = av_clipl_int32(tgt_eng + (1 << 15)) >> 16;
    tgt_eng  = dot_product(buf, buf, SUBFRAME_LEN * 2);
    *exc_eng = av_sat_add32(tgt_eng, 1 << 15) >> 16;

    if (ccr <= 0)
        return 0;

    /* Compute best energy */
    best_eng = ff_dot_product(buf - index, buf - index,
                              SUBFRAME_LEN * 2)<<1;
    best_eng = av_clipl_int32((int64_t)best_eng + (1 << 15)) >> 16;
    best_eng = dot_product(buf - index, buf - index, SUBFRAME_LEN * 2);
    best_eng = av_sat_add32(best_eng, 1 << 15) >> 16;

    temp = best_eng * *exc_eng >> 3;

@@ -806,10 +804,9 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
        int16_t *vector_ptr = buf + PITCH_MAX;
        /* Attenuate */
        for (i = 0; i < lag; i++)
            vector_ptr[i - lag] = vector_ptr[i - lag] * 3 >> 2;
        av_memcpy_backptr((uint8_t*)vector_ptr, lag * sizeof(*vector_ptr),
                          FRAME_LEN * sizeof(*vector_ptr));
        memcpy(out, vector_ptr, FRAME_LEN * sizeof(*vector_ptr));
            out[i] = vector_ptr[i - lag] * 3 >> 2;
        av_memcpy_backptr((uint8_t*)(out + lag), lag * sizeof(*out),
                          (FRAME_LEN - lag) * sizeof(*out));
    } else {  /* Unvoiced */
        for (i = 0; i < FRAME_LEN; i++) {
            *rseed = *rseed * 521 + 259;
@@ -861,9 +858,9 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
    num   = energy;
    denom = 0;
    for (i = 0; i < SUBFRAME_LEN; i++) {
        int64_t temp = buf[i] >> 2;
        temp  = av_clipl_int32(MUL64(temp, temp) << 1);
        denom = av_clipl_int32(denom + temp);
        int temp = buf[i] >> 2;
        temp *= temp;
        denom = av_sat_dadd32(denom, temp);
    }

    if (num && denom) {
@@ -882,7 +879,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
    }

    for (i = 0; i < SUBFRAME_LEN; i++) {
        p->pf_gain = ((p->pf_gain << 4) - p->pf_gain + gain + (1 << 3)) >> 4;
        p->pf_gain = (15 * p->pf_gain + gain + (1 << 3)) >> 4;
        buf[i]     = av_clip_int16((buf[i] * (p->pf_gain + (p->pf_gain >> 4)) +
                                   (1 << 10)) >> 11);
    }
@@ -893,11 +890,13 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
 *
 * @param p   the context
 * @param lpc quantized lpc coefficients
 * @param buf output buffer
 * @param buf input buffer
 * @param dst output buffer
 */
 static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
                               int16_t *buf, int16_t *dst)
 {
    int16_t filter_coef[2][LPC_ORDER], *buf_ptr;
    int16_t filter_coef[2][LPC_ORDER];
    int filter_signal[LPC_ORDER + FRAME_LEN], *signal_ptr;
    int i, j, k;

@@ -919,23 +918,19 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
    memcpy(p->iir_mem, filter_signal + FRAME_LEN, LPC_ORDER * sizeof(int));

    buf_ptr    = buf + LPC_ORDER;
    buf += LPC_ORDER;
    signal_ptr = filter_signal + LPC_ORDER;
    for (i = 0; i < SUBFRAMES; i++) {
        int16_t temp_vector[SUBFRAME_LEN];
        int temp;
        int auto_corr[2];
        int scale, energy;

        /* Normalize */
        memcpy(temp_vector, buf_ptr, SUBFRAME_LEN * sizeof(*temp_vector));
        scale = scale_vector(temp_vector, SUBFRAME_LEN);
        scale = scale_vector(dst, buf, SUBFRAME_LEN);

        /* Compute auto correlation coefficients */
        auto_corr[0] = ff_dot_product(temp_vector, temp_vector + 1,
                                      SUBFRAME_LEN - 1)<<1;
        auto_corr[1] = ff_dot_product(temp_vector, temp_vector,
                                      SUBFRAME_LEN)<<1;
        auto_corr[0] = dot_product(dst, dst + 1, SUBFRAME_LEN - 1);
        auto_corr[1] = dot_product(dst, dst,     SUBFRAME_LEN);

        /* Compute reflection coefficient */
        temp = auto_corr[1] >> 16;
@@ -947,9 +942,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)

        /* Compensation filter */
        for (j = 0; j < SUBFRAME_LEN; j++) {
            buf_ptr[j] = av_clipl_int32((int64_t)signal_ptr[j] +
                                        ((signal_ptr[j - 1] >> 16) *
                                         temp << 1)) >> 16;
            dst[j] = av_sat_dadd32(signal_ptr[j],
                                   (signal_ptr[j - 1] >> 16) * temp) >> 16;
        }

        /* Compute normalized signal energy */
@@ -959,10 +953,11 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
        } else
            energy = auto_corr[1] >> temp;

        gain_scale(p, buf_ptr, energy);
        gain_scale(p, dst, energy);

        buf_ptr    += SUBFRAME_LEN;
        buf        += SUBFRAME_LEN;
        signal_ptr += SUBFRAME_LEN;
        dst        += SUBFRAME_LEN;
    }
 }

@@ -978,9 +973,9 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
    int16_t cur_lsp[LPC_ORDER];
    int16_t lpc[SUBFRAMES * LPC_ORDER];
    int16_t acb_vector[SUBFRAME_LEN];
    int16_t *vector_ptr;
    int16_t *out;
    int bad_frame = 0, i, j, ret;
    int16_t *audio = p->audio;

    if (buf_size < frame_size[dec_mode]) {
        if (buf_size)
@@ -1022,48 +1017,38 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
        /* Generate the excitation for the frame */
        memcpy(p->excitation, p->prev_excitation,
               PITCH_MAX * sizeof(*p->excitation));
        vector_ptr = p->excitation + PITCH_MAX;
        if (!p->erased_frames) {
            int16_t *vector_ptr = p->excitation + PITCH_MAX;

            /* Update interpolation gain memory */
            p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index +
                                            p->subframe[3].amp_index) >> 1];
            for (i = 0; i < SUBFRAMES; i++) {
                gen_fcb_excitation(vector_ptr, p->subframe[i], p->cur_rate,
                gen_fcb_excitation(vector_ptr, &p->subframe[i], p->cur_rate,
                                   p->pitch_lag[i >> 1], i);
                gen_acb_excitation(acb_vector, &p->excitation[SUBFRAME_LEN * i],
                                   p->pitch_lag[i >> 1], p->subframe[i],
                                   p->pitch_lag[i >> 1], &p->subframe[i],
                                   p->cur_rate);
                /* Get the total excitation */
                for (j = 0; j < SUBFRAME_LEN; j++) {
                    vector_ptr[j] = av_clip_int16(vector_ptr[j] << 1);
                    vector_ptr[j] = av_clip_int16(vector_ptr[j] +
                                                  acb_vector[j]);
                    int v = av_clip_int16(vector_ptr[j] << 1);
                    vector_ptr[j] = av_clip_int16(v + acb_vector[j]);
                }
                vector_ptr += SUBFRAME_LEN;
            }

            vector_ptr = p->excitation + PITCH_MAX;

            /* Save the excitation */
            memcpy(p->audio + LPC_ORDER, vector_ptr, FRAME_LEN * sizeof(*p->audio));

            p->interp_index = comp_interp_index(p, p->pitch_lag[1],
                                                &p->sid_gain, &p->cur_gain);

            /* Perform pitch postfiltering */
            if (p->postfilter) {
                i = PITCH_MAX;
                for (j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
                    comp_ppf_coeff(p, i, p->pitch_lag[j >> 1],
                                   ppf + j, p->cur_rate);
            }

            /* Restore the original excitation */
            memcpy(p->excitation, p->prev_excitation,
                   PITCH_MAX * sizeof(*p->excitation));
            memcpy(vector_ptr, p->audio + LPC_ORDER, FRAME_LEN * sizeof(*vector_ptr));

            /* Perform pitch postfiltering */
            if (p->postfilter)
                for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
                    ff_acelp_weighted_vector_sum(p->audio + LPC_ORDER + i,
                                                 vector_ptr + i,
@@ -1071,24 +1056,35 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                                 ppf[j].sc_gain,
                                                 ppf[j].opt_gain,
                                                 1 << 14, 15, SUBFRAME_LEN);
            } else {
                audio = vector_ptr - LPC_ORDER;
            }

            /* Save the excitation for the next frame */
            memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
                   PITCH_MAX * sizeof(*p->excitation));
        } else {
            p->interp_gain = (p->interp_gain * 3 + 2) >> 2;
            if (p->erased_frames == 3) {
                /* Mute output */
                memset(p->excitation, 0,
                       (FRAME_LEN + PITCH_MAX) * sizeof(*p->excitation));
                memset(p->prev_excitation, 0,
                       PITCH_MAX * sizeof(*p->excitation));
                memset(p->frame.data[0], 0,
                       (FRAME_LEN + LPC_ORDER) * sizeof(int16_t));
            } else {
                int16_t *buf = p->audio + LPC_ORDER;

                /* Regenerate frame */
                residual_interp(p->excitation, p->audio + LPC_ORDER, p->interp_index,
                residual_interp(p->excitation, buf, p->interp_index,
                                p->interp_gain, &p->random_seed);

                /* Save the excitation for the next frame */
                memcpy(p->prev_excitation, buf + (FRAME_LEN - PITCH_MAX),
                       PITCH_MAX * sizeof(*p->excitation));
            }
        }
        /* Save the excitation for the next frame */
        memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
               PITCH_MAX * sizeof(*p->excitation));
    } else {
        memset(out, 0, FRAME_LEN * 2);
        av_log(avctx, AV_LOG_WARNING,
@@ -1104,13 +1100,12 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
    memcpy(p->audio, p->synth_mem, LPC_ORDER * sizeof(*p->audio));
    for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
        ff_celp_lp_synthesis_filter(p->audio + i, &lpc[j * LPC_ORDER],
                                    p->audio + i, SUBFRAME_LEN, LPC_ORDER,
                                    audio + i, SUBFRAME_LEN, LPC_ORDER,
                                    0, 1, 1 << 12);
    memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));

    if (p->postfilter) {
        formant_postfilter(p, lpc, p->audio);
        memcpy(p->frame.data[0], p->audio + LPC_ORDER, FRAME_LEN * 2);
        formant_postfilter(p, lpc, p->audio, out);
    } else { // if output is not postfiltered it should be scaled by 2
        for (i = 0; i < FRAME_LEN; i++)
            out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1);
@@ -1214,14 +1209,14 @@ static void comp_autocorr(int16_t *buf, int16_t *autocorr)
    int16_t vector[LPC_FRAME];

    memcpy(vector, buf, LPC_FRAME * sizeof(int16_t));
    scale_vector(vector, LPC_FRAME);
    scale_vector(vector, vector, LPC_FRAME);

    /* Apply the Hamming window */
    for (i = 0; i < LPC_FRAME; i++)
        vector[i] = (vector[i] * hamming_window[i] + (1 << 14)) >> 15;

    /* Compute the first autocorrelation coefficient */
    temp = dot_product(vector, vector, LPC_FRAME, 0);
    temp = ff_dot_product(vector, vector, LPC_FRAME);

    /* Apply a white noise correlation factor of (1025/1024) */
    temp += temp >> 10;
@@ -1236,7 +1231,7 @@ static void comp_autocorr(int16_t *buf, int16_t *autocorr)
        memset(autocorr + 1, 0, LPC_ORDER * sizeof(int16_t));
    } else {
        for (i = 1; i <= LPC_ORDER; i++) {
           temp = dot_product(vector, vector + i, LPC_FRAME - i, 0);
           temp = ff_dot_product(vector, vector + i, LPC_FRAME - i);
           temp = MULL2((temp << scale), binomial_window[i - 1]);
           autocorr[i] = av_clipl_int32((int64_t)temp + (1 << 15)) >> 16;
        }
@@ -1416,8 +1411,8 @@ static void lpc2lsp(int16_t *lpc, int16_t *prev_lsp, int16_t *lsp)
            temp[j] = (weight[j + (offset)] * lsp_band##num[i][j] +\
                      (1 << 14)) >> 15;\
        }\
        error =  dot_product(lsp + (offset), temp, size, 1) << 1;\
        error -= dot_product(lsp_band##num[i], temp, size, 1);\
        error =  dot_product(lsp + (offset), temp, size) << 1;\
        error -= dot_product(lsp_band##num[i], temp, size);\
        if (error > max) {\
            max = error;\
            lsp_index[num] = i;\
@@ -1522,7 +1517,7 @@ static int estimate_pitch(int16_t *buf, int start)

    int i;

    orig_eng = dot_product(buf + offset, buf + offset, HALF_FRAME_LEN, 0);
    orig_eng = ff_dot_product(buf + offset, buf + offset, HALF_FRAME_LEN);

    for (i = PITCH_MIN; i <= PITCH_MAX - 3; i++) {
        offset--;
@@ -1530,7 +1525,7 @@ static int estimate_pitch(int16_t *buf, int start)
        /* Update energy and compute correlation */
        orig_eng += buf[offset] * buf[offset] -
                    buf[offset + HALF_FRAME_LEN] * buf[offset + HALF_FRAME_LEN];
        ccr      =  dot_product(buf + start, buf + offset, HALF_FRAME_LEN, 0);
        ccr      =  ff_dot_product(buf + start, buf + offset, HALF_FRAME_LEN);
        if (ccr <= 0)
            continue;

@@ -1591,13 +1586,13 @@ static void comp_harmonic_coeff(int16_t *buf, int16_t pitch_lag, HFParam *hf)

    for (i = 0, j = pitch_lag - 3; j <= pitch_lag + 3; i++, j++) {
        /* Compute residual energy */
        energy[i << 1] = dot_product(buf - j, buf - j, SUBFRAME_LEN, 0);
        energy[i << 1] = ff_dot_product(buf - j, buf - j, SUBFRAME_LEN);
        /* Compute correlation */
        energy[(i << 1) + 1] = dot_product(buf, buf - j, SUBFRAME_LEN, 0);
        energy[(i << 1) + 1] = ff_dot_product(buf, buf - j, SUBFRAME_LEN);
    }

    /* Compute target energy */
    energy[14] = dot_product(buf, buf, SUBFRAME_LEN, 0);
    energy[14] = ff_dot_product(buf, buf, SUBFRAME_LEN);

    /* Normalize */
    max = 0;
@@ -1778,19 +1773,19 @@ static void acb_search(G723_1_Context *p, int16_t *residual,

        /* Compute crosscorrelation with the signal */
        for (j = 0; j < PITCH_ORDER; j++) {
            temp = dot_product(buf, flt_buf[j], SUBFRAME_LEN, 0);
            temp = ff_dot_product(buf, flt_buf[j], SUBFRAME_LEN);
            ccr_buf[count++] = av_clipl_int32(temp << 1);
        }

        /* Compute energies */
        for (j = 0; j < PITCH_ORDER; j++) {
            ccr_buf[count++] = dot_product(flt_buf[j], flt_buf[j],
                                           SUBFRAME_LEN, 1);
                                           SUBFRAME_LEN);
        }

        for (j = 1; j < PITCH_ORDER; j++) {
            for (k = 0; k < j; k++) {
                temp = dot_product(flt_buf[j], flt_buf[k], SUBFRAME_LEN, 0);
                temp = ff_dot_product(flt_buf[j], flt_buf[k], SUBFRAME_LEN);
                ccr_buf[count++] = av_clipl_int32(temp<<2);
            }
        }
@@ -1893,20 +1888,20 @@ static void get_fcb_param(FCBParam *optim, int16_t *impulse_resp,
        temp_corr[i] = impulse_r[i] >> 1;

    /* Compute impulse response autocorrelation */
    temp = dot_product(temp_corr, temp_corr, SUBFRAME_LEN, 1);
    temp = dot_product(temp_corr, temp_corr, SUBFRAME_LEN);

    scale = normalize_bits_int32(temp);
    impulse_corr[0] = av_clipl_int32((temp << scale) + (1 << 15)) >> 16;

    for (i = 1; i < SUBFRAME_LEN; i++) {
        temp = dot_product(temp_corr + i, temp_corr, SUBFRAME_LEN - i, 1);
        temp = dot_product(temp_corr + i, temp_corr, SUBFRAME_LEN - i);
        impulse_corr[i] = av_clipl_int32((temp << scale) + (1 << 15)) >> 16;
    }

    /* Compute crosscorrelation of impulse response with residual signal */
    scale -= 4;
    for (i = 0; i < SUBFRAME_LEN; i++){
        temp = dot_product(buf + i, impulse_r, SUBFRAME_LEN - i, 1);
        temp = dot_product(buf + i, impulse_r, SUBFRAME_LEN - i);
        if (scale < 0)
            ccr1[i] = temp >> -scale;
        else
@@ -2185,7 +2180,7 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    memcpy(vector, p->prev_weight_sig, sizeof(int16_t) * PITCH_MAX);
    memcpy(vector + PITCH_MAX, in, sizeof(int16_t) * FRAME_LEN);

    scale_vector(vector, FRAME_LEN + PITCH_MAX);
    scale_vector(vector, vector, FRAME_LEN + PITCH_MAX);

    p->pitch_lag[0] = estimate_pitch(vector, PITCH_MAX);
    p->pitch_lag[1] = estimate_pitch(vector, PITCH_MAX + HALF_FRAME_LEN);
@@ -2237,14 +2232,14 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,

        acb_search(p, residual, impulse_resp, in, i);
        gen_acb_excitation(residual, p->prev_excitation,p->pitch_lag[i >> 1],
                           p->subframe[i], p->cur_rate);
                           &p->subframe[i], p->cur_rate);
        sub_acb_contrib(residual, impulse_resp, in);

        fcb_search(p, impulse_resp, in, i);

        /* Reconstruct the excitation */
        gen_acb_excitation(impulse_resp, p->prev_excitation, p->pitch_lag[i >> 1],
                           p->subframe[i], RATE_6300);
                           &p->subframe[i], RATE_6300);

        memmove(p->prev_excitation, p->prev_excitation + SUBFRAME_LEN,
               sizeof(int16_t) * (PITCH_MAX - SUBFRAME_LEN));
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -41,7 +41,7 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32_sse.o
 YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
 YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc_yasm.o
 YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
                                          $(YASM-OBJS-FFT-yes)

@@ -65,11 +65,11 @@ YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                          x86/rv40dsp.o
 YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o

 YASM-OBJS                              += x86/dsputil_yasm.o            \
 YASM-OBJS                              += x86/dsputil.o                 \
                                          x86/deinterlace.o             \
                                          x86/fmtconvert.o              \
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
--- a/libavcodec/x86/dsputilenc_yasm.asm
+++ b/libavcodec/x86/dsputilenc_yasm.asm
--- a/libavcodec/x86/vc1dsp_yasm.asm
+++ b/libavcodec/x86/vc1dsp_yasm.asm
--- a/libavutil/arm/intmath.h
+++ b/libavutil/arm/intmath.h
@@ -83,6 +83,21 @@ static av_always_inline av_const unsigned av_clip_uintp2_arm(int a, int p)
    return x;
 }

 #define av_sat_add32 av_sat_add32_arm
 /**
 * ARM override for av_sat_add32(): adds a and b using a single
 * "qadd" instruction and returns the value it produces.
 *
 * @param  a first operand
 * @param  b second operand
 * @return result register written by qadd
 */
 static av_always_inline int av_sat_add32_arm(int a, int b)
 {
    int r;
    /* %0 = result, %1 = a, %2 = b; all operands live in core registers. */
    __asm__ ("qadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
    return r;
 }

 #define av_sat_dadd32 av_sat_dadd32_arm
 /**
 * ARM override for av_sat_dadd32(): issues a single "qdadd"
 * instruction on a and b and returns the value it produces.
 *
 * @param  a operand passed as the first source register
 * @param  b operand passed as the second source register
 *           (NOTE(review): per the ARM reference this is the operand
 *           that qdadd doubles before the add -- confirm)
 * @return result register written by qdadd
 */
 static av_always_inline int av_sat_dadd32_arm(int a, int b)
 {
    int r;
    /* %0 = result, %1 = a, %2 = b; all operands live in core registers. */
    __asm__ ("qdadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
    return r;
 }

 #else /* HAVE_ARMV6 */

--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -186,6 +186,30 @@ static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p)
    else                   return  a;
 }

 /**
 * Saturating addition of two signed 32-bit integers.
 *
 * The operands are summed in 64-bit arithmetic so the intermediate
 * result cannot overflow, then the sum is passed through
 * av_clipl_int32() to bring it back to an int.
 *
 * @param  a first addend
 * @param  b second addend
 * @return a + b after av_clipl_int32()
 */
 static av_always_inline int av_sat_add32_c(int a, int b)
 {
    const int64_t wide_sum = (int64_t)a + b;

    return av_clipl_int32(wide_sum);
 }

 /**
 * Saturating "double and add": computes a + 2*b in two steps,
 * each performed with av_sat_add32().
 *
 * @param  a value the doubled operand is added to
 * @param  b value that is doubled (as b + b) before the final add
 * @return av_sat_add32(a, av_sat_add32(b, b))
 */
 static av_always_inline int av_sat_dadd32_c(int a, int b)
 {
    /* First stage: double b. */
    const int doubled_b = av_sat_add32(b, b);

    /* Second stage: add the doubled value to a. */
    return av_sat_add32(a, doubled_b);
 }

 /**
 * Clip a float value into the amin-amax range.
 * @param a value to clip
@@ -392,6 +416,12 @@ static av_always_inline av_const int av_popcount64_c(uint64_t x)
 #ifndef av_clip_uintp2
 #   define av_clip_uintp2   av_clip_uintp2_c
 #endif
 /* Use the generic C versions of the saturating adds unless a definition
 * of the macro already exists at this point. */
 #ifndef av_sat_add32
 #   define av_sat_add32     av_sat_add32_c
 #endif
 #ifndef av_sat_dadd32
 #   define av_sat_dadd32    av_sat_dadd32_c
 #endif
 #ifndef av_clipf
 #   define av_clipf         av_clipf_c
 #endif