wmavoice: disable bitstream checking.

The checked bitstream reader does that already. To allow parsing of superframes split over a packet boundary, we always decode the last superframe in each packet at the start of the next packet, even if theoretically we could have decoded it. The last superframe in the last packet is decoded using AV_CODEC_CAP_DELAY.
9 years ago · 3deb4b54a2
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -251,6 +251,7 @@ typedef struct WMAVoiceContext {

    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
                                  ///< only used for comfort noise in #pRNG()
    int nb_superframes;           ///< number of superframes in current packet
    float gain_pred_err[6];       ///< cache for gain prediction
    float excitation_history[MAX_SIGNAL_HISTORY];
                                  ///< cache of the signal of previous
@@ -875,7 +876,6 @@ static void dequant_lsps(double *lsps, int num,
 /**
 * @name LSP dequantization routines
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
 * @note we assume enough bits are available, caller should check.
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
 * @{
@@ -1419,7 +1419,6 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,

 /**
 * Parse data in a single block.
 * @note we assume enough bits are available, caller should check.
 *
 * @param s WMA Voice decoding context private data
 * @param gb bit I/O context
@@ -1463,7 +1462,6 @@ static void synth_block(WMAVoiceContext *s, GetBitContext *gb,

 /**
 * Synthesize output samples for a single frame.
 * @note we assume enough bits are available, caller should check.
 *
 * @param ctx WMA Voice decoder context
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
@@ -1681,83 +1679,6 @@ static void stabilize_lsps(double *lsps, int num)
    }
 }

 /**
 * Test if there's enough bits to read 1 superframe.
 *
 * @param orig_gb bit I/O context used for reading. This function
 *                does not modify the state of the bitreader; it
 *                only uses it to copy the current stream position
 * @param s WMA Voice decoding context private data
 * @return < 0 on error, 1 on not enough bits or 0 if OK.
 */
 static int check_bits_for_superframe(GetBitContext *orig_gb,
                                     WMAVoiceContext *s)
 {
    GetBitContext s_gb, *gb = &s_gb;
    int n, need_bits, bd_idx;
    const struct frame_type_desc *frame_desc;

    /* initialize a copy */
    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
    skip_bits_long(gb, get_bits_count(orig_gb));
    av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));

    /* superframe header */
    if (get_bits_left(gb) < 14)
        return 1;
    if (!get_bits1(gb))
        return AVERROR(ENOSYS);           // WMAPro-in-WMAVoice superframe
    if (get_bits1(gb)) skip_bits(gb, 12); // number of  samples in superframe
    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
            return 1;
        skip_bits_long(gb, s->sframe_lsp_bitsize);
    }

    /* frames */
    for (n = 0; n < MAX_FRAMES; n++) {
        int aw_idx_is_ext = 0;

        if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
           if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
           skip_bits_long(gb, s->frame_lsp_bitsize);
        }
        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
        if (bd_idx < 0)
            return AVERROR_INVALIDDATA; // invalid frame type VLC code
        frame_desc = &frame_descs[bd_idx];
        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
            if (get_bits_left(gb) < s->pitch_nbits)
                return 1;
            skip_bits_long(gb, s->pitch_nbits);
        }
        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
            skip_bits(gb, 8);
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            int tmp = get_bits(gb, 6);
            if (tmp >= 0x36) {
                skip_bits(gb, 2);
                aw_idx_is_ext = 1;
            }
        }

        /* blocks */
        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
            need_bits = s->block_pitch_nbits +
                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            need_bits = 2 * !aw_idx_is_ext;
        } else
            need_bits = 0;
        need_bits += frame_desc->frame_size;
        if (get_bits_left(gb) < need_bits)
            return 1;
        skip_bits_long(gb, need_bits);
    }

    return 0;
 }

 /**
 * Synthesize output samples for a single superframe. If we have any data
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
@@ -1780,7 +1701,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 {
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb, s_gb;
    int n, res, n_samples = 480;
    int n, res, n_samples = MAX_SFRAMESIZE;
    double lsps[MAX_FRAMES][MAX_LSPS];
    const double *mean_lsf = s->lsps == 16 ?
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
@@ -1799,12 +1720,6 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
        s->sframe_cache_size = 0;
    }

    if ((res = check_bits_for_superframe(gb, s)) == 1) {
        *got_frame_ptr = 0;
        return 1;
    } else if (res < 0)
        return res;

    /* First bit is speech/music bit, it differentiates between WMAVoice
     * speech samples (the actual codec) and WMAVoice music samples, which
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
@@ -1816,13 +1731,14 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,

    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
    if (get_bits1(gb)) {
        if ((n_samples = get_bits(gb, 12)) > 480) {
        if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
            av_log(ctx, AV_LOG_ERROR,
                   "Superframe encodes >480 samples (%d), not allowed\n",
                   n_samples);
                   "Superframe encodes > %d samples (%d), not allowed\n",
                   MAX_SFRAMESIZE, n_samples);
            return AVERROR_INVALIDDATA;
        }
    }

    /* Parse LSPs, if global for the superframe (can also be per-frame). */
    if (s->has_residual_lsps) {
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
@@ -1845,7 +1761,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
    }

    /* get output buffer */
    frame->nb_samples = 480;
    frame->nb_samples = MAX_SFRAMESIZE;
    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
        return res;
    frame->nb_samples = n_samples;
@@ -1905,26 +1821,23 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 * decoder).
 *
 * @param s WMA Voice decoding context private data
 * @return 1 if not enough bits were available, or 0 on success.
 * @return <0 on error, nb_superframes on success.
 */
 static int parse_packet_header(WMAVoiceContext *s)
 {
    GetBitContext *gb = &s->gb;
    unsigned int res;
    unsigned int res, n_superframes = 0;

    if (get_bits_left(gb) < 11)
        return 1;
    skip_bits(gb, 4);          // packet sequence number
    s->has_residual_lsps = get_bits1(gb);
    do {
        res = get_bits(gb, 6); // number of superframes per packet
                               // (minus first one if there is spillover)
        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
            return 1;
        n_superframes += res;
    } while (res == 0x3F);
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);

    return 0;
    return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
 }

 /**
@@ -1984,23 +1897,24 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     * in a single "muxer" packet, so we artificially emulate that by
     * capping the packet size at ctx->block_align. */
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
    if (!size) {
        *got_frame_ptr = 0;
        return 0;
    }
    init_get_bits(&s->gb, avpkt->data, size << 3);

    /* size == ctx->block_align is used to indicate whether we are dealing with
     * a new packet or a packet of which we already read the packet header
     * previously. */
    if (size == ctx->block_align) { // new packet header
        if ((res = parse_packet_header(s)) < 0)
            return res;
    if (!(size % ctx->block_align)) { // new packet header
        if (!size) {
            s->spillover_nbits = 0;
            s->nb_superframes = 0;
        } else {
            if ((res = parse_packet_header(s)) < 0)
                return res;
            s->nb_superframes = res;
        }

        /* If the packet header specifies a s->spillover_nbits, then we want
         * to push out all data of the previous packet (+ spillover) before
         * continuing to parse new superframes in the current packet. */
        if (s->spillover_nbits > 0) {
            if (s->sframe_cache_size > 0) {
                int cnt = get_bits_count(gb);
                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
@@ -2021,9 +1935,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                } else
                    skip_bits_long (gb, s->spillover_nbits - cnt +
                                    get_bits_count(gb)); // resync
            } else
            } else if (s->spillover_nbits) {
                skip_bits_long(gb, s->spillover_nbits);  // resync
        }
            }
    } else if (s->skip_bits_next)
        skip_bits(gb, s->skip_bits_next);

@@ -2031,6 +1945,10 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
    s->sframe_cache_size = 0;
    s->skip_bits_next = 0;
    pos = get_bits_left(gb);
    if (s->nb_superframes-- == 0) {
        *got_frame_ptr = 0;
        return size;
    } else if (s->nb_superframes > 0) {
    if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
        return res;
    } else if (*got_frame_ptr) {
@@ -2044,13 +1962,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
            return AVERROR_INVALIDDATA;
        }
        return res;
    }
    } else if ((s->sframe_cache_size = pos) > 0) {
        /* rewind bit reader to start of last (incomplete) superframe... */
        init_get_bits(gb, avpkt->data, size << 3);
        skip_bits_long(gb, (size << 3) - pos);
        av_assert1(get_bits_left(gb) == pos);

        /* ...and cache it for spillover in next packet */
        /* ... cache it for spillover in next packet */
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
        // FIXME bad - just copy bytes as whole and add use the
@@ -2084,6 +1998,6 @@ AVCodec ff_wmavoice_decoder = {
    .init_static_data = wmavoice_init_static_data,
    .close            = wmavoice_decode_end,
    .decode           = wmavoice_decode_packet,
    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
    .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
    .flush            = wmavoice_flush,
 };