|
|
|
@@ -251,6 +251,7 @@ typedef struct WMAVoiceContext { |
|
|
|
|
|
|
|
int frame_cntr; ///< current frame index [0 - 0xFFFE]; is |
|
|
|
///< only used for comfort noise in #pRNG() |
|
|
|
int nb_superframes; ///< number of superframes in current packet |
|
|
|
float gain_pred_err[6]; ///< cache for gain prediction |
|
|
|
float excitation_history[MAX_SIGNAL_HISTORY]; |
|
|
|
///< cache of the signal of previous |
|
|
|
@@ -875,7 +876,6 @@ static void dequant_lsps(double *lsps, int num, |
|
|
|
/** |
|
|
|
* @name LSP dequantization routines |
|
|
|
* LSP dequantization routines, for 10/16LSPs and independent/residual coding. |
|
|
|
* @note we assume enough bits are available, caller should check. |
|
|
|
* lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits; |
|
|
|
* lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits. |
|
|
|
* @{ |
|
|
|
@@ -1419,7 +1419,6 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, |
|
|
|
|
|
|
|
/** |
|
|
|
* Parse data in a single block. |
|
|
|
* @note we assume enough bits are available, caller should check. |
|
|
|
* |
|
|
|
* @param s WMA Voice decoding context private data |
|
|
|
* @param gb bit I/O context |
|
|
|
@@ -1463,7 +1462,6 @@ static void synth_block(WMAVoiceContext *s, GetBitContext *gb, |
|
|
|
|
|
|
|
/** |
|
|
|
* Synthesize output samples for a single frame. |
|
|
|
* @note we assume enough bits are available, caller should check. |
|
|
|
* |
|
|
|
* @param ctx WMA Voice decoder context |
|
|
|
* @param gb bit I/O context (s->gb or one for cross-packet superframes) |
|
|
|
@@ -1681,83 +1679,6 @@ static void stabilize_lsps(double *lsps, int num) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/** |
|
|
|
* Test whether there are enough bits left to read one complete superframe. |
|
|
|
* |
|
|
|
* The check works on a private copy of the bit reader: it walks the |
|
|
|
* superframe header, the optional superframe-wide residual LSPs and then |
|
|
|
* MAX_FRAMES frames, skipping over each field and comparing the number of |
|
|
|
* bits it needs against get_bits_left() as it goes. |
|
|
|
* |
|
|
|
* @param orig_gb bit I/O context used for reading. This function |
|
|
|
* does not modify the state of the bitreader; it |
|
|
|
* only uses it to copy the current stream position |
|
|
|
* @param s WMA Voice decoding context private data |
|
|
|
* @return 0 if a whole superframe is available, 1 if the data runs out |
|
|
|
* first, AVERROR(ENOSYS) if the first header bit is 0, or |
|
|
|
* AVERROR_INVALIDDATA if the frame type VLC maps to a negative |
|
|
|
* vbm_tree entry. |
|
|
|
*/ |
|
|
|
static int check_bits_for_superframe(GetBitContext *orig_gb, |
|
|
|
WMAVoiceContext *s) |
|
|
|
{ |
|
|
|
GetBitContext s_gb, *gb = &s_gb; /* scratch reader; orig_gb itself is never advanced */ |
|
|
|
int n, need_bits, bd_idx; |
|
|
|
const struct frame_type_desc *frame_desc; |
|
|
|
|
|
|
|
/* initialize a copy */ |
|
|
|
init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits); |
|
|
|
skip_bits_long(gb, get_bits_count(orig_gb)); /* move the copy to orig_gb's current position */ |
|
|
|
av_assert1(get_bits_left(gb) == get_bits_left(orig_gb)); |
|
|
|
|
|
|
|
/* superframe header */ |
|
|
|
if (get_bits_left(gb) < 14) /* 1 + 1 + 12: the most the two flag bits and the sample count below can consume */ |
|
|
|
return 1; |
|
|
|
if (!get_bits1(gb)) |
|
|
|
return AVERROR(ENOSYS); // WMAPro-in-WMAVoice superframe |
|
|
|
if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe |
|
|
|
if (s->has_residual_lsps) { // residual LSPs (for all frames) |
|
|
|
if (get_bits_left(gb) < s->sframe_lsp_bitsize) |
|
|
|
return 1; |
|
|
|
skip_bits_long(gb, s->sframe_lsp_bitsize); |
|
|
|
} |
|
|
|
|
|
|
|
/* frames */ |
|
|
|
for (n = 0; n < MAX_FRAMES; n++) { /* always accounts for MAX_FRAMES frames */ |
|
|
|
int aw_idx_is_ext = 0; /* becomes 1 once the 2 extension bits have already been skipped below */ |
|
|
|
|
|
|
|
if (!s->has_residual_lsps) { // independent LSPs (per-frame) |
|
|
|
if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1; |
|
|
|
skip_bits_long(gb, s->frame_lsp_bitsize); |
|
|
|
} |
|
|
|
bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)]; /* NOTE(review): VLC is read without a prior get_bits_left() check - confirm the reader tolerates this */ |
|
|
|
if (bd_idx < 0) |
|
|
|
return AVERROR_INVALIDDATA; // invalid frame type VLC code |
|
|
|
frame_desc = &frame_descs[bd_idx]; |
|
|
|
if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) { |
|
|
|
if (get_bits_left(gb) < s->pitch_nbits) |
|
|
|
return 1; |
|
|
|
skip_bits_long(gb, s->pitch_nbits); |
|
|
|
} |
|
|
|
if (frame_desc->fcb_type == FCB_TYPE_SILENCE) { |
|
|
|
skip_bits(gb, 8); /* NOTE(review): skipped without a get_bits_left() check, unlike the fields above */ |
|
|
|
} else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) { |
|
|
|
int tmp = get_bits(gb, 6); /* NOTE(review): also read without a get_bits_left() check */ |
|
|
|
if (tmp >= 0x36) { |
|
|
|
skip_bits(gb, 2); |
|
|
|
aw_idx_is_ext = 1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* blocks */ |
|
|
|
if (frame_desc->acb_type == ACB_TYPE_HAMMING) { |
|
|
|
need_bits = s->block_pitch_nbits + /* one block_pitch_nbits term ... */ |
|
|
|
(frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits; /* ... plus block_delta_pitch_nbits for each of the other n_blocks - 1 blocks */ |
|
|
|
} else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) { |
|
|
|
need_bits = 2 * !aw_idx_is_ext; /* 2 bits still pending only if they were not skipped above */ |
|
|
|
} else |
|
|
|
need_bits = 0; |
|
|
|
need_bits += frame_desc->frame_size; /* add frame_size from the frame descriptor */ |
|
|
|
if (get_bits_left(gb) < need_bits) |
|
|
|
return 1; |
|
|
|
skip_bits_long(gb, need_bits); |
|
|
|
} |
|
|
|
|
|
|
|
return 0; /* every frame of the superframe fits in the remaining data */ |
|
|
|
} |
|
|
|
|
|
|
|
/** |
|
|
|
* Synthesize output samples for a single superframe. If we have any data |
|
|
|
* cached in s->sframe_cache, that will be used instead of whatever is loaded |
|
|
|
@@ -1780,7 +1701,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, |
|
|
|
{ |
|
|
|
WMAVoiceContext *s = ctx->priv_data; |
|
|
|
GetBitContext *gb = &s->gb, s_gb; |
|
|
|
int n, res, n_samples = 480; |
|
|
|
int n, res, n_samples = MAX_SFRAMESIZE; |
|
|
|
double lsps[MAX_FRAMES][MAX_LSPS]; |
|
|
|
const double *mean_lsf = s->lsps == 16 ? |
|
|
|
wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode]; |
|
|
|
@@ -1799,12 +1720,6 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, |
|
|
|
s->sframe_cache_size = 0; |
|
|
|
} |
|
|
|
|
|
|
|
if ((res = check_bits_for_superframe(gb, s)) == 1) { |
|
|
|
*got_frame_ptr = 0; |
|
|
|
return 1; |
|
|
|
} else if (res < 0) |
|
|
|
return res; |
|
|
|
|
|
|
|
/* First bit is speech/music bit, it differentiates between WMAVoice |
|
|
|
* speech samples (the actual codec) and WMAVoice music samples, which |
|
|
|
* are really WMAPro-in-WMAVoice-superframes. I've never seen those in |
|
|
|
@@ -1816,13 +1731,14 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, |
|
|
|
|
|
|
|
/* (optional) nr. of samples in superframe; always <= 480 and >= 0 */ |
|
|
|
if (get_bits1(gb)) { |
|
|
|
if ((n_samples = get_bits(gb, 12)) > 480) { |
|
|
|
if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) { |
|
|
|
av_log(ctx, AV_LOG_ERROR, |
|
|
|
"Superframe encodes >480 samples (%d), not allowed\n", |
|
|
|
n_samples); |
|
|
|
"Superframe encodes > %d samples (%d), not allowed\n", |
|
|
|
MAX_SFRAMESIZE, n_samples); |
|
|
|
return AVERROR_INVALIDDATA; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* Parse LSPs, if global for the superframe (can also be per-frame). */ |
|
|
|
if (s->has_residual_lsps) { |
|
|
|
double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2]; |
|
|
|
@@ -1845,7 +1761,7 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, |
|
|
|
} |
|
|
|
|
|
|
|
/* get output buffer */ |
|
|
|
frame->nb_samples = 480; |
|
|
|
frame->nb_samples = MAX_SFRAMESIZE; |
|
|
|
if ((res = ff_get_buffer(ctx, frame, 0)) < 0) |
|
|
|
return res; |
|
|
|
frame->nb_samples = n_samples; |
|
|
|
@@ -1905,26 +1821,23 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, |
|
|
|
* decoder). |
|
|
|
* |
|
|
|
* @param s WMA Voice decoding context private data |
|
|
|
* @return 1 if not enough bits were available, or 0 on success. |
|
|
|
* @return <0 on error, nb_superframes on success. |
|
|
|
*/ |
|
|
|
static int parse_packet_header(WMAVoiceContext *s) |
|
|
|
{ /* NOTE(review): this listing looks like a diff rendering with the +/- markers lost; pre- and post-change lines both appear below (two declarations of res, two final returns) - confirm against the real file */ |
|
|
|
GetBitContext *gb = &s->gb; |
|
|
|
unsigned int res; |
|
|
|
unsigned int res, n_superframes = 0; /* n_superframes sums every 6-bit count field read in the loop */ |
|
|
|
|
|
|
|
if (get_bits_left(gb) < 11) /* bail out with 1 when fewer than 11 bits remain */ |
|
|
|
return 1; |
|
|
|
skip_bits(gb, 4); // packet sequence number |
|
|
|
s->has_residual_lsps = get_bits1(gb); |
|
|
|
do { |
|
|
|
res = get_bits(gb, 6); // number of superframes per packet |
|
|
|
// (minus first one if there is spillover) |
|
|
|
if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize) /* 6 more bits are needed only if another count field follows, plus the spillover field */ |
|
|
|
return 1; |
|
|
|
n_superframes += res; |
|
|
|
} while (res == 0x3F); /* 0x3F means another 6-bit count field follows */ |
|
|
|
s->spillover_nbits = get_bits(gb, s->spillover_bitsize); /* field width is s->spillover_bitsize bits */ |
|
|
|
|
|
|
|
return 0; |
|
|
|
return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA; /* a negative bits-left count is reported as invalid data (presumably an overread - confirm) */ |
|
|
|
} |
|
|
|
|
|
|
|
/** |
|
|
|
@@ -1984,23 +1897,24 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, |
|
|
|
* in a single "muxer" packet, so we artificially emulate that by |
|
|
|
* capping the packet size at ctx->block_align. */ |
|
|
|
for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align); |
|
|
|
if (!size) { |
|
|
|
*got_frame_ptr = 0; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
init_get_bits(&s->gb, avpkt->data, size << 3); |
|
|
|
|
|
|
|
/* size == ctx->block_align is used to indicate whether we are dealing with |
|
|
|
* a new packet or a packet of which we already read the packet header |
|
|
|
* previously. */ |
|
|
|
if (size == ctx->block_align) { // new packet header |
|
|
|
if ((res = parse_packet_header(s)) < 0) |
|
|
|
return res; |
|
|
|
if (!(size % ctx->block_align)) { // new packet header |
|
|
|
if (!size) { |
|
|
|
s->spillover_nbits = 0; |
|
|
|
s->nb_superframes = 0; |
|
|
|
} else { |
|
|
|
if ((res = parse_packet_header(s)) < 0) |
|
|
|
return res; |
|
|
|
s->nb_superframes = res; |
|
|
|
} |
|
|
|
|
|
|
|
/* If the packet header specifies a s->spillover_nbits, then we want |
|
|
|
* to push out all data of the previous packet (+ spillover) before |
|
|
|
* continuing to parse new superframes in the current packet. */ |
|
|
|
if (s->spillover_nbits > 0) { |
|
|
|
if (s->sframe_cache_size > 0) { |
|
|
|
int cnt = get_bits_count(gb); |
|
|
|
copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits); |
|
|
|
@@ -2021,9 +1935,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, |
|
|
|
} else |
|
|
|
skip_bits_long (gb, s->spillover_nbits - cnt + |
|
|
|
get_bits_count(gb)); // resync |
|
|
|
} else |
|
|
|
} else if (s->spillover_nbits) { |
|
|
|
skip_bits_long(gb, s->spillover_nbits); // resync |
|
|
|
} |
|
|
|
} |
|
|
|
} else if (s->skip_bits_next) |
|
|
|
skip_bits(gb, s->skip_bits_next); |
|
|
|
|
|
|
|
@@ -2031,6 +1945,10 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, |
|
|
|
s->sframe_cache_size = 0; |
|
|
|
s->skip_bits_next = 0; |
|
|
|
pos = get_bits_left(gb); |
|
|
|
if (s->nb_superframes-- == 0) { |
|
|
|
*got_frame_ptr = 0; |
|
|
|
return size; |
|
|
|
} else if (s->nb_superframes > 0) { |
|
|
|
if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) { |
|
|
|
return res; |
|
|
|
} else if (*got_frame_ptr) { |
|
|
|
@@ -2044,13 +1962,9 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, |
|
|
|
return AVERROR_INVALIDDATA; |
|
|
|
} |
|
|
|
return res; |
|
|
|
} |
|
|
|
} else if ((s->sframe_cache_size = pos) > 0) { |
|
|
|
/* rewind bit reader to start of last (incomplete) superframe... */ |
|
|
|
init_get_bits(gb, avpkt->data, size << 3); |
|
|
|
skip_bits_long(gb, (size << 3) - pos); |
|
|
|
av_assert1(get_bits_left(gb) == pos); |
|
|
|
|
|
|
|
/* ...and cache it for spillover in next packet */ |
|
|
|
/* ... cache it for spillover in next packet */ |
|
|
|
init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE); |
|
|
|
copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size); |
|
|
|
// FIXME bad - just copy bytes as whole and add use the |
|
|
|
@@ -2084,6 +1998,6 @@ AVCodec ff_wmavoice_decoder = { |
|
|
|
.init_static_data = wmavoice_init_static_data, |
|
|
|
.close = wmavoice_decode_end, |
|
|
|
.decode = wmavoice_decode_packet, |
|
|
|
.capabilities = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1, |
|
|
|
.capabilities = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY, |
|
|
|
.flush = wmavoice_flush, |
|
|
|
}; |