Merge remote-tracking branch 'qatar/master'

* qatar/master: libx264: fix indentation. vorbis: fix overflows in floor1[] vector and inverse db table index. win64: add a XMM clobber test configure option. movdec: Parse the dvc1 atom ARM: ac3: fix ac3_bit_alloc_calc_bap_armv6 swscale: K&R formatting cosmetics for Blackfin code frwu: lowercase the FRWU codec name movdec: fix dts generation in fragmented files fate: make acodec-ac3_fixed test output raw AC3 APIchanges: add missing commit hashes swscale: implement MMX, SSE2 and AVX functions for RGB32 input. ra144enc: drop pointless "encoder" from .long_name bethsoftvideo: fix palette reading. mpc7: use av_fast_padded_malloc() mpc7: simplify handling of packet sizes that are not a multiple of 4 bytes doc: decoding Forward Uncompressed is supported Fix a typo in the x86 asm version of ff_vector_clip_int32() pcmenc: Do not set avpkt->size. ff_alloc_packet: modify the size of the packet to match the requested size Conflicts: doc/APIchanges libavcodec/libx264.c libavcodec/mpc7.c libavformat/isom.h libswscale/Makefile libswscale/bfin/yuv2rgb_bfin.c tests/ref/fate/bethsoft-vid tests/ref/seek/ac3_ac3 Merged-by: Michael Niedermayer <michaelni@gmx.at>
14 years ago · d77294c5e4
--- a/configure
+++ b/configure
@@ -276,6 +276,8 @@ Developer options (useful when working on FFmpeg itself):
                           Cannot be combined with --target-exec
  --samples=PATH           location of test samples for FATE, if not set use
                           \$FATE_SAMPLES at make invocation time.
  --enable-xmm-clobber-test check XMM registers for clobbering (Win64-only;
                           should be used only for debugging purposes)

 NOTE: Object files are built at the place where configure is launched.
 EOF
@@ -1085,6 +1087,7 @@ CONFIG_LIST="
    vda
    vdpau
    version3
    xmm_clobber_test
    x11grab
    zlib
 "
@@ -1779,7 +1782,7 @@ test_deps _muxer _demuxer                                               \
    wav                                                                 \
    yuv4mpegpipe=yuv4mpeg                                               \

 ac3_fixed_test_deps="ac3_fixed_encoder ac3_decoder rm_muxer rm_demuxer"
 ac3_fixed_test_deps="ac3_fixed_encoder ac3_decoder"
 mpg_test_deps="mpeg1system_muxer mpegps_demuxer"

 # default parameters
@@ -3304,6 +3307,17 @@ check_ldflags -Wl,--warn-common
 check_ldflags -Wl,-rpath-link=libpostproc:libswresample:libswscale:libavfilter:libavdevice:libavformat:libavcodec:libavutil
 test_ldflags -Wl,-Bsymbolic && append SHFLAGS -Wl,-Bsymbolic

 enabled xmm_clobber_test &&                             \
    check_ldflags -Wl,--wrap,avcodec_open2              \
                  -Wl,--wrap,avcodec_decode_audio4      \
                  -Wl,--wrap,avcodec_decode_video2      \
                  -Wl,--wrap,avcodec_decode_subtitle2   \
                  -Wl,--wrap,avcodec_encode_audio2      \
                  -Wl,--wrap,avcodec_encode_video       \
                  -Wl,--wrap,avcodec_encode_subtitle    \
                  -Wl,--wrap,sws_scale ||               \
    disable xmm_clobber_test

 echo "X{};" > $TMPV
 if test_ldflags -Wl,--version-script,$TMPV; then
    append SHFLAGS '-Wl,--version-script,\$(SUBDIR)lib\$(NAME).ver'
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -19,18 +19,18 @@ API changes, most recent first:
 2012-01-24 - xxxxxxx - lavfi 2.60.100
  Add avfilter_graph_dump.

 2012-02-01 - xxxxxxx - lavc 54.01.0
 2012-02-01 - 316fc74 - lavc 54.01.0
  Add av_fast_padded_malloc() as alternative for av_realloc() when aligned
  memory is required. The buffer will always have FF_INPUT_BUFFER_PADDING_SIZE
  zero-padded bytes at the end.

 2012-01-31 - xxxxxxx - lavf 54.01.0
 2012-01-31 - dd6d3b0 - lavf 54.01.0
  Add avformat_get_riff_video_tags() and avformat_get_riff_audio_tags().

 2012-01-31 - xxxxxxx - lavc 54.01.0
 2012-01-31 - af08d9a - lavc 54.01.0
  Add avcodec_is_open() function.

 2012-01-30 - xxxxxxx - lavu 51.22.0 - intfloat.h
 2012-01-30 - 8b93312 - lavu 51.22.0 - intfloat.h
  Add a new installed header libavutil/intfloat.h with int/float punning
  functions.

--- a/doc/general.texi
+++ b/doc/general.texi
@@ -497,6 +497,7 @@ following image formats are supported:
@item Flash Screen Video v2  @tab  X  @tab  X
@item Flash Video (FLV)      @tab  X  @tab  X
    @tab Sorenson H.263 used in Flash
@item Forward Uncompressed   @tab     @tab  X
@item Fraps                  @tab     @tab  X
@item H.261                  @tab  X  @tab  X
@item H.263 / H.263-1996     @tab  X  @tab  X
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -34,24 +34,23 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
        add             r0,  r0,  r4,  lsl #1           @ mask + band
        add             r4,  lr,  r4
        add             r7,  r7,  r2                    @ bap + start
        ldrb            r10, [r4], #1
 1:
        ldrsh           r9,  [r0], #2                   @ mask[band]
        mov             r8,  #0xff0
        sub             r9,  r9,  r12                   @   - snr_offset
        mov             r11, r10
        ldrb            r10, [r4], #1                   @ band_start_tab[band++]
        ldrb            r10, [r4, #1]!                  @ band_start_tab[++band]
        subs            r9,  r9,  r5                    @   - floor
        it              lt
        movlt           r9,  #0
        cmp             r10, r3                         @   - end
        and             r9,  r9,  r8, lsl #1            @   & 0x1fe0
        ite             gt
        subgt           r8,  r3,  r11
        suble           r8,  r10, r11
        subgt           r8,  r3,  r2
        suble           r8,  r10, r2
        mov             r2,  r10
        add             r9,  r9,  r5                    @   + floor => m
        tst             r8,  #1
        add             r2,  r7,  r8
        add             r11, r7,  r8
        bne             3f
        b               5f
 2:
@@ -65,9 +64,9 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
        ldrb            lr,  [r6, lr]
        strb            r8,  [r7], #1                   @ bap[bin]
        strb            lr,  [r7], #1
 5:      cmp             r7,  r2
 5:      cmp             r7,  r11
        blo             2b
        cmp             r3,  r11
        cmp             r3,  r10
        bgt             1b
        pop             {r4-r11,pc}
 3:
--- a/libavcodec/bethsoftvideo.c
+++ b/libavcodec/bethsoftvideo.c
@@ -61,7 +61,7 @@ static int set_palette(BethsoftvidContext *ctx)
        palette[a] |= palette[a] >> 6 & 0x30303;
    }
    ctx->frame.palette_has_changed = 1;
    return 256*3;
    return 0;
 }

 static int bethsoftvid_decode_frame(AVCodecContext *avctx,
@@ -88,7 +88,13 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx,

    switch(block_type = bytestream2_get_byte(&vid->g)){
        case PALETTE_BLOCK: {
            return set_palette(vid);
            int ret;
            *data_size = 0;
            if ((ret = set_palette(vid)) < 0) {
                av_log(avctx, AV_LOG_ERROR, "error reading palette\n");
                return ret;
            }
            return bytestream2_tell(&vid->g);
        }
        case VIDEO_YOFF_P_FRAME:
            yoffset = bytestream2_get_le16(&vid->g);
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -130,6 +130,7 @@ int avpriv_unlock_avformat(void);
 *                If avpkt->data is already set, avpkt->size is checked
 *                to ensure it is large enough.
 *                If avpkt->data is NULL, a new buffer is allocated.
 *                avpkt->size is set to the specified size.
 *                All other AVPacket fields will be reset with av_init_packet().
 * @param size    the minimum required packet size
 * @return        0 on success, negative error code on failure
--- a/libavcodec/libx264.c
+++ b/libavcodec/libx264.c
@@ -188,12 +188,12 @@ static int X264_frame(AVCodecContext *ctx, uint8_t *buf,

    do {
        bufsize = orig_bufsize;
    if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0)
        return -1;
        if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0)
            return -1;

    bufsize = encode_nals(ctx, buf, bufsize, nal, nnal, 0);
    if (bufsize < 0)
        return -1;
        bufsize = encode_nals(ctx, buf, bufsize, nal, nnal, 0);
        if (bufsize < 0)
            return -1;
    } while (!bufsize && !frame && x264_encoder_delayed_frames(x4->enc));

    /* FIXME: libx264 now provides DTS, but AVFrame doesn't have a field for it. */
--- a/libavcodec/mpc.h
+++ b/libavcodec/mpc.h
@@ -66,8 +66,6 @@ typedef struct {
    int buf_size;
    AVLFG rnd;
    int frames_to_skip;
    uint8_t *buffer;
    int buffer_size;
    /* for synthesis */
    DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512*2];
    int synth_buf_offset[MPA_MAX_CHANNELS];
--- a/libavcodec/mpc7.c
+++ b/libavcodec/mpc7.c
@@ -200,34 +200,46 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
 {
    const uint8_t *buf = avpkt->data;
    int buf_size = avpkt->size;
    int buf_size;
    MPCContext *c = avctx->priv_data;
    GetBitContext gb;
    int i, ch;
    int mb = -1;
    Band *bands = c->bands;
    int off, ret;
    int off, ret, last_frame, skip;
    int bits_used, bits_avail;

    memset(bands, 0, sizeof(*bands) * (c->maxbands + 1));
    if(buf_size <= 4){
        av_log(avctx, AV_LOG_ERROR, "Too small buffer passed (%i bytes)\n", buf_size);
        return AVERROR(EINVAL);

    buf_size = avpkt->size & ~3;
    if (buf_size <= 0) {
        av_log(avctx, AV_LOG_ERROR, "packet size is too small (%i bytes)\n",
               avpkt->size);
        return AVERROR_INVALIDDATA;
    }
    if (buf_size != avpkt->size) {
        av_log(avctx, AV_LOG_WARNING, "packet size is not a multiple of 4. "
               "extra bytes at the end will be skipped.\n");
    }

    skip       = buf[0];
    last_frame = buf[1];
    buf       += 4;
    buf_size  -= 4;

    /* get output buffer */
    c->frame.nb_samples = buf[1] ? c->lastframelen : MPC_FRAME_SIZE;
    c->frame.nb_samples = last_frame ? c->lastframelen : MPC_FRAME_SIZE;
    if ((ret = avctx->get_buffer(avctx, &c->frame)) < 0) {
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
        return ret;
    }

    av_fast_padded_malloc(&c->buffer, &c->buffer_size, FFALIGN(buf_size - 1, 4));
    if (!c->buffer)
    av_fast_padded_malloc(&c->bits, &c->buf_size, buf_size);
    if (!c->bits)
        return AVERROR(ENOMEM);
    c->dsp.bswap_buf((uint32_t*)c->buffer, (const uint32_t*)(buf + 4), (buf_size - 4) >> 2);
    init_get_bits(&gb, c->buffer, (buf_size - 4)* 8);
    skip_bits_long(&gb, buf[0]);
    c->dsp.bswap_buf((uint32_t *)c->bits, (const uint32_t *)buf, buf_size >> 2);
    init_get_bits(&gb, c->bits, buf_size * 8);
    skip_bits_long(&gb, skip);

    /* read subband indexes */
    for(i = 0; i <= c->maxbands; i++){
@@ -284,21 +296,21 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
    ff_mpc_dequantize_and_synth(c, mb, c->frame.data[0], 2);

    bits_used = get_bits_count(&gb);
    bits_avail = (buf_size - 4) * 8;
    if(!buf[1] && ((bits_avail < bits_used) || (bits_used + 32 <= bits_avail))){
    bits_avail = buf_size * 8;
    if (!last_frame && ((bits_avail < bits_used) || (bits_used + 32 <= bits_avail))) {
        av_log(NULL,0, "Error decoding frame: used %i of %i bits\n", bits_used, bits_avail);
        return -1;
    }
    if(c->frames_to_skip){
        c->frames_to_skip--;
        *got_frame_ptr = 0;
        return buf_size;
        return avpkt->size;
    }

    *got_frame_ptr   = 1;
    *(AVFrame *)data = c->frame;

    return buf_size;
    return avpkt->size;
 }

 static void mpc7_decode_flush(AVCodecContext *avctx)
@@ -312,8 +324,8 @@ static void mpc7_decode_flush(AVCodecContext *avctx)
 static av_cold int mpc7_decode_close(AVCodecContext *avctx)
 {
    MPCContext *c = avctx->priv_data;
    av_freep(&c->buffer);
    c->buffer_size = 0;
    av_freep(&c->bits);
    c->buf_size = 0;
    return 0;
 }

--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -194,7 +194,6 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
        return -1;
    }

    avpkt->size = frame->nb_samples * avctx->channels * sample_size;
    *got_packet_ptr = 1;
    return 0;
 }
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -521,5 +521,5 @@ AVCodec ff_ra_144_encoder = {
    .close          = ra144_encode_close,
    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                     AV_SAMPLE_FMT_NONE },
    .long_name      = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K) encoder"),
    .long_name      = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"),
 };
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -919,16 +919,14 @@ int ff_alloc_packet(AVPacket *avpkt, int size)

    if (avpkt->data) {
        uint8_t *pkt_data;
        int pkt_size;

        if (avpkt->size < size)
            return AVERROR(EINVAL);

        pkt_data = avpkt->data;
        pkt_size = avpkt->size;
        av_init_packet(avpkt);
        avpkt->data = pkt_data;
        avpkt->size = pkt_size;
        avpkt->size = size;
        return 0;
    } else {
        return av_new_packet(avpkt, size);
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -156,7 +156,7 @@ void ff_vorbis_ready_floor1_list(vorbis_floor1_entry * list, int values)
    }
 }

 static inline void render_line_unrolled(intptr_t x, uint8_t y, int x1,
 static inline void render_line_unrolled(intptr_t x, int y, int x1,
                                        intptr_t sy, int ady, int adx,
                                        float *buf)
 {
@@ -168,30 +168,30 @@ static inline void render_line_unrolled(intptr_t x, uint8_t y, int x1,
        if (err >= 0) {
            err += ady - adx;
            y   += sy;
            buf[x++] = ff_vorbis_floor1_inverse_db_table[y];
            buf[x++] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)];
        }
        buf[x] = ff_vorbis_floor1_inverse_db_table[y];
        buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)];
    }
    if (x <= 0) {
        if (err + ady >= 0)
            y += sy;
        buf[x] = ff_vorbis_floor1_inverse_db_table[y];
        buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)];
    }
 }

 static void render_line(int x0, uint8_t y0, int x1, int y1, float *buf)
 static void render_line(int x0, int y0, int x1, int y1, float *buf)
 {
    int dy  = y1 - y0;
    int adx = x1 - x0;
    int ady = FFABS(dy);
    int sy  = dy < 0 ? -1 : 1;
    buf[x0] = ff_vorbis_floor1_inverse_db_table[y0];
    buf[x0] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y0)];
    if (ady*2 <= adx) { // optimized common case
        render_line_unrolled(x0, y0, x1, sy, ady, adx, buf);
    } else {
        int base  = dy / adx;
        int x     = x0;
        uint8_t y = y0;
        int y     = y0;
        int err   = -adx;
        ady -= FFABS(base) * adx;
        while (++x < x1) {
@@ -201,7 +201,7 @@ static void render_line(int x0, uint8_t y0, int x1, int y1, float *buf)
                err -= adx;
                y   += sy;
            }
            buf[x] = ff_vorbis_floor1_inverse_db_table[y];
            buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)];
        }
    }
 }
@@ -210,8 +210,7 @@ void ff_vorbis_floor1_render_list(vorbis_floor1_entry * list, int values,
                                  uint16_t *y_list, int *flag,
                                  int multiplier, float *out, int samples)
 {
    int lx, i;
    uint8_t ly;
    int lx, ly, i;
    lx = 0;
    ly = y_list[0] * multiplier;
    for (i = 1; i < values; i++) {
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@@ -1256,20 +1256,20 @@ static int vorbis_floor1_decode(vorbis_context *vc,
            floor1_flag[i]               = 1;
            if (val >= room) {
                if (highroom > lowroom) {
                    floor1_Y_final[i] = val - lowroom + predicted;
                    floor1_Y_final[i] = av_clip_uint16(val - lowroom + predicted);
                } else {
                    floor1_Y_final[i] = predicted - val + highroom - 1;
                    floor1_Y_final[i] = av_clip_uint16(predicted - val + highroom - 1);
                }
            } else {
                if (val & 1) {
                    floor1_Y_final[i] = predicted - (val + 1) / 2;
                    floor1_Y_final[i] = av_clip_uint16(predicted - (val + 1) / 2);
                } else {
                    floor1_Y_final[i] = predicted + val / 2;
                    floor1_Y_final[i] = av_clip_uint16(predicted + val / 2);
                }
            }
        } else {
            floor1_flag[i]    = 0;
            floor1_Y_final[i] = predicted;
            floor1_Y_final[i] = av_clip_uint16(predicted);
        }

        av_dlog(NULL, " Decoded floor(%d) = %u / val %u\n",
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -83,3 +83,4 @@ OBJS-$(HAVE_MMX)                       += x86/dsputil_mmx.o             \
                                          x86/mpegvideo_mmx.o           \
                                          x86/simple_idct_mmx.o         \

 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1063,7 +1063,7 @@ emu_edge mmx
 ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
 ; %5 = suffix
 %macro VECTOR_CLIP_INT32 4-5
 cglobal vector_clip_int32%5, 5,5,%2, dst, src, min, max, len
 cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
 %if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -0,0 +1,80 @@
 /*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavcodec/avcodec.h"
 #include "libavutil/x86/w64xmmtest.h"

 /* Link-time wrapper around avcodec_open2(): calls the real function and
 * aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_open2(AVCodecContext *avctx,
                   AVCodec *codec,
                   AVDictionary **options))
 {
    testxmmclobbers(avcodec_open2, avctx, codec, options);
 }

 /* Link-time wrapper around avcodec_decode_audio4(): calls the real function
 * and aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_decode_audio4(AVCodecContext *avctx,
                           AVFrame *frame,
                           int *got_frame_ptr,
                           AVPacket *avpkt))
 {
    testxmmclobbers(avcodec_decode_audio4, avctx, frame,
                    got_frame_ptr, avpkt);
 }

 /* Link-time wrapper around avcodec_decode_video2(): calls the real function
 * and aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_decode_video2(AVCodecContext *avctx,
                           AVFrame *picture,
                           int *got_picture_ptr,
                           AVPacket *avpkt))
 {
    testxmmclobbers(avcodec_decode_video2, avctx, picture,
                    got_picture_ptr, avpkt);
 }

 /* Link-time wrapper around avcodec_decode_subtitle2(): calls the real
 * function and aborts if any of xmm6-xmm15 differs afterwards
 * (see testxmmclobbers). */
 wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
                              AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt))
 {
    testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
                    got_sub_ptr, avpkt);
 }

 /* Link-time wrapper around avcodec_encode_audio2(): calls the real function
 * and aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                           AVPacket *avpkt,
                           const AVFrame *frame,
                           int *got_packet_ptr))
 {
    testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
                    got_packet_ptr);
 }

 /* Link-time wrapper around avcodec_encode_video(): calls the real function
 * and aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_encode_video(AVCodecContext *avctx,
                          uint8_t *buf, int buf_size,
                          const AVFrame *pict))
 {
    testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
 }

 /* Link-time wrapper around avcodec_encode_subtitle(): calls the real function
 * and aborts if any of xmm6-xmm15 differs afterwards (see testxmmclobbers). */
 wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size,
                             const AVSubtitle *sub))
 {
    testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
 }
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@@ -129,6 +129,7 @@ typedef struct MOVStreamContext {
    int has_palette;
    int64_t data_size;
    uint32_t tmcd_flags;  ///< tmcd track flags
    int64_t track_end;    ///< used for dts generation in fragmented movie files
 } MOVStreamContext;

 typedef struct MOVContext {
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -1012,6 +1012,32 @@ static int mov_read_glbl(MOVContext *c, AVIOContext *pb, MOVAtom atom)
    return 0;
 }

 /**
 * Read a 'dvc1' atom and keep its trailing bytes as the extradata of the
 * most recently added stream.
 *
 * The first payload byte is a profile/level byte. The atom is only used when
 * the high nibble of that byte is 0xc (presumably VC-1 Advanced Profile --
 * confirm against the VC-1-in-MP4 specification); otherwise it is ignored.
 * When used, the next 6 bytes are skipped and the remaining atom.size - 7
 * bytes replace the stream's extradata.
 *
 * @param c    demuxer context; the last stream of c->fc is the target
 * @param pb   I/O context the atom payload is read from
 * @param atom atom descriptor; only atom.size is used
 * @return 0 on success or when the atom is ignored, AVERROR_INVALIDDATA if
 *         atom.size is outside [7, 1<<28), AVERROR(ENOMEM) if the extradata
 *         buffer cannot be allocated
 */
 static int mov_read_dvc1(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 {
    AVStream *st;
    uint8_t profile_level;
    int extradata_size;

    if (c->fc->nb_streams < 1)
        return 0;
    st = c->fc->streams[c->fc->nb_streams-1];

    if (atom.size >= (1<<28) || atom.size < 7)
        return AVERROR_INVALIDDATA;

    profile_level = avio_r8(pb);
    /* '!=' binds tighter than '&': without the parentheses this evaluated
     * profile_level & (0xf0 != 0xc0), i.e. it only tested bit 0. */
    if ((profile_level & 0xf0) != 0xc0)
        return 0;

    /* The size check above guarantees 0 <= atom.size - 7 < 1<<28. */
    extradata_size = atom.size - 7;

    av_free(st->codec->extradata);
    /* Reset the size first so that a failed allocation does not leave a
     * NULL extradata pointer paired with the previous, stale size. */
    st->codec->extradata_size = 0;
    st->codec->extradata = av_mallocz(extradata_size + FF_INPUT_BUFFER_PADDING_SIZE);
    if (!st->codec->extradata)
        return AVERROR(ENOMEM);
    st->codec->extradata_size = extradata_size;

    /* Skip the 6 bytes that follow the profile/level byte. */
    avio_seek(pb, 6, SEEK_CUR);
    /* NOTE(review): the avio_read() result is not checked; the buffer is
     * zero-initialised, so a short read leaves trailing zero bytes. */
    avio_read(pb, st->codec->extradata, st->codec->extradata_size);
    return 0;
 }

 /**
 * An strf atom is a BITMAPINFOHEADER struct. This struct is 40 bytes itself,
 * but can have extradata appended at the end after the 40 bytes belonging
@@ -1706,6 +1732,7 @@ static int mov_read_stts(MOVContext *c, AVIOContext *pb, MOVAtom atom)
    st->nb_frames= total_sample_count;
    if (duration)
        st->duration= duration;
    sc->track_end = duration;
    return 0;
 }

@@ -2326,7 +2353,7 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom)

    if (flags & 0x001) data_offset        = avio_rb32(pb);
    if (flags & 0x004) first_sample_flags = avio_rb32(pb);
    dts = st->duration - sc->time_offset;
    dts    = sc->track_end - sc->time_offset;
    offset = frag->base_data_offset + data_offset;
    distance = 0;
    av_dlog(c->fc, "first sample flags 0x%x\n", first_sample_flags);
@@ -2356,7 +2383,7 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom)
        sc->data_size += sample_size;
    }
    frag->moof_offset = offset;
    st->duration = dts + sc->time_offset;
    st->duration = sc->track_end = dts + sc->time_offset;
    return 0;
 }

@@ -2538,6 +2565,7 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
 { MKTAG('w','f','e','x'), mov_read_wfex },
 { MKTAG('c','m','o','v'), mov_read_cmov },
 { MKTAG('c','h','a','n'), mov_read_chan }, /* channel layout */
 { MKTAG('d','v','c','1'), mov_read_dvc1 },
 { 0, NULL }
 };

--- a/libavutil/x86/w64xmmtest.h
+++ b/libavutil/x86/w64xmmtest.h
@@ -0,0 +1,71 @@
 /*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>
 #include <stdlib.h>
 #include <stdarg.h>

 #include "libavutil/bswap.h"

 /*
 * Store the current contents of xmm6..xmm15 to memory starting at mem.
 *
 * Each register is written with an unaligned 16-byte store (movups) at
 * consecutive 0x10 offsets, so mem must point to at least 0xa0 (160)
 * writable bytes; no particular alignment is required. These are the XMM
 * registers a callee must preserve under the Microsoft x64 calling
 * convention. The "memory" clobber tells the compiler that memory is
 * modified by the asm statement.
 */
 #define storexmmregs(mem)               \
    __asm__ volatile(                   \
        "movups %%xmm6 , 0x00(%0)\n\t"  \
        "movups %%xmm7 , 0x10(%0)\n\t"  \
        "movups %%xmm8 , 0x20(%0)\n\t"  \
        "movups %%xmm9 , 0x30(%0)\n\t"  \
        "movups %%xmm10, 0x40(%0)\n\t"  \
        "movups %%xmm11, 0x50(%0)\n\t"  \
        "movups %%xmm12, 0x60(%0)\n\t"  \
        "movups %%xmm13, 0x70(%0)\n\t"  \
        "movups %%xmm14, 0x80(%0)\n\t"  \
        "movups %%xmm15, 0x90(%0)\n\t"  \
        :: "r"(mem) : "memory")

 /*
 * Body of a __wrap_<func> function: call __real_<func> and verify that it
 * left xmm6..xmm15 untouched.
 *
 * func is the unprefixed function name, ctx its first argument (also used as
 * the av_log() context) and the variadic part the remaining arguments.
 *
 * The macro snapshots the registers before and after the call into
 * xmm[0] and xmm[1] (10 registers x 2 64-bit halves each). If the snapshots
 * differ it logs, for every changed register, the value before and the value
 * after the call (each 64-bit half passed through av_bswap64()), and then
 * calls abort(). Otherwise it returns the wrapped function's int result.
 *
 * Usage constraints that follow from the expansion: it declares the locals
 * "xmm" and "ret", so it must be the first and only use in a function body,
 * and it ends in "return ret" without a semicolon, so the caller supplies it.
 *
 * NOTE(review): memcmp() and PRIx64 are used here, but <string.h> and
 * <inttypes.h> are not among the includes visible in this header; presumably
 * they are provided by the including file -- confirm.
 */
 #define testxmmclobbers(func, ctx, ...)                         \
    uint64_t xmm[2][10][2];                                     \
    int ret;                                                    \
    storexmmregs(xmm[0]);                                       \
    ret = __real_ ## func(ctx, __VA_ARGS__);                    \
    storexmmregs(xmm[1]);                                       \
    if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) {               \
        int i;                                                  \
        av_log(ctx, AV_LOG_ERROR,                               \
               "XMM REGS CLOBBERED IN %s!\n", #func);           \
        for (i = 0; i < 10; i ++)                               \
            if (xmm[0][i][0] != xmm[1][i][0] ||                 \
                xmm[0][i][1] != xmm[1][i][1]) {                 \
                av_log(ctx, AV_LOG_ERROR,                       \
                       "xmm%-2d = %016"PRIx64"%016"PRIx64"\n",  \
                       6 + i, av_bswap64(xmm[0][i][0]),         \
                       av_bswap64(xmm[0][i][1]));               \
                av_log(ctx, AV_LOG_ERROR,                       \
                         "     -> %016"PRIx64"%016"PRIx64"\n",  \
                       av_bswap64(xmm[1][i][0]),                \
                       av_bswap64(xmm[1][i][1]));               \
            }                                                   \
        abort();                                                \
    }                                                           \
    return ret

 /*
 * Open the definition of a wrapper for func, where func is written as a
 * full declarator, e.g. wrap(name(type arg, ...)).
 *
 * Expands to int-returning prototypes for __real_<func> and __wrap_<func>,
 * followed by the head of the __wrap_<func> definition; the caller supplies
 * the braces and body. The __real_/__wrap_ names match the symbols used by
 * the linker's --wrap option, which configure passes for each wrapped
 * function when xmm_clobber_test is enabled.
 */
 #define wrap(func)      \
 int __real_ ## func;    \
 int __wrap_ ## func;    \
 int __wrap_ ## func
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -25,6 +25,8 @@ MMX-OBJS-$(HAVE_YASM)      +=  x86/input.o              \

 $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)

 OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o

 TESTPROGS = colorspace swscale

 DIRS = bfin mlib ppc sparc x86
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -30,11 +30,11 @@ and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.

 The following calculation is used for the conversion:

  r = clipz((y-oy)*cy  + crv*(v-128))
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
  b = clipz((y-oy)*cy  + cbu*(u-128))
  r = clipz((y - oy) * cy  + crv * (v - 128))
  g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
  b = clipz((y - oy) * cy  + cbu * (u - 128))

 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
 y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.


 New factorization to eliminate the truncation error which was
@@ -47,7 +47,7 @@ occurring due to the byteop3p.
 2) Scale operands up by a factor of 4 not 8 because Blackfin
   multiplies include a shift.

 3) Compute into the accumulators cy*yx0, cy*yx1.
 3) Compute into the accumulators cy * yx0, cy * yx1.

 4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))
@@ -73,7 +73,7 @@ occurring due to the byteop3p.

 Where coeffs have the following layout in memory.

 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
 uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv;

 coeffs is a pointer to oy.

--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -27,32 +27,34 @@
 #include <assert.h>
 #include "config.h"
 #include <unistd.h>

 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"

 #if defined (__FDPIC__) && CONFIG_SRAM
 #define L1CODE __attribute__ ((l1_text))
 #define L1CODE __attribute__((l1_text))
 #else
 #define L1CODE
 #endif

 int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                       int width, int height,
 int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                       uint8_t *vdst, int width, int height,
                       int lumStride, int chromStride, int srcStride) L1CODE;

 int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                       int width, int height,
 int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                       uint8_t *vdst, int width, int height,
                       int lumStride, int chromStride, int srcStride) L1CODE;

 static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[])
 static int uyvytoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
                               int srcSliceY, int srcSliceH, uint8_t *dst[],
                               int dstStride[])
 {
    uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
    uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
    uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
    uint8_t *ip   = src[0] + srcStride[0]*srcSliceY;
    int w         = dstStride[0];
    uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
    uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
    uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
    uint8_t *ip   = src[0] + srcStride[0] * srcSliceY;
    int w = dstStride[0];

    ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
                       dstStride[0], dstStride[1], srcStride[0]);
@@ -60,14 +62,15 @@ static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
    return srcSliceH;
 }

 static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[])
 static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[],
                               int srcSliceY, int srcSliceH, uint8_t *dst[],
                               int dstStride[])
 {
    uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
    uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
    uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
    uint8_t *ip   = src[0] + srcStride[0]*srcSliceY;
    int w         = dstStride[0];
    uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
    uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
    uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
    uint8_t *ip   = src[0] + srcStride[0] * srcSliceY;
    int w = dstStride[0];

    ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
                       dstStride[0], dstStride[1], srcStride[0]);
@@ -75,15 +78,16 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
    return srcSliceH;
 }


 void ff_bfin_get_unscaled_swscale(SwsContext *c)
 {
    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
        av_log(NULL, AV_LOG_VERBOSE,
               "selecting Blackfin optimized uyvytoyv12_unscaled\n");
        c->swScale = uyvytoyv12_unscaled;
    }
    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
        av_log(NULL, AV_LOG_VERBOSE,
               "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
        c->swScale = yuyvtoyv12_unscaled;
    }
 }
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -26,15 +26,16 @@
 #include <string.h>
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
 #include <unistd.h>
 #include "libavutil/pixdesc.h"

 #include "config.h"
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"

 #if defined(__FDPIC__) && CONFIG_SRAM
 #define L1CODE __attribute__ ((l1_text))
 #define L1CODE __attribute__((l1_text))
 #else
 #define L1CODE
 #endif
@@ -48,21 +49,20 @@ void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
 void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                            int w, uint32_t *coeffs) L1CODE;

 typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                            int w, uint32_t *coeffs);

 typedef void (*ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                           int w, uint32_t *coeffs);

 static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
 {
    int oy;
    oy      = c->yOffset&0xffff;
    oy      = oy >> 3; // keep everything U8.0 for offset calculation
    oy = c->yOffset & 0xffff;
    oy = oy >> 3;      // keep everything U8.0 for offset calculation

    c->oc   = 128*0x01010101U;
    c->oy   =  oy*0x01010101U;
    c->oc = 128 * 0x01010101U;
    c->oy = oy * 0x01010101U;

    /* copy 64bit vector coeffs down to 32bit vector coeffs */
    c->cy  = c->yCoeff;
    c->cy   = c->yCoeff;
    c->zero = 0;

    if (rgb) {
@@ -77,7 +77,6 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
        c->cgv = c->ugCoeff;
    }


    if (masks == 555) {
        c->rmask = 0x001f * 0x00010001U;
        c->gmask = 0x03e0 * 0x00010001U;
@@ -89,27 +88,25 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
    }
 }

 static int core_yuv420_rgb(SwsContext *c,
                           uint8_t **in, int *instrides,
                           int srcSliceY, int srcSliceH,
                           uint8_t **oplanes, int *outstrides,
                           ltransform lcscf, int rgb, int masks)
 static int core_yuv420_rgb(SwsContext *c, uint8_t **in, int *instrides,
                           int srcSliceY, int srcSliceH, uint8_t **oplanes,
                           int *outstrides, ltransform lcscf,
                           int rgb, int masks)
 {
    uint8_t *py,*pu,*pv,*op;
    uint8_t *py, *pu, *pv, *op;
    int w  = instrides[0];
    int h2 = srcSliceH>>1;
    int h2 = srcSliceH >> 1;
    int i;

    bfin_prepare_coefficients(c, rgb, masks);

    py = in[0];
    pu = in[1+(1^rgb)];
    pv = in[1+(0^rgb)];

    op = oplanes[0] + srcSliceY*outstrides[0];
    pu = in[1 + (1 ^ rgb)];
    pv = in[1 + (0 ^ rgb)];

    for (i=0;i<h2;i++) {
    op = oplanes[0] + srcSliceY * outstrides[0];

    for (i = 0; i < h2; i++) {
        lcscf(py, pu, pv, op, w, &c->oy);

        py += instrides[0];
@@ -126,9 +123,7 @@ static int core_yuv420_rgb(SwsContext *c,
    return srcSliceH;
 }


 static int bfin_yuv420_rgb555(SwsContext *c,
                              uint8_t **in, int *instrides,
 static int bfin_yuv420_rgb555(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -136,8 +131,7 @@ static int bfin_yuv420_rgb555(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
 }

 static int bfin_yuv420_bgr555(SwsContext *c,
                              uint8_t **in, int *instrides,
 static int bfin_yuv420_bgr555(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -145,8 +139,7 @@ static int bfin_yuv420_bgr555(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
 }

 static int bfin_yuv420_rgb24(SwsContext *c,
                             uint8_t **in, int *instrides,
 static int bfin_yuv420_rgb24(SwsContext *c, uint8_t **in, int *instrides,
                             int srcSliceY, int srcSliceH,
                             uint8_t **oplanes, int *outstrides)
 {
@@ -154,8 +147,7 @@ static int bfin_yuv420_rgb24(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
 }

 static int bfin_yuv420_bgr24(SwsContext *c,
                             uint8_t **in, int *instrides,
 static int bfin_yuv420_bgr24(SwsContext *c, uint8_t **in, int *instrides,
                             int srcSliceY, int srcSliceH,
                             uint8_t **oplanes, int *outstrides)
 {
@@ -163,8 +155,7 @@ static int bfin_yuv420_bgr24(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
 }

 static int bfin_yuv420_rgb565(SwsContext *c,
                              uint8_t **in, int *instrides,
 static int bfin_yuv420_rgb565(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -172,8 +163,7 @@ static int bfin_yuv420_rgb565(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
 }

 static int bfin_yuv420_bgr565(SwsContext *c,
                              uint8_t **in, int *instrides,
 static int bfin_yuv420_bgr565(SwsContext *c, uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
@@ -181,18 +171,29 @@ static int bfin_yuv420_bgr565(SwsContext *c,
                           outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
 }


 SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
 {
    SwsFunc f;

    switch(c->dstFormat) {
    case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
    case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
    case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
    case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
    case PIX_FMT_RGB24:  f = bfin_yuv420_rgb24;  break;
    case PIX_FMT_BGR24:  f = bfin_yuv420_bgr24;  break;
    switch (c->dstFormat) {
    case PIX_FMT_RGB555:
        f = bfin_yuv420_rgb555;
        break;
    case PIX_FMT_BGR555:
        f = bfin_yuv420_bgr555;
        break;
    case PIX_FMT_RGB565:
        f = bfin_yuv420_rgb565;
        break;
    case PIX_FMT_BGR565:
        f = bfin_yuv420_bgr565;
        break;
    case PIX_FMT_RGB24:
        f = bfin_yuv420_rgb24;
        break;
    case PIX_FMT_BGR24:
        f = bfin_yuv420_bgr24;
        break;
    default:
        return 0;
    }
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
 rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
 rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV

 ; Word-pair coefficient tables for the packed 32-bit RGB input functions.
 ; The two-letter suffix names the channels a pair weights, in order:
 ; _rb/_br carry the red and blue weights, _ga/_ag carry the green weight
 ; next to a zero in the alpha position. RGB32_TO_Y_FN/RGB32_TO_UV_FN pick one
 ; table from the 1st+3rd and one from the 2nd+4th channel letter of a layout.
 rgba_Ycoeff_rb:  times 4 dw RY, BY
 rgba_Ycoeff_br:  times 4 dw BY, RY
 rgba_Ycoeff_ga:  times 4 dw GY, 0
 rgba_Ycoeff_ag:  times 4 dw 0,  GY
 rgba_Ucoeff_rb:  times 4 dw RU, BU
 rgba_Ucoeff_br:  times 4 dw BU, RU
 rgba_Ucoeff_ga:  times 4 dw GU, 0
 rgba_Ucoeff_ag:  times 4 dw 0,  GU
 rgba_Vcoeff_rb:  times 4 dw RV, BV
 rgba_Vcoeff_br:  times 4 dw BV, RV
 rgba_Vcoeff_ga:  times 4 dw GV, 0
 rgba_Vcoeff_ag:  times 4 dw 0,  GV

 shuf_rgb_12x4:   db 0, 0x80, 1, 0x80,  2, 0x80,  3, 0x80, \
                    6, 0x80, 7, 0x80,  8, 0x80,  9, 0x80
 shuf_rgb_3x56:   db 2, 0x80, 3, 0x80,  4, 0x80,  5, 0x80, \
@@ -294,6 +307,150 @@ RGB24_FUNCS 11, 13
 INIT_XMM avx
 RGB24_FUNCS 11, 13

 ; Emit the <layout>ToY input function for one packed 32-bit RGB layout.
 ;
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 ; %6 = (optional) name of the layout whose .body is reused; when given, only
 ;      the two coefficient loads and a jump to that body are emitted
 ;
 ; Named arguments: dst, src, u1, u2, w, u3 (u1-u3 are not referenced here).
 ; Register roles inside .body:
 ;   m4 = rgb_Yrnd
 ;   m5 = coefficient pair for the 1st+3rd channel of the layout
 ;   m6 = coefficient pair for the 2nd+4th channel of the layout
 ;   m7 = 0x00ff word mask handed to DEINTB
 ;   wq = negative byte offset into dst, counting up to zero
 %macro RGB32_TO_Y_FN 5-6
 cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
    mova           m5, [rgba_Ycoeff_%2%4]
    mova           m6, [rgba_Ycoeff_%3%5]
 %if %0 == 6
    ; coefficients are already in m5/m6, so the loop of layout %6 can be shared
    jmp mangle(program_name %+ _ %+ %6 %+ ToY %+ SUFFIX).body
 %else ; %0 == 6
 .body:
 %if ARCH_X86_64
    movsxd         wq, wd
 %endif
    ; srcq += 4*w, dstq += 2*w, wq = -2*w: [dstq+wq] and [srcq+wq*2] start at
    ; the original pointers and the loop ends when wq reaches zero
    lea          srcq, [srcq+wq*4]
    add            wq, wq
    add          dstq, wq
    neg            wq
    mova           m4, [rgb_Yrnd]
    pcmpeqb        m7, m7
    psrlw          m7, 8                  ; (word) { 0x00ff } x4
 .loop:
    ; each pass reads 2*mmsize source bytes and stores mmsize bytes of output
    ; FIXME check alignment and use mova
    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
    DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
    pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
    pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
    pmaddwd        m3, m5                 ; (dword) { Bx*BY + Rx*RY }[4-7]
    pmaddwd        m2, m6                 ; (dword) { Gx*GY }[4-7]
    paddd          m0, m4                 ; += rgb_Yrnd
    paddd          m2, m4                 ; += rgb_Yrnd
    paddd          m0, m1                 ; (dword) { Y[0-3] }
    paddd          m2, m3                 ; (dword) { Y[4-7] }
    ; drop 9 fractional bits, then narrow to words with signed saturation
    psrad          m0, 9
    psrad          m2, 9
    packssdw       m0, m2                 ; (word) { Y[0-7] }
    mova    [dstq+wq], m0
    add            wq, mmsize
    jl .loop
    REP_RET
 %endif ; %0 == 6
 %endmacro

 ; Emit the <layout>ToUV input function for one packed 32-bit RGB layout.
 ;
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 ; %6 = (optional) name of the layout whose .body is reused. Sharing is only
 ;      done on x86-64, where the coefficients live in m8-m11; on x86-32 the
 ;      coefficients are memory operands inside the loop, so every layout
 ;      gets its own body.
 ;
 ; Named arguments: dstU, dstV, u1, src, u2, w, u3 (u1-u3 are not referenced
 ; here).
 ; Register roles inside .body:
 ;   m6 = rgb_UVrnd
 ;   m7 = 0x00ff word mask handed to DEINTB
 ;   coeffU1/coeffV1 = coefficient pairs for the 1st+3rd channel of the layout
 ;   coeffU2/coeffV2 = coefficient pairs for the 2nd+4th channel of the layout
 ;   wq = negative byte offset into dstU/dstV, counting up to zero
 %macro RGB32_TO_UV_FN 5-6
 cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
    mova           m8, [rgba_Ucoeff_%2%4]
    mova           m9, [rgba_Ucoeff_%3%5]
    mova          m10, [rgba_Vcoeff_%2%4]
    mova          m11, [rgba_Vcoeff_%3%5]
 %define coeffU1 m8
 %define coeffU2 m9
 %define coeffV1 m10
 %define coeffV2 m11
 %else ; x86-32
 %define coeffU1 [rgba_Ucoeff_%2%4]
 %define coeffU2 [rgba_Ucoeff_%3%5]
 %define coeffV1 [rgba_Vcoeff_%2%4]
 %define coeffV2 [rgba_Vcoeff_%3%5]
 %endif ; x86-64/32
 %if ARCH_X86_64 && %0 == 6
    ; coefficients are already in m8-m11, so the loop of layout %6 can be shared
    jmp mangle(program_name %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
 %else ; ARCH_X86_64 && %0 == 6
 .body:
 %if ARCH_X86_64
    movsxd         wq, dword r5m
 %else ; x86-32
    mov            wq, r5m
 %endif
    ; wq = 2*w; dstU/dstV += 2*w, srcq += 4*w, then wq = -2*w so the indexed
    ; accesses start at the original pointers and the loop ends at wq == 0
    add            wq, wq
    add         dstUq, wq
    add         dstVq, wq
    lea          srcq, [srcq+wq*2]
    neg            wq
    pcmpeqb        m7, m7
    psrlw          m7, 8                  ; (word) { 0x00ff } x4
    mova           m6, [rgb_UVrnd]
 .loop:
    ; each pass reads 2*mmsize source bytes and stores mmsize bytes per plane
    ; FIXME check alignment and use mova
    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
    DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
    pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
    pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
    pmaddwd        m1, coeffU1            ; (dword) { Bx*BU + Rx*RU }[0-3]
    pmaddwd        m0, coeffU2            ; (dword) { Gx*GU }[0-3]
    paddd          m3, m6                 ; += rgb_UVrnd
    paddd          m1, m6                 ; += rgb_UVrnd
    paddd          m2, m3                 ; (dword) { V[0-3] }
    paddd          m0, m1                 ; (dword) { U[0-3] }
    pmaddwd        m3, m5, coeffV1        ; (dword) { Bx*BV + Rx*RV }[4-7]
    pmaddwd        m1, m4, coeffV2        ; (dword) { Gx*GV }[4-7]
    pmaddwd        m5, coeffU1            ; (dword) { Bx*BU + Rx*RU }[4-7]
    pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
    paddd          m3, m6                 ; += rgb_UVrnd
    paddd          m5, m6                 ; += rgb_UVrnd
    psrad          m0, 9
    paddd          m1, m3                 ; (dword) { V[4-7] }
    paddd          m4, m5                 ; (dword) { U[4-7] }
    psrad          m2, 9
    psrad          m4, 9
    psrad          m1, 9
    packssdw       m0, m4                 ; (word) { U[0-7] }
    packssdw       m2, m1                 ; (word) { V[0-7] }
    ; the store is the same for MMX (mmsize == 8) and XMM (mmsize == 16)
    mova   [dstUq+wq], m0
    mova   [dstVq+wq], m2
    add            wq, mmsize
    jl .loop
    REP_RET
 %endif ; ARCH_X86_64 && %0 == 6
 %endmacro

 ; Instantiate the ToY and ToUV functions for all four packed 32-bit layouts.
 ; The rgba variants are created without a 6th argument, so they own the
 ; .body that the bgra/argb/abgr variants jump to (ToY always, ToUV only on
 ; x86-64).
 ;
 ; %1 = nr. of XMM registers for rgb-to-Y func
 ; %2 = nr. of XMM registers for rgb-to-UV func
 %macro RGB32_FUNCS 2
 RGB32_TO_Y_FN %1, r, g, b, a
 RGB32_TO_Y_FN %1, b, g, r, a, rgba
 RGB32_TO_Y_FN %1, a, r, g, b, rgba
 RGB32_TO_Y_FN %1, a, b, g, r, rgba

 RGB32_TO_UV_FN %2, r, g, b, a
 RGB32_TO_UV_FN %2, b, g, r, a, rgba
 RGB32_TO_UV_FN %2, a, r, g, b, rgba
 RGB32_TO_UV_FN %2, a, b, g, r, rgba
 %endmacro

 ; Build the RGB32 input functions once per instruction set. The MMX variant
 ; is only assembled for x86-32 and declares no XMM registers. SSE2 and AVX
 ; declare 8 XMM registers for ToY (m0-m7 are used) and 12 for ToUV (m0-m11
 ; are used on x86-64).
 %if ARCH_X86_32
 INIT_MMX mmx
 RGB32_FUNCS 0, 0
 %endif

 INIT_XMM sse2
 RGB32_FUNCS 8, 12

 INIT_XMM avx
 RGB32_FUNCS 8, 12

 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
 ;
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -308,6 +308,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
    INPUT_FUNC(yuyv, opt); \
    INPUT_UV_FUNC(nv12, opt); \
    INPUT_UV_FUNC(nv21, opt); \
    INPUT_FUNC(rgba, opt); \
    INPUT_FUNC(bgra, opt); \
    INPUT_FUNC(argb, opt); \
    INPUT_FUNC(abgr, opt); \
    INPUT_FUNC(rgb24, opt); \
    INPUT_FUNC(bgr24, opt)

@@ -406,6 +410,10 @@ switch(c->dstBpc){ \
            break;
        case_rgb(rgb24, RGB24, mmx);
        case_rgb(bgr24, BGR24, mmx);
        case_rgb(bgra,  BGRA,  mmx);
        case_rgb(rgba,  RGBA,  mmx);
        case_rgb(abgr,  ABGR,  mmx);
        case_rgb(argb,  ARGB,  mmx);
        default:
            break;
        }
@@ -450,6 +458,10 @@ switch(c->dstBpc){ \
            break;
        case_rgb(rgb24, RGB24, sse2);
        case_rgb(bgr24, BGR24, sse2);
        case_rgb(bgra,  BGRA,  sse2);
        case_rgb(rgba,  RGBA,  sse2);
        case_rgb(abgr,  ABGR,  sse2);
        case_rgb(argb,  ARGB,  sse2);
        default:
            break;
        }
@@ -493,6 +505,10 @@ switch(c->dstBpc){ \
            break;
        case_rgb(rgb24, RGB24, avx);
        case_rgb(bgr24, BGR24, avx);
        case_rgb(bgra,  BGRA,  avx);
        case_rgb(rgba,  RGBA,  avx);
        case_rgb(abgr,  ABGR,  avx);
        case_rgb(argb,  ARGB,  avx);
        default:
            break;
        }
--- a/libswscale/x86/w64xmmtest.c
+++ b/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
 /*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/x86/w64xmmtest.h"
 #include "libswscale/swscale.h"

 /*
 * Wrapper for sws_scale() used by the XMM clobber test. It is built from the
 * wrap() and testxmmclobbers() macros of libavutil/x86/w64xmmtest.h and
 * forwards every argument unchanged.
 * NOTE(review): the macro expansions are not visible here -- presumably they
 * interpose on the real symbol and compare the callee-saved XMM registers
 * before and after the call; confirm against the header.
 */
 wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
               const int srcStride[], int srcSliceY, int srcSliceH,
               uint8_t *const dst[], const int dstStride[]))
 {
    testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
                    srcSliceH, dst, dstStride);
 }
--- a/tests/codec-regression.sh
+++ b/tests/codec-regression.sh
@@ -368,7 +368,7 @@ $tiny_psnr $pcm_dst $pcm_ref 2 1924
 fi

 if [ -n "$do_ac3_fixed" ] ; then
 do_audio_encoding ac3.rm "-vn -acodec ac3_fixed"
 do_audio_encoding ac3.ac3 "-vn -acodec ac3_fixed"
 # binaries configured with --disable-sse decode ac3 differently
 #do_audio_decoding
 #$tiny_psnr $pcm_dst $pcm_ref 2 1024
--- a/tests/ref/acodec/ac3_fixed
+++ b/tests/ref/acodec/ac3_fixed
@@ -1,2 +1,2 @@
 e7fa185030a56d9db8663ad9e38c6c94 *./tests/data/acodec/ac3.rm
 98751 ./tests/data/acodec/ac3.rm
 a1d1fc116463b771abf5aef7ed37d7b1 *./tests/data/acodec/ac3.ac3
 96408 ./tests/data/acodec/ac3.ac3
--- a/tests/ref/fate/vc1-ism
+++ b/tests/ref/fate/vc1-ism
@@ -117,4 +117,3 @@
 0, 438750, 37440, 0xf0fe8c1c
 0, 442500, 37440, 0xc0036222
 0, 446250, 37440, 0x3058385c
 0, 450000, 37440, 0x68141016
--- a/tests/ref/seek/ac3_ac3
+++ b/tests/ref/seek/ac3_ac3
@@ -0,0 +1,49 @@
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:   556
 ret: 0         st:-1 flags:0  ts:-1.000000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:   556
 ret: 0         st:-1 flags:1  ts: 1.894167
 ret: 0         st: 0 flags:1 dts: 1.880400 pts: 1.880400 pos:  30092 size:   558
 ret: 0         st: 0 flags:0  ts: 0.788333
 ret: 0         st: 0 flags:1 dts: 0.800911 pts: 0.800911 pos:  12818 size:   556
 ret:-1         st: 0 flags:1  ts:-0.317500
 ret: 0         st:-1 flags:0  ts: 2.576668
 ret: 0         st: 0 flags:1 dts: 2.576844 pts: 2.576844 pos:  41238 size:   558
 ret: 0         st:-1 flags:1  ts: 1.470835
 ret: 0         st: 0 flags:1 dts: 1.462533 pts: 1.462533 pos:  23406 size:   556
 ret: 0         st: 0 flags:0  ts: 0.365000
 ret: 0         st: 0 flags:1 dts: 0.383044 pts: 0.383044 pos:   6130 size:   558
 ret:-1         st: 0 flags:1  ts:-0.740833
 ret: 0         st:-1 flags:0  ts: 2.153336
 ret: 0         st: 0 flags:1 dts: 2.158978 pts: 2.158978 pos:  34552 size:   556
 ret: 0         st:-1 flags:1  ts: 1.047503
 ret: 0         st: 0 flags:1 dts: 1.044667 pts: 1.044667 pos:  16718 size:   558
 ret: 0         st: 0 flags:0  ts:-0.058333
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:   556
 ret: 0         st: 0 flags:1  ts: 2.835833
 ret: 0         st: 0 flags:1 dts: 2.820600 pts: 2.820600 pos:  45140 size:   556
 ret: 0         st:-1 flags:0  ts: 1.730004
 ret: 0         st: 0 flags:1 dts: 1.741111 pts: 1.741111 pos:  27864 size:   556
 ret: 0         st:-1 flags:1  ts: 0.624171
 ret: 0         st: 0 flags:1 dts: 0.591978 pts: 0.591978 pos:   9474 size:   556
 ret: 0         st: 0 flags:0  ts:-0.481667
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:   556
 ret: 0         st: 0 flags:1  ts: 2.412500
 ret: 0         st: 0 flags:1 dts: 2.402733 pts: 2.402733 pos:  38452 size:   558
 ret: 0         st:-1 flags:0  ts: 1.306672
 ret: 0         st: 0 flags:1 dts: 1.323244 pts: 1.323244 pos:  21176 size:   558
 ret: 0         st:-1 flags:1  ts: 0.200839
 ret: 0         st: 0 flags:1 dts: 0.174111 pts: 0.174111 pos:   2786 size:   558
 ret: 0         st: 0 flags:0  ts:-0.904989
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:      0 size:   556
 ret: 0         st: 0 flags:1  ts: 1.989178
 ret: 0         st: 0 flags:1 dts: 1.984867 pts: 1.984867 pos:  31764 size:   558
 ret: 0         st:-1 flags:0  ts: 0.883340
 ret: 0         st: 0 flags:1 dts: 0.905378 pts: 0.905378 pos:  14488 size:   558
 ret:-1         st:-1 flags:1  ts:-0.222493
 ret: 0         st: 0 flags:0  ts: 2.671678
 ret: 0         st: 0 flags:1 dts: 2.681311 pts: 2.681311 pos:  42910 size:   558
 ret: 0         st: 0 flags:1  ts: 1.565844
 ret: 0         st: 0 flags:1 dts: 1.532178 pts: 1.532178 pos:  24520 size:   558
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 0.487511 pts: 0.487511 pos:   7802 size:   556
 ret:-1         st:-1 flags:1  ts:-0.645825
--- a/tests/ref/seek/ac3_rm
+++ b/tests/ref/seek/ac3_rm
@@ -1,41 +0,0 @@
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st:-1 flags:0  ts:-1.000000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st:-1 flags:1  ts: 1.894167
 ret:-1         st: 0 flags:0  ts: 0.788000
 ret: 0         st: 0 flags:1  ts:-0.317000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st:-1 flags:0  ts: 2.576668
 ret: 0         st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos:  34997 size:   558
 ret:-1         st:-1 flags:1  ts: 1.470835
 ret:-1         st: 0 flags:0  ts: 0.365000
 ret: 0         st: 0 flags:1  ts:-0.741000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st:-1 flags:0  ts: 2.153336
 ret: 0         st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos:  34997 size:   558
 ret:-1         st:-1 flags:1  ts: 1.047503
 ret: 0         st: 0 flags:0  ts:-0.058000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st: 0 flags:1  ts: 2.836000
 ret: 0         st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos:  34997 size:   558
 ret:-1         st:-1 flags:0  ts: 1.730004
 ret:-1         st:-1 flags:1  ts: 0.624171
 ret: 0         st: 0 flags:0  ts:-0.482000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st: 0 flags:1  ts: 2.413000
 ret: 0         st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos:  34997 size:   558
 ret: 0         st:-1 flags:0  ts: 1.306672
 ret: 0         st: 0 flags:1 dts:65.537000 pts:65.537000 pos:  87488 size:  6132
 ret:-1         st:-1 flags:1  ts: 0.200839
 ret: 0         st: 0 flags:0  ts:-0.905000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st: 0 flags:1  ts: 1.989000
 ret:-1         st:-1 flags:0  ts: 0.883340
 ret: 0         st:-1 flags:1  ts:-0.222493
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st: 0 flags:0  ts: 2.672000
 ret:-1         st: 0 flags:1  ts: 1.566000
 ret: 0         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts: 1.567000 pts: 1.567000 pos:  25889 size:   556
 ret: 0         st:-1 flags:1  ts:-0.645825
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556