Merge remote-tracking branch 'qatar/master'

* qatar/master: (22 commits)
  configure: enable memalign_hack automatically when needed
  swscale: unbreak the build on non-x86 systems.
  swscale: remove if(bitexact) branch from functions.
  swscale: remove if(canMMX2BeUsed) conditional.
  swscale: remove swScale_{c,MMX,MMX2} duplication.
  swscale: use emms_c().
  Move emms_c() from libavcodec to libavutil.
  tiff: set palette in the context when specified in TIFF_PAL tag
  rtsp: use strtoul to parse rtptime and seq values.
  pgssubdec: fix incorrect colors.
  dvdsubdec: fix incorrect colors.
  ape: Allow demuxing of files with metadata tags.
  swscale: remove dead macro WRITEBGR24OLD.
  swscale: remove AMD3DNOW "optimizations".
  swscale: remove duplicate code in ppc/ subdirectory.
  swscale: remove duplicated x86/ functions.
  swscale: force --enable-runtime-cpudetect and remove SWS_CPU_CAPS_*.
  vsrc_buffer.h: add file doxy
  vsrc_buffer: tweak error message in init()
  msmpeg4: reindent.
  ...

Merged-by: Michael Niedermayer <michaelni@gmx.at>
14 years ago · 034fc7bf12
--- a/configure
+++ b/configure
@@ -2859,11 +2859,6 @@ check_header X11/extensions/XvMClib.h

 check_struct dxva2api.h DXVA_PictureParameters wDecodedPictureIndex

 if ! enabled_any memalign memalign_hack posix_memalign malloc_aligned &&
     enabled_any $need_memalign ; then
    die "Error, no aligned memory allocator but SSE enabled, disable it or use --enable-memalign-hack."
 fi

 disabled  zlib || check_lib   zlib.h      zlibVersion -lz   || disable  zlib
 disabled bzlib || check_lib2 bzlib.h BZ2_bzlibVersion -lbz2 || disable bzlib

@@ -3156,6 +3151,9 @@ check_deps $CONFIG_LIST       \

 enabled asm || { arch=c; disable $ARCH_LIST $ARCH_EXT_LIST; }

 ! enabled_any memalign posix_memalign malloc_aligned &&
    enabled_any $need_memalign && enable memalign_hack

 echo "install prefix            $prefix"
 echo "source path               $source_path"
 echo "C compiler                $cc"
--- a/doc/encoders.texi
+++ b/doc/encoders.texi
@@ -433,3 +433,49 @@ For more information about libx264 and the supported options see:
@url{http://www.videolan.org/developers/x264.html}

@c man end VIDEO ENCODERS

@subheading Floating-Point-Only AC-3 Encoding Options

 These options are only valid for the floating-point encoder and do not exist
 for the fixed-point encoder due to the corresponding features not being
 implemented in fixed-point.

@table @option

@item -channel_coupling @var{boolean}
 Enables/Disables use of channel coupling, which is an optional AC-3 feature
 that increases quality by combining high frequency information from multiple
 channels into a single channel. The per-channel high frequency information is
 sent with less accuracy in both the frequency and time domains. This allows
 more bits to be used for lower frequencies while preserving enough information
 to reconstruct the high frequencies. This option is enabled by default for the
 floating-point encoder and should generally be left as enabled except for
 testing purposes or to increase encoding speed.
@table @option
@item -1
@itemx auto
 Selected by Encoder (default)
@item 0
@itemx off
 Disable Channel Coupling
@item 1
@itemx on
 Enable Channel Coupling
@end table

@item -cpl_start_band @var{number}
 Coupling Start Band. Sets the channel coupling start band, from 1 to 15. If a
 value higher than the bandwidth is used, it will be reduced to 1 less than the
 coupling end band. If @var{auto} is used, the start band will be determined by
 the encoder based on the bit rate, sample rate, and channel layout. This option
 has no effect if channel coupling is disabled.
@table @option
@item -1
@itemx auto
 Selected by Encoder (default)
@end table

@end table

@c man end ENCODERS

--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -269,8 +269,6 @@ OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpegvideo_enc.o \
                                          mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)     += vaapi_mpeg4.o
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V1_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
                                          h263.o ituh263dec.o mpeg4videodec.o
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
                                          h263.o ituh263dec.o mpeg4videodec.o
 OBJS-$(CONFIG_MSMPEG4V2_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -28,7 +28,8 @@
 #define AVCODEC_AC3_H

 #define AC3_MAX_CODED_FRAME_SIZE 3840 /* in bytes */
 #define AC3_MAX_CHANNELS 6 /* including LFE channel */
 #define AC3_MAX_CHANNELS 7            /**< maximum number of channels, including coupling channel */
 #define CPL_CH 0                      /**< coupling channel index */

 #define AC3_MAX_COEFS   256
 #define AC3_BLOCK_SIZE  256
@@ -158,7 +159,9 @@ typedef struct AC3EncOptions {

    /* other encoding options */
    int allow_per_frame_metadata;
    int stereo_rematrixing;    
    int stereo_rematrixing;
    int channel_coupling;
    int cpl_start;    
 } AC3EncOptions;


--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -58,11 +58,6 @@
 #include "fft.h"
 #include "fmtconvert.h"

 /* override ac3.h to include coupling channel */
 #undef AC3_MAX_CHANNELS
 #define AC3_MAX_CHANNELS 7
 #define CPL_CH 0

 #define AC3_OUTPUT_LFEON  8

 #define SPX_MAX_BANDS    17
--- a/libavcodec/ac3dec_data.c
+++ b/libavcodec/ac3dec_data.c
@@ -53,12 +53,6 @@ const uint8_t ff_eac3_hebap_tab[64] = {
    19, 19, 19, 19,
 };

 /**
 * Table E2.16 Default Coupling Banding Structure
 */
 const uint8_t ff_eac3_default_cpl_band_struct[18] =
 { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1 };

 /**
 * Table E2.15 Default Spectral Extension Banding Structure
 */
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -27,7 +27,6 @@
 extern const uint8_t ff_ac3_ungroup_3_in_5_bits_tab[32][3];

 extern const uint8_t ff_eac3_hebap_tab[64];
 extern const uint8_t ff_eac3_default_cpl_band_struct[18];
 extern const uint8_t ff_eac3_default_spx_band_struct[17];

 #endif /* AVCODEC_AC3DEC_DATA_H */
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -101,7 +101,7 @@ static void scale_coefficients(AC3EncodeContext *s)

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = 0; ch < s->channels; ch++) {
        for (ch = 1; ch <= s->channels; ch++) {
            s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
                                       block->coeff_shift[ch]);
        }
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -93,8 +93,10 @@ static int normalize_samples(AC3EncodeContext *s)
 */
 static void scale_coefficients(AC3EncodeContext *s)
 {
    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
    int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS;
    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer + chan_size,
                               s->mdct_coef_buffer  + chan_size,
                               chan_size * s->channels);
 }


--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -138,6 +138,13 @@ const uint16_t ff_ac3_bitrate_tab[19] = {
 */
 const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };

 /**
 * Table E2.16 Default Coupling Banding Structure
 */
 const uint8_t ff_eac3_default_cpl_band_struct[18] = {
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1
 };

 /* AC-3 MDCT window */

 /* MDCT window */
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -39,6 +39,7 @@ extern const uint8_t  ff_ac3_dec_channel_map[8][2][6];
 extern const uint16_t ff_ac3_sample_rate_tab[3];
 extern const uint16_t ff_ac3_bitrate_tab[19];
 extern const uint8_t  ff_ac3_rematrix_band_tab[5];
 extern const uint8_t  ff_eac3_default_cpl_band_struct[18];
 extern const int16_t  ff_ac3_window[AC3_WINDOW_SIZE/2];
 extern const uint8_t  ff_ac3_log_add_tab[260];
 extern const uint16_t ff_ac3_hearing_threshold_tab[AC3_CRITICAL_BANDS][3];
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -156,7 +156,7 @@ void avcodec_register_all(void)
    REGISTER_DECODER (MPEG1_VDPAU, mpeg1_vdpau);
    REGISTER_DECODER (MPEG2_CRYSTALHD, mpeg2_crystalhd);
    REGISTER_DECODER (MSMPEG4_CRYSTALHD, msmpeg4_crystalhd);
    REGISTER_ENCDEC  (MSMPEG4V1, msmpeg4v1);
    REGISTER_DECODER (MSMPEG4V1, msmpeg4v1);
    REGISTER_ENCDEC  (MSMPEG4V2, msmpeg4v2);
    REGISTER_ENCDEC  (MSMPEG4V3, msmpeg4v3);
    REGISTER_DECODER (MSRLE, msrle);
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -628,13 +628,6 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
    }
 }

 /**
 * Empty mmx state.
 * this must be called between any dsp function and float/double code.
 * for example sin(); dsp->idct_put(); emms_c(); cos()
 */
 #define emms_c()

 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
@@ -652,22 +645,9 @@ void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
 void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);

 #if HAVE_MMX

 #undef emms_c

 static inline void emms(void)
 {
    __asm__ volatile ("emms;":::"memory");
 }

 #define emms_c() \
 {\
    if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)\
        emms();\
 }
 #if ARCH_ARM

 #elif ARCH_ARM

 #if HAVE_NEON
 #   define STRIDE_ALIGN 16
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -638,15 +638,6 @@ av_cold int MPV_encode_init(AVCodecContext *avctx)
        s->low_delay= s->max_b_frames ? 0 : 1;
        avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
        break;
    case CODEC_ID_MSMPEG4V1:
        s->out_format = FMT_H263;
        s->h263_msmpeg4 = 1;
        s->h263_pred = 1;
        s->unrestricted_mv = 1;
        s->msmpeg4_version= 1;
        avctx->delay=0;
        s->low_delay=1;
        break;
    case CODEC_ID_MSMPEG4V2:
        s->out_format = FMT_H263;
        s->h263_msmpeg4 = 1;
@@ -3807,18 +3798,6 @@ AVCodec ff_h263p_encoder = {
    .long_name= NULL_IF_CONFIG_SMALL("H.263+ / H.263-1998 / H.263 version 2"),
 };

 AVCodec ff_msmpeg4v1_encoder = {
    "msmpeg4v1",
    AVMEDIA_TYPE_VIDEO,
    CODEC_ID_MSMPEG4V1,
    sizeof(MpegEncContext),
    MPV_encode_init,
    MPV_encode_picture,
    MPV_encode_end,
    .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
    .long_name= NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 1"),
 };

 AVCodec ff_msmpeg4v2_encoder = {
    "msmpeg4v2",
    AVMEDIA_TYPE_VIDEO,
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -846,22 +846,14 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
    int pred, extquant;
    int extrabits = 0;

    if(s->msmpeg4_version==1){
        int32_t *dc_val;
        pred = msmpeg4v1_pred_dc(s, n, &dc_val);

        /* update predictor */
        *dc_val= level;
    }else{
        int16_t *dc_val;
        pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
    int16_t *dc_val;
    pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);

        /* update predictor */
        if (n < 4) {
            *dc_val = level * s->y_dc_scale;
        } else {
            *dc_val = level * s->c_dc_scale;
        }
    /* update predictor */
    if (n < 4) {
        *dc_val = level * s->y_dc_scale;
    } else {
        *dc_val = level * s->c_dc_scale;
    }

    /* do the prediction */
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -54,8 +54,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
                                CONFIG_MSMPEG4V3_DECODER || \
                                CONFIG_WMV2_DECODER      || \
                                CONFIG_VC1_DECODER)
 #define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V1_ENCODER || \
                                CONFIG_MSMPEG4V2_ENCODER || \
 #define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
                                CONFIG_MSMPEG4V3_ENCODER || \
                                CONFIG_WMV2_ENCODER)

--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -39,6 +39,8 @@ typedef struct TiffContext {

    int width, height;
    unsigned int bpp, bppcount;
    uint32_t palette[256];
    int palette_is_set;
    int le;
    enum TiffCompr compr;
    int invert;
@@ -255,11 +257,15 @@ static int init_image(TiffContext *s)
        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
        return ret;
    }
    if (s->bpp == 8 && s->picture.data[1]){
        /* make default grayscale pal */
        pal = (uint32_t *) s->picture.data[1];
        for (i = 0; i < 256; i++)
            pal[i] = i * 0x010101;
    if (s->avctx->pix_fmt == PIX_FMT_PAL8) {
        if (s->palette_is_set) {
            memcpy(s->picture.data[1], s->palette, sizeof(s->palette));
        } else {
            /* make default grayscale pal */
            pal = (uint32_t *) s->picture.data[1];
            for (i = 0; i < 256; i++)
                pal[i] = i * 0x010101;
        }
    }
    return 0;
 }
@@ -442,11 +448,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
        s->fill_order = value - 1;
        break;
    case TIFF_PAL:
        if(s->avctx->pix_fmt != PIX_FMT_PAL8){
            av_log(s->avctx, AV_LOG_ERROR, "Palette met but this is not palettized format\n");
            return -1;
        }
        pal = (uint32_t *) s->picture.data[1];
        pal = (uint32_t *) s->palette;
        off = type_sizes[type];
        rp = buf;
        gp = buf + count / 3 * off;
@@ -459,6 +461,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
            j |= tget(&bp, type, s->le) >> off;
            pal[i] = j;
        }
        s->palette_is_set = 1;
        break;
    case TIFF_PLANAR:
        if(value == 2){
--- a/libavfilter/vf_mp.c
+++ b/libavfilter/vf_mp.c
@@ -287,13 +287,8 @@ zrmjpeg
 CpuCaps gCpuCaps; //FIXME initialize this so optims work


 //exact copy from vf_scale.c
 int get_sws_cpuflags(void){
    return
          (gCpuCaps.hasMMX   ? SWS_CPU_CAPS_MMX   : 0)
        | (gCpuCaps.hasMMX2  ? SWS_CPU_CAPS_MMX2  : 0)
        | (gCpuCaps.has3DNow ? SWS_CPU_CAPS_3DNOW : 0)
        | (gCpuCaps.hasAltiVec ? SWS_CPU_CAPS_ALTIVEC : 0);
    return 0;
 }

 static void sws_getFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam)
@@ -348,7 +343,7 @@ struct SwsContext *sws_getContextFromCmdLine(int srcW, int srcH, int srcFormat,
        if (srcFormat == IMGFMT_RGB8 || srcFormat == IMGFMT_BGR8) sfmt = PIX_FMT_PAL8;
        sws_getFlagsAndFilterFromCmdLine(&flags, &srcFilterParam, &dstFilterParam);

        return sws_getContext(srcW, srcH, sfmt, dstW, dstH, dfmt, flags | get_sws_cpuflags(), srcFilterParam, dstFilterParam, NULL);
        return sws_getContext(srcW, srcH, sfmt, dstW, dstH, dfmt, flags , srcFilterParam, dstFilterParam, NULL);
 }

 typedef struct {
--- a/libavfilter/vsrc_buffer.h
+++ b/libavfilter/vsrc_buffer.h
@@ -27,6 +27,7 @@
 * memory buffer source API for video
 */


 #include "avfilter.h"

 /**
--- a/libavformat/ape.c
+++ b/libavformat/ape.c
@@ -276,7 +276,7 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
    ape->frames[0].nblocks = ape->blocksperframe;
    ape->frames[0].skip    = 0;
    for (i = 1; i < ape->totalframes; i++) {
        ape->frames[i].pos      = ape->seektable[i] + ape->junklength; //ape->frames[i-1].pos + ape->blocksperframe;
        ape->frames[i].pos      = ape->seektable[i] + ape->junklength;
        ape->frames[i].nblocks  = ape->blocksperframe;
        ape->frames[i - 1].size = ape->frames[i].pos - ape->frames[i - 1].pos;
        ape->frames[i].skip     = (ape->frames[i].pos - ape->frames[0].pos) & 3;
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -37,6 +37,7 @@
 #include "config.h"
 #include "attributes.h"
 #include "timer.h"
 #include "cpu.h"

 #ifndef attribute_align_arg
 #if ARCH_X86_32 && AV_GCC_VERSION_AT_LEAST(4,2)
@@ -222,4 +223,19 @@
 #   define ONLY_IF_THREADS_ENABLED(x) NULL
 #endif

 #if HAVE_MMX
 /**
 * Empty mmx state.
 * this must be called between any dsp function and float/double code.
 * for example sin(); dsp->idct_put(); emms_c(); cos()
 */
 static av_always_inline void emms_c(void)
 {
    if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
        __asm__ volatile ("emms" ::: "memory");
 }
 #else /* HAVE_MMX */
 #define emms_c()
 #endif /* HAVE_MMX */

 #endif /* AVUTIL_INTERNAL_H */
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -79,15 +79,13 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
 void ff_bfin_get_unscaled_swscale(SwsContext *c)
 {
    SwsFunc swScale = c->swScale;
    if (c->flags & SWS_CPU_CAPS_BFIN)
        if (c->dstFormat == PIX_FMT_YUV420P)
            if (c->srcFormat == PIX_FMT_UYVY422) {
                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
                c->swScale = uyvytoyv12_unscaled;
            }
        if (c->dstFormat == PIX_FMT_YUV420P)
            if (c->srcFormat == PIX_FMT_YUYV422) {
                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
                c->swScale = yuyvtoyv12_unscaled;
            }

    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
        c->swScale = uyvytoyv12_unscaled;
    }
    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
        c->swScale = yuyvtoyv12_unscaled;
    }
 }
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -33,31 +33,6 @@

 #define FUNC(s,d,n) {s,d,#n,n}

 static int cpu_caps;

 static char *args_parse(int argc, char *argv[])
 {
    int o;

    while ((o = getopt(argc, argv, "m23")) != -1) {
        switch (o) {
        case 'm':
            cpu_caps |= SWS_CPU_CAPS_MMX;
            break;
        case '2':
            cpu_caps |= SWS_CPU_CAPS_MMX2;
            break;
        case '3':
            cpu_caps |= SWS_CPU_CAPS_3DNOW;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
        }
    }

    return argv[optind];
 }

 int main(int argc, char **argv)
 {
    int i, funcNum;
@@ -70,9 +45,7 @@ int main(int argc, char **argv)
        return -1;

    av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
    args_parse(argc, argv);
    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
    sws_rgb2rgb_init(cpu_caps);
    sws_rgb2rgb_init();

    for(funcNum=0; ; funcNum++) {
        struct func_info_s {
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -48,12 +48,6 @@ static const AVOption options[] = {
    { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "print_info", "print info", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "sse2", "SSE2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_SSE2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_3DNOW }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_ALTIVEC }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_BFIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bitexact", "", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
--- a/libswscale/ppc/swscale_template.c
+++ b/libswscale/ppc/swscale_template.c
@@ -23,69 +23,16 @@
 #include "swscale_altivec_template.c"
 #endif

 #if COMPILE_TEMPLATE_ALTIVEC
 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
 #if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
 #else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
 #endif //!COMPILE_TEMPLATE_ALTIVEC
 }

 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
 {
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
 }

 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
    int i;
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
 }


 /**
 * vertical scale YV12 to RGB
 */
@@ -93,7 +40,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
 {
 #if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
@@ -104,815 +50,17 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                                   chrFilter, chrSrc, chrFilterSize,
                                   dest, dstW, dstY);
    else
 #endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
 }
 #endif

 /**
 * vertical bilinear scale YV12 to RGB
 */
 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
 }

 /**
 * YV12 to RGB without scaling or interpolating
 */
 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
 {
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
 }

 //FIXME yuy2* can read up to 7 samples too much

 static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
 }

 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
    assert(src1 == src2);
 }

 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
 }

 /* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
 }

 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
    assert(src1 == src2);
 }

 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
 }

 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
 {
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
 }

 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
 {
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
 }

 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
 {
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
 }


 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
 }

 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
    assert(src1 == src2);
 }

 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
 }

 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++) {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
 }

 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
 }

 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
 }


 // bilinear / bicubic scaling
 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
 {
 #if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
 #else
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
 #endif /* COMPILE_TEMPLATE_ALTIVEC */
 }

 //FIXME all pal and rgb srcFormats could do this conversion as well
 //FIXME all scalers more complex than bilinear could do half of this transform
 /**
  * In-place range expansion of one line of horizontally scaled chroma.
  *
  * Both chroma halves of the line are processed: entries [0, width) and
  * entries [VOFW, VOFW + width). Each sample is clamped to 30775, multiplied
  * by 4663, reduced by 9289992 and shifted right by 12.
  * Installed by sws_init_swScale as chrConvertRange when the source range
  * flag is 0 and differs from the destination range flag.
  *
  * @param dst   chroma line buffer, modified in place
  * @param width number of samples per chroma half
  */
 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
 {
    int x;
    for (x = 0; x < width; x++) {
        const int u = FFMIN(dst[x],30775);
        dst[x] = (u*4663 - 9289992)>>12; //-264
        {
            const int v = FFMIN(dst[x+VOFW],30775);
            dst[x+VOFW] = (v*4663 - 9289992)>>12; //-264
        }
    }
 }
 /**
  * In-place range compression of one line of horizontally scaled chroma.
  *
  * Both chroma halves of the line are processed: entries [0, width) and
  * entries [VOFW, VOFW + width). Each sample is multiplied by 1799,
  * increased by 4081085 and shifted right by 11.
  * Installed by sws_init_swScale as chrConvertRange when the source range
  * flag is set and differs from the destination range flag.
  *
  * @param dst   chroma line buffer, modified in place
  * @param width number of samples per chroma half
  */
 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
 {
    int x;
    for (x = 0; x < width; x++) {
        const int u = dst[x];
        dst[x] = (u*1799 + 4081085)>>11; //1469
        {
            const int v = dst[x+VOFW];
            dst[x+VOFW] = (v*1799 + 4081085)>>11; //1469
        }
    }
 }
 /**
  * In-place range expansion of one line of horizontally scaled luma.
  *
  * Each sample is clamped to 30189, multiplied by 19077, reduced by 39057361
  * and shifted right by 14.
  * Installed by sws_init_swScale as lumConvertRange when the source range
  * flag is 0 and differs from the destination range flag.
  *
  * @param dst   luma line buffer, modified in place
  * @param width number of samples
  */
 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
 {
    int x = 0;
    while (x < width) {
        const int y = FFMIN(dst[x],30189);
        dst[x] = (y*19077 - 39057361)>>14;
        x++;
    }
 }
 /**
  * In-place range compression of one line of horizontally scaled luma.
  *
  * Each sample is multiplied by 14071, increased by 33561947 and shifted
  * right by 14.
  * Installed by sws_init_swScale as lumConvertRange when the source range
  * flag is set and differs from the destination range flag.
  *
  * @param dst   luma line buffer, modified in place
  * @param width number of samples
  */
 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
 {
    int x = 0;
    while (x < width) {
        const int y = dst[x];
        dst[x] = (y*14071 + 33561947)>>14;
        x++;
    }
 }

 /**
  * Fast bilinear horizontal scaling of one 8-bit luma/alpha line.
  *
  * The source position advances by xInc (16.16 fixed point) per output
  * sample; the upper 7 bits of the fractional part blend the two neighbouring
  * source samples. Output samples carry 7 fractional bits (src << 7).
  *
  * Sample indices are clamped to srcW-1: the previous code always read
  * src[xx+1], which is one byte past the end of the line whenever xx is the
  * last source sample. At the right edge the last sample is now replicated
  * instead.
  *
  * @param c        scaler context (unused here)
  * @param dst      output line, dstWidth entries
  * @param dstWidth number of output samples
  * @param src      input line, srcW valid samples
  * @param srcW     number of valid input samples
  * @param xInc     source step per output sample, 16.16 fixed point
  */
 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src, int srcW,
                                         int xInc)
 {
    /* index of the last valid source sample; for srcW <= 0 this wraps to a
       huge value, so no clamping happens and the old behaviour is kept */
    const unsigned int lastX= (unsigned int)srcW - 1;
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=FFMIN(xpos>>16, lastX);
        register unsigned int xx1=FFMIN(xx+1, lastX); // right neighbour, never past the line end
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx1] - src[xx])*xalpha;
        xpos+=xInc;
    }
 }

      // *** horizontal scale Y line to temp buffer
 /**
  * Horizontally scale one luma (isAlpha == 0) or alpha (isAlpha != 0) line.
  *
  * Steps, in order:
  *  1. advance src by the per-format byte offset (alpSrcOffset/lumSrcOffset);
  *  2. if the context has an input converter for this plane, unpack the line
  *     into formatConvBuffer and scale from there;
  *  3. scale with hScale16 if set, otherwise with hyscale_fast if set,
  *     otherwise with the generic hScale;
  *  4. for luma only, apply lumConvertRange if the context has one.
  *
  * @param c                scaler context supplying the function pointers
  * @param dst              output line, dstWidth entries
  * @param src              input line (before the plane offset is applied)
  * @param formatConvBuffer scratch line used when an input converter is set
  * @param pal              palette passed through to the input converter
  * @param isAlpha          selects the alpha plane handling
  */
 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                    const int16_t *hLumFilter,
                                    const int16_t *hLumFilterPos, int hLumFilterSize,
                                    uint8_t *formatConvBuffer,
                                    uint32_t *pal, int isAlpha)
 {
    void (*unpackLine)(uint8_t *, const uint8_t *, long, uint32_t *);
    void (*rangeFn)(int16_t *, int);
    const uint8_t *line;

    if (isAlpha) {
        unpackLine = c->alpToYV12;
        rangeFn    = NULL; // alpha is never range converted
        line       = src + c->alpSrcOffset;
    } else {
        unpackLine = c->lumToYV12;
        rangeFn    = c->lumConvertRange;
        line       = src + c->lumSrcOffset;
    }

    if (unpackLine) {
        unpackLine(formatConvBuffer, line, srcW, pal);
        line= formatConvBuffer;
    }

    if (c->hScale16) {
        c->hScale16(dst, dstWidth, (uint16_t*)line, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
    } else if (c->hyscale_fast) { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, line, srcW, xInc);
    } else {
        c->hScale(dst, dstWidth, line, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }

    if (rangeFn)
        rangeFn(dst, dstWidth);
 }

 /**
  * Fast bilinear horizontal scaling of one pair of 8-bit chroma lines.
  *
  * The source position advances by xInc (16.16 fixed point) per output
  * sample; the upper 7 bits of the fractional part (xalpha) blend the two
  * neighbouring source samples with the weights (xalpha^127) and xalpha.
  * The first chroma line is written to dst[0..dstWidth), the second to
  * dst[VOFW..VOFW+dstWidth).
  *
  * Sample indices are clamped to srcW-1: the previous code always read
  * src1[xx+1] / src2[xx+1], which is one byte past the end of the line
  * whenever xx is the last source sample. At the right edge the last sample
  * is now replicated instead.
  *
  * @param c        scaler context (unused here)
  * @param dst      output buffer holding both chroma halves
  * @param dstWidth number of output samples per chroma half
  * @param src1     first chroma input line, srcW valid samples
  * @param src2     second chroma input line, srcW valid samples
  * @param srcW     number of valid input samples per line
  * @param xInc     source step per output sample, 16.16 fixed point
  */
 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src1,
                                         const uint8_t *src2, int srcW, int xInc)
 {
    /* index of the last valid source sample; for srcW <= 0 this wraps to a
       huge value, so no clamping happens and the old behaviour is kept */
    const unsigned int lastX= (unsigned int)srcW - 1;
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=FFMIN(xpos>>16, lastX);
        register unsigned int xx1=FFMIN(xx+1, lastX); // right neighbour, never past the line end
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx1]*xalpha);
        xpos+=xInc;
    }
 }

 /**
  * Horizontally scale one pair of chroma lines.
  *
  * Steps, in order:
  *  1. advance both source pointers by chrSrcOffset;
  *  2. if the context has a chroma input converter, unpack both lines into
  *     formatConvBuffer (first line at offset 0, second at offset VOFW) and
  *     scale from there;
  *  3. scale with hScale16 if set, otherwise with hcscale_fast if set,
  *     otherwise with the generic hScale; the two results go to dst and
  *     dst+VOFW;
  *  4. apply chrConvertRange if the context has one.
  *
  * @param c                scaler context supplying the function pointers
  * @param dst              output buffer holding both chroma halves
  * @param src1             first chroma input line
  * @param src2             second chroma input line
  * @param formatConvBuffer scratch buffer used when an input converter is set
  * @param pal              palette passed through to the input converter
  */
 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                    int srcW, int xInc, const int16_t *hChrFilter,
                                    const int16_t *hChrFilterPos, int hChrFilterSize,
                                    uint8_t *formatConvBuffer,
                                    uint32_t *pal)
 {
    const uint8_t *line1 = src1 + c->chrSrcOffset;
    const uint8_t *line2 = src2 + c->chrSrcOffset;

    if (c->chrToYV12) {
        uint8_t *conv1 = formatConvBuffer;
        uint8_t *conv2 = formatConvBuffer+VOFW;
        c->chrToYV12(conv1, conv2, line1, line2, srcW, pal);
        line1= conv1;
        line2= conv2;
    }

    if (c->hScale16) {
        const int depthMinus1 = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
        c->hScale16(dst     , dstWidth, (uint16_t*)line1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, depthMinus1);
        c->hScale16(dst+VOFW, dstWidth, (uint16_t*)line2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, depthMinus1);
    } else if (c->hcscale_fast) { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, line1, line2, srcW, xInc);
    } else {
        c->hScale(dst     , dstWidth, line1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, line2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
 }

 /* Set to a non-zero value to log the ring-buffer bookkeeping. */
 #define DEBUG_SWSCALE_BUFFERS 0
 /* Debug logging helper; expects a variable named c (the logging context) to
  * be in scope at the point of use. Wrapped in do { } while (0) so that it
  * expands to exactly one statement and cannot capture a following else. */
 #define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)

 /**
  * Scale one horizontal slice of the source picture.
  *
  * Source lines of the slice are scaled horizontally into the ring buffers
  * lumPixBuf / chrPixBuf / alpPixBuf. As soon as every source line needed by
  * the vertical filters of an output line is buffered, that output line is
  * produced by one of the vertical scaler / output functions. When the slice
  * does not contain enough lines for the next output line, the available
  * lines are buffered and the function returns; the progress (dstY, ring
  * buffer indices, last buffered source lines) is stored in the context so
  * the next call continues from there.
  *
  * @param c          scaler context
  * @param src        source plane pointers; for packed source formats all
  *                   four entries are overwritten with src[0]
  * @param srcStride  source strides; modified in place (packed formats, and
  *                   the chroma strides are shifted left by c->vChrDrop)
  * @param srcSliceY  index of the first source line in this slice
  * @param srcSliceH  number of source lines in this slice
  * @param dst        destination plane pointers
  * @param dstStride  destination strides
  * @return number of destination lines advanced by this call
  */
 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    // slice position/height in chroma lines; the height is rounded up
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    // packed formats: let every plane pointer/stride alias plane 0
    // (this writes into the caller's arrays)
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    // multiply the chroma strides by 2^vChrDrop (also modifies the caller's array)
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    // warn (once per process, and only with SWS_PRINT_INFO) about destination strides that are not multiples of 8
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        // first slice of a picture: restart the ring buffers and the output position
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    // one iteration per destination line; left early when the slice runs out of source lines
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        // same, for the last luma output line that shares this chroma output line (clamped to dstH-1)
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            // not enough input: buffer everything up to the end of this slice, then stop (see break below)
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            // src1: luma line, src2: alpha line (plane 3), both addressed relative to the slice start
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            // the bookkeeping below still advances when chroma scaling is skipped
            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

        // Vertical scaling + output. All but the last two lines go through the
        // function pointers installed in the context; the last two lines use the
        // plain C functions (see the comment on the else branch).
        if (dstY < dstH-2) {
            // Pointers to the ring-buffer entries holding source line first*SrcY.
            // NOTE(review): the "+ v*BufSize" offset presumably relies on the pointer
            // arrays having 2*v*BufSize addressable entries, as the asserts suggest.
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                // remaining (packed) destination formats
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    // duplicate the first coefficient into both 16-bit halves of the filter words
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    // YUVA420P output without alpha buffers: fill the alpha rows advanced by this call with 255
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
 }

 /**
  * Install the function pointers and per-format settings of this template
  * instantiation into the scaler context.
  *
  * Sets the vertical scaler / output functions, the horizontal scaler, the
  * optional fast bilinear scalers, the input converters for chroma, luma and
  * alpha (chrToYV12 / lumToYV12 / alpToYV12), the per-format source byte
  * offsets, the range conversion functions and the needs_hcscale flag.
  * Fields that are only assigned conditionally here (hyscale_fast,
  * hcscale_fast, hScale16, the *SrcOffset fields, the *ConvertRange
  * pointers, needs_hcscale) keep whatever value the context already holds
  * when the condition is not met.
  *
  * @param c scaler context; srcFormat, dstFormat, flags, chrSrcHSubSample,
  *          alpPixBuf, srcRange and dstRange are read
  */
 static void RENAME(sws_init_swScale)(SwsContext *c)
 {
    enum PixelFormat srcFormat = c->srcFormat;

    // vertical scaling / output functions of this template instantiation
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

    // hyscale()/hcscale() take the fast path whenever these pointers are non-NULL
    if (c->flags & SWS_FAST_BILINEAR)
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    // chroma input converters for packed YUV, NV12/NV21 and palette-style formats,
    // plus the 16-bit horizontal scaler for 9/10/16-bit sources
    // NOTE(review): hScale16X is presumably the variant for non-native byte order - confirm
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_GRAY16BE :
        case PIX_FMT_YUV420P9BE:
        case PIX_FMT_YUV422P10BE:
        case PIX_FMT_YUV420P10BE:
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16) : RENAME(hScale16X); break;
        case PIX_FMT_GRAY16LE :
        case PIX_FMT_YUV420P9LE:
        case PIX_FMT_YUV422P10LE:
        case PIX_FMT_YUV420P10LE:
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16X) : RENAME(hScale16); break;
    }
    // chroma input converters for RGB sources: the *_half variants are used when
    // chrSrcHSubSample is set. Note that PIX_FMT_RGB32* maps to the bgr32* helpers
    // and PIX_FMT_BGR32* to the rgb32* helpers.
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    // luma input converters; formats not listed keep NULL, in which case
    // hyscale() scales directly from the source line
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_GRAY8A   :
                            c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
                            c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY;  break;
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY;  break;
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    case PIX_FMT_BGR48BE:
    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY; break;
    }
    // alpha input converters are only installed when alpha line buffers exist
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
        case PIX_FMT_PAL8   : c->alpToYV12 = palToA; break;
        }
    }

    // byte offsets that hyscale()/hcscale() add to the source pointers before converting
    switch (srcFormat) {
    case PIX_FMT_GRAY8A :
        c->alpSrcOffset = 1;
        break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB48LE:
    case PIX_FMT_BGR48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    // range conversion is only needed when source and destination range flags
    // differ and the destination is not an RGB format
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    // chroma needs horizontal scaling unless source or destination is gray or the source is monochrome
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
 }
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -94,6 +94,7 @@ adjustment.
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/cpu.h"

 #undef PROFILE_THE_BEAST
 #undef INC_SCALING
@@ -692,7 +693,7 @@ static int altivec_uyvy_rgb32 (SwsContext *c,
 */
 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
 {
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return NULL;

    /*
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -116,12 +116,11 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t
 32-bit C version, and and&add trick by Michael Niedermayer
 */

 void sws_rgb2rgb_init(int flags)
 void sws_rgb2rgb_init(void)
 {
    rgb2rgb_init_c();
 #if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
    rgb2rgb_init_x86(flags);
 #endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
    if (HAVE_MMX)
        rgb2rgb_init_x86();
 }

 #if LIBSWSCALE_VERSION_MAJOR < 1
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -166,8 +166,8 @@ extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const u
                            long width, long height,
                            long lumStride, long chromStride, long srcStride);

 void sws_rgb2rgb_init(int flags);
 void sws_rgb2rgb_init(void);

 void rgb2rgb_init_x86(int flags);
 void rgb2rgb_init_x86(void);

 #endif /* SWSCALE_RGB2RGB_H */
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -278,25 +278,6 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, long src_siz
    }
 }

 /*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
 #define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

 static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *end;
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -63,6 +63,7 @@ untested special converters
 #include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/x86_cpu.h"
 #include "libavutil/cpu.h"
 #include "libavutil/avutil.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/bswap.h"
@@ -71,10 +72,6 @@ untested special converters
 #undef MOVNTQ
 #undef PAVGB

 //#undef HAVE_MMX2
 //#define HAVE_AMD3DNOW
 //#undef HAVE_MMX
 //#undef ARCH_X86
 #define DITHER1XBPP

 #define isPacked(x)         (       \
@@ -1262,57 +1259,13 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin

 //Note: we have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW+MMX2 one
 //Plain C versions
 #if CONFIG_RUNTIME_CPUDETECT
 #  define COMPILE_C 1
 #  if   ARCH_X86
 #    define COMPILE_MMX     1
 #    define COMPILE_MMX2    1
 #    define COMPILE_3DNOW   1
 #  elif ARCH_PPC
 #    define COMPILE_ALTIVEC HAVE_ALTIVEC
 #  endif
 #else /* CONFIG_RUNTIME_CPUDETECT */
 #  if   ARCH_X86
 #    if   HAVE_MMX2
 #      define COMPILE_MMX2  1
 #    elif HAVE_AMD3DNOW
 #      define COMPILE_3DNOW 1
 #    elif HAVE_MMX
 #      define COMPILE_MMX   1
 #    else
 #      define COMPILE_C     1
 #    endif
 #  elif ARCH_PPC && HAVE_ALTIVEC
 #    define COMPILE_ALTIVEC 1
 #  else
 #    define COMPILE_C       1
 #  endif
 #endif

 #ifndef COMPILE_C
 #  define COMPILE_C 0
 #endif
 #ifndef COMPILE_MMX
 #  define COMPILE_MMX 0
 #endif
 #ifndef COMPILE_MMX2
 #  define COMPILE_MMX2 0
 #endif
 #ifndef COMPILE_3DNOW
 #  define COMPILE_3DNOW 0
 #endif
 #ifndef COMPILE_ALTIVEC
 #  define COMPILE_ALTIVEC 0
 #endif

 #define COMPILE_TEMPLATE_MMX 0
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_ALTIVEC 0

 #include "swscale_template.c"

 #if COMPILE_ALTIVEC
 #if HAVE_ALTIVEC
 #undef RENAME
 #undef COMPILE_TEMPLATE_ALTIVEC
 #define COMPILE_TEMPLATE_ALTIVEC 1
@@ -1320,90 +1273,42 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #include "ppc/swscale_template.c"
 #endif

 #if ARCH_X86

 //MMX versions
 #if COMPILE_MMX
 #if HAVE_MMX
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
 #define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
 #include "x86/swscale_template.c"
 #endif

 //MMX2 versions
 #if COMPILE_MMX2
 #if HAVE_MMX2
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
 #define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 1
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX2
 #include "x86/swscale_template.c"
 #endif

 //3DNOW versions
 #if COMPILE_3DNOW
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
 #define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNow
 #include "x86/swscale_template.c"
 #endif

 #endif //ARCH_X86

 SwsFunc ff_getSwsFunc(SwsContext *c)
 {
    int cpu_flags = av_get_cpu_flags();

    sws_init_swScale_c(c);

 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
    // ordered per speed fastest first
    if (c->flags & SWS_CPU_CAPS_MMX2) {
        sws_init_swScale_MMX2(c);
        return swScale_MMX2;
    } else if (c->flags & SWS_CPU_CAPS_3DNOW) {
        sws_init_swScale_3DNow(c);
        return swScale_3DNow;
    } else if (c->flags & SWS_CPU_CAPS_MMX) {
 #if HAVE_MMX
    if (cpu_flags & AV_CPU_FLAG_MMX)
        sws_init_swScale_MMX(c);
        return swScale_MMX;
    }

 #else
 #if COMPILE_ALTIVEC
    if (c->flags & SWS_CPU_CAPS_ALTIVEC) {
        sws_init_swScale_altivec(c);
        return swScale_altivec;
    }
 #endif
 #endif /* ARCH_X86 */
 #else //CONFIG_RUNTIME_CPUDETECT
 #if   COMPILE_TEMPLATE_MMX2
    sws_init_swScale_MMX2(c);
    return swScale_MMX2;
 #elif COMPILE_TEMPLATE_AMD3DNOW
    sws_init_swScale_3DNow(c);
    return swScale_3DNow;
 #elif COMPILE_TEMPLATE_MMX
    sws_init_swScale_MMX(c);
    return swScale_MMX;
 #elif COMPILE_TEMPLATE_ALTIVEC
    sws_init_swScale_altivec(c);
    return swScale_altivec;
 #if HAVE_MMX2
    if (cpu_flags & AV_CPU_FLAG_MMX2)
        sws_init_swScale_MMX2(c);
 #endif
 #if HAVE_ALTIVEC
    if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
        sws_init_swScale_altivec(c);
 #endif
 #endif //!CONFIG_RUNTIME_CPUDETECT

    return swScale_c;
 }
@@ -1900,23 +1805,6 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
    return srcSliceH;
 }

 /**
  * Report the SWS_CPU_CAPS_* bits that correspond to the template variant
  * selected at compile time.
  *
  * The COMPILE_TEMPLATE_* macros and ARCH_BFIN are tested in a fixed order
  * (MMX2, AMD3DNOW, MMX, ALTIVEC, BFIN) and only the first enabled one
  * contributes. The MMX2 and AMD3DNOW variants also report the plain MMX bit.
  *
  * @return bitmask of SWS_CPU_CAPS_* flags, or 0 when none of the tested
  *         macros is enabled
  */
 int ff_hardcodedcpuflags(void)
 {
 #if   COMPILE_TEMPLATE_MMX2
    return SWS_CPU_CAPS_MMX | SWS_CPU_CAPS_MMX2;
 #elif COMPILE_TEMPLATE_AMD3DNOW
    return SWS_CPU_CAPS_MMX | SWS_CPU_CAPS_3DNOW;
 #elif COMPILE_TEMPLATE_MMX
    return SWS_CPU_CAPS_MMX;
 #elif COMPILE_TEMPLATE_ALTIVEC
    return SWS_CPU_CAPS_ALTIVEC;
 #elif ARCH_BFIN
    return SWS_CPU_CAPS_BFIN;
 #else
    return 0;
 #endif
 }

 void ff_get_unscaled_swscale(SwsContext *c)
 {
    const enum PixelFormat srcFormat = c->srcFormat;
@@ -2000,8 +1888,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
    if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
        c->swScale= uyvyToYuv422Wrapper;

 #if COMPILE_ALTIVEC
    if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
 #if HAVE_ALTIVEC
    if ((av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) &&
        !(c->flags & SWS_BITEXACT) &&
        srcFormat == PIX_FMT_YUV420P) {
        // unscaled YV12 -> packed YUV, we want speed
@@ -2031,8 +1919,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
            c->swScale= planarCopyWrapper;
    }
 #if ARCH_BFIN
    if (flags & SWS_CPU_CAPS_BFIN)
        ff_bfin_get_unscaled_swscale (c);
    ff_bfin_get_unscaled_swscale (c);
 #endif
 }

--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -95,13 +95,6 @@ const char *swscale_license(void);
 #define SWS_ACCURATE_RND      0x40000
 #define SWS_BITEXACT          0x80000

 #define SWS_CPU_CAPS_MMX      0x80000000
 #define SWS_CPU_CAPS_MMX2     0x20000000
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
 #define SWS_CPU_CAPS_BFIN     0x01000000
 #define SWS_CPU_CAPS_SSE2     0x02000000

 #define SWS_MAX_REDUCE_CUTOFF 0.002

 #define SWS_CS_ITU709         1
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -481,11 +481,6 @@ extern const AVClass sws_context_class;
 */
 void ff_get_unscaled_swscale(SwsContext *c);

 /**
 * Returns the SWS_CPU_CAPS for the optimized code compiled into swscale.
 */
 int ff_hardcodedcpuflags(void);

 /**
 * Returns function pointer to fastest main scaler path function depending
 * on architecture and available optimizations.
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -363,153 +363,11 @@ static inline void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
    }
 }

 static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
 static inline void hScale16_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
 {
    int i, j;
 #if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd                   %5, %%mm7       \n\t"
 #if defined(PIC)
            "push            %%"REG_b"              \n\t"
 #endif
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ".p2align                4              \n\t"
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movq      (%3, %%"REG_a", 2), %%mm0    \n\t"
            "movq      (%3, %%"REG_b", 2), %%mm2    \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad               %%mm7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
 #if defined(PIC)
            "pop             %%"REG_b"              \n\t"
 #endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
 #if !defined(PIC)
            : "%"REG_b
 #endif
        );
    } else if (filterSize==8 && shift<15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd                   %5, %%mm7       \n\t"
 #if defined(PIC)
            "push            %%"REG_b"              \n\t"
 #endif
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ".p2align                 4             \n\t"
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movq       (%3, %%"REG_a", 2), %%mm0   \n\t"
            "movq       (%3, %%"REG_b", 2), %%mm2   \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movq      8(%3, %%"REG_a", 2), %%mm4   \n\t"
            "movq      8(%3, %%"REG_b", 2), %%mm2   \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                %%mm7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
 #if defined(PIC)
            "pop             %%"REG_b"              \n\t"
 #endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
 #if !defined(PIC)
            : "%"REG_b
 #endif
        );
    } else if (shift<15){
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd                   %7, %%mm7       \n\t"
            ".p2align                  4            \n\t"
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0     \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $8, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                 %%mm7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
 #endif

    for (i=0; i<dstW; i++) {
        int srcPos= filterPos[i];
        int val=0;
@@ -520,7 +378,7 @@ static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src,
    }
 }

 static inline void RENAME(hScale16X)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
 static inline void hScale16X_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
 {
    int i, j;
@@ -660,6 +518,11 @@ inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth,
 #define DEBUG_SWSCALE_BUFFERS 0
 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)

 #if HAVE_MMX
 static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
                                  int lastInLumBuf, int lastInChrBuf);
 #endif

 static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
                     int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
 {
@@ -831,6 +694,9 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

 #if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
 #endif
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
@@ -955,6 +821,12 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

 #if HAVE_MMX2
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
 #endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
@@ -1001,14 +873,14 @@ static void sws_init_swScale_c(SwsContext *c)
        case PIX_FMT_YUV420P10BE:
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16) : RENAME(hScale16X); break;
        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? hScale16_c : hScale16X_c; break;
        case PIX_FMT_GRAY16LE :
        case PIX_FMT_YUV420P9LE:
        case PIX_FMT_YUV422P10LE:
        case PIX_FMT_YUV420P10LE:
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16X) : RENAME(hScale16); break;
        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? hScale16X_c : hScale16_c; break;
    }
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -185,7 +185,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
 }

 static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
                      int srcW, int dstW, int filterAlign, int one, int flags,
                      int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags,
                      SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
 {
    int i;
@@ -196,10 +196,8 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
    int64_t *filter2=NULL;
    const int64_t fone= 1LL<<54;
    int ret= -1;
 #if ARCH_X86
    if (flags & SWS_CPU_CAPS_MMX)
        __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
 #endif

    emms_c(); //FIXME this should not be required but it IS (even for non-MMX versions)

    // NOTE: the +1 is for the MMX scaler which reads over the end
    FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+1)*sizeof(int16_t), fail);
@@ -416,7 +414,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
        if (min>minFilterSize) minFilterSize= min;
    }

    if (flags & SWS_CPU_CAPS_ALTIVEC) {
    if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) {
        // we can handle the special case 4,
        // so we don't want to go to the full 8
        if (minFilterSize < 5)
@@ -431,7 +429,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
            filterAlign = 1;
    }

    if (flags & SWS_CPU_CAPS_MMX) {
    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
        // special case for unscaled vertical filtering
        if (minFilterSize == 1 && filterAlign == 2)
            filterAlign= 1;
@@ -521,7 +519,7 @@ fail:
    return ret;
 }

 #if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
 #if HAVE_MMX2
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
    uint8_t *fragmentA;
@@ -679,7 +677,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *fil

    return fragmentPos + 1;
 }
 #endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
 #endif /* HAVE_MMX2 */

 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@@ -687,8 +685,6 @@ static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
    *v = av_pix_fmt_descriptors[format].log2_chroma_h;
 }

 static int update_flags_cpu(int flags);

 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation)
 {
    memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
@@ -703,15 +699,12 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange

    c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->dstFormat]);
    c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->srcFormat]);
    c->flags = update_flags_cpu(c->flags);

    ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
    //FIXME factorize

 #if HAVE_ALTIVEC
    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
    if (HAVE_ALTIVEC && av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)
        ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness, contrast, saturation);
 #endif
    return 0;
 }

@@ -741,27 +734,6 @@ static int handle_jpeg(enum PixelFormat *format)
    }
 }

 /**
  * Adjust the SWS_CPU_CAPS_* bits of a scaler flags word.
  *
  * With CONFIG_RUNTIME_CPUDETECT enabled, a capability bit is added for each
  * of SSE2, MMX, MMX2 and 3DNOW that av_get_cpu_flags() reports; bits already
  * present in flags are left untouched.
  *
  * Without it, all SWS_CPU_CAPS_* bits passed in by the caller are discarded
  * and replaced by the value of ff_hardcodedcpuflags(), so that the flags
  * match the variant that was compiled in.
  *
  * @param flags scaler flags, possibly carrying SWS_CPU_CAPS_* bits
  * @return the flags with the CPU capability bits updated; all other bits
  *         are passed through unchanged
  */
 static int update_flags_cpu(int flags)
 {
 #if CONFIG_RUNTIME_CPUDETECT
    int detected = av_get_cpu_flags();

    if (detected & AV_CPU_FLAG_SSE2)
        flags |= SWS_CPU_CAPS_SSE2;
    if (detected & AV_CPU_FLAG_MMX)
        flags |= SWS_CPU_CAPS_MMX;
    if (detected & AV_CPU_FLAG_MMX2)
        flags |= SWS_CPU_CAPS_MMX2;
    if (detected & AV_CPU_FLAG_3DNOW)
        flags |= SWS_CPU_CAPS_3DNOW;
 #else /* !CONFIG_RUNTIME_CPUDETECT */
    /* ensure that the flags match the compiled variant if cpudetect is off */
    flags &= ~( SWS_CPU_CAPS_BFIN
               |SWS_CPU_CAPS_ALTIVEC
               |SWS_CPU_CAPS_SSE2
               |SWS_CPU_CAPS_3DNOW
               |SWS_CPU_CAPS_MMX2
               |SWS_CPU_CAPS_MMX);
    flags |= ff_hardcodedcpuflags();
 #endif /* CONFIG_RUNTIME_CPUDETECT */
    return flags;
 }

 SwsContext *sws_alloc_context(void)
 {
    SwsContext *c= av_mallocz(sizeof(SwsContext));
@@ -782,16 +754,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
    int srcH= c->srcH;
    int dstW= c->dstW;
    int dstH= c->dstH;
    int flags;
    int flags, cpu_flags;
    enum PixelFormat srcFormat= c->srcFormat;
    enum PixelFormat dstFormat= c->dstFormat;

    flags= c->flags = update_flags_cpu(c->flags);
 #if ARCH_X86
    if (flags & SWS_CPU_CAPS_MMX)
        __asm__ volatile("emms\n\t"::: "memory");
 #endif
    if (!rgb15to16) sws_rgb2rgb_init(flags);
    cpu_flags = av_get_cpu_flags();
    flags     = c->flags;
    emms_c();
    if (!rgb15to16) sws_rgb2rgb_init();

    unscaled = (srcW == dstW && srcH == dstH);

@@ -884,7 +854,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
        }
    }

    if (flags & SWS_CPU_CAPS_MMX2) {
    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
        c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
        if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
            if (flags&SWS_PRINT_INFO)
@@ -910,7 +880,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
            c->chrXInc+= 20;
        }
        //we don't use the x86 asm scaler if MMX is available
        else if (flags & SWS_CPU_CAPS_MMX) {
        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
            c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
            c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
        }
@@ -918,7 +888,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)

    /* precalculate horizontal scaler filter coefficients */
    {
 #if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
 #if HAVE_MMX2
 // can't downscale !!!
        if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
            c->lumMmx2FilterCodeSize = initMMX2HScaler(      dstW, c->lumXInc, NULL, NULL, NULL, 8);
@@ -954,21 +924,21 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
            mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
 #endif
        } else
 #endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
 #endif /* HAVE_MMX2 */
        {
            const int filterAlign=
                (flags & SWS_CPU_CAPS_MMX) ? 4 :
                (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
                (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
                (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
                1;

            if (initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                           srcW      ,       dstW, filterAlign, 1<<14,
                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
                           srcFilter->lumH, dstFilter->lumH, c->param) < 0)
                goto fail;
            if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
                           c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
                           srcFilter->chrH, dstFilter->chrH, c->param) < 0)
                goto fail;
        }
@@ -977,18 +947,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
    /* precalculate vertical scaler filter coefficients */
    {
        const int filterAlign=
            (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
            (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
            (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
            (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
            1;

        if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
                       srcH      ,        dstH, filterAlign, (1<<12),
                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
                       srcFilter->lumV, dstFilter->lumV, c->param) < 0)
            goto fail;
        if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
                       c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
                       srcFilter->chrV, dstFilter->chrV, c->param) < 0)
            goto fail;

@@ -1082,13 +1052,13 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 #endif
               sws_format_name(dstFormat));

        if      (flags & SWS_CPU_CAPS_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
        else if (flags & SWS_CPU_CAPS_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
        else if (flags & SWS_CPU_CAPS_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
        else if (flags & SWS_CPU_CAPS_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
        if      (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
        else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
        else if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
        else if (HAVE_ALTIVEC  && cpu_flags & AV_CPU_FLAG_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
        else                                   av_log(c, AV_LOG_INFO, "using C\n");

        if (flags & SWS_CPU_CAPS_MMX) {
        if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
            if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
                av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
            else {
@@ -1107,7 +1077,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                    av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n");
            }
        } else {
 #if ARCH_X86
 #if HAVE_MMX
            av_log(c, AV_LOG_VERBOSE, "using x86 asm scaler for horizontal scaling\n");
 #else
            if (flags & SWS_FAST_BILINEAR)
@@ -1118,31 +1088,41 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
        }
        if (isPlanarYUV(dstFormat)) {
            if (c->vLumFilterSize==1)
                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n",
                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
            else
                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n",
                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
        } else {
            if (c->vLumFilterSize==1 && c->vChrFilterSize==2)
                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
                       "      2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
                       "      2-tap scaler for vertical chrominance scaling (BGR)\n",
                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
            else if (c->vLumFilterSize==2 && c->vChrFilterSize==2)
                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n",
                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
            else
                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n",
                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
        }

        if (dstFormat==PIX_FMT_BGR24)
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 converter\n",
                   (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
                   (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) ? "MMX2" :
                   ((HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C"));
        else if (dstFormat==PIX_FMT_RGB32)
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n",
                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
        else if (dstFormat==PIX_FMT_BGR565)
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n",
                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
        else if (dstFormat==PIX_FMT_BGR555)
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n",
                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
        else if (dstFormat == PIX_FMT_RGB444BE || dstFormat == PIX_FMT_RGB444LE ||
                 dstFormat == PIX_FMT_BGR444BE || dstFormat == PIX_FMT_BGR444LE)
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n",
                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");

        av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
        av_log(c, AV_LOG_DEBUG, "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
@@ -1527,7 +1507,7 @@ void sws_freeContext(SwsContext *c)
    av_freep(&c->hLumFilterPos);
    av_freep(&c->hChrFilterPos);

 #if ARCH_X86
 #if HAVE_MMX
 #ifdef MAP_ANONYMOUS
    if (c->lumMmx2FilterCode) munmap(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize);
    if (c->chrMmx2FilterCode) munmap(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize);
@@ -1540,7 +1520,7 @@ void sws_freeContext(SwsContext *c)
 #endif
    c->lumMmx2FilterCode=NULL;
    c->chrMmx2FilterCode=NULL;
 #endif /* ARCH_X86 */
 #endif /* HAVE_MMX */

    av_freep(&c->yuvTable);

@@ -1557,8 +1537,6 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context,
    if (!param)
        param = default_param;

    flags = update_flags_cpu(flags);

    if (context &&
        (context->srcW      != srcW      ||
         context->srcH      != srcH      ||
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -27,6 +27,7 @@

 #include "config.h"
 #include "libavutil/x86_cpu.h"
 #include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
@@ -122,16 +123,16 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 32-bit C version, and and&add trick by Michael Niedermayer
 */

 void rgb2rgb_init_x86(int flags)
 void rgb2rgb_init_x86(void)
 {
 #if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
    if (flags & SWS_CPU_CAPS_SSE2)
        rgb2rgb_init_SSE2();
    else if (flags & SWS_CPU_CAPS_MMX2)
        rgb2rgb_init_MMX2();
    else if (flags & SWS_CPU_CAPS_3DNOW)
        rgb2rgb_init_3DNOW();
    else if (flags & SWS_CPU_CAPS_MMX)
    int cpu_flags = av_get_cpu_flags();

    if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)
        rgb2rgb_init_MMX();
 #endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
    if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
        rgb2rgb_init_3DNOW();
    if (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)
        rgb2rgb_init_MMX2();
    if (HAVE_SSE      && cpu_flags & AV_CPU_FLAG_SSE2)
        rgb2rgb_init_SSE2();
 }
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
--- a/libswscale/x86/yuv2rgb_mmx.c
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -34,6 +34,7 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/x86_cpu.h"
 #include "libavutil/cpu.h"

 #define DITHER1XBPP // only for MMX

@@ -46,57 +47,58 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
 DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;

 //MMX versions
 /*
  * First inclusion of yuv2rgb_template.c: RENAME(a) expands to a##_MMX, so
  * the template's functions get an _MMX suffix, and COMPILE_TEMPLATE_MMX2 is
  * set to 0 for this pass.
  *
  * NOTE(review): this is a diff hunk shown without +/- markers. The lines
  * overriding HAVE_MMX2 / HAVE_AMD3DNOW look like the removed approach and
  * the COMPILE_TEMPLATE_MMX2 lines like its replacement -- confirm upstream.
  */
 #if HAVE_MMX
 #undef RENAME
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
 #undef COMPILE_TEMPLATE_MMX2
 #define COMPILE_TEMPLATE_MMX2 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"
 #endif /* HAVE_MMX */

 //MMX2 versions
 /*
  * Second inclusion of the same template: RENAME(a) now expands to a##_MMX2
  * and COMPILE_TEMPLATE_MMX2 is 1. RENAME and COMPILE_TEMPLATE_MMX2 are
  * #undef'ed first so the values from the previous pass can be redefined.
  */
 #if HAVE_MMX2
 #undef RENAME
 #undef HAVE_MMX2
 #define HAVE_MMX2 1
 #undef COMPILE_TEMPLATE_MMX2
 #define COMPILE_TEMPLATE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"
 #endif /* HAVE_MMX2 */

 SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 {
    if (c->flags & SWS_CPU_CAPS_MMX2) {
    int cpu_flags = av_get_cpu_flags();

    if (c->srcFormat != PIX_FMT_YUV420P &&
        c->srcFormat != PIX_FMT_YUVA420P)
        return NULL;

    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
                break;
            } else return yuv420_rgb32_MMX2;
        case PIX_FMT_BGR32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_bgr32_MMX2;
                break;
            } else return yuv420_bgr32_MMX2;
        case PIX_FMT_RGB24:  return yuv420_rgb24_MMX2;
        case PIX_FMT_BGR24:  return yuv420_bgr24_MMX2;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
        }
    }
    if (c->flags & SWS_CPU_CAPS_MMX) {

    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_rgb32_MMX;
                break;
            } else return yuv420_rgb32_MMX;
        case PIX_FMT_BGR32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_bgr32_MMX;
                break;
            } else return yuv420_bgr32_MMX;
        case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
        case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
 #if HAVE_7REGS
                    return yuva420_rgb32_MMX;
 #endif
                    break;
                } else return yuv420_rgb32_MMX;
            case PIX_FMT_BGR32:
                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
 #if HAVE_7REGS
                    return yuva420_bgr32_MMX;
 #endif
                    break;
                } else return yuv420_bgr32_MMX;
            case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
            case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
            case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
            case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
        }
    }

--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -25,14 +25,7 @@
 #undef EMMS
 #undef SFENCE

 #if HAVE_AMD3DNOW
 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
 #define EMMS   "femms"
 #else
 #define EMMS   "emms"
 #endif

 #if HAVE_MMX2
 #if COMPILE_TEMPLATE_MMX2
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -159,7 +152,8 @@
    }                                                             \

 #define YUV2RGB_ENDFUNC                          \
    __asm__ volatile (SFENCE"\n\t"EMMS);         \
    __asm__ volatile (SFENCE"\n\t"               \
                    "emms    \n\t");             \
    return srcSliceH;                            \

 #define IF0(x)
@@ -188,6 +182,7 @@
    "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
    "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \

 #if !COMPILE_TEMPLATE_MMX2
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
@@ -243,6 +238,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_OPERANDS
    YUV2RGB_ENDFUNC
 }
 #endif /* !COMPILE_TEMPLATE_MMX2 */

 #define RGB_PACK24(blue, red)\
    "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
@@ -259,7 +255,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
    "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
    RGB_PACK24_B

 #if HAVE_MMX2
 #if COMPILE_TEMPLATE_MMX2
 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
@@ -366,6 +362,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
    MOVNTQ "   %%mm5,       16(%1)\n\t"      \
    MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \

 #if !COMPILE_TEMPLATE_MMX2
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
                                       int srcSliceY, int srcSliceH,
@@ -386,12 +383,12 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_ENDFUNC
 }

 #if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
 static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
 #if HAVE_7REGS
    int y, h_size;

    YUV2RGB_LOOP(4)
@@ -406,9 +403,8 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS_ALPHA
    YUV2RGB_ENDFUNC
 #endif
    return 0;
 }
 #endif

 static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                       int srcStride[],
@@ -430,12 +426,12 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_ENDFUNC
 }

 #if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
 static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
 #if HAVE_7REGS
    int y, h_size;

    YUV2RGB_LOOP(4)
@@ -450,6 +446,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS_ALPHA
    YUV2RGB_ENDFUNC
 #endif
    return 0;
 }
 #endif

 #endif /* !COMPILE_TEMPLATE_MMX2 */
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -32,7 +32,7 @@
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "libavutil/x86_cpu.h"
 #include "libavutil/cpu.h"
 #include "libavutil/bswap.h"

 extern const uint8_t dither_4x4_16[4][8];
@@ -579,24 +579,18 @@ CLOSEYUV2RGBFUNC(1)
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 {
    SwsFunc t = NULL;
 #if HAVE_MMX
     t = ff_yuv2rgb_init_mmx(c);
 #endif
 #if HAVE_VIS
    t = ff_yuv2rgb_init_vis(c);
 #endif
 #if CONFIG_MLIB
    t = ff_yuv2rgb_init_mlib(c);
 #endif
 #if HAVE_ALTIVEC
    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
        t = ff_yuv2rgb_init_altivec(c);
 #endif

 #if ARCH_BFIN
    if (c->flags & SWS_CPU_CAPS_BFIN)
    if (HAVE_MMX) {
        t = ff_yuv2rgb_init_mmx(c);
    } else if (HAVE_VIS) {
        t = ff_yuv2rgb_init_vis(c);
    } else if (CONFIG_MLIB) {
        t = ff_yuv2rgb_init_mlib(c);
    } else if (HAVE_ALTIVEC) {
        t = ff_yuv2rgb_init_altivec(c);
    } else if (ARCH_BFIN) {
        t = ff_yuv2rgb_get_func_ptr_bfin(c);
 #endif
    }

    if (t)
        return t;