Cosmetics:

- Place curly brackets on the same line as while/for/if/switch/else/do.
- Place the opening curly bracket of a function body at column 0 on the line after the function signature.

Originally committed as revision 29523 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
16 years ago · dd68318cee
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
    sws_rgb2rgb_init(cpu_caps);
    for(funcNum=0; ; funcNum++){
    for(funcNum=0; ; funcNum++) {
        struct func_info_s {
            int src_bpp;
            int dst_bpp;
@@ -118,13 +118,13 @@ int main(int argc, char **argv)
        av_log(NULL, AV_LOG_INFO,".");
        memset(srcBuffer, srcByte, SIZE);
        for(width=63; width>0; width--){
        for(width=63; width>0; width--) {
            int dstOffset;
            for(dstOffset=128; dstOffset<196; dstOffset+=4){
            for(dstOffset=128; dstOffset<196; dstOffset+=4) {
                int srcOffset;
                memset(dstBuffer, dstByte, SIZE);
                for(srcOffset=128; srcOffset<196; srcOffset+=4){
                for(srcOffset=128; srcOffset<196; srcOffset+=4) {
                    uint8_t *src= srcBuffer+srcOffset;
                    uint8_t *dst= dstBuffer+dstOffset;
                    const char *name=NULL;
@@ -139,24 +139,24 @@ int main(int argc, char **argv)
                    if(!srcBpp) break;
                    for(i=0; i<SIZE; i++){
                        if(srcBuffer[i]!=srcByte){
                    for(i=0; i<SIZE; i++) {
                        if(srcBuffer[i]!=srcByte) {
                            av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=0; i<dstOffset; i++){
                        if(dstBuffer[i]!=dstByte){
                    for(i=0; i<dstOffset; i++) {
                        if(dstBuffer[i]!=dstByte) {
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=dstOffset + width*dstBpp; i<SIZE; i++){
                        if(dstBuffer[i]!=dstByte){
                    for(i=dstOffset + width*dstBpp; i<SIZE; i++) {
                        if(dstBuffer[i]!=dstByte) {
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
--- a/libswscale/mlib/yuv2rgb_mlib.c
+++ b/libswscale/mlib/yuv2rgb_mlib.c
@@ -31,8 +31,9 @@
 #include "libswscale/swscale.h"
 static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == PIX_FMT_YUV422P){
                               int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    if(c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }
@@ -45,8 +46,9 @@ static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], i
 }
 static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == PIX_FMT_YUV422P){
                               int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    if(c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }
@@ -59,8 +61,9 @@ static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], i
 }
 static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                              int srcSliceH, uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == PIX_FMT_YUV422P){
                              int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    if(c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }
@@ -75,7 +78,7 @@ static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], in
 SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c)
 {
    switch(c->dstFormat){
    switch(c->dstFormat) {
    case PIX_FMT_RGB24: return mlib_YUV2RGB420_24;
    case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32;
    case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32;
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -23,7 +23,8 @@
 #include "swscale.h"
 #include "swscale_internal.h"
 static const char * sws_context_to_name(void * ptr) {
 static const char * sws_context_to_name(void * ptr)
 {
    return "swscaler";
 }
--- a/libswscale/ppc/swscale_altivec_template.c
+++ b/libswscale/ppc/swscale_altivec_template.c
@@ -24,7 +24,8 @@
 #define vzero vec_splat_s32(0)
 static inline void
 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
 {
    register int i;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
@@ -389,7 +390,8 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW,
 }
 static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[])
 {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
@@ -466,7 +468,8 @@ static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int
 }
 static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[])
 {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -714,7 +714,7 @@ SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
        if ((c->srcH & 0x1) != 0)
            return NULL;
        switch(c->dstFormat){
        switch(c->dstFormat) {
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
@@ -738,7 +738,7 @@ SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
        break;
    case PIX_FMT_UYVY422:
        switch(c->dstFormat){
        switch(c->dstFormat) {
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
@@ -800,7 +800,7 @@ ff_yuv2packedX_altivec(SwsContext *c,
    out = (vector unsigned char *)dest;
    for (i=0; i<dstW; i+=16){
    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -196,7 +196,8 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 32-bit C version, and and&add trick by Michael Niedermayer
 */
 void sws_rgb2rgb_init(int flags){
 void sws_rgb2rgb_init(int flags)
 {
 #if (HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX)  && CONFIG_GPL
    if (flags & SWS_CPU_CAPS_MMX2)
        rgb2rgb_init_MMX2();
@@ -227,8 +228,7 @@ void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const
 {
    long i;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        //FIXME slow?
        dst[0]= palette[src[i]*4+0];
        dst[1]= palette[src[i]*4+1];
@@ -273,8 +273,7 @@ void rgb32to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
    long i;
    long num_pixels = src_size >> 2;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        #if HAVE_BIGENDIAN
            /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
            dst[3*i + 0] = src[4*i + 1];
@@ -291,8 +290,7 @@ void rgb32to24(const uint8_t *src, uint8_t *dst, long src_size)
 void rgb24to32(const uint8_t *src, uint8_t *dst, long src_size)
 {
    long i;
    for (i=0; 3*i<src_size; i++)
    {
    for (i=0; 3*i<src_size; i++) {
        #if HAVE_BIGENDIAN
            /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */
            dst[4*i + 0] = 255;
@@ -314,8 +312,7 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        #if HAVE_BIGENDIAN
@@ -338,8 +335,7 @@ void rgb16to24(const uint8_t *src, uint8_t *dst, long src_size)
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0xF800)>>8;
@@ -353,8 +349,7 @@ void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
    long i;
    long num_pixels = src_size >> 1;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        unsigned rgb = ((const uint16_t*)src)[i];
        ((uint16_t*)dst)[i] = (rgb>>11) | (rgb&0x7E0) | (rgb<<11);
    }
@@ -365,8 +360,7 @@ void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
    long i;
    long num_pixels = src_size >> 1;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        unsigned rgb = ((const uint16_t*)src)[i];
        ((uint16_t*)dst)[i] = (rgb>>11) | ((rgb&0x7C0)>>1) | ((rgb&0x1F)<<10);
    }
@@ -378,8 +372,7 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        #if HAVE_BIGENDIAN
@@ -402,8 +395,7 @@ void rgb15to24(const uint8_t *src, uint8_t *dst, long src_size)
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x7C00)>>7;
@@ -417,8 +409,7 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
    long i;
    long num_pixels = src_size >> 1;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        unsigned rgb = ((const uint16_t*)src)[i];
        ((uint16_t*)dst)[i] = ((rgb&0x7C00)>>10) | ((rgb&0x3E0)<<1) | (rgb<<11);
    }
@@ -429,8 +420,7 @@ void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
    long i;
    long num_pixels = src_size >> 1;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        unsigned br;
        unsigned rgb = ((const uint16_t*)src)[i];
        br = rgb&0x7c1F;
@@ -442,8 +432,7 @@ void bgr8torgb8(const uint8_t *src, uint8_t *dst, long src_size)
 {
    long i;
    long num_pixels = src_size;
    for (i=0; i<num_pixels; i++)
    {
    for (i=0; i<num_pixels; i++) {
        unsigned b,g,r;
        register uint8_t rgb;
        rgb = src[i];
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -84,8 +84,7 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -113,8 +112,7 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    #endif
    while (s < end)
    {
    while (s < end) {
    #if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
@@ -143,8 +141,7 @@ static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long s
 #if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
@@ -199,8 +196,7 @@ static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
 #if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
@@ -234,8 +230,7 @@ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
@@ -258,15 +253,13 @@ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(EMMS:::"memory");
 #endif
    mm_end = end - 3;
    while (s < mm_end)
    {
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end)
    {
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
@@ -284,8 +277,7 @@ static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
@@ -312,15 +304,13 @@ static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(EMMS:::"memory");
 #endif
    mm_end = end - 3;
    while (s < mm_end)
    {
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end)
    {
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
@@ -378,8 +368,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -417,8 +406,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
@@ -440,8 +428,7 @@ static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long s
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -478,8 +465,7 @@ static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
@@ -537,8 +523,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -576,8 +561,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
@@ -599,8 +583,7 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long s
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -637,8 +620,7 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
@@ -660,8 +642,7 @@ static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long s
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -698,8 +679,7 @@ static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
@@ -723,8 +703,7 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -761,8 +740,7 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
@@ -786,8 +764,7 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
@@ -824,8 +801,7 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
@@ -849,8 +825,7 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
@@ -887,8 +862,7 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
@@ -929,8 +903,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long s
 #if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
@@ -1049,8 +1022,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
@@ -1071,8 +1043,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long s
 #if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
@@ -1190,8 +1161,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long s
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
@@ -1233,8 +1203,7 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
@@ -1256,8 +1225,7 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
 #if HAVE_BIGENDIAN
@@ -1288,8 +1256,7 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
@@ -1311,8 +1278,7 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
 #endif
    while (s < end)
    {
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
 #if HAVE_BIGENDIAN
@@ -1453,8 +1419,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
    src-= src_size;
    dst-= src_size;
 #endif
    for (i=0; i<src_size; i+=3)
    {
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
@@ -1469,8 +1434,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
 {
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
    for (y=0; y<height; y++) {
 #if HAVE_MMX
 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
@@ -1530,7 +1494,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
@@ -1559,7 +1523,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
@@ -1574,7 +1538,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
 #else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
        for (i = 0; i < chromWidth; i++) {
 #if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
@@ -1588,8 +1552,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
        }
 #endif
 #endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
@@ -1621,8 +1584,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
 {
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
    for (y=0; y<height; y++) {
 #if HAVE_MMX
 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
@@ -1665,7 +1627,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
@@ -1680,7 +1642,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
 #else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
        for (i = 0; i < chromWidth; i++) {
 #if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
@@ -1694,8 +1656,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
        }
 #endif
 #endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
@@ -1751,8 +1712,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 {
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
    for (y=0; y<height; y+=2) {
 #if HAVE_MMX
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
@@ -1837,8 +1797,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
        );
 #else
        long i;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
@@ -1847,8 +1806,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
        ydst += lumStride;
        src  += srcStride;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
@@ -1882,7 +1840,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
    dst[0]= src[0];
    // first line
    for (x=0; x<srcWidth-1; x++){
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
@@ -1890,7 +1848,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
    dst+= dstStride;
    for (y=1; y<srcHeight; y++){
    for (y=1; y<srcHeight; y++) {
 #if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
@@ -1941,7 +1899,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
        for (x=mmxSize-1; x<srcWidth-1; x++){
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
@@ -1958,13 +1916,13 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
 #if 1
    dst[0]= src[0];
    for (x=0; x<srcWidth-1; x++){
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
 #else
    for (x=0; x<srcWidth; x++){
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
@@ -1989,8 +1947,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 {
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
    for (y=0; y<height; y+=2) {
 #if HAVE_MMX
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
@@ -2075,8 +2032,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
        );
 #else
        long i;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
@@ -2085,8 +2041,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
        ydst += lumStride;
        src  += srcStride;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
@@ -2117,11 +2072,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
    long y;
    const x86_reg chromWidth= width>>1;
 #if HAVE_MMX
    for (y=0; y<height-2; y+=2)
    {
    for (y=0; y<height-2; y+=2) {
        long i;
        for (i=0; i<2; i++)
        {
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov                        %2, %%"REG_a"   \n\t"
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
@@ -2355,11 +2308,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 #else
    y=0;
 #endif
    for (; y<height; y+=2)
    {
    for (; y<height; y+=2) {
        long i;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
@@ -2382,8 +2333,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
        ydst += lumStride;
        src  += srcStride;
        for (i=0; i<chromWidth; i++)
        {
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
@@ -2408,11 +2358,11 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride){
                             long src2Stride, long dstStride)
 {
    long h;
    for (h=0; h < height; h++)
    {
    for (h=0; h < height; h++) {
        long w;
 #if HAVE_MMX
@@ -2462,14 +2412,12 @@ static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
            : "memory", "%"REG_a
        );
 #endif
        for (w= (width&(~15)); w < width; w++)
        {
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
 #else
        for (w=0; w < width; w++)
        {
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
@@ -2502,13 +2450,12 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
 #endif
    for (y=0;y<h;y++){
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
 #if HAVE_MMX
        for (;x<w-31;x+=32)
        {
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
@@ -2542,13 +2489,12 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
 #endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++){
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
 #if HAVE_MMX
        for (;x<w-31;x+=32)
        {
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
@@ -2600,15 +2546,14 @@ static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
 #if HAVE_MMX
        for (;x<w-7;x+=8)
        {
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
@@ -2661,8 +2606,7 @@ static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2
                :"memory");
        }
 #endif
        for (; x<w; x++)
        {
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
@@ -2690,7 +2634,7 @@ static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count
    count= - count;
 #if HAVE_MMX
    if(count <= -16){
    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
@@ -2716,7 +2660,7 @@ static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count
        count -= 15;
    }
 #endif
    while(count<0){
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
@@ -2729,7 +2673,7 @@ static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *ds
    src += 4*count;
    count= - count;
 #if HAVE_MMX
    if(count <= -8){
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
@@ -2763,7 +2707,7 @@ static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *ds
        count -= 7;
    }
 #endif
    while(count<0){
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
@@ -2778,7 +2722,7 @@ static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, u
    src1 += 4*count;
    count= - count;
 #ifdef PAVGB
    if(count <= -8){
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
@@ -2816,7 +2760,7 @@ static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, u
        count -= 7;
    }
 #endif
    while(count<0){
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
@@ -2830,7 +2774,7 @@ static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst
    src += 4*count;
    count= - count;
 #if HAVE_MMX
    if(count <= -8){
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
@@ -2865,7 +2809,7 @@ static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst
    }
 #endif
    src++;
    while(count<0){
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
@@ -2880,7 +2824,7 @@ static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, ui
    src1 += 4*count;
    count= - count;
 #ifdef PAVGB
    if(count <= -8){
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
@@ -2920,7 +2864,7 @@ static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, ui
 #endif
    src0++;
    src1++;
    while(count<0){
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
@@ -2934,9 +2878,9 @@ static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
    long y;
    const long chromWidth= -((-width)>>1);
    for (y=0; y<height; y++){
    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        if(y&1){
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
@@ -2961,7 +2905,7 @@ static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
    long y;
    const long chromWidth= -((-width)>>1);
    for (y=0; y<height; y++){
    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
@@ -2986,9 +2930,9 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
    long y;
    const long chromWidth= -((-width)>>1);
    for (y=0; y<height; y++){
    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1){
        if(y&1) {
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
@@ -3013,7 +2957,7 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
    long y;
    const long chromWidth= -((-width)>>1);
    for (y=0; y<height; y++){
    for (y=0; y<height; y++) {
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
@@ -3031,7 +2975,8 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
 #endif
 }
 static inline void RENAME(rgb2rgb_init)(void){
 static inline void RENAME(rgb2rgb_init)(void)
 {
    rgb15to16       = RENAME(rgb15to16);
    rgb15tobgr24    = RENAME(rgb15tobgr24);
    rgb15to32       = RENAME(rgb15to32);
--- a/libswscale/sparc/yuv2rgb_vis.c
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -82,7 +82,8 @@
 // FIXME: must be changed to set alpha to 255 instead of 0
 static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
                           int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, out1, out2, out3, out4, out5, out6;
    for(y=0;y < srcSliceH;++y) {
@@ -134,7 +135,8 @@ static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int s
 // FIXME: must be changed to set alpha to 255 instead of 0
 static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
                           int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, out1, out2, out3, out4, out5, out6;
    for(y=0;y < srcSliceH;++y) {
@@ -184,7 +186,8 @@ static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int s
    return srcSliceH;
 }
 SwsFunc ff_yuv2rgb_init_vis(SwsContext *c){
 SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
 {
    c->sparc_coeffs[5]=c->yCoeff;
    c->sparc_coeffs[6]=c->vgCoeff;
    c->sparc_coeffs[7]=c->vrCoeff;
--- a/libswscale/swscale-example.c
+++ b/libswscale/swscale-example.c
@@ -50,14 +50,15 @@ const char *sws_format_name(enum PixelFormat format);
        || (x)==PIX_FMT_YUVA420P    \
    )
 static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
 static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h)
 {
    int x,y;
    uint64_t ssd=0;
 //printf("%d %d\n", w, h);
    for (y=0; y<h; y++){
        for (x=0; x<w; x++){
    for (y=0; y<h; y++) {
        for (x=0; x<w; x++) {
            int d= src1[x + y*stride1] - src2[x + y*stride2];
            ssd+= d*d;
 //printf("%d", abs(src1[x + y*stride1] - src2[x + y*stride2])/26 );
@@ -70,7 +71,8 @@ static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, i
 // test by ref -> src -> dst -> out & compare out against ref
 // ref & out are YV12
 static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, int srcFormat, int dstFormat,
                  int srcW, int srcH, int dstW, int dstH, int flags){
                  int srcW, int srcH, int dstW, int dstH, int flags)
 {
    uint8_t *src[4] = {0};
    uint8_t *dst[4] = {0};
    uint8_t *out[4] = {0};
@@ -82,7 +84,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, int srcFormat
    int res;
    res = 0;
    for (i=0; i<4; i++){
    for (i=0; i<4; i++) {
        // avoid stride % bpp != 0
        if (srcFormat==PIX_FMT_RGB24 || srcFormat==PIX_FMT_BGR24)
            srcStride[i]= srcW*3;
@@ -169,7 +171,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, int srcFormat
    sws_freeContext(dstContext);
    sws_freeContext(outContext);
    for (i=0; i<4; i++){
    for (i=0; i<4; i++) {
        free(src[i]);
        free(dst[i]);
        free(out[i]);
@@ -178,7 +180,8 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h, int srcFormat
    return res;
 }
 static void selfTest(uint8_t *src[4], int stride[4], int w, int h){
 static void selfTest(uint8_t *src[4], int stride[4], int w, int h)
 {
    enum PixelFormat srcFormat, dstFormat;
    int srcW, srcH, dstW, dstH;
    int flags;
@@ -206,7 +209,8 @@ static void selfTest(uint8_t *src[4], int stride[4], int w, int h){
 #define W 96
 #define H 96
 int main(int argc, char **argv){
 int main(int argc, char **argv)
 {
    uint8_t *rgb_data = malloc (W*H*4);
    uint8_t *rgb_src[3]= {rgb_data, NULL, NULL};
    int rgb_stride[3]={4*W, 0, 0};
@@ -221,8 +225,8 @@ int main(int argc, char **argv){
    av_lfg_init(&rand, 1);
    for (y=0; y<H; y++){
        for (x=0; x<W*4; x++){
    for (y=0; y<H; y++) {
        for (x=0; x<W*4; x++) {
            rgb_data[ x + y*4*W]= av_lfg_get(&rand);
        }
    }
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -64,7 +64,7 @@ typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[],
                       uint8_t* dst[], int dstStride[]);
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext{
 typedef struct SwsContext {
    /**
     * info on struct for av_log
     */
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -906,23 +906,23 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
 #if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }
            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }
@@ -956,15 +956,15 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
 {
    int i;
 #if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)){
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                if (dst[p]){
        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
@@ -973,9 +973,9 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
@@ -988,11 +988,10 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
        return;
    }
 #endif
    for (i=0; i<dstW; i++)
    {
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;
        if (val&256){
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }
@@ -1001,12 +1000,11 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
    }
    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;
            if ((u|v)&256){
            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
@@ -1018,7 +1016,7 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
        }
    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++){
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
@@ -1034,11 +1032,11 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 {
 #if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
@@ -1052,7 +1050,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }else{
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
@@ -1116,11 +1114,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
@@ -1129,7 +1126,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                    "packuswb                  %%mm7, %%mm1         \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }else{
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
@@ -1222,12 +1219,11 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
    int i;
 #if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        switch(c->dstFormat)
        {
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 #if ARCH_X86_64
                    __asm__ volatile(
                    YSCALEYUV2RGB(%%REGBP, %5)
@@ -1268,7 +1264,7 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                    "a" (&c->redDither)
                    );
 #endif
                }else{
                } else {
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
@@ -1373,20 +1369,17 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...
    if (flags&SWS_FULL_CHR_H_INT)
    {
    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }
 #if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)){
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            switch(dstFormat)
            {
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
@@ -1400,7 +1393,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }else{
                } else {
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
@@ -1489,13 +1482,10 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                );
                return;
            }
        }
        else
        {
            switch(dstFormat)
            {
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
@@ -1509,7 +1499,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }else{
                } else {
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
@@ -1601,10 +1591,9 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
        }
    }
 #endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048)
    {
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
 }
@@ -1662,8 +1651,7 @@ static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
    );
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
@@ -1696,8 +1684,7 @@ static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *s
    );
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
@@ -1756,8 +1743,7 @@ static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
    );
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
@@ -1791,8 +1777,7 @@ static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *s
    );
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
@@ -1803,13 +1788,13 @@ static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *s
 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
 {
    if(srcFormat == PIX_FMT_BGR24){
    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
@@ -1918,8 +1903,7 @@ static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];
@@ -1935,8 +1919,7 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];
@@ -1951,8 +1934,7 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
 {
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];
@@ -1969,8 +1951,7 @@ static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
 #else
    int i;
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];
@@ -1988,8 +1969,7 @@ static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
 #else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];
@@ -2004,8 +1984,7 @@ static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const ui
 {
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
    for (i=0; i<width; i++) {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];
@@ -2022,8 +2001,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
 {
 #if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
@@ -2067,9 +2045,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
        : "%"REG_b
 #endif
        );
    }
    else if (filterSize==8)
    {
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
@@ -2124,9 +2100,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
        : "%"REG_b
 #endif
        );
    }
    else
    {
    } else {
        uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
@@ -2180,14 +2154,12 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
 #else
    int i;
    for (i=0; i<dstW; i++)
    {
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
@@ -2213,8 +2185,7 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 {
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
@@ -2259,17 +2230,14 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth,
 #endif
    {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
    } else { // fast bilinear upscale / crap downscale
 #if ARCH_X86 && CONFIG_GPL
 #if COMPILE_TEMPLATE_MMX2
        int i;
 #if defined(PIC)
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
 #endif
        if (canMMX2BeUsed)
        {
        if (canMMX2BeUsed) {
            __asm__ volatile(
 #if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
@@ -2328,9 +2296,7 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth,
 #endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
        } else {
 #endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
@@ -2372,14 +2338,14 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth,
 #endif /* ARCH_X86 */
    }
    if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
    if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
        if(c->srcRange) {
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
        } else {
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
@@ -2392,8 +2358,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
 {
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
@@ -2445,17 +2410,14 @@ inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth,
    {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
    } else { // fast bilinear upscale / crap downscale
 #if ARCH_X86 && CONFIG_GPL
 #if COMPILE_TEMPLATE_MMX2
        int i;
 #if defined(PIC)
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
 #endif
        if (canMMX2BeUsed)
        {
        if (canMMX2BeUsed) {
            __asm__ volatile(
 #if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
@@ -2500,15 +2462,12 @@ inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth,
             ,"%"REG_b
 #endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
        } else {
 #endif /* COMPILE_TEMPLATE_MMX2 */
            x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
            uint16_t xInc_mask = xInc & 0xffff;
@@ -2552,17 +2511,17 @@ inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth,
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
 #endif /* ARCH_X86 */
    }
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
        if(c->srcRange) {
            for (i=0; i<dstWidth; i++) {
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
        } else {
            for (i=0; i<dstWidth; i++) {
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
@@ -2571,8 +2530,8 @@ inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth,
 }
 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
                           int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
@@ -2617,7 +2576,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;
    if (isPacked(c->srcFormat)){
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
@@ -2636,11 +2595,9 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
    {
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
@@ -2650,7 +2607,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
    if (srcSliceY ==0) {
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
@@ -2660,7 +2617,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
    lastDstY= dstY;
    for (;dstY < dstH; dstY++){
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
@@ -2695,8 +2652,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
        vChrBufSize, vLumBufSize);*/
        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY)
        {
        while(lastInLumBuf < lastLumSrcY) {
            uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
@@ -2716,8 +2672,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                                pal, 1);
            lastInLumBuf++;
        }
        while(lastInChrBuf < lastChrSrcY)
        {
        while(lastInChrBuf < lastChrSrcY) {
            uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
@@ -2747,52 +2702,49 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
 #endif
        if (dstY < dstH-2)
        {
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
 #if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND){
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2){
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                for (i=0; i<vLumFilterSize; i++)
                {
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
@@ -2801,87 +2753,72 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                }
            }
 #endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat))
                {
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                }
                else
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
                else //General YV12
                {
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            }
            else
            {
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    }else{
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    }else{
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    }else{
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
@@ -2889,50 +2826,41 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat))
                {
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                }
                else
                {
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            }
            else
            {
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                }else{
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
--- a/libswscale/x86/yuv2rgb_mmx.c
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -63,15 +63,15 @@ SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
    if (c->flags & SWS_CPU_CAPS_MMX2) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
                break;
            }else return yuv420_rgb32_MMX2;
            } else return yuv420_rgb32_MMX2;
        case PIX_FMT_BGR32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_bgr32_MMX2;
                break;
            }else return yuv420_bgr32_MMX2;
            } else return yuv420_bgr32_MMX2;
        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
@@ -80,15 +80,15 @@ SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
    if (c->flags & SWS_CPU_CAPS_MMX) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_rgb32_MMX;
                break;
            }else return yuv420_rgb32_MMX;
            } else return yuv420_rgb32_MMX;
        case PIX_FMT_BGR32:
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
                if (HAVE_7REGS) return yuva420_bgr32_MMX;
                break;
            }else return yuv420_bgr32_MMX;
            } else return yuv420_bgr32_MMX;
        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -122,7 +122,7 @@
 #define YUV422_UNSHIFT                   \
    if(c->srcFormat == PIX_FMT_YUV422P){ \
    if(c->srcFormat == PIX_FMT_YUV422P) {\
        srcStride[1] *= 2;               \
        srcStride[2] *= 2;               \
    }                                    \
@@ -180,7 +180,8 @@
    return srcSliceH; \
 static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, h_size;
    YUV422_UNSHIFT
@@ -236,7 +237,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
 }
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, h_size;
    YUV422_UNSHIFT
@@ -294,7 +296,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStr
 }
 static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, h_size;
    YUV422_UNSHIFT
@@ -470,7 +473,8 @@ etc.
    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, h_size;
    YUV422_UNSHIFT
@@ -486,7 +490,8 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr
 }
 static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
                                        int srcSliceH, uint8_t* dst[], int dstStride[])
 {
 #if HAVE_7REGS
    int y, h_size;
@@ -504,7 +509,8 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcSt
 }
 static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    int y, h_size;
    YUV422_UNSHIFT
@@ -520,7 +526,8 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t* src[], int
 }
 static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
                                        int srcSliceH, uint8_t* dst[], int dstStride[])
 {
 #if HAVE_7REGS
    int y, h_size;
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -92,7 +92,8 @@ const int32_t ff_yuv2rgb_coeffs[8][4] = {
 #define YUV2RGBFUNC(func_name, dst_type, alpha) \
 static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
                     int srcSliceH, uint8_t* dst[], int dstStride[]){\
                     int srcSliceH, uint8_t* dst[], int dstStride[]) \
 {\
    int y;\
 \
    if (!alpha && c->srcFormat == PIX_FMT_YUV422P) {\
@@ -110,7 +111,7 @@ static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSlic
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];\
        uint8_t av_unused *pa_1, *pa_2;\
        unsigned int h_size = c->dstW>>3;\
        if (alpha){\
        if (alpha) {\
            pa_1 = src[3] + y*srcStride[3];\
            pa_2 = pa_1 + srcStride[3];\
        }\